Source code for fairdo.preprocessing.base

import abc

import numpy as np
import pandas as pd


class Preprocessing(metaclass=abc.ABCMeta):
    """
    Base class for all preprocessing methods.

    Parameters
    ----------
    protected_attribute: str
        protected attribute
    label: str
        predicting label

    Attributes
    ----------
    dataset: pandas DataFrame
        original dataset; ``None`` until `fit` is called
    transformed_data: pandas DataFrame
        dataset after transformation/pre-processing; ``None`` until a
        subclass sets it
    """

    def __init__(self, protected_attribute, label):
        """
        Store the configuration and reset the fitted state.

        Parameters
        ----------
        protected_attribute: str
            protected attribute
        label: str
            predicting label
        """
        self.protected_attribute = protected_attribute
        self.label = label
        self.dataset = None
        self.transformed_data = None

    @abc.abstractmethod
    def transform(self):
        """
        Transform the fitted dataset. Must be overridden by subclasses.

        Returns
        -------
        self
            The base implementation only returns the instance itself.
        """
        return self

    def fit(self, dataset):
        """
        Copies the dataset to the class and checks if the dataset is valid,
        i.e., all columns are numeric.

        Parameters
        ----------
        dataset: pandas DataFrame
            Any other object must provide a ``copy()`` method and a
            ``convert_to_dataframe()`` method whose first return value is
            used as the dataset.

        Returns
        -------
        self

        Raises
        ------
        TypeError
            If the dataset cannot be converted to a DataFrame or contains
            non-numeric columns.
        """
        self.dataset = dataset.copy()
        self._check_valid_datatype()
        return self

    def fit_transform(self, *args, **kwargs):
        """
        Fit the model to the dataset and transform the dataset.

        Parameters
        ----------
        args: list
            Positional arguments for the `fit` method.
        kwargs: dict
            Keyword arguments for the `fit` method.

        Returns
        -------
        object
            Whatever the `transform` method of the concrete class returns.
        """
        return self.fit(*args, **kwargs).transform()

    @staticmethod
    def _is_numeric_dtype(dtype):
        """
        Tell whether a single column dtype is numeric or boolean.

        NumPy dtypes are classified with ``np.issubdtype``. Dtypes that NumPy
        cannot interpret (it raises ``TypeError`` for them, e.g. pandas
        extension dtypes) are classified by pandas instead of crashing.
        """
        try:
            return bool(np.issubdtype(dtype, np.number)
                        or np.issubdtype(dtype, np.bool_))
        except TypeError:
            return bool(pd.api.types.is_numeric_dtype(dtype))

    def _check_valid_datatype(self):
        """
        Ensure ``self.dataset`` is a DataFrame with numeric columns only.

        Objects that are not DataFrames are converted through their
        ``convert_to_dataframe()`` method (first element of the result).

        Raises
        ------
        TypeError
            If the conversion fails or at least one column is neither
            numeric nor boolean.
        """
        if not isinstance(self.dataset, pd.DataFrame):
            try:
                self.dataset = self.dataset.convert_to_dataframe()[0]
            except Exception as err:
                # `except Exception` (not a bare except) lets KeyboardInterrupt
                # and SystemExit propagate; the cause is kept for debugging.
                raise TypeError('Type of dataset is unknown.') from err

        # check if all columns are numeric (including boolean);
        # the builtin `all` also handles a frame without any columns
        dtypes = self.dataset.dtypes
        if not all(self._is_numeric_dtype(dtype) for dtype in dtypes):
            raise TypeError(f"All columns must be numeric. The datatypes of the columns are:\n{dtypes}")
class OriginalData(Preprocessing):
    """
    This class is used to return the original dataset.

    Parameters
    ----------
    kwargs: dict
        Optional ``protected_attribute`` and ``label`` entries are stored on
        the instance; any other keyword is accepted and ignored.
    """

    def __init__(self, **kwargs):
        # Initialise through the base class so that `protected_attribute`
        # and `label` always exist on the instance (``None`` when omitted),
        # alongside `dataset` and `transformed_data`.
        super().__init__(protected_attribute=kwargs.get('protected_attribute'),
                         label=kwargs.get('label'))

    def transform(self):
        """
        Returns the original dataset.

        Returns
        -------
        pandas DataFrame
            The dataset stored by `fit`, unchanged (``None`` if `fit` has
            not been called).
        """
        self.transformed_data = self.dataset
        return self.dataset
class Unawareness(Preprocessing):
    """
    Fairness Through Unawareness

    Removes all columns of protected attributes from the dataset.
    Is a simple and effective method to ensure fairness in the dataset.
    But fails to consider the possibility of indirect discrimination,
    i.e., the protected attribute may be correlated with other features
    in the dataset.

    Parameters
    ----------
    protected_attribute: str or list of str
        Column(s) to remove. A tuple is treated as a single column label.
    label: str
        predicting label
    kwargs: dict
        Accepted and ignored.
    """

    def __init__(self, protected_attribute=None, label=None, **kwargs):
        super().__init__(protected_attribute=protected_attribute, label=label)

    def transform(self):
        """
        Removes all protected attributes from the dataset.

        Returns
        -------
        pandas DataFrame
            The dataset without the protected attribute(s).

        Raises
        ------
        Exception
            If `fit` has not been called, no protected attribute was given,
            or a protected attribute is not a column of the dataset.
        """
        if self.dataset is None:
            raise Exception('Model not fitted. Run the `fit` method first.')

        attribute = self.protected_attribute
        # Tuples stay single labels because they are valid (hashable)
        # column keys; other list-likes name several columns.
        is_collection = (pd.api.types.is_list_like(attribute)
                         and not isinstance(attribute, tuple))
        columns = list(attribute) if is_collection else [attribute]

        if attribute is None or not columns:
            raise Exception('Protected attribute not given.')

        missing = [col for col in columns if col not in self.dataset.columns]
        if missing:
            # Separate message: the attribute was given but does not exist.
            raise Exception(f'Protected attribute not found in the dataset: {missing}')

        self.transformed_data = self.dataset.drop(columns=columns)
        return self.transformed_data
class Random(Preprocessing):
    """
    This class is used to return a random subset of the dataset.
    The size of the subset is determined by the `frac` parameter.

    Parameters
    ----------
    frac: float
        Fraction of rows to sample; passed to ``DataFrame.sample``.
    protected_attribute: str
        protected attribute
    label: str
        predicting label
    random_state: optional
        Seed passed to ``DataFrame.sample``. When given, it is also used to
        seed NumPy's global random number generator.
    """

    def __init__(self, frac=0.8, protected_attribute=None, label=None, random_state=None):
        super().__init__(protected_attribute=protected_attribute, label=label)
        self.frac = frac
        self.random_state = random_state
        # Seed the global generator only when a seed was requested.
        # `np.random.seed(None)` would reseed it from OS entropy and silently
        # discard any seed the caller had set before creating this object.
        if self.random_state is not None:
            np.random.seed(self.random_state)

    def transform(self):
        """
        Returns a random subset of the dataset.
        A fraction `frac` of the dataset is returned.

        Returns
        -------
        pandas DataFrame
            The random subset of the dataset.

        Raises
        ------
        Exception
            If `fit` has not been called.
        """
        if self.dataset is None:
            raise Exception('Model not fitted. Run the `fit` method first.')

        self.transformed_data = self.dataset.sample(frac=self.frac,
                                                    axis=0,
                                                    random_state=self.random_state)
        return self.transformed_data