# Source code for fairdo.preprocessing.base
import abc
import numpy as np
import pandas as pd
class Preprocessing(metaclass=abc.ABCMeta):
    """
    Base class for all preprocessing methods.

    Parameters
    ----------
    protected_attribute: str
    label: str
        predicting label

    Attributes
    ----------
    dataset: pandas DataFrame
        original dataset; ``None`` until ``fit`` has been called
    transformed_data: pandas DataFrame
        dataset after transformation/pre-processing; initialised to ``None``
    """

    def __init__(self, protected_attribute, label):
        """
        Store the configuration and reset the data slots.

        Parameters
        ----------
        protected_attribute: str
        label: str
            predicting label
        """
        self.protected_attribute = protected_attribute
        self.label = label
        self.dataset = None
        self.transformed_data = None

    def fit(self, dataset):
        """
        Copies the dataset to the class and checks if the dataset is valid,
        i.e., all columns are numeric.

        Parameters
        ----------
        dataset: pandas DataFrame
            Any other object is accepted as long as it offers ``copy()`` and
            ``convert_to_dataframe()``; the first element returned by
            ``convert_to_dataframe()`` is then used as the dataset.

        Returns
        -------
        self

        Raises
        ------
        Exception
            If the dataset cannot be converted to a DataFrame or if at least
            one column is not numeric/boolean.
        """
        # Work on a copy so later steps never mutate the caller's object.
        self.dataset = dataset.copy()
        self._check_valid_datatype()
        return self

    @staticmethod
    def _is_numeric_dtype(dtype):
        """Return True if `dtype` is a numeric or boolean column dtype."""
        try:
            return bool(np.issubdtype(dtype, np.number)
                        or np.issubdtype(dtype, np.bool_))
        except TypeError:
            # NumPy cannot interpret pandas extension dtypes (categorical,
            # nullable integers, strings, ...); let pandas classify those.
            return bool(pd.api.types.is_numeric_dtype(dtype))

    def _check_valid_datatype(self):
        """
        Ensure ``self.dataset`` is a DataFrame whose columns are all numeric.

        A non-DataFrame dataset is replaced by the first element of its
        ``convert_to_dataframe()`` result.

        Raises
        ------
        Exception
            If the conversion fails or a non-numeric column is present.
        """
        if not isinstance(self.dataset, pd.DataFrame):
            try:
                self.dataset = self.dataset.convert_to_dataframe()[0]
            except Exception as err:
                # `except Exception` (not a bare except) lets KeyboardInterrupt
                # and SystemExit propagate; the cause is chained for debugging.
                raise Exception('Type of dataset is unknown.') from err

        # check if all columns are numeric (including boolean);
        # a plain `all` also copes with a frame that has no columns
        if not all(self._is_numeric_dtype(dtype) for dtype in self.dataset.dtypes):
            raise Exception(f"All columns must be numeric. The datatypes of the columns are:\n{self.dataset.dtypes}")
class OriginalData(Preprocessing):
    """
    This class is used to return the original dataset.

    Parameters
    ----------
    **kwargs
        Optional keyword arguments. ``protected_attribute`` and ``label`` are
        stored on the instance when supplied (``None`` otherwise); any other
        keyword is accepted and ignored.
    """

    def __init__(self, **kwargs):
        # Delegate to the base initialiser so that every attribute it defines
        # (protected_attribute, label, dataset, transformed_data) exists on
        # the instance, just like for the other preprocessors.
        super().__init__(protected_attribute=kwargs.get('protected_attribute'),
                         label=kwargs.get('label'))
class Unawareness(Preprocessing):
    """
    Fairness Through Unawareness.

    Removes all columns of protected attributes from the dataset.
    The approach is simple and effective, but it does not account for
    indirect discrimination: other features in the dataset may still be
    correlated with the protected attribute.

    Parameters
    ----------
    protected_attribute: str, optional
    label: str, optional
        predicting label
    **kwargs
        Additional keyword arguments; accepted and ignored by this constructor.
    """

    def __init__(self, protected_attribute=None, label=None, **kwargs):
        # Only the two named arguments are forwarded to the base class.
        super().__init__(protected_attribute, label)
class Random(Preprocessing):
    """
    This class is used to return a random subset of the dataset.
    The size of the subset is determined by the `frac` parameter.

    Parameters
    ----------
    frac: float
        fraction that determines the size of the subset (default 0.8)
    protected_attribute: str, optional
    label: str, optional
        predicting label
    random_state: int, optional
        Seed for NumPy's global random number generator. When ``None``
        (default) the global generator is left untouched.
    """

    def __init__(self, frac=0.8,
                 protected_attribute=None, label=None, random_state=None):
        super().__init__(protected_attribute=protected_attribute, label=label)
        self.frac = frac
        self.random_state = random_state
        # np.random.seed(None) would reseed the global generator from OS
        # entropy and thereby discard any seed the caller set beforehand,
        # so only seed when an explicit random_state was requested.
        if self.random_state is not None:
            np.random.seed(self.random_state)