Source code for fairdo.utils.dataset

"""
This module contains utility functions to load, preprocess, and synthesize datasets.
"""

# Standard library imports
import io
import zipfile

# Related third-party imports
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from requests import get

# Attempt to import (optional) sdv libraries
try:
    from sdv.single_table import GaussianCopulaSynthesizer
    from sdv.metadata import SingleTableMetadata

    sdv_installed = True
except ModuleNotFoundError:
    sdv_installed = False


def downcast(data):
    """
    Shrink float and integer columns to the smallest suitable numeric dtype.

    The DataFrame that is passed in is modified and also returned.
    Columns of any other dtype are left untouched.

    Parameters
    ----------
    data : pandas DataFrame
        DataFrame whose numeric columns should be downcast.

    Returns
    -------
    data : pandas DataFrame
        The same DataFrame object with downcast numeric columns.

    Examples
    --------
    >>> import pandas as pd
    >>> from fairdo.utils.dataset import downcast
    >>> data = pd.DataFrame({'a': [1, 2], 'b': [1.0, 2.0]})
    >>> data = downcast(data)
    >>> print(data.dtypes)
    a       int8
    b    float32
    """
    # Resolve both column groups up front, before any column is rewritten.
    float_columns = data.select_dtypes('float').columns
    integer_columns = data.select_dtypes('integer').columns

    for columns, target in ((float_columns, 'float'),
                            (integer_columns, 'integer')):
        data[columns] = data[columns].apply(pd.to_numeric, downcast=target)

    return data
def dataset_intersectional_column(data, protected_attributes):
    """
    Add an ``intersectional_group`` column built from the protected attributes.

    For every row, the values of the given protected attributes are converted
    to strings and concatenated in order, each followed by an underscore
    (so the result ends with a trailing ``_``). The column is added to the
    DataFrame that is passed in.

    Parameters
    ----------
    data : pandas DataFrame
        DataFrame with protected attributes.
    protected_attributes : list of str
        List of protected attributes. Each attribute should be a column
        in the data.

    Returns
    -------
    data : pandas DataFrame
        The DataFrame with the extra column of combined protected attributes.
    protected_attribute : str
        The name of the column with the combined protected attributes
        (always ``'intersectional_group'``).

    Examples
    --------
    >>> import pandas as pd
    >>> from fairdo.utils.dataset import dataset_intersectional_column
    >>> data = pd.DataFrame({'sex': ['male', 'female'], 'race': ['white', 'black']})
    >>> pas = ['sex', 'race']
    >>> data_new, pa = dataset_intersectional_column(data, pas)
    >>> print(data_new)
          sex   race intersectional_group
    0    male  white          male_white_
    1  female  black        female_black_
    >>> print(pa)
    intersectional_group
    """
    protected_attribute = 'intersectional_group'

    # Start from an empty string in every row, then append one
    # "<value>_" fragment per protected attribute.
    data[protected_attribute] = ''
    for attribute in protected_attributes:
        fragment = data[attribute].astype(str) + '_'
        data[protected_attribute] += fragment

    return data, protected_attribute
def load_data(dataset_str, multi_protected_attr=False, print_info=True):
    """
    Load the dataset and preprocess it.

    The preprocessing steps include:

    - Dropping unused columns ('adult') or rows with missing values
      ('compas', 'bank')
    - Label encode protected attributes and label
    - One-hot encode all other categorical variables
    - Downcast float and integer columns to save memory

    The data is downloaded from the internet on every call, and the message
    ``Data downloaded.`` is printed once the download has finished.

    Parameters
    ----------
    dataset_str : str
        Name of the dataset to load and preprocess. Supported values are
        'adult', 'compas' and 'bank'. 'german' is recognised but not
        implemented yet.
    multi_protected_attr : bool
        Whether to use multiple protected attributes or not.
    print_info : bool
        Whether to print information about the dataset or not (unique values
        and value counts of the first protected attribute, and the shape).

    Returns
    -------
    df : pandas DataFrame
        Preprocessed DataFrame.
    label : str
        Name of the label column.
    protected_attributes : list of str
        List of protected attributes.

    Raises
    ------
    NotImplementedError
        If ``dataset_str`` is 'german' or not one of the supported names.
        This is raised before anything is downloaded.
    requests.HTTPError
        If downloading the 'bank' archive returns an HTTP error status.

    Examples
    --------
    >>> from fairdo.utils.dataset import load_data
    >>> data, label, protected_attributes = load_data('adult')
    >>> print(data.head(2))
       age  education-num  race  ...  relationship_ Wife  sex_ Female  sex_ Male
    0   39             13     4  ...                   0            0          1
    1   50             13     4  ...                   0            0          1
    >>> print(label)
    income
    >>> print(protected_attributes)
    ['race']
    """
    if dataset_str == 'adult':
        data = pd.read_csv(
            "https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data",
            header=None,
            names=["age", "workclass", "fnlwgt", "education", "education-num",
                   "marital-status", "occupation", "relationship", "race",
                   "sex", "capital-gain", "capital-loss", "hours-per-week",
                   "native-country", "income"])
        print('Data downloaded.')

        # Drop columns
        cols_to_drop = ['fnlwgt', 'workclass', 'education', 'occupation',
                        'native-country']
        data = data.drop(columns=cols_to_drop)

        label = 'income'
        protected_attributes = ['race', 'sex'] if multi_protected_attr else ['race']
    elif dataset_str == 'compas':
        # NOTE: 'is_recid' was commented out of this list in the original
        # code and is therefore not loaded.
        use_cols = ['race', 'sex', 'juv_fel_count', 'decile_score',
                    'juv_misd_count', 'juv_other_count', 'is_violent_recid',
                    'v_decile_score', 'priors_count', 'age_cat',
                    'c_charge_degree', 'two_year_recid']
        data = pd.read_csv(
            "https://raw.githubusercontent.com/propublica/compas-analysis/master/compas-scores-two-years.csv",
            usecols=use_cols)
        print('Data downloaded.')

        # Drop rows with missing values
        data = data.dropna(axis=0, how='any')

        label = 'two_year_recid'
        if multi_protected_attr:
            protected_attributes = ['race', 'sex', 'age_cat']
        else:
            protected_attributes = ['race']
    elif dataset_str == 'bank':
        # Loading Bank Marketing dataset
        r = get("http://archive.ics.uci.edu/ml/machine-learning-databases/00222/bank-additional.zip")
        # Fail with a clear HTTP error instead of trying to unzip an error page.
        r.raise_for_status()
        with zipfile.ZipFile(io.BytesIO(r.content)) as z:
            with z.open('bank-additional/bank-additional-full.csv') as csv_file:
                data = pd.read_csv(csv_file, delimiter=';')
        print('Data downloaded.')

        # Drop rows with missing values
        data = data.dropna(axis=0, how='any')

        label = 'y'
        protected_attributes = ['job', 'marital'] if multi_protected_attr else ['job']
    elif dataset_str == 'german':
        # The German Credit preprocessing has not been written yet, so there
        # is no point in downloading the file first. The planned label is
        # 'credit_risk' with protected attributes ['age', 'gender'].
        raise NotImplementedError(
            "The 'german' dataset is not supported yet.")
    else:
        raise NotImplementedError(
            "Unknown dataset {!r}; supported datasets are "
            "'adult', 'compas' and 'bank'.".format(dataset_str))

    # Label encoding protected_attribute and label
    cols_to_labelencode = protected_attributes.copy()
    cols_to_labelencode.append(label)
    data[cols_to_labelencode] = \
        data[cols_to_labelencode].apply(LabelEncoder().fit_transform)

    # Encode categorical variables as one-hot
    categorical_cols = list(data.select_dtypes(include='object').columns)
    data = pd.get_dummies(data, columns=categorical_cols)

    # Downcast
    data = downcast(data)

    # print info of the data
    if print_info:
        print(data[protected_attributes].iloc[:, 0].unique())
        print(data[protected_attributes].iloc[:, 0].value_counts())
        print(data.shape)

    return data, label, protected_attributes
def generate_data(data, num_rows=100):
    """
    Generate synthetic data using the sdv library.
    The method used is Gaussian Copula.

    The synthesizer is built and fitted by ``data_generator``; this function
    only draws ``num_rows`` samples from it.

    Parameters
    ----------
    data : pandas DataFrame
        The real data to be used to generate synthetic data
    num_rows : int
        The number of rows to generate

    Returns
    -------
    synthetic_data : pandas DataFrame or None
        The synthetic data generated, or None if the optional 'sdv' library
        is not installed (a hint on how to install it is printed).

    Examples
    --------
    >>> import pandas as pd
    >>> from fairdo.utils.dataset import generate_data
    >>> data = pd.DataFrame({'age': [39, 50], 'education': ['Bachelors', 'HS-grad'], 'income': ['<=50K', '<=50K']})
    >>> generate_data(data, num_rows=2)
        age  education income
    0  39.0  Bachelors  <=50K
    1  50.0    HS-grad  <=50K
    """
    # Reuse the shared fitting logic instead of duplicating it here.
    synthesizer = data_generator(data)
    if synthesizer is None:
        # sdv is not installed; data_generator has already informed the user.
        return None

    synthetic_data = synthesizer.sample(num_rows=num_rows)
    return synthetic_data
def data_generator(data):
    """
    Build and fit a Gaussian Copula synthesizer on the given data.

    The returned object can be used to sample synthetic data.

    Parameters
    ----------
    data : pandas DataFrame
        The real data to be used to generate synthetic data

    Returns
    -------
    synthesizer : GaussianCopulaSynthesizer object or None
        The fitted data generator, or None if the optional 'sdv' library
        is not installed.

    Examples
    --------
    >>> import pandas as pd
    >>> from fairdo.utils.dataset import data_generator
    >>> data = pd.DataFrame({'age': [39, 50], 'education': ['Bachelors', 'HS-grad'], 'income': ['<=50K', '<=50K']})
    >>> synthesizer = data_generator(data)
    >>> synthetic_data = synthesizer.sample(num_rows=2)
    >>> print(synthetic_data)
        age  education income
    0  39.0  Bachelors  <=50K
    1  50.0    HS-grad  <=50K
    """
    if sdv_installed:
        # Let sdv infer the column types, then fit the model to the real data.
        table_metadata = SingleTableMetadata()
        table_metadata.detect_from_dataframe(data)

        fitted_synthesizer = GaussianCopulaSynthesizer(table_metadata)
        fitted_synthesizer.fit(data)
        return fitted_synthesizer

    # Inform the user that sdv library is required
    print("The 'sdv' library is required to generate synthetic data.")
    print("Please install it by running: pip install sdv==1.10.0")
    return None