# Source code for fairdo.metrics.group

import numpy as np
import warnings
from itertools import product

from fairdo.utils.helper import generate_pairs


def statistical_parity_abs_diff_multi(y: np.array, z: np.array,
                                      agg_attribute=np.max,
                                      agg_group=np.max,
                                      positive_label=1,
                                      **kwargs) -> float:
    """
    Calculate the absolute difference in statistical parity for multiple non-binary protected attributes.
    Protected attributes `z[:, k]` can be binary or non-binary.

    For every protected attribute the positive rate of each group is computed, the
    absolute rate differences between groups are aggregated with `agg_group`, and the
    resulting per-attribute disparities are aggregated with `agg_attribute`.

    Parameters
    ----------
    y: np.array
        Flattened binary array of shape (n_samples,), can be the prediction or the truth label.
        Any array-like accepted by `np.asarray` works.
    z: np.array
        Array of shape (n_samples, n_protected_attributes) representing the protected attributes.
        A 1D array is treated as a single attribute.
    agg_attribute: callable, optional
        Aggregation function over the per-attribute disparities. Default is np.max.
    agg_group: callable, optional
        Aggregation function over the pairwise group disparities of one attribute.
        Default is np.max.
    positive_label: int, optional
        Label considered as positive. Default is 1.
    **kwargs: dict
        Ignored; accepted so that all metrics share a common call signature.

    Returns
    -------
    float
        Aggregated attribute disparity.
    """
    y = np.asarray(y)
    z = np.asarray(z)
    # a flat z is a single protected attribute
    if z.ndim < 2:
        z = z.reshape(-1, 1)
    # invert the labels if 0 is the positive label
    if positive_label == 0:
        y = 1 - y
    y = y.astype(int)

    attributes_disparity = []
    for k in range(z.shape[1]):
        column = z[:, k]
        zk = np.unique(column)
        # positive rate of every group of attribute k
        parities = {i: np.sum(y & (column == i)) / np.sum(column == i) for i in zk}

        if agg_group == np.max:
            # the largest pairwise gap is max - min, no need to enumerate the pairs
            rates = list(parities.values())
            attributes_disparity.append(np.max(rates) - np.min(rates))
            continue

        pairs = generate_pairs(zk)
        group_disparity = [np.abs(parities[i] - parities[j]) for i, j in pairs]
        try:
            attributes_disparity.append(agg_group(group_disparity))
        except ValueError:
            # e.g. an attribute with a single group yields no pairs to aggregate
            warnings.warn(f"Could not aggregate disparity for attribute {k} with aggregation function {agg_group}. "
                          f"The disparity for this attribute is {group_disparity}. "
                          f"Returning disparity of 0.")
            attributes_disparity.append(0)
    return agg_attribute(attributes_disparity)


def statistical_parity_abs_diff_intersectionality(y: np.array, z: np.array,
                                                  agg_group=np.max,
                                                  **kwargs) -> float:
    """
    Calculate the absolute difference in statistical parity for multiple non-binary protected attributes.
    Intersections from all protected attributes are considered, i.e. every distinct
    row of `z` forms its own subgroup.
    Protected attributes `z[:, k]` can be binary or non-binary.

    Parameters
    ----------
    y: np.array
        Flattened binary array of shape (n_samples,), can be the prediction or the truth label.
    z: np.array
        Array of shape (n_samples, n_protected_attributes) representing the protected attributes.
        A 1D array is treated as a single attribute.
    agg_group: callable, optional
        Aggregation function over the pairwise subgroup disparities. Default is np.max.
    **kwargs: dict
        Additional keyword arguments (ignored).

    Returns
    -------
    float
        Aggregated disparity between the intersectional subgroups.
    """
    y = np.asarray(y).astype(int)
    z = np.asarray(z)
    if z.ndim < 2:
        z = z.reshape(-1, 1)

    # Identify subgroups by the tuple of attribute values. Concatenating the values
    # into a string is ambiguous ((1, 11) vs (11, 1)) and np.apply_along_axis
    # truncates strings to the width of the first row's result.
    subgroup_ids = {}
    codes = np.fromiter(
        (subgroup_ids.setdefault(tuple(row), len(subgroup_ids)) for row in z.tolist()),
        dtype=np.intp,
        count=z.shape[0],
    )
    n_subgroups = len(subgroup_ids)

    # positive rate of every subgroup, indexed by subgroup id
    parities = [np.sum(y & (codes == g)) / np.sum(codes == g) for g in range(n_subgroups)]

    if agg_group == np.max:
        # the largest pairwise gap is max - min, no need to enumerate the pairs
        return np.max(parities) - np.min(parities)

    pairs = generate_pairs(list(range(n_subgroups)))
    group_disparity = [np.abs(parities[i] - parities[j]) for i, j in pairs]
    return agg_group(group_disparity)


def statistical_parity_abs_diff(y: np.array, z: np.array, agg_group=np.sum, **kwargs) -> float:
    """
    Calculate the absolute value of the statistical parity difference between all groups inside a protected attribute.
    The protected attribute `z` can be binary or non-binary.
    Returned value is aggregated with `agg_group`.

    Thin single-attribute front end for `statistical_parity_abs_diff_multi`.

    Parameters
    ----------
    y: np.array
        Flattened binary array, can be the prediction or the truth label.
    z: np.array
        Flattened array of shape y, represents the protected attribute.
        Can represent non-binary protected attribute.
    agg_group: callable, optional
        Aggregation function for the group. Default is np.sum.
    **kwargs: dict
        Forwarded to `statistical_parity_abs_diff_multi`, e.g. `positive_label`
        (label considered as positive, default 1).

    Returns
    -------
    float
        The absolute value of the statistical parity difference.

    Raises
    ------
    ValueError
        If `z` has more than one dimension.
    """
    # np.ndim also works for plain sequences, which have no .ndim attribute
    if np.ndim(z) > 1:
        raise ValueError("z must be a 1D array")
    return statistical_parity_abs_diff_multi(y=y, z=z, agg_group=agg_group, **kwargs)


def statistical_parity_abs_diff_sum(y: np.array, z: np.array,
                                    **kwargs) -> float:
    """
    Calculate the sum of statistical parity absolute differences between all groups in a protected attribute.
    The protected attribute `z` can be binary or non-binary.

    Parameters
    ----------
    y: np.array
        Flattened binary array, can be the prediction or the truth label.
    z: np.array
        Flattened array of shape y, represents the protected attribute.
        Can represent non-binary protected attribute.
    **kwargs: dict
        Forwarded to `statistical_parity_abs_diff`, e.g. `positive_label`
        (label considered as positive, default 1).

    Returns
    -------
    float
        Sum of the absolute value of the statistical parity differences between all groups.
    """
    return statistical_parity_abs_diff(y=y, z=z, agg_group=np.sum, **kwargs)


def statistical_parity_abs_diff_mean(y: np.array, z: np.array,
                                     **kwargs) -> float:
    """
    Calculate the mean of statistical parity absolute differences between all groups in a protected attribute.
    The protected attribute `z` can be binary or non-binary.

    Parameters
    ----------
    y: np.array
        Flattened binary array, can be the prediction or the truth label.
    z: np.array
        Flattened array of shape y, represents the protected attribute.
        Can represent non-binary protected attribute.
    **kwargs: dict
        Forwarded to `statistical_parity_abs_diff`, e.g. `positive_label`
        (label considered as positive, default 1).

    Returns
    -------
    float
        Average of the absolute value of the statistical parity differences between all groups.
    """
    return statistical_parity_abs_diff(y=y, z=z, agg_group=np.mean, **kwargs)


def statistical_parity_abs_diff_max(y: np.array, z: np.array,
                                    **kwargs) -> float:
    """
    Calculate the maximum of statistical parity absolute differences between all groups in a protected attribute.
    The protected attribute `z` can be binary or non-binary.

    Parameters
    ----------
    y: np.array
        Flattened binary array, can be the prediction or the truth label.
    z: np.array
        Flattened array of shape y, represents the protected attribute.
        Can represent non-binary protected attribute.
    **kwargs: dict
        Forwarded to `statistical_parity_abs_diff`, e.g. `positive_label`
        (label considered as positive, default 1).

    Returns
    -------
    float
        Maximum of the absolute value of the statistical parity differences between all groups.
    """
    return statistical_parity_abs_diff(y=y, z=z, agg_group=np.max, **kwargs)


def statistical_parity_difference(y: np.array, z: np.array,
                                  positive_label=1, privileged_group=1, **kwargs) -> float:
    """
    Calculate the difference in statistical parity according to [1].
    The protected attribute `z` must be binary. Returned value can be negative.

    [1] A Maximal Correlation Framework for Fair Machine Learning (Lee et al. 2022) (https://arxiv.org/abs/2106.00051)

    Parameters
    ----------
    y: np.array
        Flattened binary array, can be the prediction or the truth label.
        Any array-like accepted by `np.asarray` works.
    z: np.array
        Flattened binary array of shape y, represents the protected attribute.
    positive_label: int, optional
        Label considered as positive. Default is 1.
    privileged_group: int, optional
        Label considered as privileged. Default is 1.
    **kwargs: dict
        Ignored; accepted so that all metrics share a common call signature.

    Returns
    -------
    float
        The difference in statistical parity between unprivileged and privileged groups
        (positive rate of the unprivileged group minus that of the privileged group).

    Raises
    ------
    ValueError
        If `z` has more than one dimension.
    """
    y = np.asarray(y)
    z = np.asarray(z)
    if z.ndim > 1:
        raise ValueError("z must be a 1D array")

    # normalise so that 1 always denotes the privileged group / positive label
    if privileged_group == 0:
        z = 1 - z
    if positive_label == 0:
        y = 1 - y

    y = y.astype(int)
    z = z.astype(int)
    unprivileged = 1 - z
    priv = np.sum(y & z) / np.sum(z)
    unpriv = np.sum(y & unprivileged) / np.sum(unprivileged)

    return unpriv - priv


def mean_difference(*args, **kwargs) -> float:
    """
    Alias for the `statistical_parity_difference` function.

    All positional and keyword arguments are passed through unchanged.

    Parameters
    ----------
    y: np.array
        Flattened binary array, can be the prediction or the truth label.
    z: np.array
        Flattened binary array of shape y, represents the protected attribute.
    positive_label: int, optional
        Label considered as positive. Default is 1.
    privileged_group: int, optional
        Label considered as privileged. Default is 1.

    Returns
    -------
    float
        The difference in statistical parity between unprivileged and privileged groups.
    """
    return statistical_parity_difference(*args, **kwargs)


def disparate_impact_ratio(y: np.array, z: np.array,
                           positive_label=1, privileged_group=1, **kwargs) -> float:
    """
    Calculate the Disparate Impact ratio.
    The protected attribute `z` must be binary.

    This function computes the ratio of probabilities of positive outcomes for
    the unprivileged group to the privileged group. A value of 1 indicates
    fairness, while a value < 1 indicates discrimination towards the
    unprivileged group. A value of > 1 would indicate discrimination towards
    the privileged group.

    Parameters
    ----------
    y: np.array
        Flattened binary array, can be the prediction or the truth label.
        Any array-like accepted by `np.asarray` works.
    z: np.array
        Flattened binary array of shape y, represents the protected attribute.
    positive_label: int, optional
        Label considered as positive. Default is 1.
    privileged_group: int, optional
        Label considered as privileged. Default is 1.
    **kwargs: dict
        Ignored; accepted so that all metrics share a common call signature.

    Returns
    -------
    float
        The Disparate Impact ratio. If the privileged group has no positive
        outcomes the ratio is undefined; a warning is issued and 1 is returned.
    """
    y = np.asarray(y)
    z = np.asarray(z)
    # normalise so that 1 always denotes the privileged group / positive label
    if privileged_group == 0:
        z = 1 - z
    if positive_label == 0:
        y = 1 - y

    y = y.astype(int)
    z = z.astype(int)
    unprivileged = 1 - z
    priv = np.sum(y & z) / np.sum(z)
    unpriv = np.sum(y & unprivileged) / np.sum(unprivileged)

    if priv == 0:
        # avoid dividing by a zero positive rate
        warnings.warn("Disparate impact cannot be calculated. y=1 and z=1 are not apparent in the dataset.")
        warnings.warn("Return 1 (fair).")

        return 1.0

    return unpriv / priv


def disparate_impact_ratio_objective(y: np.array, z: np.array,
                                     positive_label=1, privileged_group=1, **kwargs) -> float:
    """
    Calculate the objective Disparate Impact ratio.
    The protected attribute `z` must be binary.

    This function computes the absolute difference between 1 and the Disparate
    Impact ratio. It can be used as an objective function to minimize
    discrimination towards the unprivileged group (and the privileged group).
    Lower values indicate less discrimination.

    Parameters
    ----------
    y: np.array
        Flattened binary array, can be the prediction or the truth label.
    z: np.array
        Flattened binary array of shape y, represents the protected attribute.
    positive_label: int, optional
        Label considered as positive. Default is 1.
    privileged_group: int, optional
        Label considered as privileged. Default is 1.
    **kwargs: dict
        Forwarded to `disparate_impact_ratio`.

    Returns
    -------
    float
        The objective Disparate Impact ratio, `|1 - ratio|`.
    """
    ratio = disparate_impact_ratio(y, z,
                                   positive_label=positive_label,
                                   privileged_group=privileged_group,
                                   **kwargs)
    return np.abs(1 - ratio)


def disparate_impact_ratio_deviation(y: np.array, z: np.array,
                                     positive_label=1, privileged_group=1, **kwargs) -> float:
    """
    Calculate the signed deviation of the Disparate Impact ratio from 1.
    The protected attribute `z` must be binary.

    This function computes the difference between 1 and the Disparate Impact
    ratio. A value of 0 indicates fairness. A positive value indicates
    discrimination towards the unprivileged group. A negative value indicates
    discrimination towards the privileged group.

    Parameters
    ----------
    y: np.array
        Flattened binary array, can be the prediction or the truth label.
    z: np.array
        Flattened binary array of shape y, represents the protected attribute.
    positive_label: int, optional
        Label considered as positive. Default is 1.
    privileged_group: int, optional
        Label considered as privileged. Default is 1.
    **kwargs: dict
        Forwarded to `disparate_impact_ratio`.

    Returns
    -------
    float
        `1 - ratio`, where `ratio` is the Disparate Impact ratio.
    """
    ratio = disparate_impact_ratio(y, z,
                                   positive_label=positive_label,
                                   privileged_group=privileged_group,
                                   **kwargs)
    return 1 - ratio


def equal_opportunity_difference(y_true: np.array, y_pred: np.array, z: np.array,
                                 positive_label=1, privileged_group=1, **kwargs) -> float:
    """
    Compute the difference in Equality of Opportunity [1] between
    the privileged group and the unprivileged group.

    Equality of Opportunity [1] is a fairness metric
    that measures the difference in true positive rates between the privileged and unprivileged groups.
    This function returns a float representing that difference.
    A value of 0 indicates perfect fairness, positive values indicate bias
    against the unprivileged group, while negative values indicate
    bias against the privileged group.

    [1] Equality of Opportunity in Supervised Learning (Hardt, Price, Srebro, 2016) (https://arxiv.org/abs/1610.02413)

    Parameters
    ----------
    y_true: numpy.array
        The true binary labels as a flattened array.
        Any array-like accepted by `np.asarray` works.
    y_pred: numpy.array
        The predicted binary labels from the model.
        Should be of the same shape as y_true.
    z: numpy.array
        The protected attribute as a binary array.
        This array indicates the group (privileged or unprivileged) for each instance in the data.
        Should be of the same shape as y_true.
    positive_label: int, optional (default=1)
        The label considered as positive in the dataset.
    privileged_group: int, optional (default=1)
        The label that denotes the privileged group.
        If 0, the group labelled 0 in `z` is treated as the privileged group.
    **kwargs: dict
        Ignored; accepted so that all metrics share a common call signature.

    Returns
    -------
    float
        True positive rate of the privileged group minus that of the unprivileged group.
    """
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)
    z = np.asarray(z)
    # normalise so that 1 always denotes the privileged group / positive label
    if privileged_group == 0:
        z = 1 - z
    if positive_label == 0:
        y_true = 1 - y_true
        y_pred = 1 - y_pred

    y_true = y_true.astype(int)
    y_pred = y_pred.astype(int)
    z = z.astype(int)

    # actual positives per group, and those among them predicted positive
    priv_positives = y_true & z
    unpriv_positives = y_true & (1 - z)
    priv_tpr = np.sum(y_pred & priv_positives) / np.sum(priv_positives)
    unpriv_tpr = np.sum(y_pred & unpriv_positives) / np.sum(unpriv_positives)

    return priv_tpr - unpriv_tpr


def equal_opportunity_abs_diff(*args, **kwargs):
    """
    Compute the absolute difference in Equality of Opportunity [1].

    [1] Equality of Opportunity in Supervised Learning (Hardt, Price, Srebro, 2016) (https://arxiv.org/abs/1610.02413)

    Parameters
    ----------
    *args: arguments
        Variable length argument list to be passed to `equal_opportunity_difference` function.
    **kwargs: keyword arguments
        Arbitrary keyword arguments to be passed to `equal_opportunity_difference` function.

    Returns
    -------
    float
        The absolute difference in Equality of Opportunity between privileged and unprivileged groups.
    """
    signed_difference = equal_opportunity_difference(*args, **kwargs)
    return np.abs(signed_difference)


def predictive_equality_difference(y_true: np.array, y_pred: np.array, z: np.array,
                                   positive_label=1, privileged_group=1, **kwargs) -> float:
    """
    Calculate the difference in Predictive Equality.

    The value is the rate of positive predictions among actual negatives (false
    positive rate) of the privileged group minus that of the unprivileged group.

    Parameters
    ----------
    y_true: numpy.array
        True binary labels as a flattened array.
        Any array-like accepted by `np.asarray` works.
    y_pred: numpy.array
        Predicted binary labels as a flattened array. Must have same shape as y_true.
    z: numpy.array
        Binary array denoting privileged (1) or unprivileged (0) group. Same shape as y_true.
    positive_label: int, optional
        Label considered as positive, default is 1.
    privileged_group: int, optional
        Label representing the privileged group, default is 1.
    **kwargs: dict
        Ignored; accepted so that all metrics share a common call signature.

    Returns
    -------
    float
        The difference in Predictive Equality between privileged and unprivileged groups.

    """
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)
    z = np.asarray(z)
    # normalise so that 1 always denotes the privileged group / positive label
    if privileged_group == 0:
        z = 1 - z
    if positive_label == 0:
        y_true = 1 - y_true
        y_pred = 1 - y_pred

    y_true = y_true.astype(int)
    y_pred = y_pred.astype(int)
    z = z.astype(int)

    # actual negatives per group, and those among them predicted positive
    priv_negatives = (1 - y_true) & z
    unpriv_negatives = (1 - y_true) & (1 - z)
    priv_fpr = np.sum(y_pred & priv_negatives) / np.sum(priv_negatives)
    unpriv_fpr = np.sum(y_pred & unpriv_negatives) / np.sum(unpriv_negatives)

    return priv_fpr - unpriv_fpr


def predictive_equality_abs_diff(*args, **kwargs):
    """
    Compute the absolute difference in Predictive Equality.

    Parameters
    ----------
    *args: arguments
        Variable length argument list to be passed to `predictive_equality_difference` function.
    **kwargs: keyword arguments
        Arbitrary keyword arguments to be passed to `predictive_equality_difference` function.

    Returns
    -------
    float
        The absolute difference in Predictive Equality between privileged and unprivileged groups.
    """
    signed_difference = predictive_equality_difference(*args, **kwargs)
    return np.abs(signed_difference)


def average_odds_difference(y_true: np.array, y_pred: np.array, z: np.array,
                            positive_label=1, privileged_group=1, **kwargs) -> float:
    """
    Calculate the difference in Average Odds between privileged and unprivileged groups.

    The value is the mean of `equal_opportunity_difference` and
    `predictive_equality_difference` for the same inputs.

    [1] Equality of Opportunity in Supervised Learning (Hardt, Price, Srebro, 2016) (https://arxiv.org/abs/1610.02413)

    Parameters
    ----------
    y_true: numpy.array
        Flattened array of true binary labels.
    y_pred: numpy.array
        Flattened array of predicted binary labels. Must have same shape as y_true.
    z: numpy.array
        Binary array indicating privileged (1) or unprivileged (0) group. Same shape as y_true.
    positive_label: int, optional
        Label considered as positive, default is 1.
    privileged_group: int, optional
        Label denoting the privileged group, default is 1.
    **kwargs: dict
        Ignored; accepted so that all metrics share a common call signature.

    Returns
    -------
    float
        The difference in Average Odds between privileged and unprivileged groups.
    """
    # Do NOT invert y/z here: both helpers already apply `positive_label` and
    # `privileged_group`, and inverting twice would silently cancel the options.
    eod = equal_opportunity_difference(y_true, y_pred, z,
                                       positive_label=positive_label,
                                       privileged_group=privileged_group)
    ped = predictive_equality_difference(y_true, y_pred, z,
                                         positive_label=positive_label,
                                         privileged_group=privileged_group)

    return (eod + ped) / 2


def average_odds_error(y_true: np.array, y_pred: np.array, z: np.array,
                       positive_label=1, privileged_group=1, **kwargs) -> float:
    """
    Compute the Average Odds Error.
    Can be used as an objective function to minimize.

    The value is the mean of `equal_opportunity_abs_diff` and
    `predictive_equality_abs_diff` for the same inputs.

    Parameters
    ----------
    y_true: numpy.array
        Flattened array of true binary labels.
    y_pred: numpy.array
        Flattened array of predicted binary labels. Must have same shape as y_true.
    z: numpy.array
        Binary array indicating privileged (1) or unprivileged (0) group. Same shape as y_true.
    positive_label: int, optional
        Label considered as positive, default is 1.
    privileged_group: int, optional
        Label denoting the privileged group, default is 1.
    **kwargs: dict
        Ignored; accepted so that all metrics share a common call signature.

    Returns
    -------
    float
        The Average Odds Error between privileged and unprivileged groups.
    """
    # The helpers apply `positive_label` / `privileged_group` themselves, so the
    # inputs are passed through untouched instead of being inverted a second time.
    eod = equal_opportunity_abs_diff(y_true, y_pred, z,
                                     positive_label=positive_label,
                                     privileged_group=privileged_group)
    ped = predictive_equality_abs_diff(y_true, y_pred, z,
                                       positive_label=positive_label,
                                       privileged_group=privileged_group)

    return (eod + ped) / 2