Source code for fairdo.metrics.dependence

"""
This module contains functions to calculate the dependency, correlation, association or any other relationship
between two variables.

In the fairness context, the dependency between the target variable and the protected attribute(s) is of interest.
Let :math:`y` be the target variable and :math:`z` be the protected attribute(s), then some kind of relationship between
these two variables is calculated using a function :math:`f`: :math:`f(y, z)`.
"""

import numpy as np
from scipy.stats import rankdata
from sklearn.metrics import mutual_info_score, normalized_mutual_info_score

# fairdo
from fairdo.utils.math import entropy_estimate_cat, joint_entropy_cat, conditional_entropy_cat

import warnings


[docs]def dependency_multi(y: np.array, z: np.array, dependency_function=normalized_mutual_info_score, agg=np.max, positive_label=1, **kwargs) -> float: """ Calculates the dependency between ``y`` with each ``z[:,i]`` using the specified ``dependency_function``. Aggregates the dependency scores using the ``agg`` function. Let :math:`f` be the dependency function, :math:`y` be the target variable, and :math:`z` be the protected attributes, then the dependency score is calculated as (pythonic notation): .. math:: \\text{dependency}(y, z) = \\text{agg}(f(y, z[:,0]), f(y, z[:,1]), \\ldots, f(y, z[:,-1])) Parameters ---------- y : np.array, shape (n_samples,) Flattened binary array of shape (n_samples,), can be a prediction or the truth label. z : np.array, shape (n_samples, n_protected_attributes) Array of shape (n_samples, n_protected_attributes), represents the protected attributes. dependency_function : callable, optional Function to compute the dependency between `y` and each protected attribute. Default is normalized_mutual_info_score. agg : callable, optional Aggregation function to combine the dependency scores. Default is np.max. positive_label : int, optional Label considered as positive. Default is 1. **kwargs Additional keyword arguments. These are not currently used. Returns ------- float The aggregated dependency score. Examples -------- >>> import numpy as np >>> from fairdo.metrics.dependence import dependency_multi >>> y = np.random.randint(0, 2, (10,)) >>> z = np.random.randint(0, 2, (10, 3)) >>> dependency_multi(y, z) 0.12634639359704877 """ # check input if len(z.shape) == 1: z = z.reshape(-1, 1) if len(z.shape) > 2: raise ValueError('z must be a 2D or 1D array') # invert privileged and positive label if required if positive_label == 0: y = 1 - y y = y.astype(int) z = z.astype(int) # get normalized mutual information for each attribute scores = [] for i in range(z.shape[1]): scores.append(dependency_function(y, z[:, i])) return agg(scores)
[docs]def nmi_multi(y: np.array, z: np.array, agg=np.max, positive_label=1, **kwargs): """ Compute the normalized mutual information [11]_ for multiple non-binary protected attributes. This function calculates the normalized mutual information between ``y`` and each protected attribute in ``z``, and then aggregates these scores using the specified ``agg`` function. Let :math:`y` be the target variable and :math:`z` be the protected attributes, then the normalized mutual information score for multiple protected attributes is calculated as (pythonic notation): .. math:: \\text{NMI}(y, z) = \\text{agg}(\\text{NMI}(y, z[:,0]), \\text{NMI}(y, z[:,1]), \\ldots, \\text{NMI}(y, z[:,-1])) Parameters ---------- y : np.array, shape (n_samples,) Flattened binary array, can be a prediction or the truth label. z : np.array, shape (n_samples, n_protected_attributes) Each ``z[:,i]`` represents a single protected attribute. agg : callable, optional Aggregation function to combine the normalized mutual information scores. Default is np.max. positive_label : int, optional Label considered as positive. Default is 1. **kwargs Additional keyword arguments. These are not currently used. Returns ------- float The aggregated normalized mutual information score. References ---------- .. [11] Strehl, A., Ghosh, J., & Mooney, R. J. (2002). Impact of similarity measures on web-page clustering. In Workshop on The WebKDD (pp. 58-64). Examples -------- >>> import numpy as np >>> from fairdo.metrics.dependence import nmi_multi >>> y = np.random.randint(0, 2, (10,)) >>> z = np.random.randint(0, 2, (10, 3)) >>> nmi_multi(y, z) 0.09855890449799566 """ return dependency_multi(y, z, dependency_function=normalized_mutual_info_score, agg=agg, positive_label=positive_label)
[docs]def nmi(y: np.array, z: np.array, **kwargs) -> float: """ Calculate the normalized mutual information between two arrays. The protected attribute `z` can be binary or non-binary. Normalized mutual information is a normalization of the Mutual Information (MI) score to scale the results between 0 (no mutual information, independent variables) and 1 (perfect correlation). The function handles any warning by ignoring them. The formula is given by: .. math:: \\text{NMI}(Y, Z) = \\frac{2 \\cdot I(Y, Z)}{H(Y) + H(Z)} where :math:`I(Y, Z)` is the mutual information between `Y` and `Z`, and :math:`H(Y)` and :math:`H(Z)` are the entropies of :math:`Y` and :math:`Z`, respectively. Parameters ---------- y : np.array, shape (n_samples,) Flattened array, can be a prediction or the truth label. Discrete values. z : np.array, shape (n_samples,) Flattened array of the same shape as y. Discrete values. **kwargs Additional keyword arguments. These are not currently used. Returns ------- float The normalized mutual information between y and z. Examples -------- >>> import numpy as np >>> from fairdo.metrics.dependence import nmi >>> y = np.array([0, 1, 1, 0, 1, 0, 0, 1]) >>> z = np.array([0, 1, 1, 0, 1, 0, 0, 1]) >>> nmi(y, z) 1.0 """ with warnings.catch_warnings(): warnings.simplefilter("ignore") return normalized_mutual_info_score(y, z)
[docs]def mi(y: np.array, z: np.array, bins=2, **kwargs) -> float: """ Calculate the mutual information [12]_ between two arrays. The protected attribute ``z`` can be binary or non-binary. Mutual information is a measure of the mutual dependence between two variables. It quantifies the "amount of information" (in units such as bits) obtained about one random variable, by observing the other random variable. Higher values indicate a higher dependency between the two variables. It is defined as: .. math:: I(Y, Z) = \\sum_{Y, Z} p(Y, Z) \\log \\left(\\frac{p(Y, Z)}{p(Y) \\cdot p(Z)}\\right) where :math:`p(Y, Z)` is the joint probability distribution of :math:`Y` and :math:`Z`, and :math:`p(Y)` and :math:`p(Z)` are the respective marginal probability distributions. Parameters ---------- y : np.array (n_samples,) Flattened array, can be a prediction or the truth label. Discrete values. z : np.array (n_samples,) Flattened array of the same shape as y. Discrete values. bins : int, optional Number of bins for discretization. Default is 2. **kwargs Additional keyword arguments. These are not currently used. Returns ------- float The mutual information between y and z. References ---------- .. [12] Cover, T. M., & Thomas, J. A. (2006). Elements of information theory. John Wiley & Sons. Examples -------- >>> import numpy as np >>> from fairdo.metrics.dependence import mi >>> y = np.random.randint(0, 2, (10,)) >>> z = np.random.randint(0, 2, (10,)) >>> mi(y, z) 0.013844293808390806 """ mi_value = mutual_info_score(y, z) return mi_value
[docs]def total_correlation(*arrays) -> float: """Calculate the total correlation (multi-information) of multiple categorical variables [13]_ [4]_. Given a set of :math:`m` categorical variables :math:`X = (X_1, X_2, \\ldots, X_m)`, the total correlation is: .. math:: TC(X) = TC(X_1, X_2, \\ldots, X_m) = \\left(\\sum_{i=1}^{m} H(X_i)\\right) - H(X_1, X_2, \\ldots, X_m) where :math:`H(X_i)` is the entropy of the i-th variable and :math:`H(X_1, X_2, \\ldots, X_m)` is the joint entropy. Parameters ---------- *arrays: np.array Arrays of shape (n_samples,) containing the labels. Returns ------- float The total correlation of the categorical variables. References ---------- .. [13] Watanabe, S. (1960). Information theoretical analysis of multivariate correlation. IBM Journal of Research and Development, 4(1), 66-82. .. [4] Garner, W. R. (1962). Uncertainty and Structure as Psychological Concepts, JohnWiley & Sons, New York Examples -------- >>> import numpy as np >>> from fairdo.metrics.dependence import total_correlation >>> x = np.array([0, 1, 1, 0, 1, 0, 0, 1]) >>> y = 1 - x >>> total_correlation(x, y) 1.0 """ # Calculate sum of individual entropies try: sum_individual_entropies = sum([entropy_estimate_cat(arr) for arr in arrays]) except: warnings.warn("Calculate total correlation along axis=1.") sum_individual_entropies = 0 for i in range(len(arrays)): if len(arrays[i].shape) == 1: sum_individual_entropies += entropy_estimate_cat(arrays[i]) elif len(arrays[i].shape) == 2: sum_individual_entropies += sum( entropy_estimate_cat(arrays[i][:, j]) for j in range(arrays[i].shape[1])) else: raise Exception(f"{i}-th Parameter is not 1d or 2d.") # Calculate joint entropy joint_entropy = joint_entropy_cat(np.column_stack(arrays)) # Total correlation is the sum of individual entropies minus the joint entropy tc = sum_individual_entropies - joint_entropy return tc
[docs]def dual_total_correlation(*arrays): """Calculate the dual total correlation [5]_ for more than two variables. Given a set of :math:`m` categorical variables :math:`X = (X_1, X_2, \\ldots, X_m)`, it is given by: .. math:: DTC(X) = DTC(X_1, X_2, \\ldots, X_m) = (X_1, X_2, \\ldots, X_m) - \\sum_{i=1}^{m} H(X_i | X_1, X_2, \\ldots, X_{i-1}, X_{i+1}, \\ldots, X_m) where :math:`H(X_1, X_2, \\ldots, X_m)` is the joint entropy and :math:`H(X_i | X_1, X_2, \\ldots, X_{i-1}, X_{i+1}, \\ldots, X_m)` is the conditional entropy of :math:`X_i` given all other variables. Parameters ---------- *arrays: np.array Arrays of shape (n_samples,) containing the labels. Returns ------- float The dual total correlation of the categorical variables. References ---------- .. [5] Han, Te Sun. (1978). Nonnegative entropy measures of multivariate symmetric correlations. Information and Control. 36 (2): 133–156. Examples -------- >>> import numpy as np >>> from fairdo.metrics.dependence import dual_total_correlation >>> x = np.array([0, 1, 1, 0, 1, 0, 0, 1]) >>> y = 1 - x >>> dual_total_correlation(x, y) 1.0 """ n = len(arrays) if n < 2: raise ValueError("Need at least two variables to calculate dual total correlation.") xs = np.column_stack(arrays) # Calculate joint entropy joint_entropy = joint_entropy_cat(xs) # Calculate conditional entropy conditional_entropy = 0 for i in range(xs.shape[1]): H_i = conditional_entropy_cat(xs[:, i], xs[:, np.arange(n) != i]) conditional_entropy += H_i dtc = joint_entropy - conditional_entropy return dtc
[docs]def o_information(*arrays): """ Calculate the O-information [6]_ of multiple categorical variables. The O-information is the difference between the total correlation and the dual total correlation: .. math:: O(X_1, X_2, \\ldots, X_m) = TC(X_1, X_2, \\ldots, X_m) - DTC(X_1, X_2, \\ldots, X_m) where :math:`TC` is the total correlation and :math:`DTC` is the dual total correlation. Parameters ---------- *arrays : np.array Arrays of shape (n_samples,) containing the labels. Returns ------- float The O-information of the categorical variables. References ---------- .. [6] Han, Te Sun. (1978). Nonnegative entropy measures of multivariate symmetric correlations. Information and Control. 36 (2): 133–156. Examples -------- >>> import numpy as np >>> from fairdo.metrics.dependence import o_information >>> x = np.array([0, 1, 1, 0, 1, 0, 0, 1]) >>> y = 1 - x >>> o_information(x, y) 0.0 """ return total_correlation(*arrays) - dual_total_correlation(*arrays)
[docs]def pearsonr(y: np.array, z: np.array, **kwargs) -> float: """ Calculate the Pearson correlation coefficient between two arrays. The protected attribute `z` can be binary or non-binary. It is given by: .. math:: \\text{Pearson}(Y, Z) = \\frac{\\text{cov}(Y, Z)}{\\sigma_Y \\cdot \\sigma_Z} where :math:`\\text{cov}(Y, Z)` is the covariance between :math:`Y` and :math:`Z`, and :math:`\\sigma_Y` and :math:`\\sigma_Z` are the respective standard deviations. Parameters ---------- y : np.array Flattened array, can be a prediction or the truth label. z : np.array Flattened array of the same shape as y. **kwargs Additional keyword arguments. These are not currently used. Returns ------- float The Pearson correlation coefficient between ``y`` and ``z``. Notes ----- The Pearson correlation coefficient measures the linear relationship between two variables. The calculation of the Pearson correlation coefficient is not affected by scaling, and it ranges from -1 to 1. A value of 1 implies a perfect positive correlation, while a value of -1 implies a perfect negative correlation. Examples -------- >>> import numpy as np >>> from fairdo.metrics.dependence import pearsonr >>> y = np.array([0, 1, 1, 0, 1, 0, 0, 1]) >>> z = 1 - y >>> pearsonr(y, z) -0.9999999999999998 """ return np.corrcoef(y.reshape(1, -1), z.reshape(1, -1))[0, 1]
[docs]def pearsonr_abs(y: np.array, z: np.array, **kwargs) -> float: """ Calculate the absolute value of the Pearson correlation coefficient between two arrays. The protected attribute `z` can be binary or non-binary. It is given by: .. math:: \\text{Pearson}(Y, Z) = \\left|\\frac{\\text{cov}(Y, Z)}{\\sigma_Y \\cdot \\sigma_Z}\\right| where :math:`\\text{cov}(Y, Z)` is the covariance between :math:`Y` and :math:`Z`, and :math:`\\sigma_Y` and :math:`\\sigma_Z` are the respective standard deviations. Parameters ---------- y : np.array Flattened array, can be a prediction or the truth label. z : np.array Flattened array of the same shape as y. **kwargs Additional keyword arguments. These are not currently used. Returns ------- float The absolute value of the Pearson correlation coefficient between ``y`` and ``z``. Examples -------- >>> import numpy as np >>> from fairdo.metrics.dependence import pearsonr_abs >>> y = np.array([0, 1, 1, 0, 1, 0, 0, 1]) >>> z = 1 - y >>> pearsonr_abs(y, z) 0.9999999999999998 """ return np.abs(pearsonr(y, z))
[docs]def rdc(y: np.array, z: np.array, f=np.sin, k=20, s=1 / 6., n=1, **kwargs): """ The Randomized Dependence Coefficient by David Lopez-Paz, Philipp Hennig, Bernhard Schoelkopf [7]_. According to the paper, the coefficient should be relatively insensitive to the settings of the f, k, and s parameters. Parameters ---------- y : np.array (n_samples,) or (n_samples, n_variables) z : np.array (n_samples,) or (n_samples, n_variables) f : callable function to use for random projection k : int number of random projections to use s : numeric scale parameter n : int number of times to compute the RDC and return the median (for stability) Returns ------- float RDC between ``y`` and ``z``. Notes ----- Implementation by Gary Doran and taken from: https://github.com/garydoranjr/rdc References ---------- .. [7] David Lopez-Paz, Philipp Hennig, Bernhard Schoelkopf. (2013). The Randomized Dependence Coefficient. In Advances in Neural Information Processing Systems 26 (NIPS 2013). http://papers.nips.cc/paper/5138-the-randomized-dependence-coefficient.pdf Examples -------- >>> import numpy as np >>> from fairdo.metrics.dependence import rdc >>> y = np.random.rand(100) >>> z = np.random.rand(100) >>> rdc(y, z) 0.287647809294975 """ if n > 1: values = [] for i in range(n): try: values.append(rdc(y, z, f, k, s, 1)) except np.linalg.linalg.LinAlgError: pass return np.median(values) if len(y.shape) == 1: y = y.reshape((-1, 1)) if len(z.shape) == 1: z = z.reshape((-1, 1)) # Copula Transformation cx = np.column_stack([rankdata(xc, method='ordinal') for xc in y.T]) / float(y.size) cy = np.column_stack([rankdata(yc, method='ordinal') for yc in z.T]) / float(z.size) # Add a vector of ones so that w.x + b is just a dot product O = np.ones(cx.shape[0]) X = np.column_stack([cx, O]) Y = np.column_stack([cy, O]) # Random linear projections Rx = (s / X.shape[1]) * np.random.randn(X.shape[1], k) Ry = (s / Y.shape[1]) * np.random.randn(Y.shape[1], k) X = np.dot(X, Rx) Y = np.dot(Y, Ry) # Apply non-linear function to random projections fX = f(X) fY = f(Y) # Compute full covariance matrix C = np.cov(np.hstack([fX, fY]).T) # Due to numerical issues, if k is too large, # then rank(fX) < k or rank(fY) < k, so we need # to find the largest k such that the eigenvalues # (canonical correlations) are real-valued k0 = k lb = 1 ub = k while True: # Compute canonical correlations Cxx = C[:k, :k] Cyy = C[k0:k0 + k, k0:k0 + k] Cxy = C[:k, k0:k0 + k] Cyx = C[k0:k0 + k, :k] eigs = np.linalg.eigvals(np.dot(np.dot(np.linalg.pinv(Cxx), Cxy), np.dot(np.linalg.pinv(Cyy), Cyx))) # Binary search if k is too large if not (np.all(np.isreal(eigs)) and 0 <= np.min(eigs) and np.max(eigs) <= 1): ub -= 1 k = (ub + lb) // 2 continue if lb == ub: break lb = k if ub == lb + 1: k = ub else: k = (ub + lb) // 2 return np.sqrt(np.max(eigs))