# Source code for fairdo.utils.math

"""
Mixed math functions used throughout the package.

References
----------
.. [1] Shannon, C. E. (1948). A mathematical theory of communication. Bell system technical journal, 27(3), 379-423.
"""

import numpy as np


def entropy_estimate_cat(x: np.array, **kwargs) -> float:
    """Calculate the entropy [1]_ of a categorical variable.

    It is calculated as:

    .. math::
        H(X) = - \\sum_{i=1}^{n} p(X_i) \\log_2 p(X_i)

    where :math:`p(X_i)` is the relative frequency of the i-th category in ``x``.
    The entropy is a measure of the information/uncertainty of a random variable.
    Higher values indicate more information/uncertainty.

    Parameters
    ----------
    x : np.array (n_samples,)
        One-dimensional array (or array-like) containing the categorical labels.
        Labels are counted by value, so negative integers and widely spaced
        labels are handled without allocating one bin per integer.
    **kwargs
        Accepted for call-signature compatibility; not used by this function.

    Returns
    -------
    float
        The entropy (in bits) of the label distribution. An empty input yields ``0.0``.

    Raises
    ------
    ValueError
        If ``x`` is not one-dimensional.

    Examples
    --------
    >>> import numpy as np
    >>> from fairdo.utils.math import entropy_estimate_cat
    >>> x = np.array([0, 1, 1, 0, 1, 0, 0, 1])
    >>> entropy_estimate_cat(x)
    1.0
    """
    labels = np.asarray(x)
    if labels.ndim != 1:
        raise ValueError(
            f"x must be one-dimensional, got an array with {labels.ndim} dimensions"
        )
    if labels.size == 0:
        # No observations: avoid dividing by zero and report zero entropy.
        return 0.0

    # np.unique only reports categories that occur, so every count is > 0 and
    # log2 never sees a zero probability.
    _, counts = np.unique(labels, return_counts=True)
    prob_dist = counts / counts.sum()

    # Adding 0.0 turns the IEEE negative zero produced by negating a zero sum
    # into a regular 0.0.
    return float(-np.sum(prob_dist * np.log2(prob_dist))) + 0.0


def joint_entropy_cat(x: np.array) -> float:
    """Calculate the joint entropy [1]_ of multiple categorical variables.

    The joint entropy is a measure of the information/surprise/uncertainty of a set of random variables.
    Let :math:`X = (X_1, X_2, \\ldots, X_m)` be a set of categorical variables, i.e.,
    multivariate random variable, then the joint entropy is calculated as:

    .. math::
        H(X) = -\\sum_{x_1 \\in\\mathcal X_1} \\ldots \\sum_{x_m \\in\\mathcal X_m} P(x_1, ..., x_m)
        \\log_2[P(x_1, ..., x_m)]

    The joint probabilities are estimated as the relative frequency of each
    distinct row of ``x``.

    Parameters
    ----------
    x : np.array (n_samples, n_variables) or (n_samples,)
        Array containing the labels as numerical values. Each row is one sample.
        A one-dimensional array is treated as a single variable.

    Returns
    -------
    float
        The joint entropy (in bits) of the categorical variables in the array ``x``.
        An empty input yields ``0.0``.

    Examples
    --------
    Two samples (rows) that are identical carry no uncertainty:

    >>> import numpy as np
    >>> from fairdo.utils.math import joint_entropy_cat
    >>> x = np.array([[0, 1, 1, 0, 1, 0, 0, 1],
    ...               [0, 1, 1, 0, 1, 0, 0, 1]])
    >>> joint_entropy_cat(x)
    0.0
    """
    samples = np.asarray(x)
    if samples.size == 0:
        # No observations: avoid dividing by zero and report zero entropy.
        return 0.0

    # Count distinct rows directly instead of encoding every row as a flat
    # index: the flat encoding needs prod(max + 1) bins, which overflows or
    # exhausts memory with many variables and rejects negative labels.
    if samples.ndim > 1:
        _, counts = np.unique(samples, axis=0, return_counts=True)
    else:
        _, counts = np.unique(samples, return_counts=True)
    prob_dist = counts / counts.sum()

    # Adding 0.0 turns the IEEE negative zero produced by negating a zero sum
    # into a regular 0.0.
    return float(-np.sum(prob_dist * np.log2(prob_dist))) + 0.0


def conditional_entropy_cat(x: np.array, y: np.array) -> float:
    """
    Calculate the conditional entropy [1]_ of a categorical variable ``x`` given another categorical variable ``y``,
    i.e.,

    .. math::
        H(X|Y) = H(X, Y) - H(Y)

    where :math:`H(X, Y)` is the joint entropy of the categorical variables ``x`` and ``y``
    and :math:`H(Y)` is the entropy of the variable ``y``.

    Parameters
    ----------
    x : np.array (n_samples,)
        Array of shape (n_samples,) containing the labels.

    y : np.array (n_samples,) or (n_samples, n_variables)
        Array containing the labels. Can represent a single or multiple categorical variables.

    Returns
    -------
    float
        The conditional entropy of ``x`` given ``y``. Tiny negative values that
        arise from floating-point rounding are reported as ``0.0``.

    Examples
    --------
    >>> import numpy as np
    >>> from fairdo.utils.math import conditional_entropy_cat
    >>> x = np.array([0, 1, 1, 0, 1, 0, 0, 1])
    >>> y = np.array([0, 1, 1, 0, 1, 0, 0, 1])
    >>> conditional_entropy_cat(x, y)
    0.0
    """
    # Put x next to y so that every row is one joint observation (x, y).
    joint_samples = np.column_stack((x, y))

    # H(X, Y): entropy of the joint distribution.
    h_xy = joint_entropy_cat(joint_samples)

    # H(Y): entropy of the conditioning variable(s).
    h_y = joint_entropy_cat(y)

    # H(X|Y) = H(X, Y) - H(Y). The two entropies are summed over differently
    # ordered terms, so the difference can land marginally below zero; report
    # that as 0.0. A NaN fails the comparison and is returned unchanged, and
    # adding 0.0 normalises a negative zero.
    h_x_given_y = float(h_xy - h_y)
    return 0.0 if h_x_given_y < 0.0 else h_x_given_y + 0.0