"""
Miscellaneous mathematical helper functions used throughout the package.
References
----------
.. [1] Shannon, C. E. (1948). A mathematical theory of communication. Bell system technical journal, 27(3), 379-423.
"""
import numpy as np
def entropy_estimate_cat(x: np.ndarray, **kwargs) -> float:
    """Calculate the entropy [1]_ of a categorical variable.

    It is calculated as:

    .. math::
        H(X) = - \\sum_{i=1}^{n} p(X_i) \\log_2 p(X_i)

    where :math:`p(X_i)` is the probability of the i-th category. The entropy
    is a measure of the information/uncertainty of a random variable. Higher
    values indicate more information/uncertainty.

    Parameters
    ----------
    x : np.ndarray (n_samples,)
        Array of shape (n_samples,) containing the categorical labels.
        Categories are counted by value, so the labels do not have to be
        non-negative integers (negative, float-typed or string labels work too).
    **kwargs
        Accepted so the function can be called with extra keyword arguments;
        they are not used.

    Returns
    -------
    float
        The entropy (in bits) of the label distribution, as a NumPy floating
        point scalar. An empty array yields ``0.0``.

    Examples
    --------
    >>> import numpy as np
    >>> from fairdo.utils.math import entropy_estimate_cat
    >>> x = np.array([0, 1, 1, 0, 1, 0, 0, 1])
    >>> float(entropy_estimate_cat(x))
    1.0
    """
    # Count occurrences per distinct label. Unlike ``np.bincount`` this does
    # not require non-negative integer labels, and every returned count is
    # strictly positive, so no zero-probability entries reach ``log2``.
    _, counts = np.unique(np.asarray(x), return_counts=True)
    prob_dist = counts / counts.sum()
    # ``0.0 - s`` instead of ``-s``: identical for non-zero sums, but a
    # zero-entropy distribution gives ``0.0`` rather than ``-0.0``.
    return 0.0 - np.sum(prob_dist * np.log2(prob_dist))
def joint_entropy_cat(x: np.ndarray) -> float:
    """Calculate the joint entropy [1]_ of multiple categorical variables.

    The joint entropy is a measure of the information/surprise/uncertainty of
    a set of random variables.
    Let :math:`X = (X_1, X_2, \\ldots, X_m)` be a set of categorical variables,
    i.e., a multivariate random variable, then the joint entropy is calculated
    as:

    .. math::
        H(X) = -\\sum_{x_1 \\in\\mathcal X_1} \\ldots \\sum_{x_m \\in\\mathcal X_m} P(x_1, ..., x_m)
        \\log_2[P(x_1, ..., x_m)]

    Parameters
    ----------
    x : np.ndarray (n_samples, n_variables) or (n_samples,)
        Array containing the labels. For a 2-D array every row is one joint
        observation of all variables; a 1-D array is treated as a single
        variable. Observations are counted by value, so labels need not be
        small non-negative integers.

    Returns
    -------
    float
        The joint entropy (in bits) of the categorical variables in the array
        ``x``, as a NumPy floating point scalar.

    Examples
    --------
    >>> import numpy as np
    >>> from fairdo.utils.math import joint_entropy_cat
    >>> x = np.array([[0, 0],
    ...               [0, 1],
    ...               [1, 0],
    ...               [1, 1]])
    >>> float(joint_entropy_cat(x))
    2.0
    """
    x = np.asarray(x)
    # Count distinct rows directly instead of encoding each row as one flat
    # index and histogramming it: the flat-index approach needs a histogram
    # with prod(max + 1) bins, which explodes for sparse or high-cardinality
    # labels and rejects negative or non-integer values.
    axis = 0 if x.ndim > 1 else None
    _, counts = np.unique(x, axis=axis, return_counts=True)
    prob_dist = counts / counts.sum()
    # ``0.0 - s`` instead of ``-s`` so a zero-entropy distribution gives
    # ``0.0`` rather than ``-0.0``; non-zero results are unchanged.
    return 0.0 - np.sum(prob_dist * np.log2(prob_dist))
def conditional_entropy_cat(x: np.ndarray, y: np.ndarray) -> float:
    """
    Calculate the conditional entropy [1]_ of a categorical variable ``x``
    given another categorical variable ``y``, i.e.,

    .. math::
        H(X|Y) = H(X, Y) - H(Y)

    where :math:`H(X, Y)` is the joint entropy of the categorical variables
    ``x`` and ``y`` and :math:`H(Y)` is the entropy of the variable ``y``.

    Parameters
    ----------
    x : np.ndarray (n_samples,)
        Array of shape (n_samples,) containing the labels.
    y : np.ndarray (n_samples,) or (n_samples, n_variables)
        Array containing the labels. Can represent a single or multiple
        categorical variables.

    Returns
    -------
    float
        The conditional entropy :math:`H(X|Y)` of the label distribution. The
        result is clamped at zero, so it is never negative.

    Examples
    --------
    >>> import numpy as np
    >>> from fairdo.utils.math import conditional_entropy_cat
    >>> x = np.array([0, 1, 1, 0, 1, 0, 0, 1])
    >>> y = np.array([0, 1, 1, 0, 1, 0, 0, 1])
    >>> float(conditional_entropy_cat(x, y))
    0.0
    """
    # Put x next to the column(s) of y so that every row is one joint
    # observation of (X, Y).
    xy = np.column_stack((x, y))

    # Entropy of the joint distribution H(X, Y)
    h_xy = joint_entropy_cat(xy)

    # Entropy of Y (y may itself hold several variables)
    h_y = joint_entropy_cat(y)

    # H(X|Y) = H(X, Y) - H(Y). Conditional entropy is non-negative, but the
    # two terms are rounded independently, so their difference can come out
    # marginally below zero; clamp that rounding noise away.
    return np.maximum(h_xy - h_y, 0.0)