Source code for infovar.stats.ranking

from typing import Optional, Union

import numpy as np
from tqdm import tqdm

from scipy import stats, integrate

__all__ = [
    "prob_higher"
]


[docs]
def prob_higher(
    mus: np.ndarray,
    sigmas: np.ndarray,
    idx: Optional[int]=None,
    approx: bool=True,
    pbar: bool=False,
) -> Union[np.ndarray, float]:
    """
    Returns the probability of a given estimation (described by an estimated value and a standard deviation) to be the highest among all provided estimations.
    The argument `idx` specifies the index of the estimation whose probability to be the highest has to be computed. If None, returns the probability for every provided estimation. 
    Source: https://stats.stackexchange.com/questions/44139/what-is-px-1x-2-x-1x-3-x-1x-n

    Parameters
    ----------
    mus : np.ndarray
        Estimates.
    sigmas : np.ndarray
        Uncertainty of estimates (1 sigma).
    idx : Optional[int], optional
        _description_, by default None
    approx : bool, optional
        If True, neglects estimates above three sigma. Default: True.
    pbar : bool, optional
        If True, displays a progress bar. Default: False

    Returns
    -------
    Union[np.ndarray, float]
        If `idx` is an integer, probability of the i-th estimate to be the highest. If `idx` is None, array of probability for each estimate.
    """
    if not isinstance(mus, np.ndarray):
        mus = np.array(mus)
    if not isinstance(sigmas, np.ndarray):
        sigmas = np.array(sigmas)

    assert mus.shape == sigmas.shape
    n_vars = mus.size

    def integrand(t: np.ndarray, i: int) -> np.ndarray:
        res = stats.norm.logpdf(t, loc=mus[i], scale=sigmas[i])
        for j in range(n_vars):
            if j != i:
                res += stats.norm.logcdf(t, loc=mus[j], scale=sigmas[j])
        return np.exp(res)
    # We restrict to a bounded interval to prevent large integration errors due to the small domain where the integrand is not zero
    n_sigma = 5

    # In approx mode, only the probabilities of line combinations whose +/- n_sigma_approx intervals overlap are calculated
    if approx:
        i_higher = np.argmax(mus)
        n_sigma_approx = 1.5
        significant = (mus + n_sigma_approx*sigmas) > (mus[i_higher] - n_sigma_approx*sigmas[i_higher])
    else:
        significant = np.ones_like(mus, dtype=bool)

    # If the user wants to compute all probabilities
    if idx is None:
        probs = np.zeros(n_vars)# * np.nan
        _iterable = tqdm(range(n_vars)) if pbar else range(n_vars)
        for i in _iterable:
            if  significant[i]:
                bounds = (mus[i]-n_sigma*sigmas[i], mus[i]+n_sigma*sigmas[i])
                fun = lambda t: integrand(t, i)
                p = integrate.quad(fun, *bounds)[0]
                probs[i] = p
        return probs / np.sum(probs)
    
    # If the user has asked for a particular probability
    bounds = (mus[idx]-n_sigma*sigmas[idx], mus[idx]+n_sigma*sigmas[idx])
    fun = lambda t: integrand(t, idx)
    return integrate.quad(fun, *bounds)[0]