Source code for infovar.stats.resampling

from abc import ABC, abstractmethod
from math import floor
from random import shuffle

import numpy as np

from .statistics import Statistic

__all__ = [
    "Resampling",
    "Bootstrapping",
    "Subsampling"
]



[docs]
class Resampling(ABC):


[docs]
    @abstractmethod
    def compute_sigma(
        self,
        variables: np.ndarray,
        targets: np.ndarray,
        stat: Statistic,
        **kwargs
    ) -> float:
        """
        Estimates the standard deviation of the estimator `stat`.

        Parameters
        ----------
        variables : np.ndarray
            Variable data. Must be a 2D array.
        targets : np.ndarray
            Target data. Must be a 2D array with the same number of rows than `variables`.
        stat : Statistic
            Estimator whose variance is to be estimated.

        Returns
        -------
        float
            Estimate of estimator standard deviation.
        """
        pass




[docs]
class Bootstrapping(Resampling):


[docs]
    def compute_sigma(
        self,
        variables: np.ndarray,
        targets: np.ndarray,
        stat: Statistic,
        n: int=10,
    ) -> float:
        """
        Estimates the standard deviation of the estimator `stat` using by bootstrap. This method permits to estimate the variance of an estimator for a given data distribution. It consists in creating new datasets from the same distribution by drawing with replacement samples from existing data.

        Parameters
        ----------
        variables : np.ndarray
            Variable data. Must be a 2D array.
        targets : np.ndarray
            Target data. Must be a 2D array with the same number of rows than `variables`.
        stat : Statistic
            Estimator whose variance is to be estimated.
        n : int, optional
            Number of bootstrap samples, by default 10

        Returns
        -------
        float
            Estimate of estimator standard deviation.
        """
        assert isinstance(variables, np.ndarray) and variables.ndim == 2
        assert isinstance(targets, np.ndarray) and targets.ndim == 2
        assert variables.shape[0] == targets.shape[0]
        assert isinstance(stat, Statistic)

        if n < 2:
            raise ValueError(f"Number of different subset size `n` must be greater than or equal to 2 (here: {n})")

        values = np.zeros(n)
        for i in range(n):
            idx = np.random.choice(variables.shape[0], variables.shape[0], replace=True)
            values[i] = stat(variables[idx], targets[idx])
        return np.std(values)




[docs]
class Subsampling(Resampling):


[docs]
    def compute_sigma(
        self,
        variables: np.ndarray,
        targets: np.ndarray,
        stat: Statistic,
        n: int=5,
        min_samples: int=20,
        min_subsets: int=5,
        decades: float=2
    ) -> float:
        """
        Estimates the standard deviation of the estimator `stat` using the approach proposed in Holmes, C. M., & Nemenman, I. (2019). It assumes that the variance of the estimator depends on the number of samples N as Var[stat](N) = B/N, with B being a parameter to be estimated that depends on the data distribution.
        This function assumes that the previous relation is true for the given estimator and compute its variance for several number of samples N by subsampling the dataset. This permit to estimate the value of B.

        Parameters
        ----------
        variables : np.ndarray
            Variable data. Must be a 2D array.
        targets : np.ndarray
            Target data. Must be a 2D array with the same number of rows than `variables`.
        stat : Statistic
            Estimator whose variance is to be estimated.
        n : int, optional
            Number of different subset sizes, by default 5.
        min_samples : int, optional
            Minimum number of samples required for a subset, by default 20.
        min_subsets : int, optional
            Minimum number of subsets for a given subset size, by default 5.
        decades : float, optional
            Maximum orders of magnitude between the largest and smallest subset sizes, by default 2.

        Returns
        -------
        float
            Estimate of estimator standard deviation.
        """
        assert isinstance(variables, np.ndarray) and variables.ndim == 2
        assert isinstance(targets, np.ndarray) and targets.ndim == 2
        assert variables.shape[0] == targets.shape[0]
        assert isinstance(stat, Statistic)
        if n < 2:
            raise ValueError(f"Number of different subset size `n` must be greater than or equal to 2 (here: {n})")
        if min_samples > variables.shape[0]:
            raise ValueError(f"Number of samples {variables.shape[0]} must be greater than or equal to `min_samples` ({min_samples})")

        N = variables.shape[0]

        # Max subset size
        max_Ni = floor(N / min_subsets)
        # Minimum subset size
        min_Ni = max(min_samples, 10**(-decades) * max_Ni)
        # Defines subset sizes
        Nis = np.logspace(np.log10(min_Ni), np.log10(max_Ni), n).astype(int)
        # Keep only distincts sizes (in case where the number of sample is not high enough to get `n` different sizes).
        Nis = np.unique(Nis)
        # Check that there is enough subset sizes to estimate B
        if Nis.size < 2:
            return np.nan

        # Random subsampling
        ni_list = [] # List of number of subset for the different subset sizes
        sigma2_list = [] # List of variances for the different subset sizes
        idx = list(range(N))
        for Ni in Nis:
            ni = floor(N / Ni) # Number of subset
            ni_list.append(ni)
            shuffle(idx)
            mis = []
            for i in range(ni):
                sidx = idx[i*Ni:(i+1)*Ni]
                mis.append(stat(
                    variables[sidx],
                    targets[sidx]
                ))
            sigma2_list.append(np.var(mis))

        nis = np.array(ni_list)
        sigma2s = np.array(sigma2_list)

        # Eq. (8) in Holmes, C. M., & Nemenman, I. (2019)
        return np.sqrt( np.sum((nis-1)/nis * sigma2s) / np.sum(nis - 1) )