"""Summaries for various statistics and diagnostics."""

import numpy as np
import xarray as xr
from arviz_base import dataset_to_dataframe, extract, rcParams, references_to_dataset
from xarray_einstats import stats

from arviz_stats.utils import _apply_multi_input_function
from arviz_stats.validate import validate_dims

__all__ = ["summary", "ci_in_rope", "mode"]


def summary(
    data,
    var_names=None,
    filter_vars=None,
    group="posterior",
    coords=None,
    sample_dims=None,
    kind="all",
    ci_prob=None,
    ci_kind=None,
    round_to=2,
    skipna=False,
):
    """Create a data frame with summary statistics and/or diagnostics.

    Parameters
    ----------
    data : DataTree, Dataset or InferenceData
    var_names : list of str, optional
        Names of variables to include in summary. If None, all variables are included.
    filter_vars : {None, "like", "regex"}, default None
        Used for `var_names` only. If ``None`` (default), interpret `var_names` as the
        real variable names. If "like", interpret `var_names` as substrings of the real
        variable names. If "regex", interpret `var_names` as regular expressions on the
        real variable names.
    group : str
        Select a group for summary. Defaults to "posterior".
    coords : dict, optional
        Coordinates defining a subset over the selected group.
    sample_dims : str or sequence of hashable, optional
        Defaults to ``rcParams["data.sample_dims"]``.
    kind : {'all', 'stats', 'diagnostics', 'all_median', 'stats_median', 'diagnostics_median', 'mc_diagnostics'}, default 'all'
        * ``all``: `mean`, `sd`, `ci`, `ess_bulk`, `ess_tail`, `r_hat`, `mcse_mean`, `mcse_sd`.
        * ``stats``: `mean`, `sd`, and `ci`.
        * ``diagnostics``: `ess_bulk`, `ess_tail`, `r_hat`, `mcse_mean`, `mcse_sd`.
        * ``all_median``: `median`, `mad`, `ci`, `ess_median`, `ess_tail`, `r_hat`, `mcse_median`.
        * ``stats_median``: `median`, `mad`, and `ci`.
        * ``diagnostics_median``: `ess_median`, `ess_tail`, `r_hat`, `mcse_median`.
        * ``mc_diagnostics``: `mcse_mean`, `ess_mean`, and `min_ss`.
    ci_prob : float, optional
        Probability for the credible interval. Defaults to ``rcParams["stats.ci_prob"]``.
    ci_kind : {"hdi", "eti"}, optional
        Type of credible interval. Defaults to ``rcParams["stats.ci_kind"]``.
        If `kind` is "stats_median" or "all_median", `ci_kind` is forced to "eti".
    round_to : int, default 2
        Number of decimals used to round results. Use "none" to return raw numbers.
    skipna : bool, default False
        If True, ignore NaN values when computing the summary statistics.

    Returns
    -------
    pandas.DataFrame

    See Also
    --------
    arviz.rhat : Compute estimate of rank normalized split R-hat for a set of traces.
    arviz.ess : Calculate the effective sample size of a set of traces.
    arviz.mcse : Calculate Markov Chain Standard Error statistic.
    plot_ess : Plot quantile, local or evolution of effective sample sizes (ESS).
    plot_mcse : Plot quantile, local or evolution of Markov Chain Standard Error (MCSE).

    Examples
    --------
    .. ipython::

        In [1]: from arviz_base import load_arviz_data
           ...: from arviz_stats import summary
           ...: data = load_arviz_data("non_centered_eight")
           ...: summary(data, var_names=["mu", "tau"])

    You can use ``filter_vars`` to select variables without having to specify all the
    exact names. Use ``filter_vars="like"`` to select based on partial naming:

    .. ipython::

        In [1]: summary(data, var_names=["the"], filter_vars="like")

    Use ``filter_vars="regex"`` to select based on regular expressions, and prefix the
    variables you want to exclude with ``~``. Here, we exclude from the summary all the
    variables starting with the letter t:

    .. ipython::

        In [1]: summary(data, var_names=["~^t"], filter_vars="regex")
    """
    if ci_kind not in ("hdi", "eti", None):
        raise ValueError("ci_kind must be either 'hdi' or 'eti'")
    kinds = [
        "all",
        "stats",
        "diagnostics",
        "all_median",
        "stats_median",
        "diagnostics_median",
        "mc_diagnostics",
    ]
    if kind not in kinds:
        raise ValueError(
            "valid options for kind are: " + ", ".join(kinds[:-1]) + " or " + kinds[-1]
        )
    if ci_prob is None:
        ci_prob = rcParams["stats.ci_prob"]
    if ci_kind is None:
        ci_kind = rcParams["stats.ci_kind"]
    if sample_dims is None:
        sample_dims = rcParams["data.sample_dims"]
    ci_perc = int(ci_prob * 100)

    dataset = extract(
        data,
        var_names=var_names,
        filter_vars=filter_vars,
        group=group,
        sample_dims=sample_dims,
        combined=False,
        keep_dataset=True,
    )
    if coords is not None:
        dataset = dataset.sel(coords)

    to_concat = []
    if kind in ["stats", "all"]:
        mean = dataset.mean(dim=sample_dims, skipna=skipna).expand_dims(summary=["mean"])
        std = dataset.std(dim=sample_dims, skipna=skipna).expand_dims(summary=["sd"])
        # Compute the interval type requested through ``ci_kind`` so the labels
        # match the computed bounds.
        if ci_kind == "hdi":
            ci = dataset.azstats.hdi(prob=ci_prob, dim=sample_dims, skipna=skipna)
        else:
            ci = dataset.azstats.eti(prob=ci_prob, dim=sample_dims, skipna=skipna)
        ci = ci.rename({"ci_bound": "summary"}).assign_coords(
            summary=[f"{ci_kind}{ci_perc}_lb", f"{ci_kind}{ci_perc}_ub"]
        )
        to_concat.extend((mean, std, ci))
    if kind in ["diagnostics", "all"]:
        ess_bulk = dataset.azstats.ess(sample_dims=sample_dims, method="bulk").expand_dims(
            summary=["ess_bulk"]
        )
        ess_tail = dataset.azstats.ess(sample_dims=sample_dims, method="tail").expand_dims(
            summary=["ess_tail"]
        )
        rhat = dataset.azstats.rhat(sample_dims=sample_dims).expand_dims(summary=["r_hat"])
        mcse_mean = dataset.azstats.mcse(sample_dims=sample_dims, method="mean").expand_dims(
            summary=["mcse_mean"]
        )
        mcse_sd = dataset.azstats.mcse(sample_dims=sample_dims, method="sd").expand_dims(
            summary=["mcse_sd"]
        )
        to_concat.extend((ess_bulk, ess_tail, rhat, mcse_mean, mcse_sd))
    if kind in ["stats_median", "all_median"]:
        median = dataset.median(dim=sample_dims, skipna=skipna).expand_dims(summary=["median"])
        mad = stats.median_abs_deviation(
            dataset, dims=sample_dims, nan_policy="omit" if skipna else "propagate"
        ).expand_dims(summary=["mad"])
        ci = (
            dataset.azstats.eti(prob=ci_prob, dim=sample_dims, skipna=skipna)
            .rename({"ci_bound": "summary"})
            .assign_coords(summary=[f"eti{ci_perc}_lb", f"eti{ci_perc}_ub"])
        )
        to_concat.extend((median, mad, ci))
    if kind in ["diagnostics_median", "all_median"]:
        ess_median = dataset.azstats.ess(sample_dims=sample_dims, method="median").expand_dims(
            summary=["ess_median"]
        )
        ess_tail = dataset.azstats.ess(sample_dims=sample_dims, method="tail").expand_dims(
            summary=["ess_tail"]
        )
        rhat = dataset.azstats.rhat(sample_dims=sample_dims).expand_dims(summary=["r_hat"])
        mcse_median = dataset.azstats.mcse(sample_dims=sample_dims, method="median").expand_dims(
            summary=["mcse_median"]
        )
        to_concat.extend((ess_median, ess_tail, rhat, mcse_median))
    if kind == "mc_diagnostics":
        mcse_mean = dataset.azstats.mcse(sample_dims=sample_dims, method="mean").expand_dims(
            summary=["mcse_mean"]
        )
        ess_mean = dataset.azstats.ess(sample_dims=sample_dims, method="mean").expand_dims(
            summary=["ess_mean"]
        )
        min_ss = dataset.azstats.pareto_min_ss(sample_dims=sample_dims).expand_dims(
            summary=["min_ss"]
        )
        to_concat.extend((mcse_mean, ess_mean, min_ss))

    summary_df = dataset_to_dataframe(
        xr.concat(to_concat, dim="summary"), sample_dims=["summary"]
    ).T
    if (round_to is not None) and (round_to not in ("None", "none")):
        summary_df = summary_df.round(round_to)
    return summary_df
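
# A minimal usage sketch complementing the docstring examples above: it combines
# the documented ``kind`` and ``ci_prob`` arguments, assuming the example dataset
# is available through ``arviz_base.load_arviz_data``.
#
#     >>> from arviz_base import load_arviz_data
#     >>> from arviz_stats import summary
#     >>> data = load_arviz_data("non_centered_eight")
#     >>> # median/MAD summaries with a 90% equal-tail interval
#     >>> summary(data, var_names=["mu", "tau"], kind="stats_median", ci_prob=0.9)
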
def ci_in_rope(
    data,
    rope,
    var_names=None,
    filter_vars=None,
    group="posterior",
    dim=None,
    ci_prob=None,
    ci_kind=None,
    rope_dim="rope_dim",
):
    """Compute the percentage of a credible interval that falls within a ROPE.

    A region of practical equivalence (ROPE) indicates a small range of parameter values
    that are considered to be practically equivalent to the null value for purposes of
    the particular application; see [1]_ for more details.

    Parameters
    ----------
    data : DataTree, Dataset or InferenceData
    rope : (2,) array-like or dict of {hashable : (2,) array-like} or Dataset
        If array-like, the lower and upper bounds of the ROPE are the same for all
        variables. If dict, the keys are the variable names and the values are tuples
        with the lower and upper bounds of the ROPE. The keys must be in `var_names`.
    var_names : list of str, optional
        Names of variables for which the ROPE should be computed. If None, all variables
        are included.
    filter_vars : {None, "like", "regex"}, default None
        Used for `var_names` only. If ``None`` (default), interpret `var_names` as the
        real variable names. If "like", interpret `var_names` as substrings of the real
        variable names. If "regex", interpret `var_names` as regular expressions on the
        real variable names.
    group : str
        Select a group to compute the ROPE. Defaults to "posterior".
    dim : str or sequence of hashable, optional
        Defaults to ``rcParams["data.sample_dims"]``.
    ci_prob : float, optional
        Probability for the credible interval. Defaults to ``rcParams["stats.ci_prob"]``.
    ci_kind : {"hdi", "eti"}, optional
        Type of credible interval. Defaults to ``rcParams["stats.ci_kind"]``.
    rope_dim : str, default "rope_dim"
        Name for the dimension containing the ROPE values. Only used when `rope` is a
        :class:`~xarray.Dataset`.

    Returns
    -------
    xarray.Dataset

    See Also
    --------
    arviz.summary : Compute summary statistics and/or diagnostics.
    arviz.hdi : Compute highest density interval (HDI).
    arviz.eti : Compute equal tail interval (ETI).

    Examples
    --------
    Apply the same ROPE to a subset of variables:

    .. ipython::

        In [1]: from arviz_base import load_arviz_data
           ...: from arviz_stats import ci_in_rope
           ...: data = load_arviz_data("centered_eight")
           ...: ci_in_rope(data, var_names=["mu", "tau"], rope=(-0.5, 0.5))

    Apply different ROPEs to each variable:

    .. ipython::

        In [1]: ci_in_rope(data, rope={"mu": (-0.5, 0.5), "tau": (0.1, 0.2), "theta": (-0.1, 0.1)})

    References
    ----------
    .. [1] Kruschke. *Doing Bayesian Data Analysis, Second Edition: A Tutorial with R,
        JAGS, and Stan.* Academic Press, 2014. ISBN 978-0-12-405888-0.
    """
    sample_dims = validate_dims(dim)
    dataset = extract(
        data,
        var_names=var_names,
        filter_vars=filter_vars,
        group=group,
        sample_dims=sample_dims,
        combined=False,
        keep_dataset=True,
    )

    if isinstance(rope, dict):
        if not all(var in dataset.data_vars for var in rope.keys()):
            raise ValueError("`rope` must be a subset of the variables in `data`")
        if not all(isinstance(v, tuple) and len(v) == 2 for v in rope.values()):
            raise ValueError("`rope` must be a dict of tuples of length 2")
    elif isinstance(rope, xr.Dataset):
        if rope_dim not in rope.dims:
            raise ValueError(f"{rope_dim} is not a dimension of `rope`")
        if (rope_len := rope.sizes[rope_dim]) != 2:
            raise ValueError(f"Length of {rope_dim} dim must be 2 but is {rope_len}")
    else:
        try:
            rope = np.array(rope)
        except ValueError as err:
            raise ValueError(
                "`rope` must be a dict, Dataset or array-like, failed to convert to array"
            ) from err
        if len(rope) != 2:
            raise ValueError("`rope` must be a tuple of length 2")

    # Resolve the default interval type from rcParams, as documented above.
    if ci_kind is None:
        ci_kind = rcParams["stats.ci_kind"]
    if ci_kind == "eti":
        c_i = dataset.azstats.eti(prob=ci_prob, dim=sample_dims)
    else:
        c_i = dataset.azstats.hdi(prob=ci_prob, dim=sample_dims)
    ci_low = c_i.sel(ci_bound="lower")
    ci_high = c_i.sel(ci_bound="upper")
    in_ci = (dataset >= ci_low) & (dataset <= ci_high)

    rope = references_to_dataset(rope, dataset, sample_dims=sample_dims, ref_dim=rope_dim)
    ci_samples = dataset.where(in_ci)
    in_rope = (ci_samples >= rope.sel({rope_dim: 0})) & (ci_samples <= rope.sel({rope_dim: 1}))
    proportion = (in_rope.sum(dim=sample_dims) / in_ci.sum(dim=sample_dims)) * 100
    return proportion
def mode(
    data,
    dim=None,
    group="posterior",
    var_names=None,
    filter_vars=None,
    coords=None,
    **kwargs,
):
    r"""Compute the mode.

    The mode is the value that appears most frequently in a data set. If the data is of
    type float, we assume it is continuous and use the half-sample method [1]_. If the
    data is of type int, we assume it is discrete and use :func:`numpy.unique` to find
    the most frequent value.

    Parameters
    ----------
    data : array-like, DataArray, Dataset, DataTree, DataArrayGroupBy, DatasetGroupBy, or idata-like
        Input data. It will have different pre-processing applied to it depending on its type:

        - array-like: call array layer within ``arviz-stats``.
        - xarray object: apply dimension aware function to all relevant subsets.
        - others: passed to :func:`arviz_base.convert_to_dataset` then treated as a
          :class:`xarray.Dataset`. This option is discouraged because the fully automated
          conversion cannot be controlled or inspected and will be repeated in future
          calls to this or similar functions. It is recommended to first perform the
          conversion manually and then call ``arviz_stats.mode``; this allows controlling
          the conversion step and inspecting its results.
    dim : sequence of hashable, optional
        Dimensions over which to compute the mode. Defaults to ``rcParams["data.sample_dims"]``.
    group : hashable, default "posterior"
        Group on which to compute the mode.
    var_names : str or list of str, optional
        Names of the variables for which the mode should be computed.
    filter_vars : {None, "like", "regex"}, default None
        Used for `var_names` only. If ``None`` (default), interpret `var_names` as the
        real variable names. If "like", interpret `var_names` as substrings of the real
        variable names. If "regex", interpret `var_names` as regular expressions on the
        real variable names.
    coords : dict, optional
        Dictionary of dimension/index names to coordinate values defining a subset of
        the data for which to perform the computation.
    **kwargs : any, optional
        Forwarded to the array or dataarray interface for mode.

    Returns
    -------
    ndarray, DataArray, Dataset, DataTree
        Requested mode of the provided input.

    See Also
    --------
    xarray.Dataset.mean
    xarray.Dataset.median

    References
    ----------
    .. [1] Bickel DR, Fruehwirth R. *On a Fast, Robust Estimator of the Mode: Comparisons
        to Other Robust Estimators with Applications.* Computational Statistics & Data
        Analysis. 2006. https://doi.org/10.1016/j.csda.2005.07.011
        arXiv preprint https://doi.org/10.48550/arXiv.math/0505419

    Examples
    --------
    Calculate the mode of a Normal random variable:

    .. ipython::

        In [1]: import arviz_stats as azs
           ...: import numpy as np
           ...: data = np.random.default_rng().normal(size=2000)
           ...: azs.mode(data)

    Calculate the modes for specific variables:

    .. ipython::

        In [1]: import arviz_base as azb
           ...: dt = azb.load_arviz_data("centered_eight")
           ...: azs.mode(dt, var_names=["mu", "theta"])

    Calculate the modes excluding the school dimension:

    .. ipython::

        In [1]: azs.mode(dt, dim=["chain", "draw"])
    """
    return _apply_multi_input_function(
        "mode",
        data,
        dim,
        "dim",
        group=group,
        var_names=var_names,
        filter_vars=filter_vars,
        coords=coords,
        **kwargs,
    )
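
# A minimal usage sketch illustrating the continuous/discrete dispatch described
# in the docstring; synthetic data only, no assumptions beyond NumPy.
#
#     >>> import numpy as np
#     >>> import arviz_stats as azs
#     >>> rng = np.random.default_rng(0)
#     >>> azs.mode(rng.normal(size=2000))      # float data: half-sample mode
#     >>> azs.mode(rng.integers(0, 5, 2000))   # int data: most frequent value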