"""Summaries for various statistics and diagnostics."""

import numpy as np
import xarray as xr
from arviz_base import dataset_to_dataframe, extract, rcParams, references_to_dataset
from xarray_einstats import stats

from arviz_stats.utils import _apply_multi_input_function
from arviz_stats.validate import validate_dims

__all__ = ["summary", "ci_in_rope", "mode"]


def summary(
    data,
    var_names=None,
    filter_vars=None,
    group="posterior",
    coords=None,
    sample_dims=None,
    kind="all",
    ci_prob=None,
    ci_kind=None,
    round_to=2,
    skipna=False,
):
    """Create a data frame with summary statistics and/or diagnostics.

    Parameters
    ----------
    data : DataTree, Dataset or InferenceData
    var_names : list of str, optional
        Names of variables to include in summary. If None, all variables are included.
    filter_vars : {None, "like", "regex"}, default None
        Used for `var_names` only. If ``None`` (default), interpret `var_names` as the
        real variable names. If "like", interpret `var_names` as substrings of the real
        variable names. If "regex", interpret `var_names` as regular expressions on the
        real variable names.
    group : str
        Select a group for summary. Defaults to "posterior".
    coords : dict, optional
        Coordinates defining a subset over the selected group.
    sample_dims : str or sequence of hashable, optional
        Defaults to ``rcParams["data.sample_dims"]``.
    kind : {'all', 'stats', 'diagnostics', 'all_median', 'stats_median', 'diagnostics_median', 'mc_diagnostics'}, default 'all'
        * ``all``: `mean`, `sd`, `ci`, `ess_bulk`, `ess_tail`, `r_hat`, `mcse_mean`, `mcse_sd`.
        * ``stats``: `mean`, `sd`, and `ci`.
        * ``diagnostics``: `ess_bulk`, `ess_tail`, `r_hat`, `mcse_mean`, `mcse_sd`.
        * ``all_median``: `median`, `mad`, `ci`, `ess_median`, `ess_tail`, `r_hat`, `mcse_median`.
        * ``stats_median``: `median`, `mad`, and `ci`.
        * ``diagnostics_median``: `ess_median`, `ess_tail`, `r_hat`, `mcse_median`.
        * ``mc_diagnostics``: `mcse_mean`, `ess_mean`, and `min_ss`.
    ci_prob : float, optional
        Probability for the credible interval. Defaults to ``rcParams["stats.ci_prob"]``.
    ci_kind : {"hdi", "eti"}, optional
        Type of credible interval. Defaults to ``rcParams["stats.ci_kind"]``.
        If `kind` is "stats_median" or "all_median", `ci_kind` is forced to "eti".
    round_to : int, default 2
        Number of decimals used to round results. Use "none" to return raw numbers.
    skipna : bool, default False
        If True, ignore NaN values when computing the summary statistics.

    Returns
    -------
    pandas.DataFrame

    See Also
    --------
    arviz.rhat : Compute estimate of rank normalized split R-hat for a set of traces.
    arviz.ess : Calculate the effective sample size of a set of traces.
    arviz.mcse : Calculate Markov Chain Standard Error statistic.
    plot_ess : Plot quantile, local or evolution of effective sample sizes (ESS).
    plot_mcse : Plot quantile, local or evolution of Markov Chain Standard Error (MCSE).

    Examples
    --------
    .. ipython::

        In [1]: from arviz_base import load_arviz_data
           ...: from arviz_stats import summary
           ...: data = load_arviz_data("non_centered_eight")
           ...: summary(data, var_names=["mu", "tau"])

    You can use ``filter_vars`` to select variables without having to specify all the
    exact names. Use ``filter_vars="like"`` to select based on partial naming:

    .. ipython::

        In [1]: summary(data, var_names=["the"], filter_vars="like")

    Use ``filter_vars="regex"`` to select based on regular expressions, and prefix the
    variables you want to exclude with ``~``. Here, we exclude from the summary all the
    variables starting with the letter t:

    .. ipython::

        In [1]: summary(data, var_names=["~^t"], filter_vars="regex")
    """
    if ci_kind not in ("hdi", "eti", None):
        raise ValueError("ci_kind must be either 'hdi' or 'eti'")
    kinds = [
        "all",
        "stats",
        "diagnostics",
        "all_median",
        "stats_median",
        "diagnostics_median",
        "mc_diagnostics",
    ]
    if kind not in kinds:
        raise ValueError(
            "valid options for kind are: " + ", ".join(kinds[:-1]) + " or " + kinds[-1]
        )
    if ci_prob is None:
        ci_prob = rcParams["stats.ci_prob"]
    if ci_kind is None:
        ci_kind = rcParams["stats.ci_kind"]
    if sample_dims is None:
        sample_dims = rcParams["data.sample_dims"]
    ci_perc = int(ci_prob * 100)

    dataset = extract(
        data,
        var_names=var_names,
        filter_vars=filter_vars,
        group=group,
        sample_dims=sample_dims,
        combined=False,
        keep_dataset=True,
    )
    if coords is not None:
        dataset = dataset.sel(coords)

    to_concat = []
    if kind in ["stats", "all"]:
        mean = dataset.mean(dim=sample_dims, skipna=skipna).expand_dims(summary=["mean"])
        std = dataset.std(dim=sample_dims, skipna=skipna).expand_dims(summary=["sd"])
        # Compute the interval type requested through ``ci_kind`` so the labels
        # match the computed bounds.
        if ci_kind == "hdi":
            ci = dataset.azstats.hdi(prob=ci_prob, dim=sample_dims, skipna=skipna)
        else:
            ci = dataset.azstats.eti(prob=ci_prob, dim=sample_dims, skipna=skipna)
        ci = ci.rename({"ci_bound": "summary"}).assign_coords(
            summary=[f"{ci_kind}{ci_perc}_lb", f"{ci_kind}{ci_perc}_ub"]
        )
        to_concat.extend((mean, std, ci))
    if kind in ["diagnostics", "all"]:
        ess_bulk = dataset.azstats.ess(sample_dims=sample_dims, method="bulk").expand_dims(
            summary=["ess_bulk"]
        )
        ess_tail = dataset.azstats.ess(sample_dims=sample_dims, method="tail").expand_dims(
            summary=["ess_tail"]
        )
        rhat = dataset.azstats.rhat(sample_dims=sample_dims).expand_dims(summary=["r_hat"])
        mcse_mean = dataset.azstats.mcse(sample_dims=sample_dims, method="mean").expand_dims(
            summary=["mcse_mean"]
        )
        mcse_sd = dataset.azstats.mcse(sample_dims=sample_dims, method="sd").expand_dims(
            summary=["mcse_sd"]
        )
        to_concat.extend((ess_bulk, ess_tail, rhat, mcse_mean, mcse_sd))
    if kind in ["stats_median", "all_median"]:
        median = dataset.median(dim=sample_dims, skipna=skipna).expand_dims(summary=["median"])
        mad = stats.median_abs_deviation(
            dataset, dims=sample_dims, nan_policy="omit" if skipna else "propagate"
        ).expand_dims(summary=["mad"])
        ci = (
            dataset.azstats.eti(prob=ci_prob, dim=sample_dims, skipna=skipna)
            .rename({"ci_bound": "summary"})
            .assign_coords(summary=[f"eti{ci_perc}_lb", f"eti{ci_perc}_ub"])
        )
        to_concat.extend((median, mad, ci))
    if kind in ["diagnostics_median", "all_median"]:
        ess_median = dataset.azstats.ess(sample_dims=sample_dims, method="median").expand_dims(
            summary=["ess_median"]
        )
        ess_tail = dataset.azstats.ess(sample_dims=sample_dims, method="tail").expand_dims(
            summary=["ess_tail"]
        )
        rhat = dataset.azstats.rhat(sample_dims=sample_dims).expand_dims(summary=["r_hat"])
        mcse_median = dataset.azstats.mcse(sample_dims=sample_dims, method="median").expand_dims(
            summary=["mcse_median"]
        )
        to_concat.extend((ess_median, ess_tail, rhat, mcse_median))
    if kind == "mc_diagnostics":
        mcse_mean = dataset.azstats.mcse(sample_dims=sample_dims, method="mean").expand_dims(
            summary=["mcse_mean"]
        )
        ess_mean = dataset.azstats.ess(sample_dims=sample_dims, method="mean").expand_dims(
            summary=["ess_mean"]
        )
        min_ss = dataset.azstats.pareto_min_ss(sample_dims=sample_dims).expand_dims(
            summary=["min_ss"]
        )
        to_concat.extend((mcse_mean, ess_mean, min_ss))

    summary_df = dataset_to_dataframe(
        xr.concat(to_concat, dim="summary"), sample_dims=["summary"]
    ).T
    if (round_to is not None) and (round_to not in ("None", "none")):
        summary_df = summary_df.round(round_to)
    return summary_df
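
# A minimal usage sketch complementing the docstring examples above: it combines
# the documented ``kind`` and ``ci_prob`` arguments, assuming the example dataset
# is available through ``arviz_base.load_arviz_data``.
#
#     >>> from arviz_base import load_arviz_data
#     >>> from arviz_stats import summary
#     >>> data = load_arviz_data("non_centered_eight")
#     >>> # median/MAD summaries with a 90% equal-tail interval
#     >>> summary(data, var_names=["mu", "tau"], kind="stats_median", ci_prob=0.9)
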
def ci_in_rope(
    data,
    rope,
    var_names=None,
    filter_vars=None,
    group="posterior",
    dim=None,
    ci_prob=None,
    ci_kind=None,
    rope_dim="rope_dim",
):
    """Compute the percentage of a credible interval that falls within a ROPE.

    A region of practical equivalence (ROPE) indicates a small range of parameter values
    that are considered to be practically equivalent to the null value for purposes of
    the particular application; see [1]_ for more details.

    Parameters
    ----------
    data : DataTree, Dataset or InferenceData
    rope : (2,) array-like or dict of {hashable : (2,) array-like} or Dataset
        If array-like, the lower and upper bounds of the ROPE are the same for all
        variables. If dict, the keys are the variable names and the values are tuples
        with the lower and upper bounds of the ROPE. The keys must be in `var_names`.
    var_names : list of str, optional
        Names of variables for which the ROPE should be computed. If None, all variables
        are included.
    filter_vars : {None, "like", "regex"}, default None
        Used for `var_names` only. If ``None`` (default), interpret `var_names` as the
        real variable names. If "like", interpret `var_names` as substrings of the real
        variable names. If "regex", interpret `var_names` as regular expressions on the
        real variable names.
    group : str
        Select a group to compute the ROPE. Defaults to "posterior".
    dim : str or sequence of hashable, optional
        Defaults to ``rcParams["data.sample_dims"]``.
    ci_prob : float, optional
        Probability for the credible interval. Defaults to ``rcParams["stats.ci_prob"]``.
    ci_kind : {"hdi", "eti"}, optional
        Type of credible interval. Defaults to ``rcParams["stats.ci_kind"]``.
    rope_dim : str, default "rope_dim"
        Name for the dimension containing the ROPE values. Only used when `rope` is a
        :class:`~xarray.Dataset`.

    Returns
    -------
    xarray.Dataset

    See Also
    --------
    arviz.summary : Compute summary statistics and/or diagnostics.
    arviz.hdi : Compute highest density interval (HDI).
    arviz.eti : Compute equal tail interval (ETI).

    Examples
    --------
    Apply the same ROPE to a subset of variables:

    .. ipython::

        In [1]: from arviz_base import load_arviz_data
           ...: from arviz_stats import ci_in_rope
           ...: data = load_arviz_data("centered_eight")
           ...: ci_in_rope(data, var_names=["mu", "tau"], rope=(-0.5, 0.5))

    Apply different ROPEs to each variable:

    .. ipython::

        In [1]: ci_in_rope(data, rope={"mu": (-0.5, 0.5), "tau": (0.1, 0.2), "theta": (-0.1, 0.1)})

    References
    ----------
    .. [1] Kruschke. *Doing Bayesian Data Analysis, Second Edition: A Tutorial with R,
        JAGS, and Stan.* Academic Press, 2014. ISBN 978-0-12-405888-0.
    """
    sample_dims = validate_dims(dim)
    dataset = extract(
        data,
        var_names=var_names,
        filter_vars=filter_vars,
        group=group,
        sample_dims=sample_dims,
        combined=False,
        keep_dataset=True,
    )

    if isinstance(rope, dict):
        if not all(var in dataset.data_vars for var in rope.keys()):
            raise ValueError("`rope` must be a subset of the variables in `data`")
        if not all(isinstance(v, tuple) and len(v) == 2 for v in rope.values()):
            raise ValueError("`rope` must be a dict of tuples of length 2")
    elif isinstance(rope, xr.Dataset):
        if rope_dim not in rope.dims:
            raise ValueError(f"{rope_dim} is not a dimension of `rope`")
        if (rope_len := rope.sizes[rope_dim]) != 2:
            raise ValueError(f"Length of {rope_dim} dim must be 2 but is {rope_len}")
    else:
        try:
            rope = np.array(rope)
        except ValueError as err:
            raise ValueError(
                "`rope` must be a dict, Dataset or array-like, failed to convert to array"
            ) from err
        if len(rope) != 2:
            raise ValueError("`rope` must be a tuple of length 2")

    # Resolve the default interval type from rcParams, as documented above.
    if ci_kind is None:
        ci_kind = rcParams["stats.ci_kind"]
    if ci_kind == "eti":
        c_i = dataset.azstats.eti(prob=ci_prob, dim=sample_dims)
    else:
        c_i = dataset.azstats.hdi(prob=ci_prob, dim=sample_dims)
    ci_low = c_i.sel(ci_bound="lower")
    ci_high = c_i.sel(ci_bound="upper")
    in_ci = (dataset >= ci_low) & (dataset <= ci_high)

    rope = references_to_dataset(rope, dataset, sample_dims=sample_dims, ref_dim=rope_dim)
    ci_samples = dataset.where(in_ci)
    in_rope = (ci_samples >= rope.sel({rope_dim: 0})) & (ci_samples <= rope.sel({rope_dim: 1}))
    proportion = (in_rope.sum(dim=sample_dims) / in_ci.sum(dim=sample_dims)) * 100
    return proportion
def mode(
    data,
    dim=None,
    group="posterior",
    var_names=None,
    filter_vars=None,
    coords=None,
    **kwargs,
):
    r"""Compute the mode.

    The mode is the value that appears most frequently in a data set. If the data is of
    type float, we assume it is continuous and use the half-sample method [1]_. If the
    data is of type int, we assume it is discrete and use :func:`numpy.unique` to find
    the most frequent value.

    Parameters
    ----------
    data : array-like, DataArray, Dataset, DataTree, DataArrayGroupBy, DatasetGroupBy, or idata-like
        Input data. It will have different pre-processing applied to it depending on its type:

        - array-like: call array layer within ``arviz-stats``.
        - xarray object: apply dimension aware function to all relevant subsets.
        - others: passed to :func:`arviz_base.convert_to_dataset` then treated as a
          :class:`xarray.Dataset`. This option is discouraged because the fully automated
          conversion cannot be controlled or inspected and will be repeated in future
          calls to this or similar functions. It is recommended to first perform the
          conversion manually and then call ``arviz_stats.mode``; this allows controlling
          the conversion step and inspecting its results.
    dim : sequence of hashable, optional
        Dimensions over which to compute the mode. Defaults to ``rcParams["data.sample_dims"]``.
    group : hashable, default "posterior"
        Group on which to compute the mode.
    var_names : str or list of str, optional
        Names of the variables for which the mode should be computed.
    filter_vars : {None, "like", "regex"}, default None
        Used for `var_names` only. If ``None`` (default), interpret `var_names` as the
        real variable names. If "like", interpret `var_names` as substrings of the real
        variable names. If "regex", interpret `var_names` as regular expressions on the
        real variable names.
    coords : dict, optional
        Dictionary of dimension/index names to coordinate values defining a subset of
        the data for which to perform the computation.
    **kwargs : any, optional
        Forwarded to the array or dataarray interface for mode.

    Returns
    -------
    ndarray, DataArray, Dataset, DataTree
        Requested mode of the provided input.

    See Also
    --------
    xarray.Dataset.mean
    xarray.Dataset.median

    References
    ----------
    .. [1] Bickel DR, Fruehwirth R. *On a Fast, Robust Estimator of the Mode: Comparisons
        to Other Robust Estimators with Applications.* Computational Statistics & Data
        Analysis. 2006. https://doi.org/10.1016/j.csda.2005.07.011
        arXiv preprint https://doi.org/10.48550/arXiv.math/0505419

    Examples
    --------
    Calculate the mode of a Normal random variable:

    .. ipython::

        In [1]: import arviz_stats as azs
           ...: import numpy as np
           ...: data = np.random.default_rng().normal(size=2000)
           ...: azs.mode(data)

    Calculate the modes for specific variables:

    .. ipython::

        In [1]: import arviz_base as azb
           ...: dt = azb.load_arviz_data("centered_eight")
           ...: azs.mode(dt, var_names=["mu", "theta"])

    Calculate the modes excluding the school dimension:

    .. ipython::

        In [1]: azs.mode(dt, dim=["chain", "draw"])
    """
    return _apply_multi_input_function(
        "mode",
        data,
        dim,
        "dim",
        group=group,
        var_names=var_names,
        filter_vars=filter_vars,
        coords=coords,
        **kwargs,
    )
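
# A minimal usage sketch illustrating the continuous/discrete dispatch described
# in the docstring; synthetic data only, no assumptions beyond NumPy.
#
#     >>> import numpy as np
#     >>> import arviz_stats as azs
#     >>> rng = np.random.default_rng(0)
#     >>> azs.mode(rng.normal(size=2000))      # float data: half-sample mode
#     >>> azs.mode(rng.integers(0, 5, 2000))   # int data: most frequent value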