Source code for climpred.bootstrap

import inspect

import numpy as np
import xarray as xr

from .checks import has_dims
from .constants import POSITIVELY_ORIENTED_METRICS, PROBABILISTIC_METRICS
from .prediction import compute_hindcast, compute_perfect_model, compute_persistence
from .stats import dpp, varweighted_mean_period
from .utils import assign_attrs


def _distribution_to_ci(ds, ci_low, ci_high, dim='bootstrap'):
    """Get confidence intervals from bootstrapped distribution.

    Needed for bootstrapping confidence intervals and p_values of a metric.

    Args:
        ds (xarray object): distribution.
        ci_low (float): low confidence interval.
        ci_high (float): high confidence interval.
        dim (str): dimension to apply xr.quantile to. Default: 'bootstrap'

    Returns:
        ds_ci (xarray object): confidence intervals of ds.
    """
    # Loads into memory if a dask array.
    ds = ds.compute()
    ds_ci = ds.quantile(q=[ci_low, ci_high], dim=dim)
    return ds_ci
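

# Usage sketch (hypothetical toy data, not from the module above): how
# _distribution_to_ci might be applied to a bootstrapped distribution.
def _example_distribution_to_ci():
    # 500 bootstrap realizations of a scalar skill estimate
    dist = xr.DataArray(np.random.randn(500), dims='bootstrap')
    # 2.5% and 97.5% bounds, i.e. a 95% confidence interval
    return _distribution_to_ci(dist, ci_low=0.025, ci_high=0.975)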


def _pvalue_from_distributions(simple_fct, init, metric='pearson_r'):
    """Get probability that skill of a simple forecast (e.g., persistence or
    uninitlaized skill) is larger than initialized skill.

    Needed for bootstrapping confidence intervals and p_values of a metric in
    the hindcast framework. Checks whether a simple forecast like persistence
    or uninitialized performs better than initialized forecast. Need to keep in
    mind the orientation of metric (whether larger values are better or worse
    than smaller ones.)

    Args:
        simple_fct (xarray object): persistence or uninit skill.
        init (xarray object): hindcast skill.
        metric (str): name of metric

    Returns:
        pv (xarray object): probability that simple forecast performs better
                            than initialized forecast.
    """
    pv = ((simple_fct - init) > 0).sum('bootstrap') / init.bootstrap.size
    if metric not in POSITIVELY_ORIENTED_METRICS:
        pv = 1 - pv
    return pv
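

# Usage sketch (hypothetical toy data, not from the module above): p-value
# that a persistence-like forecast beats the initialized forecast, computed
# from two toy bootstrapped skill distributions.
def _example_pvalue_from_distributions():
    coords = {'bootstrap': np.arange(500)}
    init = xr.DataArray(np.random.normal(0.6, 0.1, 500), dims='bootstrap', coords=coords)
    pers = xr.DataArray(np.random.normal(0.5, 0.1, 500), dims='bootstrap', coords=coords)
    # for 'pearson_r' (positively oriented) this is the fraction of bootstrap
    # members in which persistence skill exceeds initialized skill
    return _pvalue_from_distributions(pers, init, metric='pearson_r')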


def bootstrap_uninitialized_ensemble(hind, hist):
    """Resample uninitialized hindcast from historical members.

    Note:
        Needed for bootstrapping confidence intervals and p_values of a metric
        in the hindcast framework. Takes hind.lead.size timesteps from the
        historical simulation at the same forcing and rearranges them into
        ensemble and member dimensions.

    Args:
        hind (xarray object): hindcast.
        hist (xarray object): historical uninitialized.

    Returns:
        uninit_hind (xarray object): uninitialized hindcast with hind.coords.
    """
    # find range for bootstrapping
    has_dims(hist, 'member', 'historical ensemble')
    first_init = max(hist.time.min().values, hind['init'].min().values)
    last_init = min(
        hist.time.max().values - hind['lead'].size, hind['init'].max().values
    )
    hind = hind.sel(init=slice(first_init, last_init))

    uninit_hind = []
    for init in hind.init.values:
        random_members = np.random.choice(hist.member.values, hind.member.size)
        # take random uninitialized members from hist at init forcing
        # (Goddard allows 5 year forcing range here)
        uninit_at_one_init_year = hist.sel(
            time=slice(init + 1, init + hind['lead'].size), member=random_members
        ).rename({'time': 'lead'})
        uninit_at_one_init_year['lead'] = np.arange(
            1, 1 + uninit_at_one_init_year['lead'].size
        )
        uninit_at_one_init_year['member'] = np.arange(1, 1 + len(random_members))
        uninit_hind.append(uninit_at_one_init_year)
    uninit_hind = xr.concat(uninit_hind, 'init')
    uninit_hind['init'] = hind['init'].values
    return uninit_hind
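

# Usage sketch (hypothetical toy data, not from the module above): resampling
# an uninitialized ensemble from a larger "historical" ensemble. Names and
# sizes are placeholders; annual integer init/time coordinates are assumed,
# as required by the slicing logic above.
def _example_bootstrap_uninitialized_ensemble():
    hind = xr.Dataset(
        {'var': (('init', 'lead', 'member'), np.random.rand(10, 3, 4))},
        coords={
            'init': np.arange(1970, 1980),
            'lead': np.arange(1, 4),
            'member': np.arange(1, 5),
        },
    )
    hist = xr.Dataset(
        {'var': (('time', 'member'), np.random.rand(40, 6))},
        coords={'time': np.arange(1960, 2000), 'member': np.arange(1, 7)},
    )
    # returns an uninitialized "hindcast" with the same init/lead/member dims
    return bootstrap_uninitialized_ensemble(hind, hist)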


def bootstrap_uninit_pm_ensemble_from_control(ds, control):
    """Create a pseudo-ensemble from a control run.

    Note:
        Needed for block bootstrapping confidence intervals of a metric in the
        perfect-model framework. Takes random segments of the length of the
        ensemble dataset from control and rearranges them into ensemble and
        member dimensions.

    Args:
        ds (xarray object): ensemble simulation.
        control (xarray object): control simulation.

    Returns:
        ds_e (xarray object): pseudo-ensemble generated from control run.
    """
    nens = ds.init.size
    nmember = ds.member.size
    length = ds.lead.size
    c_start = 0
    c_end = control['time'].size
    lead_time = ds['lead']

    def isel_years(control, year_s, length):
        new = control.isel(time=slice(year_s, year_s + length))
        new = new.rename({'time': 'lead'})
        new['lead'] = lead_time
        return new

    def create_pseudo_members(control):
        startlist = np.random.randint(c_start, c_end - length - 1, nmember)
        return xr.concat(
            (isel_years(control, start, length) for start in startlist), 'member'
        )

    return xr.concat((create_pseudo_members(control) for _ in range(nens)), 'init')
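

# Usage sketch (hypothetical toy data, not from the module above): building a
# pseudo-ensemble from a toy control run. Names and sizes are placeholders.
def _example_bootstrap_uninit_pm_ensemble_from_control():
    ds = xr.Dataset(
        {'var': (('init', 'lead', 'member'), np.random.rand(5, 10, 3))},
        coords={
            'init': np.arange(5),
            'lead': np.arange(1, 11),
            'member': np.arange(1, 4),
        },
    )
    control = xr.Dataset(
        {'var': ('time', np.random.rand(300))},
        coords={'time': np.arange(1900, 2200)},
    )
    # same init/lead/member shape as ds, filled with random control segments
    return bootstrap_uninit_pm_ensemble_from_control(ds, control)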


def _bootstrap_func(
    func, ds, resample_dim, sig=95, bootstrap=500, *func_args, **func_kwargs
):
    """Sig% threshold of function based on resampling with replacement.

    Reference:
        * Mason, S. J., and G. M. Mimmack. “The Use of Bootstrap Confidence
          Intervals for the Correlation Coefficient in Climatology.”
          Theoretical and Applied Climatology 45, no. 4 (December 1, 1992):
          229–33. https://doi.org/10/b6fnsv.

    Args:
        func (function): function to be bootstrapped.
        ds (xr.object): first input argument of func.
        resample_dim (str): dimension to resample from.
        sig (int, float, list): significance levels to return. Defaults to 95.
        bootstrap (int): number of resample iterations. Defaults to 500.
        *func_args (type): `*func_args`.
        **func_kwargs (type): `**func_kwargs`.

    Returns:
        sig_level: bootstrapped significance levels with dimensions of ds and
                   len(sig) if sig is a list.
    """
    if isinstance(sig, list):
        psig = [i / 100 for i in sig]
    else:
        psig = sig / 100
    bootstrapped_results = []
    resample_dim_values = ds[resample_dim].values
    for _ in range(bootstrap):
        smp_resample_dim = np.random.choice(
            resample_dim_values, len(resample_dim_values)
        )
        smp_ds = ds.sel({resample_dim: smp_resample_dim})
        smp_ds[resample_dim] = resample_dim_values
        bootstrapped_results.append(func(smp_ds, *func_args, **func_kwargs))
    sig_level = xr.concat(bootstrapped_results, 'bootstrap').quantile(psig, 'bootstrap')
    return sig_level
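

# Usage sketch (hypothetical toy data, not from the module above):
# bootstrapping the 95th percentile of a simple statistic (here the temporal
# standard deviation) by resampling the time dimension with replacement.
def _example_bootstrap_func():
    control = xr.DataArray(
        np.random.randn(200),
        dims='time',
        coords={'time': np.arange(1900, 2100)},
        name='var',
    )
    return _bootstrap_func(
        lambda x: x.std('time'), control, 'time', sig=95, bootstrap=100
    )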


def dpp_threshold(control, sig=95, bootstrap=500, dim='time', **dpp_kwargs):
    """Calc DPP significance levels from re-sampled dataset.

    Reference:
        * Feng, X., T. DelSole, and P. Houser. “Bootstrap Estimated Seasonal
          Potential Predictability of Global Temperature and Precipitation.”
          Geophysical Research Letters 38, no. 7 (2011).
          https://doi.org/10/ft272w.

    See also:
        * climpred.bootstrap._bootstrap_func
        * climpred.stats.dpp
    """
    return _bootstrap_func(
        dpp, control, dim, sig=sig, bootstrap=bootstrap, **dpp_kwargs
    )


def varweighted_mean_period_threshold(control, sig=95, bootstrap=500, time_dim='time'):
    """Calc the variance-weighted mean period significance levels from
    re-sampled dataset.

    See also:
        * climpred.bootstrap._bootstrap_func
        * climpred.stats.varweighted_mean_period
    """
    return _bootstrap_func(
        varweighted_mean_period, control, time_dim, sig=sig, bootstrap=bootstrap
    )
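

# Usage sketch (not from the module above): typical calls to the two threshold
# wrappers, assuming ``control`` is an annual-mean control simulation with a
# 'time' dimension as expected by climpred.stats.dpp and
# climpred.stats.varweighted_mean_period.
def _example_thresholds(control):
    dpp_sig = dpp_threshold(control, sig=95, bootstrap=500)
    vwmp_sig = varweighted_mean_period_threshold(control, sig=95, bootstrap=500)
    return dpp_sig, vwmp_sig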


def bootstrap_compute(
    hind,
    reference,
    hist=None,
    metric='pearson_r',
    comparison='m2e',
    dim='init',
    sig=95,
    bootstrap=500,
    pers_sig=None,
    compute=compute_hindcast,
    resample_uninit=bootstrap_uninitialized_ensemble,
    **metric_kwargs,
):
    """Bootstrap compute with replacement.

    Args:
        hind (xr.Dataset): prediction ensemble.
        reference (xr.Dataset): reference simulation.
        hist (xr.Dataset): historical/uninitialized simulation.
        metric (str): `metric`. Defaults to 'pearson_r'.
        comparison (str): `comparison`. Defaults to 'm2e'.
        dim (str or list): dimension to apply metric over. Defaults to 'init'.
        sig (int): Significance level for uninitialized and initialized skill.
                   Defaults to 95.
        pers_sig (int): Significance level for persistence skill confidence
                        levels. Defaults to sig.
        bootstrap (int): number of resampling iterations (bootstrap with
                         replacement). Defaults to 500.
        compute (func): function to compute skill. Choose from
            [:py:func:`climpred.prediction.compute_perfect_model`,
            :py:func:`climpred.prediction.compute_hindcast`].
        resample_uninit (func): function to create an uninitialized ensemble
            from a control simulation or uninitialized large ensemble. Choose
            from: [:py:func:`bootstrap_uninitialized_ensemble`,
            :py:func:`bootstrap_uninit_pm_ensemble_from_control`].
        **metric_kwargs (dict): additional keywords to be passed to metric
            (see the arguments required for a given metric in :ref:`Metrics`).

    Returns:
        results: (xr.Dataset): bootstrapped results
            * init_ci (xr.Dataset): confidence levels of init_skill
            * uninit_ci (xr.Dataset): confidence levels of uninit_skill
            * p_uninit_over_init (xr.Dataset): p-value of the hypothesis that
                the difference of skill between the initialized and
                uninitialized simulations is smaller or equal to zero based on
                bootstrapping with replacement. Defaults to None.
            * pers_ci (xr.Dataset): confidence levels of pers_skill
            * p_pers_over_init (xr.Dataset): p-value of the hypothesis that
                the difference of skill between the initialized and
                persistence simulations is smaller or equal to zero based on
                bootstrapping with replacement. Defaults to None.

    Reference:
        * Goddard, L., A. Kumar, A. Solomon, D. Smith, G. Boer, P. Gonzalez,
          V. Kharin, et al. “A Verification Framework for
          Interannual-to-Decadal Predictions Experiments.” Climate Dynamics
          40, no. 1–2 (January 1, 2013): 245–72. https://doi.org/10/f4jjvf.

    See also:
        * climpred.bootstrap.bootstrap_hindcast
        * climpred.bootstrap.bootstrap_perfect_model
    """
    if pers_sig is None:
        pers_sig = sig

    p = (100 - sig) / 100
    ci_low = p / 2
    ci_high = 1 - p / 2
    p_pers = (100 - pers_sig) / 100
    ci_low_pers = p_pers / 2
    ci_high_pers = 1 - p_pers / 2

    init = []
    uninit = []
    pers = []

    # which dim should be resampled: member or init
    if dim == 'member' and 'member' in hind.dims:
        members = hind.member.values
        to_be_shuffled = members
        shuffle_dim = 'member'
    elif 'init' in dim and 'init' in hind.dims:
        # also allows ['init', 'member']
        inits = hind.init.values
        to_be_shuffled = inits
        shuffle_dim = 'init'
    else:
        raise ValueError('Shuffle either `member` or `init`; not', dim)
    # resample with replacement
    # TODO: parallelize loop
    for _ in range(bootstrap):
        smp = np.random.choice(to_be_shuffled, len(to_be_shuffled))
        smp_hind = hind.sel({shuffle_dim: smp})
        if shuffle_dim == 'member':
            smp_hind['member'] = np.arange(1, 1 + smp_hind.member.size)
        # compute init skill
        init_skill = compute(
            smp_hind,
            reference,
            metric=metric,
            comparison=comparison,
            add_attrs=False,
            dim=dim,
            **metric_kwargs,
        )
        # reset inits when probabilistic, otherwise tests fail
        if (
            shuffle_dim == 'init'
            and metric in PROBABILISTIC_METRICS
            and 'init' in init_skill.coords
        ):
            init_skill['init'] = inits
        init.append(init_skill)
        # generate uninitialized ensemble from hist
        if hist is None:  # PM path, use reference = control
            hist = reference
        uninit_hind = resample_uninit(hind, hist)
        # compute uninit skill
        uninit.append(
            compute(
                uninit_hind,
                reference,
                metric=metric,
                comparison=comparison,
                dim=dim,
                add_attrs=False,
                **metric_kwargs,
            )
        )
        # compute persistence skill
        # impossible for probabilistic metrics
        if metric not in PROBABILISTIC_METRICS:
            pers.append(
                compute_persistence(smp_hind, reference, metric=metric, **metric_kwargs)
            )
    init = xr.concat(init, dim='bootstrap')
    # remove useless member = 0 coords after m2c
    if 'member' in init.coords and init.member.size == 1:
        del init['member']
    uninit = xr.concat(uninit, dim='bootstrap')
    # when persistence is not computed set flag
    if pers != []:
        pers = xr.concat(pers, dim='bootstrap')
        pers_output = True
    else:
        pers_output = False

    # get confidence intervals CI
    init_ci = _distribution_to_ci(init, ci_low, ci_high)
    uninit_ci = _distribution_to_ci(uninit, ci_low, ci_high)
    # probabilistic metrics wont have persistence forecast
    # therefore only get CI if persistence was computed
    if pers_output:
        if set(pers.coords) != set(init.coords):
            init, pers = xr.broadcast(init, pers)
        pers_ci = _distribution_to_ci(pers, ci_low_pers, ci_high_pers)
    else:
        # otherwise set all persistence outputs to false
        pers = init.isnull()
        pers_ci = init_ci == -999

    # pvalue whether uninit or pers better than init forecast
    p_uninit_over_init = _pvalue_from_distributions(uninit, init, metric=metric)
    p_pers_over_init = _pvalue_from_distributions(pers, init, metric)

    # calc mean skill without any resampling
    init_skill = compute(
        hind, reference, metric=metric, comparison=comparison, dim=dim, **metric_kwargs
    )
    if 'init' in init_skill:
        init_skill = init_skill.mean('init')
    # remove useless member = 0 coords after m2c
    if 'member' in init_skill.coords and init_skill.member.size == 1:
        del init_skill['member']
    # uninit skill as mean resampled uninit skill
    uninit_skill = uninit.mean('bootstrap')
    if metric not in PROBABILISTIC_METRICS:
        pers_skill = compute_persistence(
            hind, reference, metric=metric, **metric_kwargs
        )
    else:
        pers_skill = init_skill.isnull()
    # align to prepare for concat
    if set(pers_skill.coords) != set(init_skill.coords):
        init_skill, pers_skill = xr.broadcast(init_skill, pers_skill)
    # wrap results together in one dataarray
    skill = xr.concat([init_skill, uninit_skill, pers_skill], 'kind')
    skill['kind'] = ['init', 'uninit', 'pers']

    # probability that uninit or pers beats init
    p = xr.concat([p_uninit_over_init, p_pers_over_init], 'kind')
    p['kind'] = ['uninit', 'pers']

    # ci for each skill
    ci = xr.concat([init_ci, uninit_ci, pers_ci], 'kind').rename(
        {'quantile': 'results'}
    )
    ci['kind'] = ['init', 'uninit', 'pers']

    results = xr.concat([skill, p], 'results')
    results['results'] = ['skill', 'p']

    if set(results.coords) != set(ci.coords):
        res_drop = [c for c in results.coords if c not in ci.coords]
        ci_drop = [c for c in ci.coords if c not in results.coords]
        results = results.drop(res_drop)
        ci = ci.drop(ci_drop)
    results = xr.concat([results, ci], 'results')
    results['results'] = ['skill', 'p', 'low_ci', 'high_ci']
    # Attach climpred compute information to skill
    metadata_dict = {
        'confidence_interval_levels': f'{ci_high}-{ci_low}',
        'bootstrap_iterations': bootstrap,
        'p': 'probability that initialized forecast performs better than '
        'reference forecast',
    }
    metadata_dict.update(metric_kwargs)
    results = assign_attrs(
        results,
        hind,
        metric=metric,
        comparison=comparison,
        function_name=inspect.stack()[0][3],  # take function.__name__
        metadata_dict=metadata_dict,
    )
    return results
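

# Usage sketch (not from the module above): unpacking the stacked output of
# bootstrap_compute (or of the wrappers below). The 'kind' and 'results'
# labels follow the concatenation in bootstrap_compute; ``results`` is assumed
# to be a Dataset returned by that function.
def _example_unpack_bootstrap_results(results):
    init_skill = results.sel(kind='init', results='skill')
    p_uninit_over_init = results.sel(kind='uninit', results='p')
    init_ci = results.sel(kind='init', results=['low_ci', 'high_ci'])
    return init_skill, p_uninit_over_init, init_ci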


def bootstrap_hindcast(
    hind,
    hist,
    reference,
    metric='pearson_r',
    comparison='e2r',
    dim='init',
    sig=95,
    bootstrap=500,
    pers_sig=None,
    **metric_kwargs,
):
    """Bootstrap compute with replacement. Wrapper of
    py:func:`bootstrap_compute` for hindcasts.

    Args:
        hind (xr.Dataset): prediction ensemble.
        hist (xr.Dataset): historical/uninitialized simulation.
        reference (xr.Dataset): reference simulation.
        metric (str): `metric`. Defaults to 'pearson_r'.
        comparison (str): `comparison`. Defaults to 'e2r'.
        dim (str): dimension to apply metric over. Defaults to 'init'.
        sig (int): Significance level for uninitialized and initialized skill.
                   Defaults to 95.
        pers_sig (int): Significance level for persistence skill confidence
                        levels. Defaults to sig.
        bootstrap (int): number of resampling iterations (bootstrap with
                         replacement). Defaults to 500.
        **metric_kwargs (dict): additional keywords to be passed to metric
            (see the arguments required for a given metric in :ref:`Metrics`).

    Returns:
        results: (xr.Dataset): bootstrapped results
            * init_ci (xr.Dataset): confidence levels of init_skill
            * uninit_ci (xr.Dataset): confidence levels of uninit_skill
            * p_uninit_over_init (xr.Dataset): p-value of the hypothesis that
                the difference of skill between the initialized and
                uninitialized simulations is smaller or equal to zero based on
                bootstrapping with replacement. Defaults to None.
            * pers_ci (xr.Dataset): confidence levels of pers_skill
            * p_pers_over_init (xr.Dataset): p-value of the hypothesis that
                the difference of skill between the initialized and
                persistence simulations is smaller or equal to zero based on
                bootstrapping with replacement. Defaults to None.

    Reference:
        * Goddard, L., A. Kumar, A. Solomon, D. Smith, G. Boer, P. Gonzalez,
          V. Kharin, et al. “A Verification Framework for
          Interannual-to-Decadal Predictions Experiments.” Climate Dynamics
          40, no. 1–2 (January 1, 2013): 245–72. https://doi.org/10/f4jjvf.

    See also:
        * climpred.bootstrap.bootstrap_compute
        * climpred.prediction.compute_hindcast
    """
    return bootstrap_compute(
        hind,
        reference,
        hist=hist,
        metric=metric,
        comparison=comparison,
        dim=dim,
        sig=sig,
        bootstrap=bootstrap,
        pers_sig=pers_sig,
        compute=compute_hindcast,
        resample_uninit=bootstrap_uninitialized_ensemble,
        **metric_kwargs,
    )
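

# Usage sketch (not from the module above): a typical hindcast call, assuming
# ``hind`` (init, lead, member), ``hist`` (time, member) and ``reference``
# (time) are annual-mean xr.Datasets accepted by compute_hindcast.
def _example_bootstrap_hindcast(hind, hist, reference):
    return bootstrap_hindcast(
        hind, hist, reference, metric='rmse', comparison='e2r', bootstrap=100
    )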


def bootstrap_perfect_model(
    ds,
    control,
    metric='pearson_r',
    comparison='m2e',
    dim=None,
    sig=95,
    bootstrap=500,
    pers_sig=None,
    **metric_kwargs,
):
    """Bootstrap compute with replacement. Wrapper of
    py:func:`bootstrap_compute` for the perfect-model framework.

    Args:
        ds (xr.Dataset): prediction ensemble.
        control (xr.Dataset): control simulation.
        metric (str): `metric`. Defaults to 'pearson_r'.
        comparison (str): `comparison`. Defaults to 'm2e'.
        dim (str): dimension to apply metric over. Defaults to
                   ['init', 'member'].
        sig (int): Significance level for uninitialized and initialized skill.
                   Defaults to 95.
        pers_sig (int): Significance level for persistence skill confidence
                        levels. Defaults to sig.
        bootstrap (int): number of resampling iterations (bootstrap with
                         replacement). Defaults to 500.
        **metric_kwargs (dict): additional keywords to be passed to metric
            (see the arguments required for a given metric in :ref:`Metrics`).

    Returns:
        results: (xr.Dataset): bootstrapped results
            * init_ci (xr.Dataset): confidence levels of init_skill
            * uninit_ci (xr.Dataset): confidence levels of uninit_skill
            * p_uninit_over_init (xr.Dataset): p-value of the hypothesis that
                the difference of skill between the initialized and
                uninitialized simulations is smaller or equal to zero based on
                bootstrapping with replacement. Defaults to None.
            * pers_ci (xr.Dataset): confidence levels of pers_skill
            * p_pers_over_init (xr.Dataset): p-value of the hypothesis that
                the difference of skill between the initialized and
                persistence simulations is smaller or equal to zero based on
                bootstrapping with replacement. Defaults to None.

    Reference:
        * Goddard, L., A. Kumar, A. Solomon, D. Smith, G. Boer, P. Gonzalez,
          V. Kharin, et al. “A Verification Framework for
          Interannual-to-Decadal Predictions Experiments.” Climate Dynamics
          40, no. 1–2 (January 1, 2013): 245–72. https://doi.org/10/f4jjvf.

    See also:
        * climpred.bootstrap.bootstrap_compute
        * climpred.prediction.compute_perfect_model
    """
    if dim is None:
        dim = ['init', 'member']
    return bootstrap_compute(
        ds,
        control,
        hist=None,
        metric=metric,
        comparison=comparison,
        dim=dim,
        sig=sig,
        bootstrap=bootstrap,
        pers_sig=pers_sig,
        compute=compute_perfect_model,
        resample_uninit=bootstrap_uninit_pm_ensemble_from_control,
        **metric_kwargs,
    )
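

# Usage sketch (not from the module above): a typical perfect-model call,
# assuming ``ds`` (init, lead, member) and ``control`` (time) are annual-mean
# xr.Datasets accepted by compute_perfect_model.
def _example_bootstrap_perfect_model(ds, control):
    return bootstrap_perfect_model(
        ds, control, metric='rmse', comparison='m2e', bootstrap=100
    )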