Source code for climpred.classes

import warnings
from copy import deepcopy

import cf_xarray
import numpy as np
import xarray as xr
from dask import is_dask_collection
from IPython.display import display_html
from xarray.core.formatting_html import dataset_repr
from xarray.core.options import OPTIONS as XR_OPTIONS
from xarray.core.utils import Frozen

from .alignment import return_inits_and_verif_dates
from .bias_removal import bias_correction, gaussian_bias_removal, xclim_sdba
from .bootstrap import (
    bootstrap_hindcast,
    bootstrap_perfect_model,
    bootstrap_uninit_pm_ensemble_from_control_cftime,
)
from .checks import (
    _check_valid_reference,
    _check_valud_alignment,
    attach_long_names,
    attach_standard_names,
    has_dataset,
    has_dims,
    has_valid_lead_units,
    is_xarray,
    match_calendars,
    match_initialized_dims,
    match_initialized_vars,
    rename_to_climpred_dims,
)
from .constants import (
    BIAS_CORRECTION_BIAS_CORRECTION_METHODS,
    BIAS_CORRECTION_TRAIN_TEST_SPLIT_METHODS,
    CLIMPRED_DIMS,
    CONCAT_KWARGS,
    CROSS_VALIDATE_METHODS,
    INTERNAL_BIAS_CORRECTION_METHODS,
    M2M_MEMBER_DIM,
    XCLIM_BIAS_CORRECTION_METHODS,
)
from .exceptions import DimensionError, VariableError
from .graphics import plot_ensemble_perfect_model, plot_lead_timeseries_hindcast
from .logging import log_compute_hindcast_header
from .options import OPTIONS
from .prediction import (
    _apply_metric_at_given_lead,
    _get_metric_comparison_dim,
    compute_perfect_model,
)
from .reference import compute_climatology, compute_persistence
from .smoothing import (
    _reset_temporal_axis,
    smooth_goddard_2013,
    spatial_smoothing_xesmf,
    temporal_smoothing,
)
from .utils import (
    broadcast_metric_kwargs_for_rps,
    convert_time_index,
    convert_Timedelta_to_lead_units,
)


def _display_metadata(self):
    """
    This is called in the following case:

    ```
    dp = cp.HindcastEnsemble(dple)
    print(dp)
    ```
    """
    SPACE = "    "
    header = f"<climpred.{type(self).__name__}>"
    summary = header + "\nInitialized Ensemble:\n"
    summary += SPACE + str(self._datasets["initialized"].data_vars)[18:].strip() + "\n"
    if isinstance(self, HindcastEnsemble):
        # Prints out observations and associated variables if they exist.
        # If not, just write "None".
        summary += "Observations:\n"
        if any(self._datasets["observations"]):
            num_obs = len(self._datasets["observations"].data_vars)
            for i in range(1, num_obs + 1):
                summary += (
                    SPACE
                    + str(self._datasets["observations"].data_vars)
                    .split("\n")[i]
                    .strip()
                    + "\n"
                )
        else:
            summary += SPACE + "None\n"
    elif isinstance(self, PerfectModelEnsemble):
        summary += "Control:\n"
        # Prints out control variables if a control is appended. If not,
        # just write "None".
        if any(self._datasets["control"]):
            num_ctrl = len(self._datasets["control"].data_vars)
            for i in range(1, num_ctrl + 1):
                summary += (
                    SPACE
                    + str(self._datasets["control"].data_vars).split("\n")[i].strip()
                    + "\n"
                )
        else:
            summary += SPACE + "None\n"
    if any(self._datasets["uninitialized"]):
        summary += "Uninitialized:\n"
        summary += SPACE + str(self._datasets["uninitialized"].data_vars)[18:].strip()
    else:
        summary += "Uninitialized:\n"
        summary += SPACE + "None"
    return summary


def _display_metadata_html(self):
    header = f"<h4>climpred.{type(self).__name__}</h4>"
    display_html(header, raw=True)
    init_repr_str = dataset_repr(self._datasets["initialized"])
    init_repr_str = init_repr_str.replace("xarray.Dataset", "Initialized Ensemble")
    display_html(init_repr_str, raw=True)

    if isinstance(self, HindcastEnsemble):
        if any(self._datasets["observations"]):
            obs_repr_str = dataset_repr(self._datasets["observations"])
            obs_repr_str = obs_repr_str.replace("xarray.Dataset", "Observations")
            display_html(obs_repr_str, raw=True)
    elif isinstance(self, PerfectModelEnsemble):
        if any(self._datasets["control"]):
            control_repr_str = dataset_repr(self._datasets["control"])
            control_repr_str = control_repr_str.replace(
                "xarray.Dataset", "Control Simulation"
            )
            display_html(control_repr_str, raw=True)

    if any(self._datasets["uninitialized"]):
        uninit_repr_str = dataset_repr(self._datasets["uninitialized"])
        uninit_repr_str = uninit_repr_str.replace("xarray.Dataset", "Uninitialized")
        display_html(uninit_repr_str, raw=True)
    # better would be to aggregate repr_strs and then all return but this fails
    # TypeError: __repr__ returned non-string (type NoneType)
    # workaround return empty string
    return ""


class PredictionEnsemble:
    """
    The main object. This is the super of both `PerfectModelEnsemble` and
    `HindcastEnsemble`. This cannot be called directly by a user, but
    should house functions that both ensemble types can use.
    """

    @is_xarray(1)
    def __init__(self, xobj):
        if isinstance(xobj, xr.DataArray):
            # makes applying prediction functions easier, etc.
            xobj = xobj.to_dataset()
        xobj = rename_to_climpred_dims(xobj)
        has_dims(xobj, ["init", "lead"], "PredictionEnsemble")
        # Check that init is int, cftime, or datetime; convert ints or cftime to
        # datetime.
        xobj = convert_time_index(xobj, "init", "xobj[init]")
        # Put this after `convert_time_index` since it assigns 'years' attribute if the
        # `init` dimension is a `float` or `int`.
        xobj = convert_Timedelta_to_lead_units(xobj)
        has_valid_lead_units(xobj)
        # add metadata
        xobj = attach_standard_names(xobj)
        xobj = attach_long_names(xobj)
        xobj = xobj.cf.add_canonical_attributes(
            verbose=False, override=True, skip="units"
        )
        del xobj.attrs["history"]
        # Add initialized dictionary and reserve sub-dictionary for an uninitialized
        # run.
        self._datasets = {"initialized": xobj, "uninitialized": {}}
        self.kind = "prediction"
        self._temporally_smoothed = None
        self._is_annual_lead = None
        self._warn_if_chunked_along_init_member_time()

    @property
    def coords(self):
        """Dictionary of xarray.DataArray objects corresponding to coordinate
        variables available in all PredictionEnsemble._datasets.
        """
        pe_coords = self.get_initialized().coords.to_dataset()
        for ds in self._datasets.values():
            if isinstance(ds, xr.Dataset):
                pe_coords.update(ds.coords.to_dataset())
        return pe_coords.coords

    @property
    def nbytes(self) -> int:
        """Bytes sizes of all PredictionEnsemble._datasets."""
        return sum(
            [
                sum(v.nbytes for v in ds.variables.values())
                for ds in self._datasets.values()
                if isinstance(ds, xr.Dataset)
            ]
        )

    @property
    def sizes(self):
        """Mapping from dimension names to lengths for all PredictionEnsemble._datasets."""
        pe_dims = dict(self.get_initialized().dims)
        for ds in self._datasets.values():
            if isinstance(ds, xr.Dataset):
                pe_dims.update(dict(ds.dims))
        return pe_dims

    @property
    def dims(self):
        """Mapping from dimension names to lengths all PredictionEnsemble._datasets."""
        return Frozen(self.sizes)

    @property
    def chunks(self):
        """Mapping from chunks all PredictionEnsemble._datasets."""
        pe_chunks = dict(self.get_initialized().chunks)
        for ds in self._datasets.values():
            if isinstance(ds, xr.Dataset):
                for d in ds.chunks:
                    if d not in pe_chunks:
                        pe_chunks.update({d: ds.chunks[d]})
        return Frozen(pe_chunks)

    @property
    def data_vars(self):
        """Dictionary of DataArray objects corresponding to data variables available in all PredictionEnsemble._datasets."""
        varset = set(self.get_initialized().data_vars)
        for ds in self._datasets.values():
            if isinstance(ds, xr.Dataset):
                # take union
                varset = varset & set(ds.data_vars)
        varlist = list(varset)
        return self.get_initialized()[varlist].data_vars

    # when you just print it interactively
    # https://stackoverflow.com/questions/1535327/how-to-print-objects-of-class-using-print
    def __repr__(self):
        if XR_OPTIONS["display_style"] == "html":
            return _display_metadata_html(self)
        else:
            return _display_metadata(self)

    def __len__(self):
        """Number of all variables in all PredictionEnsemble._datasets."""
        return len(self.data_vars)

    def __iter__(self):
        """Iterate over underlying xr.Datasets for initialized, uninitialized, observations."""
        return iter(self._datasets.values())

    def __delitem__(self, key):
        """Remove a variable from this PredictionEnsemble."""
        del self._datasets["initialized"][key]
        for ds in self._datasets.values():
            if isinstance(ds, xr.Dataset):
                if key in ds.data_vars:
                    del ds[key]

    def __contains__(self, key):
        """The 'in' operator will return true or false depending on whether
        'key' is an array in all PredictionEnsemble._datasets or not.
        """
        contained = True
        for ds in self._datasets.values():
            if isinstance(ds, xr.Dataset):
                if key not in ds.data_vars:
                    contained = False
        return contained

    def equals(self, other):
        """Two PredictionEnsembles are equal if they have matching variables and
        coordinates, all of which are equal.
        PredictionEnsembles can still be equal (like pandas objects) if they have NaN
        values in the same locations.
        This method is necessary because `v1 == v2` for ``PredictionEnsembles``
        does element-wise comparisons (like numpy.ndarrays)."""
        if not isinstance(other, PredictionEnsemble):
            return False
        if other.kind != self.kind:
            return False
        equal = True
        try:
            for ds_name in self._datasets.keys():
                if isinstance(self._datasets[ds_name], xr.Dataset):
                    if not self._datasets[ds_name].equals(other._datasets[ds_name]):
                        equal = False
        except Exception:
            return False
        return equal

    def identical(self, other):
        """Like equals, but also checks all dataset attributes and the
        attributes on all variables and coordinates."""
        if not isinstance(other, PredictionEnsemble):
            return False
        if other.kind != self.kind:
            return False
        id = True
        try:
            for ds_name in self._datasets.keys():
                if not self._datasets[ds_name].identical(other._datasets[ds_name]):
                    id = False
        except Exception:
            return False
        return id

[docs] def plot(self, variable=None, ax=None, show_members=False, cmap=None): """Plot datasets from PredictionEnsemble. Args: variable (str or None): `variable` to show. Defaults to first in data_vars. ax (plt.axes): Axis to use in plotting. By default, creates a new axis. show_members (bool): whether to display all members individually. Defaults to False. cmap (str): Name of matplotlib-recognized colorbar. Defaults to `jet` for `HindcastEnsemble` and `tab10` for `PerfectModelEnsemble`. Returns: ax: plt.axes """ if self.kind == "hindcast": if cmap is None: cmap = "jet" return plot_lead_timeseries_hindcast( self, variable=variable, ax=ax, show_members=show_members, cmap=cmap ) elif self.kind == "perfect": if cmap is None: cmap = "tab10" return plot_ensemble_perfect_model( self, variable=variable, ax=ax, show_members=show_members, cmap=cmap )
def _math(self, other, operator): """Helper function for __add__, __sub__, __mul__, __truediv__. Allows math operations with type: - int - float - np.ndarray - xr.DataArray without new dimensions - xr.Dataset without new dimensions or variables """ assert isinstance(operator, str) def add(a, b): return a + b def sub(a, b): return a - b def mul(a, b): return a * b def div(a, b): return a / b ALLOWED_TYPES_FOR_MATH_OPERATORS = [ int, float, np.ndarray, xr.DataArray, xr.Dataset, type(self), ] OPERATOR_STR = { "add": "+", "sub": "-", "mul": "*", "div": "/", } error_str = f"Cannot use {type(self)} {OPERATOR_STR[operator]} {type(other)}" # catch undefined types for other if not isinstance(other, tuple(ALLOWED_TYPES_FOR_MATH_OPERATORS)): raise TypeError( f"{error_str} because type {type(other)} not supported. " f"Please choose from {ALLOWED_TYPES_FOR_MATH_OPERATORS}." ) # catch other dimensions in other if isinstance(other, tuple([xr.Dataset, xr.DataArray])): if not set(other.dims).issubset(self._datasets["initialized"].dims): raise DimensionError(f"{error_str} containing new dimensions.") # catch xr.Dataset with different data_vars if isinstance(other, xr.Dataset): if list(other.data_vars) != list(self._datasets["initialized"].data_vars): raise VariableError( f"{error_str} with new `data_vars`. Please use {type(self)} " f"{operator} {type(other)} only with same `data_vars`. Found " f"initialized.data_vars = " f' {list(self._datasets["initialized"].data_vars)} vs. ' f"other.data_vars = {list(other.data_vars)}." ) operator = eval(operator) if isinstance(other, PredictionEnsemble): # Create temporary copy to modify to avoid inplace operation. datasets = self._datasets.copy() for dataset in datasets: # Some pre-allocated entries might be empty, such as 'uninitialized' if isinstance(other._datasets[dataset], xr.Dataset) and isinstance( self._datasets[dataset], xr.Dataset ): datasets[dataset] = operator( datasets[dataset], other._datasets[dataset] ) return self._construct_direct(datasets, kind=self.kind) else: return self._apply_func(operator, other) def __add__(self, other): return self._math(other, operator="add") def __sub__(self, other): return self._math(other, operator="sub") def __mul__(self, other): return self._math(other, operator="mul") def __truediv__(self, other): return self._math(other, operator="div") def __getitem__(self, varlist): """Allows subsetting data variable from PredictionEnsemble as from xr.Dataset. Args: * varlist (list of str, str): list of names or name of data variable(s) to subselect """ if isinstance(varlist, str): varlist = [varlist] if not isinstance(varlist, list): raise ValueError( "Please subset PredictionEnsemble as you would subset an xr.Dataset " "with a list or single string of variable name(s), found " f"{type(varlist)}." ) def sel_vars(ds, varlist): return ds[varlist] return self._apply_func(sel_vars, varlist) def __getattr__(self, name): """Allows for xarray methods to be applied to our prediction objects. Args: * name: Function, e.g., .isel() or .sum(). """ def wrapper(*args, **kwargs): """Applies arbitrary function to all datasets in the PredictionEnsemble object. Got this from: https://stackoverflow.com/questions/41919499/ how-to-call-undefined-methods-sequentially-in-python-class """ def _apply_xr_func(v, name, *args, **kwargs): """Handles exceptions in our dictionary comprehension. In other words, this will skip applying the arbitrary function to a sub-dataset if a ValueError is thrown. This specifically targets cases where certain datasets don't have the given dim that's being called. E.g., ``.isel(lead=0)`` should only be applied to the initialized dataset. Reference: * https://stackoverflow.com/questions/1528237/ how-to-handle-exceptions-in-a-list-comprehensions """ try: return getattr(v, name)(*args, **kwargs) # ValueError : Cases such as .sum(dim='time'). This doesn't apply # it to the given dataset if the dimension doesn't exist. # KeyError : Cases where a function calls the index of a Dataset. Such # as ds[dim] and the dim doesn't exist as a key. # DimensionError: This accounts for our custom error when applying # some stats functions. except (ValueError, KeyError, DimensionError) as e: if args == tuple(): func_name = False else: if callable(args[0]): func_name = args[0].__name__ else: # for xarray calls like pe.mean() func_name = False dim = kwargs.get("dim", False) error_type = type(e).__name__ if func_name: if len(args) > 1: msg = f"{func_name}({args[1:]}, {kwargs}) failed\n{error_type}: {e}" else: msg = f"{func_name}({kwargs}) failed\n{error_type}: {e}" else: msg = f"xr.{name}({args}, {kwargs}) failed\n{error_type}: {e}" if set(["lead", "init"]).issubset(set(v.dims)): # initialized if dim not in v.dims: if OPTIONS["warn_for_failed_PredictionEnsemble_xr_call"]: warnings.warn(f"Error due to initialized: {msg}") elif set(["time"]).issubset( set(v.dims) ): # uninitialized, control, verification if dim not in v.dims: if OPTIONS["warn_for_failed_PredictionEnsemble_xr_call"]: warnings.warn( f"Error due to verification/control/uninitialized: {msg}" ) else: if OPTIONS["warn_for_failed_PredictionEnsemble_xr_call"]: warnings.warn(msg) return v return self._apply_func(_apply_xr_func, name, *args, **kwargs) return wrapper @classmethod def _construct_direct(cls, datasets, kind): """Shortcut around __init__ for internal use to avoid inplace operations. Pulled from xarrray Dataset class. https://github.com/pydata/xarray/blob/master/xarray/core/dataset.py """ obj = object.__new__(cls) obj._datasets = datasets obj.kind = kind obj._warn_if_chunked_along_init_member_time() return obj def _apply_func(self, func, *args, **kwargs): """Apply a function to all datasets in a `PredictionEnsemble`.""" # Create temporary copy to modify to avoid inplace operation. datasets = self._datasets.copy() # More explicit than nested dictionary comprehension. for key, ds in datasets.items(): # If ds is xr.Dataset, apply the function directly to it. else, e.g. for {} ignore if isinstance(ds, xr.Dataset): dim = kwargs.get("dim", "") if "_or_" in dim: dims = dim.split("_or_") if set(dims).issubset(ds.dims): raise ValueError( f"{dims} cannot be both in {key} dataset, found {ds.dims}" ) kwargs_dim0 = kwargs.copy() kwargs_dim0["dim"] = dims[0] kwargs_dim1 = kwargs.copy() kwargs_dim1["dim"] = dims[1] if dims[0] in ds.dims and dims[1] not in ds.dims: datasets.update({key: func(ds, *args, **kwargs_dim0)}) if dims[1] in ds.dims and dims[0] not in ds.dims: datasets.update({key: func(ds, *args, **kwargs_dim1)}) else: datasets.update({key: func(ds, *args, **kwargs)}) # Instantiates new object with the modified datasets. return self._construct_direct(datasets, kind=self.kind) def get_initialized(self): """Returns the xarray dataset for the initialized ensemble.""" return self._datasets["initialized"] def get_uninitialized(self): """Returns the xarray dataset for the uninitialized ensemble.""" return self._datasets["uninitialized"] def smooth(self, smooth_kws=None, how="mean", **xesmf_kwargs): """Smooth all entries of PredictionEnsemble in the same manner to be able to still calculate prediction skill afterwards. Args: smooth_kws (dict or str): Dictionary to specify the dims to smooth compatible with :py:func:`~climpred.smoothing.spatial_smoothing_xesmf` or :py:func:`~climpred.smoothing.temporal_smoothing`. Shortcut for Goddard et al. 2013 recommendations: 'goddard2013'. Defaults to None. how (str): how to smooth temporally. From ['mean','sum']. Defaults to 'mean'. **xesmf_kwargs (args): kwargs passed to :py:func:`~climpred.smoothing.spatial_smoothing_xesmf` Examples: >>> PerfectModelEnsemble.get_initialized().lead.size 20 >>> PerfectModelEnsemble.smooth({'lead':4}, how='sum').get_initialized().lead.size 17 >>> HindcastEnsemble_3D.smooth({'lon':1, 'lat':1}) <climpred.HindcastEnsemble> Initialized Ensemble: SST (init, lead, lat, lon) float64 -0.3236 -0.3161 -0.3083 ... 0.0 0.0 Observations: SST (time, lat, lon) float64 0.002937 0.001561 0.002587 ... 0.0 0.0 0.0 Uninitialized: None ``smooth`` simultaneously aggregates spatially listening to ``lon`` and ``lat`` and temporally listening to ``lead`` or ``time``. >>> HindcastEnsemble_3D.smooth({'lead': 2, 'lat': 5, 'lon': 4}).get_initialized().coords Coordinates: * init (init) object 1954-01-01 00:00:00 ... 2017-01-01 00:00:00 * lead (lead) int32 1 2 3 4 5 6 7 8 9 * lon (lon) float64 250.8 254.8 258.8 262.8 * lat (lat) float64 -9.75 -4.75 >>> HindcastEnsemble_3D.smooth('goddard2013').get_initialized().coords Coordinates: * init (init) object 1954-01-01 00:00:00 ... 2017-01-01 00:00:00 * lead (lead) int32 1 2 3 4 5 6 7 * lon (lon) float64 250.8 255.8 260.8 265.8 * lat (lat) float64 -9.75 -4.75 """ if not smooth_kws: return self # get proper smoothing function based on smooth args if isinstance(smooth_kws, str): if "goddard" in smooth_kws: if self._is_annual_lead: smooth_fct = smooth_goddard_2013 tsmooth_kws = {"lead": 4} # default d_lon_lat_kws = {"lon": 5, "lat": 5} # default else: raise ValueError( "`goddard2013` smoothing only available for annual leads." ) else: raise ValueError( 'Please provide from list of available smoothings: \ ["goddard2013"]' ) # TODO: actively searches for lot and lat in dims. Maybe this part of the code # could be more robust in how it finds these two spatial dimensions regardless # of name. Optional work in progress comment. elif isinstance(smooth_kws, dict): non_time_dims = [ dim for dim in smooth_kws.keys() if dim not in ["time", "lead"] ] if len(non_time_dims) > 0: non_time_dims = non_time_dims[0] # goddard when time_dim and lon/lat given if ("lon" in smooth_kws or "lat" in smooth_kws) and ( "lead" in smooth_kws or "time" in smooth_kws ): smooth_fct = smooth_goddard_2013 # separate lon, lat keywords into d_lon_lat_kws d_lon_lat_kws = dict() tsmooth_kws = dict() for c in ["lon", "lat"]: if c in smooth_kws: d_lon_lat_kws[c] = smooth_kws[c] for c in ["lead", "time"]: if c in smooth_kws: tsmooth_kws[c] = smooth_kws[c] # else only one smoothing operation elif "lon" in smooth_kws or "lat" in smooth_kws: smooth_fct = spatial_smoothing_xesmf d_lon_lat_kws = smooth_kws tsmooth_kws = None elif "lead" in smooth_kws or "time" in smooth_kws: smooth_fct = temporal_smoothing d_lon_lat_kws = None tsmooth_kws = smooth_kws else: raise ValueError( 'Please provide kwargs to fulfill functions: \ ["spatial_smoothing_xesmf", "temporal_smoothing"].' ) else: raise ValueError( "Please provide kwargs as dict or str and not", type(smooth_kws) ) self = self.map( smooth_fct, tsmooth_kws=tsmooth_kws, d_lon_lat_kws=d_lon_lat_kws, how=how, **xesmf_kwargs, ) if smooth_fct == smooth_goddard_2013 or smooth_fct == temporal_smoothing: self._temporally_smoothed = tsmooth_kws return self def _warn_if_chunked_along_init_member_time(self): """Warn upon instantiation when CLIMPRED_DIMS except ``lead`` are chunked with more than one chunk to show how to circumvent ``xskillscore`` chunking ``ValueError``.""" suggest_one_chunk = [] for d in self.chunks: if d in ["time", "init", "member"]: if len(self.chunks[d]) > 1: suggest_one_chunk.append(d) if len(suggest_one_chunk) > 0: name = ( str(type(self)) .replace("<class 'climpred.classes.", "") .replace("'>", "") ) # init cannot be dim when time chunked suggest_one_chunk_time_to_init = suggest_one_chunk.copy() if "time" in suggest_one_chunk_time_to_init: suggest_one_chunk_time_to_init.remove("time") suggest_one_chunk_time_to_init.append("init") msg = f"{name} is chunked along dimensions {suggest_one_chunk} with more than one chunk. `{name}.chunks={self.chunks}`.\nYou cannot call `{name}.verify` or `{name}.bootstrap` in combination with any of {suggest_one_chunk_time_to_init} passed as `dim`. In order to do so, please rechunk {suggest_one_chunk} with `{name}.chunk({{dim:-1}}).verify(dim=dim).`\nIf you do not want to use dimensions {suggest_one_chunk_time_to_init} in `{name}.verify(dim=dim)`, you can disregard this warning." # chunk lead:1 in HindcastEnsemble if self.kind == "hindcast": msg += '\nIn `HindcastEnsemble`s you may also create one chunk per lead, as the `climpred` internally loops over lead, e.g. `.chunk({{"lead": 1}}).verify().`' # chunk auto on non-climpred dims ndims = list(self.sizes) for d in CLIMPRED_DIMS: if d in ndims: ndims.remove(d) if len(ndims) > 0: msg += f'\nConsider chunking embarassingly parallel dimensions such as {ndims} automatically, i.e. `{name}.chunk({ndims[0]}="auto").verify(...).' warnings.warn(msg)
[docs]class PerfectModelEnsemble(PredictionEnsemble): """An object for "perfect model" climate prediction ensembles. `PerfectModelEnsemble` is a sub-class of `PredictionEnsemble`. It tracks the control run used to initialize the ensemble for easy computations, bootstrapping, etc. This object is built on `xarray` and thus requires the input object to be an `xarray` Dataset or DataArray. """
[docs] def __init__(self, xobj): """Create a `PerfectModelEnsemble` object by inputting output from the control run in `xarray` format. Args: xobj (xarray object): decadal prediction ensemble output. Attributes: control: Dictionary of control run associated with the initialized ensemble. uninitialized: Dictionary of uninitialized run that is bootstrapped from the initialized run. """ super().__init__(xobj) # Reserve sub-dictionary for the control simulation. self._datasets.update({"control": {}}) self.kind = "perfect"
def _apply_climpred_function(self, func, input_dict=None, **kwargs): """Helper function to loop through observations and apply an arbitrary climpred function. Args: func (function): climpred function to apply to object. input_dict (dict): dictionary with the following things: * ensemble: initialized or uninitialized ensemble. * control: control dictionary from HindcastEnsemble. * init (bool): True if the initialized ensemble, False if uninitialized. """ ensemble = input_dict["ensemble"] control = input_dict["control"] init = input_dict["init"] init_vars, ctrl_vars = self._vars_to_drop(init=init) ensemble = ensemble.drop_vars(init_vars) if control: control = control.drop_vars(ctrl_vars) return func(ensemble, control, **kwargs) def _vars_to_drop(self, init=True): """Returns list of variables to drop when comparing initialized/uninitialized to a control. This is useful if the two products being compared do not share the same variables. I.e., if the control has ['SST'] and the initialized has ['SST', 'SALT'], this will return a list with ['SALT'] to be dropped from the initialized. Args: init (bool, default True): If `True`, check variables on the initialized. If `False`, check variables on the uninitialized. Returns: Lists of variables to drop from the initialized/uninitialized and control Datasets. """ init_str = "initialized" if init else "uninitialized" init_vars = list(self._datasets[init_str]) # only drop if control present if self._datasets["control"]: ctrl_vars = list(self._datasets["control"]) # Make lists of variables to drop that aren't in common # with one another. init_vars_to_drop = list(set(init_vars) - set(ctrl_vars)) ctrl_vars_to_drop = list(set(ctrl_vars) - set(init_vars)) else: init_vars_to_drop, ctrl_vars_to_drop = [], [] return init_vars_to_drop, ctrl_vars_to_drop
[docs] @is_xarray(1) def add_control(self, xobj): """Add the control run that initialized the climate prediction ensemble. Args: xobj (xarray object): Dataset/DataArray of the control run. """ # NOTE: These should all be decorators. if isinstance(xobj, xr.DataArray): xobj = xobj.to_dataset() match_initialized_dims(self._datasets["initialized"], xobj) match_initialized_vars(self._datasets["initialized"], xobj) # Check that init is int, cftime, or datetime; convert ints or cftime to # datetime. xobj = convert_time_index(xobj, "time", "xobj[init]") # Check that converted/original cftime calendar is the same as the # initialized calendar to avoid any alignment errors. match_calendars(self._datasets["initialized"], xobj, kind2="control") datasets = self._datasets.copy() datasets.update({"control": xobj}) return self._construct_direct(datasets, kind="perfect")
[docs] def generate_uninitialized(self): """Generate an uninitialized ensemble by bootstrapping the initialized prediction ensemble. Returns: Bootstrapped (uninitialized) ensemble as a Dataset. """ has_dataset( self._datasets["control"], "control", "generate an uninitialized ensemble." ) uninit = bootstrap_uninit_pm_ensemble_from_control_cftime( self._datasets["initialized"], self._datasets["control"] ) datasets = self._datasets.copy() datasets.update({"uninitialized": uninit}) return self._construct_direct(datasets, kind="perfect")
[docs] def get_control(self): """Returns the control as an xarray dataset.""" return self._datasets["control"]
[docs] def verify( self, metric=None, comparison=None, dim=None, reference=None, **metric_kwargs, ): """Verify initialized predictions against a configuration of other ensemble members. .. note:: The configuration of the other ensemble members is based off of the ``comparison`` keyword argument. Args: metric (str, :py:class:`~climpred.metrics.Metric`): Metric to apply in the comparison. See `metrics </metrics.html>`_. comparison (str, :py:class:`~climpred.comparisons.Comparison`): How to compare the initialized prediction ensemble with itself, see `comparisons </comparisons.html>`_. dim (str, list of str): Dimension(s) over which to apply ``metric``. ``dim`` is passed on to xskillscore.{metric} and includes xskillscore's ``member_dim``. ``dim`` should contain ``member`` when ``comparison`` is probabilistic but should not contain ``member`` when ``comparison=e2c``. Defaults to ``None`` meaning that all dimensions other than ``lead`` are reduced. reference (str, list of str): Type of reference forecasts with which to verify. One or more of ['uninitialized', 'persistence', 'climatology']. **metric_kwargs (optional): Arguments passed to ``metric``. Returns: Dataset with dimension skill reduced by dim containing initialized and reference skill(s) if specified. Example: Root mean square error (``rmse``) comparing every member with the ensemble mean forecast (``m2e``) for all leads reducing dimensions ``init`` and ``member``: >>> PerfectModelEnsemble.verify(metric='rmse', comparison='m2e', ... dim=['init','member']) <xarray.Dataset> Dimensions: (lead: 20) Coordinates: * lead (lead) int64 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 Data variables: tos (lead) float32 0.1028 0.1249 0.1443 0.1707 ... 0.2113 0.2452 0.2297 Pearson's Anomaly Correlation ('acc') comparing every member to every other member (``m2m``) reducing dimensions ``member`` and ``init`` while also calculating reference skill for the ``persistence``, ``climatology`` and ``uninitialized`` forecast. >>> PerfectModelEnsemble.verify(metric='acc', comparison='m2m', ... dim=['init', 'member'], ... reference=['persistence', 'climatology' ,'uninitialized']) <xarray.Dataset> Dimensions: (skill: 4, lead: 20) Coordinates: * lead (lead) int64 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 * skill (skill) <U13 'initialized' 'persistence' ... 'uninitialized' Data variables: tos (skill, lead) float64 0.7941 0.7489 0.5623 ... 0.1327 0.4547 0.3253 """ reference = _check_valid_reference(reference) input_dict = { "ensemble": self._datasets["initialized"], "control": self._datasets["control"] if isinstance(self._datasets["control"], xr.Dataset) else None, "init": True, } result = self._apply_climpred_function( compute_perfect_model, input_dict=input_dict, metric=metric, comparison=comparison, dim=dim, add_attrs=False, **metric_kwargs, ) if self._temporally_smoothed: result = _reset_temporal_axis(result, self._temporally_smoothed, dim="lead") result["lead"].attrs = self.get_initialized().lead.attrs # compute reference skills if reference: for r in reference: dim_orig = deepcopy(dim) # preserve dim, because ref_compute_kwargs = metric_kwargs.copy() # persistence changes dim ref_compute_kwargs.update({"dim": dim_orig, "metric": metric}) if r != "persistence": ref_compute_kwargs["comparison"] = comparison ref = getattr(self, f"_compute_{r}")(**ref_compute_kwargs) result = xr.concat([result, ref], dim="skill", **CONCAT_KWARGS) result = result.assign_coords(skill=["initialized"] + reference) return result.squeeze()
def _compute_uninitialized( self, metric=None, comparison=None, dim=None, **metric_kwargs ): """Verify the bootstrapped uninitialized run against itself. .. note:: The configuration of the other ensemble members is based off of the ``comparison`` keyword argument. Args: metric (str, :py:class:`~climpred.metrics.Metric`): Metric to apply in the comparison. See `metrics </metrics.html>`_. comparison (str, :py:class:`~climpred.comparisons.Comparison`): How to compare the uninitialized against itself, see `comparisons </comparisons.html>`_. dim (str, list of str): Dimension(s) over which to apply metric. ``dim`` is passed on to xskillscore.{metric} and includes xskillscore's ``member_dim``. ``dim`` should contain ``member`` when ``comparison`` is probabilistic but should not contain ``member`` when ``comparison=e2c``. Defaults to ``None``, meaning that all dimensions other than ``lead`` are reduced. **metric_kwargs (optional): Arguments passed to ``metric``. Returns: Dataset with dimension skill containing initialized and reference skill(s). """ has_dataset( self._datasets["uninitialized"], "uninitialized", "compute an uninitialized metric", ) input_dict = { "ensemble": self._datasets["uninitialized"], "control": self._datasets["control"] if isinstance(self._datasets["control"], xr.Dataset) else None, "init": False, } if dim is None: dim = list(self._datasets["initialized"].isel(lead=0).dims) res = self._apply_climpred_function( compute_perfect_model, input_dict=input_dict, metric=metric, comparison=comparison, dim=dim, **metric_kwargs, ) if self._temporally_smoothed: res = _reset_temporal_axis(res, self._temporally_smoothed, dim="lead") res["lead"].attrs = self.get_initialized().lead.attrs return res def _compute_persistence(self, metric=None, dim=None, **metric_kwargs): """Verify a simple persistence forecast of the control run against itself. Args: metric (str, :py:class:`~climpred.metrics.Metric`): Metric to use when verifying skill of the persistence forecast. See `metrics </metrics.html>`_. dim (str, list of str): Dimension(s) over which to apply metric. ``dim`` is passed on to xskillscore.{metric} and includes xskillscore's ``member_dim``. ``dim`` should contain ``member`` when ``comparison`` is probabilistic but should not contain ``member`` when ``comparison=e2c``. Defaults to ``None``, meaning that all dimensions other than ``lead`` are reduced. **metric_kwargs (optional): Arguments passed to ``metric``. Returns: Dataset of persistence forecast results. Reference: * Chapter 8 (Short-Term Climate Prediction) in Van den Dool, Huug. Empirical methods in short-term climate prediction. Oxford University Press, 2007. """ has_dataset( self._datasets["control"], "control", "compute a persistence forecast" ) input_dict = { "ensemble": self._datasets["initialized"], "control": self._datasets["control"], "init": True, } if dim is None: dim = list(self._datasets["initialized"].isel(lead=0).dims) res = self._apply_climpred_function( compute_persistence, input_dict=input_dict, metric=metric, alignment="same_inits", dim=dim, **metric_kwargs, ) if self._temporally_smoothed: res = _reset_temporal_axis(res, self._temporally_smoothed, dim="lead") res["lead"].attrs = self.get_initialized().lead.attrs return res def _compute_climatology( self, metric=None, comparison=None, dim=None, **metric_kwargs ): """Verify a climatology forecast of the control run against itself. Args: metric (str, :py:class:`~climpred.metrics.Metric`): Metric to use when verifying skill of the persistence forecast. See `metrics </metrics.html>`_. dim (str, list of str): Dimension(s) over which to apply metric. ``dim`` is passed on to xskillscore.{metric} and includes xskillscore's ``member_dim``. ``dim`` should contain ``member`` when ``comparison`` is probabilistic but should not contain ``member`` when ``comparison=e2c``. Defaults to ``None``, meaning that all dimensions other than ``lead`` are reduced. **metric_kwargs (optional): Arguments passed to ``metric``. Returns: Dataset of persistence forecast results. Reference: * Chapter 8 (Short-Term Climate Prediction) in Van den Dool, Huug. Empirical methods in short-term climate prediction. Oxford University Press, 2007. """ input_dict = { "ensemble": self._datasets["initialized"], "control": self._datasets["control"] if isinstance(self._datasets["control"], xr.Dataset) else None, "init": True, } if dim is None: dim = list(self.get_initialized().isel(lead=0).dims) res = self._apply_climpred_function( compute_climatology, input_dict=input_dict, metric=metric, comparison=comparison, dim=dim, **metric_kwargs, ) if self._temporally_smoothed: res = _reset_temporal_axis(res, self._temporally_smoothed, dim="lead") res["lead"].attrs = self.get_initialized().lead.attrs return res
[docs] def bootstrap( self, metric=None, comparison=None, dim=None, reference=None, iterations=None, sig=95, pers_sig=None, **metric_kwargs, ): """Bootstrap with replacement according to Goddard et al. 2013. Args: metric (str, :py:class:`~climpred.metrics.Metric`): Metric to verify bootstrapped skill, see `metrics </metrics.html>`_. comparison (str, :py:class:`~climpred.comparisons.Comparison`): Comparison passed to verify, see `comparisons </comparisons.html>`_. dim (str, list of str): Dimension(s) over which to apply metric. ``dim`` is passed on to xskillscore.{metric} and includes xskillscore's ``member_dim``. ``dim`` should contain ``member`` when ``comparison`` is probabilistic but should not contain ``member`` when ``comparison=e2c``. Defaults to ``None`` meaning that all dimensions other than ``lead`` are reduced. reference (str, list of str): Type of reference forecasts with which to verify. One or more of ['uninitialized', 'persistence', 'climatology']. If None or empty, returns no p value. iterations (int): Number of resampling iterations for bootstrapping with replacement. Recommended >= 500. sig (int, default 95): Significance level in percent for deciding whether uninitialized and persistence beat initialized skill. pers_sig (int): If not ``None``, the separate significance level for persistence. Defaults to ``None``, or the same significance as ``sig``. **metric_kwargs (optional): arguments passed to ``metric``. Returns: xr.Datasets: with dimensions ``results`` (holding ``verify skill``, ``p``, ``low_ci`` and ``high_ci``) and ``skill`` (holding ``initialized``, ``persistence`` and/or ``uninitialized``): * results='verify skill', skill='initialized': mean initialized skill * results='high_ci', skill='initialized': high confidence interval boundary for initialized skill * results='p', skill='uninitialized': p value of the hypothesis that the difference of skill between the initialized and uninitialized simulations is smaller or equal to zero based on bootstrapping with replacement. * results='p', skill='persistence': p value of the hypothesis that the difference of skill between the initialized and persistenceistence simulations is smaller or equal to zero based on bootstrapping with replacement. Reference: * Goddard, L., A. Kumar, A. Solomon, D. Smith, G. Boer, P. Gonzalez, V. Kharin, et al. “A Verification Framework for Interannual-to-Decadal Predictions Experiments.” Climate Dynamics 40, no. 1–2 (January 1, 2013): 245–72. https://doi.org/10/f4jjvf. Example: Calculate the Pearson's Anomaly Correlation ('acc') comparing every member to every other member (``m2m``) reducing dimensions ``member`` and ``init`` 50 times after resampling ``member`` dimension with replacement. Also calculate reference skill for the ``persistence``, ``climatology`` and ``uninitialized`` forecast and compare whether initialized skill is better than reference skill: Returns verify skill, probability that reference forecast performs better than initialized and the lower and upper bound of the resample. >>> PerfectModelEnsemble.bootstrap(metric='acc', comparison='m2m', ... dim=['init', 'member'], iterations=50, resample_dim='member', ... reference=['persistence', 'climatology' ,'uninitialized']) <xarray.Dataset> Dimensions: (skill: 4, results: 4, lead: 20) Coordinates: * lead (lead) int64 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 * results (results) <U12 'verify skill' 'p' 'low_ci' 'high_ci' * skill (skill) <U13 'initialized' 'persistence' ... 'uninitialized' Data variables: tos (skill, results, lead) float64 0.7941 0.7489 ... 0.1494 0.1466 Attributes: prediction_skill: calculated by climpred https://climpred.read... number_of_initializations: 12 number_of_members: 10 alignment: same_verifs metric: pearson_r comparison: m2m dim: ['init', 'member'] units: None confidence_interval_levels: 0.975-0.025 bootstrap_iterations: 50 p: probability that reference performs better t... """ if iterations is None: raise ValueError("Designate number of bootstrapping `iterations`.") reference = _check_valid_reference(reference) has_dataset(self._datasets["control"], "control", "iteration") input_dict = { "ensemble": self._datasets["initialized"], "control": self._datasets["control"], "init": True, } return self._apply_climpred_function( bootstrap_perfect_model, input_dict=input_dict, metric=metric, comparison=comparison, dim=dim, reference=reference, sig=sig, iterations=iterations, pers_sig=pers_sig, **metric_kwargs, )
[docs]class HindcastEnsemble(PredictionEnsemble): """An object for climate prediction ensembles initialized by a data-like product. `HindcastEnsemble` is a sub-class of `PredictionEnsemble`. It tracks a single verification dataset (i.e., observations) associated with the hindcast ensemble for easy computation across multiple variables. This object is built on `xarray` and thus requires the input object to be an `xarray` Dataset or DataArray. """
[docs] def __init__(self, xobj): """Create a `HindcastEnsemble` object by inputting output from a prediction ensemble in `xarray` format. Args: xobj (xarray object): decadal prediction ensemble output. Attributes: observations: Dictionary of verification data to associate with the decadal prediction ensemble. uninitialized: Dictionary of companion (or bootstrapped) uninitialized ensemble run. """ super().__init__(xobj) self._datasets.update({"observations": {}}) self.kind = "hindcast"
def _apply_climpred_function(self, func, init, **kwargs): """Helper function to loop through verification data and apply an arbitrary climpred function. Args: func (function): climpred function to apply to object. init (bool): Whether or not it's the initialized ensemble. """ hind = self._datasets["initialized"] verif = self._datasets["observations"] drop_init, drop_obs = self._vars_to_drop(init=init) return func(hind.drop_vars(drop_init), verif.drop_vars(drop_obs), **kwargs) def _vars_to_drop(self, init=True): """Returns list of variables to drop when comparing initialized/uninitialized to observations. This is useful if the two products being compared do not share the same variables. I.e., if the observations have ['SST'] and the initialized has ['SST', 'SALT'], this will return a list with ['SALT'] to be dropped from the initialized. Args: init (bool, default True): If ``True``, check variables on the initialized. If ``False``, check variables on the uninitialized. Returns: Lists of variables to drop from the initialized/uninitialized and observational Datasets. """ if init: init_vars = [var for var in self._datasets["initialized"].data_vars] else: init_vars = [var for var in self._datasets["uninitialized"].data_vars] obs_vars = [var for var in self._datasets["observations"].data_vars] # Make lists of variables to drop that aren't in common # with one another. init_vars_to_drop = list(set(init_vars) - set(obs_vars)) obs_vars_to_drop = list(set(obs_vars) - set(init_vars)) return init_vars_to_drop, obs_vars_to_drop
[docs] @is_xarray(1) def add_observations(self, xobj): """Add verification data against which to verify the initialized ensemble. Args: xobj (xarray object): Dataset/DataArray to append to the ``HindcastEnsemble`` object. """ if isinstance(xobj, xr.DataArray): xobj = xobj.to_dataset() match_initialized_dims(self._datasets["initialized"], xobj) match_initialized_vars(self._datasets["initialized"], xobj) # Check that time is int, cftime, or datetime; convert ints or cftime to # datetime. xobj = convert_time_index(xobj, "time", "xobj[init]") # Check that converted/original cftime calendar is the same as the # initialized calendar to avoid any alignment errors. match_calendars(self._datasets["initialized"], xobj) datasets = self._datasets.copy() datasets.update({"observations": xobj}) return self._construct_direct(datasets, kind="hindcast")
[docs] @is_xarray(1) def add_uninitialized(self, xobj): """Add a companion uninitialized ensemble for comparison to verification data. Args: xobj (xarray object): Dataset/DataArray of the uninitialzed ensemble. """ if isinstance(xobj, xr.DataArray): xobj = xobj.to_dataset() match_initialized_dims(self._datasets["initialized"], xobj, uninitialized=True) match_initialized_vars(self._datasets["initialized"], xobj) # Check that init is int, cftime, or datetime; convert ints or cftime to # datetime. xobj = convert_time_index(xobj, "time", "xobj[init]") # Check that converted/original cftime calendar is the same as the # initialized calendar to avoid any alignment errors. match_calendars(self._datasets["initialized"], xobj, kind2="uninitialized") datasets = self._datasets.copy() datasets.update({"uninitialized": xobj}) return self._construct_direct(datasets, kind="hindcast")
[docs] def get_observations(self): """Returns xarray Datasets of the observations/verification data. Returns: ``xarray`` Dataset of observations. """ return self._datasets["observations"]
[docs] def verify( self, reference=None, metric=None, comparison=None, dim=None, alignment=None, **metric_kwargs, ): """Verifies the initialized ensemble against observations. .. note:: This will automatically verify against all shared variables between the initialized ensemble and observations/verification data. Args: reference (str, list of str): Type of reference forecasts to also verify against the observations. Choose one or more of ['uninitialized', 'persistence', 'climatology']. Defaults to None. metric (str, :py:class:`~climpred.metrics.Metric`): Metric to apply for verification. see `metrics </metrics.html>`_. comparison (str, :py:class:`~climpred.comparisons.Comparison`): How to compare to the observations/verification data. See `comparisons </comparisons.html>`_. dim (str, list of str): Dimension(s) to apply metric over. ``dim`` is passed on to xskillscore.{metric} and includes xskillscore's ``member_dim``. ``dim`` should contain ``member`` when ``comparison`` is probabilistic but should not contain ``member`` when ``comparison=e2o``. Defaults to ``None`` meaning that all dimensions other than ``lead`` are reduced. alignment (str): which inits or verification times should be aligned? - 'maximize': maximize the degrees of freedom by slicing ``hind`` and ``verif`` to a common time frame at each lead. - 'same_inits': slice to a common init frame prior to computing metric. This philosophy follows the thought that each lead should be based on the same set of initializations. - 'same_verif': slice to a common/consistent verification time frame prior to computing metric. This philosophy follows the thought that each lead should be based on the same set of verification dates. **metric_kwargs (optional): arguments passed to ``metric``. Returns: Dataset with dimension skill reduced by dim containing initialized and reference skill(s) if specified. Example: Root mean square error (``rmse``) comparing every member with the verification (``m2o``) over the same verification time (``same_verifs``) for all leads reducing dimensions ``init`` and ``member``: >>> HindcastEnsemble.verify(metric='rmse', comparison='m2o', ... alignment='same_verifs', dim=['init','member']) <xarray.Dataset> Dimensions: (lead: 10) Coordinates: * lead (lead) int32 1 2 3 4 5 6 7 8 9 10 skill <U11 'initialized' Data variables: SST (lead) float64 0.08516 0.09492 0.1041 ... 0.1525 0.1697 0.1785 Pearson's Anomaly Correlation ('acc') comparing the ensemble mean with the verification (``e2o``) over the same initializations (``same_inits``) for all leads reducing dimension ``init`` while also calculating reference skill for the ``persistence``, ``climatology`` and ``uninitialized`` forecast. >>> HindcastEnsemble.verify(metric='acc', comparison='e2o', ... alignment='same_inits', dim='init', ... reference=['persistence', 'climatology' ,'uninitialized']) <xarray.Dataset> Dimensions: (skill: 4, lead: 10) Coordinates: * lead (lead) int32 1 2 3 4 5 6 7 8 9 10 * skill (skill) <U13 'initialized' 'persistence' ... 'uninitialized' Data variables: SST (skill, lead) float64 0.9023 0.8807 0.8955 ... 0.9078 0.9128 0.9159 """ # Have to do checks here since this doesn't call `compute_hindcast` directly. # Will be refactored when `climpred` migrates to inheritance-based. if dim is None: viable_dims = list(self.get_initialized().isel(lead=0).dims) + [[]] raise ValueError( "Designate a dimension to reduce over when applying the " f"metric. Got {dim}. Choose one or more of {viable_dims}" ) if ("member" in dim) and comparison not in ["m2o", "m2r"]: raise ValueError( "Comparison must equal 'm2o' with dim='member'. " f"Got comparison {comparison}." ) reference = _check_valid_reference(reference) def _verify( hind, verif, hist, reference, metric, comparison, alignment, dim, **metric_kwargs, ): """Interior verify func to be passed to apply func.""" metric, comparison, dim = _get_metric_comparison_dim( hind, metric, comparison, dim, kind=self.kind ) forecast, verif = comparison.function(hind, verif, metric=metric) if metric.name == "rps": # modify metric_kwargs for rps metric_kwargs = broadcast_metric_kwargs_for_rps( forecast, verif, metric_kwargs ) forecast = forecast.rename({"init": "time"}) inits, verif_dates = return_inits_and_verif_dates( forecast, verif, alignment, reference=reference, hist=hist, ) metric_over_leads = [ _apply_metric_at_given_lead( verif, verif_dates, lead, hind=forecast, hist=hist, inits=inits, # Ensure apply metric function returns skill and not reference # results. reference=None, metric=metric, comparison=comparison, dim=dim, **metric_kwargs, ) for lead in forecast["lead"].data ] result = xr.concat(metric_over_leads, dim="lead", **CONCAT_KWARGS) result["lead"] = forecast["lead"] if reference is not None: if "member" in verif.dims: # if broadcasted before verif = verif.isel(member=0) for r in reference: metric_over_leads = [ _apply_metric_at_given_lead( verif, verif_dates, lead, hind=forecast, hist=hist, inits=inits, reference=r, metric=metric, comparison=comparison, dim=dim, **metric_kwargs, ) for lead in forecast["lead"].data ] ref = xr.concat(metric_over_leads, dim="lead", **CONCAT_KWARGS) ref["lead"] = forecast["lead"] # fix to get no member dim for uninitialized e2o skill #477 if ( r == "uninitialized" and comparison.name == "e2o" and "member" in ref.dims ): ref = ref.mean("member") result = xr.concat([result, ref], dim="skill", **CONCAT_KWARGS) # rename back to 'init' if "time" in result.dims: result = result.rename({"time": "init"}) # Add dimension/coordinate for different references. result = result.assign_coords(skill=["initialized"] + reference) return result.squeeze() has_dataset( self._datasets["observations"], "observational", "verify a forecast" ) if "uninitialized" in reference: has_dataset( self._datasets["uninitialized"], "uninitialized", "compute an uninitialized reference forecast", ) hist = self._datasets["uninitialized"] else: hist = None res = self._apply_climpred_function( _verify, init=True, metric=metric, comparison=comparison, alignment=alignment, dim=dim, hist=hist, reference=reference, **metric_kwargs, ) if self._temporally_smoothed: res = _reset_temporal_axis(res, self._temporally_smoothed, dim="lead") res["lead"].attrs = self.get_initialized().lead.attrs return res
[docs] def bootstrap( self, metric=None, comparison=None, dim=None, alignment=None, reference=None, iterations=None, sig=95, resample_dim="member", pers_sig=None, **metric_kwargs, ): """Bootstrap with replacement according to Goddard et al. 2013. Args: metric (str, :py:class:`~climpred.metrics.Metric`): Metric to apply for verification, see `metrics <metrics.html>`_. comparison (str, :py:class:`~climpred.comparisons.Comparison`): How to compare to the observations/verification data, see `comparisons </comparisons.html>`_. dim (str, list of str): dimension(s) to apply metric over. ``dim`` is passed on to xskillscore.{metric} and includes xskillscore's ``member_dim``. ``dim`` should contain ``member`` when ``comparison`` is probabilistic but should not contain ``member`` when ``comparison='e2o'``. Defaults to ``None`` meaning that all dimensions other than ``lead`` are reduced. reference (str, list of str): Type of reference forecasts with which to verify. One or more of ['uninitialized', 'persistence', 'climatology']. If None or empty, returns no p value. alignment (str): which inits or verification times should be aligned? - 'maximize': maximize the degrees of freedom by slicing ``init`` and ``verif`` to a common time frame at each lead. - 'same_inits': slice to a common init frame prior to computing metric. This philosophy follows the thought that each lead should be based on the same set of initializations. - 'same_verif': slice to a common/consistent verification time frame prior to computing metric. This philosophy follows the thought that each lead should be based on the same set of verification dates. iterations (int): Number of resampling iterations for bootstrapping with replacement. Recommended >= 500. sig (int, default 95): Significance level in percent for deciding whether uninitialized and persistence beat initialized skill. resample_dim (str or list): dimension to resample from. default: 'member'. - 'member': select a different set of members from hind - 'init': select a different set of initializations from hind pers_sig (int, default None): If not None, the separate significance level for persistence. **metric_kwargs (optional): arguments passed to ``metric``. Returns: xr.Datasets: with dimensions ``results`` (holding ``skill``, ``p``, ``low_ci`` and ``high_ci``) and ``skill`` (holding ``initialized``, ``persistence`` and/or ``uninitialized``): * results='verify skill', skill='initialized': mean initialized skill * results='high_ci', skill='initialized': high confidence interval boundary for initialized skill * results='p', skill='uninitialized': p value of the hypothesis that the difference of skill between the initialized and uninitialized simulations is smaller or equal to zero based on bootstrapping with replacement. * results='p', skill='persistence': p value of the hypothesis that the difference of skill between the initialized and persistence simulations is smaller or equal to zero based on bootstrapping with replacement. Example: Calculate the Pearson's Anomaly Correlation ('acc') comparing the ensemble mean forecast to the verification (``e2o``) over the same verification times (``same_verifs``) for all leads reducing dimensions ``init`` 50 times after resampling ``member`` dimension with replacement. Also calculate reference skill for the ``persistence``, ``climatology`` and ``uninitialized`` forecast and compare whether initialized skill is better than reference skill: Returns verify skill, probability that reference forecast performs better than initialized and the lower and upper bound of the resample. >>> HindcastEnsemble.bootstrap(metric='acc', comparison='e2o', ... dim='init', iterations=50, resample_dim='member', ... alignment='same_verifs', ... reference=['persistence', 'climatology' ,'uninitialized']) <xarray.Dataset> Dimensions: (skill: 4, results: 4, lead: 10) Coordinates: * lead (lead) int32 1 2 3 4 5 6 7 8 9 10 * results (results) <U12 'verify skill' 'p' 'low_ci' 'high_ci' * skill (skill) <U13 'initialized' 'persistence' ... 'uninitialized' Data variables: SST (skill, results, lead) float64 0.9313 0.9119 ... 0.8078 0.8078 Attributes: prediction_skill: calculated by climpred https://climpred.read... number_of_initializations: 61 number_of_members: 10 alignment: same_verifs metric: pearson_r comparison: e2o dim: ['init'] units: None confidence_interval_levels: 0.975-0.025 bootstrap_iterations: 50 p: probability that reference performs better t... """ if iterations is None: raise ValueError("Designate number of bootstrapping `iterations`.") # TODO: replace with more computationally efficient classes implementation reference = _check_valid_reference(reference) if "uninitialized" in reference and not isinstance( self.get_uninitialized(), xr.Dataset ): raise ValueError("reference uninitialized requires uninitialized.") return bootstrap_hindcast( self.get_initialized(), self.get_uninitialized() if isinstance(self.get_uninitialized(), xr.Dataset) else None, self.get_observations(), metric=metric, comparison=comparison, dim=dim, alignment=alignment, reference=reference, resample_dim=resample_dim, sig=sig, iterations=iterations, pers_sig=pers_sig, **metric_kwargs, )
[docs] def remove_bias( self, alignment=None, how="additive_mean", train_test_split="unfair", train_init=None, train_time=None, cv=False, **metric_kwargs, ): """Calculate and remove bias from :py:class:`~climpred.classes.HindcastEnsemble`. Bias is grouped by ``seasonality`` set via :py:class:`~climpred.options.set_options`. When wrapping xclim.sbda.adjustment use ``group`` instead. Args: alignment (str): which inits or verification times should be aligned? - 'maximize': maximize the degrees of freedom by slicing ``hind`` and ``verif`` to a common time frame at each lead. - 'same_inits': slice to a common init frame prior to computing metric. This philosophy follows the thought that each lead should be based on the same set of initializations. - 'same_verif': slice to a common/consistent verification time frame prior to computing metric. This philosophy follows the thought that each lead should be based on the same set of verification dates. how (str): what kind of bias removal to perform. Defaults to 'additive_mean'. Select from: - 'additive_mean': correcting the mean forecast additively - 'multiplicative_mean': correcting the mean forecast multiplicatively - 'multiplicative_std': correcting the standard deviation multiplicatively - 'modified_quantile': `Reference <https://www.sciencedirect.com/science/article/abs/pii/S0034425716302000?via%3Dihub>`_ - 'basic_quantile': `Reference <https://rmets.onlinelibrary.wiley.com/doi/pdf/10.1002/joc.2168>`_ - 'gamma_mapping': `Reference <https://www.hydrol-earth-syst-sci.net/21/2649/2017/>`_ - 'normal_mapping': `Reference <https://www.hydrol-earth-syst-sci.net/21/2649/2017/>`_ - 'EmpiricalQuantileMapping': `Reference <https://xclim.readthedocs.io/en/stable/sdba_api.html#xclim.sdba.adjustment.EmpiricalQuantileMapping>`_ - 'DetrendedQuantileMapping': `Reference <https://xclim.readthedocs.io/en/stable/sdba_api.html#xclim.sdba.adjustment.DetrendedQuantileMapping>`_ - 'PrincipalComponents': `Reference <https://xclim.readthedocs.io/en/stable/sdba_api.html#xclim.sdba.adjustment.PrincipalComponents>`_ - 'QuantileDeltaMapping': `Reference <https://xclim.readthedocs.io/en/stable/sdba_api.html#xclim.sdba.adjustment.QuantileDeltaMapping>`_ - 'Scaling': `Reference <https://xclim.readthedocs.io/en/stable/sdba_api.html#xclim.sdba.adjustment.Scaling>`_ - 'LOCI': `Reference <https://xclim.readthedocs.io/en/stable/sdba_api.html#xclim.sdba.adjustment.LOCI>`_ train_test_split (str): How to separate train period to calculate the bias and test period to apply bias correction to? For a detailed description, see `Risbey et al. 2021 <http://www.nature.com/articles/s41467-021-23771-z>`_: - `fair`: no overlap between `train` and `test` (recommended). Set either `train_init` or `train_time`. - `unfair`: completely overlapping `train` and `test` (climpred default). - `unfair-cv`: overlapping `train` and `test` except for current `init`, which is `left out <https://en.wikipedia.org/wiki/Cross-validation_(statistics)#Leave-one-out_cross-validation>`_ (set `cv='LOO'`). train_init (xr.DataArray, slice): Define initializations for training when ``alignment='same_inits/maximize'``. train_time (xr.DataArray, slice): Define time for training when ``alignment='same_verif'``. cv (bool or str): Only relevant when `train_test_split='unfair-cv'`. Defaults to False. - True/'LOO': Calculate bias by `leaving given initialization out <https://en.wikipedia.org/wiki/Cross-validation_(statistics)#Leave-one-out_cross-validation>`_ - False: include all initializations in the calculation of bias, which is much faster and but yields similar skill with a large N of initializations. **metric_kwargs (dict): passed to ``xclim.sdba`` (including ``group``) or ``XBias_Correction`` Returns: HindcastEnsemble: bias removed HindcastEnsemble. Example: Skill from raw model output without bias reduction: >>> HindcastEnsemble.verify(metric='rmse', comparison='e2o', ... alignment='maximize', dim='init') <xarray.Dataset> Dimensions: (lead: 10) Coordinates: * lead (lead) int32 1 2 3 4 5 6 7 8 9 10 skill <U11 'initialized' Data variables: SST (lead) float64 0.08359 0.08141 0.08362 ... 0.1361 0.1552 0.1664 Note that this HindcastEnsemble is already bias reduced, therefore ``train_test_split='unfair'`` has hardly any effect. Use all initializations to calculate bias and verify skill: >>> HindcastEnsemble.remove_bias(alignment='maximize', ... how='additive_mean', test_train_split='unfair' ... ).verify(metric='rmse', comparison='e2o', alignment='maximize', ... dim='init') <xarray.Dataset> Dimensions: (lead: 10) Coordinates: * lead (lead) int32 1 2 3 4 5 6 7 8 9 10 skill <U11 'initialized' Data variables: SST (lead) float64 0.08349 0.08039 0.07522 ... 0.07305 0.08107 0.08255 Separate initializations 1954 - 1980 to calculate bias. Note that this HindcastEnsemble is already bias reduced, therefore ``train_test_split='fair'`` worsens skill here. Generally, ``train_test_split='fair'`` is recommended to use for a fair comparison against real-time forecasts. >>> HindcastEnsemble.remove_bias(alignment='maximize', ... how='additive_mean', train_test_split='fair', ... train_init=slice('1954', '1980')).verify(metric='rmse', ... comparison='e2o', alignment='maximize', dim='init') <xarray.Dataset> Dimensions: (lead: 10) Coordinates: * lead (lead) int32 1 2 3 4 5 6 7 8 9 10 skill <U11 'initialized' Data variables: SST (lead) float64 0.132 0.1085 0.08722 ... 0.08209 0.08969 0.08732 Wrapping methods ``how`` from `xclim <https://xclim.readthedocs.io/en/stable/sdba_api.html>`_ and providing ``group`` for ``groupby``: >>> HindcastEnsemble.remove_bias(alignment='same_init', group='init', ... how='DetrendedQuantileMapping', train_test_split='unfair', ... ).verify(metric='rmse', ... comparison='e2o', alignment='maximize', dim='init') <xarray.Dataset> Dimensions: (lead: 10) Coordinates: * lead (lead) int32 1 2 3 4 5 6 7 8 9 10 skill <U11 'initialized' Data variables: SST (lead) float64 0.09823 0.09747 0.08235 ... 0.07742 0.08115 0.08326 Wrapping methods ``how`` from `bias_correction <https://github.com/pankajkarman/bias_correction/blob/master/bias_correction.py>`_: >>> HindcastEnsemble.remove_bias(alignment='same_init', ... how='modified_quantile', train_test_split='unfair', ... ).verify(metric='rmse', ... comparison='e2o', alignment='maximize', dim='init') <xarray.Dataset> Dimensions: (lead: 10) Coordinates: * lead (lead) int32 1 2 3 4 5 6 7 8 9 10 skill <U11 'initialized' Data variables: SST (lead) float64 0.07628 0.08293 0.08169 ... 0.1577 0.1821 0.2087 """ if train_test_split not in BIAS_CORRECTION_TRAIN_TEST_SPLIT_METHODS: raise NotImplementedError( f"train_test_split='{train_test_split}' not implemented. Please choose `train_test_split` from {BIAS_CORRECTION_TRAIN_TEST_SPLIT_METHODS}, see Risbey et al. 2021 http://www.nature.com/articles/s41467-021-23771-z for description and https://github.com/pangeo-data/climpred/issues/648 for implementation status." ) alignment = _check_valud_alignment(alignment) if train_test_split in ["fair"]: if ( (train_init is None) or not isinstance(train_init, (slice, xr.DataArray)) ) and (alignment in ["same_inits", "maximize"]): raise ValueError( f'When alignment="{alignment}", please provide `train_init` as xr.DataArray, e.g. `hindcast.coords["init"].slice(start, end)` or slice, e.g. `slice(start, end)`, got `train_init={train_init}`.' ) if ( (train_time is None) or not isinstance(train_time, (slice, xr.DataArray)) ) and (alignment in ["same_verif"]): raise ValueError( f'When alignment="{alignment}", please provide `train_time` as xr.DataArray, e.g. `hindcast.coords["time"].slice(start, end)` or slice, e.g. `slice(start, end)`, got `train_time={train_time}`' ) if isinstance(train_init, slice): train_init = self.coords["init"].sel(init=train_init) if isinstance(train_time, slice): train_time = self.coords["time"].sel(time=train_time) if how == "mean": how = "additive_mean" # backwards compatibility if how in ["additive_mean", "multiplicative_mean", "multiplicative_std"]: func = gaussian_bias_removal elif how in BIAS_CORRECTION_BIAS_CORRECTION_METHODS: func = bias_correction elif how in XCLIM_BIAS_CORRECTION_METHODS: func = xclim_sdba else: raise NotImplementedError( f"bias removal '{how}' is not implemented, please choose from {INTERNAL_BIAS_CORRECTION_METHODS+BIAS_CORRECTION_BIAS_CORRECTION_METHODS}." ) if train_test_split in ["unfair-cv"]: if cv not in [True, "LOO"]: raise ValueError( f"Please provide `cv='LOO'` when train_test_split='unfair-cv', found `cv='{cv}'`" ) else: cv = "LOO" # backward compatibility if cv not in CROSS_VALIDATE_METHODS: raise NotImplementedError( f"cross validation method {cv} not implemented. Please choose cv from {CROSS_VALIDATE_METHODS}." ) metric_kwargs["cv"] = cv self = func( self, alignment=alignment, how=how, train_test_split=train_test_split, train_init=train_init, train_time=train_time, **metric_kwargs, ) return self