"""Main module instantiating ``PerfectModelEnsemble`` and ``HindcastEnsemble."""
import logging
import warnings
from copy import deepcopy
from typing import (
TYPE_CHECKING,
Any,
Callable,
Dict,
Hashable,
Iterator,
List,
Mapping,
Optional,
Tuple,
Union,
)
import cf_xarray # noqa
import numpy as np
import xarray as xr
from xarray.core.coordinates import DatasetCoordinates
from xarray.core.dataset import DataVariables
from xarray.core.formatting_html import dataset_repr
from xarray.core.options import OPTIONS as XR_OPTIONS
from xarray.core.utils import Frozen
from .alignment import return_inits_and_verif_dates
from .bias_removal import bias_correction, gaussian_bias_removal, xclim_sdba
from .bootstrap import (
_distribution_to_ci,
_p_ci_from_sig,
_pvalue_from_distributions,
bootstrap_uninit_pm_ensemble_from_control_cftime,
resample_skill_exclude_resample_dim_from_dim,
resample_skill_loop,
resample_skill_resample_before,
resample_uninitialized_from_initialized,
warn_if_chunking_would_increase_performance,
)
from .checks import (
_check_valid_alignment,
_check_valid_reference,
attach_long_names,
attach_standard_names,
has_dataset,
has_dims,
has_valid_lead_units,
match_calendars,
match_initialized_dims,
match_initialized_vars,
rename_to_climpred_dims,
)
from .comparisons import Comparison
from .constants import (
BIAS_CORRECTION_BIAS_CORRECTION_METHODS,
BIAS_CORRECTION_TRAIN_TEST_SPLIT_METHODS,
CLIMPRED_DIMS,
CONCAT_KWARGS,
CROSS_VALIDATE_METHODS,
INTERNAL_BIAS_CORRECTION_METHODS,
XCLIM_BIAS_CORRECTION_METHODS,
)
from .exceptions import CoordinateError, DimensionError, KeywordError, VariableError
from .metrics import PEARSON_R_CONTAINING_METRICS, Metric
from .options import OPTIONS, set_options
from .prediction import (
_apply_metric_at_given_lead,
_get_metric_comparison_dim,
_sanitize_to_list,
compute_perfect_model,
)
from .reference import (
compute_climatology,
compute_persistence,
compute_persistence_from_first_lead,
)
from .smoothing import (
_reset_temporal_axis,
smooth_goddard_2013,
spatial_smoothing_xesmf,
temporal_smoothing,
)
from .utils import (
add_time_from_init_lead,
assign_attrs,
broadcast_metric_kwargs_for_rps,
convert_time_index,
convert_Timedelta_to_lead_units,
)
metricType = Union[str, Metric]
comparisonType = Union[str, Comparison]
dimType = Optional[Union[str, List[str]]]
alignmentType = str
referenceType = Union[List[str], str]
groupbyType = Optional[Union[str, xr.DataArray]]
metric_kwargsType = Optional[Any]
if TYPE_CHECKING:
import matplotlib.pyplot as plt
optionalaxisType = Optional[plt.Axes]
else:
optionalaxisType = Optional[Any]
def _display_metadata(self) -> str:
"""
    Return the contents of the :py:class:`.PredictionEnsemble` as text.
Example:
>>> init = climpred.tutorial.load_dataset("CESM-DP-SST")
>>> hindcast = climpred.HindcastEnsemble(init)
>>> print(hindcast)
<climpred.HindcastEnsemble>
Initialized:
SST (init, lead, member) float64 ...
Uninitialized:
None
Observations:
None
"""
SPACE = " "
summary = f"<climpred.{type(self).__name__}>\n"
for k in self._datasets.keys():
if self._datasets[k]:
summary += (
str(self._datasets[k].data_vars).replace(
"Data variables", k.capitalize()
)
+ "\n"
)
else:
summary += f"{k.capitalize()}:\n{SPACE}None\n"
return summary.strip("\n")
def _display_metadata_html(self):
"""Show contents of :py:class:`.PredictionEnsemble` as html."""
html_str = f"<h4>climpred.{type(self).__name__}</h4>"
for k in self._datasets.keys():
if self._datasets[k]:
html_str += dataset_repr(self._datasets[k]).replace(
"xarray.Dataset", k.capitalize()
)
return html_str
class PredictionEnsemble:
"""
The main object :py:class:`.PredictionEnsemble`.
This is the super of both :py:class:`.PerfectModelEnsemble` and
:py:class:`.HindcastEnsemble`. This cannot be called directly by
a user, but should house functions that both ensemble types can use.
Associated :py:class:`xarray.Dataset` are stored in:
* ``PredictionEnsemble._datasets["initialized"]``
* ``PredictionEnsemble._datasets["uninitialized"]``
* ``PredictionEnsemble._datasets["control"]`` in
:py:class:`.PerfectModelEnsemble`
    * ``PredictionEnsemble._datasets["observations"]`` in
:py:class:`.HindcastEnsemble`
"""
    def __init__(self, initialized: Union[xr.DataArray, xr.Dataset]):
"""Create a :py:class:`.PredictionEnsemble` object."""
if isinstance(initialized, xr.DataArray):
# makes applying prediction functions easier, etc.
initialized = initialized.to_dataset()
assert isinstance(
initialized, xr.Dataset
), "PredictionEnsemble.__init__ requires xr.DataArray or xr.Dataset"
initialized = rename_to_climpred_dims(initialized)
has_dims(initialized, ["init", "lead"], "PredictionEnsemble")
# Check that init is int, cftime, or datetime; convert ints or cftime to
# datetime.
initialized = convert_time_index(initialized, "init", "initialized[init]")
# Put this after `convert_time_index` since it assigns 'years' attribute if the
# `init` dimension is a `float` or `int`.
initialized = convert_Timedelta_to_lead_units(initialized)
has_valid_lead_units(initialized)
initialized = add_time_from_init_lead(initialized)
# add metadata
initialized = attach_standard_names(initialized)
initialized = attach_long_names(initialized)
initialized = initialized.cf.add_canonical_attributes(
verbose=False, override=True, skip="units"
)
del initialized.attrs["history"] # better only delete xclim message or not?
# Add initialized dictionary and reserve sub-dictionary for an uninitialized
# run.
self._datasets = {"initialized": initialized, "uninitialized": {}}
self.kind = "prediction"
self._temporally_smoothed: Optional[Dict[str, int]] = None
self._is_annual_lead = None
self._warn_if_chunked_along_init_member_time()
def _groupby(self, call: str, groupby: Union[str, xr.DataArray], **kwargs: Any):
"""Help for verify/bootstrap(groupby="month")."""
skill_group, group_label = [], []
groupby_str = f"init.{groupby}" if isinstance(groupby, str) else groupby
with set_options(warn_for_failed_PredictionEnsemble_xr_call=False):
for group, hind_group in self.get_initialized().init.groupby(groupby_str):
skill_group.append(
getattr(self.sel(init=hind_group), call)(
**kwargs,
)
)
group_label.append(group)
new_dim_name = groupby if isinstance(groupby, str) else groupby_str.name
skill_group = xr.concat(
skill_group, dim=new_dim_name, **CONCAT_KWARGS
).assign_coords({new_dim_name: group_label})
skill_group[new_dim_name] = skill_group[new_dim_name].assign_attrs( # type: ignore # noqa: E501
{
"description": "new dimension showing skill grouped by init.{groupby}"
" created by .verify(groupby) or .bootstrap(groupby)"
}
)
return skill_group
@property
def coords(self) -> DatasetCoordinates:
"""Return coordinates of :py:class:`.PredictionEnsemble`.
Dictionary of :py:class:`xarray.DataArray` objects corresponding to coordinate
variables available in all PredictionEnsemble._datasets.
See also:
:py:meth:`~xarray.Dataset.coords`
"""
pe_coords = self.get_initialized().coords.to_dataset()
for ds in self._datasets.values():
if isinstance(ds, xr.Dataset):
pe_coords.update(ds.coords.to_dataset())
return pe_coords.coords
@property
def nbytes(self) -> int:
"""Bytes sizes of all PredictionEnsemble._datasets.
See also:
:py:meth:`~xarray.Dataset.nbytes`
"""
return sum(
[
sum(v.nbytes for v in ds.variables.values())
for ds in self._datasets.values()
if isinstance(ds, xr.Dataset)
]
)
@property
def sizes(self) -> Mapping[Hashable, int]:
"""
Return sizes of :py:class:`.PredictionEnsemble`.
Mapping from dimension names to lengths for all PredictionEnsemble._datasets.
See also:
            :py:meth:`~xarray.Dataset.sizes`
"""
pe_dims = dict(self.get_initialized().dims)
for ds in self._datasets.values():
if isinstance(ds, xr.Dataset):
pe_dims.update(dict(ds.dims))
return pe_dims
@property
def dims(self) -> Mapping[Hashable, int]:
"""
Return dimension of :py:class:`.PredictionEnsemble`.
        Mapping from dimension names to lengths for all PredictionEnsemble._datasets.
See also:
:py:meth:`~xarray.Dataset.dims`
"""
return Frozen(self.sizes)
@property
def chunks(self) -> Mapping[Hashable, Tuple[int, ...]]:
"""
Return chunks of :py:class:`.PredictionEnsemble`.
        Mapping from dimension names to chunks for all PredictionEnsemble._datasets.
See also:
:py:meth:`~xarray.Dataset.chunks`
"""
pe_chunks = dict(self.get_initialized().chunks)
for ds in self._datasets.values():
if isinstance(ds, xr.Dataset):
for d in ds.chunks:
if d not in pe_chunks:
pe_chunks.update({d: ds.chunks[d]})
return Frozen(pe_chunks)
@property
def chunksizes(self) -> Mapping[Hashable, Tuple[int, ...]]:
"""Return chunksizes of :py:class:`.PredictionEnsemble`.
Mapping from dimension names to block lengths for this dataset's data, or
None if the underlying data is not a dask array.
Cannot be modified directly, but can be modified by calling ``.chunk()``.
Same as :py:meth:`~xarray.Dataset.chunks`.
See also:
:py:meth:`~xarray.Dataset.chunksizes`
"""
return self.chunks
@property
def data_vars(self) -> DataVariables:
"""
Return data variables of :py:class:`.PredictionEnsemble`.
Dictionary of DataArray objects corresponding to data variables available in
all PredictionEnsemble._datasets.
See also:
:py:meth:`~xarray.Dataset.data_vars`
"""
varset = set(self.get_initialized().data_vars)
for ds in self._datasets.values():
if isinstance(ds, xr.Dataset):
                # take intersection: keep only variables present in every dataset
varset = varset & set(ds.data_vars)
varlist = list(varset)
return self.get_initialized()[varlist].data_vars
def _repr_html_(self):
"""Return for :py:class:`.PredictionEnsemble` in html."""
from html import escape
if XR_OPTIONS["display_style"] == "text":
return f"<pre>{escape(repr(self))}</pre>"
return _display_metadata_html(self)
def __repr__(self) -> str:
"""Return for print(:py:class:`.PredictionEnsemble`)."""
return _display_metadata(self)
    def __len__(self) -> int:
        """Return number of all variables in :py:class:`.PredictionEnsemble`."""
return len(self.data_vars)
    def __iter__(self) -> Iterator[Hashable]:
        """Iterate over the underlying :py:class:`xarray.Dataset` objects."""
return iter(self._datasets.values())
    def __delitem__(self, key: Hashable) -> None:
"""Remove a variable from :py:class:`.PredictionEnsemble`."""
del self._datasets["initialized"][key]
for ds in self._datasets.values():
if isinstance(ds, xr.Dataset):
if key in ds.data_vars:
del ds[key]
    def __contains__(self, key: Hashable) -> bool:
"""Check variable in :py:class:`.PredictionEnsemble`.
The ``"in"`` operator will return true or false depending on whether
``"key"`` is in any PredictionEnsemble._datasets.
"""
contained = True
for ds in self._datasets.values():
if isinstance(ds, xr.Dataset):
if key not in ds.data_vars:
contained = False
return contained
    def equals(self, other: Union["PredictionEnsemble", Any]) -> bool:
"""Check if :py:class:`.PredictionEnsemble` is equal to other.
Two :py:class:`.PredictionEnsemble` are equal if they have
matching variables and coordinates, all of which are equal.
``PredictionEnsembles`` can still be equal (like pandas objects) if they have
NaN values in the same locations.
This method is necessary because `v1 == v2` for ``PredictionEnsembles``
does element-wise comparisons (like numpy.ndarrays).
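
        Example:
            A sketch, assuming the doctest fixture ``HindcastEnsemble``:

            >>> HindcastEnsemble.equals(HindcastEnsemble)
            True
            >>> HindcastEnsemble.equals(HindcastEnsemble * 2)
            False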
See also:
:py:meth:`~xarray.Dataset.equals`
"""
if not isinstance(other, PredictionEnsemble):
return False
if other.kind != self.kind:
return False
equal = True
try:
for ds_name in self._datasets.keys():
if isinstance(self._datasets[ds_name], xr.Dataset):
if not self._datasets[ds_name].equals(other._datasets[ds_name]):
equal = False
except Exception:
return False
return equal
    def identical(self, other: Union["PredictionEnsemble", Any]) -> bool:
"""
Check if :py:class:`.PredictionEnsemble` is identical to other.
Like ``equals``, but also checks all dataset attributes and the
attributes on all variables and coordinates.
See also:
:py:meth:`~xarray.Dataset.identical`
"""
if not isinstance(other, PredictionEnsemble):
return False
if other.kind != self.kind:
return False
        # avoid shadowing the builtin ``id``; skip empty placeholders like
        # ``{}`` so ensembles without e.g. ``uninitialized`` can still match
        is_identical = True
        try:
            for ds_name in self._datasets.keys():
                if isinstance(self._datasets[ds_name], xr.Dataset):
                    if not self._datasets[ds_name].identical(
                        other._datasets[ds_name]
                    ):
                        is_identical = False
        except Exception:
            return False
        return is_identical
    def plot(
self,
variable: Optional[str] = None,
ax: optionalaxisType = None,
show_members: bool = False,
cmap: Optional[str] = None,
x: str = "time",
) -> "plt.Axes":
"""Plot datasets from :py:class:`.PredictionEnsemble`.
Wraps :py:func:`.climpred.graphics.plot_ensemble_perfect_model` or
:py:func:`.climpred.graphics.plot_lead_timeseries_hindcast`.
Args:
variable: `variable` to show. Defaults to first in data_vars.
ax: Axis to use in plotting. By default, creates a new axis.
show_members: whether to display all members individually.
Defaults to False.
cmap: Name of matplotlib-recognized colorbar. Defaults to ``viridis``
for :py:class:`.HindcastEnsemble`
and ``tab10`` for :py:class:`.PerfectModelEnsemble`.
            x: Name of the x-axis. Use ``time`` to show observations and
                hindcasts in real time. Use ``init`` to see hindcasts as
                initializations. For ``x="init"`` only the initialized
                ensemble is shown, and this works only for
                :py:class:`.HindcastEnsemble`.
        .. note::
            Alternatively inspect initialized datasets with
            ``PredictionEnsemble.get_initialized()[v].plot.line(x="valid_time")``
            to see ``valid_time`` on the x-axis, or
            ``PredictionEnsemble.get_initialized()[v].plot.line(x="init")``
            to see ``init`` on the x-axis.
Returns:
ax: plt.axes
"""
from .graphics import plot_ensemble_perfect_model, plot_lead_timeseries_hindcast
if x == "time":
x = "valid_time"
assert x in ["valid_time", "init"]
if isinstance(self, HindcastEnsemble):
if cmap is None:
cmap = "viridis"
return plot_lead_timeseries_hindcast(
self,
variable=variable,
ax=ax,
show_members=show_members,
cmap=cmap,
x=x,
)
elif isinstance(self, PerfectModelEnsemble):
if cmap is None:
cmap = "tab10"
return plot_ensemble_perfect_model(
self, variable=variable, ax=ax, show_members=show_members, cmap=cmap
)
mathType = Union[int, float, np.ndarray, xr.DataArray, xr.Dataset]
def _math(
self,
other: mathType,
operator: str,
):
"""Help function for __add__, __sub__, __mul__, __truediv__.
Allows math operations with type:
- int
- float
- np.ndarray
- xr.DataArray without new dimensions
- xr.Dataset without new dimensions or variables
"""
assert isinstance(operator, str)
def add(a, b):
return a + b
def sub(a, b):
return a - b
def mul(a, b):
return a * b
def div(a, b):
return a / b
ALLOWED_TYPES_FOR_MATH_OPERATORS = [
int,
float,
np.ndarray,
xr.DataArray,
xr.Dataset,
type(self),
]
OPERATOR_STR = {
"add": "+",
"sub": "-",
"mul": "*",
"div": "/",
}
error_str = f"Cannot use {type(self)} {OPERATOR_STR[operator]} {type(other)}"
# catch undefined types for other
if not isinstance(other, tuple(ALLOWED_TYPES_FOR_MATH_OPERATORS)):
raise TypeError(
f"{error_str} because type {type(other)} not supported. "
f"Please choose from {ALLOWED_TYPES_FOR_MATH_OPERATORS}."
)
# catch other dimensions in other
        if isinstance(other, (xr.Dataset, xr.DataArray)):
if not set(other.dims).issubset(self._datasets["initialized"].dims): # type: ignore # noqa: E501
raise DimensionError(f"{error_str} containing new dimensions.")
# catch xr.Dataset with different data_vars
if isinstance(other, xr.Dataset):
if list(other.data_vars) != list(self._datasets["initialized"].data_vars):
raise VariableError(
f"{error_str} with new `data_vars`. Please use {type(self)} "
f"{operator} {type(other)} only with same `data_vars`. Found "
f"initialized.data_vars = "
f' {list(self._datasets["initialized"].data_vars)} vs. '
f"other.data_vars = {list(other.data_vars)}."
)
_operator = eval(operator)
if isinstance(other, PredictionEnsemble):
# Create temporary copy to modify to avoid inplace operation.
datasets = self._datasets.copy()
for dataset in datasets:
# Some pre-allocated entries might be empty, such as 'uninitialized'
if isinstance(other._datasets[dataset], xr.Dataset) and isinstance(
self._datasets[dataset], xr.Dataset
):
datasets[dataset] = _operator(
datasets[dataset], other._datasets[dataset]
)
return self._construct_direct(datasets, kind=self.kind)
else:
return self._apply_func(_operator, other)
    def __add__(self, other: mathType) -> "PredictionEnsemble":
"""Add."""
return self._math(other, operator="add")
    def __sub__(self, other: mathType) -> "PredictionEnsemble":
"""Sub."""
return self._math(other, operator="sub")
    def __mul__(self, other: mathType) -> "PredictionEnsemble":
"""Mul."""
return self._math(other, operator="mul")
    def __truediv__(self, other: mathType) -> "PredictionEnsemble":
"""Div."""
return self._math(other, operator="div")
    def __getitem__(self, varlist: Union[str, List[str]]) -> "PredictionEnsemble":
        """Allow subsetting variable(s) from
:py:class:`.PredictionEnsemble` as from
:py:class:`xarray.Dataset`.
Args:
varlist: list of names or name of data variable(s) to subselect
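
        Example:
            A sketch, assuming the doctest fixture ``HindcastEnsemble`` with
            data variable ``SST`` (output omitted):

            >>> HindcastEnsemble["SST"]  # doctest: +SKIP
            >>> HindcastEnsemble[["SST"]]  # doctest: +SKIP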
"""
if isinstance(varlist, str):
varlist = [varlist]
if not isinstance(varlist, list):
raise ValueError(
"Please subset PredictionEnsemble as you would subset an xr.Dataset "
"with a list or single string of variable name(s), found "
f"{type(varlist)}."
)
def sel_vars(ds, varlist):
return ds[varlist]
return self._apply_func(sel_vars, varlist)
    def __getattr__(self, name):  # -> Callable[[VarArg(Any), KwArg(Any)], Any]
"""Allow for ``xarray`` methods to be applied to our prediction objects.
Args:
* name: str of xarray function, e.g., ``.isel()`` or ``.sum()``.
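
        Example:
            Any :py:class:`xarray.Dataset` method is forwarded to every
            dataset it applies to, e.g. (output omitted):

            >>> HindcastEnsemble.isel(lead=0)  # doctest: +SKIP
            >>> HindcastEnsemble.mean("member")  # doctest: +SKIP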
"""
if name in [
"_ipython_canary_method_should_not_exist_",
"_ipython_display_",
"_repr_markdown_",
"_repr_svg_",
"_repr_png_",
"_repr_pdf_",
"_repr_jpeg_",
"_repr_latex_",
"_repr_json_",
"_repr_mimebundle_",
"_repr_javascript_",
]:
            return None  # type: ignore
def wrapper(*args, **kwargs):
"""Apply arbitrary function to all datasets in ``PerfectModelEnsemble``.
Got this from: https://stackoverflow.com/questions/41919499/
how-to-call-undefined-methods-sequentially-in-python-class
"""
def _apply_xr_func(v, name, *args, **kwargs):
"""Handle exceptions in our dictionary comprehension.
In other words, this will skip applying the arbitrary function
to a sub-dataset if a ValueError is thrown. This specifically
targets cases where certain datasets don't have the given
dim that's being called. E.g., ``.isel(lead=0)`` should only
be applied to the initialized dataset.
References:
* https://stackoverflow.com/questions/1528237/
how-to-handle-exceptions-in-a-list-comprehensions
"""
try:
return getattr(v, name)(*args, **kwargs)
# ValueError : Cases such as .sum(dim='time'). This doesn't apply
# it to the given dataset if the dimension doesn't exist.
# KeyError : Cases where a function calls the index of a Dataset. Such
# as ds[dim] and the dim doesn't exist as a key.
# DimensionError: This accounts for our custom error when applying
# some stats functions.
except (ValueError, KeyError, DimensionError) as e:
if args == tuple():
func_name = False
else:
if callable(args[0]):
func_name = args[0].__name__
else: # for xarray calls like pe.mean()
func_name = False
dim = kwargs.get("dim", False)
error_type = type(e).__name__
if func_name:
if len(args) > 1:
msg = f"{func_name}({args[1:]}, {kwargs}) failed\n{error_type}: {e}" # noqa: E501
else:
msg = f"{func_name}({kwargs}) failed\n{error_type}: {e}"
else:
msg = f"xr.{name}({args}, {kwargs}) failed\n{error_type}: {e}"
if set(["lead", "init"]).issubset(set(v.dims)): # initialized
if dim not in v.dims:
if OPTIONS["warn_for_failed_PredictionEnsemble_xr_call"]:
warnings.warn(f"Error due to initialized: {msg}")
elif set(["time"]).issubset(
set(v.dims)
): # uninitialized, control, verification
if dim not in v.dims:
if OPTIONS["warn_for_failed_PredictionEnsemble_xr_call"]:
warnings.warn(
f"Error due to verification/control/uninitialized: {msg}" # noqa: E501
)
else:
if OPTIONS["warn_for_failed_PredictionEnsemble_xr_call"]:
warnings.warn(msg)
return v
return self._apply_func(_apply_xr_func, name, *args, **kwargs)
return wrapper
@classmethod
def _construct_direct(cls, datasets, kind):
"""Shortcut around __init__ for internal use to avoid inplace operations.
        Pulled from the xarray Dataset class.
https://github.com/pydata/xarray/blob/master/xarray/core/dataset.py
"""
obj = object.__new__(cls)
obj._datasets = datasets
obj.kind = kind
obj._warn_if_chunked_along_init_member_time()
return obj
def _apply_func(
self, func: Callable[..., xr.Dataset], *args: Any, **kwargs: Any
) -> "PredictionEnsemble":
"""Apply a function to all datasets in a ``PerfectModelEnsemble``."""
# Create temporary copy to modify to avoid inplace operation.
        # isn't that essentially the same as .map(func)?
datasets = self._datasets.copy()
# More explicit than nested dictionary comprehension.
for key, ds in datasets.items():
# If ds is xr.Dataset, apply the function directly to it
# else, e.g. for {} ignore
if isinstance(ds, xr.Dataset):
dim = kwargs.get("dim", "")
if "_or_" in dim:
dims = dim.split("_or_")
if set(dims).issubset(ds.dims):
raise ValueError(
f"{dims} cannot be both in {key} dataset, found {ds.dims}"
)
kwargs_dim0 = kwargs.copy()
kwargs_dim0["dim"] = dims[0]
kwargs_dim1 = kwargs.copy()
kwargs_dim1["dim"] = dims[1]
if dims[0] in ds.dims and dims[1] not in ds.dims:
datasets.update({key: func(ds, *args, **kwargs_dim0)})
if dims[1] in ds.dims and dims[0] not in ds.dims:
datasets.update({key: func(ds, *args, **kwargs_dim1)})
else:
datasets.update({key: func(ds, *args, **kwargs)})
# Instantiates new object with the modified datasets.
return self._construct_direct(datasets, kind=self.kind)
def get_initialized(self) -> xr.Dataset:
"""Return the :py:class:`xarray.Dataset` for the initialized ensemble."""
return self._datasets["initialized"]
def get_uninitialized(self) -> xr.Dataset:
"""Return the :py:class:`xarray.Dataset` for the uninitialized ensemble."""
return self._datasets["uninitialized"]
def smooth(
self,
smooth_kws: Optional[Union[str, Dict[str, int]]] = None,
how: str = "mean",
**xesmf_kwargs: str,
):
"""Smooth in space and/or aggregate in time in ``PredictionEnsemble``.
Args:
smooth_kws: Dictionary to specify the dims to
smooth compatible with
:py:func:`~climpred.smoothing.spatial_smoothing_xesmf` or
:py:func:`~climpred.smoothing.temporal_smoothing`.
Shortcut for :cite:t:`Goddard2013` ``goddard2013``.
Defaults to ``None``.
            how: how to smooth temporally. Choose from ``["mean", "sum"]``.
Defaults to ``"mean"``.
**xesmf_kwargs: kwargs passed to
:py:func:`~climpred.smoothing.spatial_smoothing_xesmf`
Examples:
>>> PerfectModelEnsemble.get_initialized().lead.size
20
>>> PerfectModelEnsemble.smooth(
... {"lead": 4}, how="sum"
... ).get_initialized().lead.size
17
>>> HindcastEnsemble_3D.smooth({"lon": 1, "lat": 1})
<climpred.HindcastEnsemble>
Initialized:
SST (init, lead, lat, lon) float32 -0.3236 -0.3161 -0.3083 ... 0.0 0.0
Uninitialized:
None
Observations:
SST (time, lat, lon) float32 0.002937 0.001561 0.002587 ... 0.0 0.0 0.0
``smooth`` simultaneously aggregates spatially listening to ``lon`` and
``lat`` and temporally listening to ``lead`` or ``time``.
>>> HindcastEnsemble_3D.smooth(
... {"lead": 2, "lat": 5, "lon": 4}
... ).get_initialized().coords
Coordinates:
* init (init) object 1954-01-01 00:00:00 ... 2017-01-01 00:00:00
* lead (lead) int32 1 2 3 4 5 6 7 8 9
* lat (lat) float64 -9.75 -4.75
* lon (lon) float64 250.8 254.8 258.8 262.8
valid_time (lead, init) object 1955-01-01 00:00:00 ... 2026-01-01 00:00:00
>>> HindcastEnsemble_3D.smooth("goddard2013").get_initialized().coords
Coordinates:
* init (init) object 1954-01-01 00:00:00 ... 2017-01-01 00:00:00
* lead (lead) int32 1 2 3 4 5 6 7
* lat (lat) float64 -9.75 -4.75
* lon (lon) float64 250.8 255.8 260.8 265.8
valid_time (lead, init) object 1955-01-01 00:00:00 ... 2024-01-01 00:00:00
"""
if not smooth_kws:
return self
tsmooth_kws: Optional[Union[str, Dict[str, int]]] = None
d_lon_lat_kws: Optional[Union[str, Dict[str, int]]] = None
smooth_fct: Callable[..., xr.Dataset]
# get proper smoothing function based on smooth args
if isinstance(smooth_kws, str):
if "goddard" in smooth_kws:
if self._is_annual_lead:
smooth_fct = smooth_goddard_2013
tsmooth_kws = {"lead": 4} # default
d_lon_lat_kws = {"lon": 5, "lat": 5} # default
else:
raise ValueError(
"`goddard2013` smoothing only available for annual leads."
)
else:
raise ValueError(
                    'Please choose from the list of available smoothings: '
                    '["goddard2013"]'
)
        # TODO: actively searches for lon and lat in dims. Maybe this part of the code
# could be more robust in how it finds these two spatial dimensions regardless
# of name. Optional work in progress comment.
elif isinstance(smooth_kws, dict):
# goddard when time_dim and lon/lat given
if ("lon" in smooth_kws or "lat" in smooth_kws) and (
"lead" in smooth_kws or "time" in smooth_kws
):
smooth_fct = smooth_goddard_2013
# separate lon, lat keywords into d_lon_lat_kws
d_lon_lat_kws = dict()
tsmooth_kws = dict()
for c in ["lon", "lat"]:
if c in smooth_kws:
d_lon_lat_kws[c] = smooth_kws[c]
for c in ["lead", "time"]:
if c in smooth_kws:
tsmooth_kws[c] = smooth_kws[c]
# else only one smoothing operation
elif "lon" in smooth_kws or "lat" in smooth_kws:
smooth_fct = spatial_smoothing_xesmf
d_lon_lat_kws = smooth_kws
tsmooth_kws = None
elif "lead" in smooth_kws or "time" in smooth_kws:
smooth_fct = temporal_smoothing
d_lon_lat_kws = None
tsmooth_kws = smooth_kws
else:
raise ValueError(
                    'Please provide kwargs to fulfill functions: '
                    '["spatial_smoothing_xesmf", "temporal_smoothing"].'
)
else:
raise ValueError(
"Please provide kwargs as dict or str and not", type(smooth_kws)
)
self = self.map(
smooth_fct,
tsmooth_kws=tsmooth_kws,
d_lon_lat_kws=d_lon_lat_kws,
how=how,
**xesmf_kwargs,
)
if smooth_fct == smooth_goddard_2013 or smooth_fct == temporal_smoothing:
self._temporally_smoothed = tsmooth_kws
# recalc valid_time
del self._datasets["initialized"].coords["valid_time"]
self._datasets["initialized"] = add_time_from_init_lead(
self._datasets["initialized"]
)
return self
def remove_seasonality(
self, seasonality: Union[None, str] = None
) -> "PredictionEnsemble":
"""Remove seasonal cycle from :py:class:`.PredictionEnsemble`.
Args:
seasonality: Seasonality to be removed. Choose from:
``["season", "month", "weekofyear", "dayofyear"]``.
Defaults to ``OPTIONS["seasonality"]``.
Examples:
>>> HindcastEnsemble
<climpred.HindcastEnsemble>
Initialized:
SST (init, lead, member) float64 -0.2392 -0.2203 ... 0.618 0.6136
Uninitialized:
SST (time, member) float64 -0.1969 -0.01221 -0.275 ... 0.4179 0.3974
Observations:
SST (time) float32 -0.4015 -0.3524 -0.1851 ... 0.2481 0.346 0.4502
>>> # example already effectively without seasonal cycle
>>> HindcastEnsemble.remove_seasonality(seasonality="month")
<climpred.HindcastEnsemble>
Initialized:
SST (init, lead, member) float64 -0.2349 -0.216 ... 0.6476 0.6433
Uninitialized:
SST (time, member) float64 -0.1789 0.005732 -0.257 ... 0.4359 0.4154
Observations:
SST (time) float32 -0.3739 -0.3248 -0.1575 ... 0.2757 0.3736 0.4778
"""
def _remove_seasonality(ds, initialized_dim="init", seasonality=None):
"""Remove the seasonal cycle from the data."""
            if not isinstance(ds, xr.Dataset):
                # skip empty placeholders such as ``{}``
                return ds
if seasonality is None:
seasonality = OPTIONS["seasonality"]
dim = initialized_dim if initialized_dim in ds.dims else "time"
groupby = f"{dim}.{seasonality}"
if "member" in ds.dims:
clim = ds.mean("member").groupby(groupby).mean()
else:
clim = ds.groupby(groupby).mean()
anom = ds.groupby(groupby) - clim
return anom
return self.map(
_remove_seasonality,
seasonality=seasonality,
)
def _warn_if_chunked_along_init_member_time(self) -> None:
"""
        Warn if any of ``CLIMPRED_DIMS`` except ``lead`` is chunked into more
        than one chunk, and show how to circumvent the resulting
        ``xskillscore`` chunking ``ValueError``.
"""
suggest_one_chunk = []
for d in self.chunks:
if d in ["time", "init", "member"]:
if len(self.chunks[d]) > 1:
suggest_one_chunk.append(d)
if len(suggest_one_chunk) > 0:
name = (
str(type(self))
.replace("<class 'climpred.classes.", "")
.replace("'>", "")
)
# init cannot be dim when time chunked
suggest_one_chunk_time_to_init = suggest_one_chunk.copy()
if "time" in suggest_one_chunk_time_to_init:
suggest_one_chunk_time_to_init.remove("time")
suggest_one_chunk_time_to_init.append("init")
            msg = (
                f"{name} is chunked along dimensions {suggest_one_chunk} with more "
                f"than one chunk. `{name}.chunks={self.chunks}`.\nYou cannot call "
                f"`{name}.verify` or `{name}.bootstrap` in combination with any of "
                f"{suggest_one_chunk_time_to_init} passed as `dim`. In order to do "
                f"so, please rechunk {suggest_one_chunk} with `{name}.chunk("
                "{dim: -1}).verify(dim=dim)`.\nIf you do not want to use dimensions "
                f"{suggest_one_chunk_time_to_init} in `{name}.verify(dim=dim)`, you "
                "can disregard this warning."
            )
# chunk lead:1 in HindcastEnsemble
if self.kind == "hindcast":
            if self.kind == "hindcast":
                msg += (
                    "\nIn `HindcastEnsemble` you may also create one chunk per "
                    "lead, as `climpred` internally loops over lead, e.g. "
                    '`.chunk({"lead": 1}).verify()`.'
                )
# chunk auto on non-climpred dims
ndims = list(self.sizes)
for d in CLIMPRED_DIMS:
if d in ndims:
ndims.remove(d)
if len(ndims) > 0:
                msg += (
                    "\nConsider chunking embarrassingly parallel dimensions such as "
                    f"{ndims} automatically, i.e. "
                    f'`{name}.chunk({ndims[0]}="auto").verify(...)`.'
                )
warnings.warn(msg)
def _bootstrap(
self,
metric: metricType = None,
comparison: comparisonType = None,
dim: dimType = None,
alignment: alignmentType = None,
reference: referenceType = None,
groupby: groupbyType = None,
        iterations: Optional[int] = None,
        sig: int = 95,
        resample_dim: Optional[str] = None,
**metric_kwargs: metric_kwargsType,
) -> xr.Dataset:
"""PredictionEnsemble.bootstrap() parent method.
explain: OPTIONS['bootstrap_resample_skill_func']
See also:
* :py:meth:`~climpred.PerfectModelEnsemble.bootstrap`
* :py:meth:`~climpred.HindcastEnsemble.bootstrap`
"""
warn_if_chunking_would_increase_performance(
self.get_initialized(), crit_size_in_MB=50
)
_metric, _, _ = _get_metric_comparison_dim(
self.get_initialized(), metric, comparison, dim, kind=self.kind
)
if iterations is None:
raise ValueError("Designate number of bootstrapping `iterations`.")
if resample_dim is None:
resample_dim = "member"
reference = _check_valid_reference(reference)
reference = _sanitize_to_list(reference)
if reference is None:
reference = []
dim = _sanitize_to_list(dim)
if dim is None:
dim = list(self.get_initialized().isel(lead=0).dims)
verify_kwargs = dict(
dim=dim,
metric=metric,
comparison=comparison,
reference=reference,
**metric_kwargs,
)
if groupby is not None:
verify_kwargs["groupby"] = groupby
if self.kind == "hindcast":
verify_kwargs["alignment"] = alignment
if "uninitialized" in reference and not self.get_uninitialized():
                warnings.warn(
                    "'uninitialized' in `reference` but no `uninitialized` "
                    "dataset found. Therefore climpred runs "
                    f"`{str(type(self))}.generate_uninitialized()`."
                )
self2 = self.generate_uninitialized()
else:
self2 = self
skill = self2.verify(**verify_kwargs)
# different ways to compute resample_skill
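        # Dispatch sketch (descriptions follow the inline comments below):
        # - resample_skill_loop: resample and verify once per iteration
        #   (slow, but always works)
        # - resample_skill_exclude_resample_dim_from_dim: verify while
        #   excluding resample_dim from dim, then resample the skill (fast)
        # - resample_skill_resample_before: resample the input once with
        #   replacement, then verify (fast)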
if (
_metric.name in PEARSON_R_CONTAINING_METRICS
and self.kind == "hindcast"
and alignment
):
if ("same_verif" in alignment) & (resample_dim == "init"):
raise KeywordError(
"Cannot have alignment='same_verifs' and resample_dim='init' and "
"metric='pearson_r'. Change `resample_dim` to 'member' to keep "
"common verification alignment or `alignment` to 'same_inits' to "
"resample over initializations."
)
if OPTIONS["bootstrap_resample_skill_func"] == "default":
if (
_metric.name in PEARSON_R_CONTAINING_METRICS
and self.kind == "hindcast"
and resample_dim == "init"
):
# slow: loop and verify each time
# used for HindcastEnsemble.bootstrap(metric='acc')
resampled_skills = resample_skill_loop(
self, iterations, resample_dim, verify_kwargs
)
elif (
alignment in ["same_verifs", "same_verif"]
and self.kind == "hindcast"
and resample_dim == "init"
):
# allow https://github.com/pangeo-data/climpred/issues/582
resampled_skills = resample_skill_exclude_resample_dim_from_dim(
self, iterations, resample_dim, verify_kwargs
)
elif (
resample_dim == "init"
and self.kind == "hindcast"
and not _metric.probabilistic
and _metric.name != "rmse"
):
# fast way by verify(dim=[]) and then resampling init
# used for HindcastEnsemble.bootstrap(resample_dim='init')
resampled_skills = resample_skill_exclude_resample_dim_from_dim(
self, iterations, resample_dim, verify_kwargs
)
elif (
resample_dim == "init"
and self.kind == "hindcast"
and _metric.probabilistic
and "member" in dim
and _metric.name
not in ["rank_histogram", "discrimination", "reliability"]
):
# fast way by verify(dim=[]) and then resampling init
# used for HindcastEnsemble.bootstrap(resample_dim='init')
resampled_skills = resample_skill_exclude_resample_dim_from_dim(
self, iterations, resample_dim, verify_kwargs
)
elif (
resample_dim == "member"
and self.kind == "hindcast"
and _metric.name
in ["threshold_brier_score", "reliability", "rank_histogram"]
):
resampled_skills = resample_skill_loop(
self, iterations, resample_dim, verify_kwargs
)
elif resample_dim == "member" or self.kind == "perfect":
resampled_skills = resample_skill_resample_before(
self, iterations, resample_dim, verify_kwargs
)
else:
# slow: loop and verify each time, but always works
resampled_skills = resample_skill_loop(
self, iterations, resample_dim, verify_kwargs
)
else:
resample_skill_func = eval(
f"resample_skill_{OPTIONS['bootstrap_resample_skill_func']}"
)
resampled_skills = resample_skill_func(
self, iterations, resample_dim, verify_kwargs
)
# continue with skill and resampled_skills
if (
"uninitialized" in reference
and OPTIONS["bootstrap_uninitialized_from_iterations_mean"]
):
skill = xr.concat(
[
skill.sel(skill="initialized"),
resampled_skills.sel(skill="uninitialized").mean("iteration"),
skill.drop_sel(skill=["initialized", "uninitialized"]),
],
"skill",
coords="minimal",
)
logging.info("exchange uninit with iteration mean skill passed")
p, ci_low, ci_high = _p_ci_from_sig(sig)
ci = _distribution_to_ci(resampled_skills, ci_low, ci_high)
results_list = [
skill,
ci.sel(quantile=ci_low, drop=True),
ci.sel(quantile=ci_high, drop=True),
]
results_labels = ["verify skill", "low_ci", "high_ci"]
if reference != []:
pvalue = _pvalue_from_distributions(
resampled_skills.drop_sel(skill="initialized"),
resampled_skills.sel(skill="initialized"),
metric=_metric,
)
pvalue = xr.concat(
[
pvalue.isel(skill=0, drop=True)
.assign_coords(skill="initialized")
.where(1 == 2), # adds all NaN
pvalue,
],
"skill",
**CONCAT_KWARGS,
)
results_list.insert(1, pvalue)
results_labels.insert(1, "p")
results = xr.concat(
results_list,
dim="results",
coords="minimal",
compat="override", # take vars/coords from first
).assign_coords(
skill=["initialized"] + reference,
results=("results", results_labels),
)
if results.skill.size == 1:
results = results.isel(skill=0)
results_dims = (
["skill", "results", "lead"]
if "skill" in list(results_list[0].dims)
else ["results", "lead"]
)
results = results.transpose(*results_dims, ...)
results = assign_attrs(
results,
self.get_initialized(),
function_name=f"{type(self).__name__}.bootstrap()",
**verify_kwargs,
resample_dim=resample_dim,
sig=sig,
iterations=iterations,
)
return results
class PerfectModelEnsemble(PredictionEnsemble):
"""An object for "perfect model" prediction ensembles.
:py:class:`.PerfectModelEnsemble` is a sub-class of
:py:class:`.PredictionEnsemble`. It tracks
the control run used to initialize the ensemble for easy computations,
bootstrapping, etc.
This object is built on ``xarray`` and thus requires the input object to
be an :py:class:`xarray.Dataset` or :py:class:`xarray.DataArray`.
"""
    def __init__(self, initialized: Union[xr.DataArray, xr.Dataset]) -> None:
"""Create a :py:class:`.PerfectModelEnsemble` object.
Args:
initialized: prediction ensemble output.
Attributes:
control: datasets dictionary item of control simulation associated with the
initialized ensemble.
uninitialized: datasets dictionary item of uninitialized forecast.
"""
super().__init__(initialized)
# Reserve sub-dictionary for the control simulation.
self._datasets.update({"control": {}})
self.kind = "perfect"
def _apply_climpred_function(
self,
func: Callable[..., xr.Dataset],
input_dict: Dict[str, Any],
**kwargs: Any,
) -> Union["PerfectModelEnsemble", xr.Dataset]:
"""Loop through observations and apply an arbitrary climpred function.
Args:
func: climpred function to apply to object.
input_dict: dictionary with the following things:
* ensemble: initialized or uninitialized ensemble.
            * control: control dataset of the PerfectModelEnsemble, or None.
* init (bool): True if the initialized ensemble, False if uninitialized.
"""
ensemble = input_dict["ensemble"]
control = input_dict["control"]
init = input_dict["init"]
init_vars, ctrl_vars = self._vars_to_drop(init=init)
ensemble = ensemble.drop_vars(init_vars)
if control:
control = control.drop_vars(ctrl_vars)
return func(ensemble, control, **kwargs)
def _vars_to_drop(self, init: bool = True) -> Tuple[List[str], List[str]]:
"""Return list of variables to drop when comparing datasets.
This is useful if the two products being compared do not share the same
variables. I.e., if the control has ["SST"] and the initialized has
["SST", "SALT"], this will return a list with ["SALT"] to be dropped
from the initialized.
Args:
init:
If ``True``, check variables on the initialized.
If ``False``, check variables on the uninitialized.
Returns:
Lists of variables to drop from the initialized/uninitialized
and control Datasets.
"""
init_str = "initialized" if init else "uninitialized"
init_vars = list(self._datasets[init_str])
# only drop if control present
if self._datasets["control"]:
ctrl_vars = list(self._datasets["control"])
# Make lists of variables to drop that aren't in common
# with one another.
init_vars_to_drop = list(set(init_vars) - set(ctrl_vars))
ctrl_vars_to_drop = list(set(ctrl_vars) - set(init_vars))
else:
init_vars_to_drop, ctrl_vars_to_drop = [], []
return init_vars_to_drop, ctrl_vars_to_drop
    def add_control(
self, control: Union[xr.DataArray, xr.Dataset]
) -> "PerfectModelEnsemble":
"""Add the control run that initialized the prediction ensemble.
Args:
control: control run.
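
        Example:
            A sketch, assuming ``control`` is an :py:class:`xarray.Dataset`
            sharing dimensions and variables with the initialized ensemble
            (output omitted):

            >>> PerfectModelEnsemble.add_control(control)  # doctest: +SKIP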
"""
# NOTE: These should all be decorators.
if isinstance(control, xr.DataArray):
control = control.to_dataset()
match_initialized_dims(self._datasets["initialized"], control)
match_initialized_vars(self._datasets["initialized"], control)
# Check that init is int, cftime, or datetime; convert ints or cftime to
# datetime.
control = convert_time_index(control, "time", "control[time]")
# Check that converted/original cftime calendar is the same as the
# initialized calendar to avoid any alignment errors.
match_calendars(self._datasets["initialized"], control, kind2="control")
datasets = self._datasets.copy()
datasets.update({"control": control})
return self._construct_direct(datasets, kind="perfect")
    def generate_uninitialized(self) -> "PerfectModelEnsemble":
"""Generate an uninitialized ensemble by resampling from the control simulation.
Returns:
            ``uninitialized`` resampled from ``control`` added
to :py:class:`.PerfectModelEnsemble`
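
        Example:
            A sketch; requires a control run added via
            :py:meth:`.PerfectModelEnsemble.add_control` (output omitted):

            >>> PerfectModelEnsemble.generate_uninitialized()  # doctest: +SKIP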
"""
has_dataset(
self._datasets["control"], "control", "generate an uninitialized ensemble."
)
uninit = bootstrap_uninit_pm_ensemble_from_control_cftime(
self._datasets["initialized"], self._datasets["control"]
)
uninit.coords["valid_time"] = self.get_initialized().coords["valid_time"]
datasets = self._datasets.copy()
datasets.update({"uninitialized": uninit})
return self._construct_direct(datasets, kind="perfect")
    def get_control(self) -> xr.Dataset:
"""Return the control as an :py:class:`xarray.Dataset`."""
return self._datasets["control"]
    def verify(
self,
metric: metricType = None,
comparison: comparisonType = None,
dim: dimType = None,
reference: referenceType = None,
groupby: groupbyType = None,
**metric_kwargs: metric_kwargsType,
) -> xr.Dataset:
"""Verify initialized predictions against a configuration of its members.
.. note::
The configuration of the other ensemble members is based off of the
``comparison`` keyword argument.
Args:
metric: Metric to apply for verification, see `metrics <../metrics.html>`_
comparison: How to compare the initialized prediction ensemble with itself,
see `comparisons <../comparisons.html>`_.
dim: Dimension(s) over which to apply ``metric``.
``dim`` is passed on to xskillscore.{metric} and includes xskillscore's
``member_dim``. ``dim`` should contain ``member`` when ``comparison``
is probabilistic but should not contain ``member`` when
``comparison=e2c``. Defaults to ``None`` meaning that all dimensions
other than ``lead`` are reduced.
reference: Type of reference forecasts with which to verify against.
One or more of ``["uninitialized", "persistence", "climatology"]``.
Defaults to ``None`` meaning no reference.
                For ``persistence``, choose between
                ``set_options(PerfectModel_persistence_from_initialized_lead_0=False)``
                (default) using :py:func:`~climpred.reference.compute_persistence` or
                ``set_options(PerfectModel_persistence_from_initialized_lead_0=True)``
                using
                :py:func:`~climpred.reference.compute_persistence_from_first_lead`.
groupby: group ``init`` before passing ``initialized`` to ``verify``.
**metric_kwargs: Arguments passed to ``metric``.
Returns:
``initialized`` and ``reference`` forecast skill reduced by dimensions
``dim``
Example:
Root mean square error (``rmse``) comparing every member with the
ensemble mean forecast (``m2e``) for all leads reducing dimensions
``init`` and ``member``:
>>> PerfectModelEnsemble.verify(
... metric="rmse", comparison="m2e", dim=["init", "member"]
... )
<xarray.Dataset>
Dimensions: (lead: 20)
Coordinates:
* lead (lead) int64 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20
Data variables:
tos (lead) float32 0.1028 0.1249 0.1443 0.1707 ... 0.2113 0.2452 0.2297
Attributes:
prediction_skill_software: climpred https://climpred.readthedocs.io/
skill_calculated_by_function: PerfectModelEnsemble.verify()
number_of_initializations: 12
number_of_members: 10
metric: rmse
comparison: m2e
dim: ['init', 'member']
reference: []
Continuous Ranked Probability Score (``"crps"``) comparing every member to every
other member (``"m2m"``) reducing dimensions ``member`` and ``init`` while
also calculating reference skill for the ``persistence``, ``climatology``
and ``uninitialized`` forecast.
>>> PerfectModelEnsemble.verify(
... metric="crps",
... comparison="m2m",
... dim=["init", "member"],
... reference=["persistence", "climatology", "uninitialized"],
... )
<xarray.Dataset>
Dimensions: (skill: 4, lead: 20)
Coordinates:
* lead (lead) int64 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20
* skill (skill) <U13 'initialized' 'persistence' ... 'uninitialized'
Data variables:
tos (skill, lead) float64 0.0621 0.07352 0.08678 ... 0.122 0.1246
Attributes:
prediction_skill_software: climpred https://climp...
skill_calculated_by_function: PerfectModelEnsemble.v...
number_of_initializations: 12
number_of_members: 10
metric: crps
comparison: m2m
dim: ['init', 'member']
reference: ['persistence', 'clima...
PerfectModel_persistence_from_initialized_lead_0: False
"""
if groupby is not None:
return self._groupby(
"verify",
groupby,
reference=reference,
metric=metric,
comparison=comparison,
dim=dim,
**metric_kwargs,
)
reference = _check_valid_reference(reference)
input_dict = {
"ensemble": self._datasets["initialized"],
"control": self._datasets["control"]
if isinstance(self._datasets["control"], xr.Dataset)
else None,
"init": True,
}
result = self._apply_climpred_function(
compute_perfect_model,
input_dict=input_dict,
metric=metric,
comparison=comparison,
dim=dim,
**metric_kwargs,
)
if self._temporally_smoothed:
result = _reset_temporal_axis(result, self._temporally_smoothed, dim="lead")
result["lead"].attrs = self.get_initialized().lead.attrs
# compute reference skills
if reference:
for r in reference:
dim_orig = deepcopy(dim) # preserve dim, because
ref_compute_kwargs = deepcopy(metric_kwargs) # persistence changes dim
ref_compute_kwargs.update(
{"dim": dim_orig, "metric": metric, "comparison": comparison}
)
if r == "persistence":
if not OPTIONS["PerfectModel_persistence_from_initialized_lead_0"]:
del ref_compute_kwargs["comparison"]
ref = getattr(self, f"_compute_{r}")(**ref_compute_kwargs)
result = xr.concat([result, ref], dim="skill", **CONCAT_KWARGS)
result = result.assign_coords(skill=["initialized"] + reference)
result = assign_attrs(
result,
self.get_initialized(),
function_name="PerfectModelEnsemble.verify()",
metric=metric,
comparison=comparison,
dim=dim,
reference=reference,
**metric_kwargs,
)
return result.squeeze()
def _compute_uninitialized(
self,
metric: metricType = None,
comparison: comparisonType = None,
dim: dimType = None,
**metric_kwargs: metric_kwargsType,
) -> xr.Dataset:
"""Verify the bootstrapped uninitialized run against itself.
.. note::
The configuration of the other ensemble members is based off of the
``comparison`` keyword argument.
Args:
metric: Metric to apply for verification, see `metrics <../metrics.html>`_
comparison: How to compare the uninitialized against itself, see
`comparisons <../comparisons.html>`_.
dim: Dimension(s) over which to apply metric.
``dim`` is passed on to xskillscore.{metric} and includes xskillscore's
``member_dim``. ``dim`` should contain ``member`` when ``comparison``
is probabilistic but should not contain ``member`` when
``comparison=e2c``. Defaults to ``None``, meaning that all dimensions
other than ``lead`` are reduced.
**metric_kwargs: Arguments passed to ``metric``.
Returns:
``initialized`` and ``reference`` forecast skill reduced by dimensions
``dim``
"""
has_dataset(
self._datasets["uninitialized"],
"uninitialized",
"compute an uninitialized metric",
)
input_dict = {
"ensemble": self._datasets["uninitialized"],
"control": self._datasets["control"]
if isinstance(self._datasets["control"], xr.Dataset)
else None,
"init": False,
}
if dim is None:
dim = list(self._datasets["initialized"].isel(lead=0).dims)
res = self._apply_climpred_function(
compute_perfect_model,
input_dict=input_dict,
metric=metric,
comparison=comparison,
dim=dim,
**metric_kwargs,
)
if self._temporally_smoothed:
res = _reset_temporal_axis(res, self._temporally_smoothed, dim="lead")
res["lead"].attrs = self.get_initialized().lead.attrs
return res
def _compute_persistence(
self,
metric: metricType = None,
dim: dimType = None,
**metric_kwargs: metric_kwargsType,
):
"""Verify a simple persistence forecast of the control run against itself.
Note: uses :py:func:`~climpred.reference.compute_persistence_from_first_lead`
if ``set_options(PerfectModel_persistence_from_initialized_lead_0=True)`` else
:py:func:`~climpred.reference.compute_persistence`.
Args:
metric: Metric to apply for verification, see `metrics <../metrics.html>`_
comparison: How to compare the persistence against itself, see
`comparisons <../comparisons.html>`_. Only valid if
``PerfectModel_persistence_from_initialized_lead_0=True``.
dim: Dimension(s) over which to apply metric.
``dim`` is passed on to xskillscore.{metric} and includes xskillscore's
``member_dim``. ``dim`` should contain ``member`` when ``comparison``
is probabilistic but should not contain ``member`` when
``comparison=e2c``. Defaults to ``None``, meaning that all dimensions
other than ``lead`` are reduced.
**metric_kwargs: Arguments passed to ``metric``.
Returns:
persistence forecast skill.
References:
* Chapter 8 (Short-Term Climate Prediction) in
Van den Dool, Huug. Empirical methods in short-term climate
prediction. Oxford University Press, 2007.
"""
if dim is None:
dim = list(self._datasets["initialized"].isel(lead=0).dims)
        compute_persistence_func: Callable[..., xr.Dataset]
        if OPTIONS["PerfectModel_persistence_from_initialized_lead_0"]:
            compute_persistence_func = compute_persistence_from_first_lead
if self.get_initialized().lead[0] != 0:
if OPTIONS["warn_for_failed_PredictionEnsemble_xr_call"]:
warnings.warn(
"Calculate persistence from "
f"lead={int(self.get_initialized().lead[0].values)} instead "
"of lead=0 (recommended)."
)
else:
compute_persistence_func = compute_persistence
if self._datasets["control"] == {}:
warnings.warn(
"You may also calculate persistence based on "
"``initialized.isel(lead=0)`` by changing "
" ``set_options(PerfectModel_persistence_from_initialized_lead_0=True)``." # noqa: E501
)
has_dataset(
self._datasets["control"], "control", "compute a persistence forecast"
)
input_dict = {
"ensemble": self._datasets["initialized"],
"control": self._datasets["control"],
"init": True,
}
res = self._apply_climpred_function(
compute_persistence_func,
input_dict=input_dict,
metric=metric,
alignment="same_inits",
dim=dim,
**metric_kwargs,
)
if self._temporally_smoothed:
res = _reset_temporal_axis(res, self._temporally_smoothed, dim="lead")
res["lead"].attrs = self.get_initialized().lead.attrs
return res
def _compute_climatology(
self,
metric: metricType = None,
comparison: comparisonType = None,
dim: dimType = None,
**metric_kwargs: metric_kwargsType,
) -> xr.Dataset:
"""Verify a climatology forecast.
Args:
metric: Metric to apply for verification, see `metrics <../metrics.html>`_
comparison: How to compare the climatology against itself, see
`comparisons <../comparisons.html>`_
dim: Dimension(s) over which to apply metric.
``dim`` is passed on to xskillscore.{metric} and includes xskillscore's
``member_dim``. ``dim`` should contain ``member`` when ``comparison``
is probabilistic but should not contain ``member`` when
``comparison=e2c``. Defaults to ``None``, meaning that all dimensions
other than ``lead`` are reduced.
**metric_kwargs: Arguments passed to ``metric``.
Returns:
climatology forecast skill
References:
* Chapter 8 (Short-Term Climate Prediction) in
Van den Dool, Huug. Empirical methods in short-term climate
prediction. Oxford University Press, 2007.
"""
input_dict = {
"ensemble": self._datasets["initialized"],
"control": self._datasets["control"]
if isinstance(self._datasets["control"], xr.Dataset)
else None,
"init": True,
}
if dim is None:
dim = list(self.get_initialized().isel(lead=0).dims)
res = self._apply_climpred_function(
compute_climatology,
input_dict=input_dict,
metric=metric,
comparison=comparison,
dim=dim,
**metric_kwargs,
)
if self._temporally_smoothed:
res = _reset_temporal_axis(res, self._temporally_smoothed, dim="lead")
res["lead"].attrs = self.get_initialized().lead.attrs
return res
    def bootstrap(
self,
metric: metricType = None,
comparison: comparisonType = None,
dim: dimType = None,
reference: referenceType = None,
groupby: groupbyType = None,
iterations: Optional[int] = None,
sig: int = 95,
resample_dim: str = "member",
**metric_kwargs: metric_kwargsType,
) -> xr.Dataset:
"""Bootstrap with replacement according to :cite:t:`Goddard2013`.
Args:
metric: Metric to apply for verification, see `metrics <../metrics.html>`_
comparison: How to compare the forecast against itself, see
`comparisons <../comparisons.html>`_
dim: Dimension(s) over which to apply metric.
``dim`` is passed on to xskillscore.{metric} and includes xskillscore's
``member_dim``. ``dim`` should contain ``member`` when ``comparison``
is probabilistic but should not contain ``member`` when
``comparison=e2c``. Defaults to ``None`` meaning that all dimensions
other than ``lead`` are reduced.
reference: Type of reference forecasts with which to verify against.
One or more of ``["uninitialized", "persistence", "climatology"]``.
Defaults to ``None`` meaning no reference.
If ``None`` or ``[]``, returns no p value.
                For ``persistence``, choose between
                ``set_options(PerfectModel_persistence_from_initialized_lead_0=False)``
                (default) using :py:func:`~climpred.reference.compute_persistence` or
                ``set_options(PerfectModel_persistence_from_initialized_lead_0=True)``
                using
                :py:func:`~climpred.reference.compute_persistence_from_first_lead`.
iterations: Number of resampling iterations for bootstrapping with
replacement. Recommended >= 500.
            resample_dim: dimension to resample from. Defaults to ``"member"``.
                - ``"member"``: select a different set of members from forecast
                - ``"init"``: select a different set of initializations from forecast
sig: Significance level in percent for deciding whether
uninitialized and persistence beat initialized skill.
groupby: group ``init`` before passing ``initialized`` to ``bootstrap``.
**metric_kwargs: arguments passed to ``metric``.
Returns:
:py:class:`xarray.Dataset` with dimensions ``results`` (holding
``verify skill``, ``p``, ``low_ci`` and ``high_ci``) and ``skill``
(holding ``initialized``, ``persistence`` and/or ``uninitialized``):
* results="verify skill", skill="initialized":
mean initialized skill
* results="high_ci", skill="initialized":
high confidence interval boundary for initialized skill
* results="p", skill="uninitialized":
p value of the hypothesis that the
difference of skill between the initialized and
uninitialized simulations is smaller or equal to zero
based on bootstrapping with replacement.
* results="p", skill="persistence":
p value of the hypothesis that the
                difference of skill between the initialized and persistence
simulations is smaller or equal to zero based on
bootstrapping with replacement.
Reference:
:cite:t:`Goddard2013`
Example:
Continuous Ranked Probability Score (``"crps"``) comparing every
member to every other member (``"m2m"``) reducing dimensions ``member`` and
``init`` 50 times after resampling ``member`` dimension with replacement.
Also calculate reference skill for the ``"persistence"``, ``"climatology"``
and ``"uninitialized"`` forecast and compare whether initialized skill is
better than reference skill: Returns verify skill, probability that
reference forecast performs better than initialized and the lower and
upper bound of the resample.
>>> PerfectModelEnsemble.bootstrap(
... metric="crps",
... comparison="m2m",
... dim=["init", "member"],
... iterations=50,
... resample_dim="member",
... reference=["persistence", "climatology", "uninitialized"],
... )
<xarray.Dataset>
Dimensions: (skill: 4, results: 4, lead: 20)
Coordinates:
* lead (lead) int64 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20
* skill (skill) <U13 'initialized' 'persistence' ... 'uninitialized'
* results (results) <U12 'verify skill' 'p' 'low_ci' 'high_ci'
Data variables:
tos (skill, results, lead) float64 0.0621 0.07352 ... 0.117 0.09826
Attributes: (12/13)
prediction_skill_software: climpred https://climp...
skill_calculated_by_function: PerfectModelEnsemble.b...
number_of_initializations: 12
number_of_members: 10
metric: crps
comparison: m2m
... ...
reference: ['persistence', 'clima...
PerfectModel_persistence_from_initialized_lead_0: False
resample_dim: member
sig: 95
iterations: 50
confidence_interval_levels: 0.975-0.025
"""
return self._bootstrap(
metric=metric,
comparison=comparison,
dim=dim,
reference=reference,
groupby=groupby,
iterations=iterations,
sig=sig,
resample_dim=resample_dim,
**metric_kwargs,
)
class HindcastEnsemble(PredictionEnsemble):
"""An object for initialized prediction ensembles.
:py:class:`.HindcastEnsemble` is a sub-class of
:py:class:`.PredictionEnsemble`. It tracks a
verification dataset (i.e., observations) associated with the hindcast ensemble
for easy computation across multiple variables.
This object is built on :py:class:`xarray.Dataset` and thus requires the input
object to be an :py:class:`xarray.Dataset` or :py:class:`xarray.DataArray`.
"""
    def __init__(self, initialized: Union[xr.DataArray, xr.Dataset]) -> None:
"""Create ``HindcastEnsemble`` from initialized prediction ensemble output.
Args:
initialized: initialized prediction ensemble output.
Attributes:
observations: datasets dictionary item of verification data to associate with
the prediction ensemble.
uninitialized: datasets dictionary item of uninitialized forecast.
"""
super().__init__(initialized)
self._datasets.update({"observations": {}})
self.kind = "hindcast"
def _apply_climpred_function(
self, func: Callable[..., xr.Dataset], init: bool, **kwargs: Any
) -> Union["HindcastEnsemble", xr.Dataset]:
"""Loop through verification data and apply an arbitrary climpred function.
Args:
func: climpred function to apply to object.
init: Whether or not it's the initialized ensemble.
"""
hind = self._datasets["initialized"]
verif = self._datasets["observations"]
drop_init, drop_obs = self._vars_to_drop(init=init)
return func(hind.drop_vars(drop_init), verif.drop_vars(drop_obs), **kwargs)
def _vars_to_drop(self, init: bool = True) -> Tuple[List[str], List[str]]:
"""Return list of variables to drop.
When comparing initialized/uninitialized to observations.
This is useful if the two products being compared do not share the same
variables. I.e., if the observations have ["SST"] and the initialized has
["SST", "SALT"], this will return a list with ["SALT"] to be dropped
from the initialized.
Args:
init:
If ``True``, check variables on the initialized.
If ``False``, check variables on the uninitialized.
Returns:
Lists of variables to drop from the initialized/uninitialized
and observational Datasets.
"""
if init:
init_vars = list(self._datasets["initialized"].data_vars)
else:
init_vars = list(self._datasets["uninitialized"].data_vars)
obs_vars = list(self._datasets["observations"].data_vars)
# Make lists of variables to drop that aren't in common
# with one another.
init_vars_to_drop = list(set(init_vars) - set(obs_vars))
obs_vars_to_drop = list(set(obs_vars) - set(init_vars))
return init_vars_to_drop, obs_vars_to_drop
def add_observations(
self, obs: Union[xr.DataArray, xr.Dataset]
) -> "HindcastEnsemble":
"""Add verification data against which to verify the initialized ensemble.
Same as :py:meth:`.HindcastEnsemble.add_verification`.
Args:
obs: observations added to :py:class:`.HindcastEnsemble`.
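Example:
A sketch of typical usage, assuming ``obs`` is an
:py:class:`xarray.Dataset` with a ``time`` dimension and variables
shared with ``initialized``:
>>> HindcastEnsemble.add_observations(obs)  # doctest: +SKIP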
"""
if isinstance(obs, xr.DataArray):
obs = obs.to_dataset()
match_initialized_dims(self._datasets["initialized"], obs)
match_initialized_vars(self._datasets["initialized"], obs)
# Check that time is int, cftime, or datetime; convert ints or cftime to
# datetime.
obs = convert_time_index(obs, "time", "obs[time]")
# Check that converted/original cftime calendar is the same as the
# initialized calendar to avoid any alignment errors.
match_calendars(self._datasets["initialized"], obs)
datasets = self._datasets.copy()
datasets.update({"observations": obs})
return self._construct_direct(datasets, kind="hindcast")
def add_verification(
self, verif: Union[xr.DataArray, xr.Dataset]
) -> "HindcastEnsemble":
"""Add verification data against which to verify the initialized ensemble.
Same as :py:meth:`.HindcastEnsemble.add_observations`.
Args:
verif: verification added to :py:class:`.HindcastEnsemble`.
"""
return self.add_observations(verif)
def add_uninitialized(
self, uninit: Union[xr.DataArray, xr.Dataset]
) -> "HindcastEnsemble":
"""Add a companion uninitialized ensemble for comparison to verification data.
Args:
uninit: uninitialized ensemble.
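Example:
A sketch of typical usage, assuming ``uninit`` is an
:py:class:`xarray.Dataset` with dimensions ``time`` (and optionally
``member``) and variables shared with ``initialized``:
>>> HindcastEnsemble.add_uninitialized(uninit)  # doctest: +SKIP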
"""
if isinstance(uninit, xr.DataArray):
uninit = uninit.to_dataset()
match_initialized_dims(
self._datasets["initialized"], uninit, uninitialized=True
)
match_initialized_vars(self._datasets["initialized"], uninit)
# Check that time is int, cftime, or datetime; convert ints or cftime to
# datetime.
uninit = convert_time_index(uninit, "time", "uninit[time]")
# Check that converted/original cftime calendar is the same as the
# initialized calendar to avoid any alignment errors.
match_calendars(self._datasets["initialized"], uninit, kind2="uninitialized")
datasets = self._datasets.copy()
datasets.update({"uninitialized": uninit})
return self._construct_direct(datasets, kind="hindcast")
def get_observations(self) -> xr.Dataset:
"""Return the :py:class:`xarray.Dataset` of the observations/verification data.
Returns:
observations
"""
return self._datasets["observations"]
def generate_uninitialized(
self, resample_dim: List[str] = ["init", "member"]
) -> "HindcastEnsemble":
"""Generate ``uninitialized`` by resampling from ``initialized``.
Args:
resample_dim: dimension to resample from. Must contain ``"init"``.
Returns:
resampled ``uninitialized`` ensemble added to
:py:class:`.HindcastEnsemble`
Example:
>>> HindcastEnsemble # uninitialized from historical simulations
<climpred.HindcastEnsemble>
Initialized:
SST (init, lead, member) float64 -0.2392 -0.2203 ... 0.618 0.6136
Uninitialized:
SST (time, member) float64 -0.1969 -0.01221 -0.275 ... 0.4179 0.3974
Observations:
SST (time) float32 -0.4015 -0.3524 -0.1851 ... 0.2481 0.346 0.4502
>>> HindcastEnsemble.generate_uninitialized() # newly generated from initialized
<climpred.HindcastEnsemble>
Initialized:
SST (init, lead, member) float64 -0.2392 -0.2203 ... 0.618 0.6136
Uninitialized:
SST (time, member) float64 0.04868 0.07173 0.09435 ... 0.4158 0.418
Observations:
SST (time) float32 -0.4015 -0.3524 -0.1851 ... 0.2481 0.346 0.4502
"""
uninit = resample_uninitialized_from_initialized(
self._datasets["initialized"], resample_dim=resample_dim
)
datasets = self._datasets.copy()
datasets.update({"uninitialized": uninit})
return self._construct_direct(datasets, kind="hindcast")
def plot_alignment(
self: "HindcastEnsemble",
alignment: Optional[Union[str, List[str]]] = None,
reference: Optional[referenceType] = None,
date2num_units: str = "days since 1960-01-01",
return_xr: bool = False,
cmap: str = "viridis",
edgecolors: str = "gray",
**plot_kwargs: Any,
) -> Any:
"""
Plot the ``initialized`` ``valid_time`` values that match ``verification``
``time``, depending on ``alignment``. Values are plotted as
``days since reference date``, controlled by ``date2num_units``.
``NaN`` / white space shows where no verification is done.
Args:
alignment: which inits or verification times should be aligned?
- ``"maximize"``: maximize the degrees of freedom by slicing
``initialized`` and ``verif`` to a common time frame at each lead.
- ``"same_inits"``: slice to a common ``init`` frame prior to computing
metric. This philosophy follows the thought that each lead should be
based on the same set of initializations.
- ``"same_verif"``: slice to a common/consistent verification time frame
prior to computing metric. This philosophy follows the thought that
each lead should be based on the same set of verification dates.
- ``None`` defaults to the three above.
reference: Type of reference forecasts with which to verify against.
One or more of ``["uninitialized", "persistence", "climatology"]``.
Defaults to ``None`` meaning no reference.
date2num_units: passed to ``cftime.date2num`` as units
return_xr: if ``True`` return :py:class:`xarray.DataArray` else plot
cmap: color palette
edgecolors: color of the edges in the plot
**plot_kwargs: arguments passed to ``plot``.
Returns:
:py:class:`xarray.DataArray` if ``return_xr`` else plot
Example:
>>> HindcastEnsemble.plot_alignment(alignment=None, return_xr=True)
<xarray.DataArray 'valid_time' (alignment: 3, lead: 10, init: 61)>
array([[[-1826., -1461., -1095., ..., nan, nan, nan],
[-1461., -1095., -730., ..., nan, nan, nan],
[-1095., -730., -365., ..., nan, nan, nan],
...,
[ 731., 1096., 1461., ..., nan, nan, nan],
[ 1096., 1461., 1827., ..., nan, nan, nan],
[ 1461., 1827., 2192., ..., nan, nan, nan]],
<BLANKLINE>
[[ nan, nan, nan, ..., 19359., 19724., 20089.],
[ nan, nan, nan, ..., 19724., 20089., nan],
[ nan, nan, nan, ..., 20089., nan, nan],
...,
[ nan, nan, 1461., ..., nan, nan, nan],
[ nan, 1461., 1827., ..., nan, nan, nan],
[ 1461., 1827., 2192., ..., nan, nan, nan]],
<BLANKLINE>
[[-1826., -1461., -1095., ..., 19359., 19724., 20089.],
[-1461., -1095., -730., ..., 19724., 20089., nan],
[-1095., -730., -365., ..., 20089., nan, nan],
...,
[ 731., 1096., 1461., ..., nan, nan, nan],
[ 1096., 1461., 1827., ..., nan, nan, nan],
[ 1461., 1827., 2192., ..., nan, nan, nan]]])
Coordinates:
* init (init) object 1954-01-01 00:00:00 ... 2014-01-01 00:00:00
* lead (lead) int32 1 2 3 4 5 6 7 8 9 10
* alignment (alignment) <U10 'same_init' 'same_verif' 'maximize'
valid_time (lead, init) object 1955-01-01 00:00:00 ... 2024-01-01 00:00:00
Attributes:
units: days since 1960-01-01
>>> HindcastEnsemble.plot_alignment(
... alignment="same_verifs"
... ) # doctest: +SKIP
<matplotlib.collections.QuadMesh object at 0x1405c1520>
See also:
https://climpred.readthedocs.io/en/stable/alignment.html.
"""
from .graphics import _verif_dates_xr
if alignment is None or alignment == []:
alignment = ["same_init", "same_verif", "maximize"]
if isinstance(alignment, str):
alignment = [alignment]
alignment_dates = []
alignments_success = []
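# Try each requested alignment; collect those that succeed and only
# warn when an alignment raises a CoordinateError.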
for a in alignment:
try:
alignment_dates.append(
_verif_dates_xr(self, a, reference, date2num_units)
)
alignments_success.append(a)
except CoordinateError as e:
warnings.warn(f"alignment='{a}' failed. CoordinateError: {e}")
verif_dates_xr = (
xr.concat(
alignment_dates,
"alignment",
)
.assign_coords(alignment=alignments_success)
.squeeze()
)
if "alignment" in verif_dates_xr.dims:
plot_kwargs["col"] = "alignment"
if return_xr:
return add_time_from_init_lead(verif_dates_xr)
try:
import nc_time_axis  # noqa
# Compare version components numerically; concatenating digits
# (e.g. "1.2.10" -> 1210) would wrongly pass versions below 1.4.0.
assert tuple(
int(v) for v in nc_time_axis.__version__.split(".")[:2]
) >= (1, 4)
return verif_dates_xr.plot(cmap=cmap, edgecolors=edgecolors, **plot_kwargs)
except (ImportError, AssertionError):
raise ValueError("nc_time_axis>=1.4.0 required for plotting.")
def verify(
self,
metric: metricType = None,
comparison: comparisonType = None,
dim: dimType = None,
alignment: alignmentType = None,
reference: referenceType = None,
groupby: groupbyType = None,
**metric_kwargs: metric_kwargsType,
) -> xr.Dataset:
"""Verify the initialized ensemble against observations.
.. note::
This will automatically verify against all shared variables
between the initialized ensemble and observations/verification data.
Args:
metric: Metric to apply for verification, see `metrics <../metrics.html>`_
comparison: How to compare to the observations/verification data.
See `comparisons <../comparisons.html>`_.
dim: Dimension(s) to apply metric over. ``dim`` is passed
on to xskillscore.{metric} and includes xskillscore's ``member_dim``.
``dim`` should contain ``member`` when ``comparison`` is probabilistic
but should not contain ``member`` when ``comparison="e2o"``. Defaults to
``None`` meaning that all dimensions other than ``lead`` are reduced.
alignment: which inits or verification times should be aligned?
- ``"maximize"``: maximize the degrees of freedom by slicing
``initialized`` and ``verif`` to a common time frame at each lead.
- ``"same_inits"``: slice to a common ``init`` frame prior to computing
metric. This philosophy follows the thought that each lead should be
based on the same set of initializations.
- ``"same_verif"``: slice to a common/consistent verification time frame
prior to computing metric. This philosophy follows the thought that
each lead should be based on the same set of verification dates.
reference: Type of reference forecasts with which to verify against.
One or more of ``["uninitialized", "persistence", "climatology"]``.
Defaults to ``None`` meaning no reference.
groupby: group ``init`` before passing ``initialized`` to ``verify``.
**metric_kwargs: arguments passed to ``metric``.
Returns:
``initialized`` and ``reference`` forecast skill reduced by dimensions
``dim``
Example:
Continuous Ranked Probability Score (``crps``) comparing every member with the
verification (``m2o``) over the same verification time (``same_verifs``)
for all leads reducing dimensions ``init`` and ``member``:
>>> HindcastEnsemble.verify(
... metric="crps",
... comparison="m2o",
... alignment="same_verifs",
... dim=["init", "member"],
... )
<xarray.Dataset>
Dimensions: (lead: 10)
Coordinates:
* lead (lead) int32 1 2 3 4 5 6 7 8 9 10
skill <U11 'initialized'
Data variables:
SST (lead) float64 0.05208 0.05009 0.05489 ... 0.09261 0.1083 0.1176
Attributes:
prediction_skill_software: climpred https://climpred.readthedocs.io/
skill_calculated_by_function: HindcastEnsemble.verify()
number_of_initializations: 64
number_of_members: 10
alignment: same_verifs
metric: crps
comparison: m2o
dim: ['init', 'member']
reference: []
Root mean square error (``"rmse"``) comparing the ensemble mean with
the verification (``"e2o"``) over the same initializations
(``"same_inits"``) for all leads reducing dimension ``init`` while also
calculating reference skill for the ``"persistence"``, ``"climatology"``
and ``"uninitialized"`` forecast.
>>> HindcastEnsemble.verify(
... metric="rmse",
... comparison="e2o",
... alignment="same_inits",
... dim="init",
... reference=["persistence", "climatology", "uninitialized"],
... )
<xarray.Dataset>
Dimensions: (skill: 4, lead: 10)
Coordinates:
* lead (lead) int32 1 2 3 4 5 6 7 8 9 10
* skill (skill) <U13 'initialized' 'persistence' ... 'uninitialized'
Data variables:
SST (skill, lead) float64 0.08135 0.08254 0.086 ... 0.1012 0.1017
Attributes:
prediction_skill_software: climpred https://climpred.readthedocs.io/
skill_calculated_by_function: HindcastEnsemble.verify()
number_of_initializations: 64
number_of_members: 10
alignment: same_inits
metric: rmse
comparison: e2o
dim: init
reference: ['persistence', 'climatology', 'uninitiali...
The verification does not need to reduce a dimension. To obtain the skill for
each initialization, set ``dim=[]``.
>>> HindcastEnsemble.verify(
... metric="rmse",
... comparison="e2o",
... alignment="same_verifs",
... dim=[],
... )
<xarray.Dataset>
Dimensions: (init: 61, lead: 10)
Coordinates:
* init (init) object 1954-01-01 00:00:00 ... 2014-01-01 00:00:00
* lead (lead) int32 1 2 3 4 5 6 7 8 9 10
valid_time (lead, init) object 1955-01-01 00:00:00 ... 2024-01-01 00:00:00
skill <U11 'initialized'
Data variables:
SST (lead, init) float64 nan nan nan nan nan ... nan nan nan nan nan
Attributes:
prediction_skill_software: climpred https://climpred.readthedocs.io/
skill_calculated_by_function: HindcastEnsemble.verify()
number_of_members: 10
alignment: same_verifs
metric: rmse
comparison: e2o
dim: []
reference: []
"""
if groupby is not None:
return self._groupby(
"verify",
groupby,
reference=reference,
metric=metric,
comparison=comparison,
dim=dim,
alignment=alignment,
**metric_kwargs,
)
if dim is None:
viable_dims = list(self.get_initialized().isel(lead=0).dims) + [[]]
raise ValueError(
"Designate a dimension to reduce over when applying the "
f"metric. Got {dim}. Choose one or more of {viable_dims}"
)
if ("member" in dim) and comparison not in ["m2o", "m2r"]:
raise ValueError(
"Comparison must equal 'm2o' with dim='member'. "
f"Got comparison {comparison}."
)
reference = _check_valid_reference(reference)
def _verify(
hind,
verif,
hist,
reference,
metric,
comparison,
alignment,
dim,
**metric_kwargs,
):
"""Interior verify func to be passed to apply func."""
metric, comparison, dim = _get_metric_comparison_dim(
hind, metric, comparison, dim, kind=self.kind
)
forecast, verif = comparison.function(hind, verif, metric=metric)
if metric.name == "rps": # modify metric_kwargs for rps
metric_kwargs = broadcast_metric_kwargs_for_rps(
forecast, verif, metric_kwargs
)
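# Alignment operates on a common "time" axis, so treat "init" as
# "time" here; results are renamed back to "init" below.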
forecast = forecast.rename({"init": "time"})
inits, verif_dates = return_inits_and_verif_dates(
forecast,
verif,
alignment,
reference=reference,
hist=hist,
)
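# Apply the metric lead by lead, using the inits and verification
# dates selected by the chosen alignment.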
metric_over_leads = [
_apply_metric_at_given_lead(
verif,
verif_dates,
lead,
initialized=forecast,
hist=hist,
inits=inits,
# Ensure apply metric function returns skill and not reference
# results.
reference=None,
metric=metric,
comparison=comparison,
dim=dim,
**metric_kwargs,
)
for lead in forecast["lead"].data
]
result = xr.concat(metric_over_leads, dim="lead")
result["lead"] = forecast["lead"]
if reference is not None:
if "member" in verif.dims: # if broadcasted before
verif = verif.isel(member=0)
for r in reference:
metric_over_leads = [
_apply_metric_at_given_lead(
verif,
verif_dates,
lead,
initialized=forecast,
hist=hist,
inits=inits,
reference=r,
metric=metric,
comparison=comparison,
dim=dim,
**metric_kwargs,
)
for lead in forecast["lead"].data
]
ref = xr.concat(
metric_over_leads, dim="lead", **CONCAT_KWARGS
).assign_coords(lead=forecast.lead.values)
# fix to get no member dim for uninitialized e2o skill #477
if (
r == "uninitialized"
and comparison.name == "e2o"
and "member" in ref.dims
):
ref = ref.mean("member")
if "time" in ref.dims and "time" not in result.dims:
ref = ref.rename({"time": "init"})
# fix #735
if (
r == "uninitialized"
and "member" in dim
and "member" in ref.dims
):
ref = ref.mean("member")
result = xr.concat([result, ref], dim="skill", **CONCAT_KWARGS)
# rename back to 'init'
if "time" in result.dims:
result = result.swap_dims({"time": "init"})
if "time" in result.coords:
if "init" in result.coords["time"].dims:
result = result.rename({"time": "init"})
# Add dimension/coordinate for different references.
result = result.assign_coords(skill=["initialized"] + reference)
return result.squeeze()
has_dataset(
self._datasets["observations"], "observational", "verify a forecast"
)
if "uninitialized" in reference:
has_dataset(
self._datasets["uninitialized"],
"uninitialized",
"compute an uninitialized reference forecast",
)
hist = self._datasets["uninitialized"]
else:
hist = None
res = self._apply_climpred_function(
_verify,
init=True,
metric=metric,
comparison=comparison,
alignment=alignment,
dim=dim,
hist=hist,
reference=reference,
**metric_kwargs,
)
if self._temporally_smoothed:
res = _reset_temporal_axis(res, self._temporally_smoothed, dim="lead")
res["lead"].attrs = self.get_initialized().lead.attrs
if (
"valid_time" in res.coords
): # NaNs in valid_time only happen for same_verifs and dim=[]
if res.coords["valid_time"].isnull().any():
res.coords["valid_time"] = self.get_initialized().coords["valid_time"]
res = assign_attrs(
res,
self.get_initialized(),
function_name="HindcastEnsemble.verify()",
metric=metric,
comparison=comparison,
dim=dim,
alignment=alignment,
reference=reference,
**metric_kwargs,
)
return res
def bootstrap(
self,
metric: metricType = None,
comparison: comparisonType = None,
dim: dimType = None,
alignment: alignmentType = None,
reference: referenceType = None,
groupby: groupbyType = None,
iterations: Optional[int] = None,
sig: int = 95,
resample_dim: str = "member",
**metric_kwargs: metric_kwargsType,
) -> xr.Dataset:
"""Bootstrap with replacement according to :cite:t:`Goddard2013`.
Args:
metric: Metric to apply for verification, see `metrics <../metrics.html>`_
comparison: How to compare to the observations/verification data.
See `comparisons <../comparisons.html>`_.
dim: Dimension(s) to apply metric over. ``dim`` is passed
on to xskillscore.{metric} and includes xskillscore's ``member_dim``.
``dim`` should contain ``member`` when ``comparison`` is probabilistic
but should not contain ``member`` when ``comparison="e2o"``. Defaults to
``None`` meaning that all dimensions other than ``lead`` are reduced.
reference: Type of reference forecasts with which to verify against.
One or more of ``["uninitialized", "persistence", "climatology"]``.
Defaults to ``None`` meaning no reference.
If ``None`` or ``[]``, returns no p value.
alignment: which inits or verification times should be aligned?
- ""maximize: maximize the degrees of freedom by slicing ``init`` and
``verif`` to a common time frame at each lead.
- ``"same_inits"``: slice to a common ``init`` frame prior to computing
metric. This philosophy follows the thought that each lead should be
based on the same set of initializations.
- ``"same_verif"``: slice to a common/consistent verification time frame
prior to computing metric. This philosophy follows the thought that
each lead should be based on the same set of verification dates.
iterations: Number of resampling iterations for bootstrapping with
replacement. Recommended >= 500.
sig: Significance level in percent for deciding whether
uninitialized and persistence beat initialized skill.
resample_dim: dimension to resample from. Default: ``"member"``.
- ``"member"``: select a different set of members from hind
- ``"init"``: select a different set of initializations from hind
groupby: group ``init`` before passing ``initialized`` to ``bootstrap``.
**metric_kwargs: arguments passed to ``metric``.
Returns:
:py:class:`xarray.Dataset` with dimensions ``results`` (holding
``verify skill``, ``p``, ``low_ci`` and ``high_ci``) and ``skill``
(holding ``initialized``, ``persistence`` and/or ``uninitialized``):
* results="verify skill", skill="initialized":
mean initialized skill
* results="high_ci", skill="initialized":
high confidence interval boundary for initialized skill
* results="p", skill="uninitialized":
p value of the hypothesis that the
difference of skill between the initialized and
uninitialized simulations is smaller or equal to zero
based on bootstrapping with replacement.
* results="p", skill="persistence":
p value of the hypothesis that the
difference of skill between the initialized and persistence
simulations is smaller or equal to zero based on
bootstrapping with replacement.
References:
:cite:t:`Goddard2013`
Example:
Continuous Ranked Probability Score (``"crps"``) comparing every member
forecast to the verification (``"m2o"``) over the same initializations
(``"same_inits"``) for all leads reducing dimension ``member`` 50 times
after resampling ``member`` dimension with replacement. Note that dimension
``init`` remains.
Also calculate reference skill for the ``"persistence"``, ``"climatology"``
and ``"uninitialized"`` forecast and compare whether initialized skill is
better than reference skill: Returns verify skill, probability that
reference forecast performs better than initialized and the lower and
upper bound of the resample.
>>> HindcastEnsemble.bootstrap(
... metric="crps",
... comparison="m2o",
... dim="member",
... iterations=50,
... resample_dim="member",
... alignment="same_inits",
... reference=["persistence", "climatology", "uninitialized"],
... )
<xarray.Dataset>
Dimensions: (skill: 4, results: 4, lead: 10, init: 51)
Coordinates:
* init (init) object 1955-01-01 00:00:00 ... 2005-01-01 00:00:00
* lead (lead) int32 1 2 3 4 5 6 7 8 9 10
valid_time (lead, init) object 1956-01-01 00:00:00 ... 2015-01-01 00:00:00
* skill (skill) <U13 'initialized' 'persistence' ... 'uninitialized'
* results (results) <U12 'verify skill' 'p' 'low_ci' 'high_ci'
Data variables:
SST (skill, results, lead, init) float64 0.1202 0.01764 ... 0.07578
Attributes:
prediction_skill_software: climpred https://climpred.readthedocs.io/
skill_calculated_by_function: HindcastEnsemble.bootstrap()
number_of_members: 10
alignment: same_inits
metric: crps
comparison: m2o
dim: member
reference: ['persistence', 'climatology', 'uninitiali...
resample_dim: member
sig: 95
iterations: 50
confidence_interval_levels: 0.975-0.025
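The p value of a given reference versus ``"initialized"`` can then
be selected from the returned :py:class:`xarray.Dataset`; a sketch,
output skipped:
>>> skill = HindcastEnsemble.bootstrap(
... metric="crps",
... comparison="m2o",
... dim="member",
... iterations=50,
... alignment="same_inits",
... reference="uninitialized",
... ) # doctest: +SKIP
>>> skill.sel(skill="uninitialized", results="p") # doctest: +SKIP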
"""
return self._bootstrap(
metric=metric,
comparison=comparison,
dim=dim,
alignment=alignment,
reference=reference,
groupby=groupby,
iterations=iterations,
sig=sig,
resample_dim=resample_dim,
**metric_kwargs,
)
def remove_bias(
self,
alignment: alignmentType = None,
how: str = "additive_mean",
train_test_split: str = "unfair",
train_init: Optional[Union[xr.DataArray, slice]] = None,
train_time: Optional[Union[xr.DataArray, slice]] = None,
cv: Union[bool, str] = False,
**metric_kwargs: metric_kwargsType,
) -> "HindcastEnsemble":
"""Remove bias from :py:class:`.HindcastEnsemble`.
Bias is grouped by ``seasonality`` set via
:py:class:`~climpred.options.set_options`. When wrapping
:py:class:`xclim.sdba.adjustment.TrainAdjust` use ``group`` instead.
Args:
alignment: which inits or verification times should be aligned?
- ``""maximize``: maximize the degrees of freedom by slicing
``initialized`` and ``verif`` to a common time frame at each lead.
- ``"same_inits"``: slice to a common ``init`` frame prior to computing
metric. This philosophy follows the thought that each lead should be
based on the same set of initializations.
- ``"same_verif"``: slice to a common/consistent verification time frame
prior to computing metric. This philosophy follows the thought that
each lead should be based on the same set of verification dates.
how: what kind of bias removal to perform.
Defaults to ``"additive_mean"``. Select from:
- ``"additive_mean"``: correcting the mean forecast additively
- ``"multiplicative_mean"``: correcting the mean forecast
multiplicatively
- ``"multiplicative_std"``: correcting the standard deviation
multiplicatively
- ``"modified_quantile"``: `Reference <https://www.sciencedirect.com/science/article/abs/pii/S0034425716302000?via%3Dihub>`_
- ``"basic_quantile"``: `Reference <https://rmets.onlinelibrary.wiley.com/doi/pdf/10.1002/joc.2168>`_
- ``"gamma_mapping"``: `Reference <https://www.hydrol-earth-syst-sci.net/21/2649/2017/>`_
- ``"normal_mapping"``: `Reference <https://www.hydrol-earth-syst-sci.net/21/2649/2017/>`_
- :py:class:`xclim.sdba.adjustment.EmpiricalQuantileMapping`
- :py:class:`xclim.sdba.adjustment.DetrendedQuantileMapping`
- :py:class:`xclim.sdba.adjustment.PrincipalComponents`
- :py:class:`xclim.sdba.adjustment.QuantileDeltaMapping`
- :py:class:`xclim.sdba.adjustment.Scaling`
- :py:class:`xclim.sdba.adjustment.LOCI`
train_test_split: How to separate the train period, used to calculate
the bias, from the test period, to which the bias correction is
applied. For a detailed description, see
`Risbey et al. 2021 <http://www.nature.com/articles/s41467-021-23771-z>`_:
- ``"fair"```: no overlap between ``train`` and ``test`` (recommended).
Set either ``train_init`` or ``train_time``.
- ``"unfair"``: completely overlapping ``train`` and ``test`` (default).
- ``"unfair-cv"```: overlapping ``train`` and ``test`` except for
current `init`, which is
`left out <https://en.wikipedia.org/wiki/Cross-validation_(statistics)#Leave-one-out_cross-validation>`_
(set ``cv="LOO"``).
train_init: Define initializations for training
when ``alignment="same_inits/maximize"``.
train_time: Define time for training when ``alignment="same_verif"``.
cv: Only relevant when ``train_test_split="unfair-cv"``.
Defaults to ``False``.
- ``True/"LOO"``: Calculate bias by `leaving given initialization out <https://en.wikipedia.org/wiki/Cross-validation_(statistics)#Leave-one-out_cross-validation>`_
- ``False``: include all initializations in the calculation of bias,
which is much faster but yields similar skill with a large number of
initializations.
**metric_kwargs: passed to ``xclim.sdba`` (including ``group``)
or ``XBias_Correction``
Returns:
bias removed :py:class:`.HindcastEnsemble`.
Example:
Skill from raw model output without bias reduction:
>>> HindcastEnsemble.verify(
... metric="rmse", comparison="e2o", alignment="maximize", dim="init"
... )
<xarray.Dataset>
Dimensions: (lead: 10)
Coordinates:
* lead (lead) int32 1 2 3 4 5 6 7 8 9 10
skill <U11 'initialized'
Data variables:
SST (lead) float64 0.08359 0.08141 0.08362 ... 0.1361 0.1552 0.1664
Attributes:
prediction_skill_software: climpred https://climpred.readthedocs.io/
skill_calculated_by_function: HindcastEnsemble.verify()
number_of_initializations: 64
number_of_members: 10
alignment: maximize
metric: rmse
comparison: e2o
dim: init
reference: []
Note that this HindcastEnsemble is already bias reduced, therefore
``train_test_split="unfair"`` has hardly any effect. Use all
initializations to calculate bias and verify skill:
>>> HindcastEnsemble.remove_bias(
... alignment="maximize", how="additive_mean", test_train_split="unfair"
... ).verify(
... metric="rmse", comparison="e2o", alignment="maximize", dim="init"
... )
<xarray.Dataset>
Dimensions: (lead: 10)
Coordinates:
* lead (lead) int32 1 2 3 4 5 6 7 8 9 10
skill <U11 'initialized'
Data variables:
SST (lead) float64 0.08349 0.08039 0.07522 ... 0.07305 0.08107 0.08255
Attributes:
prediction_skill_software: climpred https://climpred.readthedocs.io/
skill_calculated_by_function: HindcastEnsemble.verify()
number_of_initializations: 64
number_of_members: 10
alignment: maximize
metric: rmse
comparison: e2o
dim: init
reference: []
Separate initializations 1954 - 1980 to calculate bias. Note that
this HindcastEnsemble is already bias reduced, therefore
``train_test_split="fair"`` worsens skill here. Generally,
``train_test_split="fair"`` is recommended to use for a fair
comparison against real-time forecasts.
>>> HindcastEnsemble.remove_bias(
... alignment="maximize",
... how="additive_mean",
... train_test_split="fair",
... train_init=slice("1954", "1980"),
... ).verify(
... metric="rmse", comparison="e2o", alignment="maximize", dim="init"
... )
<xarray.Dataset>
Dimensions: (lead: 10)
Coordinates:
* lead (lead) int32 1 2 3 4 5 6 7 8 9 10
skill <U11 'initialized'
Data variables:
SST (lead) float64 0.132 0.1085 0.08722 ... 0.08209 0.08969 0.08732
Attributes:
prediction_skill_software: climpred https://climpred.readthedocs.io/
skill_calculated_by_function: HindcastEnsemble.verify()
number_of_initializations: 37
number_of_members: 10
alignment: maximize
metric: rmse
comparison: e2o
dim: init
reference: []
Wrapping methods ``how`` from
`xclim <https://xclim.readthedocs.io/en/stable/sdba_api.html>`_ and
providing ``group`` for ``groupby``:
>>> HindcastEnsemble.remove_bias(
... alignment="same_init",
... group="init",
... how="DetrendedQuantileMapping",
... train_test_split="unfair",
... ).verify(
... metric="rmse", comparison="e2o", alignment="maximize", dim="init"
... )
<xarray.Dataset>
Dimensions: (lead: 10)
Coordinates:
* lead (lead) int32 1 2 3 4 5 6 7 8 9 10
skill <U11 'initialized'
Data variables:
SST (lead) float64 0.09841 0.09758 0.08238 ... 0.0771 0.08119 0.08322
Attributes:
prediction_skill_software: climpred https://climpred.readthedocs.io/
skill_calculated_by_function: HindcastEnsemble.verify()
number_of_initializations: 52
number_of_members: 10
alignment: maximize
metric: rmse
comparison: e2o
dim: init
reference: []
Wrapping methods ``how`` from `bias_correction <https://github.com/pankajkarman/bias_correction/blob/master/bias_correction.py>`_:
>>> HindcastEnsemble.remove_bias(
... alignment="same_init",
... how="modified_quantile",
... train_test_split="unfair",
... ).verify(
... metric="rmse", comparison="e2o", alignment="maximize", dim="init"
... )
<xarray.Dataset>
Dimensions: (lead: 10)
Coordinates:
* lead (lead) int32 1 2 3 4 5 6 7 8 9 10
skill <U11 'initialized'
Data variables:
SST (lead) float64 0.07628 0.08293 0.08169 ... 0.1577 0.1821 0.2087
Attributes:
prediction_skill_software: climpred https://climpred.readthedocs.io/
skill_calculated_by_function: HindcastEnsemble.verify()
number_of_initializations: 52
number_of_members: 10
alignment: maximize
metric: rmse
comparison: e2o
dim: init
reference: []
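Bias is grouped by the ``seasonality`` option; a sketch assuming
monthly grouping suits the data, output skipped:
>>> with climpred.set_options(seasonality="month"):
...     HindcastEnsemble.remove_bias(
...         alignment="same_init",
...         how="additive_mean",
...         train_test_split="unfair",
...     ) # doctest: +SKIP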
""" # noqa: E501
if train_test_split not in BIAS_CORRECTION_TRAIN_TEST_SPLIT_METHODS:
raise NotImplementedError(
f"train_test_split='{train_test_split}' not implemented. Please choose "
f"`train_test_split` from {BIAS_CORRECTION_TRAIN_TEST_SPLIT_METHODS}, "
"see Risbey et al. 2021 "
"http://www.nature.com/articles/s41467-021-23771-z for description and "
"https://github.com/pangeo-data/climpred/issues/648 for implementation "
"status."
)
alignment = _check_valid_alignment(alignment)
if train_test_split in ["fair"]:
if (
(train_init is None)
or not isinstance(train_init, (slice, xr.DataArray))
) and (alignment in ["same_inits", "maximize"]):
raise ValueError(
f'When alignment="{alignment}", please provide `train_init` as '
"`xr.DataArray`, e.g. "
'`HindcastEnsemble.coords["init"].sel(init=slice(start, end))` '
f"or slice, e.g. `slice(start, end)`, got `train_init={train_init}`."
)
if (
(train_time is None)
or not isinstance(train_time, (slice, xr.DataArray))
) and (alignment in ["same_verif"]):
raise ValueError(
f'When alignment="{alignment}", please provide `train_time` as '
"`xr.DataArray`, e.g. "
'`HindcastEnsemble.coords["time"].sel(time=slice(start, end))` '
f"or slice, e.g. `slice(start, end)`, got `train_time={train_time}`."
)
if isinstance(train_init, slice):
train_init = self.coords["init"].sel(init=train_init)
if isinstance(train_time, slice):
train_time = self.coords["time"].sel(time=train_time)
if how == "mean":
how = "additive_mean" # backwards compatibility
if how in ["additive_mean", "multiplicative_mean", "multiplicative_std"]:
func = gaussian_bias_removal
elif how in BIAS_CORRECTION_BIAS_CORRECTION_METHODS:
func = bias_correction
elif how in XCLIM_BIAS_CORRECTION_METHODS:
func = xclim_sdba
else:
raise NotImplementedError(
f"bias removal '{how}' is not implemented, please choose from "
f"{INTERNAL_BIAS_CORRECTION_METHODS + BIAS_CORRECTION_BIAS_CORRECTION_METHODS + XCLIM_BIAS_CORRECTION_METHODS}."  # noqa: E501
)
if train_test_split in ["unfair-cv"]:
if cv not in [True, "LOO"]:
raise ValueError(
f"Please provide cross-validation keyword `cv='LOO'` when using "
f"`train_test_split='unfair-cv'`, found `cv='{cv}'`."
)
else:
cv = "LOO" # backward compatibility
if cv not in CROSS_VALIDATE_METHODS:
raise NotImplementedError(
f"Cross validation method {cv} not implemented. "
f"Please choose cv from {CROSS_VALIDATE_METHODS}."
)
metric_kwargs["cv"] = cv
old_coords = self.get_initialized().coords
self = func(
self,
alignment=alignment,
how=how,
train_test_split=train_test_split,
train_init=train_init,
train_time=train_time,
**metric_kwargs,
)
# TODO: find better location to prevent valid_time with member dims
if "valid_time" in self._datasets["initialized"].coords:
if "member" in self._datasets["initialized"].coords["valid_time"].dims:
self._datasets["initialized"].coords["valid_time"] = (
self._datasets["initialized"]
.coords["valid_time"]
.isel(member=0, drop=True)
)
# to avoid ValueError: conflicting sizes for dimension 'time': length 1 on the
# data but length n on coordinate 'time'
if "valid_time" in self.get_initialized().coords:
if self.get_initialized().coords["valid_time"].isnull().any():
self._datasets["initialized"] = self._datasets["initialized"].dropna(
"init"
)
# avoid single-item lead dimension being dropped, see #771
for c in ["lead", "init"]:
if c not in self.get_initialized().dims and c in old_coords:
if old_coords[c].size == 1:
self._datasets["initialized"] = self._datasets[
"initialized"
].assign_coords({c: old_coords[c]})
if "valid_time" in self.get_initialized().coords:
if len(self.get_initialized().coords["valid_time"].dims) == 1 and set(
["init", "lead"]
).issubset(set(self.get_initialized().dims)):
self._datasets["initialized"].coords[
"valid_time"
] = add_time_from_init_lead(
self.get_initialized().drop_vars("valid_time")
).coords[
"valid_time"
]
return self