#!conda install intake fsspec intake-xarray intake-thredds -c conda-forge -y
#!conda install eccodes cfgrib -c conda-forge -y
#!pip install climetlab --quiet
# linting
%load_ext nb_black
%load_ext lab_black

Calculate skill for NWP model GEFS for 6-hourly global forecasts

import intake
import fsspec  # caching downloads

# specify caching location, where to store files to with their original names
fsspec.config.conf["simplecache"] = {
    "cache_storage": "../my_caching_folder",
    "same_names": True,
}
import intake_xarray  # access files hosted via THREDDS
import climetlab  # download ERA5

import xarray as xr
import numpy as np

import climpred  # forecast verification
---------------------------------------------------------------------------
ModuleNotFoundError                       Traceback (most recent call last)
/tmp/ipykernel_2835/4189329660.py in <module>
----> 1 import intake
      2 import fsspec  # caching downloads
      3 
      4 # specify caching location, where to store files to with their original names
      5 fsspec.config.conf["simplecache"] = {

ModuleNotFoundError: No module named 'intake'

Find the data

GEFS output can be found on a THREDDS server: https://www.ncei.noaa.gov/thredds/catalog/model-gefs-003/202008/20200831/catalog.html

Here, we use intake-thredds to access the THREDDS catalog, intake-xarray to access the files and cache them with fsspec. However, you can also download the files manually, e.g. with wget:

  • https://intake.readthedocs.io/en/latest/

  • https://intake-xarray.readthedocs.io/en/latest/

  • https://intake-thredds.readthedocs.io/en/latest/

  • https://filesystem-spec.readthedocs.io/en/latest/

# all the metadata about GEFS
cat = intake.open_thredds_cat(
    "https://www.ncei.noaa.gov/thredds/catalog/model-gefs-003/202008/20200831/catalog.html"
)
cat
/Users/aaron.spring/anaconda3/envs/climpred-dev/lib/python3.9/site-packages/siphon/catalog.py:292: UserWarning: URL https://www.ncei.noaa.gov/thredds/catalog/model-gefs-003/202008/20200831/catalog.html returned HTML. Changing to: https://www.ncei.noaa.gov/thredds/catalog/model-gefs-003/202008/20200831/catalog.xml
  warnings.warn('URL {} returned HTML. Changing to: {}'.format(self.catalog_url,
No name found:
  args:
    url: https://www.ncei.noaa.gov/thredds/catalog/model-gefs-003/202008/20200831/catalog.html
  description: ''
  driver: intake_thredds.cat.ThreddsCatalog
  metadata:
    authority:
    - gov.noaa.ncdc
    contributor:
      Scientific Contact:
      - Zoltar Toth
      Technical Contact:
      - Yuejian Zhu
    creator:
    - {}
    dataType: GRID
    date:
    - type: created
      value: '2007-03-27'
    - type: issued
      value: '2007-10-01'
    documentation:
      abstract:
      - The Global Ensemble Forecast System (GEFS) is a weather forecast model made
        up of 21 separate forecasts, or ensemble members. The National Centers for
        Environmental Prediction (NCEP) started the GEFS to address the nature of
        uncertainty in weather observations, which are used to initialize weather
        forecast models. The proverbial butterfly flapping her wings can have a cascading
        effect leading to wind gusts thousands of miles away. This extreme example
        illustrates that tiny, unnoticeable differences between reality and what is
        actually measured can, over time, lead to noticeable differences between what
        a weather model forecast predicts and reality itself. The GEFS attempts to
        quantify the amount of uncertainty in a forecast by generating an ensemble
        of multiple forecasts, each minutely different, or perturbed, from the original
        observations. With global coverage, GEFS is produced four times a day with
        weather forecasts going out to 16 days and a 6 hour temporal resolution.
      comment:
      - Quantify the amount of uncertainty in mesoscale guidance by generating an
        ensemble of multiple forecasts, each minutely different, or perturbed, from
        the original observations.
      format:
      - GRIB 2
      funding:
      - DOC/NOAA/NWS/NCEP/EMC > Environmental Modeling Center, National Centers for
        Environmental Prediction, National Weather Service, NOAA, U.S. Department
        of Commerce
      rights:
      - Electronic downloads of the data are free, however fees apply for data certification
        and distribution of the data on physical media. Fees vary based on order specifications.
      summary:
      - The Global Ensemble Forecast System (GEFS) is a weather forecast model made
        up of 21 separate forecasts, or ensemble members. The National Centers for
        Environmental Prediction (NCEP) started the GEFS to address the nature of
        uncertainty in weather observations, which are used to initialize weather
        forecast models. The proverbial butterfly flapping her wings can have a cascading
        effect leading to wind gusts thousands of miles away. This extreme example
        illustrates that tiny, unnoticeable differences between reality and what is
        actually measured can, over time, lead to noticeable differences between what
        a weather model forecast predicts and reality itself. The GEFS attempts to
        quantify the amount of uncertainty in a forecast by generating an ensemble
        of multiple forecasts, each minutely different, or perturbed, from the original
        observations. With global coverage, GEFS is produced four times a day with
        weather forecasts going out to 16 days and a 6 hour temporal resolution. This
        dataset has 1.0 degree horizontal resolution.
      xlink:
      - href: http://www.nco.ncep.noaa.gov/pmb/docs/grib2/
        title: Format Specification
      - href: http://www.emc.ncep.noaa.gov/gmb/ens/ens_info.html
        title: Reference Papers
    external_metadata:
      ISO 19115-2:2009(E) - Collection Level Metadata: http://gis.ncdc.noaa.gov/geoportal/rest/document?id=%7B643C822F-67BC-4357-B280-C9A8821B91D3%7D
    geospatialCoverage:
    - {}
    inherited: true
    keyword:
    - name: "GOVERNMENT AGENCIES-U.S. FEDERAL AGENCIES, DOC, NOAA, DOC/NOAA/NESDIS/NCEI,\
        \ \n\t  National Centers for Environmental Information, NESDIS, U.S. Department\
        \ of Commerce, \n\t  EARTH SCIENCE SERVICES, MODELS, EARTH SCIENCE REANALYSES/ASSIMILATION\
        \ MODELS"
      vocabulary: GCMD
    project:
    - name: Global Ensemble Forecast System (GEFS)
    publisher:
    - {}
    serviceName: ALL
    timeCoverage:
    - {}

Opening without backend_kwargs raised DatasetBuildError. Need to specify variable by filter_by_keys for grib files.

DatasetBuildError: multiple values for unique key, try re-open the file with one of:

  • filter_by_keys={‘typeOfLevel’: ‘isobaricInhPa’}

  • filter_by_keys={‘typeOfLevel’: ‘surface’}

  • filter_by_keys={‘typeOfLevel’: ‘depthBelowLandLayer’}

  • filter_by_keys={‘typeOfLevel’: ‘heightAboveGround’}

  • filter_by_keys={‘typeOfLevel’: ‘atmosphereSingleLayer’}

  • filter_by_keys={‘typeOfLevel’: ‘atmosphere’}

  • filter_by_keys={‘typeOfLevel’: ‘nominalTop’}

  • filter_by_keys={‘typeOfLevel’: ‘pressureFromGroundLayer’}

  • filter_by_keys={‘typeOfLevel’: ‘meanSea’}

# how to open grib files: https://github.com/ecmwf/cfgrib/issues/170
intake_xarray.NetCDFSource(
    "simplecache::https://www.ncei.noaa.gov/thredds/fileServer/model-gefs-003/202008/20200831/gens-a_3_20200831_1800_000_20.grb2",
    xarray_kwargs=dict(
        engine="cfgrib",
        backend_kwargs=dict(
            filter_by_keys={"typeOfLevel": "heightAboveGround", "shortName": "2t"}
        ),
    ),
).to_dask().coords
Coordinates:
    number             int64 ...
    time               datetime64[ns] ...
    step               timedelta64[ns] ...
    heightAboveGround  float64 ...
  * latitude           (latitude) float64 90.0 89.0 88.0 ... -88.0 -89.0 -90.0
  * longitude          (longitude) float64 0.0 1.0 2.0 3.0 ... 357.0 358.0 359.0
    valid_time         datetime64[ns] ...

Get forecasts

inits_time = "0000"  # get forecasts started at 00:00
inits = ["20200829", "20200830", "20200831"]  # four initial dates
members = range(5)  # 5 members out of 20
leads = np.arange(0, 6 * 4 * 2 + 1, 6)  # 6h lead forecasts, 9 leads upto 48h
for init in inits:
    for lead in leads:
        for member in members:
            try:
                url = f"https://www.ncei.noaa.gov/thredds/fileServer/model-gefs-003/202008/{init}/gens-a_3_{init}_{inits_time}_{str(lead).zfill(3)}_{str(member).zfill(2)}.grb2"
                # print(f'download init = {init}, lead = {lead}, member = {member}')
                intake_xarray.NetCDFSource(
                    f"simplecache::{url}",
                    xarray_kwargs=dict(
                        engine="cfgrib",
                        backend_kwargs=dict(
                            filter_by_keys={
                                "typeOfLevel": "heightAboveGround",
                                "cfVarName": "t2m",
                            }
                        ),
                    ),
                ).to_dask()
            except Exception as e:
                print("failed", type(e).__name__, e)
init = xr.concat(
        [xr.concat(
            [xr.open_mfdataset(f'../my_caching_folder/gens-a_3_{init}_{inits_time}_{str(lead).zfill(3)}_*.grb2',
                               concat_dim='member', combine='nested',
                               engine='cfgrib', backend_kwargs=dict(filter_by_keys={'typeOfLevel': 'heightAboveGround', "cfVarName": "t2m"}))
             for lead in leads],
             dim='step') for init in inits],
        dim='time')
# save time when reproducing
init = init.compute()
init.to_netcdf('tmp_GEFS_a.nc')
init = xr.open_dataset("tmp_GEFS_a.nc")

Get observations

climetlab wraps cdsapi to download from the Copernicus Climate Data Store (CDS):

  • https://cds.climate.copernicus.eu/cdsapp#!/home

  • https://climetlab.readthedocs.io/en/latest/

  • https://github.com/ecmwf/cdsapi/

obs = (
    climetlab.load_source(
        "cds",
        "reanalysis-era5-single-levels",
        product_type="reanalysis",
        time=["00:00", "06:00", "12:00", "18:00"],
        grid=[1.0, 1.0],
        param="2t",
        date=[
            "2020-08-29",
            "2020-08-30",
            "2020-08-31",
            "2020-09-01",
            "2020-09-02",
            "2020-09-03",
        ],
    )
    .to_xarray()
    .squeeze(drop=True)
).drop_vars("valid_time")
# climetlab or cds enable logging.INFO
import logging

logger = logging.getLogger()
logger.setLevel(logging.ERROR)

Forecast skill verification

with HindcastEnsemble

alignment = "same_inits"
hindcast = climpred.HindcastEnsemble(init.drop_vars("valid_time")).add_observations(obs)
/Users/aaron.spring/Coding/climpred/climpred/checks.py:233: UserWarning: Did not find dimension "init", but renamed dimension time with CF-complying standard_name "forecast_reference_time" to init.
  warnings.warn(
/Users/aaron.spring/Coding/climpred/climpred/checks.py:233: UserWarning: Did not find dimension "lead", but renamed dimension step with CF-complying standard_name "forecast_period" to lead.
  warnings.warn(
# still experimental, help appreciated https://github.com/pangeo-data/climpred/issues/605
# with climpred.set_options(seasonality='month'):
#     hindcast = hindcast.remove_bias(alignment=alignment, cross_validate=False, how='mean')
skill = hindcast.isel(lead=range(1, 9)).verify(
    metric="crps", comparison="m2o", alignment=alignment, dim=["init", "member"]
)
# zooming into north america
skill.sel(longitude=slice(200, 320), latitude=slice(70, 15)).t2m.plot(
    col="lead", col_wrap=4, robust=True, aspect=2.5
)
<xarray.plot.facetgrid.FacetGrid at 0x147d72d00>
../../_images/NWP_GEFS_6h_forecasts_24_1.png