From 868ed98492370ed92ede7b534e3ee6c5af5bf054 Mon Sep 17 00:00:00 2001 From: Tim Reichelt Date: Fri, 21 Nov 2025 10:49:58 +0000 Subject: [PATCH 1/7] Add IFS uncompressed data --- README.md | 2 +- pyproject.toml | 4 +- .../data_loader/datasets/all.py | 1 + .../data_loader/datasets/ifs_uncompressed.py | 172 ++++++++++++++++++ 4 files changed, 177 insertions(+), 2 deletions(-) create mode 100644 src/climatebenchpress/data_loader/datasets/ifs_uncompressed.py diff --git a/README.md b/README.md index 207819e..9eb9493 100644 --- a/README.md +++ b/README.md @@ -18,7 +18,7 @@ To download all the data used for the benchmark run the following commands: ```bash uv run python -m climatebenchpress.data_loader.datasets.esa_biomass_cci uv run python -m climatebenchpress.data_loader.datasets.cams -uv run python -m climatebenchpress.data_loader.datasets.era5 +uv run python -m climatebenchpress.data_loader.datasets.ifs_uncompressed uv run python -m climatebenchpress.data_loader.datasets.nextgems uv run python -m climatebenchpress.data_loader.datasets.cmip6.access_ta uv run python -m climatebenchpress.data_loader.datasets.cmip6.access_tos diff --git a/pyproject.toml b/pyproject.toml index b5418dd..d6b9e78 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -8,7 +8,9 @@ dependencies = [ "cf-xarray~=0.10.0", "cftime~=1.6.0", "dask>=2024.12.0,<2025.4", + "earthkit-regrid>=0.5.0", "fsspec>=2024.10.0,<2025.4", + "gribscan>=0.0.14", "healpy~=1.18.0", # These versions need to be pinned to be compatible with the NextGEMS # catalog at https://data.nextgems-h2020.eu/online.yaml. 
@@ -52,5 +54,5 @@ where = ["src"] addopts = ["--import-mode=importlib"] [[tool.mypy.overrides]] -module = ["fsspec.*", "intake.*", "healpy.*"] +module = ["fsspec.*", "intake.*", "healpy.*", "earthkit.*"] follow_untyped_imports = true diff --git a/src/climatebenchpress/data_loader/datasets/all.py b/src/climatebenchpress/data_loader/datasets/all.py index 41d69d2..0f3700e 100644 --- a/src/climatebenchpress/data_loader/datasets/all.py +++ b/src/climatebenchpress/data_loader/datasets/all.py @@ -4,4 +4,5 @@ from .cmip6.all import * from .era5 import * from .esa_biomass_cci import * +from .ifs_uncompressed import * from .nextgems import * diff --git a/src/climatebenchpress/data_loader/datasets/ifs_uncompressed.py b/src/climatebenchpress/data_loader/datasets/ifs_uncompressed.py new file mode 100644 index 0000000..666fce5 --- /dev/null +++ b/src/climatebenchpress/data_loader/datasets/ifs_uncompressed.py @@ -0,0 +1,172 @@ +__all__ = ["IFSUncompressedDataset"] + +import argparse +from pathlib import Path + +import earthkit.regrid +import numpy as np +import requests +import xarray as xr + +from .. import ( + monitor, + open_downloaded_canonicalized_dataset, + open_downloaded_tiny_canonicalized_dataset, +) +from .abc import Dataset + +BASE_URL = "https://object-store.os-api.cci1.ecmwf.int/esiwacebucket" + + +class IFSUncompressedDataset(Dataset): + """Dataset for IFS uncompressed data. + + Contains data from the [hplp](https://apps.ecmwf.int/ifs-experiments/rd/hplp/) + experiment from the Integrated Forecasting System (IFS) model. Crucially, + this dataset contains uncompressed 64-bit floating point data. 
+ """ + + name = "ifs-uncompressed" + + @staticmethod + def download(download_path: Path, progress: bool = True): + ds = load_hplp_data(leveltype="sfc", gridtype="reduced_gg") + ds = ds[["msl", "10u", "10v"]] + ds_regridded = regrid_to_regular( + ds, + in_grid={"grid": "O400"}, + out_grid={"grid": [0.25, 0.25]}, + ) + downloadfile = download_path / "ifs_uncompressed.zarr" + with monitor.progress_bar(progress): + ds_regridded.to_zarr( + downloadfile, mode="w", encoding=dict(), compute=False + ).compute() + + @staticmethod + def open(download_path: Path) -> xr.Dataset: + ds = xr.open_dataset(download_path / "ifs_uncompressed.zarr") + + # Needed to make the dataset CF-compliant. + ds.longitude.attrs["axis"] = "X" + ds.latitude.attrs["axis"] = "Y" + ds.time.attrs["standard_name"] = "time" + return ds + + +def load_hplp_data(leveltype=None, gridtype=None, step=None, remap=False): + """Function taken from: https://github.com/climet-eu/compression-lab-notebooks/blob/main/04-example-datasets/01-hplp.ipynb.""" + if leveltype not in {"pl", "ml", "sfc", "wave"}: + raise ValueError( + f"Invalid leveltype: '{leveltype}'. Available leveltypes: pl, ml, sfc, wave" + ) + + if leveltype in {"ml", "pl"} and not gridtype: + raise ValueError( + f"Gridtype is required for leveltype '{leveltype}'. Available: reduced_gg, sh" + ) + + if remap and gridtype != "sh": + raise ValueError("Only 'sh' fields can be remapped.") + + if leveltype == "wave" and gridtype != "reduced_ll": + print("Warning: Wave model data are stored on a reduced_ll grid.") + + if leveltype == "sfc" and gridtype != "reduced_gg": + print("Warning: Surface level data are stored on a reduced_gg grid.") + + if step and not (leveltype == "ml" and gridtype == "reduced_gg"): + print( + "Warning: Specifying 'step' is unnecessary for this configuration and will be ignored." 
+ ) + + if leveltype in {"sfc", "wave"}: + url = f"{BASE_URL}/hplp/hplp_{leveltype}.grib" + elif leveltype == "ml" and gridtype == "reduced_gg": + if step is None: + raise ValueError( + "The ml reduced_gg data are split into two parts:\n" + " - Steps: 0, 12, 24, 36, 48, 60, 72, 84, 96, 108, 120 (2020-07-21T00:00:00 to 2020-07-26T00:00:00)\n" + " - Steps: 132, 144, 156, 168, 180, 192, 204, 216, 228, 240 (2020-07-26T12:00:00 to 2020-07-31T00:00:00)\n" + "Specify a step smaller than 120 for accessing the first part, \n" + "and a step greater or equal to 132 for accessing the second part." + ) + if step <= 120: + url = f"{BASE_URL}/hplp/hplp_{leveltype}_{gridtype}_levels_0_120.grib" + else: + url = f"{BASE_URL}/hplp/hplp_{leveltype}_{gridtype}_levels_132_240.grib" + else: + url = f"{BASE_URL}/hplp/hplp_{leveltype}_{gridtype}" + ( + "_O400.grib" if remap else ".grib" + ) + ref = requests.get(f"{url}.ref").json() + + print(f"Loading dataset {url}") + + return xr.open_dataset( + "reference://", + engine="zarr", + backend_kwargs=dict(storage_options=dict(fo=ref, asynchronous=False)), + consolidated=False, + ) + + +def regrid_to_regular(ds, in_grid, out_grid): + """Regrid dataset to a regular lat-lon grid. + + Parameters + ---------- + ds : xr.Dataset + The input dataset to regrid + in_grid : dict + The input grid specification for earthkit.regrid.interpolate + out_grid : dict + The output grid specification for earthkit.regrid.interpolate. Is assumed to be + a regular lat-lon grid with equal spacing in latitude and longitude, e.g. {"grid": [0.25, 0.25]}. + """ + out_data = {var: [] for var in ds.data_vars} + for var in ds.data_vars: + for time in ds.time: + r = earthkit.regrid.interpolate( + ds[var].sel(time=time).values, + in_grid=in_grid, + out_grid=out_grid, + method="linear", + ) + out_data[var].append(r) + + dx = out_grid["grid"][0] + assert ( + out_grid["grid"][0] == out_grid["grid"][1] + ), "Only grids with equal latitude and longitude spacing are supported." 
+ lats = np.linspace(90, -90, int(180 / dx) + 1) + lons = np.linspace(0, 360 - dx, int(360 / dx)) + coords = { + "time": ds.time, + "latitude": lats, + "longitude": lons, + } + out_ds = xr.Dataset( + { + var: (("time", "latitude", "longitude"), out_data[var]) + for var in ds.data_vars + }, + coords=coords, + ) + return out_ds + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("--basepath", type=Path, default=Path()) + args = parser.parse_args() + + ds = open_downloaded_canonicalized_dataset( + IFSUncompressedDataset, basepath=args.basepath + ) + open_downloaded_tiny_canonicalized_dataset( + IFSUncompressedDataset, basepath=args.basepath + ) + + for v, da in ds.items(): + print(f"- {v}: {da.dims}") From 6b42c146ccf2c832f507163d184e0248d4053edf Mon Sep 17 00:00:00 2001 From: Tim Reichelt Date: Mon, 24 Nov 2025 15:10:42 +0000 Subject: [PATCH 2/7] Add IFS humidity data set --- README.md | 1 + .../data_loader/datasets/ifs_humidity.py | 69 +++++++++++++++++++ .../data_loader/datasets/ifs_uncompressed.py | 50 +++++++++----- 3 files changed, 103 insertions(+), 17 deletions(-) create mode 100644 src/climatebenchpress/data_loader/datasets/ifs_humidity.py diff --git a/README.md b/README.md index 9eb9493..50613e1 100644 --- a/README.md +++ b/README.md @@ -19,6 +19,7 @@ To download all the data used for the benchmark run the following commands: uv run python -m climatebenchpress.data_loader.datasets.esa_biomass_cci uv run python -m climatebenchpress.data_loader.datasets.cams uv run python -m climatebenchpress.data_loader.datasets.ifs_uncompressed +uv run python -m climatebenchpress.data_loader.datasets.ifs_humidity uv run python -m climatebenchpress.data_loader.datasets.nextgems uv run python -m climatebenchpress.data_loader.datasets.cmip6.access_ta uv run python -m climatebenchpress.data_loader.datasets.cmip6.access_tos diff --git a/src/climatebenchpress/data_loader/datasets/ifs_humidity.py 
b/src/climatebenchpress/data_loader/datasets/ifs_humidity.py
new file mode 100644
index 0000000..2be47e6
--- /dev/null
+++ b/src/climatebenchpress/data_loader/datasets/ifs_humidity.py
@@ -0,0 +1,69 @@
+__all__ = ["IFSHumidityDataset"]
+
+import argparse
+from pathlib import Path
+
+import xarray as xr
+
+from .. import (
+    monitor,
+    open_downloaded_canonicalized_dataset,
+    open_downloaded_tiny_canonicalized_dataset,
+)
+from .abc import Dataset
+from .ifs_uncompressed import load_hplp_data, regrid_to_regular
+
+BASE_URL = "https://object-store.os-api.cci1.ecmwf.int/esiwacebucket"
+
+
+class IFSHumidityDataset(Dataset):
+    """Dataset for the humidity field of the uncompressed IFS data.
+
+    Contains data from the [hplp](https://apps.ecmwf.int/ifs-experiments/rd/hplp/)
+    experiment from the Integrated Forecasting System (IFS) model. Crucially,
+    this dataset contains uncompressed 64-bit floating point data.
+    """
+
+    name = "ifs-humidity"
+
+    @staticmethod
+    def download(download_path: Path, progress: bool = True):
+        ds = load_hplp_data(leveltype="ml", gridtype="reduced_gg", step=0)
+        ds = ds[["q"]]
+        ds_regridded = regrid_to_regular(
+            ds,
+            in_grid={"grid": "O400"},
+            out_grid={"grid": [0.25, 0.25]},
+        )
+        downloadfile = download_path / "ifs_humidity.zarr"
+        with monitor.progress_bar(progress):
+            ds_regridded.to_zarr(
+                downloadfile, mode="w", encoding=dict(), compute=False
+            ).compute()
+
+    @staticmethod
+    def open(download_path: Path) -> xr.Dataset:
+        ds = xr.open_dataset(download_path / "ifs_humidity.zarr")
+
+        # Needed to make the dataset CF-compliant.
+ ds.longitude.attrs["axis"] = "X" + ds.latitude.attrs["axis"] = "Y" + ds.level.attrs["axis"] = "Z" + ds.time.attrs["standard_name"] = "time" + return ds + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("--basepath", type=Path, default=Path()) + args = parser.parse_args() + + ds = open_downloaded_canonicalized_dataset( + IFSHumidityDataset, basepath=args.basepath + ) + open_downloaded_tiny_canonicalized_dataset( + IFSHumidityDataset, basepath=args.basepath + ) + + for v, da in ds.items(): + print(f"- {v}: {da.dims}") diff --git a/src/climatebenchpress/data_loader/datasets/ifs_uncompressed.py b/src/climatebenchpress/data_loader/datasets/ifs_uncompressed.py index 666fce5..08daa07 100644 --- a/src/climatebenchpress/data_loader/datasets/ifs_uncompressed.py +++ b/src/climatebenchpress/data_loader/datasets/ifs_uncompressed.py @@ -126,19 +126,32 @@ def regrid_to_regular(ds, in_grid, out_grid): """ out_data = {var: [] for var in ds.data_vars} for var in ds.data_vars: + var_has_level = "level" in ds[var].dims for time in ds.time: - r = earthkit.regrid.interpolate( - ds[var].sel(time=time).values, - in_grid=in_grid, - out_grid=out_grid, - method="linear", - ) - out_data[var].append(r) + if var_has_level: + level_data = [] + for level in ds[var].level: + r = earthkit.regrid.interpolate( + ds[var].sel(time=time, level=level).values, + in_grid=in_grid, + out_grid=out_grid, + method="linear", + ) + level_data.append(r) + out_data[var].append(level_data) + else: + r = earthkit.regrid.interpolate( + ds[var].sel(time=time).values, + in_grid=in_grid, + out_grid=out_grid, + method="linear", + ) + out_data[var].append(r) dx = out_grid["grid"][0] - assert ( - out_grid["grid"][0] == out_grid["grid"][1] - ), "Only grids with equal latitude and longitude spacing are supported." + assert out_grid["grid"][0] == out_grid["grid"][1], ( + "Only grids with equal latitude and longitude spacing are supported." 
+ ) lats = np.linspace(90, -90, int(180 / dx) + 1) lons = np.linspace(0, 360 - dx, int(360 / dx)) coords = { @@ -146,13 +159,16 @@ def regrid_to_regular(ds, in_grid, out_grid): "latitude": lats, "longitude": lons, } - out_ds = xr.Dataset( - { - var: (("time", "latitude", "longitude"), out_data[var]) - for var in ds.data_vars - }, - coords=coords, - ) + + data_vars = {} + for var in ds.data_vars: + if "level" in ds[var].dims: + coords["level"] = ds[var].level + data_vars[var] = (("time", "level", "latitude", "longitude"), out_data[var]) + else: + data_vars[var] = (("time", "latitude", "longitude"), out_data[var]) + + out_ds = xr.Dataset(data_vars, coords=coords) return out_ds From 420630f491b37e757f27d066ddd3b173368e757e Mon Sep 17 00:00:00 2001 From: Tim Reichelt Date: Tue, 25 Nov 2025 12:01:29 +0000 Subject: [PATCH 3/7] Add ecmwf-api-client --- pyproject.toml | 1 + 1 file changed, 1 insertion(+) diff --git a/pyproject.toml b/pyproject.toml index d6b9e78..fe4212e 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -9,6 +9,7 @@ dependencies = [ "cftime~=1.6.0", "dask>=2024.12.0,<2025.4", "earthkit-regrid>=0.5.0", + "ecmwf-api-client>=1.6.5", "fsspec>=2024.10.0,<2025.4", "gribscan>=0.0.14", "healpy~=1.18.0", From 0df4b6df078411b9eaa87c8b5bc0cba86b4b16f2 Mon Sep 17 00:00:00 2001 From: Tim Reichelt Date: Wed, 26 Nov 2025 10:27:56 +0000 Subject: [PATCH 4/7] Change CAMS source dataset --- .../data_loader/datasets/cams.py | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/src/climatebenchpress/data_loader/datasets/cams.py b/src/climatebenchpress/data_loader/datasets/cams.py index a176c8a..26346ac 100644 --- a/src/climatebenchpress/data_loader/datasets/cams.py +++ b/src/climatebenchpress/data_loader/datasets/cams.py @@ -13,7 +13,9 @@ from ..download import _download_netcdf from .abc import Dataset -NO2_FILE = "https://object-store.os-api.cci1.ecmwf.int/esiwacebucket/CAMS/eac4-plev-no2-2023.nc" +NO2_FILE = ( + 
"https://object-store.os-api.cci1.ecmwf.int/esiwacebucket/CAMS_hej6/cams_no2.nc" +) NUM_RETRIES = 3 @@ -41,15 +43,18 @@ def download(download_path: Path, progress: bool = True): @staticmethod def open(download_path: Path) -> xr.Dataset: - ds = xr.open_dataset(download_path / Path(NO2_FILE).name) + ds = xr.open_dataset(download_path / Path(NO2_FILE).name).chunk(-1) - # Restrict data to a single day. - # The specific day is arbitrary. - ds = ds.sel(valid_time=slice("2023-06-15", "2023-06-15")).chunk(-1) + # valid_time contains actual dates, whereas step is the seconds (in simulated time) + # since the model as been initialised. + ds = ds.assign_coords(valid_time=("step", ds.valid_time.data)) + ds = ds.swap_dims({"step": "valid_time"}) + ds = ds.reset_coords("step", drop=True) # Needed to make the dataset CF-compliant. ds.longitude.attrs["axis"] = "X" ds.latitude.attrs["axis"] = "Y" - ds.pressure_level.attrs["axis"] = "Z" + ds.hybrid.attrs["axis"] = "Z" + ds.valid_time.attrs["axis"] = "T" return ds From 29166f2d8915be3197be03de3cec09d22007124d Mon Sep 17 00:00:00 2001 From: Tim Reichelt Date: Mon, 15 Dec 2025 15:26:03 +0000 Subject: [PATCH 5/7] Formatting --- .../data_loader/datasets/ifs_uncompressed.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/climatebenchpress/data_loader/datasets/ifs_uncompressed.py b/src/climatebenchpress/data_loader/datasets/ifs_uncompressed.py index 8424f03..3e0a7d8 100644 --- a/src/climatebenchpress/data_loader/datasets/ifs_uncompressed.py +++ b/src/climatebenchpress/data_loader/datasets/ifs_uncompressed.py @@ -149,9 +149,9 @@ def regrid_to_regular(ds, in_grid, out_grid): out_data[var].append(r) dx = out_grid["grid"][0] - assert out_grid["grid"][0] == out_grid["grid"][1], ( - "Only grids with equal latitude and longitude spacing are supported." - ) + assert ( + out_grid["grid"][0] == out_grid["grid"][1] + ), "Only grids with equal latitude and longitude spacing are supported." 
lats = np.linspace(90, -90, int(180 / dx) + 1)
     lons = np.linspace(0, 360 - dx, int(360 / dx))
     coords = {
From 008a186157f113ccd02f339d23b43314b1e65390 Mon Sep 17 00:00:00 2001
From: Tim Reichelt
Date: Mon, 15 Dec 2025 15:28:28 +0000
Subject: [PATCH 6/7] Remove unused dependency

---
 pyproject.toml | 1 -
 1 file changed, 1 deletion(-)

diff --git a/pyproject.toml b/pyproject.toml
index b9e1523..c42f0f7 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -9,7 +9,6 @@ dependencies = [
     "cftime~=1.6.0",
     "dask>=2024.12.0,<2025.4",
     "earthkit-regrid>=0.5.0",
-    "ecmwf-api-client>=1.6.5",
     "fsspec>=2024.10.0,<2025.4",
     "gribscan>=0.0.14",
     "healpy~=1.18.0",
From 78d1e7b1b69baa25805c0310158b0b57dee7ca1d Mon Sep 17 00:00:00 2001
From: Tim Reichelt
Date: Mon, 15 Dec 2025 15:33:03 +0000
Subject: [PATCH 7/7] Update docstring

---
 src/climatebenchpress/data_loader/datasets/cams.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/climatebenchpress/data_loader/datasets/cams.py b/src/climatebenchpress/data_loader/datasets/cams.py
index 26346ac..5f84631 100644
--- a/src/climatebenchpress/data_loader/datasets/cams.py
+++ b/src/climatebenchpress/data_loader/datasets/cams.py
@@ -22,9 +22,9 @@
 class CamsNitrogenDioxideDataset(Dataset):
     """Dataset for CAMS Nitrogen Dioxide data.
 
-    The dataset comes from the
-    [Copernicus Atmosphere Monitoring Service (CAMS)](https://atmosphere.copernicus.eu/).
-    This particular class downloads Nitrogen Dioxide reanalysis data.
+    Uses data from the CAMS model published at [hej6](https://apps.ecmwf.int/ifs-experiments/rd/hej6/).
+    This is the output of a CAMS model run that is stored as single-precision
+    floating point data without any linear packing.
     """
 
     name = "cames-nitrogen-dioxide"