From 9595e3f88cab820569b51c9360f9a87252f4caf9 Mon Sep 17 00:00:00 2001 From: Tim Reichelt Date: Fri, 21 Nov 2025 10:49:58 +0000 Subject: [PATCH 1/3] Add IFS uncompressed data --- pyproject.toml | 6 ++++++ .../data_loader/datasets/ifs_uncompressed.py | 6 +++--- 2 files changed, 9 insertions(+), 3 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index c42f0f7..683be92 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -8,9 +8,15 @@ dependencies = [ "cf-xarray~=0.10.0", "cftime~=1.6.0", "dask>=2024.12.0,<2025.4", +<<<<<<< HEAD "earthkit-regrid~=0.5.0", "fsspec>=2024.10.0,<2025.4", "gribscan~=0.0.14", +======= + "earthkit-regrid>=0.5.0", + "fsspec>=2024.10.0,<2025.4", + "gribscan>=0.0.14", +>>>>>>> 868ed98 (Add IFS uncompressed data) "healpy~=1.18.0", # These versions need to be pinned to be compatible with the NextGEMS # catalog at https://data.nextgems-h2020.eu/online.yaml. diff --git a/src/climatebenchpress/data_loader/datasets/ifs_uncompressed.py b/src/climatebenchpress/data_loader/datasets/ifs_uncompressed.py index adece58..67565b4 100644 --- a/src/climatebenchpress/data_loader/datasets/ifs_uncompressed.py +++ b/src/climatebenchpress/data_loader/datasets/ifs_uncompressed.py @@ -136,9 +136,9 @@ def regrid_to_regular(ds, in_grid, out_grid): out_data[var].append(r) dx = out_grid["grid"][0] - assert ( - out_grid["grid"][0] == out_grid["grid"][1] - ), "Only grids with equal latitude and longitude spacing are supported." + assert out_grid["grid"][0] == out_grid["grid"][1], ( + "Only grids with equal latitude and longitude spacing are supported." + ) lats = np.linspace(90, -90, int(180 / dx) + 1) lons = np.linspace(0, 360 - dx, int(360 / dx)) coords = { From 57755337baa5366df0d4a6d656a3913ca13df457 Mon Sep 17 00:00:00 2001 From: Tim Reichelt Date: Mon, 24 Nov 2025 15:10:42 +0000 Subject: [PATCH 2/3] Add IFS humidity data set --- README.md | 1 + pyproject.toml | 6 -- .../data_loader/datasets/all.py | 1 + .../data_loader/datasets/ifs_humidity.py | 74 +++++++++++++++++++ .../data_loader/datasets/ifs_uncompressed.py | 50 ++++++++----- 5 files changed, 109 insertions(+), 23 deletions(-) create mode 100644 src/climatebenchpress/data_loader/datasets/ifs_humidity.py diff --git a/README.md b/README.md index 9eb9493..50613e1 100644 --- a/README.md +++ b/README.md @@ -19,6 +19,7 @@ To download all the data used for the benchmark run the following commands: uv run python -m climatebenchpress.data_loader.datasets.esa_biomass_cci uv run python -m climatebenchpress.data_loader.datasets.cams uv run python -m climatebenchpress.data_loader.datasets.ifs_uncompressed +uv run python -m climatebenchpress.data_loader.datasets.ifs_humidity uv run python -m climatebenchpress.data_loader.datasets.nextgems uv run python -m climatebenchpress.data_loader.datasets.cmip6.access_ta uv run python -m climatebenchpress.data_loader.datasets.cmip6.access_tos diff --git a/pyproject.toml b/pyproject.toml index 683be92..c42f0f7 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -8,15 +8,9 @@ dependencies = [ "cf-xarray~=0.10.0", "cftime~=1.6.0", "dask>=2024.12.0,<2025.4", -<<<<<<< HEAD "earthkit-regrid~=0.5.0", "fsspec>=2024.10.0,<2025.4", "gribscan~=0.0.14", -======= - "earthkit-regrid>=0.5.0", - "fsspec>=2024.10.0,<2025.4", - "gribscan>=0.0.14", ->>>>>>> 868ed98 (Add IFS uncompressed data) "healpy~=1.18.0", # These versions need to be pinned to be compatible with the NextGEMS # catalog at https://data.nextgems-h2020.eu/online.yaml. diff --git a/src/climatebenchpress/data_loader/datasets/all.py b/src/climatebenchpress/data_loader/datasets/all.py index 0f3700e..8b7361a 100644 --- a/src/climatebenchpress/data_loader/datasets/all.py +++ b/src/climatebenchpress/data_loader/datasets/all.py @@ -4,5 +4,6 @@ from .cmip6.all import * from .era5 import * from .esa_biomass_cci import * +from .ifs_humidity import * from .ifs_uncompressed import * from .nextgems import * diff --git a/src/climatebenchpress/data_loader/datasets/ifs_humidity.py b/src/climatebenchpress/data_loader/datasets/ifs_humidity.py new file mode 100644 index 0000000..91c2dfd --- /dev/null +++ b/src/climatebenchpress/data_loader/datasets/ifs_humidity.py @@ -0,0 +1,74 @@ +__all__ = ["IFSHumidityDataset"] + +import argparse +from pathlib import Path + +import xarray as xr + +from .. import ( + monitor, + open_downloaded_canonicalized_dataset, + open_downloaded_tiny_canonicalized_dataset, +) +from .abc import Dataset +from .ifs_uncompressed import load_hplp_data, regrid_to_regular + +BASE_URL = "https://object-store.os-api.cci1.ecmwf.int/esiwacebucket" + + +class IFSHumidityDataset(Dataset): + """Dataset for the humidity field of the uncompressed IFS data. + + Contains data from the [hplp](https://apps.ecmwf.int/ifs-experiments/rd/hplp/) + experiment from the Integrated Forecasting System (IFS) model. Crucially, + this dataset contains uncompressed 64-bit floating point data. + """ + + name = "ifs-humidity" + + @staticmethod + def download(download_path: Path, progress: bool = True): + donefile = download_path / "download.done" + if donefile.exists(): + return + + ds = load_hplp_data(leveltype="ml", gridtype="reduced_gg", step=0) + ds = ds[["q"]] + ds_regridded = regrid_to_regular( + ds, + in_grid={"grid": "O400"}, + out_grid={"grid": [0.25, 0.25]}, + ) + downloadfile = download_path / "ifs_humidity.zarr" + with monitor.progress_bar(progress): + ds_regridded.to_zarr( + downloadfile, mode="w", encoding=dict(), compute=False + ).compute() + + @staticmethod + def open(download_path: Path) -> xr.Dataset: + ds = xr.open_dataset(download_path / "ifs_humidity.zarr") + ds = ds.isel(time=slice(0, 1)).chunk(-1) + + # Needed to make the dataset CF-compliant. + ds.longitude.attrs["axis"] = "X" + ds.latitude.attrs["axis"] = "Y" + ds.level.attrs["axis"] = "Z" + ds.time.attrs["standard_name"] = "time" + return ds + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("--basepath", type=Path, default=Path()) + args = parser.parse_args() + + ds = open_downloaded_canonicalized_dataset( + IFSHumidityDataset, basepath=args.basepath + ) + open_downloaded_tiny_canonicalized_dataset( + IFSHumidityDataset, basepath=args.basepath + ) + + for v, da in ds.items(): + print(f"- {v}: {da.dims}") diff --git a/src/climatebenchpress/data_loader/datasets/ifs_uncompressed.py b/src/climatebenchpress/data_loader/datasets/ifs_uncompressed.py index 67565b4..3e0a7d8 100644 --- a/src/climatebenchpress/data_loader/datasets/ifs_uncompressed.py +++ b/src/climatebenchpress/data_loader/datasets/ifs_uncompressed.py @@ -126,19 +126,32 @@ def regrid_to_regular(ds, in_grid, out_grid): """ out_data = {var: [] for var in ds.data_vars} for var in ds.data_vars: + var_has_level = "level" in ds[var].dims for time in ds.time: - r = earthkit.regrid.interpolate( - ds[var].sel(time=time).values, - in_grid=in_grid, - out_grid=out_grid, - method="linear", - ) - out_data[var].append(r) + if var_has_level: + level_data = [] + for level in ds[var].level: + r = earthkit.regrid.interpolate( + ds[var].sel(time=time, level=level).values, + in_grid=in_grid, + out_grid=out_grid, + method="linear", + ) + level_data.append(r) + out_data[var].append(level_data) + else: + r = earthkit.regrid.interpolate( + ds[var].sel(time=time).values, + in_grid=in_grid, + out_grid=out_grid, + method="linear", + ) + out_data[var].append(r) dx = out_grid["grid"][0] - assert out_grid["grid"][0] == out_grid["grid"][1], ( - "Only grids with equal latitude and longitude spacing are supported." - ) + assert ( + out_grid["grid"][0] == out_grid["grid"][1] + ), "Only grids with equal latitude and longitude spacing are supported." lats = np.linspace(90, -90, int(180 / dx) + 1) lons = np.linspace(0, 360 - dx, int(360 / dx)) coords = { @@ -146,13 +159,16 @@ def regrid_to_regular(ds, in_grid, out_grid): "latitude": lats, "longitude": lons, } - out_ds = xr.Dataset( - { - var: (("time", "latitude", "longitude"), out_data[var]) - for var in ds.data_vars - }, - coords=coords, - ) + + data_vars = {} + for var in ds.data_vars: + if "level" in ds[var].dims: + coords["level"] = ds[var].level + data_vars[var] = (("time", "level", "latitude", "longitude"), out_data[var]) + else: + data_vars[var] = (("time", "latitude", "longitude"), out_data[var]) + + out_ds = xr.Dataset(data_vars, coords=coords) return out_ds From ab331ea9a85fbfd3421c080676168eed3f43c582 Mon Sep 17 00:00:00 2001 From: Tim Reichelt Date: Mon, 15 Dec 2025 15:01:06 +0000 Subject: [PATCH 3/3] Break down datasets into 2 chunks --- .../data_loader/datasets/ifs_humidity.py | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/src/climatebenchpress/data_loader/datasets/ifs_humidity.py b/src/climatebenchpress/data_loader/datasets/ifs_humidity.py index 91c2dfd..8f51d5c 100644 --- a/src/climatebenchpress/data_loader/datasets/ifs_humidity.py +++ b/src/climatebenchpress/data_loader/datasets/ifs_humidity.py @@ -13,8 +13,6 @@ from .abc import Dataset from .ifs_uncompressed import load_hplp_data, regrid_to_regular -BASE_URL = "https://object-store.os-api.cci1.ecmwf.int/esiwacebucket" - class IFSHumidityDataset(Dataset): """Dataset for the humidity field of the uncompressed IFS data. @@ -48,7 +46,15 @@ def download(download_path: Path, progress: bool = True): @staticmethod def open(download_path: Path) -> xr.Dataset: ds = xr.open_dataset(download_path / "ifs_humidity.zarr") - ds = ds.isel(time=slice(0, 1)).chunk(-1) + num_levels = ds["level"].size + ds = ds.isel(time=slice(0, 1)).chunk( + { + "latitude": -1, + "longitude": -1, + "time": -1, + "level": (num_levels // 2) + 1, + } + ) # Needed to make the dataset CF-compliant. ds.longitude.attrs["axis"] = "X"