expose example datasets through xdggs.tutorial (#84)
* copy and adapt `xarray.tutorial`

* expose `xdggs.tutorial`

* add pooch to the required dependencies

* raise warnings from within `xdggs`

* tests for `tutorial.open_dataset`
keewis authored Oct 25, 2024
1 parent 4b91288 commit 8a8d585
Showing 4 changed files with 145 additions and 1 deletion.
8 changes: 7 additions & 1 deletion pyproject.toml
@@ -41,7 +41,8 @@ dependencies = [
     "lonboard>=0.9.3",
     "pyproj>=3.3",
     "matplotlib",
-    "arro3-core>=0.4.0"
+    "arro3-core>=0.4.0",
+    "pooch",
 ]

 [project.urls]
@@ -101,3 +102,8 @@ branch = true
 [tool.coverage.report]
 show_missing = true
 exclude_lines = ["pragma: no cover", "if TYPE_CHECKING"]
+
+[tool.pytest.ini_options]
+filterwarnings = [
+    "error:::xdggs.*",
+]
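
The new `filterwarnings` entry escalates any warning emitted from a module matching the regex `xdggs.*` into a test failure, which is what the commit message bullet "raise warnings from within `xdggs`" refers to. A minimal sketch of the roughly equivalent standard-library call, assuming pytest's `action:message:category:module:lineno` filter syntax:

import warnings

# roughly what "error:::xdggs.*" configures: escalate any warning raised from
# a module whose name matches the regex ``xdggs.*`` to an error
warnings.filterwarnings("error", module=r"xdggs.*")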
1 change: 1 addition & 0 deletions xdggs/__init__.py
@@ -1,5 +1,6 @@
 from importlib.metadata import PackageNotFoundError, version

+import xdggs.tutorial  # noqa: F401
 from xdggs.accessor import DGGSAccessor  # noqa: F401
 from xdggs.h3 import H3Index
 from xdggs.healpix import HealpixIndex
18 changes: 18 additions & 0 deletions xdggs/tests/test_tutorial.py
@@ -0,0 +1,18 @@
import pytest

from xdggs import tutorial


@pytest.mark.parametrize(
    ["ds_name", "grid_name"],
    (
        ("air_temperature", "h3"),
        ("air_temperature", "healpix"),
    ),
)
def test_download_from_github(tmp_path, ds_name, grid_name):
    cache_dir = tmp_path / tutorial._default_cache_dir_name
    ds = tutorial.open_dataset(ds_name, grid_name, cache_dir=cache_dir).load()

    assert cache_dir.is_dir() and len(list(cache_dir.iterdir())) == 1
    assert ds["air"].count() > 0
119 changes: 119 additions & 0 deletions xdggs/tutorial.py
@@ -0,0 +1,119 @@
from __future__ import annotations

import os
import pathlib
from typing import TYPE_CHECKING

import pooch
from xarray import open_dataset as _open_dataset

if TYPE_CHECKING:
    from xarray.backends.api import T_Engine

_default_cache_dir_name = "xdggs_tutorial_data"
base_url = "https://github.com/xdggs/xdggs-data"
version = "main"

external_urls = {}  # type: dict
file_formats = {
    "air_temperature": 4,
}


def _construct_cache_dir(path):
    if isinstance(path, os.PathLike):
        path = os.fspath(path)
    elif path is None:
        path = pooch.os_cache(_default_cache_dir_name)

    return path


def _check_netcdf_engine_installed(name):
    version = file_formats.get(name)
    if version == 3:
        try:
            import scipy  # noqa
        except ImportError:
            try:
                import netCDF4  # noqa
            except ImportError as err:
                raise ImportError(
                    f"opening tutorial dataset {name} requires either scipy or "
                    "netCDF4 to be installed."
                ) from err
    if version == 4:
        try:
            import h5netcdf  # noqa
        except ImportError:
            try:
                import netCDF4  # noqa
            except ImportError as err:
                raise ImportError(
                    f"opening tutorial dataset {name} requires either h5netcdf "
                    "or netCDF4 to be installed."
                ) from err


def open_dataset(
    name: str,
    grid_name: str,
    *,
    cache: bool = True,
    cache_dir: None | str | os.PathLike = None,
    engine: T_Engine = None,
    **kws,
):
    """
    Open a dataset from the online repository (requires internet).

    If a local copy is found then always use that to avoid network traffic.

    Available datasets:

    * ``"air_temperature"`` (H3, healpix): NCEP reanalysis subset.

    Parameters
    ----------
    name : str
        Name of the file containing the dataset.
        e.g. 'air_temperature'
    grid_name : str
        Name of the grid file.
    cache_dir : path-like, optional
        The directory in which to search for and write cached data.
    cache : bool, optional
        If True, then cache data locally for use on subsequent calls
    **kws : dict, optional
        Passed to xarray.open_dataset

    See Also
    --------
    xarray.tutorial.open_dataset
    """

    logger = pooch.get_logger()
    logger.setLevel("WARNING")

    cache_dir = _construct_cache_dir(cache_dir)
    if name in external_urls:
        url = external_urls[name]
    else:
        path = pathlib.Path(grid_name)
        if not path.suffix:
            # process the name
            default_extension = ".nc"
            if engine is None:
                _check_netcdf_engine_installed(grid_name)
            path = path.with_suffix(default_extension)

        url = f"{base_url}/raw/{version}/{name}/{path.name}"

    # retrieve the file
    filepath = pooch.retrieve(url=url, known_hash=None, path=cache_dir)
    ds = _open_dataset(filepath, engine=engine, **kws)
    if not cache:
        ds = ds.load()
        pathlib.Path(filepath).unlink()

    return ds
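
For context, a minimal usage sketch of the new helper (assumes internet access and the `xdggs-data` repository layout above; the dataset and grid names follow the test file):

import xdggs

# the eager ``import xdggs.tutorial`` in ``__init__.py`` makes the helper
# reachable from a plain ``import xdggs``; pooch downloads the file on first
# use and reuses the local cache afterwards
ds = xdggs.tutorial.open_dataset("air_temperature", "h3")
print(ds)

The returned dataset keeps the grid's cell ids as a coordinate, so downstream examples would typically decode them with `ds.pipe(xdggs.decode)`.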
