From 58d15a6cee050da865f41416729bee418cda5f79 Mon Sep 17 00:00:00 2001 From: Jeremy McCormick Date: Wed, 5 Feb 2025 18:36:33 -0600 Subject: [PATCH 1/4] Add initial version --- .github/workflows/build.yaml | 49 +++++++++ .github/workflows/do_not_merge.yaml | 41 ++++++++ .github/workflows/formatting.yaml | 11 ++ .github/workflows/lint.yaml | 16 +++ .github/workflows/mypy.yaml | 11 ++ .github/workflows/rebase_checker.yaml | 7 ++ .github/workflows/yamllint.yaml | 11 ++ .gitignore | 3 + .pre-commit-config.yaml | 28 ++++++ .yamllint.yaml | 20 ++++ pyproject.toml | 139 ++++++++++++++++++++++++++ python/lsst/__init__.py | 0 python/lsst/sdm_tools/__init__.py | 22 ++++ python/lsst/sdm_tools/cli.py | 36 +++++++ python/lsst/sdm_tools/mypy.ini | 23 +++++ requirements.txt | 2 + setup.cfg | 5 + types.txt | 3 + 18 files changed, 427 insertions(+) create mode 100644 .github/workflows/build.yaml create mode 100644 .github/workflows/do_not_merge.yaml create mode 100644 .github/workflows/formatting.yaml create mode 100644 .github/workflows/lint.yaml create mode 100644 .github/workflows/mypy.yaml create mode 100644 .github/workflows/rebase_checker.yaml create mode 100644 .github/workflows/yamllint.yaml create mode 100644 .pre-commit-config.yaml create mode 100644 .yamllint.yaml create mode 100644 pyproject.toml create mode 100644 python/lsst/__init__.py create mode 100644 python/lsst/sdm_tools/__init__.py create mode 100644 python/lsst/sdm_tools/cli.py create mode 100644 python/lsst/sdm_tools/mypy.ini create mode 100644 requirements.txt create mode 100644 setup.cfg create mode 100644 types.txt diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml new file mode 100644 index 0000000..5c70977 --- /dev/null +++ b/.github/workflows/build.yaml @@ -0,0 +1,49 @@ +name: Build and test + +on: + push: + branches: + - main + tags: + - "*" + pull_request: + +jobs: + build: + runs-on: ubuntu-latest + strategy: + matrix: + python-version: ["3.11", "3.12", "3.13"] + + steps: + - 
uses: actions/checkout@v4 + with: + # Need to clone everything for the git tags. + fetch-depth: 0 + + - name: Set up Python + uses: actions/setup-python@v5 + with: + python-version: ${{ matrix.python-version }} + cache: "pip" + + - name: Install prereqs for setuptools + run: pip install wheel + + - name: Install dependencies + run: | + python -m pip install --upgrade pip uv + uv pip install --system -r requirements.txt + + - name: Install pytest packages + run: uv pip install --system pytest pytest-xdist pytest-cov + + - name: List installed packages + run: uv pip list + + - name: Build and install + run: uv pip install --system --no-deps -e . + + - name: Run tests + run: | + sdm-tools --help diff --git a/.github/workflows/do_not_merge.yaml b/.github/workflows/do_not_merge.yaml new file mode 100644 index 0000000..e3fae1a --- /dev/null +++ b/.github/workflows/do_not_merge.yaml @@ -0,0 +1,41 @@ +name: "Check commits can be merged" +on: + push: + branches: + - main + pull_request: + +jobs: + do-not-merge-checker: + runs-on: ubuntu-latest + + steps: + - name: Check that there are no commits that should not be merged + uses: gsactions/commit-message-checker@v2 + with: + excludeDescription: "true" # optional: this excludes the description body of a pull request + excludeTitle: "true" # optional: this excludes the title of a pull request + checkAllCommitMessages: "true" # optional: this checks all commits associated with a pull request + accessToken: ${{ secrets.GITHUB_TOKEN }} # github access token is only required if checkAllCommitMessages is true + # Check for message indicating that there is a commit that should + # not be merged. + pattern: ^(?!DO NOT MERGE) + flags: "i" + error: | + "This step failed because there is a commit containing the text + 'DO NOT MERGE'. Remove this commit from the branch before merging + or change the commit summary." 
+ + - uses: actions/checkout@v4 + + - name: Check requirements.txt for branches + shell: bash + run: | + FILE="requirements.txt" + MATCH=tickets/DM- + if grep -q $MATCH $FILE + then + echo "Ticket branches found in $FILE:" + grep -n $MATCH $FILE + exit 1 + fi diff --git a/.github/workflows/formatting.yaml b/.github/workflows/formatting.yaml new file mode 100644 index 0000000..27f34a6 --- /dev/null +++ b/.github/workflows/formatting.yaml @@ -0,0 +1,11 @@ +name: Check Python formatting + +on: + push: + branches: + - main + pull_request: + +jobs: + call-workflow: + uses: lsst/rubin_workflows/.github/workflows/formatting.yaml@main diff --git a/.github/workflows/lint.yaml b/.github/workflows/lint.yaml new file mode 100644 index 0000000..3f5e9b0 --- /dev/null +++ b/.github/workflows/lint.yaml @@ -0,0 +1,16 @@ +name: Lint Python code + +on: + push: + branches: + - main + pull_request: + +jobs: + call-workflow: + uses: lsst/rubin_workflows/.github/workflows/lint.yaml@main + ruff: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + - uses: chartboost/ruff-action@v1 diff --git a/.github/workflows/mypy.yaml b/.github/workflows/mypy.yaml new file mode 100644 index 0000000..0849ea4 --- /dev/null +++ b/.github/workflows/mypy.yaml @@ -0,0 +1,11 @@ +name: Run mypy + +on: + push: + branches: + - main + pull_request: + +jobs: + call-workflow: + uses: lsst/rubin_workflows/.github/workflows/mypy.yaml@main diff --git a/.github/workflows/rebase_checker.yaml b/.github/workflows/rebase_checker.yaml new file mode 100644 index 0000000..65516d9 --- /dev/null +++ b/.github/workflows/rebase_checker.yaml @@ -0,0 +1,7 @@ +name: Check that 'main' is not merged into the development branch + +on: pull_request + +jobs: + call-workflow: + uses: lsst/rubin_workflows/.github/workflows/rebase_checker.yaml@main diff --git a/.github/workflows/yamllint.yaml b/.github/workflows/yamllint.yaml new file mode 100644 index 0000000..d0cdd57 --- /dev/null +++ b/.github/workflows/yamllint.yaml @@ -0,0 
+1,11 @@ +name: Lint YAML files + +on: + push: + branches: + - main + pull_request: + +jobs: + call-workflow: + uses: lsst/rubin_workflows/.github/workflows/yamllint.yaml@main diff --git a/.gitignore b/.gitignore index 15201ac..6abe941 100644 --- a/.gitignore +++ b/.gitignore @@ -169,3 +169,6 @@ cython_debug/ # PyPI configuration file .pypirc + +# lsst version file +python/lsst/sdm_tools/version.py diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml new file mode 100644 index 0000000..808a440 --- /dev/null +++ b/.pre-commit-config.yaml @@ -0,0 +1,28 @@ +repos: + - repo: https://github.com/pre-commit/pre-commit-hooks + rev: v4.6.0 + hooks: + - id: check-yaml + args: + - "--unsafe" + - id: end-of-file-fixer + - id: trailing-whitespace + - repo: https://github.com/psf/black-pre-commit-mirror + rev: 24.4.2 + hooks: + - id: black + # It is recommended to specify the latest version of Python + # supported by your project here, or alternatively use + # pre-commit's default_language_version, see + # https://pre-commit.com/#top_level-default_language_version + language_version: python3.12 + - repo: https://github.com/pycqa/isort + rev: 5.13.2 + hooks: + - id: isort + name: isort (python) + - repo: https://github.com/astral-sh/ruff-pre-commit + # Ruff version. 
+ rev: v0.4.8 + hooks: + - id: ruff diff --git a/.yamllint.yaml b/.yamllint.yaml new file mode 100644 index 0000000..10c9e71 --- /dev/null +++ b/.yamllint.yaml @@ -0,0 +1,20 @@ +extends: default + +rules: + document-start: + present: false + ignore: | + /tests/ + /examples/ + line-length: + max: 132 + allow-non-breakable-words: true + allow-non-breakable-inline-mappings: true + ignore: | + /.github/workflows/lint.yaml + truthy: + # "on" as a key in workflows confuses things + ignore: | + /.github/workflows/ + indentation: + indent-sequences: consistent diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 0000000..1165e7a --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,139 @@ +[build-system] +requires = ["setuptools", "lsst-versions >= 1.3.0"] +build-backend = "setuptools.build_meta" + +[project] +name = "sdm_tools" +description = "Tools for working with the Science Data Model (SDM) schemas of the Rubin Observatory" +license = {text = "GNU General Public License v3 or later (GPLv3+)"} +readme = "README.md" +authors = [ + {name="Rubin Observatory Data Management", email="dm-admin@lists.lsst.org"}, +] +classifiers = [ + "Intended Audience :: Science/Research", + "License :: OSI Approved :: GNU General Public License v3 or later (GPLv3+)", + "Operating System :: OS Independent", + "Programming Language :: Python :: 3", + "Programming Language :: Python :: 3.11", + "Programming Language :: Python :: 3.12", + "Programming Language :: Python :: 3.13", + "Topic :: Scientific/Engineering :: Astronomy" +] +keywords = ["lsst"] +dependencies = [ + "lsst-felis", + "click" +] +requires-python = ">=3.11.0" +dynamic = ["version"] + +[project.urls] +Source = "https://github.com/lsst-dm/sdm_tools" + +[project.optional-dependencies] +test = [ + "pytest >= 3.2" +] + +[tool.setuptools.packages.find] +where = ["python"] + +[tool.setuptools] +zip-safe = true +license-files = ["COPYRIGHT", "LICENSE"] + +[tool.setuptools.package-data] +"sdm_tools" = ["py.typed"] + 
+[tool.setuptools.dynamic] +version = { attr = "lsst_versions.get_lsst_version" } + +[project.scripts] +sdm-tools = "lsst.sdm_tools.cli:cli" + +[tool.black] +line-length = 110 +target-version = ["py311"] + +[tool.isort] +profile = "black" +line_length = 110 + +[tool.lsst_versions] +write_to = "python/lsst/sdm_tools/version.py" + +[tool.ruff] +line-length = 110 +target-version = "py311" +exclude = [ + "__init__.py", + "lex.py", + "yacc.py", +] + +[tool.ruff.lint] +ignore = [ + "D100", + "D102", + "D104", + "D105", + "D107", + "D200", + "D203", + "D205", + "D213", + "D400", + "D413", + "N802", + "N803", + "N806", + "N812", + "N815", + "N816", + "N999", + "UP007", # Allow UNION in type annotation +] +select = [ + "E", # pycodestyle + "F", # pyflakes + "N", # pep8-naming + "W", # pycodestyle + "D", # pydocstyle + "UP", # pyupgrade +] + +[tool.pydocstyle] +convention = "numpy" +# Our coding style does not require docstrings for magic methods (D105) +# Our docstyle documents __init__ at the class level (D107) +# We allow methods to inherit docstrings and this is not compatible with D102. +# Docstring at the very first line is not required +# D200, D205 and D400 all complain if the first sentence of the docstring does +# not fit on one line. We do not require docstrings in __init__ files (D104). +add-ignore = ["D107", "D105", "D102", "D100", "D200", "D205", "D400", "D104"] + +[tool.ruff.lint.pycodestyle] +max-doc-length = 79 + +[tool.ruff.lint.pydocstyle] +convention = "numpy" + +[tool.numpydoc_validation] +checks = [ + "all", # All except the rules listed below. + "SA01", # See Also section. + "SA04", # We don't use descriptions with See Also. + "EX01", # Example section. + "SS06", # Summary can go into second line. + "GL01", # Summary text can start on same line as """ + "GL08", # Do not require docstring. + "ES01", # No extended summary required. + "SS05", # pydocstyle is better at finding infinitive verb. + "PR04", # Sphinx does not require parameter type. 
+] +exclude = [ + "^test_.*", # Do not test docstrings in test code. + '^__init__$', + '\._[a-zA-Z_]+$', # Private methods. +] diff --git a/python/lsst/__init__.py b/python/lsst/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/python/lsst/sdm_tools/__init__.py b/python/lsst/sdm_tools/__init__.py new file mode 100644 index 0000000..1346885 --- /dev/null +++ b/python/lsst/sdm_tools/__init__.py @@ -0,0 +1,22 @@ +# This file is part of sdm-tools. +# +# Developed for the LSST Data Management System. +# This product includes software developed by the LSST Project +# (https://www.lsst.org). +# See the COPYRIGHT file at the top-level directory of this distribution +# for details of code ownership. +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . + +from .version import * diff --git a/python/lsst/sdm_tools/cli.py b/python/lsst/sdm_tools/cli.py new file mode 100644 index 0000000..1b8bb0f --- /dev/null +++ b/python/lsst/sdm_tools/cli.py @@ -0,0 +1,36 @@ +# This file is part of sdm-tools. +# +# Developed for the LSST Data Management System. +# This product includes software developed by the LSST Project +# (https://www.lsst.org). +# See the COPYRIGHT file at the top-level directory of this distribution +# for details of code ownership. 
+# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . + +import click + +from . import __version__ + + +@click.group() +@click.version_option(__version__) +@click.pass_context +def cli(ctx: click.Context) -> None: + """SDM Tools Command Line Interface""" + ctx.ensure_object(dict) + + +if __name__ == "__main__": + cli() diff --git a/python/lsst/sdm_tools/mypy.ini b/python/lsst/sdm_tools/mypy.ini new file mode 100644 index 0000000..796f8b5 --- /dev/null +++ b/python/lsst/sdm_tools/mypy.ini @@ -0,0 +1,23 @@ +[mypy] +ignore_errors = False +warn_unused_configs = True +warn_redundant_casts = True +ignore_missing_imports = False +disallow_untyped_defs = True +disallow_incomplete_defs = True +plugins = pydantic.mypy + +[mypy-sqlalchemy.*] +ignore_missing_imports = True + +[mypy-deepdiff.*] +ignore_missing_imports = True + +[mypy-felis.*] +ignore_missing_imports = False +ignore_errors = False +disallow_untyped_defs = True +disallow_incomplete_defs = True +strict_equality = True +warn_unreachable = True +warn_unused_ignores = True diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..2a7ca93 --- /dev/null +++ b/requirements.txt @@ -0,0 +1,2 @@ +lsst-felis +click diff --git a/setup.cfg b/setup.cfg new file mode 100644 index 0000000..a0396e9 --- /dev/null +++ b/setup.cfg @@ -0,0 +1,5 @@ +[flake8] +max-line-length = 110 +max-doc-length = 79 +ignore = E203, W503, N802, N803, N806, N812, N815, 
N816 +exclude = __init__.py diff --git a/types.txt b/types.txt new file mode 100644 index 0000000..e47ca69 --- /dev/null +++ b/types.txt @@ -0,0 +1,3 @@ +types-PyYAML +types-click +types-Deprecated From 328ade14b80545d72f7dc64f3b02ccbd74da8268 Mon Sep 17 00:00:00 2001 From: Jeremy McCormick Date: Fri, 7 Feb 2025 15:08:57 -0600 Subject: [PATCH 2/4] Add cli command for building and packaging datalink metadata This was ported from: https://github.com/lsst/sdm_schemas/blob/main/datalink/build_datalink_metadata.py --- python/lsst/__init__.py | 20 +++ .../lsst/sdm_tools/build_datalink_metadata.py | 131 ++++++++++++++++++ python/lsst/sdm_tools/cli.py | 47 ++++++- 3 files changed, 197 insertions(+), 1 deletion(-) create mode 100644 python/lsst/sdm_tools/build_datalink_metadata.py diff --git a/python/lsst/__init__.py b/python/lsst/__init__.py index e69de29..d82548c 100644 --- a/python/lsst/__init__.py +++ b/python/lsst/__init__.py @@ -0,0 +1,20 @@ +# This file is part of sdm_tools. +# +# Developed for the LSST Data Management System. +# This product includes software developed by the LSST Project +# (https://www.lsst.org). +# See the COPYRIGHT file at the top-level directory of this distribution +# for details of code ownership. +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . 
diff --git a/python/lsst/sdm_tools/build_datalink_metadata.py b/python/lsst/sdm_tools/build_datalink_metadata.py new file mode 100644 index 0000000..cfb02cc --- /dev/null +++ b/python/lsst/sdm_tools/build_datalink_metadata.py @@ -0,0 +1,131 @@ +"""From the Felis source files, build YAML metadata used by DataLink. + +Currently, this only determines principal column names. In the future, once +a new key has been added to Felis, it will include other column lists, and +possibly additional metadata. +""" + +# This file is part of sdm_tools. +# +# Developed for the LSST Data Management System. +# This product includes software developed by the LSST Project +# (https://www.lsst.org). +# See the COPYRIGHT file at the top-level directory of this distribution +# for details of code ownership. +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . + +from __future__ import annotations + +import sys +from collections import defaultdict +from pathlib import Path +from typing import Any + +import yaml + + +def filter_columns(table: dict[str, Any], filter_key: str) -> list[str]: + """Find the columns for a table with a given key. + + This respects the TAP v1.1 convention for ordering of columns. All + columns without ``tap:column_index`` set will be sorted after all those + with it set, in the order in which they appeared in the Felis file. + + Parameters + ---------- + table : Dict[`str`, Any] + Felis definition of a table. 
+ filter_key : `str` + Felis key to use to find columns of interest. For example, use + ``tap:principal`` to find principal columns. + + Returns + ------- + columns : List[`str`] + List of filtered columns in sorted order. + """ + principal = [] + unknown_column_index = 100000000 + for column in table["columns"]: + if column.get(filter_key): + column_index = column.get("tap:column_index", unknown_column_index) + unknown_column_index += 1 + principal.append((column["name"], column_index)) + return [c[0] for c in sorted(principal, key=lambda c: c[1])] + + +def build_columns(felis: dict[str, Any], column_properties: list[str]) -> dict[str, dict[str, list[str]]]: + """Find the list of tables with a particular Felis property. + + Parameters + ---------- + felis : Dict[`str`, Any] + The parsed Felis YAML. + column_properties : `str` + The column properties to search for. + """ + schema = felis["name"] + output: dict[str, dict[str, list[str]]] = defaultdict(dict) + for table in felis["tables"]: + name = table["name"] + full_name = f"{schema}.{name}" + for column_property in column_properties: + columns = filter_columns(table, column_property) + output[full_name][column_property] = columns + return output + + +def process_files(files: list[Path], output_path: Path | None = None) -> None: + """Process a set of Felis input files and print output to standard out. + + Parameters + ---------- + files : List[`pathlib.Path`] + List of input files. + + Output + ------ + The YAML version of the output format will look like this: + + .. code-block:: yaml + + tables: + dp02_dc2_catalogs.ForcedSourceOnDiaObject: + tap:principal: + - band + - ccdVisitId + """ + tables = {} + for input_file in files: + with input_file.open("r") as fh: + felis = yaml.safe_load(fh) + tables.update(build_columns(felis, ["tap:principal"])) + + # Dump the result to the output stream. 
+ if output_path is None: + print(yaml.dump({"tables": tables}), file=sys.stdout) + else: + + with output_path.open("w") as output: + print(yaml.dump({"tables": tables}), file=output) + + +def main() -> None: + """Script entry point.""" + process_files([Path(f) for f in sys.argv[1:]]) + + +if __name__ == "__main__": + main() diff --git a/python/lsst/sdm_tools/cli.py b/python/lsst/sdm_tools/cli.py index 1b8bb0f..47ee678 100644 --- a/python/lsst/sdm_tools/cli.py +++ b/python/lsst/sdm_tools/cli.py @@ -1,4 +1,4 @@ -# This file is part of sdm-tools. +# This file is part of sdm_tools. # # Developed for the LSST Data Management System. # This product includes software developed by the LSST Project @@ -19,9 +19,15 @@ # You should have received a copy of the GNU General Public License # along with this program. If not, see . +import zipfile +from pathlib import Path + import click from . import __version__ +from . import build_datalink_metadata as _build_datalink_metadata + +__all__ = ["cli"] @click.group() @@ -32,5 +38,44 @@ def cli(ctx: click.Context) -> None: ctx.ensure_object(dict) +@cli.command("build-datalink-metadata", help="Build Datalink metadata from Felis YAML files") +@click.argument("files", type=click.Path(exists=True), nargs=-1, required=True) +@click.option( + "--resource-dir", + type=click.Path(exists=True, file_okay=False), + default=".", + help="Directory to search for and write resources (DEFAULT: current directory)", +) +@click.option( + "--zip-dir", + type=click.Path(exists=True, file_okay=False), + default=".", + help="Directory to write zip files (DEFAULT: current directory)", +) +@click.pass_context +def build_datalink_metadata(ctx: click.Context, files: list[str], resource_dir: str, zip_dir: str) -> None: + """Build Datalink Metadata + + Build a collection of configuration files for datalinker that specify the + principal and minimal columns for tables. 
This temporarily only does + tap:principal and we hand-maintain a columns-minimal.yaml file until we can + include a new key in the Felis input files. + """ + resource_path = Path(resource_dir) + + paths = [Path(file) for file in files] + _build_datalink_metadata.process_files(paths, Path(resource_path / "columns-principal.yaml")) + + zip_path = Path(zip_dir) + with zipfile.ZipFile(zip_path / "datalink-columns.zip", "w") as columns_zip: + for yaml_file in resource_path.glob("columns-*.yaml"): + columns_zip.write(yaml_file, yaml_file.name) + with zipfile.ZipFile(zip_path / "datalink-snippets.zip", "w") as snippets_zip: + for snippet_file in resource_path.glob("*.json"): + snippets_zip.write(snippet_file, snippet_file.name) + for snippet_file in resource_path.glob("*.xml"): + snippets_zip.write(snippet_file, snippet_file.name) + + if __name__ == "__main__": cli() From 4b0742198d85ec65c53c351e2a8b06c2a7e84101 Mon Sep 17 00:00:00 2001 From: Jeremy McCormick Date: Thu, 13 Feb 2025 14:39:05 -0600 Subject: [PATCH 3/4] Add deepdiff dependency --- pyproject.toml | 3 ++- requirements.txt | 1 + 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 1165e7a..952fb25 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -23,7 +23,8 @@ classifiers = [ keywords = ["lsst"] dependencies = [ "lsst-felis", - "click" + "click", + "deepdiff", ] requires-python = ">=3.11.0" dynamic = ["version"] diff --git a/requirements.txt b/requirements.txt index 2a7ca93..1e45e25 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,2 +1,3 @@ lsst-felis click +deepdiff \ No newline at end of file From 5e43df2dfa6b870acdb76bfa65748edc4d1bdcbf Mon Sep 17 00:00:00 2001 From: Jeremy McCormick Date: Thu, 13 Feb 2025 14:40:00 -0600 Subject: [PATCH 4/4] WIP on band column checker --- python/lsst/sdm_tools/band_column_checker.py | 179 +++++++++++++++++++ python/lsst/sdm_tools/cli.py | 65 ++++++- 2 files changed, 243 insertions(+), 1 deletion(-) create mode 
100644 python/lsst/sdm_tools/band_column_checker.py diff --git a/python/lsst/sdm_tools/band_column_checker.py b/python/lsst/sdm_tools/band_column_checker.py new file mode 100644 index 0000000..1753965 --- /dev/null +++ b/python/lsst/sdm_tools/band_column_checker.py @@ -0,0 +1,179 @@ +"""Check consistency of band column definitions.""" + +import logging +import pprint +import re + +from deepdiff.diff import DeepDiff +from typing import Any + +from felis.datamodel import Schema, Table + +logger = logging.getLogger("band_column_checker") +logger.setLevel(logging.INFO) +logging.basicConfig(level=logging.INFO) + +BANDS = set(["u", "g", "r", "i", "z", "y"]) + + +class BandColumnChecker: + """Check consistency of band column definitions. + + Parameters + ---------- + files : list[str] + List of schema files to check. + table_names : list[str] + List of table names to check. If empty, all tables are checked. + """ + + def __init__(self, files: list[str], table_names: list[str]) -> None: + self.files = files + self.table_names = table_names + if len(self.table_names) > 0: + logger.debug(f"Checking tables: {self.table_names}") + self.schemas: dict[str, Schema] = {} + for file in files: + with open(file) as schema_file: + logger.debug(f"Reading schema file: {file}") + schema = Schema.from_stream(schema_file) + if schema.name in self.schemas: + raise ValueError(f"Duplicate schema name: {schema.name}") + self.schemas[schema.name] = schema + self._band_dict = self._create_band_dict() + + @property + def band_columns(self) -> dict[str, Any]: + """Return the band columns by schema and table.""" + return self._band_dict + + def _should_check_table(self, table_or_name: str | Table) -> bool: + """Check if the table or table name should be included in checks.""" + table_name = table_or_name if isinstance(table_or_name, str) else table_or_name.name + return len(self.table_names) == 0 or table_name in self.table_names + + def _create_band_dict(self) -> dict[str, Any]: + """Create a 
dictionary of band columns by schema and table.""" + band_columns: dict[str, Any] = {} + for schema_name, schema in self.schemas.items(): + band_columns[schema_name] = {} + for table in schema.tables: + if self._should_check_table(table): + band_columns[schema.name][table.name] = {} + for column in table.columns: + for band in BANDS: + if column.name.startswith(f"{band}_"): + if band not in band_columns[schema.name][table.name]: + band_columns[schema.name][table.name][band] = [] + band_columns[schema.name][table.name][band].append( + column.name.removeprefix(f"{band}_") + ) + if len(band_columns[schema.name][table.name]) == 0: + del band_columns[schema.name][table.name] + else: + logger.debug(f"Skipping table: {table.name}") + return band_columns + + def _create_table_band_dict(self, table: Table) -> dict[str, Any]: + """Create a dictionary of band columns for a single table.""" + band_columns: dict[str, Any] = {} + for column in table.columns: + for band in BANDS: + if column.name.startswith(f"{band}_"): + if band not in band_columns: + band_columns[band] = [] + raw_data = column.model_dump(exclude_none=True) + raw_data["name"] = raw_data["name"].removeprefix(f"{band}_") + raw_data["description"] = self._clean_column_description(raw_data["description"], band) + del raw_data["id"] # Remove the id field as they are always different + band_columns[band].append(raw_data) + return band_columns + + def _check_column_count(self) -> None: + """Check that number of columns in each band is the same.""" + for schema_name, columns in self.band_columns.items(): + for table_name, bands in columns.items(): + column_counts = {band: len(column_names) for band, column_names in bands.items()} + logger.info(f"'{schema_name}'.'{table_name}' column counts: %s", column_counts) + if len(set(column_counts.values())) > 1: + logger.error( + f"Inconsistent number of band columns in " + f"'{schema_name}'.'{table_name}': {column_counts}" + ) + + def _check_column_names(self) -> None: + 
"""Check that the names of the band columns are the same.""" + for schema_name, tables in self.band_columns.items(): + logger.info("Checking column names for schema: %s", schema_name) + for table_name, bands in tables.items(): + # Check that each band has the same column names + reference_band, reference_columns = next(iter(bands.items())) + reference_set = set(reference_columns) + for band, column_names in bands.items(): + column_set = set(column_names) + if column_set == reference_set: + continue + logger.error( + f"In '{schema_name}'.'{table_name}', " + f"inconsistencies found between '{reference_band}' and '{band}'" + ) + logger.error( + f" In '{reference_band}' but not in '{band}': {sorted(reference_set - column_set)}" + ) + logger.error( + f" In '{band}' but not in '{reference_band}': {sorted(column_set - reference_set)}" + ) + logger.info("All band columns have the same names") + + def dump(self) -> None: + pprint.pprint(self.band_columns) + + def print(self) -> None: + """Print out band columns by schema and table.""" + for schema_name, columns in self.band_columns.items(): + print(f"Schema: {schema_name}") + for table_name, bands in columns.items(): + print(f" Table: {table_name}") + for band, column_names in bands.items(): + print(f" Band: {band}") + for column_name in column_names: + print(f" Column: {column_name}") + + def _clean_column_description(self, description: str, band: str) -> str: + """Replace band names in the column description with a placeholder.""" + description = re.sub(rf"\b{band}-", "[BAND]-", description) + description = re.sub(rf"\b{band}_", "[BAND]_", description) + return description + + def _diff(self) -> None: + for schema_name, schema in self.schemas.items(): + logger.debug(f"Running diff on schema: {schema_name}") + for table in schema.tables: + if self._should_check_table(table): + logger.debug(f"Running diff on table: {table.name}") + band_columns: dict[str, Any] = self._create_table_band_dict(table) + if len(band_columns) > 
1: + self._diff_single_table(schema_name, table.name, band_columns) + else: # Skip table + logger.debug(f"Skipping table: {table.name}") + + def _diff_single_table(self, schema_name: str, table_name: str, band_columns: dict[str, Any]) -> None: + """Run diff comparison on the band columns for a single table.""" + reference_band, reference_columns = next(iter(band_columns.items())) + for compare_band, compare_columns in band_columns.items(): + if compare_band == reference_band: + continue + logger.debug(f"Comparing '{reference_band}' and '{compare_band}'") + diff = DeepDiff(reference_columns, compare_columns, ignore_order=True) + if len(diff) > 0: + logger.warning( + f"In '{schema_name}'.'{table_name}', differences found between " + f"'{reference_band}' and '{compare_band}': " + f"{diff}" + ) + + def check(self) -> None: + """Check consistency of band column definitions.""" + self._check_column_count() + self._check_column_names() + self._diff() diff --git a/python/lsst/sdm_tools/cli.py b/python/lsst/sdm_tools/cli.py index 47ee678..bb587b6 100644 --- a/python/lsst/sdm_tools/cli.py +++ b/python/lsst/sdm_tools/cli.py @@ -23,19 +23,42 @@ from pathlib import Path import click +import logging from . import __version__ from . 
import build_datalink_metadata as _build_datalink_metadata +from .band_column_checker import BandColumnChecker __all__ = ["cli"] +logger = logging.getLogger("sdm_tools") + +loglevel_choices = list(logging._nameToLevel.keys()) + @click.group() @click.version_option(__version__) +@click.option( + "--log-level", + type=click.Choice(loglevel_choices), + envvar="SDM_TOOLS_LOGLEVEL", + help="SDM Tools log level", + default=logging.getLevelName(logging.INFO), +) +@click.option( + "--log-file", + type=click.Path(), + envvar="SDM_TOOLS_LOGFILE", + help="SDM Tools log file path", +) @click.pass_context -def cli(ctx: click.Context) -> None: +def cli(ctx: click.Context, log_level: str, log_file: str | None) -> None: """SDM Tools Command Line Interface""" ctx.ensure_object(dict) + if log_file: + logging.basicConfig(filename=log_file, level=log_level) + else: + logging.basicConfig(level=log_level) @cli.command("build-datalink-metadata", help="Build Datalink metadata from Felis YAML files") @@ -77,5 +100,45 @@ def build_datalink_metadata(ctx: click.Context, files: list[str], resource_dir: snippets_zip.write(snippet_file, snippet_file.name) + +def _parse_comma_separated(ctx, param, value): + if value: + return value.split(",") + return [] + + +@cli.command("check-band-columns", help="Check consistency of band column definitions") +@click.argument("files", type=click.Path(exists=True), nargs=-1, required=True) +@click.option("--print", "-p", "print_columns", is_flag=True, help="Print out the band columns") +@click.option("--dump", "-d", "dump", is_flag=True, help="Dump the raw band column data") +@click.option( + "--tables", + "-t", + "table_names", + callback=_parse_comma_separated, + help="Names of tables to check (comma-separated)", +) +@click.pass_context +def check_band_columns( + ctx: click.Context, + files: list[str], + print_columns: bool = False, + dump: bool = False, + table_names: list[str] = [], +) -> None: + """Check consistency of band column definitions + + Check that
band-prefixed columns (``u_``, ``g_``, etc.) are defined + consistently across bands within each table of the given Felis schema + files, reporting any differences in column counts, column names, and + full column definitions between bands. + """ + checker = BandColumnChecker(files, table_names) + if dump: + checker.dump() + if print_columns: + checker.print() + checker.check() + + if __name__ == "__main__": cli()