def allocate_resources() -> None:
    """Allocate resources for htcondor jobs submitted during the daemon
    iteration.

    A local-universe htcondor job running ``allocateNodes.py`` is submitted
    to the schedd located through the configured collector. The steps are:

    1. Obtain the optional ``htcondor`` python bindings. When they cannot be
       imported a warning is logged and nothing is submitted.
    2. Export the configured htcondor settings (``config.htcondor`` dumped by
       serialization alias) into ``os.environ`` so the bindings in this
       process see them.
    3. Build the submit description, including a quoted ``environment``
       string derived from ``build_htcondor_submit_environment()``.
    4. Locate the schedd, submit the job and log the resulting cluster.

    Errors raised by the htcondor bindings while locating the schedd or
    submitting are not handled here and propagate to the caller.
    """
    # Reuse the bindings when something already imported them; otherwise try
    # the import ourselves. A module that is discoverable but fails to load
    # (for example a broken native extension) is treated as "not available"
    # rather than as a hard error.
    htcondor = sys.modules.get("htcondor")
    if htcondor is None:
        try:
            htcondor = importlib.import_module("htcondor")
        except ImportError:
            logger.warning("HTcondor not available, will not allocate resources")
            return

    # Ensure environment is configured for htcondor operations
    # FIXME: the python process needs the correct condor env set up. Alternate
    # to setting these values JIT in the os.environ would be to hack a way to
    # have the config.htcondor submodel's validation_alias match the
    # serialization_alias, e.g., "_CONDOR_value"
    condor_environment = config.htcondor.model_dump(by_alias=True)
    # os.environ only accepts str values, while a model dump keeps native
    # python types; coerce every value and drop unset (None) settings.
    os.environ.update(
        {name: str(value) for name, value in condor_environment.items() if value is not None}
    )

    # Do we need to allocate resources? i.e., are there idle condor jobs for
    # which we are responsible?
    # TODO condor query for idle jobs with our batch_name

    # The environment command in the submit file is a double-quoted,
    # whitespace-delimited list of name=value pairs where literal quote marks
    # are doubled ("" or '').
    submission_environment = " ".join(
        _format_condor_environment_entry(name, value)
        for name, value in build_htcondor_submit_environment().items()
    )

    # TODO Assumes that the following executable is on the provided PATH, but
    # could write a shell script wrapper to use.
    # NOTE(review): the numeric allocateNodes.py arguments (-n, -m, -g) are
    # hard-coded; confirm their meaning and whether they belong in config.
    submission_spec = {
        "executable": f"{config.htcondor.remote_user_home}/.local/bin/allocateNodes.py",
        "arguments": (
            f"--auto --account {config.slurm.account} -n 50 -m 4-00:00:00 "
            f"-q {config.slurm.partition} -g 240 {config.slurm.platform}"
        ),
        "environment": f'"{submission_environment}"',
        "initialdir": config.htcondor.working_directory,
        "batch_name": config.htcondor.batch_name,
        "universe": "local",
    }
    submit = htcondor.Submit(submission_spec)

    coll = htcondor.Collector(config.htcondor.collector_host)
    location_ad = coll.locate(htcondor.DaemonTypes.Schedd)
    schedd = htcondor.Schedd(location_ad)

    submit_result = schedd.submit(submit)
    # Current bindings return a result object exposing ``cluster()``; fall
    # back to logging whatever was returned when that accessor is absent.
    cluster_accessor = getattr(submit_result, "cluster", None)
    cluster_id = cluster_accessor() if callable(cluster_accessor) else submit_result

    logger.info("Allocating Resources with condor job %s", cluster_id)


def _format_condor_environment_entry(name: str, value: object) -> str:
    """Render one ``name=value`` pair for a double-quoted submit environment.

    Literal double quotes are doubled. A value containing whitespace or a
    single quote is wrapped in single quotes, with embedded single quotes
    doubled. Values without any of those characters are emitted unchanged.
    """
    text = str(value).replace('"', '""')
    if "'" in text or any(char.isspace() for char in text):
        text = "'" + text.replace("'", "''") + "'"
    return f"{name}={text}"
""" + # TODO use all configured htcondor config settings + # condor_environment = config.htcondor.model_dump(by_alias=True) return dict( - CONDOR_CONFIG="ONLY_ENV", + CONDOR_CONFIG=config.htcondor.config_source, _CONDOR_CONDOR_HOST=config.htcondor.collector_host, _CONDOR_COLLECTOR_HOST=config.htcondor.collector_host, _CONDOR_SCHEDD_HOST=config.htcondor.schedd_host, _CONDOR_SEC_CLIENT_AUTHENTICATION_METHODS=config.htcondor.authn_methods, - _CONDOR_DAGMAN_MANAGER_JOB_APPEND_GETENV=str(config.htcondor.dagman_job_append_get_env), - DAF_BUTLER_REPOSITORY_INDEX=config.butler.repository_index, + _CONDOR_DAGMAN_MANAGER_JOB_APPEND_GETENV="True", FS_REMOTE_DIR=config.htcondor.fs_remote_dir, - HOME=config.htcondor.user_home, + DAF_BUTLER_REPOSITORY_INDEX=config.butler.repository_index, + HOME=config.htcondor.remote_user_home, LSST_VERSION=config.bps.lsst_version, LSST_DISTRIB_DIR=config.bps.lsst_distrib_dir, # FIX: because there is no db-auth.yaml in lsstsvc1's home directory - PGPASSFILE=f"{config.htcondor.user_home}/.lsst/postgres-credentials.txt", + PGPASSFILE=f"{config.htcondor.remote_user_home}/.lsst/postgres-credentials.txt", PGUSER=config.butler.default_username, PATH=( - f"{config.htcondor.user_home}/.local/bin:{config.htcondor.user_home}/bin:{config.slurm.home}:" + f"{config.htcondor.remote_user_home}/.local/bin:{config.htcondor.remote_user_home}/bin:{config.slurm.home}:" f"/usr/local/bin:/usr/bin:/usr/local/sbin:/usr/sbin" ), ) diff --git a/src/lsst/cmservice/config.py b/src/lsst/cmservice/config.py index 1b416ccd..15c1875b 100644 --- a/src/lsst/cmservice/config.py +++ b/src/lsst/cmservice/config.py @@ -114,9 +114,16 @@ class HTCondorConfiguration(BaseModel): their serialization alias. 
""" - user_home: str = Field( + config_source: str = Field( + description="Source of htcondor configuration", + default="ONLY_ENV", + serialization_alias="CONDOR_CONFIG", + ) + + remote_user_home: str = Field( description=("Path to the user's home directory, as resolvable from an htcondor access node."), default="/sdf/home/l/lsstsvc1", + exclude=True, ) condor_home: str = Field( @@ -207,10 +214,12 @@ class HTCondorConfiguration(BaseModel): serialization_alias="FS_REMOTE_DIR", ) - # FIXME: unclear if this is at all necessary - dagman_job_append_get_env: bool = Field( - description="...", default=True, serialization_alias="_CONDOR_DAGMAN_MANAGER_JOB_APPEND_GETENV" - ) + # FIXME: unclear if this is necessary or specific to bps submit jobs + # dagman_job_append_get_env: str = Field( + # description="...", + # default="true", + # serialization_alias="_CONDOR_DAGMAN_MANAGER_JOB_APPEND_GETENV", + # ) # TODO deprecate and remove "slurm"-specific logic from cm-service; it is @@ -249,6 +258,11 @@ class SlurmConfiguration(BaseModel): default="milano", ) + platform: str = Field( + description="Platform requested when submitting a slurm job.", + default="s3df", + ) + class AsgiConfiguration(BaseModel): """Configuration for the application's ASGI web server.""" @@ -309,6 +323,11 @@ class DaemonConfiguration(BaseModel): Set according to DAEMON__FIELD environment variables. """ + allocate_resources: bool = Field( + default=False, + description="Whether the daemon should try to allocate its own htcondor or slurm resources.", + ) + processing_interval: int = Field( default=30, description=(