From 37d382f91941570fa810d20794bd715e77a2d598 Mon Sep 17 00:00:00 2001 From: kmontemayor Date: Mon, 18 May 2026 18:59:59 +0000 Subject: [PATCH 1/4] Expose gigl vars as env vars for the custom launcher --- gigl/env/custom_launcher.py | 16 +++ gigl/src/common/custom_launcher.py | 71 ++++++++--- tests/unit/src/common/custom_launcher_test.py | 111 ++++++++++++++++++ 3 files changed, 180 insertions(+), 18 deletions(-) create mode 100644 gigl/env/custom_launcher.py diff --git a/gigl/env/custom_launcher.py b/gigl/env/custom_launcher.py new file mode 100644 index 000000000..21eb4c223 --- /dev/null +++ b/gigl/env/custom_launcher.py @@ -0,0 +1,16 @@ +"""Environment-variable keys exported by ``launch_custom``. + +These keys are set on the subprocess env (never on the parent +``os.environ``) by ``gigl.src.common.custom_launcher.launch_custom`` so +that receiving CLIs can ``os.environ.get(...)`` their runtime context. +""" + +from typing import Final + +GIGL_APPLIED_TASK_IDENTIFIER_ENV_KEY: Final[str] = "GIGL_APPLIED_TASK_IDENTIFIER" +GIGL_TASK_CONFIG_URI_ENV_KEY: Final[str] = "GIGL_TASK_CONFIG_URI" +GIGL_RESOURCE_CONFIG_URI_ENV_KEY: Final[str] = "GIGL_RESOURCE_CONFIG_URI" +GIGL_PROCESS_COMMAND_ENV_KEY: Final[str] = "GIGL_PROCESS_COMMAND" +GIGL_CPU_DOCKER_URI_ENV_KEY: Final[str] = "GIGL_CPU_DOCKER_URI" +GIGL_CUDA_DOCKER_URI_ENV_KEY: Final[str] = "GIGL_CUDA_DOCKER_URI" +GIGL_COMPONENT_ENV_KEY: Final[str] = "GIGL_COMPONENT" diff --git a/gigl/src/common/custom_launcher.py b/gigl/src/common/custom_launcher.py index 10c3115e9..f264782d6 100644 --- a/gigl/src/common/custom_launcher.py +++ b/gigl/src/common/custom_launcher.py @@ -12,8 +12,16 @@ dynamic content (runtime URIs, image refs, etc.) is the caller's responsibility — typically resolved at YAML-load time before the proto reaches this module. 
+ +The dispatcher exports its context args as ``GIGL_*`` environment +variables on the subprocess env (see ``gigl.env.custom_launcher``) so +receiving CLIs can ``os.environ.get(...)`` whatever runtime context +they need. The parent process's ``os.environ`` is never mutated; the +``GIGL_*`` keys live only in the per-call env passed to +``subprocess.run``. """ +import os import shlex import subprocess from collections.abc import Mapping @@ -21,6 +29,15 @@ from gigl.common import Uri from gigl.common.logger import Logger +from gigl.env.custom_launcher import ( + GIGL_APPLIED_TASK_IDENTIFIER_ENV_KEY, + GIGL_COMPONENT_ENV_KEY, + GIGL_CPU_DOCKER_URI_ENV_KEY, + GIGL_CUDA_DOCKER_URI_ENV_KEY, + GIGL_PROCESS_COMMAND_ENV_KEY, + GIGL_RESOURCE_CONFIG_URI_ENV_KEY, + GIGL_TASK_CONFIG_URI_ENV_KEY, +) from gigl.src.common.constants.components import GiGLComponents from snapchat.research.gbml.gigl_resource_config_pb2 import CustomLauncherConfig @@ -46,7 +63,7 @@ def launch_custom( Composes a shell line as ``command`` followed by each ``args[]`` element passed through ``shlex.quote``, then invokes - ``subprocess.run(shell_line, shell=True, check=True)``. + ``subprocess.run(shell_line, shell=True, check=True, env=env)``. The dispatcher takes ``command`` and ``args[]`` verbatim — no template substitution of any kind. Any placeholder text in those @@ -54,28 +71,35 @@ def launch_custom( substitution should resolve it at YAML-load time before the proto reaches this module. - ``applied_task_identifier``, ``task_config_uri``, - ``resource_config_uri``, ``process_command``, - ``process_runtime_args``, ``cpu_docker_uri``, and ``cuda_docker_uri`` - are accepted for API symmetry with the GLT-side Vertex AI launchers - but are intentionally not plumbed into the subprocess — the - receiving CLI is expected to source whatever context it needs from - the resource config it gets handed (or from env vars inherited from - the parent process). 
+ The subprocess env is built per-call from ``os.environ.copy()`` plus + the ``GIGL_*`` keys defined in :mod:`gigl.env.custom_launcher`. The + parent process's ``os.environ`` is never mutated. Optional URI args + (``cpu_docker_uri``, ``cuda_docker_uri``) are omitted from the env + when ``None`` so the receiver's ``os.environ.get(KEY)`` returns + ``None`` and preserves the original ``Optional[str]`` semantics. Args: custom_launcher_config: Proto whose ``command`` is the shell snippet to execute and whose ``args`` are positional arguments appended verbatim. - applied_task_identifier: Accepted for back-compat; ignored. - task_config_uri: Accepted for back-compat; ignored. - resource_config_uri: Accepted for back-compat; ignored. - process_command: Accepted for back-compat; ignored. - process_runtime_args: Accepted for back-compat; ignored. - cpu_docker_uri: Accepted for back-compat; ignored. - cuda_docker_uri: Accepted for back-compat; ignored. + applied_task_identifier: Exported to the subprocess as + ``GIGL_APPLIED_TASK_IDENTIFIER``. + task_config_uri: Exported to the subprocess as + ``GIGL_TASK_CONFIG_URI`` (stringified). + resource_config_uri: Exported to the subprocess as + ``GIGL_RESOURCE_CONFIG_URI`` (stringified). + process_command: Exported to the subprocess as + ``GIGL_PROCESS_COMMAND``. + process_runtime_args: Accepted for API symmetry with the + GLT-side Vertex AI launchers but not currently exported — + there is no clean single-env-var encoding for a dict. + cpu_docker_uri: Exported as ``GIGL_CPU_DOCKER_URI`` when set; + the env var is omitted entirely when ``None``. + cuda_docker_uri: Exported as ``GIGL_CUDA_DOCKER_URI`` when set; + the env var is omitted entirely when ``None``. component: Which GiGL component is being launched. Must be in - ``_LAUNCHABLE_COMPONENTS``. + ``_LAUNCHABLE_COMPONENTS``. Exported as ``GIGL_COMPONENT`` + using ``component.name`` (e.g. ``"Trainer"``). 
Raises: ValueError: If ``component`` is not Trainer or Inferencer, or if @@ -91,6 +115,17 @@ def launch_custom( command: str = custom_launcher_config.command args: list[str] = list(custom_launcher_config.args) + env: dict[str, str] = os.environ.copy() + env[GIGL_APPLIED_TASK_IDENTIFIER_ENV_KEY] = applied_task_identifier + env[GIGL_TASK_CONFIG_URI_ENV_KEY] = str(task_config_uri) + env[GIGL_RESOURCE_CONFIG_URI_ENV_KEY] = str(resource_config_uri) + env[GIGL_PROCESS_COMMAND_ENV_KEY] = process_command + env[GIGL_COMPONENT_ENV_KEY] = component.name + if cpu_docker_uri is not None: + env[GIGL_CPU_DOCKER_URI_ENV_KEY] = cpu_docker_uri + if cuda_docker_uri is not None: + env[GIGL_CUDA_DOCKER_URI_ENV_KEY] = cuda_docker_uri + shell_line = " ".join([command, *(shlex.quote(a) for a in args)]) logger.info(f"Launching {component.name} via subprocess: {shell_line!r}") - subprocess.run(shell_line, shell=True, check=True) + subprocess.run(shell_line, shell=True, check=True, env=env) diff --git a/tests/unit/src/common/custom_launcher_test.py b/tests/unit/src/common/custom_launcher_test.py index 6a8765352..94b3a2100 100644 --- a/tests/unit/src/common/custom_launcher_test.py +++ b/tests/unit/src/common/custom_launcher_test.py @@ -1,10 +1,20 @@ """Unit tests for ``gigl.src.common.custom_launcher``.""" +import os from unittest.mock import MagicMock, patch from absl.testing import absltest from gigl.common import Uri +from gigl.env.custom_launcher import ( + GIGL_APPLIED_TASK_IDENTIFIER_ENV_KEY, + GIGL_COMPONENT_ENV_KEY, + GIGL_CPU_DOCKER_URI_ENV_KEY, + GIGL_CUDA_DOCKER_URI_ENV_KEY, + GIGL_PROCESS_COMMAND_ENV_KEY, + GIGL_RESOURCE_CONFIG_URI_ENV_KEY, + GIGL_TASK_CONFIG_URI_ENV_KEY, +) from gigl.src.common.constants.components import GiGLComponents from gigl.src.common.custom_launcher import launch_custom from snapchat.research.gbml import gigl_resource_config_pb2 @@ -113,6 +123,107 @@ def test_args_with_spaces_are_shell_quoted(self, mock_run: MagicMock) -> None: self.assertIn("'a b c'", 
shell_line) self.assertIn("'--name=with space'", shell_line) + @patch("gigl.src.common.custom_launcher.subprocess.run") + def test_dispatch_sets_gigl_env_vars(self, mock_run: MagicMock) -> None: + config = self._build_config(command="python -m my.cli") + launch_custom( + custom_launcher_config=config, + applied_task_identifier="job-42", + task_config_uri=Uri("gs://bucket/task.yaml"), + resource_config_uri=Uri("gs://bucket/resource.yaml"), + process_command="python -m my.cli", + process_runtime_args={}, + cpu_docker_uri="gcr.io/p/cpu:tag", + cuda_docker_uri="gcr.io/p/cuda:tag", + component=GiGLComponents.Trainer, + ) + env = mock_run.call_args.kwargs["env"] + self.assertEqual(env[GIGL_APPLIED_TASK_IDENTIFIER_ENV_KEY], "job-42") + self.assertEqual(env[GIGL_TASK_CONFIG_URI_ENV_KEY], "gs://bucket/task.yaml") + self.assertEqual( + env[GIGL_RESOURCE_CONFIG_URI_ENV_KEY], "gs://bucket/resource.yaml" + ) + self.assertEqual(env[GIGL_PROCESS_COMMAND_ENV_KEY], "python -m my.cli") + self.assertEqual(env[GIGL_CPU_DOCKER_URI_ENV_KEY], "gcr.io/p/cpu:tag") + self.assertEqual(env[GIGL_CUDA_DOCKER_URI_ENV_KEY], "gcr.io/p/cuda:tag") + # component is exported via .name (the enum member identifier). + self.assertEqual(env[GIGL_COMPONENT_ENV_KEY], "Trainer") + + @patch("gigl.src.common.custom_launcher.subprocess.run") + def test_dispatch_omits_optional_uris_when_none(self, mock_run: MagicMock) -> None: + config = self._build_config(command="echo") + launch_custom( + custom_launcher_config=config, + applied_task_identifier="job", + task_config_uri=Uri("gs://bucket/task.yaml"), + resource_config_uri=Uri("gs://bucket/resource.yaml"), + process_command="echo", + process_runtime_args={}, + cpu_docker_uri=None, + cuda_docker_uri=None, + component=GiGLComponents.Inferencer, + ) + env = mock_run.call_args.kwargs["env"] + # Optional URIs must be omitted entirely (not stringified to "None" + # nor set to ""), so receivers see env.get(KEY) is None. 
+ self.assertNotIn(GIGL_CPU_DOCKER_URI_ENV_KEY, env) + self.assertNotIn(GIGL_CUDA_DOCKER_URI_ENV_KEY, env) + # Required keys are still present. + self.assertEqual(env[GIGL_COMPONENT_ENV_KEY], "Inferencer") + + @patch("gigl.src.common.custom_launcher.subprocess.run") + def test_dispatch_does_not_mutate_parent_os_environ( + self, mock_run: MagicMock + ) -> None: + # Pre-condition: none of the GIGL_* keys leak into the parent. + snapshot = dict(os.environ) + config = self._build_config(command="echo") + launch_custom( + custom_launcher_config=config, + applied_task_identifier="job", + task_config_uri=Uri("gs://bucket/task.yaml"), + resource_config_uri=Uri("gs://bucket/resource.yaml"), + process_command="echo", + process_runtime_args={}, + cpu_docker_uri="gcr.io/p/cpu:tag", + cuda_docker_uri="gcr.io/p/cuda:tag", + component=GiGLComponents.Trainer, + ) + self.assertEqual(dict(os.environ), snapshot) + for key in ( + GIGL_APPLIED_TASK_IDENTIFIER_ENV_KEY, + GIGL_TASK_CONFIG_URI_ENV_KEY, + GIGL_RESOURCE_CONFIG_URI_ENV_KEY, + GIGL_PROCESS_COMMAND_ENV_KEY, + GIGL_CPU_DOCKER_URI_ENV_KEY, + GIGL_CUDA_DOCKER_URI_ENV_KEY, + GIGL_COMPONENT_ENV_KEY, + ): + self.assertNotIn(key, os.environ) + + @patch("gigl.src.common.custom_launcher.subprocess.run") + def test_dispatch_preserves_inherited_env(self, mock_run: MagicMock) -> None: + sentinel_key = "GIGL_TEST_PARENT_ENV_SENTINEL" + sentinel_value = "preserved-value" + try: + os.environ[sentinel_key] = sentinel_value + config = self._build_config(command="echo") + launch_custom( + custom_launcher_config=config, + applied_task_identifier="job", + task_config_uri=Uri("gs://bucket/task.yaml"), + resource_config_uri=Uri("gs://bucket/resource.yaml"), + process_command="echo", + process_runtime_args={}, + cpu_docker_uri=None, + cuda_docker_uri=None, + component=GiGLComponents.Trainer, + ) + env = mock_run.call_args.kwargs["env"] + self.assertEqual(env.get(sentinel_key), sentinel_value) + finally: + os.environ.pop(sentinel_key, None) + if 
__name__ == "__main__": absltest.main() From 8c1a09fb4fcca1ee2e18d0d51b49fd7826624b23 Mon Sep 17 00:00:00 2001 From: kmontemayor Date: Tue, 19 May 2026 20:32:35 +0000 Subject: [PATCH 2/4] PR comments --- gigl/env/{custom_launcher.py => constants.py} | 0 gigl/src/common/custom_launcher.py | 39 ++- gigl/utils/dev/__init__.py | 5 + gigl/utils/dev/submit_smoke_job.py | 258 ++++++++++++++++++ gigl/utils/dev/tb_smoke_main.py | 72 +++++ tests/unit/src/common/custom_launcher_test.py | 73 +++-- 6 files changed, 394 insertions(+), 53 deletions(-) rename gigl/env/{custom_launcher.py => constants.py} (100%) create mode 100644 gigl/utils/dev/__init__.py create mode 100644 gigl/utils/dev/submit_smoke_job.py create mode 100644 gigl/utils/dev/tb_smoke_main.py diff --git a/gigl/env/custom_launcher.py b/gigl/env/constants.py similarity index 100% rename from gigl/env/custom_launcher.py rename to gigl/env/constants.py diff --git a/gigl/src/common/custom_launcher.py b/gigl/src/common/custom_launcher.py index f264782d6..08770cd77 100644 --- a/gigl/src/common/custom_launcher.py +++ b/gigl/src/common/custom_launcher.py @@ -14,7 +14,7 @@ proto reaches this module. The dispatcher exports its context args as ``GIGL_*`` environment -variables on the subprocess env (see ``gigl.env.custom_launcher``) so +variables on the subprocess env (see ``gigl.env.constants``) so receiving CLIs can ``os.environ.get(...)`` whatever runtime context they need. 
The parent process's ``os.environ`` is never mutated; the ``GIGL_*`` keys live only in the per-call env passed to @@ -28,8 +28,12 @@ from typing import Optional from gigl.common import Uri +from gigl.common.constants import ( + DEFAULT_GIGL_RELEASE_SRC_IMAGE_CPU, + DEFAULT_GIGL_RELEASE_SRC_IMAGE_CUDA, +) from gigl.common.logger import Logger -from gigl.env.custom_launcher import ( +from gigl.env.constants import ( GIGL_APPLIED_TASK_IDENTIFIER_ENV_KEY, GIGL_COMPONENT_ENV_KEY, GIGL_CPU_DOCKER_URI_ENV_KEY, @@ -72,11 +76,13 @@ def launch_custom( reaches this module. The subprocess env is built per-call from ``os.environ.copy()`` plus - the ``GIGL_*`` keys defined in :mod:`gigl.env.custom_launcher`. The - parent process's ``os.environ`` is never mutated. Optional URI args - (``cpu_docker_uri``, ``cuda_docker_uri``) are omitted from the env - when ``None`` so the receiver's ``os.environ.get(KEY)`` returns - ``None`` and preserves the original ``Optional[str]`` semantics. + the ``GIGL_*`` keys defined in :mod:`gigl.env.constants`. The + parent process's ``os.environ`` is never mutated. When ``None`` is + passed for ``cpu_docker_uri`` / ``cuda_docker_uri``, the + corresponding env var falls back to + :data:`gigl.common.constants.DEFAULT_GIGL_RELEASE_SRC_IMAGE_CPU` / + :data:`gigl.common.constants.DEFAULT_GIGL_RELEASE_SRC_IMAGE_CUDA` + so receivers always observe a usable image URI. Args: custom_launcher_config: Proto whose ``command`` is the shell @@ -93,10 +99,11 @@ def launch_custom( process_runtime_args: Accepted for API symmetry with the GLT-side Vertex AI launchers but not currently exported — there is no clean single-env-var encoding for a dict. - cpu_docker_uri: Exported as ``GIGL_CPU_DOCKER_URI`` when set; - the env var is omitted entirely when ``None``. - cuda_docker_uri: Exported as ``GIGL_CUDA_DOCKER_URI`` when set; - the env var is omitted entirely when ``None``. + cpu_docker_uri: Exported as ``GIGL_CPU_DOCKER_URI``. 
Falls back + to ``DEFAULT_GIGL_RELEASE_SRC_IMAGE_CPU`` when ``None``. + cuda_docker_uri: Exported as ``GIGL_CUDA_DOCKER_URI``. Falls + back to ``DEFAULT_GIGL_RELEASE_SRC_IMAGE_CUDA`` when + ``None``. component: Which GiGL component is being launched. Must be in ``_LAUNCHABLE_COMPONENTS``. Exported as ``GIGL_COMPONENT`` using ``component.name`` (e.g. ``"Trainer"``). @@ -121,10 +128,12 @@ def launch_custom( env[GIGL_RESOURCE_CONFIG_URI_ENV_KEY] = str(resource_config_uri) env[GIGL_PROCESS_COMMAND_ENV_KEY] = process_command env[GIGL_COMPONENT_ENV_KEY] = component.name - if cpu_docker_uri is not None: - env[GIGL_CPU_DOCKER_URI_ENV_KEY] = cpu_docker_uri - if cuda_docker_uri is not None: - env[GIGL_CUDA_DOCKER_URI_ENV_KEY] = cuda_docker_uri + env[GIGL_CPU_DOCKER_URI_ENV_KEY] = ( + cpu_docker_uri or DEFAULT_GIGL_RELEASE_SRC_IMAGE_CPU + ) + env[GIGL_CUDA_DOCKER_URI_ENV_KEY] = ( + cuda_docker_uri or DEFAULT_GIGL_RELEASE_SRC_IMAGE_CUDA + ) shell_line = " ".join([command, *(shlex.quote(a) for a in args)]) logger.info(f"Launching {component.name} via subprocess: {shell_line!r}") diff --git a/gigl/utils/dev/__init__.py b/gigl/utils/dev/__init__.py new file mode 100644 index 000000000..9c1bf25ab --- /dev/null +++ b/gigl/utils/dev/__init__.py @@ -0,0 +1,5 @@ +"""Developer utilities (smoke entrypoints, ad-hoc test helpers). + +Modules under this package are intended for short, ad-hoc test jobs and +developer iteration. They are NOT part of GiGL's stable public API. +""" diff --git a/gigl/utils/dev/submit_smoke_job.py b/gigl/utils/dev/submit_smoke_job.py new file mode 100644 index 000000000..e1eab73ef --- /dev/null +++ b/gigl/utils/dev/submit_smoke_job.py @@ -0,0 +1,258 @@ +"""Submit a tiny Vertex AI CustomJob that exercises GiGL's TensorBoard wiring. + +Goal: <2 min from "I changed launcher / writer code" to "I see whether TB +shows up." 
Bypasses ConfigPopulator and the full pipeline; uses the +production launcher path (``launch_single_pool_job``) so the same submit +logic runs as in real training. + +Required CLI flags: + --project GCP project (e.g. ``external-snap-ci-github-gigl``). + --region Vertex AI region (e.g. ``us-central1``). + --service-account Service account email used by the CustomJob. + --staging-bucket Regional GCS bucket Vertex stages artifacts under. + --tensorboard Full TensorBoard resource name + (``projects/.../locations/.../tensorboards/...``). + --experiment-name Vertex AI ``TensorboardExperiment`` name. The + tb_smoke_main entry point will pass this and the + --tensorboard value to ``TensorBoardWriter.create``. + --container-uri Container image to use. REQUIRED — must contain the + branch under test. + +Optional: + --job-name CustomJob display name. Defaults to a timestamped + ``gigl-tb-smoke-...``. + --dry-run Print the constructed submission parameters and + exit without submitting. + +Verification: + After the CustomJob completes the script polls the TensorBoard API + surface and asserts the user-named ``TensorboardExperiment`` exists + with at least one ``TensorboardRun`` containing time series data. + + The TB UI URL is printed for manual inspection. 
+""" + +from __future__ import annotations + +import argparse +import datetime +import re +import sys +import time + +from google.cloud import aiplatform + +from gigl.common import Uri +from gigl.common.logger import Logger +from gigl.src.common.constants.components import GiGLComponents +from gigl.src.common.types.pb_wrappers.gigl_resource_config import ( + GiglResourceConfigWrapper, +) +from gigl.src.common.vertex_ai_launcher import launch_single_pool_job +from snapchat.research.gbml import gigl_resource_config_pb2 + +logger = Logger() + +_TENSORBOARD_RESOURCE_NAME_PATTERN = re.compile( + r"^projects/(?P<project>[^/]+)" + r"/locations/(?P<location>[^/]+)" + r"/tensorboards/(?P<tensorboard_id>[^/]+)$" +) + + +def _parse_args() -> argparse.Namespace: + parser = argparse.ArgumentParser(description=__doc__) + parser.add_argument("--project", required=True) + parser.add_argument("--region", required=True) + parser.add_argument("--service-account", required=True) + parser.add_argument( + "--staging-bucket", + required=True, + help="Regional GCS bucket (e.g. gs://gigl-cicd-temp).", + ) + parser.add_argument( + "--tensorboard", + required=True, + help="Full TensorBoard resource name.", + ) + parser.add_argument( + "--experiment-name", + required=True, + help=( + "TensorboardExperiment name. Passed to tb_smoke_main, which " + "creates the run under this experiment." + ), + ) + parser.add_argument( + "--container-uri", + required=True, + help=( + "Container image with the branch code. Required; pointing at a " + "released image would test stale code." 
+ ), + ) + parser.add_argument("--job-name", default=None) + parser.add_argument("--dry-run", action="store_true") + return parser.parse_args() + + +def _build_resource_config( + *, + project: str, + region: str, + service_account: str, + staging_bucket: str, +) -> gigl_resource_config_pb2.GiglResourceConfig: + """Minimal GiglResourceConfig wired for a 1-replica CPU CustomJob.""" + common = gigl_resource_config_pb2.SharedResourceConfig.CommonComputeConfig( + project=project, + region=region, + temp_regional_assets_bucket=staging_bucket, + temp_assets_bucket=staging_bucket, + perm_assets_bucket=staging_bucket, + temp_assets_bq_dataset_name="not_used_by_smoke", + embedding_bq_dataset_name="not_used_by_smoke", + gcp_service_account_email=service_account, + dataflow_runner="DataflowRunner", + ) + shared = gigl_resource_config_pb2.SharedResourceConfig( + common_compute_config=common, + resource_labels={"cost_resource_group": "gigl_dev_smoke"}, + ) + trainer = gigl_resource_config_pb2.VertexAiResourceConfig( + # n1-standard-2 is rejected by Vertex AI; n1-standard-16 is the + # smallest spec we've confirmed accepted in dev. 
+ machine_type="n1-standard-16", + gpu_type="ACCELERATOR_TYPE_UNSPECIFIED", + gpu_limit=0, + num_replicas=1, + timeout=600, + ) + return gigl_resource_config_pb2.GiglResourceConfig( + shared_resource_config=shared, + trainer_resource_config=gigl_resource_config_pb2.TrainerResourceConfig( + vertex_ai_trainer_config=trainer, + ), + ) + + +def _verify_named_experiment( + *, + tensorboard_resource_name: str, + experiment_name: str, +) -> None: + """Confirm the chief-rank writer ingested events into the named experiment.""" + experiment_resource_name = ( + f"{tensorboard_resource_name}/experiments/{experiment_name}" + ) + runs = aiplatform.TensorboardRun.list( + tensorboard_experiment_name=experiment_resource_name, + ) + if not runs: + raise RuntimeError( + f"Named TensorboardExperiment {experiment_resource_name} has no " + "TensorboardRuns; the writer did not ingest events." + ) + for run in runs: + time_series = aiplatform.TensorboardTimeSeries.list( + tensorboard_run_name=run.resource_name, + ) + if not time_series: + raise RuntimeError( + f"Run {run.resource_name} has no TensorboardTimeSeries; " + "events did not reach the API." + ) + run_names = sorted(r.display_name for r in runs) + logger.info( + f"Named experiment OK: {len(runs)} run(s) under {experiment_resource_name}: " + f"{run_names}" + ) + + +def _print_tb_url( + *, + region: str, + project: str, + tensorboard_id: str, + experiment_name: str, +) -> None: + base = f"https://{region}.tensorboard.googleusercontent.com/experiment" + qualifier = f"projects+{project}+locations+{region}+tensorboards+{tensorboard_id}" + named = f"{base}/{qualifier}+experiments+{experiment_name}" + logger.info(f"Named TB URL: {named}") + + +def main() -> int: + args = _parse_args() + + tb_match = _TENSORBOARD_RESOURCE_NAME_PATTERN.match(args.tensorboard) + if not tb_match: + logger.error( + f"--tensorboard must be projects/.../locations/.../tensorboards/...; " + f"got {args.tensorboard!r}." 
+ ) + return 2 + + timestamp = datetime.datetime.utcnow().strftime("%Y%m%d-%H%M%S") + job_name = args.job_name or f"gigl-tb-smoke-{timestamp}" + + resource_config = _build_resource_config( + project=args.project, + region=args.region, + service_account=args.service_account, + staging_bucket=args.staging_bucket, + ) + resource_wrapper = GiglResourceConfigWrapper(resource_config=resource_config) + + process_runtime_args = { + "tensorboard_resource_name": args.tensorboard, + "tensorboard_experiment_name": args.experiment_name, + } + + if args.dry_run: + logger.info( + "Dry run — would submit a CustomJob with:\n" + f" job_name = {job_name}\n" + f" container_uri = {args.container_uri}\n" + f" tensorboard_resource = {args.tensorboard}\n" + f" experiment_name = {args.experiment_name!r}\n" + f" process_runtime_args = {process_runtime_args}\n" + ) + return 0 + + aiplatform.init(project=args.project, location=args.region) + launch_single_pool_job( + vertex_ai_resource_config=resource_config.trainer_resource_config.vertex_ai_trainer_config, + job_name=job_name, + task_config_uri=Uri("gs://unused/by/smoke.yaml"), + resource_config_uri=Uri("gs://unused/by/smoke.yaml"), + process_command="python -m gigl.utils.dev.tb_smoke_main", + process_runtime_args=process_runtime_args, + resource_config_wrapper=resource_wrapper, + cpu_docker_uri=args.container_uri, + cuda_docker_uri=args.container_uri, + component=GiGLComponents.Trainer, + vertex_ai_region=args.region, + ) + logger.info(f"Submitted CustomJob: {job_name}") + + # CustomJob.submit blocks until completion inside launch_single_pool_job + # (see VertexAIService._submit_job: job.wait_for_completion). Give the + # backing TensorboardExperiment a short grace period for any final RPCs. 
+ time.sleep(5) + + _verify_named_experiment( + tensorboard_resource_name=args.tensorboard, + experiment_name=args.experiment_name, + ) + _print_tb_url( + region=args.region, + project=args.project, + tensorboard_id=tb_match["tensorboard_id"], + experiment_name=args.experiment_name, + ) + return 0 + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/gigl/utils/dev/tb_smoke_main.py b/gigl/utils/dev/tb_smoke_main.py new file mode 100644 index 000000000..2d4a35807 --- /dev/null +++ b/gigl/utils/dev/tb_smoke_main.py @@ -0,0 +1,72 @@ +"""Tiny smoke-test entrypoint that exercises GiGL's TensorBoard pipeline. + +Submitted as the container command by ``submit_smoke_job.py``. Constructs a +``TensorBoardWriter`` with ``enabled=True`` (single-process smoke = always +chief), writes a few scalar events, and exits. + +Configuration is plumbed via CLI flags injected by the launcher from the +smoke script's ``process_runtime_args`` map. All three are required: + + --job_name= + --tensorboard_resource_name= + --tensorboard_experiment_name= + +This entrypoint deliberately mirrors the production trainer/inferencer call +sites in ``examples/link_prediction/`` so the smoke test exercises the same +``TensorBoardWriter.create()`` code path. 
+""" + +from __future__ import annotations + +import argparse + +from gigl.common.logger import Logger +from gigl.utils.tensorboard_writer import TensorBoardWriter + +logger = Logger() + +_NUM_STEPS = 3 + + +def _parse_args() -> argparse.Namespace: + parser = argparse.ArgumentParser(description=__doc__) + parser.add_argument( + "--job_name", + required=True, + help="Used as the TensorboardRun ID (must be unique per launch).", + ) + parser.add_argument( + "--tensorboard_resource_name", + required=True, + help="Full Vertex AI Tensorboard resource name.", + ) + parser.add_argument( + "--tensorboard_experiment_name", + required=True, + help="TensorboardExperiment ID under the resource above.", + ) + # The launcher's _build_job_config always appends --task_config_uri, + # --resource_config_uri, and (on GPU) --use_cuda. The smoke entrypoint + # doesn't need them; use parse_known_args so they don't blow up argparse. + args, _unrecognized = parser.parse_known_args() + return args + + +def main() -> None: + """Write a handful of scalar events and exit.""" + args = _parse_args() + logger.info(f"Starting tb_smoke_main; job_name={args.job_name!r}") + with TensorBoardWriter.create( + resource_name=args.tensorboard_resource_name, + experiment_name=args.tensorboard_experiment_name, + experiment_run_name=args.job_name, + enabled=True, + ) as writer: + for step in range(_NUM_STEPS): + writer.log({"smoke/value": float(step)}, step=step) + logger.info(f"Wrote smoke/value={step} at step {step}") + logger.info("tb_smoke_main complete") + + +if __name__ == "__main__": + main() diff --git a/tests/unit/src/common/custom_launcher_test.py b/tests/unit/src/common/custom_launcher_test.py index 94b3a2100..3d325cbb5 100644 --- a/tests/unit/src/common/custom_launcher_test.py +++ b/tests/unit/src/common/custom_launcher_test.py @@ -6,7 +6,11 @@ from absl.testing import absltest from gigl.common import Uri -from gigl.env.custom_launcher import ( +from gigl.common.constants import ( + 
DEFAULT_GIGL_RELEASE_SRC_IMAGE_CPU, + DEFAULT_GIGL_RELEASE_SRC_IMAGE_CUDA, +) +from gigl.env.constants import ( GIGL_APPLIED_TASK_IDENTIFIER_ENV_KEY, GIGL_COMPONENT_ENV_KEY, GIGL_CPU_DOCKER_URI_ENV_KEY, @@ -150,7 +154,9 @@ def test_dispatch_sets_gigl_env_vars(self, mock_run: MagicMock) -> None: self.assertEqual(env[GIGL_COMPONENT_ENV_KEY], "Trainer") @patch("gigl.src.common.custom_launcher.subprocess.run") - def test_dispatch_omits_optional_uris_when_none(self, mock_run: MagicMock) -> None: + def test_dispatch_defaults_optional_uris_to_release_images( + self, mock_run: MagicMock + ) -> None: config = self._build_config(command="echo") launch_custom( custom_launcher_config=config, @@ -164,49 +170,26 @@ def test_dispatch_omits_optional_uris_when_none(self, mock_run: MagicMock) -> No component=GiGLComponents.Inferencer, ) env = mock_run.call_args.kwargs["env"] - # Optional URIs must be omitted entirely (not stringified to "None" - # nor set to ""), so receivers see env.get(KEY) is None. - self.assertNotIn(GIGL_CPU_DOCKER_URI_ENV_KEY, env) - self.assertNotIn(GIGL_CUDA_DOCKER_URI_ENV_KEY, env) - # Required keys are still present. + # When the caller passes None for a docker URI, the env var + # falls back to the public release image so receivers always + # see a usable URI. + self.assertEqual( + env[GIGL_CPU_DOCKER_URI_ENV_KEY], DEFAULT_GIGL_RELEASE_SRC_IMAGE_CPU + ) + self.assertEqual( + env[GIGL_CUDA_DOCKER_URI_ENV_KEY], DEFAULT_GIGL_RELEASE_SRC_IMAGE_CUDA + ) self.assertEqual(env[GIGL_COMPONENT_ENV_KEY], "Inferencer") @patch("gigl.src.common.custom_launcher.subprocess.run") - def test_dispatch_does_not_mutate_parent_os_environ( + def test_dispatch_isolates_subprocess_env_from_parent( self, mock_run: MagicMock ) -> None: - # Pre-condition: none of the GIGL_* keys leak into the parent. 
- snapshot = dict(os.environ) - config = self._build_config(command="echo") - launch_custom( - custom_launcher_config=config, - applied_task_identifier="job", - task_config_uri=Uri("gs://bucket/task.yaml"), - resource_config_uri=Uri("gs://bucket/resource.yaml"), - process_command="echo", - process_runtime_args={}, - cpu_docker_uri="gcr.io/p/cpu:tag", - cuda_docker_uri="gcr.io/p/cuda:tag", - component=GiGLComponents.Trainer, - ) - self.assertEqual(dict(os.environ), snapshot) - for key in ( - GIGL_APPLIED_TASK_IDENTIFIER_ENV_KEY, - GIGL_TASK_CONFIG_URI_ENV_KEY, - GIGL_RESOURCE_CONFIG_URI_ENV_KEY, - GIGL_PROCESS_COMMAND_ENV_KEY, - GIGL_CPU_DOCKER_URI_ENV_KEY, - GIGL_CUDA_DOCKER_URI_ENV_KEY, - GIGL_COMPONENT_ENV_KEY, - ): - self.assertNotIn(key, os.environ) - - @patch("gigl.src.common.custom_launcher.subprocess.run") - def test_dispatch_preserves_inherited_env(self, mock_run: MagicMock) -> None: sentinel_key = "GIGL_TEST_PARENT_ENV_SENTINEL" sentinel_value = "preserved-value" try: os.environ[sentinel_key] = sentinel_value + snapshot = dict(os.environ) config = self._build_config(command="echo") launch_custom( custom_launcher_config=config, @@ -215,10 +198,24 @@ def test_dispatch_preserves_inherited_env(self, mock_run: MagicMock) -> None: resource_config_uri=Uri("gs://bucket/resource.yaml"), process_command="echo", process_runtime_args={}, - cpu_docker_uri=None, - cuda_docker_uri=None, + cpu_docker_uri="gcr.io/p/cpu:tag", + cuda_docker_uri="gcr.io/p/cuda:tag", component=GiGLComponents.Trainer, ) + # Parent os.environ is untouched; none of the GIGL_* keys + # leak into it. + self.assertEqual(dict(os.environ), snapshot) + for key in ( + GIGL_APPLIED_TASK_IDENTIFIER_ENV_KEY, + GIGL_TASK_CONFIG_URI_ENV_KEY, + GIGL_RESOURCE_CONFIG_URI_ENV_KEY, + GIGL_PROCESS_COMMAND_ENV_KEY, + GIGL_CPU_DOCKER_URI_ENV_KEY, + GIGL_CUDA_DOCKER_URI_ENV_KEY, + GIGL_COMPONENT_ENV_KEY, + ): + self.assertNotIn(key, os.environ) + # Inherited parent env entries reach the subprocess env. 
env = mock_run.call_args.kwargs["env"] self.assertEqual(env.get(sentinel_key), sentinel_value) finally: From 6591d5255bad3a815031c461a30c64317273b40e Mon Sep 17 00:00:00 2001 From: kmontemayor Date: Tue, 19 May 2026 20:42:33 +0000 Subject: [PATCH 3/4] remove --- gigl/utils/dev/__init__.py | 5 - gigl/utils/dev/submit_smoke_job.py | 258 ----------------------------- gigl/utils/dev/tb_smoke_main.py | 72 -------- 3 files changed, 335 deletions(-) delete mode 100644 gigl/utils/dev/__init__.py delete mode 100644 gigl/utils/dev/submit_smoke_job.py delete mode 100644 gigl/utils/dev/tb_smoke_main.py diff --git a/gigl/utils/dev/__init__.py b/gigl/utils/dev/__init__.py deleted file mode 100644 index 9c1bf25ab..000000000 --- a/gigl/utils/dev/__init__.py +++ /dev/null @@ -1,5 +0,0 @@ -"""Developer utilities (smoke entrypoints, ad-hoc test helpers). - -Modules under this package are intended for short, ad-hoc test jobs and -developer iteration. They are NOT part of GiGL's stable public API. -""" diff --git a/gigl/utils/dev/submit_smoke_job.py b/gigl/utils/dev/submit_smoke_job.py deleted file mode 100644 index e1eab73ef..000000000 --- a/gigl/utils/dev/submit_smoke_job.py +++ /dev/null @@ -1,258 +0,0 @@ -"""Submit a tiny Vertex AI CustomJob that exercises GiGL's TensorBoard wiring. - -Goal: <2 min from "I changed launcher / writer code" to "I see whether TB -shows up." Bypasses ConfigPopulator and the full pipeline; uses the -production launcher path (``launch_single_pool_job``) so the same submit -logic runs as in real training. - -Required CLI flags: - --project GCP project (e.g. ``external-snap-ci-github-gigl``). - --region Vertex AI region (e.g. ``us-central1``). - --service-account Service account email used by the CustomJob. - --staging-bucket Regional GCS bucket Vertex stages artifacts under. - --tensorboard Full TensorBoard resource name - (``projects/.../locations/.../tensorboards/...``). - --experiment-name Vertex AI ``TensorboardExperiment`` name. 
The - tb_smoke_main entry point will pass this and the - --tensorboard value to ``TensorBoardWriter.create``. - --container-uri Container image to use. REQUIRED — must contain the - branch under test. - -Optional: - --job-name CustomJob display name. Defaults to a timestamped - ``gigl-tb-smoke-...``. - --dry-run Print the constructed submission parameters and - exit without submitting. - -Verification: - After the CustomJob completes the script polls the TensorBoard API - surface and asserts the user-named ``TensorboardExperiment`` exists - with at least one ``TensorboardRun`` containing time series data. - - The TB UI URL is printed for manual inspection. -""" - -from __future__ import annotations - -import argparse -import datetime -import re -import sys -import time - -from google.cloud import aiplatform - -from gigl.common import Uri -from gigl.common.logger import Logger -from gigl.src.common.constants.components import GiGLComponents -from gigl.src.common.types.pb_wrappers.gigl_resource_config import ( - GiglResourceConfigWrapper, -) -from gigl.src.common.vertex_ai_launcher import launch_single_pool_job -from snapchat.research.gbml import gigl_resource_config_pb2 - -logger = Logger() - -_TENSORBOARD_RESOURCE_NAME_PATTERN = re.compile( - r"^projects/(?P[^/]+)" - r"/locations/(?P[^/]+)" - r"/tensorboards/(?P[^/]+)$" -) - - -def _parse_args() -> argparse.Namespace: - parser = argparse.ArgumentParser(description=__doc__) - parser.add_argument("--project", required=True) - parser.add_argument("--region", required=True) - parser.add_argument("--service-account", required=True) - parser.add_argument( - "--staging-bucket", - required=True, - help="Regional GCS bucket (e.g. gs://gigl-cicd-temp).", - ) - parser.add_argument( - "--tensorboard", - required=True, - help="Full TensorBoard resource name.", - ) - parser.add_argument( - "--experiment-name", - required=True, - help=( - "TensorboardExperiment name. 
Passed to tb_smoke_main, which " - "creates the run under this experiment." - ), - ) - parser.add_argument( - "--container-uri", - required=True, - help=( - "Container image with the branch code. Required; pointing at a " - "released image would test stale code." - ), - ) - parser.add_argument("--job-name", default=None) - parser.add_argument("--dry-run", action="store_true") - return parser.parse_args() - - -def _build_resource_config( - *, - project: str, - region: str, - service_account: str, - staging_bucket: str, -) -> gigl_resource_config_pb2.GiglResourceConfig: - """Minimal GiglResourceConfig wired for a 1-replica CPU CustomJob.""" - common = gigl_resource_config_pb2.SharedResourceConfig.CommonComputeConfig( - project=project, - region=region, - temp_regional_assets_bucket=staging_bucket, - temp_assets_bucket=staging_bucket, - perm_assets_bucket=staging_bucket, - temp_assets_bq_dataset_name="not_used_by_smoke", - embedding_bq_dataset_name="not_used_by_smoke", - gcp_service_account_email=service_account, - dataflow_runner="DataflowRunner", - ) - shared = gigl_resource_config_pb2.SharedResourceConfig( - common_compute_config=common, - resource_labels={"cost_resource_group": "gigl_dev_smoke"}, - ) - trainer = gigl_resource_config_pb2.VertexAiResourceConfig( - # n1-standard-2 is rejected by Vertex AI; n1-standard-16 is the - # smallest spec we've confirmed accepted in dev. 
- machine_type="n1-standard-16", - gpu_type="ACCELERATOR_TYPE_UNSPECIFIED", - gpu_limit=0, - num_replicas=1, - timeout=600, - ) - return gigl_resource_config_pb2.GiglResourceConfig( - shared_resource_config=shared, - trainer_resource_config=gigl_resource_config_pb2.TrainerResourceConfig( - vertex_ai_trainer_config=trainer, - ), - ) - - -def _verify_named_experiment( - *, - tensorboard_resource_name: str, - experiment_name: str, -) -> None: - """Confirm the chief-rank writer ingested events into the named experiment.""" - experiment_resource_name = ( - f"{tensorboard_resource_name}/experiments/{experiment_name}" - ) - runs = aiplatform.TensorboardRun.list( - tensorboard_experiment_name=experiment_resource_name, - ) - if not runs: - raise RuntimeError( - f"Named TensorboardExperiment {experiment_resource_name} has no " - "TensorboardRuns; the writer did not ingest events." - ) - for run in runs: - time_series = aiplatform.TensorboardTimeSeries.list( - tensorboard_run_name=run.resource_name, - ) - if not time_series: - raise RuntimeError( - f"Run {run.resource_name} has no TensorboardTimeSeries; " - "events did not reach the API." - ) - run_names = sorted(r.display_name for r in runs) - logger.info( - f"Named experiment OK: {len(runs)} run(s) under {experiment_resource_name}: " - f"{run_names}" - ) - - -def _print_tb_url( - *, - region: str, - project: str, - tensorboard_id: str, - experiment_name: str, -) -> None: - base = f"https://{region}.tensorboard.googleusercontent.com/experiment" - qualifier = f"projects+{project}+locations+{region}+tensorboards+{tensorboard_id}" - named = f"{base}/{qualifier}+experiments+{experiment_name}" - logger.info(f"Named TB URL: {named}") - - -def main() -> int: - args = _parse_args() - - tb_match = _TENSORBOARD_RESOURCE_NAME_PATTERN.match(args.tensorboard) - if not tb_match: - logger.error( - f"--tensorboard must be projects/.../locations/.../tensorboards/...; " - f"got {args.tensorboard!r}." 
- ) - return 2 - - timestamp = datetime.datetime.utcnow().strftime("%Y%m%d-%H%M%S") - job_name = args.job_name or f"gigl-tb-smoke-{timestamp}" - - resource_config = _build_resource_config( - project=args.project, - region=args.region, - service_account=args.service_account, - staging_bucket=args.staging_bucket, - ) - resource_wrapper = GiglResourceConfigWrapper(resource_config=resource_config) - - process_runtime_args = { - "tensorboard_resource_name": args.tensorboard, - "tensorboard_experiment_name": args.experiment_name, - } - - if args.dry_run: - logger.info( - "Dry run — would submit a CustomJob with:\n" - f" job_name = {job_name}\n" - f" container_uri = {args.container_uri}\n" - f" tensorboard_resource = {args.tensorboard}\n" - f" experiment_name = {args.experiment_name!r}\n" - f" process_runtime_args = {process_runtime_args}\n" - ) - return 0 - - aiplatform.init(project=args.project, location=args.region) - launch_single_pool_job( - vertex_ai_resource_config=resource_config.trainer_resource_config.vertex_ai_trainer_config, - job_name=job_name, - task_config_uri=Uri("gs://unused/by/smoke.yaml"), - resource_config_uri=Uri("gs://unused/by/smoke.yaml"), - process_command="python -m gigl.utils.dev.tb_smoke_main", - process_runtime_args=process_runtime_args, - resource_config_wrapper=resource_wrapper, - cpu_docker_uri=args.container_uri, - cuda_docker_uri=args.container_uri, - component=GiGLComponents.Trainer, - vertex_ai_region=args.region, - ) - logger.info(f"Submitted CustomJob: {job_name}") - - # CustomJob.submit blocks until completion inside launch_single_pool_job - # (see VertexAIService._submit_job: job.wait_for_completion). Give the - # backing TensorboardExperiment a short grace period for any final RPCs. 
- time.sleep(5) - - _verify_named_experiment( - tensorboard_resource_name=args.tensorboard, - experiment_name=args.experiment_name, - ) - _print_tb_url( - region=args.region, - project=args.project, - tensorboard_id=tb_match["tensorboard_id"], - experiment_name=args.experiment_name, - ) - return 0 - - -if __name__ == "__main__": - sys.exit(main()) diff --git a/gigl/utils/dev/tb_smoke_main.py b/gigl/utils/dev/tb_smoke_main.py deleted file mode 100644 index 2d4a35807..000000000 --- a/gigl/utils/dev/tb_smoke_main.py +++ /dev/null @@ -1,72 +0,0 @@ -"""Tiny smoke-test entrypoint that exercises GiGL's TensorBoard pipeline. - -Submitted as the container command by ``submit_smoke_job.py``. Constructs a -``TensorBoardWriter`` with ``enabled=True`` (single-process smoke = always -chief), writes a few scalar events, and exits. - -Configuration is plumbed via CLI flags injected by the launcher from the -smoke script's ``process_runtime_args`` map. All three are required: - - --job_name= - --tensorboard_resource_name= - --tensorboard_experiment_name= - -This entrypoint deliberately mirrors the production trainer/inferencer call -sites in ``examples/link_prediction/`` so the smoke test exercises the same -``TensorBoardWriter.create()`` code path. 
-""" - -from __future__ import annotations - -import argparse - -from gigl.common.logger import Logger -from gigl.utils.tensorboard_writer import TensorBoardWriter - -logger = Logger() - -_NUM_STEPS = 3 - - -def _parse_args() -> argparse.Namespace: - parser = argparse.ArgumentParser(description=__doc__) - parser.add_argument( - "--job_name", - required=True, - help="Used as the TensorboardRun ID (must be unique per launch).", - ) - parser.add_argument( - "--tensorboard_resource_name", - required=True, - help="Full Vertex AI Tensorboard resource name.", - ) - parser.add_argument( - "--tensorboard_experiment_name", - required=True, - help="TensorboardExperiment ID under the resource above.", - ) - # The launcher's _build_job_config always appends --task_config_uri, - # --resource_config_uri, and (on GPU) --use_cuda. The smoke entrypoint - # doesn't need them; use parse_known_args so they don't blow up argparse. - args, _unrecognized = parser.parse_known_args() - return args - - -def main() -> None: - """Write a handful of scalar events and exit.""" - args = _parse_args() - logger.info(f"Starting tb_smoke_main; job_name={args.job_name!r}") - with TensorBoardWriter.create( - resource_name=args.tensorboard_resource_name, - experiment_name=args.tensorboard_experiment_name, - experiment_run_name=args.job_name, - enabled=True, - ) as writer: - for step in range(_NUM_STEPS): - writer.log({"smoke/value": float(step)}, step=step) - logger.info(f"Wrote smoke/value={step} at step {step}") - logger.info("tb_smoke_main complete") - - -if __name__ == "__main__": - main() From ab83373cde16dd8b74f4d897ec0ff4a13c8b6d3d Mon Sep 17 00:00:00 2001 From: kmontemayor Date: Tue, 19 May 2026 22:05:33 +0000 Subject: [PATCH 4/4] update --- gigl/env/constants.py | 1 - gigl/src/common/custom_launcher.py | 10 ---------- tests/unit/src/common/custom_launcher_test.py | 17 ----------------- 3 files changed, 28 deletions(-) diff --git a/gigl/env/constants.py b/gigl/env/constants.py index 
21eb4c223..13686c64d 100644 --- a/gigl/env/constants.py +++ b/gigl/env/constants.py @@ -10,7 +10,6 @@ GIGL_APPLIED_TASK_IDENTIFIER_ENV_KEY: Final[str] = "GIGL_APPLIED_TASK_IDENTIFIER" GIGL_TASK_CONFIG_URI_ENV_KEY: Final[str] = "GIGL_TASK_CONFIG_URI" GIGL_RESOURCE_CONFIG_URI_ENV_KEY: Final[str] = "GIGL_RESOURCE_CONFIG_URI" -GIGL_PROCESS_COMMAND_ENV_KEY: Final[str] = "GIGL_PROCESS_COMMAND" GIGL_CPU_DOCKER_URI_ENV_KEY: Final[str] = "GIGL_CPU_DOCKER_URI" GIGL_CUDA_DOCKER_URI_ENV_KEY: Final[str] = "GIGL_CUDA_DOCKER_URI" GIGL_COMPONENT_ENV_KEY: Final[str] = "GIGL_COMPONENT" diff --git a/gigl/src/common/custom_launcher.py b/gigl/src/common/custom_launcher.py index 08770cd77..ebb79b4bd 100644 --- a/gigl/src/common/custom_launcher.py +++ b/gigl/src/common/custom_launcher.py @@ -24,7 +24,6 @@ import os import shlex import subprocess -from collections.abc import Mapping from typing import Optional from gigl.common import Uri @@ -38,7 +37,6 @@ GIGL_COMPONENT_ENV_KEY, GIGL_CPU_DOCKER_URI_ENV_KEY, GIGL_CUDA_DOCKER_URI_ENV_KEY, - GIGL_PROCESS_COMMAND_ENV_KEY, GIGL_RESOURCE_CONFIG_URI_ENV_KEY, GIGL_TASK_CONFIG_URI_ENV_KEY, ) @@ -57,8 +55,6 @@ def launch_custom( applied_task_identifier: str, task_config_uri: Uri, resource_config_uri: Uri, - process_command: str, - process_runtime_args: Mapping[str, str], cpu_docker_uri: Optional[str], cuda_docker_uri: Optional[str], component: GiGLComponents, @@ -94,11 +90,6 @@ def launch_custom( ``GIGL_TASK_CONFIG_URI`` (stringified). resource_config_uri: Exported to the subprocess as ``GIGL_RESOURCE_CONFIG_URI`` (stringified). - process_command: Exported to the subprocess as - ``GIGL_PROCESS_COMMAND``. - process_runtime_args: Accepted for API symmetry with the - GLT-side Vertex AI launchers but not currently exported — - there is no clean single-env-var encoding for a dict. cpu_docker_uri: Exported as ``GIGL_CPU_DOCKER_URI``. Falls back to ``DEFAULT_GIGL_RELEASE_SRC_IMAGE_CPU`` when ``None``. cuda_docker_uri: Exported as ``GIGL_CUDA_DOCKER_URI``. 
Falls @@ -126,7 +117,6 @@ def launch_custom( env[GIGL_APPLIED_TASK_IDENTIFIER_ENV_KEY] = applied_task_identifier env[GIGL_TASK_CONFIG_URI_ENV_KEY] = str(task_config_uri) env[GIGL_RESOURCE_CONFIG_URI_ENV_KEY] = str(resource_config_uri) - env[GIGL_PROCESS_COMMAND_ENV_KEY] = process_command env[GIGL_COMPONENT_ENV_KEY] = component.name env[GIGL_CPU_DOCKER_URI_ENV_KEY] = ( cpu_docker_uri or DEFAULT_GIGL_RELEASE_SRC_IMAGE_CPU diff --git a/tests/unit/src/common/custom_launcher_test.py b/tests/unit/src/common/custom_launcher_test.py index 3d325cbb5..05aae14b7 100644 --- a/tests/unit/src/common/custom_launcher_test.py +++ b/tests/unit/src/common/custom_launcher_test.py @@ -15,7 +15,6 @@ GIGL_COMPONENT_ENV_KEY, GIGL_CPU_DOCKER_URI_ENV_KEY, GIGL_CUDA_DOCKER_URI_ENV_KEY, - GIGL_PROCESS_COMMAND_ENV_KEY, GIGL_RESOURCE_CONFIG_URI_ENV_KEY, GIGL_TASK_CONFIG_URI_ENV_KEY, ) @@ -57,8 +56,6 @@ def test_dispatches_subprocess_with_literal_command_and_args( applied_task_identifier="job-42", task_config_uri=Uri("gs://bucket/task.yaml"), resource_config_uri=Uri("gs://bucket/resource.yaml"), - process_command="ignored", - process_runtime_args={"ignored": "v"}, cpu_docker_uri="gcr.io/p/cpu:tag", cuda_docker_uri="gcr.io/p/cuda:tag", component=GiGLComponents.Trainer, @@ -82,8 +79,6 @@ def test_empty_command_raises_value_error(self, mock_run: MagicMock) -> None: applied_task_identifier="job", task_config_uri=Uri("gs://bucket/task.yaml"), resource_config_uri=Uri("gs://bucket/resource.yaml"), - process_command="", - process_runtime_args={}, cpu_docker_uri=None, cuda_docker_uri=None, component=GiGLComponents.Trainer, @@ -99,8 +94,6 @@ def test_invalid_component_raises_value_error(self, mock_run: MagicMock) -> None applied_task_identifier="job", task_config_uri=Uri("gs://bucket/task.yaml"), resource_config_uri=Uri("gs://bucket/resource.yaml"), - process_command="echo 'hello, world!", - process_runtime_args={}, cpu_docker_uri=None, cuda_docker_uri=None, component=GiGLComponents.DataPreprocessor, @@ 
-115,8 +108,6 @@ def test_args_with_spaces_are_shell_quoted(self, mock_run: MagicMock) -> None: applied_task_identifier="job", task_config_uri=Uri("gs://bucket/task.yaml"), resource_config_uri=Uri("gs://bucket/resource.yaml"), - process_command="", - process_runtime_args={}, cpu_docker_uri=None, cuda_docker_uri=None, component=GiGLComponents.Trainer, @@ -135,8 +126,6 @@ def test_dispatch_sets_gigl_env_vars(self, mock_run: MagicMock) -> None: applied_task_identifier="job-42", task_config_uri=Uri("gs://bucket/task.yaml"), resource_config_uri=Uri("gs://bucket/resource.yaml"), - process_command="python -m my.cli", - process_runtime_args={}, cpu_docker_uri="gcr.io/p/cpu:tag", cuda_docker_uri="gcr.io/p/cuda:tag", component=GiGLComponents.Trainer, @@ -147,7 +136,6 @@ def test_dispatch_sets_gigl_env_vars(self, mock_run: MagicMock) -> None: self.assertEqual( env[GIGL_RESOURCE_CONFIG_URI_ENV_KEY], "gs://bucket/resource.yaml" ) - self.assertEqual(env[GIGL_PROCESS_COMMAND_ENV_KEY], "python -m my.cli") self.assertEqual(env[GIGL_CPU_DOCKER_URI_ENV_KEY], "gcr.io/p/cpu:tag") self.assertEqual(env[GIGL_CUDA_DOCKER_URI_ENV_KEY], "gcr.io/p/cuda:tag") # component is exported via .name (the enum member identifier). 
@@ -163,8 +151,6 @@ def test_dispatch_defaults_optional_uris_to_release_images( applied_task_identifier="job", task_config_uri=Uri("gs://bucket/task.yaml"), resource_config_uri=Uri("gs://bucket/resource.yaml"), - process_command="echo", - process_runtime_args={}, cpu_docker_uri=None, cuda_docker_uri=None, component=GiGLComponents.Inferencer, @@ -196,8 +182,6 @@ def test_dispatch_isolates_subprocess_env_from_parent( applied_task_identifier="job", task_config_uri=Uri("gs://bucket/task.yaml"), resource_config_uri=Uri("gs://bucket/resource.yaml"), - process_command="echo", - process_runtime_args={}, cpu_docker_uri="gcr.io/p/cpu:tag", cuda_docker_uri="gcr.io/p/cuda:tag", component=GiGLComponents.Trainer, @@ -209,7 +193,6 @@ def test_dispatch_isolates_subprocess_env_from_parent( GIGL_APPLIED_TASK_IDENTIFIER_ENV_KEY, GIGL_TASK_CONFIG_URI_ENV_KEY, GIGL_RESOURCE_CONFIG_URI_ENV_KEY, - GIGL_PROCESS_COMMAND_ENV_KEY, GIGL_CPU_DOCKER_URI_ENV_KEY, GIGL_CUDA_DOCKER_URI_ENV_KEY, GIGL_COMPONENT_ENV_KEY,