Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
17 changes: 17 additions & 0 deletions gigl/src/validation_check/config_validator.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,13 +16,15 @@
assert_trained_model_exists,
)
from gigl.src.validation_check.libs.gbml_and_resource_config_compatibility_checks import (
check_custom_launcher_config_requires_glt_backend,
check_inferencer_graph_store_compatibility,
check_trainer_graph_store_compatibility,
)
from gigl.src.validation_check.libs.name_checks import (
check_if_kfp_pipeline_job_name_valid,
)
from gigl.src.validation_check.libs.resource_config_checks import (
check_custom_launcher_config_shape,
check_if_inferencer_resource_config_valid,
check_if_preprocessor_resource_config_valid,
check_if_shared_resource_config_valid,
Expand Down Expand Up @@ -202,25 +204,31 @@
GiGLComponents.ConfigPopulator.value: [
check_trainer_graph_store_compatibility,
check_inferencer_graph_store_compatibility,
check_custom_launcher_config_requires_glt_backend,
],
GiGLComponents.DataPreprocessor.value: [
check_trainer_graph_store_compatibility,
check_inferencer_graph_store_compatibility,
check_custom_launcher_config_requires_glt_backend,
],
GiGLComponents.SubgraphSampler.value: [
check_trainer_graph_store_compatibility,
check_inferencer_graph_store_compatibility,
check_custom_launcher_config_requires_glt_backend,
],
GiGLComponents.SplitGenerator.value: [
check_trainer_graph_store_compatibility,
check_inferencer_graph_store_compatibility,
check_custom_launcher_config_requires_glt_backend,
],
GiGLComponents.Trainer.value: [
check_trainer_graph_store_compatibility,
check_inferencer_graph_store_compatibility,
check_custom_launcher_config_requires_glt_backend,
],
GiGLComponents.Inferencer.value: [
check_inferencer_graph_store_compatibility,
check_custom_launcher_config_requires_glt_backend,
],
# PostProcessor doesn't need graph store compatibility checks
}
Expand Down Expand Up @@ -347,6 +355,15 @@ def kfp_validation_checks(
resource_config_wrapper=resource_config_wrapper,
)

# Validate any populated CustomLauncherConfig has a non-empty command.
# Unconditional — the check is shape-only and does not call out to any
# external service.
for component in (GiGLComponents.Trainer, GiGLComponents.Inferencer):
Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

we already have an existing pattern to check for these in resource_config_checks - lets adopt here.

check_custom_launcher_config_shape(
resource_config_pb=resource_config_pb,
component=component,
)

# check if trained model file exist when skipping training
if gbml_config_pb.shared_config.should_skip_training == True:
assert_trained_model_exists(gbml_config_pb=gbml_config_pb)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -135,3 +135,63 @@ def check_inferencer_graph_store_compatibility(
raise AssertionError(
f"If one of GbmlConfig.inferencer_config.graph_store_storage_config or GiglResourceConfig.inferencer_resource_config is set, the other must also be set. GbmlConfig.inferencer_config.graph_store_storage_config is set: {gbml_has_graph_store}, GiglResourceConfig.inferencer_resource_config is set: {resource_has_graph_store}."
)


def check_custom_launcher_config_requires_glt_backend(
gbml_config_pb_wrapper: GbmlConfigPbWrapper,
resource_config_wrapper: GiglResourceConfigWrapper,
) -> None:
"""Enforce that ``CustomLauncherConfig`` is only used with the GLT (v2) backend.

The v1 trainer/inferencer dispatchers never consult the
``custom_trainer_config`` / ``custom_inferencer_config`` oneof, so pairing
a ``CustomLauncherConfig`` with a task config that has
``should_use_glt_backend=False`` would silently fall through the v1 path
and fail at runtime. Catch it up-front here so the failure is loud and
actionable at validation time.
Comment on lines +146 to +151
Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Lets update the agent generated comments/docs so that they are meant for future readers and not just the author of the PR?

Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Probably the value error raised below is self documenting this function - this comment seems to verbose to add value.


Note on naming: the wrapper exposes ``should_use_glt_backend`` (bool) but
the raw YAML key users set is ``feature_flags.should_run_glt_backend``.
The wrapper translates one into the other; this check always reads the
wrapper property and never the raw map.

Args:
gbml_config_pb_wrapper: The GbmlConfig wrapper (template config).
resource_config_wrapper: The GiglResourceConfig wrapper (resource config).

Raises:
ValueError: If either the trainer or inferencer resource config is a
``CustomLauncherConfig`` and ``should_use_glt_backend`` is False.
"""
logger.info(
"Config validation check: CustomLauncherConfig requires GLT (v2) backend."
)
trainer_is_custom = isinstance(
resource_config_wrapper.trainer_config,
gigl_resource_config_pb2.CustomLauncherConfig,
)
inferencer_is_custom = isinstance(
resource_config_wrapper.inferencer_config,
gigl_resource_config_pb2.CustomLauncherConfig,
)
if not (trainer_is_custom or inferencer_is_custom):
return

if not gbml_config_pb_wrapper.should_use_glt_backend:
offending: list[str] = []
if trainer_is_custom:
offending.append("trainer_resource_config.custom_trainer_config")
if inferencer_is_custom:
offending.append("inferencer_resource_config.custom_inferencer_config")
raise ValueError(
"CustomLauncherConfig is only wired into the GLT (v2) dispatchers "
"(glt_trainer.py / glt_inferencer.py); the v1 trainer/inferencer "
Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

v1 and v2 may be very arbritary to reader, and this message is too verbose and doesn't get to the point fast enough - a more useful message (that also simplifies above logic):

raise ValueError("""
CustomLauncherConfig can only be used w/ GLT backend. Detected 
trainer_resource_config using custom implementation: {trainer_is_custom} ; 
inference_resource_config using custom implementation: {inference_is_custom}. 
Ensure feature_flags.should_run_glt_backend is set to 'True' in the task config.
""")

"never consult the custom oneof and would fall through to an "
"'Unsupported resource config' error at runtime. The following "
f"custom resource configs were set: {offending}, but the task "
"config has should_use_glt_backend=False (raw YAML key: "
"feature_flags.should_run_glt_backend). Either set "
"feature_flags.should_run_glt_backend='True' in the task config, "
"or replace the CustomLauncherConfig with a built-in resource "
"config."
)
52 changes: 52 additions & 0 deletions gigl/src/validation_check/libs/resource_config_checks.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
from google.cloud.aiplatform_v1.types.accelerator_type import AcceleratorType

from gigl.common.logger import Logger
from gigl.src.common.constants.components import GiGLComponents
from gigl.src.common.types.pb_wrappers.gbml_config import GbmlConfigPbWrapper
from gigl.src.common.types.pb_wrappers.gigl_resource_config import (
GiglResourceConfigWrapper,
Expand Down Expand Up @@ -312,6 +313,57 @@ def _validate_machine_config(
)


def check_custom_launcher_config_shape(
resource_config_pb: gigl_resource_config_pb2.GiglResourceConfig,
component: GiGLComponents,
) -> None:
"""Assert the trainer / inferencer's ``CustomLauncherConfig`` is well-shaped.

Resolves the component's resource config through the wrapper; if it is not
a ``CustomLauncherConfig`` this helper is a no-op. Otherwise it asserts
``command`` is non-empty so the launcher's runtime guard does not fail
on a typo in the YAML.

Args:
resource_config_pb: The resource config to inspect. The trainer or
inferencer oneof (depending on ``component``) is pulled out of the
wrapper.
component: Which GiGL component to check. Must be Trainer or
Inferencer; other components never carry a ``CustomLauncherConfig``.

Raises:
ValueError: If ``component`` is not Trainer or Inferencer, or if a
populated ``CustomLauncherConfig`` has an empty ``command``.
"""
if component not in {GiGLComponents.Trainer, GiGLComponents.Inferencer}:
raise ValueError(
f"check_custom_launcher_config_shape only supports "
f"Trainer and Inferencer components; got {component}."
)

wrapper = GiglResourceConfigWrapper(resource_config=resource_config_pb)
Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

the caller already has access to the wrapper - why not just pass that through?

component_config: Union[
gigl_resource_config_pb2.LocalResourceConfig,
gigl_resource_config_pb2.VertexAiResourceConfig,
gigl_resource_config_pb2.KFPResourceConfig,
gigl_resource_config_pb2.VertexAiGraphStoreConfig,
gigl_resource_config_pb2.DataflowResourceConfig,
gigl_resource_config_pb2.CustomLauncherConfig,
]
if component == GiGLComponents.Trainer:
component_config = wrapper.trainer_config
else:
component_config = wrapper.inferencer_config

if not isinstance(component_config, gigl_resource_config_pb2.CustomLauncherConfig):
return

if not component_config.command:
raise ValueError(
f"CustomLauncherConfig.command must be set for {component.value}."
)


def check_if_trainer_graph_store_storage_command_valid(
gbml_config_pb_wrapper: GbmlConfigPbWrapper,
) -> None:
Expand Down
56 changes: 56 additions & 0 deletions tests/unit/src/validation/config_validator_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -352,5 +352,61 @@ def test_resource_config_validation_failure_with_mock_configs(
)


class TestCustomLauncherConfigInValidationChain(TestCase):
Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This feels like a weird test to add.
i.e. (1) ensure that some function is calling some other function - (2) and do that using some knowledge of the error message that should be propagated.

If we start adopting this test pattern even if we just adopt (1); what is the stopping point? Our tests would be too verbose and would not be valuable. If instead you wanted to migrate our unit tests introduced above to an order higher i.e. here - that would be valuable. But honestly, I would prefer not having this at all unless its a core library / performance functionality.

"""Confirm the new CustomLauncherConfig gates are wired into the chain.

The shape check (``check_custom_launcher_config_shape``) runs
unconditionally inside ``kfp_validation_checks``. A resource config
with a ``CustomLauncherConfig`` trainer that has an empty ``command``
must surface a ``ValueError`` from the chain, not a launcher-side
runtime failure.
"""

def setUp(self):
gigl.env.pipelines_config._resource_config = None
self._temp_dir = tempfile.mkdtemp()
self._proto_utils = ProtoUtils()

def tearDown(self):
shutil.rmtree(self._temp_dir, ignore_errors=True)
gigl.env.pipelines_config._resource_config = None

def _write_proto_to_file(
self, proto: google.protobuf.message.Message, filename: str
) -> Uri:
filepath = os.path.join(self._temp_dir, filename)
uri = UriFactory.create_uri(filepath)
self._proto_utils.write_proto_to_yaml(proto, uri)
return uri

def test_empty_custom_trainer_command_raises_via_chain(self) -> None:
# Build a live-SGS task config (so the chain runs through
# CustomLauncherConfig-aware paths).
task_config_uri = self._write_proto_to_file(
_create_valid_live_subgraph_sampling_task_config(),
"live_task_config.yaml",
)

# Resource config with a CustomLauncherConfig trainer whose
# command is empty — the new shape check must catch it.
resource_config = _create_valid_live_subgraph_sampling_resource_config()
resource_config.trainer_resource_config.ClearField("trainer_config")
resource_config.trainer_resource_config.custom_trainer_config.command = ""
resource_config_uri = self._write_proto_to_file(
resource_config, "live_custom_empty_resource_config.yaml"
)

with self.assertRaises(ValueError) as ctx:
kfp_validation_checks(
job_name="custom_launcher_config_chain_test",
task_config_uri=task_config_uri,
start_at="config_populator",
resource_config_uri=resource_config_uri,
)
# Error message must come from check_custom_launcher_config_shape,
# not from a generic shape check elsewhere.
self.assertIn("CustomLauncherConfig.command", str(ctx.exception))


if __name__ == "__main__":
absltest.main()
Original file line number Diff line number Diff line change
Expand Up @@ -5,12 +5,18 @@
GiglResourceConfigWrapper,
)
from gigl.src.validation_check.libs.gbml_and_resource_config_compatibility_checks import (
check_custom_launcher_config_requires_glt_backend,
check_inferencer_graph_store_compatibility,
check_trainer_graph_store_compatibility,
)
from snapchat.research.gbml import gbml_config_pb2, gigl_resource_config_pb2
from tests.test_assets.test_case import TestCase

# Placeholder shell snippet used by CustomLauncherConfig fixtures in this
# module — these tests only exercise type-of-config dispatch, not actual
# subprocess execution.
_FAKE_COMMAND = "echo fake"

# Helper functions for creating VertexAiGraphStoreConfig


Expand Down Expand Up @@ -214,5 +220,107 @@ def test_resource_has_inferencer_graph_store_template_does_not(self):
)


# Helper functions for custom + glt-backend compatibility tests


def _create_gbml_config_with_glt_flag(value: str) -> GbmlConfigPbWrapper:
"""Create a GbmlConfig whose feature_flags.should_run_glt_backend is set.

Note the raw YAML key is ``should_run_glt_backend`` (not
``should_use_glt_backend``). The wrapper's ``should_use_glt_backend``
property reads this key from the ``feature_flags`` map and converts it to
a bool.
"""
gbml_config = gbml_config_pb2.GbmlConfig()
gbml_config.feature_flags["should_run_glt_backend"] = value
return GbmlConfigPbWrapper(gbml_config_pb=gbml_config)


def _create_resource_config_with_custom_trainer() -> GiglResourceConfigWrapper:
"""Create a GiglResourceConfig whose trainer is a CustomLauncherConfig."""
config = gigl_resource_config_pb2.GiglResourceConfig()
_create_shared_resource_config(config)
config.trainer_resource_config.custom_trainer_config.command = _FAKE_COMMAND
# Inferencer uses a built-in config so only the trainer path is custom.
config.inferencer_resource_config.vertex_ai_inferencer_config.CopyFrom(
_create_vertex_ai_resource_config()
)
return GiglResourceConfigWrapper(resource_config=config)


def _create_resource_config_with_custom_inferencer() -> GiglResourceConfigWrapper:
"""Create a GiglResourceConfig whose inferencer is a CustomLauncherConfig."""
config = gigl_resource_config_pb2.GiglResourceConfig()
_create_shared_resource_config(config)
config.trainer_resource_config.vertex_ai_trainer_config.CopyFrom(
_create_vertex_ai_resource_config()
)
config.inferencer_resource_config.custom_inferencer_config.command = _FAKE_COMMAND
return GiglResourceConfigWrapper(resource_config=config)


class TestCustomLauncherConfigRequiresGltBackend(TestCase):
"""Test suite for the CustomLauncherConfig + GLT-backend compatibility guard.

Because v1 trainer/inferencer dispatchers don't consult the custom oneof,
pairing a ``CustomLauncherConfig`` with
``feature_flags.should_run_glt_backend = "False"`` must be caught at
validation time rather than at runtime.
"""

def test_custom_trainer_without_glt_raises(self):
"""CustomLauncherConfig trainer + glt=False raises a clear ValueError."""
gbml_config = _create_gbml_config_with_glt_flag("False")
Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

nit: We are writing a function here to save 1 line of code but introducing a layer of indirection as a result.

gbml_config = gbml_config_pb2.GbmlConfig()
gbml_config.feature_flags["should_run_glt_backend"] = False

much more readable.

resource_config = _create_resource_config_with_custom_trainer()
with self.assertRaises(ValueError) as ctx:
check_custom_launcher_config_requires_glt_backend(
gbml_config_pb_wrapper=gbml_config,
resource_config_wrapper=resource_config,
)
self.assertIn("should_run_glt_backend", str(ctx.exception))
self.assertIn("custom_trainer_config", str(ctx.exception))
Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Doing string matching in the error message feels too verbose to be helpful - unless you were testing on the error message creation logic itself, which it doesnt seem like we are.


def test_custom_inferencer_without_glt_raises(self):
"""CustomLauncherConfig inferencer + glt=False raises a clear ValueError."""
gbml_config = _create_gbml_config_with_glt_flag("False")
resource_config = _create_resource_config_with_custom_inferencer()
with self.assertRaises(ValueError) as ctx:
check_custom_launcher_config_requires_glt_backend(
gbml_config_pb_wrapper=gbml_config,
resource_config_wrapper=resource_config,
)
self.assertIn("custom_inferencer_config", str(ctx.exception))

def test_custom_trainer_with_glt_passes(self):
"""CustomLauncherConfig trainer + glt=True passes validation."""
gbml_config = _create_gbml_config_with_glt_flag("True")
resource_config = _create_resource_config_with_custom_trainer()
# Should not raise any exception
check_custom_launcher_config_requires_glt_backend(
gbml_config_pb_wrapper=gbml_config,
resource_config_wrapper=resource_config,
)

def test_custom_inferencer_with_glt_passes(self):
"""CustomLauncherConfig inferencer + glt=True passes validation."""
gbml_config = _create_gbml_config_with_glt_flag("True")
resource_config = _create_resource_config_with_custom_inferencer()
# Should not raise any exception
check_custom_launcher_config_requires_glt_backend(
gbml_config_pb_wrapper=gbml_config,
resource_config_wrapper=resource_config,
)

def test_no_custom_config_passes_without_glt(self):
"""No CustomLauncherConfig at all passes regardless of glt flag."""
gbml_config = _create_gbml_config_with_glt_flag("False")
resource_config = _create_resource_config_without_graph_stores()
# Should not raise any exception: no custom oneof means nothing to enforce.
check_custom_launcher_config_requires_glt_backend(
gbml_config_pb_wrapper=gbml_config,
resource_config_wrapper=resource_config,
)


if __name__ == "__main__":
absltest.main()
Loading