-
Notifications
You must be signed in to change notification settings - Fork 15
Validate CustomResourceConfig shape + GLT-backend compatibility #627
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
2b69b8f
ddbafc2
0425547
066c473
5742480
b606b28
844c434
937276f
d0fb63f
a3b8e82
f23fb44
4d583d4
009a7af
7d5ddf0
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -135,3 +135,63 @@ def check_inferencer_graph_store_compatibility( | |
| raise AssertionError( | ||
| f"If one of GbmlConfig.inferencer_config.graph_store_storage_config or GiglResourceConfig.inferencer_resource_config is set, the other must also be set. GbmlConfig.inferencer_config.graph_store_storage_config is set: {gbml_has_graph_store}, GiglResourceConfig.inferencer_resource_config is set: {resource_has_graph_store}." | ||
| ) | ||
|
|
||
|
|
||
| def check_custom_launcher_config_requires_glt_backend( | ||
| gbml_config_pb_wrapper: GbmlConfigPbWrapper, | ||
| resource_config_wrapper: GiglResourceConfigWrapper, | ||
| ) -> None: | ||
| """Enforce that ``CustomLauncherConfig`` is only used with the GLT (v2) backend. | ||
|
|
||
| The v1 trainer/inferencer dispatchers never consult the | ||
| ``custom_trainer_config`` / ``custom_inferencer_config`` oneof, so pairing | ||
| a ``CustomLauncherConfig`` with a task config that has | ||
| ``should_use_glt_backend=False`` would silently fall through the v1 path | ||
| and fail at runtime. Catch it up-front here so the failure is loud and | ||
| actionable at validation time. | ||
|
Comment on lines
+146
to
+151
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Lets update the agent generated comments/docs so that they are meant for future readers and not just the author of the PR?
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Probably the value error raised below is self documenting this function - this comment seems to verbose to add value. |
||
|
|
||
| Note on naming: the wrapper exposes ``should_use_glt_backend`` (bool) but | ||
| the raw YAML key users set is ``feature_flags.should_run_glt_backend``. | ||
| The wrapper translates one into the other; this check always reads the | ||
| wrapper property and never the raw map. | ||
|
|
||
| Args: | ||
| gbml_config_pb_wrapper: The GbmlConfig wrapper (template config). | ||
| resource_config_wrapper: The GiglResourceConfig wrapper (resource config). | ||
|
|
||
| Raises: | ||
| ValueError: If either the trainer or inferencer resource config is a | ||
| ``CustomLauncherConfig`` and ``should_use_glt_backend`` is False. | ||
| """ | ||
| logger.info( | ||
| "Config validation check: CustomLauncherConfig requires GLT (v2) backend." | ||
| ) | ||
| trainer_is_custom = isinstance( | ||
| resource_config_wrapper.trainer_config, | ||
| gigl_resource_config_pb2.CustomLauncherConfig, | ||
| ) | ||
| inferencer_is_custom = isinstance( | ||
| resource_config_wrapper.inferencer_config, | ||
| gigl_resource_config_pb2.CustomLauncherConfig, | ||
| ) | ||
| if not (trainer_is_custom or inferencer_is_custom): | ||
| return | ||
|
|
||
| if not gbml_config_pb_wrapper.should_use_glt_backend: | ||
| offending: list[str] = [] | ||
| if trainer_is_custom: | ||
| offending.append("trainer_resource_config.custom_trainer_config") | ||
| if inferencer_is_custom: | ||
| offending.append("inferencer_resource_config.custom_inferencer_config") | ||
| raise ValueError( | ||
| "CustomLauncherConfig is only wired into the GLT (v2) dispatchers " | ||
| "(glt_trainer.py / glt_inferencer.py); the v1 trainer/inferencer " | ||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. v1 and v2 may be very arbritary to reader, and this message is too verbose and doesn't get to the point fast enough - a more useful message (that also simplifies above logic): raise ValueError("""
CustomLauncherConfig can only be used w/ GLT backend. Detected
trainer_resource_config using custom implementation: {trainer_is_custom} ;
inference_resource_config using custom implementation: {inference_is_custom}.
Ensure feature_flags.should_run_glt_backend is set to 'True' in the task config.
""") |
||
| "never consult the custom oneof and would fall through to an " | ||
| "'Unsupported resource config' error at runtime. The following " | ||
| f"custom resource configs were set: {offending}, but the task " | ||
| "config has should_use_glt_backend=False (raw YAML key: " | ||
| "feature_flags.should_run_glt_backend). Either set " | ||
| "feature_flags.should_run_glt_backend='True' in the task config, " | ||
| "or replace the CustomLauncherConfig with a built-in resource " | ||
| "config." | ||
| ) | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -3,6 +3,7 @@ | |
| from google.cloud.aiplatform_v1.types.accelerator_type import AcceleratorType | ||
|
|
||
| from gigl.common.logger import Logger | ||
| from gigl.src.common.constants.components import GiGLComponents | ||
| from gigl.src.common.types.pb_wrappers.gbml_config import GbmlConfigPbWrapper | ||
| from gigl.src.common.types.pb_wrappers.gigl_resource_config import ( | ||
| GiglResourceConfigWrapper, | ||
|
|
@@ -312,6 +313,57 @@ def _validate_machine_config( | |
| ) | ||
|
|
||
|
|
||
| def check_custom_launcher_config_shape( | ||
| resource_config_pb: gigl_resource_config_pb2.GiglResourceConfig, | ||
| component: GiGLComponents, | ||
| ) -> None: | ||
| """Assert the trainer / inferencer's ``CustomLauncherConfig`` is well-shaped. | ||
|
|
||
| Resolves the component's resource config through the wrapper; if it is not | ||
| a ``CustomLauncherConfig`` this helper is a no-op. Otherwise it asserts | ||
| ``command`` is non-empty so the launcher's runtime guard does not fail | ||
| on a typo in the YAML. | ||
|
|
||
| Args: | ||
| resource_config_pb: The resource config to inspect. The trainer or | ||
| inferencer oneof (depending on ``component``) is pulled out of the | ||
| wrapper. | ||
| component: Which GiGL component to check. Must be Trainer or | ||
| Inferencer; other components never carry a ``CustomLauncherConfig``. | ||
|
|
||
| Raises: | ||
| ValueError: If ``component`` is not Trainer or Inferencer, or if a | ||
| populated ``CustomLauncherConfig`` has an empty ``command``. | ||
| """ | ||
| if component not in {GiGLComponents.Trainer, GiGLComponents.Inferencer}: | ||
| raise ValueError( | ||
| f"check_custom_launcher_config_shape only supports " | ||
| f"Trainer and Inferencer components; got {component}." | ||
| ) | ||
|
|
||
| wrapper = GiglResourceConfigWrapper(resource_config=resource_config_pb) | ||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. the caller already has access to the wrapper - why not just pass that through? |
||
| component_config: Union[ | ||
| gigl_resource_config_pb2.LocalResourceConfig, | ||
| gigl_resource_config_pb2.VertexAiResourceConfig, | ||
| gigl_resource_config_pb2.KFPResourceConfig, | ||
| gigl_resource_config_pb2.VertexAiGraphStoreConfig, | ||
| gigl_resource_config_pb2.DataflowResourceConfig, | ||
| gigl_resource_config_pb2.CustomLauncherConfig, | ||
| ] | ||
| if component == GiGLComponents.Trainer: | ||
| component_config = wrapper.trainer_config | ||
| else: | ||
| component_config = wrapper.inferencer_config | ||
|
|
||
| if not isinstance(component_config, gigl_resource_config_pb2.CustomLauncherConfig): | ||
| return | ||
|
|
||
| if not component_config.command: | ||
| raise ValueError( | ||
| f"CustomLauncherConfig.command must be set for {component.value}." | ||
| ) | ||
|
|
||
|
|
||
| def check_if_trainer_graph_store_storage_command_valid( | ||
| gbml_config_pb_wrapper: GbmlConfigPbWrapper, | ||
| ) -> None: | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -352,5 +352,61 @@ def test_resource_config_validation_failure_with_mock_configs( | |
| ) | ||
|
|
||
|
|
||
| class TestCustomLauncherConfigInValidationChain(TestCase): | ||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This feels like a weird test to add. If we start adopting this test pattern even if we just adopt (1); what is the stopping point? Our tests would be too verbose and would not be valuable. If instead you wanted to migrate our unit tests introduced above to an order higher i.e. here - that would be valuable. But honestly, I would prefer not having this at all unless its a core library / performance functionality. |
||
| """Confirm the new CustomLauncherConfig gates are wired into the chain. | ||
|
|
||
| The shape check (``check_custom_launcher_config_shape``) runs | ||
| unconditionally inside ``kfp_validation_checks``. A resource config | ||
| with a ``CustomLauncherConfig`` trainer that has an empty ``command`` | ||
| must surface a ``ValueError`` from the chain, not a launcher-side | ||
| runtime failure. | ||
| """ | ||
|
|
||
| def setUp(self): | ||
| gigl.env.pipelines_config._resource_config = None | ||
| self._temp_dir = tempfile.mkdtemp() | ||
| self._proto_utils = ProtoUtils() | ||
|
|
||
| def tearDown(self): | ||
| shutil.rmtree(self._temp_dir, ignore_errors=True) | ||
| gigl.env.pipelines_config._resource_config = None | ||
|
|
||
| def _write_proto_to_file( | ||
| self, proto: google.protobuf.message.Message, filename: str | ||
| ) -> Uri: | ||
| filepath = os.path.join(self._temp_dir, filename) | ||
| uri = UriFactory.create_uri(filepath) | ||
| self._proto_utils.write_proto_to_yaml(proto, uri) | ||
| return uri | ||
|
|
||
| def test_empty_custom_trainer_command_raises_via_chain(self) -> None: | ||
| # Build a live-SGS task config (so the chain runs through | ||
| # CustomLauncherConfig-aware paths). | ||
| task_config_uri = self._write_proto_to_file( | ||
| _create_valid_live_subgraph_sampling_task_config(), | ||
| "live_task_config.yaml", | ||
| ) | ||
|
|
||
| # Resource config with a CustomLauncherConfig trainer whose | ||
| # command is empty — the new shape check must catch it. | ||
| resource_config = _create_valid_live_subgraph_sampling_resource_config() | ||
| resource_config.trainer_resource_config.ClearField("trainer_config") | ||
| resource_config.trainer_resource_config.custom_trainer_config.command = "" | ||
| resource_config_uri = self._write_proto_to_file( | ||
| resource_config, "live_custom_empty_resource_config.yaml" | ||
| ) | ||
|
|
||
| with self.assertRaises(ValueError) as ctx: | ||
| kfp_validation_checks( | ||
| job_name="custom_launcher_config_chain_test", | ||
| task_config_uri=task_config_uri, | ||
| start_at="config_populator", | ||
| resource_config_uri=resource_config_uri, | ||
| ) | ||
| # Error message must come from check_custom_launcher_config_shape, | ||
| # not from a generic shape check elsewhere. | ||
| self.assertIn("CustomLauncherConfig.command", str(ctx.exception)) | ||
|
|
||
|
|
||
| if __name__ == "__main__": | ||
| absltest.main() | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -5,12 +5,18 @@ | |
| GiglResourceConfigWrapper, | ||
| ) | ||
| from gigl.src.validation_check.libs.gbml_and_resource_config_compatibility_checks import ( | ||
| check_custom_launcher_config_requires_glt_backend, | ||
| check_inferencer_graph_store_compatibility, | ||
| check_trainer_graph_store_compatibility, | ||
| ) | ||
| from snapchat.research.gbml import gbml_config_pb2, gigl_resource_config_pb2 | ||
| from tests.test_assets.test_case import TestCase | ||
|
|
||
| # Placeholder shell snippet used by CustomLauncherConfig fixtures in this | ||
| # module — these tests only exercise type-of-config dispatch, not actual | ||
| # subprocess execution. | ||
| _FAKE_COMMAND = "echo fake" | ||
|
|
||
| # Helper functions for creating VertexAiGraphStoreConfig | ||
|
|
||
|
|
||
|
|
@@ -214,5 +220,107 @@ def test_resource_has_inferencer_graph_store_template_does_not(self): | |
| ) | ||
|
|
||
|
|
||
| # Helper functions for custom + glt-backend compatibility tests | ||
|
|
||
|
|
||
| def _create_gbml_config_with_glt_flag(value: str) -> GbmlConfigPbWrapper: | ||
| """Create a GbmlConfig whose feature_flags.should_run_glt_backend is set. | ||
|
|
||
| Note the raw YAML key is ``should_run_glt_backend`` (not | ||
| ``should_use_glt_backend``). The wrapper's ``should_use_glt_backend`` | ||
| property reads this key from the ``feature_flags`` map and converts it to | ||
| a bool. | ||
| """ | ||
| gbml_config = gbml_config_pb2.GbmlConfig() | ||
| gbml_config.feature_flags["should_run_glt_backend"] = value | ||
| return GbmlConfigPbWrapper(gbml_config_pb=gbml_config) | ||
|
|
||
|
|
||
| def _create_resource_config_with_custom_trainer() -> GiglResourceConfigWrapper: | ||
| """Create a GiglResourceConfig whose trainer is a CustomLauncherConfig.""" | ||
| config = gigl_resource_config_pb2.GiglResourceConfig() | ||
| _create_shared_resource_config(config) | ||
| config.trainer_resource_config.custom_trainer_config.command = _FAKE_COMMAND | ||
| # Inferencer uses a built-in config so only the trainer path is custom. | ||
| config.inferencer_resource_config.vertex_ai_inferencer_config.CopyFrom( | ||
| _create_vertex_ai_resource_config() | ||
| ) | ||
| return GiglResourceConfigWrapper(resource_config=config) | ||
|
|
||
|
|
||
| def _create_resource_config_with_custom_inferencer() -> GiglResourceConfigWrapper: | ||
| """Create a GiglResourceConfig whose inferencer is a CustomLauncherConfig.""" | ||
| config = gigl_resource_config_pb2.GiglResourceConfig() | ||
| _create_shared_resource_config(config) | ||
| config.trainer_resource_config.vertex_ai_trainer_config.CopyFrom( | ||
| _create_vertex_ai_resource_config() | ||
| ) | ||
| config.inferencer_resource_config.custom_inferencer_config.command = _FAKE_COMMAND | ||
| return GiglResourceConfigWrapper(resource_config=config) | ||
|
|
||
|
|
||
| class TestCustomLauncherConfigRequiresGltBackend(TestCase): | ||
| """Test suite for the CustomLauncherConfig + GLT-backend compatibility guard. | ||
|
|
||
| Because v1 trainer/inferencer dispatchers don't consult the custom oneof, | ||
| pairing a ``CustomLauncherConfig`` with | ||
| ``feature_flags.should_run_glt_backend = "False"`` must be caught at | ||
| validation time rather than at runtime. | ||
| """ | ||
|
|
||
| def test_custom_trainer_without_glt_raises(self): | ||
| """CustomLauncherConfig trainer + glt=False raises a clear ValueError.""" | ||
| gbml_config = _create_gbml_config_with_glt_flag("False") | ||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. nit: We are writing a function here to save 1 line of code but introducing a layer of indirection as a result. gbml_config = gbml_config_pb2.GbmlConfig()
gbml_config.feature_flags["should_run_glt_backend"] = Falsemuch more readable. |
||
| resource_config = _create_resource_config_with_custom_trainer() | ||
| with self.assertRaises(ValueError) as ctx: | ||
| check_custom_launcher_config_requires_glt_backend( | ||
| gbml_config_pb_wrapper=gbml_config, | ||
| resource_config_wrapper=resource_config, | ||
| ) | ||
| self.assertIn("should_run_glt_backend", str(ctx.exception)) | ||
| self.assertIn("custom_trainer_config", str(ctx.exception)) | ||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Doing string matching in the error message feels too verbose to be helpful - unless you were testing on the error message creation logic itself, which it doesnt seem like we are. |
||
|
|
||
| def test_custom_inferencer_without_glt_raises(self): | ||
| """CustomLauncherConfig inferencer + glt=False raises a clear ValueError.""" | ||
| gbml_config = _create_gbml_config_with_glt_flag("False") | ||
| resource_config = _create_resource_config_with_custom_inferencer() | ||
| with self.assertRaises(ValueError) as ctx: | ||
| check_custom_launcher_config_requires_glt_backend( | ||
| gbml_config_pb_wrapper=gbml_config, | ||
| resource_config_wrapper=resource_config, | ||
| ) | ||
| self.assertIn("custom_inferencer_config", str(ctx.exception)) | ||
|
|
||
| def test_custom_trainer_with_glt_passes(self): | ||
| """CustomLauncherConfig trainer + glt=True passes validation.""" | ||
| gbml_config = _create_gbml_config_with_glt_flag("True") | ||
| resource_config = _create_resource_config_with_custom_trainer() | ||
| # Should not raise any exception | ||
| check_custom_launcher_config_requires_glt_backend( | ||
| gbml_config_pb_wrapper=gbml_config, | ||
| resource_config_wrapper=resource_config, | ||
| ) | ||
|
|
||
| def test_custom_inferencer_with_glt_passes(self): | ||
| """CustomLauncherConfig inferencer + glt=True passes validation.""" | ||
| gbml_config = _create_gbml_config_with_glt_flag("True") | ||
| resource_config = _create_resource_config_with_custom_inferencer() | ||
| # Should not raise any exception | ||
| check_custom_launcher_config_requires_glt_backend( | ||
| gbml_config_pb_wrapper=gbml_config, | ||
| resource_config_wrapper=resource_config, | ||
| ) | ||
|
|
||
| def test_no_custom_config_passes_without_glt(self): | ||
| """No CustomLauncherConfig at all passes regardless of glt flag.""" | ||
| gbml_config = _create_gbml_config_with_glt_flag("False") | ||
| resource_config = _create_resource_config_without_graph_stores() | ||
| # Should not raise any exception: no custom oneof means nothing to enforce. | ||
| check_custom_launcher_config_requires_glt_backend( | ||
| gbml_config_pb_wrapper=gbml_config, | ||
| resource_config_wrapper=resource_config, | ||
| ) | ||
|
|
||
|
|
||
| if __name__ == "__main__": | ||
| absltest.main() | ||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
we already have an existing pattern to check for these in
resource_config_checks- lets adopt here.