diff --git a/mod_ci/controllers.py b/mod_ci/controllers.py index 81f6ddb0..5cd449fe 100755 --- a/mod_ci/controllers.py +++ b/mod_ci/controllers.py @@ -51,6 +51,7 @@ GCP_API_TIMEOUT = 60 # Timeout for GCP API calls ARTIFACT_DOWNLOAD_TIMEOUT = 300 # 5 minutes for artifact downloads GCP_OPERATION_MAX_WAIT = 1800 # 30 minutes max wait for GCP operations +GCP_VM_CREATE_VERIFY_TIMEOUT = 60 # 60 seconds to verify VM creation started # Retry constants MAX_RETRIES = 3 @@ -925,14 +926,31 @@ def start_test(compute, app, db, repository: Repository.Repository, test, bot_to mark_test_failed(db, test, repository, error_msg) return - # VM creation request was accepted - record the instance optimistically - # We don't wait for the operation to complete because: - # 1. Waiting can take 60+ seconds, blocking gunicorn workers - # 2. If VM creation ultimately fails, the test won't report progress - # and will be cleaned up by the expired instances cron job + # Wait for the VM creation operation to complete (or timeout) + # This catches quota errors and other failures that occur shortly after the + # insert request is accepted. Without this check, tests can get stuck forever + # when VM creation fails but a GcpInstance record is created. op_name = operation.get('name', 'unknown') - log.info(f"Test {test.id}: VM creation initiated (op: {op_name})") + log.info(f"Test {test.id}: VM creation initiated (op: {op_name}), waiting for verification...") + result = wait_for_operation(compute, project_id, zone, op_name, max_wait=GCP_VM_CREATE_VERIFY_TIMEOUT) + + # Check if operation completed with an error (e.g., QUOTA_EXCEEDED) + if 'error' in result: + error_msg = parse_gcp_error(result) + log.error(f"Test {test.id}: VM creation failed: {error_msg}") + log.error(f"Test {test.id}: Full GCP response: {result}") + mark_test_failed(db, test, repository, error_msg) + return + + # Check for timeout - operation still running, which is OK for slow VM creation + if result.get('status') == 'TIMEOUT': + log.warning(f"Test {test.id}: VM creation still in progress after {GCP_VM_CREATE_VERIFY_TIMEOUT}s, " + "recording instance optimistically") + else: + log.info(f"Test {test.id}: VM creation verified successfully") + + # VM creation succeeded (or is still in progress) - record the instance db.add(status) if not safe_db_commit(db, f"recording GCP instance for test {test.id}"): log.error(f"Failed to record GCP instance for test {test.id}, but VM creation was initiated") diff --git a/tests/test_ci/test_controllers.py b/tests/test_ci/test_controllers.py index 03ff6c72..f28e8398 100644 --- a/tests/test_ci/test_controllers.py +++ b/tests/test_ci/test_controllers.py @@ -222,13 +222,14 @@ def test_cron_job_empty_token(self, mock_log): cron() mock_log.error.assert_called_with('GITHUB_TOKEN not configured, cannot run CI cron') + @mock.patch('mod_ci.controllers.wait_for_operation') @mock.patch('mod_ci.controllers.create_instance') @mock.patch('builtins.open', new_callable=mock.mock_open()) @mock.patch('mod_ci.controllers.g') @mock.patch('mod_ci.controllers.TestProgress') @mock.patch('mod_ci.controllers.GcpInstance') def test_start_test(self, mock_gcp_instance, mock_test_progress, mock_g, mock_open_file, - mock_create_instance): + mock_create_instance, mock_wait_for_operation): """Test start_test function.""" import zipfile @@ -282,10 +283,9 @@ def extractall(*args, **kwargs): mock_g.db.commit.reset_mock() mock_create_instance.reset_mock() - # Test when gcp create instance is successful (no error in operation response) - # Note: We no longer wait for the operation to complete - we record the instance - # optimistically and let the expired instances cron handle failures + # Test when gcp create instance is successful and verified mock_create_instance.return_value = {'name': 'test-operation-123', 'status': 'RUNNING'} + mock_wait_for_operation.return_value = {'status': 'DONE'} # Success start_test(mock.ANY, self.app, mock_g.db, repository, test, mock.ANY) mock_g.db.commit.assert_called_once() mock_create_instance.assert_called_once() @@ -3807,3 +3807,142 @@ def test_retry_deletion_vm_not_found(self, mock_log, mock_commit, mock_delete): # Should remove the pending record (VM is gone) db.delete.assert_called_once_with(pending) + + +class TestVMCreationVerification(BaseTestCase): + """Tests for VM creation verification to prevent stuck tests. + + These tests verify the fix for the issue where tests would get stuck forever + when VM creation failed (e.g., QUOTA_EXCEEDED) but a GcpInstance record was + still created in the database. + + Root cause investigated: Test #7768 for CCExtractor PR #2014 was stuck in + "Preparation" phase for 12+ hours because: + 1. GCP VM creation failed with QUOTA_EXCEEDED (8 IP addresses limit) + 2. The platform created a gcp_instance DB record before verifying success + 3. The cron job saw the record and assumed the test was running + 4. The test was stuck forever with no way to recover + + The fix: Wait for the GCP operation to complete (or timeout) before creating + the gcp_instance record. If the operation fails, mark the test as failed. + """ + + def _setup_start_test_mocks(self, mock_g, mock_gcp_instance, mock_test_progress, + mock_find_artifact, mock_requests_get, mock_zipfile): + """Set up common mocks for start_test VM creation verification tests.""" + # Mock locking checks to return None (no existing instances/progress) + mock_gcp_instance.query.filter.return_value.first.return_value = None + mock_test_progress.query.filter.return_value.first.return_value = None + mock_query = create_mock_db_query(mock_g) + mock_query.c.got = MagicMock() + + # Mock successful artifact download + mock_artifact = MagicMock() + mock_artifact.name = 'CCExtractor Linux build' + mock_artifact.archive_download_url = 'https://example.com/artifact.zip' + mock_find_artifact.return_value = mock_artifact + + mock_response = MagicMock() + mock_response.status_code = 200 + mock_response.content = b'fake zip content' + mock_requests_get.return_value = mock_response + + mock_zip = MagicMock() + mock_zipfile.return_value.__enter__ = MagicMock(return_value=mock_zip) + mock_zipfile.return_value.__exit__ = MagicMock(return_value=False) + + @mock.patch('run.log') + @mock.patch('mod_ci.controllers.mark_test_failed') + @mock.patch('mod_ci.controllers.wait_for_operation') + @mock.patch('mod_ci.controllers.create_instance') + @mock.patch('mod_ci.controllers.find_artifact_for_commit') + @mock.patch('mod_ci.controllers.requests.get') + @mock.patch('mod_ci.controllers.zipfile.ZipFile') + @mock.patch('builtins.open', new_callable=mock.mock_open()) + @mock.patch('mod_ci.controllers.g') + @mock.patch('mod_ci.controllers.TestProgress') + @mock.patch('mod_ci.controllers.GcpInstance') + def test_start_test_quota_exceeded_no_db_record( + self, mock_gcp_instance, mock_test_progress, mock_g, mock_open_file, + mock_zipfile, mock_requests_get, mock_find_artifact, + mock_create_instance, mock_wait_for_operation, mock_mark_failed, mock_log): + """Test that QUOTA_EXCEEDED error prevents gcp_instance record creation. + + This is the exact scenario from test #7768: GCP operation returns + successfully but completes with QUOTA_EXCEEDED error. + """ + from mod_ci.controllers import start_test + + self._setup_start_test_mocks(mock_g, mock_gcp_instance, mock_test_progress, + mock_find_artifact, mock_requests_get, mock_zipfile) + + # VM creation returns operation ID, but wait_for_operation returns error + mock_create_instance.return_value = {'name': 'op-123', 'status': 'RUNNING'} + mock_wait_for_operation.return_value = { + 'status': 'DONE', + 'error': {'errors': [{'code': 'QUOTA_EXCEEDED', 'message': 'Quota exceeded'}]}, + 'httpErrorStatusCode': 403 + } + + start_test(MagicMock(), self.app, mock_g.db, MagicMock(), Test.query.first(), "token") + + mock_mark_failed.assert_called_once() + mock_g.db.add.assert_not_called() + + @mock.patch('run.log') + @mock.patch('mod_ci.controllers.safe_db_commit') + @mock.patch('mod_ci.controllers.wait_for_operation') + @mock.patch('mod_ci.controllers.create_instance') + @mock.patch('mod_ci.controllers.find_artifact_for_commit') + @mock.patch('mod_ci.controllers.requests.get') + @mock.patch('mod_ci.controllers.zipfile.ZipFile') + @mock.patch('builtins.open', new_callable=mock.mock_open()) + @mock.patch('mod_ci.controllers.g') + @mock.patch('mod_ci.controllers.TestProgress') + @mock.patch('mod_ci.controllers.GcpInstance') + def test_start_test_vm_verified_creates_db_record( + self, mock_gcp_instance, mock_test_progress, mock_g, mock_open_file, + mock_zipfile, mock_requests_get, mock_find_artifact, + mock_create_instance, mock_wait_for_operation, mock_commit, mock_log): + """Test that successful VM creation creates gcp_instance record.""" + from mod_ci.controllers import start_test + + self._setup_start_test_mocks(mock_g, mock_gcp_instance, mock_test_progress, + mock_find_artifact, mock_requests_get, mock_zipfile) + + mock_create_instance.return_value = {'name': 'op-123', 'status': 'RUNNING'} + mock_wait_for_operation.return_value = {'status': 'DONE'} + mock_commit.return_value = True + + start_test(MagicMock(), self.app, mock_g.db, MagicMock(), Test.query.first(), "token") + + mock_g.db.add.assert_called_once() + + @mock.patch('run.log') + @mock.patch('mod_ci.controllers.safe_db_commit') + @mock.patch('mod_ci.controllers.wait_for_operation') + @mock.patch('mod_ci.controllers.create_instance') + @mock.patch('mod_ci.controllers.find_artifact_for_commit') + @mock.patch('mod_ci.controllers.requests.get') + @mock.patch('mod_ci.controllers.zipfile.ZipFile') + @mock.patch('builtins.open', new_callable=mock.mock_open()) + @mock.patch('mod_ci.controllers.g') + @mock.patch('mod_ci.controllers.TestProgress') + @mock.patch('mod_ci.controllers.GcpInstance') + def test_start_test_operation_timeout_creates_db_record( + self, mock_gcp_instance, mock_test_progress, mock_g, mock_open_file, + mock_zipfile, mock_requests_get, mock_find_artifact, + mock_create_instance, mock_wait_for_operation, mock_commit, mock_log): + """Test that operation timeout still creates record (VM may be starting slowly).""" + from mod_ci.controllers import start_test + + self._setup_start_test_mocks(mock_g, mock_gcp_instance, mock_test_progress, + mock_find_artifact, mock_requests_get, mock_zipfile) + + mock_create_instance.return_value = {'name': 'op-123', 'status': 'RUNNING'} + mock_wait_for_operation.return_value = {'status': 'TIMEOUT'} + mock_commit.return_value = True + + start_test(MagicMock(), self.app, mock_g.db, MagicMock(), Test.query.first(), "token") + + mock_g.db.add.assert_called_once()