Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
30 changes: 24 additions & 6 deletions mod_ci/controllers.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,7 @@
GCP_API_TIMEOUT = 60 # Timeout for GCP API calls
ARTIFACT_DOWNLOAD_TIMEOUT = 300 # 5 minutes for artifact downloads
GCP_OPERATION_MAX_WAIT = 1800 # 30 minutes max wait for GCP operations
GCP_VM_CREATE_VERIFY_TIMEOUT = 60 # 60 seconds to verify VM creation started

# Retry constants
MAX_RETRIES = 3
Expand Down Expand Up @@ -925,14 +926,31 @@ def start_test(compute, app, db, repository: Repository.Repository, test, bot_to
mark_test_failed(db, test, repository, error_msg)
return

# VM creation request was accepted - record the instance optimistically
# We don't wait for the operation to complete because:
# 1. Waiting can take 60+ seconds, blocking gunicorn workers
# 2. If VM creation ultimately fails, the test won't report progress
# and will be cleaned up by the expired instances cron job
# Wait for the VM creation operation to complete (or timeout)
# This catches quota errors and other failures that occur shortly after the
# insert request is accepted. Without this check, tests can get stuck forever
# when VM creation fails but a GcpInstance record is created.
op_name = operation.get('name', 'unknown')
log.info(f"Test {test.id}: VM creation initiated (op: {op_name})")
log.info(f"Test {test.id}: VM creation initiated (op: {op_name}), waiting for verification...")

result = wait_for_operation(compute, project_id, zone, op_name, max_wait=GCP_VM_CREATE_VERIFY_TIMEOUT)

# Check if operation completed with an error (e.g., QUOTA_EXCEEDED)
if 'error' in result:
error_msg = parse_gcp_error(result)
log.error(f"Test {test.id}: VM creation failed: {error_msg}")
log.error(f"Test {test.id}: Full GCP response: {result}")
mark_test_failed(db, test, repository, error_msg)
return

# Check for timeout - operation still running, which is OK for slow VM creation
if result.get('status') == 'TIMEOUT':
log.warning(f"Test {test.id}: VM creation still in progress after {GCP_VM_CREATE_VERIFY_TIMEOUT}s, "
"recording instance optimistically")
else:
log.info(f"Test {test.id}: VM creation verified successfully")

# VM creation succeeded (or is still in progress) - record the instance
db.add(status)
if not safe_db_commit(db, f"recording GCP instance for test {test.id}"):
log.error(f"Failed to record GCP instance for test {test.id}, but VM creation was initiated")
Expand Down
147 changes: 143 additions & 4 deletions tests/test_ci/test_controllers.py
Original file line number Diff line number Diff line change
Expand Up @@ -222,13 +222,14 @@ def test_cron_job_empty_token(self, mock_log):
cron()
mock_log.error.assert_called_with('GITHUB_TOKEN not configured, cannot run CI cron')

@mock.patch('mod_ci.controllers.wait_for_operation')
@mock.patch('mod_ci.controllers.create_instance')
@mock.patch('builtins.open', new_callable=mock.mock_open())
@mock.patch('mod_ci.controllers.g')
@mock.patch('mod_ci.controllers.TestProgress')
@mock.patch('mod_ci.controllers.GcpInstance')
def test_start_test(self, mock_gcp_instance, mock_test_progress, mock_g, mock_open_file,
mock_create_instance):
mock_create_instance, mock_wait_for_operation):
"""Test start_test function."""
import zipfile

Expand Down Expand Up @@ -282,10 +283,9 @@ def extractall(*args, **kwargs):
mock_g.db.commit.reset_mock()
mock_create_instance.reset_mock()

# Test when gcp create instance is successful (no error in operation response)
# Note: We no longer wait for the operation to complete - we record the instance
# optimistically and let the expired instances cron handle failures
# Test when gcp create instance is successful and verified
mock_create_instance.return_value = {'name': 'test-operation-123', 'status': 'RUNNING'}
mock_wait_for_operation.return_value = {'status': 'DONE'} # Success
start_test(mock.ANY, self.app, mock_g.db, repository, test, mock.ANY)
mock_g.db.commit.assert_called_once()
mock_create_instance.assert_called_once()
Expand Down Expand Up @@ -3807,3 +3807,142 @@ def test_retry_deletion_vm_not_found(self, mock_log, mock_commit, mock_delete):

# Should remove the pending record (VM is gone)
db.delete.assert_called_once_with(pending)


class TestVMCreationVerification(BaseTestCase):
"""Tests for VM creation verification to prevent stuck tests.

These tests verify the fix for the issue where tests would get stuck forever
when VM creation failed (e.g., QUOTA_EXCEEDED) but a GcpInstance record was
still created in the database.

Root cause investigated: Test #7768 for CCExtractor PR #2014 was stuck in
"Preparation" phase for 12+ hours because:
1. GCP VM creation failed with QUOTA_EXCEEDED (8 IP addresses limit)
2. The platform created a gcp_instance DB record before verifying success
3. The cron job saw the record and assumed the test was running
4. The test was stuck forever with no way to recover

The fix: Wait for the GCP operation to complete (or timeout) before creating
the gcp_instance record. If the operation fails, mark the test as failed.
"""

def _setup_start_test_mocks(self, mock_g, mock_gcp_instance, mock_test_progress,
mock_find_artifact, mock_requests_get, mock_zipfile):
"""Set up common mocks for start_test VM creation verification tests."""
# Mock locking checks to return None (no existing instances/progress)
mock_gcp_instance.query.filter.return_value.first.return_value = None
mock_test_progress.query.filter.return_value.first.return_value = None
mock_query = create_mock_db_query(mock_g)
mock_query.c.got = MagicMock()

# Mock successful artifact download
mock_artifact = MagicMock()
mock_artifact.name = 'CCExtractor Linux build'
mock_artifact.archive_download_url = 'https://example.com/artifact.zip'
mock_find_artifact.return_value = mock_artifact

mock_response = MagicMock()
mock_response.status_code = 200
mock_response.content = b'fake zip content'
mock_requests_get.return_value = mock_response

mock_zip = MagicMock()
mock_zipfile.return_value.__enter__ = MagicMock(return_value=mock_zip)
mock_zipfile.return_value.__exit__ = MagicMock(return_value=False)

@mock.patch('run.log')
@mock.patch('mod_ci.controllers.mark_test_failed')
@mock.patch('mod_ci.controllers.wait_for_operation')
@mock.patch('mod_ci.controllers.create_instance')
@mock.patch('mod_ci.controllers.find_artifact_for_commit')
@mock.patch('mod_ci.controllers.requests.get')
@mock.patch('mod_ci.controllers.zipfile.ZipFile')
@mock.patch('builtins.open', new_callable=mock.mock_open())
@mock.patch('mod_ci.controllers.g')
@mock.patch('mod_ci.controllers.TestProgress')
@mock.patch('mod_ci.controllers.GcpInstance')
def test_start_test_quota_exceeded_no_db_record(
self, mock_gcp_instance, mock_test_progress, mock_g, mock_open_file,
mock_zipfile, mock_requests_get, mock_find_artifact,
mock_create_instance, mock_wait_for_operation, mock_mark_failed, mock_log):
"""Test that QUOTA_EXCEEDED error prevents gcp_instance record creation.

This is the exact scenario from test #7768: GCP operation returns
successfully but completes with QUOTA_EXCEEDED error.
"""
from mod_ci.controllers import start_test

self._setup_start_test_mocks(mock_g, mock_gcp_instance, mock_test_progress,
mock_find_artifact, mock_requests_get, mock_zipfile)

# VM creation returns operation ID, but wait_for_operation returns error
mock_create_instance.return_value = {'name': 'op-123', 'status': 'RUNNING'}
mock_wait_for_operation.return_value = {
'status': 'DONE',
'error': {'errors': [{'code': 'QUOTA_EXCEEDED', 'message': 'Quota exceeded'}]},
'httpErrorStatusCode': 403
}

start_test(MagicMock(), self.app, mock_g.db, MagicMock(), Test.query.first(), "token")

mock_mark_failed.assert_called_once()
mock_g.db.add.assert_not_called()

@mock.patch('run.log')
@mock.patch('mod_ci.controllers.safe_db_commit')
@mock.patch('mod_ci.controllers.wait_for_operation')
@mock.patch('mod_ci.controllers.create_instance')
@mock.patch('mod_ci.controllers.find_artifact_for_commit')
@mock.patch('mod_ci.controllers.requests.get')
@mock.patch('mod_ci.controllers.zipfile.ZipFile')
@mock.patch('builtins.open', new_callable=mock.mock_open())
@mock.patch('mod_ci.controllers.g')
@mock.patch('mod_ci.controllers.TestProgress')
@mock.patch('mod_ci.controllers.GcpInstance')
def test_start_test_vm_verified_creates_db_record(
self, mock_gcp_instance, mock_test_progress, mock_g, mock_open_file,
mock_zipfile, mock_requests_get, mock_find_artifact,
mock_create_instance, mock_wait_for_operation, mock_commit, mock_log):
"""Test that successful VM creation creates gcp_instance record."""
from mod_ci.controllers import start_test

self._setup_start_test_mocks(mock_g, mock_gcp_instance, mock_test_progress,
mock_find_artifact, mock_requests_get, mock_zipfile)

mock_create_instance.return_value = {'name': 'op-123', 'status': 'RUNNING'}
mock_wait_for_operation.return_value = {'status': 'DONE'}
mock_commit.return_value = True

start_test(MagicMock(), self.app, mock_g.db, MagicMock(), Test.query.first(), "token")

mock_g.db.add.assert_called_once()

@mock.patch('run.log')
@mock.patch('mod_ci.controllers.safe_db_commit')
@mock.patch('mod_ci.controllers.wait_for_operation')
@mock.patch('mod_ci.controllers.create_instance')
@mock.patch('mod_ci.controllers.find_artifact_for_commit')
@mock.patch('mod_ci.controllers.requests.get')
@mock.patch('mod_ci.controllers.zipfile.ZipFile')
@mock.patch('builtins.open', new_callable=mock.mock_open())
@mock.patch('mod_ci.controllers.g')
@mock.patch('mod_ci.controllers.TestProgress')
@mock.patch('mod_ci.controllers.GcpInstance')
def test_start_test_operation_timeout_creates_db_record(
self, mock_gcp_instance, mock_test_progress, mock_g, mock_open_file,
mock_zipfile, mock_requests_get, mock_find_artifact,
mock_create_instance, mock_wait_for_operation, mock_commit, mock_log):
"""Test that operation timeout still creates record (VM may be starting slowly)."""
from mod_ci.controllers import start_test

self._setup_start_test_mocks(mock_g, mock_gcp_instance, mock_test_progress,
mock_find_artifact, mock_requests_get, mock_zipfile)

mock_create_instance.return_value = {'name': 'op-123', 'status': 'RUNNING'}
mock_wait_for_operation.return_value = {'status': 'TIMEOUT'}
mock_commit.return_value = True

start_test(MagicMock(), self.app, mock_g.db, MagicMock(), Test.query.first(), "token")

mock_g.db.add.assert_called_once()