diff --git a/core/parser/morphik_parser.py b/core/parser/morphik_parser.py
index 605c61db..7e781f7e 100644
--- a/core/parser/morphik_parser.py
+++ b/core/parser/morphik_parser.py
@@ -428,10 +428,7 @@ def _parse_excel_to_markdown(file: bytes) -> str:
         return "\n".join(parts)
 
     async def _parse_video(self, file: bytes) -> Tuple[Dict[str, Any], str]:
-        """Parse video file to extract transcript and frame descriptions"""
-        if not self._assemblyai_api_key:
-            raise ValueError("AssemblyAI API key is required for video parsing")
-
+        """Parse video file to extract frame descriptions and, when configured, transcript."""
         # Save video to temporary file
         with tempfile.NamedTemporaryFile(delete=False, suffix=".mp4") as temp_file:
             temp_file.write(file)
@@ -452,16 +449,23 @@ async def _parse_video(self, file: bytes) -> Tuple[Dict[str, Any], str]:
             )
             results = await parser.process_video()
 
-            # Combine frame descriptions and transcript
+            # Combine frame descriptions and optional transcript
             frame_text = "\n".join(results.frame_descriptions.time_to_content.values())
-            transcript_text = "\n".join(results.transcript.time_to_content.values())
-            combined_text = f"Frame Descriptions:\n{frame_text}\n\nTranscript:\n{transcript_text}"
+            text_sections = []
+            if frame_text:
+                text_sections.append(f"Frame Descriptions:\n{frame_text}")
+            if self._assemblyai_api_key:
+                transcript_text = "\n".join(results.transcript.time_to_content.values())
+                if transcript_text:
+                    text_sections.append(f"Transcript:\n{transcript_text}")
+            combined_text = "\n\n".join(text_sections)
 
             metadata = {
                 "video_metadata": results.metadata,
                 "frame_timestamps": list(results.frame_descriptions.time_to_content.keys()),
-                "transcript_timestamps": list(results.transcript.time_to_content.keys()),
             }
+            if self._assemblyai_api_key:
+                metadata["transcript_timestamps"] = list(results.transcript.time_to_content.keys())
 
             return metadata, combined_text
         finally:
diff --git a/core/parser/video/parse_video.py b/core/parser/video/parse_video.py
index e27ef012..ab9166bc 100644
--- a/core/parser/video/parse_video.py
+++ b/core/parser/video/parse_video.py
@@ -89,13 +89,18 @@ async def get_frame_description(self, image_base64: str, context: str) -> str:
 
 
 class VideoParser:
-    def __init__(self, video_path: str, assemblyai_api_key: str, frame_sample_rate: Optional[int] = None):
+    def __init__(
+        self,
+        video_path: str,
+        assemblyai_api_key: Optional[str] = None,
+        frame_sample_rate: Optional[int] = None,
+    ):
         """
         Initialize the video parser
 
         Args:
             video_path: Path to the video file
-            assemblyai_api_key: API key for AssemblyAI
+            assemblyai_api_key: Optional API key for AssemblyAI. If omitted, audio transcription is skipped.
             frame_sample_rate: Sample every nth frame for description (optional, defaults to config value)
         """
         logger.info(f"Initializing VideoParser for {video_path}")
@@ -112,12 +117,17 @@ def __init__(self, video_path: str, assemblyai_api_key: str, frame_sample_rate:
         self.total_frames = int(self.cap.get(cv2.CAP_PROP_FRAME_COUNT))
         self.duration = self.total_frames / self.fps
 
-        # Initialize AssemblyAI
-        aai.settings.api_key = assemblyai_api_key
-        aai_config = aai.TranscriptionConfig(speaker_labels=True)
-        self.transcriber = aai.Transcriber(config=aai_config)
         self.transcript = TimeSeriesData(time_to_content={})
 
+        # Initialize AssemblyAI only when audio transcription is configured.
+        self.transcriber = None
+        if assemblyai_api_key:
+            aai.settings.api_key = assemblyai_api_key
+            aai_config = aai.TranscriptionConfig(speaker_labels=True)
+            self.transcriber = aai.Transcriber(config=aai_config)
+        else:
+            logger.warning("AssemblyAI API key is not available; skipping transcription")
+
         # Initialize vision model client
         self.vision_client = VisionModelClient(self.config)
 
@@ -135,6 +145,9 @@ def get_transcript_object(self) -> aai.Transcript:
         """
         Get the transcript object from AssemblyAI
         """
+        if self.transcriber is None:
+            raise ValueError("AssemblyAI API key is required for video transcription")
+
         logger.info("Starting video transcription")
         transcript = self.transcriber.transcribe(self.video_path)
         if transcript.status == "error":
@@ -153,6 +166,10 @@ def get_transcript(self) -> TimeSeriesData:
         Returns:
             TimeSeriesData object containing transcript
         """
+        if self.transcriber is None:
+            self.transcript = TimeSeriesData(time_to_content={})
+            return self.transcript
+
         logger.info("Starting video transcription")
         transcript = self.get_transcript_object()
         # divide by 1000 because assemblyai timestamps are in milliseconds
@@ -193,18 +210,38 @@ async def get_frame_descriptions(self) -> TimeSeriesData:
 
                 img_base64 = self.frame_to_base64(frame)
 
-                context = f"""Describe this frame from a video. Focus on the main elements, actions, and any notable details. Here is the transcript around the time of the frame:
-                ---
-                {self.transcript.at_time(timestamp, padding=10)}
-                ---
-
-                Here is a description of the previous frame:
-                ---
-                {last_description if last_description else 'No previous frame description available, this is the first frame'}
-                ---
-
-                In your response, only provide the description of the current frame, using the above information as context.
-                """
+                if last_description:
+                    previous_frame_context = last_description
+                else:
+                    previous_frame_context = "No previous frame description available, this is the first frame"
+
+                description_instruction = (
+                    "Describe this frame from a video. Focus on the main elements, actions, and any notable details."
+                )
+                previous_frame_section = (
+                    "Here is a description of the previous frame:\n"
+                    "---\n"
+                    f"{previous_frame_context}\n"
+                    "---"
+                )
+                transcript_context = self.transcript.at_time(timestamp, padding=10)
+                if transcript_context:
+                    context = (
+                        f"{description_instruction} Here is the transcript around the time of the frame:\n"
+                        "---\n"
+                        f"{transcript_context}\n"
+                        "---\n\n"
+                        f"{previous_frame_section}\n\n"
+                        "In your response, only provide the description of the current frame, using the above "
+                        "information as context."
+                    )
+                else:
+                    context = (
+                        f"{description_instruction}\n\n"
+                        f"{previous_frame_section}\n\n"
+                        "In your response, only provide the description of the current frame, using the above "
+                        "information as context."
+                    )
 
                 last_description = await self.vision_client.get_frame_description(img_base64, context)
                 time_to_description[timestamp] = last_description
@@ -216,7 +253,7 @@ async def get_frame_descriptions(self) -> TimeSeriesData:
 
     async def process_video(self) -> ParseVideoResult:
         """
-        Process the video to get both transcript and frame descriptions
+        Process the video to get frame descriptions and transcript when configured.
 
         Returns:
             Dictionary containing transcript and frame descriptions as TimeSeriesData objects
@@ -230,7 +267,7 @@ async def process_video(self) -> ParseVideoResult:
         }
         result = ParseVideoResult(
             metadata=metadata,
-            transcript=self.get_transcript(),
+            transcript=self.get_transcript() if self.transcriber is not None else self.transcript,
             frame_descriptions=await self.get_frame_descriptions(),
         )
         logger.info("Video processing completed successfully")
diff --git a/core/tests/unit/test_video_parser.py b/core/tests/unit/test_video_parser.py
new file mode 100644
index 00000000..fae8079d
--- /dev/null
+++ b/core/tests/unit/test_video_parser.py
@@ -0,0 +1,246 @@
+import logging
+import sys
+import types
+
+import pytest
+
+
+MODULES_UNDER_TEST = [  # evicted from sys.modules before and after every test by the video_modules fixture
+    "core.models.video",
+    "core.parser.video.parse_video",
+    "core.parser.morphik_parser",
+]
+
+
+def _drop_module(name):
+    """Forget a cached module and detach it from its already-imported parent package."""
+    sys.modules.pop(name, None)
+    package_name, dot, leaf = name.rpartition(".")
+    if not dot:
+        return  # top-level module: there is no parent attribute to clear
+    package = sys.modules.get(package_name)
+    if package is not None and hasattr(package, leaf):
+        delattr(package, leaf)
+
+
+def _stub_module(monkeypatch, name):
+    """Register an empty package-like stub under ``name`` and return it."""
+    stub = types.ModuleType(name)
+    stub.__path__ = []  # having __path__ makes the import system treat the stub as a package
+    monkeypatch.setitem(sys.modules, name, stub)
+
+    package_name, dot, leaf = name.rpartition(".")
+    package = sys.modules.get(package_name) if dot else None
+    if package is not None:
+        # Mirror a real import: expose the child as an attribute of its parent.
+        monkeypatch.setattr(package, leaf, stub, raising=False)
+    return stub
+
+
+def _install_dependency_stubs(monkeypatch):
+    """Replace heavyweight third-party imports with minimal in-memory stand-ins.
+
+    Every stub is installed through ``monkeypatch``, so it is undone at test teardown.
+    """
+
+    def stub(name):
+        return _stub_module(monkeypatch, name)
+
+    def plain_class(name):
+        return type(name, (), {})
+
+    def lenient_class(name):
+        # Instantiable with any arguments; construction does nothing.
+        return type(name, (), {"__init__": lambda self, *a, **kw: None})
+
+    # docling: register parents before children so each child is attached to its parent stub.
+    stub("docling")
+    stub("docling.datamodel")
+    stub("docling.datamodel.base_models").InputFormat = types.SimpleNamespace(PDF="pdf")
+
+    pipeline_options = stub("docling.datamodel.pipeline_options")
+    for class_name in ("PdfPipelineOptions", "EasyOcrOptions", "TableStructureOptions"):
+        setattr(pipeline_options, class_name, plain_class(class_name))
+
+    document_converter = stub("docling.document_converter")
+    for class_name in ("DocumentConverter", "PdfFormatOption"):
+        setattr(document_converter, class_name, lenient_class(class_name))
+
+    # assemblyai: a settings holder plus the classes the video parser refers to.
+    assemblyai = stub("assemblyai")
+    assemblyai.settings = types.SimpleNamespace(api_key=None)
+    assemblyai.Transcript = plain_class("Transcript")
+    assemblyai.TranscriptionConfig = lenient_class("TranscriptionConfig")
+    assemblyai.Transcriber = lenient_class("Transcriber")
+
+    # cv2: the two property ids are distinct placeholders; VideoCapture yields None by default.
+    cv2 = stub("cv2")
+    cv2.CAP_PROP_FPS = 0
+    cv2.CAP_PROP_FRAME_COUNT = 1
+    cv2.VideoCapture = lambda path: None
+
+    stub("litellm")
+    stub("openpyxl")
+
+    filetype = stub("filetype")
+    filetype.guess = lambda content: None
+
+
+@pytest.fixture
+def video_modules(monkeypatch):
+    """Yield the video parsing modules and classes, freshly imported against stubbed dependencies."""
+    for module_name in MODULES_UNDER_TEST:
+        _drop_module(module_name)
+    _install_dependency_stubs(monkeypatch)
+    # Imported only after the stubs are installed, so the modules' own imports resolve to the stubs.
+    from core.models.video import ParseVideoResult, TimeSeriesData
+    from core.parser import morphik_parser as morphik_parser_module
+    from core.parser.video import parse_video as parse_video_module
+    from core.parser.morphik_parser import MorphikParser
+    from core.parser.video.parse_video import VideoParser
+
+    yield types.SimpleNamespace(
+        MorphikParser=MorphikParser,
+        ParseVideoResult=ParseVideoResult,
+        TimeSeriesData=TimeSeriesData,
+        VideoParser=VideoParser,
+        morphik_parser_module=morphik_parser_module,
+        parse_video_module=parse_video_module,
+    )
+    # Evict again after the test; presumably so other tests re-import these against real dependencies.
+    for module_name in MODULES_UNDER_TEST:
+        _drop_module(module_name)
+
+
+def _fake_video_parser_class(ParseVideoResult, TimeSeriesData):
+    """Build a fresh ``VideoParser`` double that records how it was constructed."""
+
+    class _FakeVideoParser:
+        # Fresh list per factory call, so recorded instances are not shared between fakes.
+        instances = []
+
+        def __init__(self, video_path, assemblyai_api_key=None, frame_sample_rate=None):
+            self.video_path = video_path
+            self.assemblyai_api_key = assemblyai_api_key
+            self.frame_sample_rate = frame_sample_rate
+            # Register on the class so a test can reach the instance the code under test created.
+            self.instances.append(self)
+
+        async def process_video(self):
+            # Canned result: one described frame and one transcript line, whatever the inputs.
+            video_metadata = {"duration": 1.0, "fps": 1.0, "total_frames": 1}
+            video_metadata["frame_sample_rate"] = self.frame_sample_rate
+            frames = TimeSeriesData(time_to_content={0.0: "visible frame"})
+            speech = TimeSeriesData(time_to_content={0.5: "spoken words"})
+            return ParseVideoResult(metadata=video_metadata, frame_descriptions=frames, transcript=speech)
+
+    return _FakeVideoParser
+
+
+@pytest.mark.asyncio
+async def test_parse_video_skips_transcript_without_assemblyai_key(monkeypatch, video_modules):
+    """Without an AssemblyAI key, only frame descriptions reach the text and metadata."""
+    parser_module = video_modules.morphik_parser_module
+    fake_cls = _fake_video_parser_class(video_modules.ParseVideoResult, video_modules.TimeSeriesData)
+    monkeypatch.setattr(parser_module, "VideoParser", fake_cls)
+    monkeypatch.setattr(parser_module, "load_config", lambda: {"parser": {"vision": {"frame_sample_rate": 5}}})
+
+    # Skip MorphikParser.__init__ and set just the two attributes this test supplies.
+    morphik = object.__new__(video_modules.MorphikParser)
+    morphik._assemblyai_api_key = None
+    morphik.frame_sample_rate = 1
+
+    metadata, text = await morphik._parse_video(b"video bytes")
+
+    assert fake_cls.instances[0].assemblyai_api_key is None
+    assert text == "Frame Descriptions:\nvisible frame"
+    assert metadata["frame_timestamps"] == [0.0]
+    assert "transcript_timestamps" not in metadata
+    assert "Transcript" not in text
+
+
+@pytest.mark.asyncio
+async def test_parse_video_includes_transcript_when_assemblyai_key_is_configured(monkeypatch, video_modules):
+    """With an AssemblyAI key set, the transcript joins the text and its timestamps appear in metadata."""
+    parser_module = video_modules.morphik_parser_module
+    fake_cls = _fake_video_parser_class(video_modules.ParseVideoResult, video_modules.TimeSeriesData)
+    monkeypatch.setattr(parser_module, "VideoParser", fake_cls)
+    monkeypatch.setattr(parser_module, "load_config", lambda: {"parser": {"vision": {"frame_sample_rate": 5}}})
+
+    # Skip MorphikParser.__init__ and set just the two attributes this test supplies.
+    morphik = object.__new__(video_modules.MorphikParser)
+    morphik._assemblyai_api_key = "assembly-key"
+    morphik.frame_sample_rate = 1
+
+    metadata, text = await morphik._parse_video(b"video bytes")
+
+    assert fake_cls.instances[0].assemblyai_api_key == "assembly-key"
+    assert text == "Frame Descriptions:\nvisible frame\n\nTranscript:\nspoken words"
+    assert metadata["frame_timestamps"] == [0.0]
+    assert metadata["transcript_timestamps"] == [0.5]
+
+
+class _FakeCapture:
+    """Minimal ``cv2.VideoCapture`` double that yields exactly one frame."""
+
+    def __init__(self):
+        self.calls = 0  # number of read() invocations so far
+
+    def read(self):
+        self.calls += 1
+        first_read = self.calls == 1
+        return (True, object()) if first_read else (False, None)
+
+    def release(self):
+        """Nothing to free."""
+
+    def isOpened(self):
+        return True
+
+    def get(self, property_id):
+        # 0 and 1 mirror the stubbed cv2.CAP_PROP_FPS / cv2.CAP_PROP_FRAME_COUNT ids.
+        if property_id == 0:
+            return 1.0  # frames per second
+        return 1 if property_id == 1 else 0
+
+
+class _FakeVisionClient:  # stand-in for the vision client that get_frame_descriptions calls
+    def __init__(self):
+        self.contexts = []  # every context string received, in call order
+
+    async def get_frame_description(self, image_base64, context):
+        self.contexts.append(context)  # kept so tests can inspect the generated prompt
+        return "visible frame"  # fixed description regardless of input
+
+
+class _FakeVisionClientClass:  # patched in for VisionModelClient so VideoParser.__init__ builds no real client
+    def __init__(self, config):
+        self.config = config  # stored only; never read in this file
+
+
+def test_video_parser_warns_when_assemblyai_key_is_missing(monkeypatch, caplog, video_modules):
+    """Constructing ``VideoParser`` without a key logs the skip-transcription warning."""
+    module = video_modules.parse_video_module
+    monkeypatch.setattr(module.cv2, "VideoCapture", lambda path: _FakeCapture())
+    monkeypatch.setattr(module, "VisionModelClient", _FakeVisionClientClass)
+    with caplog.at_level(logging.WARNING, logger="core.parser.video.parse_video"):
+        parser = video_modules.VideoParser("/tmp/video.mp4", assemblyai_api_key=None, frame_sample_rate=1)
+    parser.cap.release()
+    assert "AssemblyAI API key is not available; skipping transcription" in caplog.text
+
+
+@pytest.mark.asyncio
+async def test_frame_descriptions_do_not_mention_transcripts_when_transcript_is_empty(video_modules):
+    """With an empty transcript, the prompt sent to the vision client must not mention a transcript."""
+    video_parser = object.__new__(video_modules.VideoParser)  # bypass __init__
+    video_parser.cap = _FakeCapture()
+    video_parser.fps, video_parser.frame_sample_rate = 1.0, 1
+    video_parser.transcript = video_modules.TimeSeriesData(time_to_content={})
+    video_parser.vision_client = vision = _FakeVisionClient()
+    video_parser.frame_to_base64 = lambda frame: "image"
+
+    descriptions = await video_parser.get_frame_descriptions()
+
+    assert descriptions.time_to_content == {0.0: "visible frame"}
+    assert len(vision.contexts) == 1
+    assert "transcript" not in vision.contexts[0].lower()