diff --git a/core/parser/morphik_parser.py b/core/parser/morphik_parser.py index 605c61db..7e781f7e 100644 --- a/core/parser/morphik_parser.py +++ b/core/parser/morphik_parser.py @@ -428,10 +428,7 @@ def _parse_excel_to_markdown(file: bytes) -> str: return "\n".join(parts) async def _parse_video(self, file: bytes) -> Tuple[Dict[str, Any], str]: - """Parse video file to extract transcript and frame descriptions""" - if not self._assemblyai_api_key: - raise ValueError("AssemblyAI API key is required for video parsing") - + """Parse video file to extract frame descriptions and, when configured, transcript.""" # Save video to temporary file with tempfile.NamedTemporaryFile(delete=False, suffix=".mp4") as temp_file: temp_file.write(file) @@ -452,16 +449,23 @@ async def _parse_video(self, file: bytes) -> Tuple[Dict[str, Any], str]: ) results = await parser.process_video() - # Combine frame descriptions and transcript + # Combine frame descriptions and optional transcript frame_text = "\n".join(results.frame_descriptions.time_to_content.values()) - transcript_text = "\n".join(results.transcript.time_to_content.values()) - combined_text = f"Frame Descriptions:\n{frame_text}\n\nTranscript:\n{transcript_text}" + text_sections = [] + if frame_text: + text_sections.append(f"Frame Descriptions:\n{frame_text}") + if self._assemblyai_api_key: + transcript_text = "\n".join(results.transcript.time_to_content.values()) + if transcript_text: + text_sections.append(f"Transcript:\n{transcript_text}") + combined_text = "\n\n".join(text_sections) metadata = { "video_metadata": results.metadata, "frame_timestamps": list(results.frame_descriptions.time_to_content.keys()), - "transcript_timestamps": list(results.transcript.time_to_content.keys()), } + if self._assemblyai_api_key: + metadata["transcript_timestamps"] = list(results.transcript.time_to_content.keys()) return metadata, combined_text finally: diff --git a/core/parser/video/parse_video.py b/core/parser/video/parse_video.py index e27ef012..ab9166bc 100644 --- a/core/parser/video/parse_video.py +++ b/core/parser/video/parse_video.py @@ -89,13 +89,18 @@ async def get_frame_description(self, image_base64: str, context: str) -> str: class VideoParser: - def __init__(self, video_path: str, assemblyai_api_key: str, frame_sample_rate: Optional[int] = None): + def __init__( + self, + video_path: str, + assemblyai_api_key: Optional[str] = None, + frame_sample_rate: Optional[int] = None, + ): """ Initialize the video parser Args: video_path: Path to the video file - assemblyai_api_key: API key for AssemblyAI + assemblyai_api_key: Optional API key for AssemblyAI. If omitted, audio transcription is skipped. frame_sample_rate: Sample every nth frame for description (optional, defaults to config value) """ logger.info(f"Initializing VideoParser for {video_path}") @@ -112,12 +117,17 @@ def __init__(self, video_path: str, assemblyai_api_key: str, frame_sample_rate: self.total_frames = int(self.cap.get(cv2.CAP_PROP_FRAME_COUNT)) self.duration = self.total_frames / self.fps - # Initialize AssemblyAI - aai.settings.api_key = assemblyai_api_key - aai_config = aai.TranscriptionConfig(speaker_labels=True) - self.transcriber = aai.Transcriber(config=aai_config) self.transcript = TimeSeriesData(time_to_content={}) + # Initialize AssemblyAI only when audio transcription is configured. + self.transcriber = None + if assemblyai_api_key: + aai.settings.api_key = assemblyai_api_key + aai_config = aai.TranscriptionConfig(speaker_labels=True) + self.transcriber = aai.Transcriber(config=aai_config) + else: + logger.warning("AssemblyAI API key is not available; skipping transcription") + # Initialize vision model client self.vision_client = VisionModelClient(self.config) @@ -135,6 +145,9 @@ def get_transcript_object(self) -> aai.Transcript: """ Get the transcript object from AssemblyAI """ + if self.transcriber is None: + raise ValueError("AssemblyAI API key is required for video transcription") + logger.info("Starting video transcription") transcript = self.transcriber.transcribe(self.video_path) if transcript.status == "error": @@ -153,6 +166,10 @@ def get_transcript(self) -> TimeSeriesData: Returns: TimeSeriesData object containing transcript """ + if self.transcriber is None: + self.transcript = TimeSeriesData(time_to_content={}) + return self.transcript + logger.info("Starting video transcription") transcript = self.get_transcript_object() # divide by 1000 because assemblyai timestamps are in milliseconds @@ -193,18 +210,38 @@ async def get_frame_descriptions(self) -> TimeSeriesData: img_base64 = self.frame_to_base64(frame) - context = f"""Describe this frame from a video. Focus on the main elements, actions, and any notable details. Here is the transcript around the time of the frame: - --- - {self.transcript.at_time(timestamp, padding=10)} - --- - - Here is a description of the previous frame: - --- - {last_description if last_description else 'No previous frame description available, this is the first frame'} - --- - - In your response, only provide the description of the current frame, using the above information as context. - """ + if last_description: + previous_frame_context = last_description + else: + previous_frame_context = "No previous frame description available, this is the first frame" + + description_instruction = ( + "Describe this frame from a video. Focus on the main elements, actions, and any notable details." + ) + previous_frame_section = ( + "Here is a description of the previous frame:\n" + "---\n" + f"{previous_frame_context}\n" + "---" + ) + transcript_context = self.transcript.at_time(timestamp, padding=10) + if transcript_context: + context = ( + f"{description_instruction} Here is the transcript around the time of the frame:\n" + "---\n" + f"{transcript_context}\n" + "---\n\n" + f"{previous_frame_section}\n\n" + "In your response, only provide the description of the current frame, using the above " + "information as context." + ) + else: + context = ( + f"{description_instruction}\n\n" + f"{previous_frame_section}\n\n" + "In your response, only provide the description of the current frame, using the above " + "information as context." + ) last_description = await self.vision_client.get_frame_description(img_base64, context) time_to_description[timestamp] = last_description @@ -216,7 +253,7 @@ async def get_frame_descriptions(self) -> TimeSeriesData: async def process_video(self) -> ParseVideoResult: """ - Process the video to get both transcript and frame descriptions + Process the video to get frame descriptions and transcript when configured. Returns: Dictionary containing transcript and frame descriptions as TimeSeriesData objects @@ -230,7 +267,7 @@ async def process_video(self) -> ParseVideoResult: } result = ParseVideoResult( metadata=metadata, - transcript=self.get_transcript(), + transcript=self.get_transcript() if self.transcriber is not None else self.transcript, frame_descriptions=await self.get_frame_descriptions(), ) logger.info("Video processing completed successfully") diff --git a/core/tests/unit/test_video_parser.py b/core/tests/unit/test_video_parser.py new file mode 100644 index 00000000..fae8079d --- /dev/null +++ b/core/tests/unit/test_video_parser.py @@ -0,0 +1,246 @@ +import logging +import sys +import types + +import pytest + + +MODULES_UNDER_TEST = [ + "core.models.video", + "core.parser.video.parse_video", + "core.parser.morphik_parser", +] + + +def _drop_module(name): + sys.modules.pop(name, None) + if "." not in name: + return + + parent_name, attr_name = name.rsplit(".", 1) + parent = sys.modules.get(parent_name) + if parent is not None and hasattr(parent, attr_name): + delattr(parent, attr_name) + + +def _stub_module(monkeypatch, name): + module = types.ModuleType(name) + module.__path__ = [] + monkeypatch.setitem(sys.modules, name, module) + + if "." in name: + parent_name, attr_name = name.rsplit(".", 1) + parent = sys.modules.get(parent_name) + if parent is not None: + monkeypatch.setattr(parent, attr_name, module, raising=False) + + return module + + +def _install_dependency_stubs(monkeypatch): + _stub_module(monkeypatch, "docling") + _stub_module(monkeypatch, "docling.datamodel") + docling_base_models = _stub_module(monkeypatch, "docling.datamodel.base_models") + docling_base_models.InputFormat = types.SimpleNamespace(PDF="pdf") + + docling_pipeline_options = _stub_module(monkeypatch, "docling.datamodel.pipeline_options") + docling_pipeline_options.PdfPipelineOptions = type("PdfPipelineOptions", (), {}) + docling_pipeline_options.EasyOcrOptions = type("EasyOcrOptions", (), {}) + docling_pipeline_options.TableStructureOptions = type("TableStructureOptions", (), {}) + + docling_document_converter = _stub_module(monkeypatch, "docling.document_converter") + docling_document_converter.DocumentConverter = type( + "DocumentConverter", + (), + {"__init__": lambda self, *a, **kw: None}, + ) + docling_document_converter.PdfFormatOption = type( + "PdfFormatOption", + (), + {"__init__": lambda self, *a, **kw: None}, + ) + + assemblyai = _stub_module(monkeypatch, "assemblyai") + assemblyai.settings = types.SimpleNamespace(api_key=None) + assemblyai.Transcript = type("Transcript", (), {}) + assemblyai.TranscriptionConfig = type( + "TranscriptionConfig", + (), + {"__init__": lambda self, *a, **kw: None}, + ) + assemblyai.Transcriber = type( + "Transcriber", + (), + {"__init__": lambda self, *a, **kw: None}, + ) + + cv2 = _stub_module(monkeypatch, "cv2") + cv2.CAP_PROP_FPS = 0 + cv2.CAP_PROP_FRAME_COUNT = 1 + cv2.VideoCapture = lambda path: None + + _stub_module(monkeypatch, "litellm") + _stub_module(monkeypatch, "openpyxl") + + filetype = _stub_module(monkeypatch, "filetype") + filetype.guess = lambda content: None + + +@pytest.fixture +def video_modules(monkeypatch): + for module_name in MODULES_UNDER_TEST: + _drop_module(module_name) + + _install_dependency_stubs(monkeypatch) + + from core.models.video import ParseVideoResult, TimeSeriesData + from core.parser import morphik_parser as morphik_parser_module + from core.parser.video import parse_video as parse_video_module + from core.parser.morphik_parser import MorphikParser + from core.parser.video.parse_video import VideoParser + + yield types.SimpleNamespace( + MorphikParser=MorphikParser, + ParseVideoResult=ParseVideoResult, + TimeSeriesData=TimeSeriesData, + VideoParser=VideoParser, + morphik_parser_module=morphik_parser_module, + parse_video_module=parse_video_module, + ) + + for module_name in MODULES_UNDER_TEST: + _drop_module(module_name) + + +def _fake_video_parser_class(ParseVideoResult, TimeSeriesData): + class _FakeVideoParser: + instances = [] + + def __init__(self, video_path, assemblyai_api_key=None, frame_sample_rate=None): + self.video_path = video_path + self.assemblyai_api_key = assemblyai_api_key + self.frame_sample_rate = frame_sample_rate + self.instances.append(self) + + async def process_video(self): + return ParseVideoResult( + metadata={ + "duration": 1.0, + "fps": 1.0, + "total_frames": 1, + "frame_sample_rate": self.frame_sample_rate, + }, + frame_descriptions=TimeSeriesData(time_to_content={0.0: "visible frame"}), + transcript=TimeSeriesData(time_to_content={0.5: "spoken words"}), + ) + + return _FakeVideoParser + + +@pytest.mark.asyncio +async def test_parse_video_skips_transcript_without_assemblyai_key(monkeypatch, video_modules): + fake_video_parser = _fake_video_parser_class(video_modules.ParseVideoResult, video_modules.TimeSeriesData) + monkeypatch.setattr(video_modules.morphik_parser_module, "VideoParser", fake_video_parser) + monkeypatch.setattr( + video_modules.morphik_parser_module, + "load_config", + lambda: {"parser": {"vision": {"frame_sample_rate": 5}}}, + ) + parser = object.__new__(video_modules.MorphikParser) + parser._assemblyai_api_key = None + parser.frame_sample_rate = 1 + + metadata, text = await parser._parse_video(b"video bytes") + + assert fake_video_parser.instances[0].assemblyai_api_key is None + assert text == "Frame Descriptions:\nvisible frame" + assert metadata["frame_timestamps"] == [0.0] + assert "transcript_timestamps" not in metadata + assert "Transcript" not in text + + +@pytest.mark.asyncio +async def test_parse_video_includes_transcript_when_assemblyai_key_is_configured(monkeypatch, video_modules): + fake_video_parser = _fake_video_parser_class(video_modules.ParseVideoResult, video_modules.TimeSeriesData) + monkeypatch.setattr(video_modules.morphik_parser_module, "VideoParser", fake_video_parser) + monkeypatch.setattr( + video_modules.morphik_parser_module, + "load_config", + lambda: {"parser": {"vision": {"frame_sample_rate": 5}}}, + ) + parser = object.__new__(video_modules.MorphikParser) + parser._assemblyai_api_key = "assembly-key" + parser.frame_sample_rate = 1 + + metadata, text = await parser._parse_video(b"video bytes") + + assert fake_video_parser.instances[0].assemblyai_api_key == "assembly-key" + assert text == "Frame Descriptions:\nvisible frame\n\nTranscript:\nspoken words" + assert metadata["frame_timestamps"] == [0.0] + assert metadata["transcript_timestamps"] == [0.5] + + +class _FakeCapture: + def __init__(self): + self.calls = 0 + + def read(self): + self.calls += 1 + if self.calls == 1: + return True, object() + return False, None + + def release(self): + pass + + def isOpened(self): + return True + + def get(self, property_id): + if property_id == 0: + return 1.0 + if property_id == 1: + return 1 + return 0 + + +class _FakeVisionClient: + def __init__(self): + self.contexts = [] + + async def get_frame_description(self, image_base64, context): + self.contexts.append(context) + return "visible frame" + + +class _FakeVisionClientClass: + def __init__(self, config): + self.config = config + + +def test_video_parser_warns_when_assemblyai_key_is_missing(monkeypatch, caplog, video_modules): + monkeypatch.setattr(video_modules.parse_video_module.cv2, "VideoCapture", lambda path: _FakeCapture()) + monkeypatch.setattr(video_modules.parse_video_module, "VisionModelClient", _FakeVisionClientClass) + + with caplog.at_level(logging.WARNING, logger="core.parser.video.parse_video"): + parser = video_modules.VideoParser("/tmp/video.mp4", assemblyai_api_key=None, frame_sample_rate=1) + + parser.cap.release() + assert "AssemblyAI API key is not available; skipping transcription" in caplog.text + + +@pytest.mark.asyncio +async def test_frame_descriptions_do_not_mention_transcripts_when_transcript_is_empty(video_modules): + parser = object.__new__(video_modules.VideoParser) + parser.cap = _FakeCapture() + parser.fps = 1.0 + parser.frame_sample_rate = 1 + parser.transcript = video_modules.TimeSeriesData(time_to_content={}) + parser.vision_client = _FakeVisionClient() + parser.frame_to_base64 = lambda frame: "image" + + result = await parser.get_frame_descriptions() + + assert result.time_to_content == {0.0: "visible frame"} + assert len(parser.vision_client.contexts) == 1 + assert "transcript" not in parser.vision_client.contexts[0].lower()