Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
20 changes: 12 additions & 8 deletions core/parser/morphik_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -428,10 +428,7 @@ def _parse_excel_to_markdown(file: bytes) -> str:
return "\n".join(parts)

async def _parse_video(self, file: bytes) -> Tuple[Dict[str, Any], str]:
"""Parse video file to extract transcript and frame descriptions"""
if not self._assemblyai_api_key:
raise ValueError("AssemblyAI API key is required for video parsing")

"""Parse video file to extract frame descriptions and, when configured, transcript."""
# Save video to temporary file
with tempfile.NamedTemporaryFile(delete=False, suffix=".mp4") as temp_file:
temp_file.write(file)
Expand All @@ -452,16 +449,23 @@ async def _parse_video(self, file: bytes) -> Tuple[Dict[str, Any], str]:
)
results = await parser.process_video()

# Combine frame descriptions and transcript
# Combine frame descriptions and optional transcript
frame_text = "\n".join(results.frame_descriptions.time_to_content.values())
transcript_text = "\n".join(results.transcript.time_to_content.values())
combined_text = f"Frame Descriptions:\n{frame_text}\n\nTranscript:\n{transcript_text}"
text_sections = []
if frame_text:
text_sections.append(f"Frame Descriptions:\n{frame_text}")
if self._assemblyai_api_key:
transcript_text = "\n".join(results.transcript.time_to_content.values())
if transcript_text:
text_sections.append(f"Transcript:\n{transcript_text}")
combined_text = "\n\n".join(text_sections)

metadata = {
"video_metadata": results.metadata,
"frame_timestamps": list(results.frame_descriptions.time_to_content.keys()),
"transcript_timestamps": list(results.transcript.time_to_content.keys()),
}
if self._assemblyai_api_key:
metadata["transcript_timestamps"] = list(results.transcript.time_to_content.keys())

return metadata, combined_text
finally:
Expand Down
77 changes: 57 additions & 20 deletions core/parser/video/parse_video.py
Original file line number Diff line number Diff line change
Expand Up @@ -89,13 +89,18 @@ async def get_frame_description(self, image_base64: str, context: str) -> str:


class VideoParser:
def __init__(self, video_path: str, assemblyai_api_key: str, frame_sample_rate: Optional[int] = None):
def __init__(
self,
video_path: str,
assemblyai_api_key: Optional[str] = None,
frame_sample_rate: Optional[int] = None,
):
"""
Initialize the video parser

Args:
video_path: Path to the video file
assemblyai_api_key: API key for AssemblyAI
assemblyai_api_key: Optional API key for AssemblyAI. If omitted, audio transcription is skipped.
frame_sample_rate: Sample every nth frame for description (optional, defaults to config value)
"""
logger.info(f"Initializing VideoParser for {video_path}")
Expand All @@ -112,12 +117,17 @@ def __init__(self, video_path: str, assemblyai_api_key: str, frame_sample_rate:
self.total_frames = int(self.cap.get(cv2.CAP_PROP_FRAME_COUNT))
self.duration = self.total_frames / self.fps

# Initialize AssemblyAI
aai.settings.api_key = assemblyai_api_key
aai_config = aai.TranscriptionConfig(speaker_labels=True)
self.transcriber = aai.Transcriber(config=aai_config)
self.transcript = TimeSeriesData(time_to_content={})

# Initialize AssemblyAI only when audio transcription is configured.
self.transcriber = None
if assemblyai_api_key:
aai.settings.api_key = assemblyai_api_key
aai_config = aai.TranscriptionConfig(speaker_labels=True)
self.transcriber = aai.Transcriber(config=aai_config)
else:
logger.warning("AssemblyAI API key is not available; skipping transcription")

# Initialize vision model client
self.vision_client = VisionModelClient(self.config)

Expand All @@ -135,6 +145,9 @@ def get_transcript_object(self) -> aai.Transcript:
"""
Get the transcript object from AssemblyAI
"""
if self.transcriber is None:
raise ValueError("AssemblyAI API key is required for video transcription")

logger.info("Starting video transcription")
transcript = self.transcriber.transcribe(self.video_path)
if transcript.status == "error":
Expand All @@ -153,6 +166,10 @@ def get_transcript(self) -> TimeSeriesData:
Returns:
TimeSeriesData object containing transcript
"""
if self.transcriber is None:
self.transcript = TimeSeriesData(time_to_content={})
return self.transcript

logger.info("Starting video transcription")
transcript = self.get_transcript_object()
# divide by 1000 because assemblyai timestamps are in milliseconds
Expand Down Expand Up @@ -193,18 +210,38 @@ async def get_frame_descriptions(self) -> TimeSeriesData:

img_base64 = self.frame_to_base64(frame)

context = f"""Describe this frame from a video. Focus on the main elements, actions, and any notable details. Here is the transcript around the time of the frame:
---
{self.transcript.at_time(timestamp, padding=10)}
---

Here is a description of the previous frame:
---
{last_description if last_description else 'No previous frame description available, this is the first frame'}
---

In your response, only provide the description of the current frame, using the above information as context.
"""
if last_description:
previous_frame_context = last_description
else:
previous_frame_context = "No previous frame description available, this is the first frame"

description_instruction = (
"Describe this frame from a video. Focus on the main elements, actions, and any notable details."
)
previous_frame_section = (
"Here is a description of the previous frame:\n"
"---\n"
f"{previous_frame_context}\n"
"---"
)
transcript_context = self.transcript.at_time(timestamp, padding=10)
if transcript_context:
context = (
f"{description_instruction} Here is the transcript around the time of the frame:\n"
"---\n"
f"{transcript_context}\n"
"---\n\n"
f"{previous_frame_section}\n\n"
"In your response, only provide the description of the current frame, using the above "
"information as context."
)
else:
context = (
f"{description_instruction}\n\n"
f"{previous_frame_section}\n\n"
"In your response, only provide the description of the current frame, using the above "
"information as context."
)

last_description = await self.vision_client.get_frame_description(img_base64, context)
time_to_description[timestamp] = last_description
Expand All @@ -216,7 +253,7 @@ async def get_frame_descriptions(self) -> TimeSeriesData:

async def process_video(self) -> ParseVideoResult:
"""
Process the video to get both transcript and frame descriptions
Process the video to get frame descriptions and transcript when configured.

Returns:
Dictionary containing transcript and frame descriptions as TimeSeriesData objects
Expand All @@ -230,7 +267,7 @@ async def process_video(self) -> ParseVideoResult:
}
result = ParseVideoResult(
metadata=metadata,
transcript=self.get_transcript(),
transcript=self.get_transcript() if self.transcriber is not None else self.transcript,
frame_descriptions=await self.get_frame_descriptions(),
)
logger.info("Video processing completed successfully")
Expand Down
Loading
Loading