From bdcec69edf1bc96b1eaacc031af12d2b81646522 Mon Sep 17 00:00:00 2001 From: chienyuanchang Date: Wed, 6 May 2026 12:41:16 -0700 Subject: [PATCH 1/9] inital version --- README.md | 60 +++ packages/markitdown/pyproject.toml | 2 + .../markitdown/src/markitdown/__main__.py | 55 ++ .../markitdown/src/markitdown/_markitdown.py | 23 + .../src/markitdown/converters/__init__.py | 6 + .../markitdown/converters/_cu_converter.py | 485 ++++++++++++++++++ .../markitdown/tests/test_cu_converter.py | 329 ++++++++++++ 7 files changed, 960 insertions(+) create mode 100644 packages/markitdown/src/markitdown/converters/_cu_converter.py create mode 100644 packages/markitdown/tests/test_cu_converter.py diff --git a/README.md b/README.md index 71bcaa204..7d4936b36 100644 --- a/README.md +++ b/README.md @@ -168,6 +168,66 @@ markitdown path-to-file.pdf -o document.md -d -e "" +``` + +**Python API:** + +```python +from markitdown import MarkItDown + +# Zero-config — auto-selects analyzer per file type +md = MarkItDown(cu_endpoint="") +result = md.convert("report.pdf") # documents → prebuilt-documentSearch +result = md.convert("meeting.mp4") # video → prebuilt-videoSearch +result = md.convert("call.wav") # audio → prebuilt-audioSearch +print(result.markdown) +``` + +**With a custom analyzer** (for domain-specific field extraction): + +```python +md = MarkItDown( + cu_endpoint="", + cu_analyzer_id="my-invoice-analyzer", +) +result = md.convert("invoice.pdf") +print(result.markdown) +# Output includes YAML front matter with extracted fields: +# --- +# contentType: document +# fields: +# VendorName: CONTOSO LTD. +# InvoiceDate: '2019-11-15' +# --- +# +# ... +``` + +When `cu_analyzer_id` is set, the converter automatically scopes it to compatible file types based on the analyzer's modality. Incompatible types (e.g., audio files with a document analyzer) auto-route to default prebuilt analyzers. + +**Cost note:** Each `convert()` call for a CU-routed format is a billable Azure API call. 
Use `cu_file_types` to restrict which formats route to CU: + +```python +from markitdown.converters import ContentUnderstandingFileType + +md = MarkItDown( + cu_endpoint="", + cu_file_types=[ContentUnderstandingFileType.PDF], # only PDFs use CU +) +``` + +More information about Azure Content Understanding can be found [here](https://learn.microsoft.com/azure/ai-services/content-understanding/). + ### Python API Basic usage in Python: diff --git a/packages/markitdown/pyproject.toml b/packages/markitdown/pyproject.toml index ac3c8d947..84841cd03 100644 --- a/packages/markitdown/pyproject.toml +++ b/packages/markitdown/pyproject.toml @@ -47,6 +47,7 @@ all = [ "SpeechRecognition", "youtube-transcript-api~=1.0.0", "azure-ai-documentintelligence", + "azure-ai-contentunderstanding", "azure-identity", ] pptx = ["python-pptx"] @@ -58,6 +59,7 @@ outlook = ["olefile"] audio-transcription = ["pydub", "SpeechRecognition"] youtube-transcription = ["youtube-transcript-api"] az-doc-intel = ["azure-ai-documentintelligence", "azure-identity"] +az-content-understanding = ["azure-ai-contentunderstanding", "azure-identity"] [project.urls] Documentation = "https://github.com/microsoft/markitdown#readme" diff --git a/packages/markitdown/src/markitdown/__main__.py b/packages/markitdown/src/markitdown/__main__.py index 6085ad6bb..d57b2ae65 100644 --- a/packages/markitdown/src/markitdown/__main__.py +++ b/packages/markitdown/src/markitdown/__main__.py @@ -4,6 +4,7 @@ import argparse import sys import codecs +from typing import Any, Dict from textwrap import dedent from importlib.metadata import entry_points from .__about__ import __version__ @@ -91,6 +92,32 @@ def main(): help="Document Intelligence Endpoint. Required if using Document Intelligence.", ) + parser.add_argument( + "--use-cu", + "--use-content-understanding", + action="store_true", + dest="use_cu", + help="Use Azure Content Understanding to extract text. 
Requires --cu-endpoint.", + ) + + parser.add_argument( + "--cu-endpoint", + type=str, + help="Content Understanding Endpoint. Required if using --use-cu.", + ) + + parser.add_argument( + "--cu-analyzer", + type=str, + help="Content Understanding analyzer ID. If not specified, auto-selects by file type.", + ) + + parser.add_argument( + "--cu-file-types", + type=str, + help="Comma-separated list of file types to route to Content Understanding (e.g., pdf,jpeg,mp4). If omitted, all supported types are routed.", + ) + parser.add_argument( "-p", "--use-plugins", @@ -183,6 +210,34 @@ def main(): markitdown = MarkItDown( enable_plugins=args.use_plugins, docintel_endpoint=args.endpoint ) + elif args.use_cu: + if args.cu_endpoint is None: + _exit_with_error( + "Content Understanding Endpoint (--cu-endpoint) is required when using --use-cu." + ) + elif args.filename is None: + _exit_with_error("Filename is required when using Content Understanding.") + + cu_kwargs: Dict[str, Any] = { + "cu_endpoint": args.cu_endpoint, + } + if args.cu_analyzer is not None: + cu_kwargs["cu_analyzer_id"] = args.cu_analyzer + if args.cu_file_types is not None: + # Parse comma-separated file types into ContentUnderstandingFileType list + from .converters import ContentUnderstandingFileType + + type_names = [t.strip().lower() for t in args.cu_file_types.split(",") if t.strip()] + cu_types = [] + for name in type_names: + # Try matching by value (e.g., "pdf", "jpeg", "mp4") + try: + cu_types.append(ContentUnderstandingFileType(name)) + except ValueError: + _exit_with_error(f"Unknown file type: {name}") + cu_kwargs["cu_file_types"] = cu_types + + markitdown = MarkItDown(enable_plugins=args.use_plugins, **cu_kwargs) else: markitdown = MarkItDown(enable_plugins=args.use_plugins) diff --git a/packages/markitdown/src/markitdown/_markitdown.py b/packages/markitdown/src/markitdown/_markitdown.py index f342a614b..f6aa4df0e 100644 --- a/packages/markitdown/src/markitdown/_markitdown.py +++ 
b/packages/markitdown/src/markitdown/_markitdown.py @@ -38,6 +38,7 @@ ZipConverter, EpubConverter, DocumentIntelligenceConverter, + ContentUnderstandingConverter, CsvConverter, ) @@ -225,6 +226,28 @@ def enable_builtins(self, **kwargs) -> None: DocumentIntelligenceConverter(**docintel_args), ) + # Register Content Understanding converter at the top of the stack if endpoint is provided + cu_endpoint = kwargs.get("cu_endpoint") + if cu_endpoint is not None: + cu_args: Dict[str, Any] = {} + cu_args["endpoint"] = cu_endpoint + + cu_credential = kwargs.get("cu_credential") + if cu_credential is not None: + cu_args["credential"] = cu_credential + + cu_analyzer_id = kwargs.get("cu_analyzer_id") + if cu_analyzer_id is not None: + cu_args["analyzer_id"] = cu_analyzer_id + + cu_file_types = kwargs.get("cu_file_types") + if cu_file_types is not None: + cu_args["file_types"] = cu_file_types + + self.register_converter( + ContentUnderstandingConverter(**cu_args), + ) + self._builtins_enabled = True else: warn("Built-in converters are already enabled.", RuntimeWarning) diff --git a/packages/markitdown/src/markitdown/converters/__init__.py b/packages/markitdown/src/markitdown/converters/__init__.py index e4437a582..77f8b1acd 100644 --- a/packages/markitdown/src/markitdown/converters/__init__.py +++ b/packages/markitdown/src/markitdown/converters/__init__.py @@ -21,6 +21,10 @@ DocumentIntelligenceConverter, DocumentIntelligenceFileType, ) +from ._cu_converter import ( + ContentUnderstandingConverter, + ContentUnderstandingFileType, +) from ._epub_converter import EpubConverter from ._csv_converter import CsvConverter @@ -43,6 +47,8 @@ "ZipConverter", "DocumentIntelligenceConverter", "DocumentIntelligenceFileType", + "ContentUnderstandingConverter", + "ContentUnderstandingFileType", "EpubConverter", "CsvConverter", ] diff --git a/packages/markitdown/src/markitdown/converters/_cu_converter.py b/packages/markitdown/src/markitdown/converters/_cu_converter.py new file mode 100644 index 
000000000..23bf3ae62 --- /dev/null +++ b/packages/markitdown/src/markitdown/converters/_cu_converter.py @@ -0,0 +1,485 @@ +"""Azure Content Understanding converter for MarkItDown. + +Converts files using Azure Content Understanding (CU) for high-quality, +multi-modal extraction with structured field output. Supports documents, +images, audio, and video. Fields are serialized as YAML front matter via +the CU SDK's ``to_llm_input()`` helper. + +Install dependencies: ``pip install markitdown[az-content-understanding]`` +""" + +import sys +import os +from typing import BinaryIO, Any, List, Optional, Dict +from enum import Enum + +from .._base_converter import DocumentConverter, DocumentConverterResult +from .._stream_info import StreamInfo +from .._exceptions import MissingDependencyException + +# Try loading optional dependencies — save error for later +_dependency_exc_info = None +try: + from azure.ai.contentunderstanding import ContentUnderstandingClient, to_llm_input + from azure.core.credentials import AzureKeyCredential, TokenCredential + from azure.core.pipeline.policies import UserAgentPolicy + from azure.identity import DefaultAzureCredential +except ImportError: + _dependency_exc_info = sys.exc_info() + + # Stub classes for type hinting + class AzureKeyCredential: # type: ignore[no-redef] + pass + + class TokenCredential: # type: ignore[no-redef] + pass + + class ContentUnderstandingClient: # type: ignore[no-redef] + pass + + class UserAgentPolicy: # type: ignore[no-redef] + pass + + class DefaultAzureCredential: # type: ignore[no-redef] + pass + + def to_llm_input(*args, **kwargs): # type: ignore[no-redef] + pass + + +# --------------------------------------------------------------------------- +# File type enum and routing tables +# --------------------------------------------------------------------------- + + +class ContentUnderstandingFileType(str, Enum): + """Supported file types for Content Understanding conversion.""" + + # Documents + PDF = "pdf" + 
DOCX = "docx" + PPTX = "pptx" + XLSX = "xlsx" + HTML = "html" + TXT = "txt" + MD = "md" + RTF = "rtf" + XML = "xml" + + # Email + EML = "eml" + MSG = "msg" + + # Images (document modality) + JPEG = "jpeg" + PNG = "png" + BMP = "bmp" + TIFF = "tiff" + HEIF = "heif" + + # Video + MP4 = "mp4" + M4V = "m4v" + MOV = "mov" + AVI = "avi" + MKV = "mkv" + WEBM_VIDEO = "webm-video" + FLV = "flv" + WMV = "wmv" + + # Audio + WAV = "wav" + MP3 = "mp3" + M4A = "m4a" + FLAC = "flac" + OGG = "ogg" + AAC = "aac" + WMA = "wma" + + +# Extension → file type +_EXTENSION_MAP: Dict[str, ContentUnderstandingFileType] = { + # Documents + ".pdf": ContentUnderstandingFileType.PDF, + ".docx": ContentUnderstandingFileType.DOCX, + ".pptx": ContentUnderstandingFileType.PPTX, + ".xlsx": ContentUnderstandingFileType.XLSX, + ".html": ContentUnderstandingFileType.HTML, + ".txt": ContentUnderstandingFileType.TXT, + ".md": ContentUnderstandingFileType.MD, + ".rtf": ContentUnderstandingFileType.RTF, + ".xml": ContentUnderstandingFileType.XML, + # Email + ".eml": ContentUnderstandingFileType.EML, + ".msg": ContentUnderstandingFileType.MSG, + # Images + ".jpg": ContentUnderstandingFileType.JPEG, + ".jpeg": ContentUnderstandingFileType.JPEG, + ".jpe": ContentUnderstandingFileType.JPEG, + ".png": ContentUnderstandingFileType.PNG, + ".bmp": ContentUnderstandingFileType.BMP, + ".tiff": ContentUnderstandingFileType.TIFF, + ".heif": ContentUnderstandingFileType.HEIF, + ".heic": ContentUnderstandingFileType.HEIF, + # Video + ".mp4": ContentUnderstandingFileType.MP4, + ".m4v": ContentUnderstandingFileType.M4V, + ".mov": ContentUnderstandingFileType.MOV, + ".avi": ContentUnderstandingFileType.AVI, + ".mkv": ContentUnderstandingFileType.MKV, + ".webm": ContentUnderstandingFileType.WEBM_VIDEO, + ".flv": ContentUnderstandingFileType.FLV, + ".wmv": ContentUnderstandingFileType.WMV, + # Audio + ".wav": ContentUnderstandingFileType.WAV, + ".mp3": ContentUnderstandingFileType.MP3, + ".m4a": 
ContentUnderstandingFileType.M4A, + ".flac": ContentUnderstandingFileType.FLAC, + ".ogg": ContentUnderstandingFileType.OGG, + ".aac": ContentUnderstandingFileType.AAC, + ".wma": ContentUnderstandingFileType.WMA, +} + +# MIME type prefixes for each file type +_MIME_PREFIXES: Dict[ContentUnderstandingFileType, List[str]] = { + # Documents + ContentUnderstandingFileType.PDF: ["application/pdf", "application/x-pdf"], + ContentUnderstandingFileType.DOCX: [ + "application/vnd.openxmlformats-officedocument.wordprocessingml.document" + ], + ContentUnderstandingFileType.PPTX: [ + "application/vnd.openxmlformats-officedocument.presentationml" + ], + ContentUnderstandingFileType.XLSX: [ + "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet" + ], + ContentUnderstandingFileType.HTML: ["text/html", "application/xhtml+xml"], + ContentUnderstandingFileType.TXT: ["text/plain"], + ContentUnderstandingFileType.MD: ["text/markdown"], + ContentUnderstandingFileType.RTF: ["text/rtf", "application/rtf"], + ContentUnderstandingFileType.XML: ["text/xml", "application/xml"], + # Email + ContentUnderstandingFileType.EML: ["message/rfc822"], + ContentUnderstandingFileType.MSG: ["application/vnd.ms-outlook"], + # Images + ContentUnderstandingFileType.JPEG: ["image/jpeg"], + ContentUnderstandingFileType.PNG: ["image/png"], + ContentUnderstandingFileType.BMP: ["image/bmp"], + ContentUnderstandingFileType.TIFF: ["image/tiff"], + ContentUnderstandingFileType.HEIF: ["image/heif", "image/heic"], + # Video + ContentUnderstandingFileType.MP4: ["video/mp4"], + ContentUnderstandingFileType.M4V: ["video/x-m4v"], + ContentUnderstandingFileType.MOV: ["video/quicktime"], + ContentUnderstandingFileType.AVI: ["video/x-msvideo"], + ContentUnderstandingFileType.MKV: ["video/x-matroska"], + ContentUnderstandingFileType.WEBM_VIDEO: ["video/webm"], + ContentUnderstandingFileType.FLV: ["video/x-flv"], + ContentUnderstandingFileType.WMV: ["video/x-ms-wmv"], + # Audio + 
ContentUnderstandingFileType.WAV: ["audio/wav", "audio/x-wav"], + ContentUnderstandingFileType.MP3: ["audio/mpeg", "audio/mp3"], + ContentUnderstandingFileType.M4A: ["audio/mp4", "audio/m4a"], + ContentUnderstandingFileType.FLAC: ["audio/flac", "audio/x-flac"], + ContentUnderstandingFileType.OGG: ["audio/ogg"], + ContentUnderstandingFileType.AAC: ["audio/aac"], + ContentUnderstandingFileType.WMA: ["audio/x-ms-wma"], +} + +# File type → modality category +_DOCUMENT_TYPES = { + ContentUnderstandingFileType.PDF, + ContentUnderstandingFileType.DOCX, + ContentUnderstandingFileType.PPTX, + ContentUnderstandingFileType.XLSX, + ContentUnderstandingFileType.HTML, + ContentUnderstandingFileType.TXT, + ContentUnderstandingFileType.MD, + ContentUnderstandingFileType.RTF, + ContentUnderstandingFileType.XML, + ContentUnderstandingFileType.EML, + ContentUnderstandingFileType.MSG, + ContentUnderstandingFileType.JPEG, + ContentUnderstandingFileType.PNG, + ContentUnderstandingFileType.BMP, + ContentUnderstandingFileType.TIFF, + ContentUnderstandingFileType.HEIF, +} + +_VIDEO_TYPES = { + ContentUnderstandingFileType.MP4, + ContentUnderstandingFileType.M4V, + ContentUnderstandingFileType.MOV, + ContentUnderstandingFileType.AVI, + ContentUnderstandingFileType.MKV, + ContentUnderstandingFileType.WEBM_VIDEO, + ContentUnderstandingFileType.FLV, + ContentUnderstandingFileType.WMV, +} + +_AUDIO_TYPES = { + ContentUnderstandingFileType.WAV, + ContentUnderstandingFileType.MP3, + ContentUnderstandingFileType.M4A, + ContentUnderstandingFileType.FLAC, + ContentUnderstandingFileType.OGG, + ContentUnderstandingFileType.AAC, + ContentUnderstandingFileType.WMA, +} + +_DEFAULT_ANALYZERS = { + "document": "prebuilt-documentSearch", + "video": "prebuilt-videoSearch", + "audio": "prebuilt-audioSearch", +} + +# All supported file types (default set when file_types is None) +_ALL_FILE_TYPES = list(ContentUnderstandingFileType) + + +def _get_modality(file_type: ContentUnderstandingFileType) -> str: + 
"""Get the modality category for a file type.""" + if file_type in _DOCUMENT_TYPES: + return "document" + elif file_type in _VIDEO_TYPES: + return "video" + elif file_type in _AUDIO_TYPES: + return "audio" + raise ValueError(f"Unknown file type: {file_type}") + + +# --------------------------------------------------------------------------- +# Smart routing: base_analyzer_id → modality mapping +# --------------------------------------------------------------------------- + +_BASE_TO_MODALITY: Dict[str, str] = { + "prebuilt-document": "document", + "prebuilt-image": "document", # CU images return kind="document" + "prebuilt-audio": "audio", + "prebuilt-video": "video", +} + +# For prebuilt analyzers, infer modality from name without an API call +_PREBUILT_MODALITY: Dict[str, str] = { + # Document-based prebuilts + "prebuilt-documentSearch": "document", + "prebuilt-layout": "document", + "prebuilt-read": "document", + "prebuilt-document": "document", + "prebuilt-invoice": "document", + "prebuilt-receipt": "document", + "prebuilt-receipt.generic": "document", + "prebuilt-receipt.hotel": "document", + "prebuilt-idDocument": "document", + "prebuilt-idDocument.generic": "document", + "prebuilt-idDocument.passport": "document", + "prebuilt-healthInsuranceCard.us": "document", + "prebuilt-contract": "document", + "prebuilt-creditCard": "document", + "prebuilt-creditMemo": "document", + "prebuilt-bankStatement.us": "document", + "prebuilt-check.us": "document", + "prebuilt-purchaseOrder": "document", + "prebuilt-procurement": "document", + "prebuilt-payStub.us": "document", + "prebuilt-utilityBill": "document", + "prebuilt-marriageCertificate.us": "document", + "prebuilt-documentFieldSchema": "document", + "prebuilt-documentFields": "document", + # Tax prebuilts (all document-based) + "prebuilt-tax.us": "document", + "prebuilt-tax.us.w2": "document", + "prebuilt-tax.us.w4": "document", + "prebuilt-tax.us.1040": "document", + # Mortgage prebuilts + "prebuilt-mortgage.us": 
"document", + "prebuilt-mortgage.us.1003": "document", + "prebuilt-mortgage.us.closingDisclosure": "document", + # Image-based prebuilts + "prebuilt-image": "document", # images are document modality in CU + "prebuilt-imageSearch": "document", + # Audio-based prebuilts + "prebuilt-audio": "audio", + "prebuilt-audioSearch": "audio", + "prebuilt-callCenter": "audio", + # Video-based prebuilts + "prebuilt-video": "video", + "prebuilt-videoSearch": "video", + "prebuilt-videoSynopsis": "video", +} + + +def _infer_prebuilt_modality(analyzer_id: str) -> str: + """Infer modality from a prebuilt analyzer ID without an API call.""" + if analyzer_id in _PREBUILT_MODALITY: + return _PREBUILT_MODALITY[analyzer_id] + # Unknown prebuilt — most prebuilts are document-based + return "document" + + +# --------------------------------------------------------------------------- +# Converter +# --------------------------------------------------------------------------- + + +class ContentUnderstandingConverter(DocumentConverter): + """Converts files using Azure Content Understanding. + + Provides high-quality document, image, audio, and video conversion + with structured field extraction via YAML front matter. + """ + + def __init__( + self, + *, + endpoint: str, + credential: AzureKeyCredential | TokenCredential | None = None, + analyzer_id: Optional[str] = None, + file_types: Optional[List[ContentUnderstandingFileType]] = None, + ): + """Initialize the Content Understanding converter. + + Args: + endpoint: CU resource endpoint URL. + credential: Explicit credential. If None, falls back to + AZURE_API_KEY env var, then DefaultAzureCredential. + analyzer_id: Custom analyzer for compatible file types. + When set, the converter checks the analyzer's base modality + (via get_analyzer() at init) and routes only compatible + file types to it. Incompatible modalities auto-route to + default prebuilts. If None, auto-selects by extension/MIME. + file_types: Which file types to handle. 
If None, uses the + default set (all supported formats). + """ + super().__init__() + + # Raise if dependencies are missing + if _dependency_exc_info is not None: + raise MissingDependencyException( + "ContentUnderstandingConverter requires the optional dependency " + "[az-content-understanding] (or [all]) to be installed. " + "E.g., `pip install markitdown[az-content-understanding]`" + ) from _dependency_exc_info[ + 1 + ].with_traceback( # type: ignore[union-attr] + _dependency_exc_info[2] + ) + + self._file_types = file_types if file_types is not None else _ALL_FILE_TYPES + self._analyzer_id = analyzer_id + self._analyzer_modality: Optional[str] = None + + # Resolve credential + if credential is None: + api_key = os.environ.get("AZURE_API_KEY") + if api_key is not None: + credential = AzureKeyCredential(api_key) + else: + credential = DefaultAzureCredential() + + # Build file type lookup sets + self._accepted_extensions = set() + self._accepted_mime_prefixes: List[str] = [] + for ft in self._file_types: + # Extensions + for ext, mapped_ft in _EXTENSION_MAP.items(): + if mapped_ft == ft: + self._accepted_extensions.add(ext) + # MIME prefixes + if ft in _MIME_PREFIXES: + self._accepted_mime_prefixes.extend(_MIME_PREFIXES[ft]) + + # User agent for telemetry + try: + from ..__about__ import __version__ + except ImportError: + __version__ = "unknown" + user_agent = f"markitdown-cu/{__version__}" + + # Create CU client + self._client = ContentUnderstandingClient( + endpoint=endpoint, + credential=credential, + user_agent_policy=UserAgentPolicy(user_agent=user_agent), + ) + + # Smart routing: resolve analyzer modality at init + if self._analyzer_id is not None: + if self._analyzer_id.startswith("prebuilt-"): + # Infer from name — no API call + self._analyzer_modality = _infer_prebuilt_modality(self._analyzer_id) + else: + # Custom analyzer — one get_analyzer() call, cached + analyzer_info = self._client.get_analyzer(self._analyzer_id) + if 
analyzer_info.base_analyzer_id: + self._analyzer_modality = _BASE_TO_MODALITY.get( + analyzer_info.base_analyzer_id, "document" + ) + else: + self._analyzer_modality = "document" + + def accepts( + self, + file_stream: BinaryIO, + stream_info: StreamInfo, + **kwargs: Any, + ) -> bool: + """Return True if the file type is in the configured set.""" + mimetype = (stream_info.mimetype or "").lower() + extension = (stream_info.extension or "").lower() + + if extension in self._accepted_extensions: + return True + + for prefix in self._accepted_mime_prefixes: + if mimetype.startswith(prefix): + return True + + return False + + def convert( + self, + file_stream: BinaryIO, + stream_info: StreamInfo, + **kwargs: Any, + ) -> DocumentConverterResult: + """Convert the file using CU and return Markdown with YAML front matter.""" + + # 1. Determine analyzer_id (smart routing: check modality) + extension = (stream_info.extension or "").lower() + file_type = _EXTENSION_MAP.get(extension) + + if file_type is not None: + file_modality = _get_modality(file_type) + else: + # Fallback: try MIME type + file_modality = "document" + + if ( + self._analyzer_id is not None + and self._analyzer_modality is not None + and file_modality == self._analyzer_modality + ): + analyzer_id = self._analyzer_id + else: + analyzer_id = _DEFAULT_ANALYZERS.get(file_modality, "prebuilt-documentSearch") + + # 2. Read file bytes and determine MIME type + file_bytes = file_stream.read() + content_type = stream_info.mimetype or "application/octet-stream" + + # 3. Call CU SDK + poller = self._client.begin_analyze_binary( + analyzer_id=analyzer_id, + binary_input=file_bytes, + content_type=content_type, + ) + + # 4. Block on result + result = poller.result() + + # 5. Format output using to_llm_input() + text = to_llm_input(result) + + # 6. 
Return + return DocumentConverterResult(markdown=text) diff --git a/packages/markitdown/tests/test_cu_converter.py b/packages/markitdown/tests/test_cu_converter.py new file mode 100644 index 000000000..2a9fba773 --- /dev/null +++ b/packages/markitdown/tests/test_cu_converter.py @@ -0,0 +1,329 @@ +"""Tests for ContentUnderstandingConverter. + +Tests accepts() routing, smart routing modality logic, and convert() via mocks. +Follows the same pattern as test_docintel_html.py. +""" + +import io +from unittest.mock import MagicMock, patch, PropertyMock + +import pytest + +from markitdown.converters._cu_converter import ( + ContentUnderstandingConverter, + ContentUnderstandingFileType, + _infer_prebuilt_modality, + _get_modality, + _EXTENSION_MAP, +) +from markitdown._stream_info import StreamInfo + + +# --------------------------------------------------------------------------- +# Helper: create a converter with accepts() working but no SDK init +# --------------------------------------------------------------------------- + +def _make_converter(file_types=None, analyzer_id=None, analyzer_modality=None): + """Create a converter bypassing __init__ (no SDK deps needed).""" + conv = ContentUnderstandingConverter.__new__(ContentUnderstandingConverter) + conv._analyzer_id = analyzer_id + conv._analyzer_modality = analyzer_modality + + # Build accepted extensions/mime from file_types + from markitdown.converters._cu_converter import ( + _ALL_FILE_TYPES, + _MIME_PREFIXES, + ) + + types = file_types if file_types is not None else _ALL_FILE_TYPES + conv._file_types = types + + conv._accepted_extensions = set() + conv._accepted_mime_prefixes = [] + for ft in types: + for ext, mapped_ft in _EXTENSION_MAP.items(): + if mapped_ft == ft: + conv._accepted_extensions.add(ext) + if ft in _MIME_PREFIXES: + conv._accepted_mime_prefixes.extend(_MIME_PREFIXES[ft]) + + return conv + + +# --------------------------------------------------------------------------- +# accepts() tests — 
extension-based +# --------------------------------------------------------------------------- + +class TestAcceptsExtension: + """Test accepts() for supported and unsupported file extensions.""" + + @pytest.mark.parametrize("ext", [ + ".pdf", ".docx", ".pptx", ".xlsx", ".html", ".txt", ".md", ".rtf", ".xml", + ".eml", ".msg", + ".jpg", ".jpeg", ".jpe", ".png", ".bmp", ".tiff", ".heif", ".heic", + ".mp4", ".m4v", ".mov", ".avi", ".mkv", ".webm", ".flv", ".wmv", + ".wav", ".mp3", ".m4a", ".flac", ".ogg", ".aac", ".wma", + ]) + def test_accepts_supported_extensions(self, ext): + conv = _make_converter() + assert conv.accepts(io.BytesIO(b""), StreamInfo(extension=ext)) + + @pytest.mark.parametrize("ext", [".csv", ".json", ".zip", ".epub", ".py", ".rs"]) + def test_rejects_unsupported_extensions(self, ext): + conv = _make_converter() + assert not conv.accepts(io.BytesIO(b""), StreamInfo(extension=ext)) + + +# --------------------------------------------------------------------------- +# accepts() tests — MIME-based +# --------------------------------------------------------------------------- + +class TestAcceptsMime: + """Test accepts() for MIME type matching.""" + + @pytest.mark.parametrize("mime", [ + "application/pdf", + "image/jpeg", + "video/mp4", + "audio/wav", + "text/html", + "audio/mpeg", + "video/quicktime", + ]) + def test_accepts_supported_mimetypes(self, mime): + conv = _make_converter() + assert conv.accepts(io.BytesIO(b""), StreamInfo(mimetype=mime)) + + @pytest.mark.parametrize("mime", [ + "text/csv", + "application/json", + "application/zip", + ]) + def test_rejects_unsupported_mimetypes(self, mime): + conv = _make_converter() + assert not conv.accepts(io.BytesIO(b""), StreamInfo(mimetype=mime)) + + +# --------------------------------------------------------------------------- +# accepts() tests — cu_file_types restriction +# --------------------------------------------------------------------------- + +class TestAcceptsFileTypeRestriction: + """Test 
that cu_file_types restricts which formats are accepted.""" + + def test_restricted_to_pdf_only(self): + conv = _make_converter(file_types=[ContentUnderstandingFileType.PDF]) + assert conv.accepts(io.BytesIO(b""), StreamInfo(extension=".pdf")) + assert not conv.accepts(io.BytesIO(b""), StreamInfo(extension=".mp4")) + assert not conv.accepts(io.BytesIO(b""), StreamInfo(extension=".wav")) + assert not conv.accepts(io.BytesIO(b""), StreamInfo(extension=".jpg")) + + def test_restricted_to_audio(self): + conv = _make_converter(file_types=[ + ContentUnderstandingFileType.WAV, + ContentUnderstandingFileType.MP3, + ]) + assert conv.accepts(io.BytesIO(b""), StreamInfo(extension=".wav")) + assert conv.accepts(io.BytesIO(b""), StreamInfo(extension=".mp3")) + assert not conv.accepts(io.BytesIO(b""), StreamInfo(extension=".pdf")) + + +# --------------------------------------------------------------------------- +# Smart routing tests +# --------------------------------------------------------------------------- + +class TestSmartRouting: + """Test modality-aware analyzer routing.""" + + def test_document_analyzer_routes_pdf_to_custom(self): + """Document-based analyzer should be used for PDF.""" + conv = _make_converter( + analyzer_id="my-doc-analyzer", + analyzer_modality="document", + ) + conv._client = MagicMock() + mock_result = MagicMock() + mock_result.contents = [] + mock_poller = MagicMock() + mock_poller.result.return_value = mock_result + + conv._client.begin_analyze_binary.return_value = mock_poller + + with patch("markitdown.converters._cu_converter.to_llm_input", return_value=""): + conv.convert(io.BytesIO(b"fake pdf"), StreamInfo(extension=".pdf", mimetype="application/pdf")) + + # Should use the custom analyzer for PDF (document modality) + call_args = conv._client.begin_analyze_binary.call_args + assert call_args.kwargs["analyzer_id"] == "my-doc-analyzer" + + def test_document_analyzer_routes_mp3_to_prebuilt(self): + """Document-based analyzer should auto-route 
MP3 to prebuilt-audioSearch.""" + conv = _make_converter( + analyzer_id="my-doc-analyzer", + analyzer_modality="document", + ) + conv._client = MagicMock() + mock_result = MagicMock() + mock_result.contents = [] + mock_poller = MagicMock() + mock_poller.result.return_value = mock_result + + conv._client.begin_analyze_binary.return_value = mock_poller + + with patch("markitdown.converters._cu_converter.to_llm_input", return_value=""): + conv.convert(io.BytesIO(b"fake audio"), StreamInfo(extension=".mp3", mimetype="audio/mpeg")) + + call_args = conv._client.begin_analyze_binary.call_args + assert call_args.kwargs["analyzer_id"] == "prebuilt-audioSearch" + + def test_document_analyzer_routes_mp4_to_prebuilt(self): + """Document-based analyzer should auto-route MP4 to prebuilt-videoSearch.""" + conv = _make_converter( + analyzer_id="my-doc-analyzer", + analyzer_modality="document", + ) + conv._client = MagicMock() + mock_result = MagicMock() + mock_result.contents = [] + mock_poller = MagicMock() + mock_poller.result.return_value = mock_result + + conv._client.begin_analyze_binary.return_value = mock_poller + + with patch("markitdown.converters._cu_converter.to_llm_input", return_value=""): + conv.convert(io.BytesIO(b"fake video"), StreamInfo(extension=".mp4", mimetype="video/mp4")) + + call_args = conv._client.begin_analyze_binary.call_args + assert call_args.kwargs["analyzer_id"] == "prebuilt-videoSearch" + + def test_no_analyzer_id_uses_auto_routing(self): + """Without analyzer_id, PDF should auto-route to prebuilt-documentSearch.""" + conv = _make_converter(analyzer_id=None, analyzer_modality=None) + conv._client = MagicMock() + mock_result = MagicMock() + mock_result.contents = [] + mock_poller = MagicMock() + mock_poller.result.return_value = mock_result + + conv._client.begin_analyze_binary.return_value = mock_poller + + with patch("markitdown.converters._cu_converter.to_llm_input", return_value=""): + conv.convert(io.BytesIO(b"fake pdf"), 
StreamInfo(extension=".pdf", mimetype="application/pdf")) + + call_args = conv._client.begin_analyze_binary.call_args + assert call_args.kwargs["analyzer_id"] == "prebuilt-documentSearch" + + +# --------------------------------------------------------------------------- +# _infer_prebuilt_modality tests +# --------------------------------------------------------------------------- + +class TestInferPrebuiltModality: + """Test modality inference from prebuilt analyzer names.""" + + def test_document_prebuilts(self): + assert _infer_prebuilt_modality("prebuilt-documentSearch") == "document" + assert _infer_prebuilt_modality("prebuilt-invoice") == "document" + assert _infer_prebuilt_modality("prebuilt-layout") == "document" + assert _infer_prebuilt_modality("prebuilt-receipt") == "document" + assert _infer_prebuilt_modality("prebuilt-tax.us.w2") == "document" + + def test_audio_prebuilts(self): + assert _infer_prebuilt_modality("prebuilt-audioSearch") == "audio" + assert _infer_prebuilt_modality("prebuilt-callCenter") == "audio" + + def test_video_prebuilts(self): + assert _infer_prebuilt_modality("prebuilt-videoSearch") == "video" + assert _infer_prebuilt_modality("prebuilt-videoSynopsis") == "video" + + def test_image_prebuilts_map_to_document(self): + assert _infer_prebuilt_modality("prebuilt-imageSearch") == "document" + assert _infer_prebuilt_modality("prebuilt-image") == "document" + + def test_unknown_prebuilt_defaults_to_document(self): + assert _infer_prebuilt_modality("prebuilt-unknownNewAnalyzer") == "document" + + +# --------------------------------------------------------------------------- +# _get_modality tests +# --------------------------------------------------------------------------- + +class TestGetModality: + """Test file type → modality mapping.""" + + def test_document_types(self): + assert _get_modality(ContentUnderstandingFileType.PDF) == "document" + assert _get_modality(ContentUnderstandingFileType.DOCX) == "document" + assert 
_get_modality(ContentUnderstandingFileType.JPEG) == "document" + + def test_video_types(self): + assert _get_modality(ContentUnderstandingFileType.MP4) == "video" + assert _get_modality(ContentUnderstandingFileType.MOV) == "video" + + def test_audio_types(self): + assert _get_modality(ContentUnderstandingFileType.WAV) == "audio" + assert _get_modality(ContentUnderstandingFileType.MP3) == "audio" + + +# --------------------------------------------------------------------------- +# convert() mock tests +# --------------------------------------------------------------------------- + +class TestConvertMock: + """Test convert() with mocked CU SDK.""" + + def _run_convert(self, extension, mimetype, expected_output="mock output"): + conv = _make_converter() + conv._client = MagicMock() + + mock_result = MagicMock() + mock_result.contents = [] + mock_poller = MagicMock() + mock_poller.result.return_value = mock_result + conv._client.begin_analyze_binary.return_value = mock_poller + + with patch( + "markitdown.converters._cu_converter.to_llm_input", + return_value=expected_output, + ): + result = conv.convert( + io.BytesIO(b"fake content"), + StreamInfo(extension=extension, mimetype=mimetype), + ) + return result + + def test_pdf_returns_markdown(self): + result = self._run_convert(".pdf", "application/pdf", "---\ncontentType: document\n---\n# Test") + assert "contentType: document" in result.markdown + + def test_mp4_returns_markdown(self): + result = self._run_convert(".mp4", "video/mp4", "---\ncontentType: audioVisual\n---\nSpeaker 1: Hello") + assert "contentType: audioVisual" in result.markdown + + def test_wav_returns_markdown(self): + result = self._run_convert(".wav", "audio/wav", "---\ncontentType: audioVisual\n---\nSpeaker 1: Hi") + assert "audioVisual" in result.markdown + + def test_empty_result(self): + result = self._run_convert(".pdf", "application/pdf", "") + assert result.markdown == "" + + +# 
--------------------------------------------------------------------------- +# MissingDependencyException test +# --------------------------------------------------------------------------- + +class TestMissingDependency: + """Test that MissingDependencyException is raised when CU SDK is not installed.""" + + def test_missing_deps_message(self): + """Verify the exception includes install hint.""" + # We can't easily simulate ImportError in the module, but we can check + # the exception message pattern if it were raised. + from markitdown._exceptions import MissingDependencyException + + exc = MissingDependencyException( + "ContentUnderstandingConverter requires the optional dependency " + "[az-content-understanding] (or [all]) to be installed." + ) + assert "az-content-understanding" in str(exc) From 1c70a82f937be6c3568f7115710fad1b7fd65359 Mon Sep 17 00:00:00 2001 From: chienyuanchang Date: Wed, 6 May 2026 14:24:50 -0700 Subject: [PATCH 2/9] improve mime type detection --- README.md | 9 + packages/markitdown/pyproject.toml | 4 +- .../markitdown/converters/_cu_converter.py | 111 +++++++---- .../markitdown/tests/test_cu_converter.py | 184 ++++++++++++++++-- 4 files changed, 249 insertions(+), 59 deletions(-) diff --git a/README.md b/README.md index 7d4936b36..005d7bdc0 100644 --- a/README.md +++ b/README.md @@ -107,6 +107,7 @@ At the moment, the following optional dependencies are available: * `[pdf]` Installs dependencies for PDF files * `[outlook]` Installs dependencies for Outlook messages * `[az-doc-intel]` Installs dependencies for Azure Document Intelligence +* `[az-content-understanding]` Installs dependencies for Azure Content Understanding * `[audio-transcription]` Installs dependencies for audio transcription of wav and mp3 files * `[youtube-transcription]` Installs dependencies for fetching YouTube video transcription @@ -174,6 +175,14 @@ More information about how to set up an Azure Document Intelligence Resource can Install: `pip install 
'markitdown[az-content-understanding]'` +| Capability | Built-in converters | Azure Document Intelligence | Azure Content Understanding | +|------------|---------------------|-----------------------------|-----------------------------| +| Document conversion | Offline, format-specific extraction | Cloud layout extraction | Cloud multimodal extraction | +| Structured fields | Not available | Not exposed by this integration | YAML front matter from analyzer fields | +| Custom analyzers | Not available | Not configurable in this integration | Supported with `cu_analyzer_id` | +| Audio and video | Basic audio, no video | Not supported | Audio and video analyzers | +| Cost | Local compute only | Billable Azure API calls | Billable Azure API calls | + **CLI:** ```bash diff --git a/packages/markitdown/pyproject.toml b/packages/markitdown/pyproject.toml index 84841cd03..8366c0754 100644 --- a/packages/markitdown/pyproject.toml +++ b/packages/markitdown/pyproject.toml @@ -47,7 +47,7 @@ all = [ "SpeechRecognition", "youtube-transcript-api~=1.0.0", "azure-ai-documentintelligence", - "azure-ai-contentunderstanding", + "azure-ai-contentunderstanding>=1.2.0b1", "azure-identity", ] pptx = ["python-pptx"] @@ -59,7 +59,7 @@ outlook = ["olefile"] audio-transcription = ["pydub", "SpeechRecognition"] youtube-transcription = ["youtube-transcript-api"] az-doc-intel = ["azure-ai-documentintelligence", "azure-identity"] -az-content-understanding = ["azure-ai-contentunderstanding", "azure-identity"] +az-content-understanding = ["azure-ai-contentunderstanding>=1.2.0b1", "azure-identity"] [project.urls] Documentation = "https://github.com/microsoft/markitdown#readme" diff --git a/packages/markitdown/src/markitdown/converters/_cu_converter.py b/packages/markitdown/src/markitdown/converters/_cu_converter.py index 23bf3ae62..b2f6c80ba 100644 --- a/packages/markitdown/src/markitdown/converters/_cu_converter.py +++ b/packages/markitdown/src/markitdown/converters/_cu_converter.py @@ -83,7 +83,7 @@ 
class ContentUnderstandingFileType(str, Enum): MOV = "mov" AVI = "avi" MKV = "mkv" - WEBM_VIDEO = "webm-video" + WEBM = "webm" FLV = "flv" WMV = "wmv" @@ -127,7 +127,7 @@ class ContentUnderstandingFileType(str, Enum): ".mov": ContentUnderstandingFileType.MOV, ".avi": ContentUnderstandingFileType.AVI, ".mkv": ContentUnderstandingFileType.MKV, - ".webm": ContentUnderstandingFileType.WEBM_VIDEO, + ".webm": ContentUnderstandingFileType.WEBM, ".flv": ContentUnderstandingFileType.FLV, ".wmv": ContentUnderstandingFileType.WMV, # Audio @@ -173,19 +173,26 @@ class ContentUnderstandingFileType(str, Enum): ContentUnderstandingFileType.MOV: ["video/quicktime"], ContentUnderstandingFileType.AVI: ["video/x-msvideo"], ContentUnderstandingFileType.MKV: ["video/x-matroska"], - ContentUnderstandingFileType.WEBM_VIDEO: ["video/webm"], + ContentUnderstandingFileType.WEBM: ["video/webm"], ContentUnderstandingFileType.FLV: ["video/x-flv"], ContentUnderstandingFileType.WMV: ["video/x-ms-wmv"], # Audio ContentUnderstandingFileType.WAV: ["audio/wav", "audio/x-wav"], ContentUnderstandingFileType.MP3: ["audio/mpeg", "audio/mp3"], - ContentUnderstandingFileType.M4A: ["audio/mp4", "audio/m4a"], + ContentUnderstandingFileType.M4A: ["audio/mp4", "audio/m4a", "audio/x-m4a"], ContentUnderstandingFileType.FLAC: ["audio/flac", "audio/x-flac"], ContentUnderstandingFileType.OGG: ["audio/ogg"], ContentUnderstandingFileType.AAC: ["audio/aac"], ContentUnderstandingFileType.WMA: ["audio/x-ms-wma"], } +_MIME_ALIASES: Dict[str, str] = { + "audio/x-wav": "audio/wav", + "audio/x-flac": "audio/flac", + "audio/x-m4a": "audio/mp4", + "video/x-m4v": "video/mp4", +} + # File type → modality category _DOCUMENT_TYPES = { ContentUnderstandingFileType.PDF, @@ -212,7 +219,7 @@ class ContentUnderstandingFileType(str, Enum): ContentUnderstandingFileType.MOV, ContentUnderstandingFileType.AVI, ContentUnderstandingFileType.MKV, - ContentUnderstandingFileType.WEBM_VIDEO, + ContentUnderstandingFileType.WEBM, 
ContentUnderstandingFileType.FLV, ContentUnderstandingFileType.WMV, } @@ -248,6 +255,62 @@ def _get_modality(file_type: ContentUnderstandingFileType) -> str: raise ValueError(f"Unknown file type: {file_type}") +def _detect_file_type( + stream_info: StreamInfo, + file_types: Optional[List[ContentUnderstandingFileType]] = None, +) -> Optional[ContentUnderstandingFileType]: + """Detect a supported CU file type from extension or MIME type.""" + allowed = set(file_types) if file_types is not None else None + + extension = (stream_info.extension or "").lower() + file_type = _EXTENSION_MAP.get(extension) + if file_type is not None and (allowed is None or file_type in allowed): + return file_type + + mimetype = _clean_mime_type(stream_info.mimetype) + if not mimetype: + return None + + return _detect_file_type_from_mime(mimetype, allowed) + + +def _clean_mime_type(mimetype: Optional[str]) -> str: + return (mimetype or "").split(";", 1)[0].strip().lower() + + +def _canonical_mime_type(mimetype: Optional[str]) -> str: + cleaned = _clean_mime_type(mimetype) + return _MIME_ALIASES.get(cleaned, cleaned) or "application/octet-stream" + + +def _content_type_for( + file_type: ContentUnderstandingFileType, + mimetype: Optional[str], +) -> str: + content_type = _canonical_mime_type(mimetype) + if content_type != "application/octet-stream": + return content_type + + prefixes = _MIME_PREFIXES.get(file_type, []) + if not prefixes: + return content_type + + return _canonical_mime_type(prefixes[0]) + + +def _detect_file_type_from_mime( + mimetype: str, + allowed: Optional[set[ContentUnderstandingFileType]], +) -> Optional[ContentUnderstandingFileType]: + for candidate, prefixes in _MIME_PREFIXES.items(): + if allowed is not None and candidate not in allowed: + continue + for prefix in prefixes: + if mimetype.startswith(prefix): + return candidate + return None + + # --------------------------------------------------------------------------- # Smart routing: base_analyzer_id → modality 
mapping # --------------------------------------------------------------------------- @@ -377,18 +440,6 @@ def __init__( else: credential = DefaultAzureCredential() - # Build file type lookup sets - self._accepted_extensions = set() - self._accepted_mime_prefixes: List[str] = [] - for ft in self._file_types: - # Extensions - for ext, mapped_ft in _EXTENSION_MAP.items(): - if mapped_ft == ft: - self._accepted_extensions.add(ext) - # MIME prefixes - if ft in _MIME_PREFIXES: - self._accepted_mime_prefixes.extend(_MIME_PREFIXES[ft]) - # User agent for telemetry try: from ..__about__ import __version__ @@ -425,17 +476,7 @@ def accepts( **kwargs: Any, ) -> bool: """Return True if the file type is in the configured set.""" - mimetype = (stream_info.mimetype or "").lower() - extension = (stream_info.extension or "").lower() - - if extension in self._accepted_extensions: - return True - - for prefix in self._accepted_mime_prefixes: - if mimetype.startswith(prefix): - return True - - return False + return _detect_file_type(stream_info, self._file_types) is not None def convert( self, @@ -446,14 +487,10 @@ def convert( """Convert the file using CU and return Markdown with YAML front matter.""" # 1. Determine analyzer_id (smart routing: check modality) - extension = (stream_info.extension or "").lower() - file_type = _EXTENSION_MAP.get(extension) - - if file_type is not None: - file_modality = _get_modality(file_type) - else: - # Fallback: try MIME type - file_modality = "document" + file_type = _detect_file_type(stream_info, self._file_types) + if file_type is None: + raise ValueError("Unsupported file type for Content Understanding conversion.") + file_modality = _get_modality(file_type) if ( self._analyzer_id is not None @@ -466,7 +503,7 @@ def convert( # 2. Read file bytes and determine MIME type file_bytes = file_stream.read() - content_type = stream_info.mimetype or "application/octet-stream" + content_type = _content_type_for(file_type, stream_info.mimetype) # 3. 
Call CU SDK poller = self._client.begin_analyze_binary( diff --git a/packages/markitdown/tests/test_cu_converter.py b/packages/markitdown/tests/test_cu_converter.py index 2a9fba773..b5e87ba34 100644 --- a/packages/markitdown/tests/test_cu_converter.py +++ b/packages/markitdown/tests/test_cu_converter.py @@ -5,7 +5,7 @@ """ import io -from unittest.mock import MagicMock, patch, PropertyMock +from unittest.mock import MagicMock, patch import pytest @@ -14,6 +14,9 @@ ContentUnderstandingFileType, _infer_prebuilt_modality, _get_modality, + _detect_file_type, + _canonical_mime_type, + _content_type_for, _EXTENSION_MAP, ) from markitdown._stream_info import StreamInfo @@ -29,24 +32,14 @@ def _make_converter(file_types=None, analyzer_id=None, analyzer_modality=None): conv._analyzer_id = analyzer_id conv._analyzer_modality = analyzer_modality - # Build accepted extensions/mime from file_types + # Set accepted file types without running SDK-dependent initialization. from markitdown.converters._cu_converter import ( _ALL_FILE_TYPES, - _MIME_PREFIXES, ) types = file_types if file_types is not None else _ALL_FILE_TYPES conv._file_types = types - conv._accepted_extensions = set() - conv._accepted_mime_prefixes = [] - for ft in types: - for ext, mapped_ft in _EXTENSION_MAP.items(): - if mapped_ft == ft: - conv._accepted_extensions.add(ext) - if ft in _MIME_PREFIXES: - conv._accepted_mime_prefixes.extend(_MIME_PREFIXES[ft]) - return conv @@ -68,7 +61,9 @@ def test_accepts_supported_extensions(self, ext): conv = _make_converter() assert conv.accepts(io.BytesIO(b""), StreamInfo(extension=ext)) - @pytest.mark.parametrize("ext", [".csv", ".json", ".zip", ".epub", ".py", ".rs"]) + @pytest.mark.parametrize("ext", [ + ".csv", ".json", ".zip", ".epub", ".py", ".rs", + ]) def test_rejects_unsupported_extensions(self, ext): conv = _make_converter() assert not conv.accepts(io.BytesIO(b""), StreamInfo(extension=ext)) @@ -86,9 +81,18 @@ class TestAcceptsMime: "image/jpeg", "video/mp4", 
"audio/wav", + "audio/x-wav", "text/html", "audio/mpeg", + "audio/x-m4a", + "audio/x-flac", "video/quicktime", + "video/webm", + "video/x-m4v", + "video/x-flv", + "video/x-ms-wmv", + "audio/aac", + "audio/x-ms-wma", ]) def test_accepts_supported_mimetypes(self, mime): conv = _make_converter() @@ -127,6 +131,69 @@ def test_restricted_to_audio(self): assert conv.accepts(io.BytesIO(b""), StreamInfo(extension=".mp3")) assert not conv.accepts(io.BytesIO(b""), StreamInfo(extension=".pdf")) + def test_webm_value_matches_cli_input(self): + assert ContentUnderstandingFileType("webm") == ContentUnderstandingFileType.WEBM + + def test_m4v_value_matches_cli_input(self): + assert ContentUnderstandingFileType("m4v") == ContentUnderstandingFileType.M4V + + +# --------------------------------------------------------------------------- +# file type detection tests +# --------------------------------------------------------------------------- + +class TestDetectFileType: + """Test extension and MIME based file type detection.""" + + def test_detects_video_from_mime_without_extension(self): + assert ( + _detect_file_type(StreamInfo(mimetype="video/mp4")) + == ContentUnderstandingFileType.MP4 + ) + + def test_detects_audio_from_mime_without_extension(self): + assert ( + _detect_file_type(StreamInfo(mimetype="audio/mpeg")) + == ContentUnderstandingFileType.MP3 + ) + + def test_detects_audio_alias_from_mime_without_extension(self): + assert ( + _detect_file_type(StreamInfo(mimetype="audio/x-wav")) + == ContentUnderstandingFileType.WAV + ) + + def test_detects_video_alias_from_mime_without_extension(self): + assert ( + _detect_file_type(StreamInfo(mimetype="video/x-m4v")) + == ContentUnderstandingFileType.M4V + ) + + @pytest.mark.parametrize(("mimetype", "expected"), [ + ("audio/x-wav", "audio/wav"), + ("audio/x-flac", "audio/flac"), + ("audio/x-m4a", "audio/mp4"), + ("video/x-m4v", "video/mp4"), + ("video/mp4", "video/mp4"), + (None, "application/octet-stream"), + ]) + def 
test_canonical_mime_type(self, mimetype, expected): + assert _canonical_mime_type(mimetype) == expected + + @pytest.mark.parametrize(("file_type", "mimetype", "expected"), [ + (ContentUnderstandingFileType.PDF, None, "application/pdf"), + (ContentUnderstandingFileType.M4V, None, "video/mp4"), + (ContentUnderstandingFileType.FLAC, "audio/x-flac", "audio/flac"), + ]) + def test_content_type_for(self, file_type, mimetype, expected): + assert _content_type_for(file_type, mimetype) == expected + + def test_file_type_restriction_applies_to_mime(self): + assert _detect_file_type( + StreamInfo(mimetype="video/mp4"), + [ContentUnderstandingFileType.PDF], + ) is None + # --------------------------------------------------------------------------- # Smart routing tests @@ -150,7 +217,10 @@ def test_document_analyzer_routes_pdf_to_custom(self): conv._client.begin_analyze_binary.return_value = mock_poller with patch("markitdown.converters._cu_converter.to_llm_input", return_value=""): - conv.convert(io.BytesIO(b"fake pdf"), StreamInfo(extension=".pdf", mimetype="application/pdf")) + conv.convert( + io.BytesIO(b"fake pdf"), + StreamInfo(extension=".pdf", mimetype="application/pdf"), + ) # Should use the custom analyzer for PDF (document modality) call_args = conv._client.begin_analyze_binary.call_args @@ -171,7 +241,10 @@ def test_document_analyzer_routes_mp3_to_prebuilt(self): conv._client.begin_analyze_binary.return_value = mock_poller with patch("markitdown.converters._cu_converter.to_llm_input", return_value=""): - conv.convert(io.BytesIO(b"fake audio"), StreamInfo(extension=".mp3", mimetype="audio/mpeg")) + conv.convert( + io.BytesIO(b"fake audio"), + StreamInfo(extension=".mp3", mimetype="audio/mpeg"), + ) call_args = conv._client.begin_analyze_binary.call_args assert call_args.kwargs["analyzer_id"] == "prebuilt-audioSearch" @@ -191,7 +264,10 @@ def test_document_analyzer_routes_mp4_to_prebuilt(self): conv._client.begin_analyze_binary.return_value = mock_poller with 
patch("markitdown.converters._cu_converter.to_llm_input", return_value=""): - conv.convert(io.BytesIO(b"fake video"), StreamInfo(extension=".mp4", mimetype="video/mp4")) + conv.convert( + io.BytesIO(b"fake video"), + StreamInfo(extension=".mp4", mimetype="video/mp4"), + ) call_args = conv._client.begin_analyze_binary.call_args assert call_args.kwargs["analyzer_id"] == "prebuilt-videoSearch" @@ -208,10 +284,72 @@ def test_no_analyzer_id_uses_auto_routing(self): conv._client.begin_analyze_binary.return_value = mock_poller with patch("markitdown.converters._cu_converter.to_llm_input", return_value=""): - conv.convert(io.BytesIO(b"fake pdf"), StreamInfo(extension=".pdf", mimetype="application/pdf")) + conv.convert( + io.BytesIO(b"fake pdf"), + StreamInfo(extension=".pdf", mimetype="application/pdf"), + ) + + call_args = conv._client.begin_analyze_binary.call_args + assert call_args.kwargs["analyzer_id"] == "prebuilt-documentSearch" + + @pytest.mark.parametrize(("mimetype", "expected_analyzer"), [ + ("video/mp4", "prebuilt-videoSearch"), + ("video/x-m4v", "prebuilt-videoSearch"), + ("audio/mpeg", "prebuilt-audioSearch"), + ("audio/x-wav", "prebuilt-audioSearch"), + ]) + def test_mime_only_input_uses_auto_routing(self, mimetype, expected_analyzer): + """MIME-only streams should route to the matching modality analyzer.""" + conv = _make_converter(analyzer_id=None, analyzer_modality=None) + conv._client = MagicMock() + mock_result = MagicMock() + mock_result.contents = [] + mock_poller = MagicMock() + mock_poller.result.return_value = mock_result + + conv._client.begin_analyze_binary.return_value = mock_poller + + with patch("markitdown.converters._cu_converter.to_llm_input", return_value=""): + conv.convert(io.BytesIO(b"fake content"), StreamInfo(mimetype=mimetype)) + + call_args = conv._client.begin_analyze_binary.call_args + assert call_args.kwargs["analyzer_id"] == expected_analyzer + + def test_mime_alias_input_uses_canonical_content_type(self): + """Alias MIME types 
should be sent to CU as canonical content types.""" + conv = _make_converter(analyzer_id=None, analyzer_modality=None) + conv._client = MagicMock() + mock_result = MagicMock() + mock_result.contents = [] + mock_poller = MagicMock() + mock_poller.result.return_value = mock_result + + conv._client.begin_analyze_binary.return_value = mock_poller + + with patch("markitdown.converters._cu_converter.to_llm_input", return_value=""): + conv.convert(io.BytesIO(b"fake video"), StreamInfo(mimetype="video/x-m4v")) + + call_args = conv._client.begin_analyze_binary.call_args + assert call_args.kwargs["analyzer_id"] == "prebuilt-videoSearch" + assert call_args.kwargs["content_type"] == "video/mp4" + + def test_extension_only_input_uses_file_type_content_type(self): + """Extension-only inputs should send CU a matching content type.""" + conv = _make_converter(analyzer_id=None, analyzer_modality=None) + conv._client = MagicMock() + mock_result = MagicMock() + mock_result.contents = [] + mock_poller = MagicMock() + mock_poller.result.return_value = mock_result + + conv._client.begin_analyze_binary.return_value = mock_poller + + with patch("markitdown.converters._cu_converter.to_llm_input", return_value=""): + conv.convert(io.BytesIO(b"fake pdf"), StreamInfo(extension=".pdf")) call_args = conv._client.begin_analyze_binary.call_args assert call_args.kwargs["analyzer_id"] == "prebuilt-documentSearch" + assert call_args.kwargs["content_type"] == "application/pdf" # --------------------------------------------------------------------------- @@ -293,15 +431,21 @@ def _run_convert(self, extension, mimetype, expected_output="mock output"): return result def test_pdf_returns_markdown(self): - result = self._run_convert(".pdf", "application/pdf", "---\ncontentType: document\n---\n# Test") + result = self._run_convert( + ".pdf", "application/pdf", "---\ncontentType: document\n---\n# Test" + ) assert "contentType: document" in result.markdown def test_mp4_returns_markdown(self): - result = 
self._run_convert(".mp4", "video/mp4", "---\ncontentType: audioVisual\n---\nSpeaker 1: Hello") + result = self._run_convert( + ".mp4", "video/mp4", "---\ncontentType: audioVisual\n---\nSpeaker 1: Hello" + ) assert "contentType: audioVisual" in result.markdown def test_wav_returns_markdown(self): - result = self._run_convert(".wav", "audio/wav", "---\ncontentType: audioVisual\n---\nSpeaker 1: Hi") + result = self._run_convert( + ".wav", "audio/wav", "---\ncontentType: audioVisual\n---\nSpeaker 1: Hi" + ) assert "audioVisual" in result.markdown def test_empty_result(self): From 24ba4f2361fa3a24f59f82c672aa2a1a0907109d Mon Sep 17 00:00:00 2001 From: chienyuanchang Date: Wed, 6 May 2026 15:11:01 -0700 Subject: [PATCH 3/9] prebuilt-image custom analyzer route to image --- .../markitdown/converters/_cu_converter.py | 21 +++- .../markitdown/tests/test_cu_converter.py | 100 +++++++++++++++++- 2 files changed, 113 insertions(+), 8 deletions(-) diff --git a/packages/markitdown/src/markitdown/converters/_cu_converter.py b/packages/markitdown/src/markitdown/converters/_cu_converter.py index b2f6c80ba..b3a77494f 100644 --- a/packages/markitdown/src/markitdown/converters/_cu_converter.py +++ b/packages/markitdown/src/markitdown/converters/_cu_converter.py @@ -206,6 +206,9 @@ class ContentUnderstandingFileType(str, Enum): ContentUnderstandingFileType.XML, ContentUnderstandingFileType.EML, ContentUnderstandingFileType.MSG, +} + +_IMAGE_TYPES = { ContentUnderstandingFileType.JPEG, ContentUnderstandingFileType.PNG, ContentUnderstandingFileType.BMP, @@ -236,6 +239,7 @@ class ContentUnderstandingFileType(str, Enum): _DEFAULT_ANALYZERS = { "document": "prebuilt-documentSearch", + "image": "prebuilt-documentSearch", "video": "prebuilt-videoSearch", "audio": "prebuilt-audioSearch", } @@ -248,6 +252,8 @@ def _get_modality(file_type: ContentUnderstandingFileType) -> str: """Get the modality category for a file type.""" if file_type in _DOCUMENT_TYPES: return "document" + elif file_type in
_IMAGE_TYPES: + return "image" elif file_type in _VIDEO_TYPES: return "video" elif file_type in _AUDIO_TYPES: @@ -317,7 +323,7 @@ def _detect_file_type_from_mime( _BASE_TO_MODALITY: Dict[str, str] = { "prebuilt-document": "document", - "prebuilt-image": "document", # CU images return kind="document" + "prebuilt-image": "image", "prebuilt-audio": "audio", "prebuilt-video": "video", } @@ -359,8 +365,8 @@ def _detect_file_type_from_mime( "prebuilt-mortgage.us.1003": "document", "prebuilt-mortgage.us.closingDisclosure": "document", # Image-based prebuilts - "prebuilt-image": "document", # images are document modality in CU - "prebuilt-imageSearch": "document", + "prebuilt-image": "image", + "prebuilt-imageSearch": "image", # Audio-based prebuilts "prebuilt-audio": "audio", "prebuilt-audioSearch": "audio", @@ -380,6 +386,13 @@ def _infer_prebuilt_modality(analyzer_id: str) -> str: return "document" +def _is_analyzer_compatible(file_modality: str, analyzer_modality: str) -> bool: + """Return True when an analyzer modality can process a file modality.""" + if analyzer_modality == "document": + return file_modality in {"document", "image"} + return file_modality == analyzer_modality + + # --------------------------------------------------------------------------- # Converter # --------------------------------------------------------------------------- @@ -495,7 +508,7 @@ def convert( if ( self._analyzer_id is not None and self._analyzer_modality is not None - and file_modality == self._analyzer_modality + and _is_analyzer_compatible(file_modality, self._analyzer_modality) ): analyzer_id = self._analyzer_id else: diff --git a/packages/markitdown/tests/test_cu_converter.py b/packages/markitdown/tests/test_cu_converter.py index b5e87ba34..7cd556461 100644 --- a/packages/markitdown/tests/test_cu_converter.py +++ b/packages/markitdown/tests/test_cu_converter.py @@ -292,6 +292,95 @@ def test_no_analyzer_id_uses_auto_routing(self): call_args = 
conv._client.begin_analyze_binary.call_args assert call_args.kwargs["analyzer_id"] == "prebuilt-documentSearch" + def test_no_analyzer_id_routes_image_to_document_search(self): + """Default image routing should still use prebuilt-documentSearch.""" + conv = _make_converter(analyzer_id=None, analyzer_modality=None) + conv._client = MagicMock() + mock_result = MagicMock() + mock_result.contents = [] + mock_poller = MagicMock() + mock_poller.result.return_value = mock_result + + conv._client.begin_analyze_binary.return_value = mock_poller + + with patch("markitdown.converters._cu_converter.to_llm_input", return_value=""): + conv.convert( + io.BytesIO(b"fake image"), + StreamInfo(extension=".jpg", mimetype="image/jpeg"), + ) + + call_args = conv._client.begin_analyze_binary.call_args + assert call_args.kwargs["analyzer_id"] == "prebuilt-documentSearch" + + def test_document_analyzer_routes_image_to_custom(self): + """Document-based analyzers should still handle image documents.""" + conv = _make_converter( + analyzer_id="my-doc-analyzer", + analyzer_modality="document", + ) + conv._client = MagicMock() + mock_result = MagicMock() + mock_result.contents = [] + mock_poller = MagicMock() + mock_poller.result.return_value = mock_result + + conv._client.begin_analyze_binary.return_value = mock_poller + + with patch("markitdown.converters._cu_converter.to_llm_input", return_value=""): + conv.convert( + io.BytesIO(b"fake image"), + StreamInfo(extension=".jpg", mimetype="image/jpeg"), + ) + + call_args = conv._client.begin_analyze_binary.call_args + assert call_args.kwargs["analyzer_id"] == "my-doc-analyzer" + + def test_image_analyzer_routes_jpeg_to_custom(self): + """Image-based analyzers should be used for image files.""" + conv = _make_converter( + analyzer_id="my-image-analyzer", + analyzer_modality="image", + ) + conv._client = MagicMock() + mock_result = MagicMock() + mock_result.contents = [] + mock_poller = MagicMock() + mock_poller.result.return_value = mock_result + 
+ conv._client.begin_analyze_binary.return_value = mock_poller + + with patch("markitdown.converters._cu_converter.to_llm_input", return_value=""): + conv.convert( + io.BytesIO(b"fake image"), + StreamInfo(extension=".jpg", mimetype="image/jpeg"), + ) + + call_args = conv._client.begin_analyze_binary.call_args + assert call_args.kwargs["analyzer_id"] == "my-image-analyzer" + + def test_image_analyzer_routes_pdf_to_document_prebuilt(self): + """Image-based analyzers should not claim non-image document files.""" + conv = _make_converter( + analyzer_id="my-image-analyzer", + analyzer_modality="image", + ) + conv._client = MagicMock() + mock_result = MagicMock() + mock_result.contents = [] + mock_poller = MagicMock() + mock_poller.result.return_value = mock_result + + conv._client.begin_analyze_binary.return_value = mock_poller + + with patch("markitdown.converters._cu_converter.to_llm_input", return_value=""): + conv.convert( + io.BytesIO(b"fake pdf"), + StreamInfo(extension=".pdf", mimetype="application/pdf"), + ) + + call_args = conv._client.begin_analyze_binary.call_args + assert call_args.kwargs["analyzer_id"] == "prebuilt-documentSearch" + @pytest.mark.parametrize(("mimetype", "expected_analyzer"), [ ("video/mp4", "prebuilt-videoSearch"), ("video/x-m4v", "prebuilt-videoSearch"), @@ -374,9 +463,9 @@ def test_video_prebuilts(self): assert _infer_prebuilt_modality("prebuilt-videoSearch") == "video" assert _infer_prebuilt_modality("prebuilt-videoSynopsis") == "video" - def test_image_prebuilts_map_to_document(self): - assert _infer_prebuilt_modality("prebuilt-imageSearch") == "document" - assert _infer_prebuilt_modality("prebuilt-image") == "document" + def test_image_prebuilts_map_to_image(self): + assert _infer_prebuilt_modality("prebuilt-imageSearch") == "image" + assert _infer_prebuilt_modality("prebuilt-image") == "image" def test_unknown_prebuilt_defaults_to_document(self): assert _infer_prebuilt_modality("prebuilt-unknownNewAnalyzer") == "document" @@ -392,7 
+481,10 @@ class TestGetModality: def test_document_types(self): assert _get_modality(ContentUnderstandingFileType.PDF) == "document" assert _get_modality(ContentUnderstandingFileType.DOCX) == "document" - assert _get_modality(ContentUnderstandingFileType.JPEG) == "document" + + def test_image_types(self): + assert _get_modality(ContentUnderstandingFileType.JPEG) == "image" + assert _get_modality(ContentUnderstandingFileType.PNG) == "image" def test_video_types(self): assert _get_modality(ContentUnderstandingFileType.MP4) == "video" From d91d5ddb16062ef41881cb91e001d97083361304 Mon Sep 17 00:00:00 2001 From: chienyuanchang Date: Wed, 6 May 2026 16:07:06 -0700 Subject: [PATCH 4/9] enhance cu priority over di --- packages/markitdown/pyproject.toml | 1 + .../markitdown/src/markitdown/__main__.py | 19 +++--- .../markitdown/converters/_cu_converter.py | 7 +- .../markitdown/tests/test_cu_converter.py | 65 +++++++++++++++++++ 4 files changed, 82 insertions(+), 10 deletions(-) diff --git a/packages/markitdown/pyproject.toml b/packages/markitdown/pyproject.toml index 8366c0754..d4c20a402 100644 --- a/packages/markitdown/pyproject.toml +++ b/packages/markitdown/pyproject.toml @@ -59,6 +59,7 @@ outlook = ["olefile"] audio-transcription = ["pydub", "SpeechRecognition"] youtube-transcription = ["youtube-transcript-api"] az-doc-intel = ["azure-ai-documentintelligence", "azure-identity"] +# >=1.2.0b1 required for to_llm_input() helper used by ContentUnderstandingConverter az-content-understanding = ["azure-ai-contentunderstanding>=1.2.0b1", "azure-identity"] [project.urls] diff --git a/packages/markitdown/src/markitdown/__main__.py b/packages/markitdown/src/markitdown/__main__.py index d57b2ae65..ac7d2f602 100644 --- a/packages/markitdown/src/markitdown/__main__.py +++ b/packages/markitdown/src/markitdown/__main__.py @@ -78,21 +78,15 @@ def main(): help="Provide a hint about the file's charset (e.g, UTF-8).", ) - parser.add_argument( + cloud_group = 
parser.add_mutually_exclusive_group() + cloud_group.add_argument( "-d", "--use-docintel", action="store_true", help="Use Document Intelligence to extract text instead of offline conversion. Requires a valid Document Intelligence Endpoint.", ) - parser.add_argument( - "-e", - "--endpoint", - type=str, - help="Document Intelligence Endpoint. Required if using Document Intelligence.", - ) - - parser.add_argument( + cloud_group.add_argument( "--use-cu", "--use-content-understanding", action="store_true", @@ -100,6 +94,13 @@ def main(): help="Use Azure Content Understanding to extract text. Requires --cu-endpoint.", ) + parser.add_argument( + "-e", + "--endpoint", + type=str, + help="Document Intelligence Endpoint. Required if using Document Intelligence.", + ) + parser.add_argument( "--cu-endpoint", type=str, diff --git a/packages/markitdown/src/markitdown/converters/_cu_converter.py b/packages/markitdown/src/markitdown/converters/_cu_converter.py index b3a77494f..de597f946 100644 --- a/packages/markitdown/src/markitdown/converters/_cu_converter.py +++ b/packages/markitdown/src/markitdown/converters/_cu_converter.py @@ -474,7 +474,12 @@ def __init__( self._analyzer_modality = _infer_prebuilt_modality(self._analyzer_id) else: # Custom analyzer — one get_analyzer() call, cached - analyzer_info = self._client.get_analyzer(self._analyzer_id) + try: + analyzer_info = self._client.get_analyzer(self._analyzer_id) + except Exception as exc: + raise ValueError( + f"Failed to resolve analyzer '{self._analyzer_id}': {exc}" + ) from exc if analyzer_info.base_analyzer_id: self._analyzer_modality = _BASE_TO_MODALITY.get( analyzer_info.base_analyzer_id, "document" diff --git a/packages/markitdown/tests/test_cu_converter.py b/packages/markitdown/tests/test_cu_converter.py index 7cd556461..3eb88bbb4 100644 --- a/packages/markitdown/tests/test_cu_converter.py +++ b/packages/markitdown/tests/test_cu_converter.py @@ -545,6 +545,71 @@ def test_empty_result(self): assert result.markdown == 
"" +# --------------------------------------------------------------------------- +# Init-time get_analyzer() error wrapping +# --------------------------------------------------------------------------- + +class TestGetAnalyzerError: + """Test that get_analyzer() failures at init produce a clear error.""" + + def test_nonexistent_analyzer_raises_value_error(self): + """A failed get_analyzer() should raise ValueError with analyzer name.""" + with patch( + "markitdown.converters._cu_converter._dependency_exc_info", None + ), patch( + "markitdown.converters._cu_converter.ContentUnderstandingClient" + ) as MockClient, patch( + "markitdown.converters._cu_converter.DefaultAzureCredential" + ): + mock_client = MagicMock() + mock_client.get_analyzer.side_effect = Exception("not found") + MockClient.return_value = mock_client + + with pytest.raises(ValueError, match="Failed to resolve analyzer 'bad-id'"): + ContentUnderstandingConverter(endpoint="https://fake", analyzer_id="bad-id") + + +# --------------------------------------------------------------------------- +# Registration priority test +# --------------------------------------------------------------------------- + +class TestRegistrationPriority: + """Test that CU converter is registered with higher priority than Doc Intel.""" + + def test_cu_registered_before_docintel(self): + """When both endpoints are provided, CU should appear before Doc Intel.""" + with patch( + "markitdown.converters._cu_converter._dependency_exc_info", None + ), patch( + "markitdown.converters._cu_converter.ContentUnderstandingClient" + ), patch( + "markitdown.converters._cu_converter.DefaultAzureCredential" + ), patch( + "markitdown.converters._doc_intel_converter._dependency_exc_info", None + ), patch( + "markitdown.converters._doc_intel_converter.DocumentIntelligenceClient" + ), patch( + "markitdown.converters._doc_intel_converter.DefaultAzureCredential" + ): + from markitdown import MarkItDown + from markitdown.converters import ( + 
ContentUnderstandingConverter, + DocumentIntelligenceConverter, + ) + + md = MarkItDown( + cu_endpoint="https://fake-cu", + docintel_endpoint="https://fake-di", + ) + + converter_types = [ + type(reg.converter) for reg in md._converters + ] + cu_idx = converter_types.index(ContentUnderstandingConverter) + di_idx = converter_types.index(DocumentIntelligenceConverter) + assert cu_idx < di_idx, "CU should have higher priority (lower index) than Doc Intel" + + # --------------------------------------------------------------------------- # MissingDependencyException test # --------------------------------------------------------------------------- From f5e700838e358f46a21002130fe6882535e02872 Mon Sep 17 00:00:00 2001 From: chienyuanchang Date: Wed, 6 May 2026 16:33:46 -0700 Subject: [PATCH 5/9] fix: apply black formatting --- .../markitdown/src/markitdown/__main__.py | 4 +- .../markitdown/converters/_cu_converter.py | 8 +- .../markitdown/tests/test_cu_converter.py | 204 ++++++++++++------ 3 files changed, 148 insertions(+), 68 deletions(-) diff --git a/packages/markitdown/src/markitdown/__main__.py b/packages/markitdown/src/markitdown/__main__.py index ac7d2f602..ccb44b64b 100644 --- a/packages/markitdown/src/markitdown/__main__.py +++ b/packages/markitdown/src/markitdown/__main__.py @@ -228,7 +228,9 @@ def main(): # Parse comma-separated file types into ContentUnderstandingFileType list from .converters import ContentUnderstandingFileType - type_names = [t.strip().lower() for t in args.cu_file_types.split(",") if t.strip()] + type_names = [ + t.strip().lower() for t in args.cu_file_types.split(",") if t.strip() + ] cu_types = [] for name in type_names: # Try matching by value (e.g., "pdf", "jpeg", "mp4") diff --git a/packages/markitdown/src/markitdown/converters/_cu_converter.py b/packages/markitdown/src/markitdown/converters/_cu_converter.py index de597f946..3b1380c14 100644 --- a/packages/markitdown/src/markitdown/converters/_cu_converter.py +++ 
b/packages/markitdown/src/markitdown/converters/_cu_converter.py @@ -507,7 +507,9 @@ def convert( # 1. Determine analyzer_id (smart routing: check modality) file_type = _detect_file_type(stream_info, self._file_types) if file_type is None: - raise ValueError("Unsupported file type for Content Understanding conversion.") + raise ValueError( + "Unsupported file type for Content Understanding conversion." + ) file_modality = _get_modality(file_type) if ( @@ -517,7 +519,9 @@ def convert( ): analyzer_id = self._analyzer_id else: - analyzer_id = _DEFAULT_ANALYZERS.get(file_modality, "prebuilt-documentSearch") + analyzer_id = _DEFAULT_ANALYZERS.get( + file_modality, "prebuilt-documentSearch" + ) # 2. Read file bytes and determine MIME type file_bytes = file_stream.read() diff --git a/packages/markitdown/tests/test_cu_converter.py b/packages/markitdown/tests/test_cu_converter.py index 3eb88bbb4..70c51a5d1 100644 --- a/packages/markitdown/tests/test_cu_converter.py +++ b/packages/markitdown/tests/test_cu_converter.py @@ -26,6 +26,7 @@ # Helper: create a converter with accepts() working but no SDK init # --------------------------------------------------------------------------- + def _make_converter(file_types=None, analyzer_id=None, analyzer_modality=None): """Create a converter bypassing __init__ (no SDK deps needed).""" conv = ContentUnderstandingConverter.__new__(ContentUnderstandingConverter) @@ -47,23 +48,64 @@ def _make_converter(file_types=None, analyzer_id=None, analyzer_modality=None): # accepts() tests — extension-based # --------------------------------------------------------------------------- + class TestAcceptsExtension: """Test accepts() for supported and unsupported file extensions.""" - @pytest.mark.parametrize("ext", [ - ".pdf", ".docx", ".pptx", ".xlsx", ".html", ".txt", ".md", ".rtf", ".xml", - ".eml", ".msg", - ".jpg", ".jpeg", ".jpe", ".png", ".bmp", ".tiff", ".heif", ".heic", - ".mp4", ".m4v", ".mov", ".avi", ".mkv", ".webm", ".flv", ".wmv", - 
".wav", ".mp3", ".m4a", ".flac", ".ogg", ".aac", ".wma", - ]) + @pytest.mark.parametrize( + "ext", + [ + ".pdf", + ".docx", + ".pptx", + ".xlsx", + ".html", + ".txt", + ".md", + ".rtf", + ".xml", + ".eml", + ".msg", + ".jpg", + ".jpeg", + ".jpe", + ".png", + ".bmp", + ".tiff", + ".heif", + ".heic", + ".mp4", + ".m4v", + ".mov", + ".avi", + ".mkv", + ".webm", + ".flv", + ".wmv", + ".wav", + ".mp3", + ".m4a", + ".flac", + ".ogg", + ".aac", + ".wma", + ], + ) def test_accepts_supported_extensions(self, ext): conv = _make_converter() assert conv.accepts(io.BytesIO(b""), StreamInfo(extension=ext)) - @pytest.mark.parametrize("ext", [ - ".csv", ".json", ".zip", ".epub", ".py", ".rs", - ]) + @pytest.mark.parametrize( + "ext", + [ + ".csv", + ".json", + ".zip", + ".epub", + ".py", + ".rs", + ], + ) def test_rejects_unsupported_extensions(self, ext): conv = _make_converter() assert not conv.accepts(io.BytesIO(b""), StreamInfo(extension=ext)) @@ -73,36 +115,43 @@ def test_rejects_unsupported_extensions(self, ext): # accepts() tests — MIME-based # --------------------------------------------------------------------------- + class TestAcceptsMime: """Test accepts() for MIME type matching.""" - @pytest.mark.parametrize("mime", [ - "application/pdf", - "image/jpeg", - "video/mp4", - "audio/wav", - "audio/x-wav", - "text/html", - "audio/mpeg", - "audio/x-m4a", - "audio/x-flac", - "video/quicktime", - "video/webm", - "video/x-m4v", - "video/x-flv", - "video/x-ms-wmv", - "audio/aac", - "audio/x-ms-wma", - ]) + @pytest.mark.parametrize( + "mime", + [ + "application/pdf", + "image/jpeg", + "video/mp4", + "audio/wav", + "audio/x-wav", + "text/html", + "audio/mpeg", + "audio/x-m4a", + "audio/x-flac", + "video/quicktime", + "video/webm", + "video/x-m4v", + "video/x-flv", + "video/x-ms-wmv", + "audio/aac", + "audio/x-ms-wma", + ], + ) def test_accepts_supported_mimetypes(self, mime): conv = _make_converter() assert conv.accepts(io.BytesIO(b""), StreamInfo(mimetype=mime)) - 
@pytest.mark.parametrize("mime", [ - "text/csv", - "application/json", - "application/zip", - ]) + @pytest.mark.parametrize( + "mime", + [ + "text/csv", + "application/json", + "application/zip", + ], + ) def test_rejects_unsupported_mimetypes(self, mime): conv = _make_converter() assert not conv.accepts(io.BytesIO(b""), StreamInfo(mimetype=mime)) @@ -112,6 +161,7 @@ def test_rejects_unsupported_mimetypes(self, mime): # accepts() tests — cu_file_types restriction # --------------------------------------------------------------------------- + class TestAcceptsFileTypeRestriction: """Test that cu_file_types restricts which formats are accepted.""" @@ -123,10 +173,12 @@ def test_restricted_to_pdf_only(self): assert not conv.accepts(io.BytesIO(b""), StreamInfo(extension=".jpg")) def test_restricted_to_audio(self): - conv = _make_converter(file_types=[ - ContentUnderstandingFileType.WAV, - ContentUnderstandingFileType.MP3, - ]) + conv = _make_converter( + file_types=[ + ContentUnderstandingFileType.WAV, + ContentUnderstandingFileType.MP3, + ] + ) assert conv.accepts(io.BytesIO(b""), StreamInfo(extension=".wav")) assert conv.accepts(io.BytesIO(b""), StreamInfo(extension=".mp3")) assert not conv.accepts(io.BytesIO(b""), StreamInfo(extension=".pdf")) @@ -142,6 +194,7 @@ def test_m4v_value_matches_cli_input(self): # file type detection tests # --------------------------------------------------------------------------- + class TestDetectFileType: """Test extension and MIME based file type detection.""" @@ -169,36 +222,46 @@ def test_detects_video_alias_from_mime_without_extension(self): == ContentUnderstandingFileType.M4V ) - @pytest.mark.parametrize(("mimetype", "expected"), [ - ("audio/x-wav", "audio/wav"), - ("audio/x-flac", "audio/flac"), - ("audio/x-m4a", "audio/mp4"), - ("video/x-m4v", "video/mp4"), - ("video/mp4", "video/mp4"), - (None, "application/octet-stream"), - ]) + @pytest.mark.parametrize( + ("mimetype", "expected"), + [ + ("audio/x-wav", "audio/wav"), + 
("audio/x-flac", "audio/flac"), + ("audio/x-m4a", "audio/mp4"), + ("video/x-m4v", "video/mp4"), + ("video/mp4", "video/mp4"), + (None, "application/octet-stream"), + ], + ) def test_canonical_mime_type(self, mimetype, expected): assert _canonical_mime_type(mimetype) == expected - @pytest.mark.parametrize(("file_type", "mimetype", "expected"), [ - (ContentUnderstandingFileType.PDF, None, "application/pdf"), - (ContentUnderstandingFileType.M4V, None, "video/mp4"), - (ContentUnderstandingFileType.FLAC, "audio/x-flac", "audio/flac"), - ]) + @pytest.mark.parametrize( + ("file_type", "mimetype", "expected"), + [ + (ContentUnderstandingFileType.PDF, None, "application/pdf"), + (ContentUnderstandingFileType.M4V, None, "video/mp4"), + (ContentUnderstandingFileType.FLAC, "audio/x-flac", "audio/flac"), + ], + ) def test_content_type_for(self, file_type, mimetype, expected): assert _content_type_for(file_type, mimetype) == expected def test_file_type_restriction_applies_to_mime(self): - assert _detect_file_type( - StreamInfo(mimetype="video/mp4"), - [ContentUnderstandingFileType.PDF], - ) is None + assert ( + _detect_file_type( + StreamInfo(mimetype="video/mp4"), + [ContentUnderstandingFileType.PDF], + ) + is None + ) # --------------------------------------------------------------------------- # Smart routing tests # --------------------------------------------------------------------------- + class TestSmartRouting: """Test modality-aware analyzer routing.""" @@ -381,12 +444,15 @@ def test_image_analyzer_routes_pdf_to_document_prebuilt(self): call_args = conv._client.begin_analyze_binary.call_args assert call_args.kwargs["analyzer_id"] == "prebuilt-documentSearch" - @pytest.mark.parametrize(("mimetype", "expected_analyzer"), [ - ("video/mp4", "prebuilt-videoSearch"), - ("video/x-m4v", "prebuilt-videoSearch"), - ("audio/mpeg", "prebuilt-audioSearch"), - ("audio/x-wav", "prebuilt-audioSearch"), - ]) + @pytest.mark.parametrize( + ("mimetype", "expected_analyzer"), + [ + 
("video/mp4", "prebuilt-videoSearch"), + ("video/x-m4v", "prebuilt-videoSearch"), + ("audio/mpeg", "prebuilt-audioSearch"), + ("audio/x-wav", "prebuilt-audioSearch"), + ], + ) def test_mime_only_input_uses_auto_routing(self, mimetype, expected_analyzer): """MIME-only streams should route to the matching modality analyzer.""" conv = _make_converter(analyzer_id=None, analyzer_modality=None) @@ -445,6 +511,7 @@ def test_extension_only_input_uses_file_type_content_type(self): # _infer_prebuilt_modality tests # --------------------------------------------------------------------------- + class TestInferPrebuiltModality: """Test modality inference from prebuilt analyzer names.""" @@ -475,6 +542,7 @@ def test_unknown_prebuilt_defaults_to_document(self): # _get_modality tests # --------------------------------------------------------------------------- + class TestGetModality: """Test file type → modality mapping.""" @@ -499,6 +567,7 @@ def test_audio_types(self): # convert() mock tests # --------------------------------------------------------------------------- + class TestConvertMock: """Test convert() with mocked CU SDK.""" @@ -549,6 +618,7 @@ def test_empty_result(self): # Init-time get_analyzer() error wrapping # --------------------------------------------------------------------------- + class TestGetAnalyzerError: """Test that get_analyzer() failures at init produce a clear error.""" @@ -566,13 +636,16 @@ def test_nonexistent_analyzer_raises_value_error(self): MockClient.return_value = mock_client with pytest.raises(ValueError, match="Failed to resolve analyzer 'bad-id'"): - ContentUnderstandingConverter(endpoint="https://fake", analyzer_id="bad-id") + ContentUnderstandingConverter( + endpoint="https://fake", analyzer_id="bad-id" + ) # --------------------------------------------------------------------------- # Registration priority test # --------------------------------------------------------------------------- + class TestRegistrationPriority: """Test that CU 
converter is registered with higher priority than Doc Intel.""" @@ -602,18 +675,19 @@ def test_cu_registered_before_docintel(self): docintel_endpoint="https://fake-di", ) - converter_types = [ - type(reg.converter) for reg in md._converters - ] + converter_types = [type(reg.converter) for reg in md._converters] cu_idx = converter_types.index(ContentUnderstandingConverter) di_idx = converter_types.index(DocumentIntelligenceConverter) - assert cu_idx < di_idx, "CU should have higher priority (lower index) than Doc Intel" + assert ( + cu_idx < di_idx + ), "CU should have higher priority (lower index) than Doc Intel" # --------------------------------------------------------------------------- # MissingDependencyException test # --------------------------------------------------------------------------- + class TestMissingDependency: """Test that MissingDependencyException is raised when CU SDK is not installed.""" From e4b585a83268d9897ffdd4a53f75c4431c261ec0 Mon Sep 17 00:00:00 2001 From: chienyuanchang Date: Thu, 7 May 2026 13:28:21 -0700 Subject: [PATCH 6/9] update cache of known prebuilt name and README improvement --- README.md | 9 + .../markitdown/converters/_cu_converter.py | 69 ++++--- .../markitdown/tests/test_cu_converter.py | 168 +++++++++++++++--- 3 files changed, 194 insertions(+), 52 deletions(-) diff --git a/README.md b/README.md index 005d7bdc0..dd9edffdc 100644 --- a/README.md +++ b/README.md @@ -175,6 +175,15 @@ More information about how to set up an Azure Document Intelligence Resource can Install: `pip install 'markitdown[az-content-understanding]'` +#### When to use Content Understanding + +Content Understanding is ideal when you need capabilities beyond what built-in or Document Intelligence converters provide: + +- **Audio and video files** — CU is the only option for converting MP4, MOV, WAV, MP3, and other media files. Built-in converters have no video support and only basic audio transcription. 
+- **Structured field extraction** — Custom analyzers extract domain-specific fields (invoice amounts, receipt dates, contract clauses) serialized as YAML front matter. Neither built-in nor Doc Intel integration exposes fields. +- **Higher-quality document extraction** — Cloud-based layout analysis and OCR for scanned PDFs, complex tables, and multi-page documents. +- **Single API for all modalities** — One `cu_endpoint` handles documents, images, audio, and video with automatic analyzer routing. + | Capability | Built-in converters | Azure Document Intelligence | Azure Content Understanding | |------------|---------------------|-----------------------------|-----------------------------| | Document conversion | Offline, format-specific extraction | Cloud layout extraction | Cloud multimodal extraction | diff --git a/packages/markitdown/src/markitdown/converters/_cu_converter.py b/packages/markitdown/src/markitdown/converters/_cu_converter.py index 3b1380c14..cfe8fd979 100644 --- a/packages/markitdown/src/markitdown/converters/_cu_converter.py +++ b/packages/markitdown/src/markitdown/converters/_cu_converter.py @@ -237,7 +237,7 @@ class ContentUnderstandingFileType(str, Enum): ContentUnderstandingFileType.WMA, } -_DEFAULT_ANALYZERS = { +_PREBUILT_ANALYZERS = { "document": "prebuilt-documentSearch", "image": "prebuilt-documentSearch", "video": "prebuilt-videoSearch", @@ -328,8 +328,8 @@ def _detect_file_type_from_mime( "prebuilt-video": "video", } -# For prebuilt analyzers, infer modality from name without an API call -_PREBUILT_MODALITY: Dict[str, str] = { +# Cache of known prebuilt analyzer name → modality (avoids API call) +_KNOWN_PREBUILT_MODALITY: Dict[str, str] = { # Document-based prebuilts "prebuilt-documentSearch": "document", "prebuilt-layout": "document", @@ -378,11 +378,40 @@ def _detect_file_type_from_mime( } -def _infer_prebuilt_modality(analyzer_id: str) -> str: - """Infer modality from a prebuilt analyzer ID without an API call.""" - if analyzer_id 
in _PREBUILT_MODALITY: - return _PREBUILT_MODALITY[analyzer_id] - # Unknown prebuilt — most prebuilts are document-based +def _resolve_analyzer_modality(client: Any, analyzer_id: str) -> str: + """Resolve analyzer modality from cache or via get_analyzer() fallback. + + For known prebuilt-* names, returns the modality from + ``_KNOWN_PREBUILT_MODALITY`` without an API call. For unknown + prebuilt-* names or custom analyzers, calls ``get_analyzer()`` + to inspect ``base_analyzer_id``. + + Args: + client: A ``ContentUnderstandingClient`` instance. + analyzer_id: The analyzer ID to resolve. + + Returns: + Modality string ("document", "image", "audio", or "video"). + + Raises: + ValueError: If ``get_analyzer()`` fails. + """ + # Known prebuilt — use cache, no API call + if analyzer_id in _KNOWN_PREBUILT_MODALITY: + return _KNOWN_PREBUILT_MODALITY[analyzer_id] + + # Unknown prebuilt or custom analyzer — call get_analyzer() + try: + analyzer_info = client.get_analyzer(analyzer_id) + except Exception as exc: + raise ValueError( + f"Failed to resolve analyzer '{analyzer_id}': {exc}" + ) from exc + + if analyzer_info.base_analyzer_id: + return _BASE_TO_MODALITY.get( + analyzer_info.base_analyzer_id, "document" + ) return "document" @@ -467,25 +496,11 @@ def __init__( user_agent_policy=UserAgentPolicy(user_agent=user_agent), ) - # Smart routing: resolve analyzer modality at init + # Smart routing: resolve analyzer modality at init (at most one API call) if self._analyzer_id is not None: - if self._analyzer_id.startswith("prebuilt-"): - # Infer from name — no API call - self._analyzer_modality = _infer_prebuilt_modality(self._analyzer_id) - else: - # Custom analyzer — one get_analyzer() call, cached - try: - analyzer_info = self._client.get_analyzer(self._analyzer_id) - except Exception as exc: - raise ValueError( - f"Failed to resolve analyzer '{self._analyzer_id}': {exc}" - ) from exc - if analyzer_info.base_analyzer_id: - self._analyzer_modality = _BASE_TO_MODALITY.get( - 
analyzer_info.base_analyzer_id, "document" - ) - else: - self._analyzer_modality = "document" + self._analyzer_modality = _resolve_analyzer_modality( + self._client, self._analyzer_id + ) def accepts( self, @@ -519,7 +534,7 @@ def convert( ): analyzer_id = self._analyzer_id else: - analyzer_id = _DEFAULT_ANALYZERS.get( + analyzer_id = _PREBUILT_ANALYZERS.get( file_modality, "prebuilt-documentSearch" ) diff --git a/packages/markitdown/tests/test_cu_converter.py b/packages/markitdown/tests/test_cu_converter.py index 70c51a5d1..2f4c5e8d4 100644 --- a/packages/markitdown/tests/test_cu_converter.py +++ b/packages/markitdown/tests/test_cu_converter.py @@ -5,6 +5,7 @@ """ import io +import sys from unittest.mock import MagicMock, patch import pytest @@ -12,7 +13,7 @@ from markitdown.converters._cu_converter import ( ContentUnderstandingConverter, ContentUnderstandingFileType, - _infer_prebuilt_modality, + _resolve_analyzer_modality, _get_modality, _detect_file_type, _canonical_mime_type, @@ -512,30 +513,76 @@ def test_extension_only_input_uses_file_type_content_type(self): # --------------------------------------------------------------------------- -class TestInferPrebuiltModality: - """Test modality inference from prebuilt analyzer names.""" - - def test_document_prebuilts(self): - assert _infer_prebuilt_modality("prebuilt-documentSearch") == "document" - assert _infer_prebuilt_modality("prebuilt-invoice") == "document" - assert _infer_prebuilt_modality("prebuilt-layout") == "document" - assert _infer_prebuilt_modality("prebuilt-receipt") == "document" - assert _infer_prebuilt_modality("prebuilt-tax.us.w2") == "document" - - def test_audio_prebuilts(self): - assert _infer_prebuilt_modality("prebuilt-audioSearch") == "audio" - assert _infer_prebuilt_modality("prebuilt-callCenter") == "audio" - - def test_video_prebuilts(self): - assert _infer_prebuilt_modality("prebuilt-videoSearch") == "video" - assert _infer_prebuilt_modality("prebuilt-videoSynopsis") == "video" - - 
def test_image_prebuilts_map_to_image(self): - assert _infer_prebuilt_modality("prebuilt-imageSearch") == "image" - assert _infer_prebuilt_modality("prebuilt-image") == "image" - - def test_unknown_prebuilt_defaults_to_document(self): - assert _infer_prebuilt_modality("prebuilt-unknownNewAnalyzer") == "document" +class TestResolveAnalyzerModality: + """Test modality resolution from analyzer IDs.""" + + def test_known_document_prebuilts(self): + client = MagicMock() + assert _resolve_analyzer_modality(client, "prebuilt-documentSearch") == "document" + assert _resolve_analyzer_modality(client, "prebuilt-invoice") == "document" + assert _resolve_analyzer_modality(client, "prebuilt-layout") == "document" + assert _resolve_analyzer_modality(client, "prebuilt-receipt") == "document" + assert _resolve_analyzer_modality(client, "prebuilt-tax.us.w2") == "document" + # Known prebuilts should never call get_analyzer() + client.get_analyzer.assert_not_called() + + def test_known_audio_prebuilts(self): + client = MagicMock() + assert _resolve_analyzer_modality(client, "prebuilt-audioSearch") == "audio" + assert _resolve_analyzer_modality(client, "prebuilt-callCenter") == "audio" + client.get_analyzer.assert_not_called() + + def test_known_video_prebuilts(self): + client = MagicMock() + assert _resolve_analyzer_modality(client, "prebuilt-videoSearch") == "video" + assert _resolve_analyzer_modality(client, "prebuilt-videoSynopsis") == "video" + client.get_analyzer.assert_not_called() + + def test_known_image_prebuilts(self): + client = MagicMock() + assert _resolve_analyzer_modality(client, "prebuilt-imageSearch") == "image" + assert _resolve_analyzer_modality(client, "prebuilt-image") == "image" + client.get_analyzer.assert_not_called() + + def test_unknown_prebuilt_falls_back_to_get_analyzer(self): + """Unknown prebuilt-* names should call get_analyzer() for resolution.""" + client = MagicMock() + mock_analyzer = MagicMock() + mock_analyzer.base_analyzer_id = "prebuilt-audio" + 
client.get_analyzer.return_value = mock_analyzer + + result = _resolve_analyzer_modality(client, "prebuilt-newAnalyzer") + assert result == "audio" + client.get_analyzer.assert_called_once_with("prebuilt-newAnalyzer") + + def test_custom_analyzer_calls_get_analyzer(self): + """Custom analyzers should call get_analyzer() to resolve modality.""" + client = MagicMock() + mock_analyzer = MagicMock() + mock_analyzer.base_analyzer_id = "prebuilt-document" + client.get_analyzer.return_value = mock_analyzer + + result = _resolve_analyzer_modality(client, "my-custom-doc-analyzer") + assert result == "document" + client.get_analyzer.assert_called_once_with("my-custom-doc-analyzer") + + def test_custom_analyzer_no_base_defaults_to_document(self): + """Analyzer with no base_analyzer_id defaults to document.""" + client = MagicMock() + mock_analyzer = MagicMock() + mock_analyzer.base_analyzer_id = None + client.get_analyzer.return_value = mock_analyzer + + result = _resolve_analyzer_modality(client, "my-custom-analyzer") + assert result == "document" + + def test_get_analyzer_failure_raises_value_error(self): + """Failed get_analyzer() should raise ValueError.""" + client = MagicMock() + client.get_analyzer.side_effect = Exception("not found") + + with pytest.raises(ValueError, match="Failed to resolve analyzer 'bad-id'"): + _resolve_analyzer_modality(client, "bad-id") # --------------------------------------------------------------------------- @@ -613,6 +660,12 @@ def test_empty_result(self): result = self._run_convert(".pdf", "application/pdf", "") assert result.markdown == "" + def test_jpeg_returns_markdown(self): + result = self._run_convert( + ".jpg", "image/jpeg", "---\ncontentType: document\n---\n# Photo" + ) + assert "contentType: document" in result.markdown + # --------------------------------------------------------------------------- # Init-time get_analyzer() error wrapping @@ -683,6 +736,71 @@ def test_cu_registered_before_docintel(self): ), "CU should have 
higher priority (lower index) than Doc Intel" +# --------------------------------------------------------------------------- +# CLI argument tests +# --------------------------------------------------------------------------- + + +class TestCLIArgs: + """Test CLI argument parsing for CU flags.""" + + def test_use_cu_without_endpoint_exits(self): + """--use-cu without --cu-endpoint should exit with error.""" + import subprocess + + result = subprocess.run( + [sys.executable, "-m", "markitdown", "--use-cu", "fake.pdf"], + capture_output=True, + text=True, + ) + assert result.returncode != 0 + assert "cu-endpoint" in result.stderr.lower() or "cu-endpoint" in (result.stdout or "").lower() + + def test_use_cu_and_use_docintel_mutually_exclusive(self): + """--use-cu and --use-docintel cannot be used together.""" + import subprocess + + result = subprocess.run( + [ + sys.executable, "-m", "markitdown", + "--use-cu", "--cu-endpoint", "https://fake", + "--use-docintel", "-e", "https://fake-di", + "fake.pdf", + ], + capture_output=True, + text=True, + ) + assert result.returncode != 0 + + def test_cu_file_types_parsing(self): + """--cu-file-types should parse comma-separated values into enum list.""" + from markitdown.converters import ContentUnderstandingFileType + + raw = "pdf,jpeg,mp4" + type_names = [t.strip().lower() for t in raw.split(",") if t.strip()] + cu_types = [ContentUnderstandingFileType(name) for name in type_names] + + assert cu_types == [ + ContentUnderstandingFileType.PDF, + ContentUnderstandingFileType.JPEG, + ContentUnderstandingFileType.MP4, + ] + + def test_cu_file_types_invalid_value(self): + """Unknown file type name should raise ValueError.""" + from markitdown.converters import ContentUnderstandingFileType + + with pytest.raises(ValueError): + ContentUnderstandingFileType("nonsense") + + def test_cu_file_types_single_value(self): + """Single file type (no comma) should parse correctly.""" + from markitdown.converters import 
ContentUnderstandingFileType + + cu_types = [ContentUnderstandingFileType(t.strip().lower()) for t in "wav".split(",") if t.strip()] + assert cu_types == [ContentUnderstandingFileType.WAV] + + # --------------------------------------------------------------------------- # MissingDependencyException test # --------------------------------------------------------------------------- From 6c7f5e78437b43bd84e70e5f6f86cb8f8170e114 Mon Sep 17 00:00:00 2001 From: chienyuanchang Date: Thu, 7 May 2026 13:58:27 -0700 Subject: [PATCH 7/9] add test cases, run black --- README.md | 2 +- .../markitdown/converters/_cu_converter.py | 8 +- .../markitdown/tests/test_cu_converter.py | 87 ++++++++++++++++--- 3 files changed, 76 insertions(+), 21 deletions(-) diff --git a/README.md b/README.md index dd9edffdc..a099e65b4 100644 --- a/README.md +++ b/README.md @@ -179,7 +179,7 @@ Install: `pip install 'markitdown[az-content-understanding]'` Content Understanding is ideal when you need capabilities beyond what built-in or Document Intelligence converters provide: -- **Audio and video files** — CU is the only option for converting MP4, MOV, WAV, MP3, and other media files. Built-in converters have no video support and only basic audio transcription. +- **Audio and video files** — CU is the only option for video, and the higher-quality cloud option for audio. Built-in converters have no video support and only basic audio transcription. - **Structured field extraction** — Custom analyzers extract domain-specific fields (invoice amounts, receipt dates, contract clauses) serialized as YAML front matter. Neither built-in nor Doc Intel integration exposes fields. - **Higher-quality document extraction** — Cloud-based layout analysis and OCR for scanned PDFs, complex tables, and multi-page documents. - **Single API for all modalities** — One `cu_endpoint` handles documents, images, audio, and video with automatic analyzer routing. 
diff --git a/packages/markitdown/src/markitdown/converters/_cu_converter.py b/packages/markitdown/src/markitdown/converters/_cu_converter.py index cfe8fd979..d3c70494f 100644 --- a/packages/markitdown/src/markitdown/converters/_cu_converter.py +++ b/packages/markitdown/src/markitdown/converters/_cu_converter.py @@ -404,14 +404,10 @@ def _resolve_analyzer_modality(client: Any, analyzer_id: str) -> str: try: analyzer_info = client.get_analyzer(analyzer_id) except Exception as exc: - raise ValueError( - f"Failed to resolve analyzer '{analyzer_id}': {exc}" - ) from exc + raise ValueError(f"Failed to resolve analyzer '{analyzer_id}': {exc}") from exc if analyzer_info.base_analyzer_id: - return _BASE_TO_MODALITY.get( - analyzer_info.base_analyzer_id, "document" - ) + return _BASE_TO_MODALITY.get(analyzer_info.base_analyzer_id, "document") return "document" diff --git a/packages/markitdown/tests/test_cu_converter.py b/packages/markitdown/tests/test_cu_converter.py index 2f4c5e8d4..0cfd31aac 100644 --- a/packages/markitdown/tests/test_cu_converter.py +++ b/packages/markitdown/tests/test_cu_converter.py @@ -518,7 +518,9 @@ class TestResolveAnalyzerModality: def test_known_document_prebuilts(self): client = MagicMock() - assert _resolve_analyzer_modality(client, "prebuilt-documentSearch") == "document" + assert ( + _resolve_analyzer_modality(client, "prebuilt-documentSearch") == "document" + ) assert _resolve_analyzer_modality(client, "prebuilt-invoice") == "document" assert _resolve_analyzer_modality(client, "prebuilt-layout") == "document" assert _resolve_analyzer_modality(client, "prebuilt-receipt") == "document" @@ -754,7 +756,10 @@ def test_use_cu_without_endpoint_exits(self): text=True, ) assert result.returncode != 0 - assert "cu-endpoint" in result.stderr.lower() or "cu-endpoint" in (result.stdout or "").lower() + assert ( + "cu-endpoint" in result.stderr.lower() + or "cu-endpoint" in (result.stdout or "").lower() + ) def 
test_use_cu_and_use_docintel_mutually_exclusive(self): """--use-cu and --use-docintel cannot be used together.""" @@ -762,9 +767,15 @@ def test_use_cu_and_use_docintel_mutually_exclusive(self): result = subprocess.run( [ - sys.executable, "-m", "markitdown", - "--use-cu", "--cu-endpoint", "https://fake", - "--use-docintel", "-e", "https://fake-di", + sys.executable, + "-m", + "markitdown", + "--use-cu", + "--cu-endpoint", + "https://fake", + "--use-docintel", + "-e", + "https://fake-di", "fake.pdf", ], capture_output=True, @@ -797,9 +808,53 @@ def test_cu_file_types_single_value(self): """Single file type (no comma) should parse correctly.""" from markitdown.converters import ContentUnderstandingFileType - cu_types = [ContentUnderstandingFileType(t.strip().lower()) for t in "wav".split(",") if t.strip()] + cu_types = [ + ContentUnderstandingFileType(t.strip().lower()) + for t in "wav".split(",") + if t.strip() + ] assert cu_types == [ContentUnderstandingFileType.WAV] + def test_use_cu_wires_kwargs_to_markitdown(self, capsys): + """--use-cu should pass CU options through to MarkItDown.""" + import markitdown.__main__ as markitdown_cli + + markitdown_instance = MagicMock() + markitdown_instance.convert.return_value.markdown = "converted" + markitdown_cls = MagicMock(return_value=markitdown_instance) + + with patch.object( + sys, + "argv", + [ + "markitdown", + "--use-cu", + "--cu-endpoint", + "https://fake-cu", + "--cu-analyzer", + "custom-analyzer", + "--cu-file-types", + "pdf,jpeg,mp4", + "fake.pdf", + ], + ), patch.object(markitdown_cli, "MarkItDown", markitdown_cls): + markitdown_cli.main() + + markitdown_cls.assert_called_once_with( + enable_plugins=False, + cu_endpoint="https://fake-cu", + cu_analyzer_id="custom-analyzer", + cu_file_types=[ + ContentUnderstandingFileType.PDF, + ContentUnderstandingFileType.JPEG, + ContentUnderstandingFileType.MP4, + ], + ) + markitdown_instance.convert.assert_called_once_with( + "fake.pdf", stream_info=None, 
keep_data_uris=False + ) + assert capsys.readouterr().out == "converted\n" + # --------------------------------------------------------------------------- # MissingDependencyException test @@ -810,13 +865,17 @@ class TestMissingDependency: """Test that MissingDependencyException is raised when CU SDK is not installed.""" def test_missing_deps_message(self): - """Verify the exception includes install hint.""" - # We can't easily simulate ImportError in the module, but we can check - # the exception message pattern if it were raised. + """Converter construction should surface the optional install hint.""" + import markitdown.converters._cu_converter as cu_converter_module from markitdown._exceptions import MissingDependencyException - exc = MissingDependencyException( - "ContentUnderstandingConverter requires the optional dependency " - "[az-content-understanding] (or [all]) to be installed." - ) - assert "az-content-understanding" in str(exc) + import_error = ImportError("No module named 'azure.ai.contentunderstanding'") + dependency_exc_info = (ImportError, import_error, None) + + with patch.object( + cu_converter_module, "_dependency_exc_info", dependency_exc_info + ), pytest.raises(MissingDependencyException) as exc_info: + ContentUnderstandingConverter(endpoint="https://fake-cu") + + assert "az-content-understanding" in str(exc_info.value) + assert exc_info.value.__cause__ is import_error From 7a804cf34b39c1293e61199eebf25e55b39f6a2d Mon Sep 17 00:00:00 2001 From: chienyuanchang Date: Thu, 7 May 2026 16:58:11 -0700 Subject: [PATCH 8/9] update readme and deriving content_type from the resolved file_type --- README.md | 22 ++++----- .../markitdown/converters/_cu_converter.py | 29 ++++++++--- .../markitdown/tests/test_cu_converter.py | 49 ++++++++++++++++++- 3 files changed, 81 insertions(+), 19 deletions(-) diff --git a/README.md b/README.md index a099e65b4..12bbe3ad9 100644 --- a/README.md +++ b/README.md @@ -159,19 +159,9 @@ If no `llm_client` is provided the 
plugin still loads, but OCR is silently skipp See [`packages/markitdown-ocr/README.md`](packages/markitdown-ocr/README.md) for detailed documentation. -### Azure Document Intelligence - -To use Microsoft Document Intelligence for conversion: - -```bash -markitdown path-to-file.pdf -o document.md -d -e "" -``` - -More information about how to set up an Azure Document Intelligence Resource can be found [here](https://learn.microsoft.com/en-us/azure/ai-services/document-intelligence/how-to-guides/create-document-intelligence-resource?view=doc-intel-4.0.0) - ### Azure Content Understanding -[Azure Content Understanding](https://learn.microsoft.com/azure/ai-services/content-understanding/) provides higher-quality conversion with structured field extraction (YAML front matter), multi-modal support (documents, images, audio, video), and configurable analyzers. +[Azure Content Understanding](https://learn.microsoft.com/azure/ai-services/content-understanding/) is the next iteration of Azure Document Intelligence and is the recommended cloud option for new projects. It provides higher-quality conversion with structured field extraction (YAML front matter), multi-modal support (documents, images, audio, video), and [prebuilt](https://learn.microsoft.com/azure/ai-services/content-understanding/concepts/prebuilt-analyzers) or [custom-built](https://learn.microsoft.com/azure/ai-services/content-understanding/how-to/customize-analyzer-content-understanding-studio?tabs=portal) analyzers. Install: `pip install 'markitdown[az-content-understanding]'` @@ -246,6 +236,16 @@ md = MarkItDown( More information about Azure Content Understanding can be found [here](https://learn.microsoft.com/azure/ai-services/content-understanding/). 
+### Azure Document Intelligence + +To use Microsoft Document Intelligence for conversion: + +```bash +markitdown path-to-file.pdf -o document.md -d -e "" +``` + +More information about how to set up an Azure Document Intelligence Resource can be found [here](https://learn.microsoft.com/en-us/azure/ai-services/document-intelligence/how-to-guides/create-document-intelligence-resource?view=doc-intel-4.0.0) + ### Python API Basic usage in Python: diff --git a/packages/markitdown/src/markitdown/converters/_cu_converter.py b/packages/markitdown/src/markitdown/converters/_cu_converter.py index d3c70494f..e4080dda7 100644 --- a/packages/markitdown/src/markitdown/converters/_cu_converter.py +++ b/packages/markitdown/src/markitdown/converters/_cu_converter.py @@ -293,15 +293,30 @@ def _content_type_for( file_type: ContentUnderstandingFileType, mimetype: Optional[str], ) -> str: - content_type = _canonical_mime_type(mimetype) - if content_type != "application/octet-stream": - return content_type - + """Resolve the content type to send to the CU API. + + Uses the resolved ``file_type`` as the source of truth so analyzer + routing and payload metadata stay consistent. The caller-provided + ``mimetype`` is only used when it is consistent with ``file_type`` + (e.g., to preserve subtype distinctions like ``image/heic`` vs + ``image/heif``). When ``mimetype`` disagrees with the resolved + ``file_type`` (e.g., ``.pdf`` extension with ``audio/mpeg`` + mimetype), the canonical MIME type for ``file_type`` is used. 
+ """ prefixes = _MIME_PREFIXES.get(file_type, []) - if not prefixes: - return content_type + canonical = _canonical_mime_type(mimetype) + + # Use caller-provided MIME if it's consistent with the resolved file_type + if prefixes and canonical != "application/octet-stream": + for prefix in prefixes: + if canonical.startswith(prefix): + return canonical + + # Fallback: derive from the resolved file_type (single source of truth) + if prefixes: + return _canonical_mime_type(prefixes[0]) - return _canonical_mime_type(prefixes[0]) + return canonical def _detect_file_type_from_mime( diff --git a/packages/markitdown/tests/test_cu_converter.py b/packages/markitdown/tests/test_cu_converter.py index 0cfd31aac..760f5fcc7 100644 --- a/packages/markitdown/tests/test_cu_converter.py +++ b/packages/markitdown/tests/test_cu_converter.py @@ -22,7 +22,6 @@ ) from markitdown._stream_info import StreamInfo - # --------------------------------------------------------------------------- # Helper: create a converter with accepts() working but no SDK init # --------------------------------------------------------------------------- @@ -248,6 +247,54 @@ def test_canonical_mime_type(self, mimetype, expected): def test_content_type_for(self, file_type, mimetype, expected): assert _content_type_for(file_type, mimetype) == expected + @pytest.mark.parametrize( + ("file_type", "mimetype", "expected"), + [ + # Extension/file_type wins when mimetype disagrees — the + # resolved file_type is the single source of truth so that + # analyzer routing and payload metadata stay consistent. 
+ (ContentUnderstandingFileType.PDF, "audio/mpeg", "application/pdf"), + (ContentUnderstandingFileType.MP3, "application/pdf", "audio/mpeg"), + (ContentUnderstandingFileType.MP4, "image/jpeg", "video/mp4"), + (ContentUnderstandingFileType.JPEG, "video/mp4", "image/jpeg"), + # Subtype distinctions are preserved when consistent + # (e.g., HEIC vs HEIF both map to file_type HEIF; if the + # caller passed image/heic explicitly, keep it). + (ContentUnderstandingFileType.HEIF, "image/heic", "image/heic"), + (ContentUnderstandingFileType.HEIF, "image/heif", "image/heif"), + ], + ) + def test_content_type_for_resolves_conflicts_to_file_type( + self, file_type, mimetype, expected + ): + """When extension and mimetype disagree, file_type wins.""" + assert _content_type_for(file_type, mimetype) == expected + + def test_conflicting_extension_and_mimetype_in_convert(self): + """End-to-end: conflicting StreamInfo routes by extension and + sends a content_type consistent with the resolved file_type.""" + conv = _make_converter() + conv._client = MagicMock() + mock_poller = MagicMock() + mock_poller.result.return_value = MagicMock(contents=[]) + conv._client.begin_analyze_binary.return_value = mock_poller + + with patch( + "markitdown.converters._cu_converter.to_llm_input", + return_value="ok", + ): + conv.convert( + io.BytesIO(b"fake"), + # .pdf extension but bogus audio mimetype + StreamInfo(extension=".pdf", mimetype="audio/mpeg"), + ) + + call_kwargs = conv._client.begin_analyze_binary.call_args.kwargs + # Routed by extension: document modality → prebuilt-documentSearch + assert call_kwargs["analyzer_id"] == "prebuilt-documentSearch" + # content_type derived from file_type (PDF), not the conflicting mime + assert call_kwargs["content_type"] == "application/pdf" + def test_file_type_restriction_applies_to_mime(self): assert ( _detect_file_type( From 2ed5af7e703107b15786707d938c48d0fc9e521b Mon Sep 17 00:00:00 2001 From: chienyuanchang Date: Thu, 7 May 2026 17:02:14 -0700 
Subject: [PATCH 9/9] update readme --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 12bbe3ad9..aa2f58bb8 100644 --- a/README.md +++ b/README.md @@ -161,7 +161,7 @@ See [`packages/markitdown-ocr/README.md`](packages/markitdown-ocr/README.md) for ### Azure Content Understanding -[Azure Content Understanding](https://learn.microsoft.com/azure/ai-services/content-understanding/) is the next iteration of Azure Document Intelligence and is the recommended cloud option for new projects. It provides higher-quality conversion with structured field extraction (YAML front matter), multi-modal support (documents, images, audio, video), and [prebuilt](https://learn.microsoft.com/azure/ai-services/content-understanding/concepts/prebuilt-analyzers) or [custom-built](https://learn.microsoft.com/azure/ai-services/content-understanding/how-to/customize-analyzer-content-understanding-studio?tabs=portal) analyzers. +[Azure Content Understanding](https://learn.microsoft.com/azure/ai-services/content-understanding/) provides higher-quality conversion with structured field extraction (YAML front matter), multi-modal support (documents, images, audio, video), and configurable analyzers. Install: `pip install 'markitdown[az-content-understanding]'` @@ -170,7 +170,7 @@ Install: `pip install 'markitdown[az-content-understanding]'` Content Understanding is ideal when you need capabilities beyond what built-in or Document Intelligence converters provide: - **Audio and video files** — CU is the only option for video, and the higher-quality cloud option for audio. Built-in converters have no video support and only basic audio transcription. -- **Structured field extraction** — Custom analyzers extract domain-specific fields (invoice amounts, receipt dates, contract clauses) serialized as YAML front matter. Neither built-in nor Doc Intel integration exposes fields. 
+- **Structured field extraction** — [Prebuilt](https://learn.microsoft.com/azure/ai-services/content-understanding/concepts/prebuilt-analyzers) or [custom-built](https://learn.microsoft.com/azure/ai-services/content-understanding/how-to/customize-analyzer-content-understanding-studio?tabs=portal) analyzers extract domain-specific fields (invoice amounts, receipt dates, contract clauses) serialized as YAML front matter. Neither built-in nor Doc Intel integration exposes fields. - **Higher-quality document extraction** — Cloud-based layout analysis and OCR for scanned PDFs, complex tables, and multi-page documents. - **Single API for all modalities** — One `cu_endpoint` handles documents, images, audio, and video with automatic analyzer routing.