From bdcec69edf1bc96b1eaacc031af12d2b81646522 Mon Sep 17 00:00:00 2001
From: chienyuanchang <ds.chienyuanchang@gmail.com>
Date: Wed, 6 May 2026 12:41:16 -0700
Subject: [PATCH 1/9] initial version

---
 README.md                                     |  60 +++
 packages/markitdown/pyproject.toml            |   2 +
 .../markitdown/src/markitdown/__main__.py     |  55 ++
 .../markitdown/src/markitdown/_markitdown.py  |  23 +
 .../src/markitdown/converters/__init__.py     |   6 +
 .../markitdown/converters/_cu_converter.py    | 485 ++++++++++++++++++
 .../markitdown/tests/test_cu_converter.py     | 329 ++++++++++++
 7 files changed, 960 insertions(+)
 create mode 100644 packages/markitdown/src/markitdown/converters/_cu_converter.py
 create mode 100644 packages/markitdown/tests/test_cu_converter.py

diff --git a/README.md b/README.md
index 71bcaa204..7d4936b36 100644
--- a/README.md
+++ b/README.md
@@ -168,6 +168,66 @@ markitdown path-to-file.pdf -o document.md -d -e "<document_intelligence_endpoin
 
 More information about how to set up an Azure Document Intelligence Resource can be found [here](https://learn.microsoft.com/en-us/azure/ai-services/document-intelligence/how-to-guides/create-document-intelligence-resource?view=doc-intel-4.0.0)
 
+### Azure Content Understanding
+
+[Azure Content Understanding](https://learn.microsoft.com/azure/ai-services/content-understanding/) provides higher-quality conversion with structured field extraction (YAML front matter), multi-modal support (documents, images, audio, video), and configurable analyzers.
+
+Install: `pip install 'markitdown[az-content-understanding]'`
+
+**CLI:**
+
+```bash
+markitdown path-to-file.pdf --use-cu --cu-endpoint "<content_understanding_endpoint>"
+```
+
+**Python API:**
+
+```python
+from markitdown import MarkItDown
+
+# Zero-config — auto-selects analyzer per file type
+md = MarkItDown(cu_endpoint="<content_understanding_endpoint>")
+result = md.convert("report.pdf")   # documents → prebuilt-documentSearch
+result = md.convert("meeting.mp4")  # video → prebuilt-videoSearch
+result = md.convert("call.wav")     # audio → prebuilt-audioSearch
+print(result.markdown)
+```
+
+**With a custom analyzer** (for domain-specific field extraction):
+
+```python
+md = MarkItDown(
+    cu_endpoint="<content_understanding_endpoint>",
+    cu_analyzer_id="my-invoice-analyzer",
+)
+result = md.convert("invoice.pdf")
+print(result.markdown)
+# Output includes YAML front matter with extracted fields:
+# ---
+# contentType: document
+# fields:
+#   VendorName: CONTOSO LTD.
+#   InvoiceDate: '2019-11-15'
+# ---
+# <!-- page 1 -->
+# ...
+```
+
+When `cu_analyzer_id` is set, the converter automatically scopes it to compatible file types based on the analyzer's modality. Incompatible types (e.g., audio files with a document analyzer) auto-route to default prebuilt analyzers.
+
+**Cost note:** Each `convert()` call for a CU-routed format is a billable Azure API call. Use `cu_file_types` to restrict which formats route to CU:
+
+```python
+from markitdown.converters import ContentUnderstandingFileType
+
+md = MarkItDown(
+    cu_endpoint="<content_understanding_endpoint>",
+    cu_file_types=[ContentUnderstandingFileType.PDF],  # only PDFs use CU
+)
+```
+
+More information about Azure Content Understanding can be found [here](https://learn.microsoft.com/azure/ai-services/content-understanding/).
+
 ### Python API
 
 Basic usage in Python:
diff --git a/packages/markitdown/pyproject.toml b/packages/markitdown/pyproject.toml
index ac3c8d947..84841cd03 100644
--- a/packages/markitdown/pyproject.toml
+++ b/packages/markitdown/pyproject.toml
@@ -47,6 +47,7 @@ all = [
   "SpeechRecognition",
   "youtube-transcript-api~=1.0.0",
   "azure-ai-documentintelligence",
+  "azure-ai-contentunderstanding",
   "azure-identity",
 ]
 pptx = ["python-pptx"]
@@ -58,6 +59,7 @@ outlook = ["olefile"]
 audio-transcription = ["pydub", "SpeechRecognition"]
 youtube-transcription = ["youtube-transcript-api"]
 az-doc-intel = ["azure-ai-documentintelligence", "azure-identity"]
+az-content-understanding = ["azure-ai-contentunderstanding", "azure-identity"]
 
 [project.urls]
 Documentation = "https://github.com/microsoft/markitdown#readme"
diff --git a/packages/markitdown/src/markitdown/__main__.py b/packages/markitdown/src/markitdown/__main__.py
index 6085ad6bb..d57b2ae65 100644
--- a/packages/markitdown/src/markitdown/__main__.py
+++ b/packages/markitdown/src/markitdown/__main__.py
@@ -4,6 +4,7 @@
 import argparse
 import sys
 import codecs
+from typing import Any, Dict
 from textwrap import dedent
 from importlib.metadata import entry_points
 from .__about__ import __version__
@@ -91,6 +92,32 @@ def main():
         help="Document Intelligence Endpoint. Required if using Document Intelligence.",
     )
 
+    parser.add_argument(
+        "--use-cu",
+        "--use-content-understanding",
+        action="store_true",
+        dest="use_cu",
+        help="Use Azure Content Understanding to extract text. Requires --cu-endpoint.",
+    )
+
+    parser.add_argument(
+        "--cu-endpoint",
+        type=str,
+        help="Content Understanding Endpoint. Required if using --use-cu.",
+    )
+
+    parser.add_argument(
+        "--cu-analyzer",
+        type=str,
+        help="Content Understanding analyzer ID. If not specified, auto-selects by file type.",
+    )
+
+    parser.add_argument(
+        "--cu-file-types",
+        type=str,
+        help="Comma-separated list of file types to route to Content Understanding (e.g., pdf,jpeg,mp4). If omitted, all supported types are routed.",
+    )
+
     parser.add_argument(
         "-p",
         "--use-plugins",
@@ -183,6 +210,34 @@ def main():
         markitdown = MarkItDown(
             enable_plugins=args.use_plugins, docintel_endpoint=args.endpoint
         )
+    elif args.use_cu:
+        if args.cu_endpoint is None:
+            _exit_with_error(
+                "Content Understanding Endpoint (--cu-endpoint) is required when using --use-cu."
+            )
+        elif args.filename is None:
+            _exit_with_error("Filename is required when using Content Understanding.")
+
+        cu_kwargs: Dict[str, Any] = {
+            "cu_endpoint": args.cu_endpoint,
+        }
+        if args.cu_analyzer is not None:
+            cu_kwargs["cu_analyzer_id"] = args.cu_analyzer
+        if args.cu_file_types is not None:
+            # Parse comma-separated file types into ContentUnderstandingFileType list
+            from .converters import ContentUnderstandingFileType
+
+            type_names = [t.strip().lower() for t in args.cu_file_types.split(",") if t.strip()]
+            cu_types = []
+            for name in type_names:
+                # Try matching by value (e.g., "pdf", "jpeg", "mp4")
+                try:
+                    cu_types.append(ContentUnderstandingFileType(name))
+                except ValueError:
+                    _exit_with_error(f"Unknown file type: {name}")
+            cu_kwargs["cu_file_types"] = cu_types
+
+        markitdown = MarkItDown(enable_plugins=args.use_plugins, **cu_kwargs)
     else:
         markitdown = MarkItDown(enable_plugins=args.use_plugins)
 
diff --git a/packages/markitdown/src/markitdown/_markitdown.py b/packages/markitdown/src/markitdown/_markitdown.py
index f342a614b..f6aa4df0e 100644
--- a/packages/markitdown/src/markitdown/_markitdown.py
+++ b/packages/markitdown/src/markitdown/_markitdown.py
@@ -38,6 +38,7 @@
     ZipConverter,
     EpubConverter,
     DocumentIntelligenceConverter,
+    ContentUnderstandingConverter,
     CsvConverter,
 )
 
@@ -225,6 +226,28 @@ def enable_builtins(self, **kwargs) -> None:
                     DocumentIntelligenceConverter(**docintel_args),
                 )
 
+            # Register Content Understanding converter at the top of the stack if endpoint is provided
+            cu_endpoint = kwargs.get("cu_endpoint")
+            if cu_endpoint is not None:
+                cu_args: Dict[str, Any] = {}
+                cu_args["endpoint"] = cu_endpoint
+
+                cu_credential = kwargs.get("cu_credential")
+                if cu_credential is not None:
+                    cu_args["credential"] = cu_credential
+
+                cu_analyzer_id = kwargs.get("cu_analyzer_id")
+                if cu_analyzer_id is not None:
+                    cu_args["analyzer_id"] = cu_analyzer_id
+
+                cu_file_types = kwargs.get("cu_file_types")
+                if cu_file_types is not None:
+                    cu_args["file_types"] = cu_file_types
+
+                self.register_converter(
+                    ContentUnderstandingConverter(**cu_args),
+                )
+
             self._builtins_enabled = True
         else:
             warn("Built-in converters are already enabled.", RuntimeWarning)
diff --git a/packages/markitdown/src/markitdown/converters/__init__.py b/packages/markitdown/src/markitdown/converters/__init__.py
index e4437a582..77f8b1acd 100644
--- a/packages/markitdown/src/markitdown/converters/__init__.py
+++ b/packages/markitdown/src/markitdown/converters/__init__.py
@@ -21,6 +21,10 @@
     DocumentIntelligenceConverter,
     DocumentIntelligenceFileType,
 )
+from ._cu_converter import (
+    ContentUnderstandingConverter,
+    ContentUnderstandingFileType,
+)
 from ._epub_converter import EpubConverter
 from ._csv_converter import CsvConverter
 
@@ -43,6 +47,8 @@
     "ZipConverter",
     "DocumentIntelligenceConverter",
     "DocumentIntelligenceFileType",
+    "ContentUnderstandingConverter",
+    "ContentUnderstandingFileType",
     "EpubConverter",
     "CsvConverter",
 ]
diff --git a/packages/markitdown/src/markitdown/converters/_cu_converter.py b/packages/markitdown/src/markitdown/converters/_cu_converter.py
new file mode 100644
index 000000000..23bf3ae62
--- /dev/null
+++ b/packages/markitdown/src/markitdown/converters/_cu_converter.py
@@ -0,0 +1,485 @@
+"""Azure Content Understanding converter for MarkItDown.
+
+Converts files using Azure Content Understanding (CU) for high-quality,
+multi-modal extraction with structured field output. Supports documents,
+images, audio, and video. Fields are serialized as YAML front matter via
+the CU SDK's ``to_llm_input()`` helper.
+
+Install dependencies: ``pip install 'markitdown[az-content-understanding]'``
+"""
+
+import sys
+import os
+from typing import BinaryIO, Any, List, Optional, Dict
+from enum import Enum
+
+from .._base_converter import DocumentConverter, DocumentConverterResult
+from .._stream_info import StreamInfo
+from .._exceptions import MissingDependencyException
+
+# Try loading optional dependencies — save error for later
+_dependency_exc_info = None
+try:
+    from azure.ai.contentunderstanding import ContentUnderstandingClient, to_llm_input
+    from azure.core.credentials import AzureKeyCredential, TokenCredential
+    from azure.core.pipeline.policies import UserAgentPolicy
+    from azure.identity import DefaultAzureCredential
+except ImportError:
+    _dependency_exc_info = sys.exc_info()
+
+    # Stub classes for type hinting
+    class AzureKeyCredential:  # type: ignore[no-redef]
+        pass
+
+    class TokenCredential:  # type: ignore[no-redef]
+        pass
+
+    class ContentUnderstandingClient:  # type: ignore[no-redef]
+        pass
+
+    class UserAgentPolicy:  # type: ignore[no-redef]
+        pass
+
+    class DefaultAzureCredential:  # type: ignore[no-redef]
+        pass
+
+    def to_llm_input(*args, **kwargs):  # type: ignore[no-redef]
+        pass
+
+
+# ---------------------------------------------------------------------------
+# File type enum and routing tables
+# ---------------------------------------------------------------------------
+
+
+class ContentUnderstandingFileType(str, Enum):
+    """Supported file types for Content Understanding conversion."""
+
+    # Documents (text, office, and markup formats)
+    PDF = "pdf"
+    DOCX = "docx"
+    PPTX = "pptx"
+    XLSX = "xlsx"
+    HTML = "html"
+    TXT = "txt"
+    MD = "md"
+    RTF = "rtf"
+    XML = "xml"
+
+    # Email
+    EML = "eml"
+    MSG = "msg"
+
+    # Images (grouped with documents when a modality is chosen)
+    JPEG = "jpeg"
+    PNG = "png"
+    BMP = "bmp"
+    TIFF = "tiff"
+    HEIF = "heif"
+
+    # Video
+    MP4 = "mp4"
+    M4V = "m4v"
+    MOV = "mov"
+    AVI = "avi"
+    MKV = "mkv"
+    WEBM_VIDEO = "webm-video"  # NOTE: value is not the bare ".webm" extension
+    FLV = "flv"
+    WMV = "wmv"
+
+    # Audio
+    WAV = "wav"
+    MP3 = "mp3"
+    M4A = "m4a"
+    FLAC = "flac"
+    OGG = "ogg"
+    AAC = "aac"
+    WMA = "wma"
+
+
+# Lowercase file extension (with leading dot) → file type; aliases share a type
+_EXTENSION_MAP: Dict[str, ContentUnderstandingFileType] = {
+    # Documents
+    ".pdf": ContentUnderstandingFileType.PDF,
+    ".docx": ContentUnderstandingFileType.DOCX,
+    ".pptx": ContentUnderstandingFileType.PPTX,
+    ".xlsx": ContentUnderstandingFileType.XLSX,
+    ".html": ContentUnderstandingFileType.HTML,
+    ".txt": ContentUnderstandingFileType.TXT,
+    ".md": ContentUnderstandingFileType.MD,
+    ".rtf": ContentUnderstandingFileType.RTF,
+    ".xml": ContentUnderstandingFileType.XML,
+    # Email
+    ".eml": ContentUnderstandingFileType.EML,
+    ".msg": ContentUnderstandingFileType.MSG,
+    # Images (three JPEG spellings and two HEIF spellings collapse to one type each)
+    ".jpg": ContentUnderstandingFileType.JPEG,
+    ".jpeg": ContentUnderstandingFileType.JPEG,
+    ".jpe": ContentUnderstandingFileType.JPEG,
+    ".png": ContentUnderstandingFileType.PNG,
+    ".bmp": ContentUnderstandingFileType.BMP,
+    ".tiff": ContentUnderstandingFileType.TIFF,
+    ".heif": ContentUnderstandingFileType.HEIF,
+    ".heic": ContentUnderstandingFileType.HEIF,
+    # Video
+    ".mp4": ContentUnderstandingFileType.MP4,
+    ".m4v": ContentUnderstandingFileType.M4V,
+    ".mov": ContentUnderstandingFileType.MOV,
+    ".avi": ContentUnderstandingFileType.AVI,
+    ".mkv": ContentUnderstandingFileType.MKV,
+    ".webm": ContentUnderstandingFileType.WEBM_VIDEO,
+    ".flv": ContentUnderstandingFileType.FLV,
+    ".wmv": ContentUnderstandingFileType.WMV,
+    # Audio
+    ".wav": ContentUnderstandingFileType.WAV,
+    ".mp3": ContentUnderstandingFileType.MP3,
+    ".m4a": ContentUnderstandingFileType.M4A,
+    ".flac": ContentUnderstandingFileType.FLAC,
+    ".ogg": ContentUnderstandingFileType.OGG,
+    ".aac": ContentUnderstandingFileType.AAC,
+    ".wma": ContentUnderstandingFileType.WMA,
+}
+
+# File type → MIME type prefixes; accepts() matches them with str.startswith()
+_MIME_PREFIXES: Dict[ContentUnderstandingFileType, List[str]] = {
+    # Documents
+    ContentUnderstandingFileType.PDF: ["application/pdf", "application/x-pdf"],
+    ContentUnderstandingFileType.DOCX: [
+        "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
+    ],
+    ContentUnderstandingFileType.PPTX: [
+        "application/vnd.openxmlformats-officedocument.presentationml"
+    ],
+    ContentUnderstandingFileType.XLSX: [
+        "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
+    ],
+    ContentUnderstandingFileType.HTML: ["text/html", "application/xhtml+xml"],
+    ContentUnderstandingFileType.TXT: ["text/plain"],
+    ContentUnderstandingFileType.MD: ["text/markdown"],
+    ContentUnderstandingFileType.RTF: ["text/rtf", "application/rtf"],
+    ContentUnderstandingFileType.XML: ["text/xml", "application/xml"],
+    # Email
+    ContentUnderstandingFileType.EML: ["message/rfc822"],
+    ContentUnderstandingFileType.MSG: ["application/vnd.ms-outlook"],
+    # Images
+    ContentUnderstandingFileType.JPEG: ["image/jpeg"],
+    ContentUnderstandingFileType.PNG: ["image/png"],
+    ContentUnderstandingFileType.BMP: ["image/bmp"],
+    ContentUnderstandingFileType.TIFF: ["image/tiff"],
+    ContentUnderstandingFileType.HEIF: ["image/heif", "image/heic"],
+    # Video
+    ContentUnderstandingFileType.MP4: ["video/mp4"],
+    ContentUnderstandingFileType.M4V: ["video/x-m4v"],
+    ContentUnderstandingFileType.MOV: ["video/quicktime"],
+    ContentUnderstandingFileType.AVI: ["video/x-msvideo"],
+    ContentUnderstandingFileType.MKV: ["video/x-matroska"],
+    ContentUnderstandingFileType.WEBM_VIDEO: ["video/webm"],
+    ContentUnderstandingFileType.FLV: ["video/x-flv"],
+    ContentUnderstandingFileType.WMV: ["video/x-ms-wmv"],
+    # Audio (note "audio/mp4" is M4A while "video/mp4" is MP4)
+    ContentUnderstandingFileType.WAV: ["audio/wav", "audio/x-wav"],
+    ContentUnderstandingFileType.MP3: ["audio/mpeg", "audio/mp3"],
+    ContentUnderstandingFileType.M4A: ["audio/mp4", "audio/m4a"],
+    ContentUnderstandingFileType.FLAC: ["audio/flac", "audio/x-flac"],
+    ContentUnderstandingFileType.OGG: ["audio/ogg"],
+    ContentUnderstandingFileType.AAC: ["audio/aac"],
+    ContentUnderstandingFileType.WMA: ["audio/x-ms-wma"],
+}
+
+# Modality membership sets read by _get_modality(); images count as documents
+_DOCUMENT_TYPES = {
+    ContentUnderstandingFileType.PDF,
+    ContentUnderstandingFileType.DOCX,
+    ContentUnderstandingFileType.PPTX,
+    ContentUnderstandingFileType.XLSX,
+    ContentUnderstandingFileType.HTML,
+    ContentUnderstandingFileType.TXT,
+    ContentUnderstandingFileType.MD,
+    ContentUnderstandingFileType.RTF,
+    ContentUnderstandingFileType.XML,
+    ContentUnderstandingFileType.EML,
+    ContentUnderstandingFileType.MSG,
+    ContentUnderstandingFileType.JPEG,
+    ContentUnderstandingFileType.PNG,
+    ContentUnderstandingFileType.BMP,
+    ContentUnderstandingFileType.TIFF,
+    ContentUnderstandingFileType.HEIF,
+}
+
+_VIDEO_TYPES = {
+    ContentUnderstandingFileType.MP4,
+    ContentUnderstandingFileType.M4V,
+    ContentUnderstandingFileType.MOV,
+    ContentUnderstandingFileType.AVI,
+    ContentUnderstandingFileType.MKV,
+    ContentUnderstandingFileType.WEBM_VIDEO,
+    ContentUnderstandingFileType.FLV,
+    ContentUnderstandingFileType.WMV,
+}
+
+_AUDIO_TYPES = {
+    ContentUnderstandingFileType.WAV,
+    ContentUnderstandingFileType.MP3,
+    ContentUnderstandingFileType.M4A,
+    ContentUnderstandingFileType.FLAC,
+    ContentUnderstandingFileType.OGG,
+    ContentUnderstandingFileType.AAC,
+    ContentUnderstandingFileType.WMA,
+}
+
+_DEFAULT_ANALYZERS = {  # modality → prebuilt analyzer used as the fallback
+    "document": "prebuilt-documentSearch",
+    "video": "prebuilt-videoSearch",
+    "audio": "prebuilt-audioSearch",
+}
+
+# Every enum member; the converter handles all of them when file_types is None
+_ALL_FILE_TYPES = list(ContentUnderstandingFileType)
+
+
+def _get_modality(file_type: ContentUnderstandingFileType) -> str:
+    """Return "document", "video", or "audio" for *file_type*."""
+    # Checked in order: document, video, audio
+    table = {"document": _DOCUMENT_TYPES, "video": _VIDEO_TYPES, "audio": _AUDIO_TYPES}
+    for modality, members in table.items():
+        if file_type in members:
+            return modality
+    # Not a member of any modality set
+    raise ValueError(f"Unknown file type: {file_type}")
+
+
+# ---------------------------------------------------------------------------
+# Smart routing: base_analyzer_id → modality mapping
+# ---------------------------------------------------------------------------
+
+_BASE_TO_MODALITY: Dict[str, str] = {  # custom analyzer's base_analyzer_id → modality
+    "prebuilt-document": "document",
+    "prebuilt-image": "document",  # CU images return kind="document"
+    "prebuilt-audio": "audio",
+    "prebuilt-video": "video",
+}
+
+# Known prebuilt analyzer IDs → modality, so prebuilts need no get_analyzer() call
+_PREBUILT_MODALITY: Dict[str, str] = {
+    # Document-based prebuilts
+    "prebuilt-documentSearch": "document",
+    "prebuilt-layout": "document",
+    "prebuilt-read": "document",
+    "prebuilt-document": "document",
+    "prebuilt-invoice": "document",
+    "prebuilt-receipt": "document",
+    "prebuilt-receipt.generic": "document",
+    "prebuilt-receipt.hotel": "document",
+    "prebuilt-idDocument": "document",
+    "prebuilt-idDocument.generic": "document",
+    "prebuilt-idDocument.passport": "document",
+    "prebuilt-healthInsuranceCard.us": "document",
+    "prebuilt-contract": "document",
+    "prebuilt-creditCard": "document",
+    "prebuilt-creditMemo": "document",
+    "prebuilt-bankStatement.us": "document",
+    "prebuilt-check.us": "document",
+    "prebuilt-purchaseOrder": "document",
+    "prebuilt-procurement": "document",
+    "prebuilt-payStub.us": "document",
+    "prebuilt-utilityBill": "document",
+    "prebuilt-marriageCertificate.us": "document",
+    "prebuilt-documentFieldSchema": "document",
+    "prebuilt-documentFields": "document",
+    # Tax prebuilts (all document-based)
+    "prebuilt-tax.us": "document",
+    "prebuilt-tax.us.w2": "document",
+    "prebuilt-tax.us.w4": "document",
+    "prebuilt-tax.us.1040": "document",
+    # Mortgage prebuilts
+    "prebuilt-mortgage.us": "document",
+    "prebuilt-mortgage.us.1003": "document",
+    "prebuilt-mortgage.us.closingDisclosure": "document",
+    # Image-based prebuilts
+    "prebuilt-image": "document",  # images are document modality in CU
+    "prebuilt-imageSearch": "document",
+    # Audio-based prebuilts
+    "prebuilt-audio": "audio",
+    "prebuilt-audioSearch": "audio",
+    "prebuilt-callCenter": "audio",
+    # Video-based prebuilts
+    "prebuilt-video": "video",
+    "prebuilt-videoSearch": "video",
+    "prebuilt-videoSynopsis": "video",
+}
+
+
+def _infer_prebuilt_modality(analyzer_id: str) -> str:
+    """Look up a prebuilt analyzer's modality locally, without an API call.
+
+    Unknown IDs default to "document", since most prebuilts are document-based.
+    """
+    return _PREBUILT_MODALITY.get(analyzer_id, "document")
+
+
+# ---------------------------------------------------------------------------
+# Converter
+# ---------------------------------------------------------------------------
+
+
+class ContentUnderstandingConverter(DocumentConverter):
+    """Converts files using Azure Content Understanding.
+
+    Provides high-quality document, image, audio, and video conversion
+    with structured field extraction via YAML front matter.
+    """
+
+    def __init__(
+        self,
+        *,
+        endpoint: str,
+        credential: AzureKeyCredential | TokenCredential | None = None,
+        analyzer_id: Optional[str] = None,
+        file_types: Optional[List[ContentUnderstandingFileType]] = None,
+    ):
+        """Initialize the Content Understanding converter.
+
+        Args:
+            endpoint: CU resource endpoint URL.
+            credential: Explicit credential. If None, falls back to
+                AZURE_API_KEY env var, then DefaultAzureCredential.
+            analyzer_id: Analyzer to use for compatible file types.
+                Its modality is inferred from the name for "prebuilt-"
+                IDs, otherwise looked up once via get_analyzer(). Files
+                of any other modality are routed to the default prebuilt
+                analyzers. If None, auto-selects by extension/MIME.
+            file_types: Which file types to handle. If None, uses the
+                default set (all supported formats).
+
+        Raises:
+            MissingDependencyException: If the optional Azure SDK
+                packages could not be imported.
+        """
+        super().__init__()
+
+        # Raise if dependencies are missing
+        if _dependency_exc_info is not None:
+            raise MissingDependencyException(
+                "ContentUnderstandingConverter requires the optional dependency "
+                "[az-content-understanding] (or [all]) to be installed. "
+                "E.g., `pip install markitdown[az-content-understanding]`"
+            ) from _dependency_exc_info[
+                1
+            ].with_traceback(  # type: ignore[union-attr]
+                _dependency_exc_info[2]
+            )
+
+        self._file_types = file_types if file_types is not None else _ALL_FILE_TYPES
+        self._analyzer_id = analyzer_id
+        self._analyzer_modality: Optional[str] = None
+
+        # Resolve credential
+        if credential is None:
+            api_key = os.environ.get("AZURE_API_KEY")
+            if api_key is not None:
+                credential = AzureKeyCredential(api_key)
+            else:
+                credential = DefaultAzureCredential()
+
+        # Build file type lookup sets
+        self._accepted_extensions = {
+            ext for ext, ft in _EXTENSION_MAP.items() if ft in self._file_types
+        }
+        self._accepted_mime_prefixes: List[str] = [
+            prefix for ft in self._file_types for prefix in _MIME_PREFIXES.get(ft, [])
+        ]
+
+        # User agent for telemetry
+        try:
+            from ..__about__ import __version__
+        except ImportError:
+            __version__ = "unknown"
+        user_agent = f"markitdown-cu/{__version__}"
+
+        # Create CU client
+        self._client = ContentUnderstandingClient(
+            endpoint=endpoint,
+            credential=credential,
+            user_agent_policy=UserAgentPolicy(user_agent=user_agent),
+        )
+
+        # Smart routing: resolve analyzer modality at init
+        if self._analyzer_id is not None:
+            if self._analyzer_id.startswith("prebuilt-"):
+                # Infer from name — no API call
+                self._analyzer_modality = _infer_prebuilt_modality(self._analyzer_id)
+            else:
+                # Custom analyzer — one get_analyzer() call, cached
+                analyzer_info = self._client.get_analyzer(self._analyzer_id)
+                if analyzer_info.base_analyzer_id:
+                    self._analyzer_modality = _BASE_TO_MODALITY.get(
+                        analyzer_info.base_analyzer_id, "document"
+                    )
+                else:
+                    self._analyzer_modality = "document"
+
+    def accepts(
+        self,
+        file_stream: BinaryIO,
+        stream_info: StreamInfo,
+        **kwargs: Any,
+    ) -> bool:
+        """Return True if the file type is in the configured set."""
+        mimetype = (stream_info.mimetype or "").lower()
+        extension = (stream_info.extension or "").lower()
+
+        if extension in self._accepted_extensions:
+            return True
+
+        # str.startswith accepts a tuple: one call covers every prefix
+        return mimetype.startswith(tuple(self._accepted_mime_prefixes))
+
+    def convert(
+        self,
+        file_stream: BinaryIO,
+        stream_info: StreamInfo,
+        **kwargs: Any,
+    ) -> DocumentConverterResult:
+        """Convert the file using CU and return Markdown with YAML front matter.
+
+        The analyzer is chosen from the file's modality, which is taken from
+        the extension or, failing that, from the MIME type.
+        """
+
+        # 1. Determine the file's modality: extension first, then MIME type
+        extension = (stream_info.extension or "").lower()
+        mimetype = (stream_info.mimetype or "").lower()
+        file_type = _EXTENSION_MAP.get(extension)
+        if file_type is None:
+            # accepts() also admits files by MIME type alone, so resolve those too
+            for candidate, prefixes in _MIME_PREFIXES.items():
+                if mimetype.startswith(tuple(prefixes)):
+                    file_type = candidate
+                    break
+        # Types matched by neither lookup are treated as documents
+        file_modality = _get_modality(file_type) if file_type is not None else "document"
+
+        # 2. Use the configured analyzer only when its modality matches the file
+        if (
+            self._analyzer_id is not None
+            and self._analyzer_modality is not None
+            and file_modality == self._analyzer_modality
+        ):
+            analyzer_id = self._analyzer_id
+        else:
+            analyzer_id = _DEFAULT_ANALYZERS.get(file_modality, "prebuilt-documentSearch")
+
+        # 3. Read file bytes and determine MIME type
+        file_bytes = file_stream.read()
+        content_type = stream_info.mimetype or "application/octet-stream"
+
+        # 4. Call CU SDK and block on the result
+        poller = self._client.begin_analyze_binary(
+            analyzer_id=analyzer_id,
+            binary_input=file_bytes,
+            content_type=content_type,
+        )
+        result = poller.result()
+
+        # 5. Format output using to_llm_input() and return
+        return DocumentConverterResult(markdown=to_llm_input(result))
diff --git a/packages/markitdown/tests/test_cu_converter.py b/packages/markitdown/tests/test_cu_converter.py
new file mode 100644
index 000000000..2a9fba773
--- /dev/null
+++ b/packages/markitdown/tests/test_cu_converter.py
@@ -0,0 +1,329 @@
+"""Tests for ContentUnderstandingConverter.
+
+Tests accepts() routing, smart routing modality logic, and convert() via mocks.
+Follows the same pattern as test_docintel_html.py.
+"""
+
+import io
+from unittest.mock import MagicMock, patch, PropertyMock
+
+import pytest
+
+from markitdown.converters._cu_converter import (
+    ContentUnderstandingConverter,
+    ContentUnderstandingFileType,
+    _infer_prebuilt_modality,
+    _get_modality,
+    _EXTENSION_MAP,
+)
+from markitdown._stream_info import StreamInfo
+
+
+# ---------------------------------------------------------------------------
+# Helper: create a converter with accepts() working but no SDK init
+# ---------------------------------------------------------------------------
+
+def _make_converter(file_types=None, analyzer_id=None, analyzer_modality=None):
+    """Create a converter bypassing __init__ (no SDK deps needed)."""
+    conv = ContentUnderstandingConverter.__new__(ContentUnderstandingConverter)
+    conv._analyzer_id = analyzer_id
+    conv._analyzer_modality = analyzer_modality
+
+    # Build accepted extensions/mime from file_types
+    from markitdown.converters._cu_converter import (
+        _ALL_FILE_TYPES,
+        _MIME_PREFIXES,
+    )
+
+    types = file_types if file_types is not None else _ALL_FILE_TYPES
+    conv._file_types = types
+
+    conv._accepted_extensions = set()
+    conv._accepted_mime_prefixes = []
+    for ft in types:
+        for ext, mapped_ft in _EXTENSION_MAP.items():
+            if mapped_ft == ft:
+                conv._accepted_extensions.add(ext)
+        if ft in _MIME_PREFIXES:
+            conv._accepted_mime_prefixes.extend(_MIME_PREFIXES[ft])
+
+    return conv
+
+
+# ---------------------------------------------------------------------------
+# accepts() tests — extension-based
+# ---------------------------------------------------------------------------
+
+class TestAcceptsExtension:
+    """Test accepts() for supported and unsupported file extensions."""
+
+    @pytest.mark.parametrize("ext", [
+        ".pdf", ".docx", ".pptx", ".xlsx", ".html", ".txt", ".md", ".rtf", ".xml",
+        ".eml", ".msg",
+        ".jpg", ".jpeg", ".jpe", ".png", ".bmp", ".tiff", ".heif", ".heic",
+        ".mp4", ".m4v", ".mov", ".avi", ".mkv", ".webm", ".flv", ".wmv",
+        ".wav", ".mp3", ".m4a", ".flac", ".ogg", ".aac", ".wma",
+    ])
+    def test_accepts_supported_extensions(self, ext):
+        conv = _make_converter()
+        assert conv.accepts(io.BytesIO(b""), StreamInfo(extension=ext))
+
+    @pytest.mark.parametrize("ext", [".csv", ".json", ".zip", ".epub", ".py", ".rs"])
+    def test_rejects_unsupported_extensions(self, ext):
+        conv = _make_converter()
+        assert not conv.accepts(io.BytesIO(b""), StreamInfo(extension=ext))
+
+
+# ---------------------------------------------------------------------------
+# accepts() tests — MIME-based
+# ---------------------------------------------------------------------------
+
+class TestAcceptsMime:
+    """Test accepts() for MIME type matching."""
+
+    @pytest.mark.parametrize("mime", [
+        "application/pdf",
+        "image/jpeg",
+        "video/mp4",
+        "audio/wav",
+        "text/html",
+        "audio/mpeg",
+        "video/quicktime",
+    ])
+    def test_accepts_supported_mimetypes(self, mime):
+        conv = _make_converter()
+        assert conv.accepts(io.BytesIO(b""), StreamInfo(mimetype=mime))
+
+    @pytest.mark.parametrize("mime", [
+        "text/csv",
+        "application/json",
+        "application/zip",
+    ])
+    def test_rejects_unsupported_mimetypes(self, mime):
+        conv = _make_converter()
+        assert not conv.accepts(io.BytesIO(b""), StreamInfo(mimetype=mime))
+
+
+# ---------------------------------------------------------------------------
+# accepts() tests — cu_file_types restriction
+# ---------------------------------------------------------------------------
+
+class TestAcceptsFileTypeRestriction:
+    """Test that cu_file_types restricts which formats are accepted."""
+
+    def test_restricted_to_pdf_only(self):
+        conv = _make_converter(file_types=[ContentUnderstandingFileType.PDF])
+        assert conv.accepts(io.BytesIO(b""), StreamInfo(extension=".pdf"))
+        assert not conv.accepts(io.BytesIO(b""), StreamInfo(extension=".mp4"))
+        assert not conv.accepts(io.BytesIO(b""), StreamInfo(extension=".wav"))
+        assert not conv.accepts(io.BytesIO(b""), StreamInfo(extension=".jpg"))
+
+    def test_restricted_to_audio(self):
+        conv = _make_converter(file_types=[
+            ContentUnderstandingFileType.WAV,
+            ContentUnderstandingFileType.MP3,
+        ])
+        assert conv.accepts(io.BytesIO(b""), StreamInfo(extension=".wav"))
+        assert conv.accepts(io.BytesIO(b""), StreamInfo(extension=".mp3"))
+        assert not conv.accepts(io.BytesIO(b""), StreamInfo(extension=".pdf"))
+
+
+# ---------------------------------------------------------------------------
+# Smart routing tests
+# ---------------------------------------------------------------------------
+
+class TestSmartRouting:
+    """Test modality-aware analyzer routing."""
+
+    def test_document_analyzer_routes_pdf_to_custom(self):
+        """Document-based analyzer should be used for PDF."""
+        conv = _make_converter(
+            analyzer_id="my-doc-analyzer",
+            analyzer_modality="document",
+        )
+        conv._client = MagicMock()
+        mock_result = MagicMock()
+        mock_result.contents = []
+        mock_poller = MagicMock()
+        mock_poller.result.return_value = mock_result
+
+        conv._client.begin_analyze_binary.return_value = mock_poller
+
+        with patch("markitdown.converters._cu_converter.to_llm_input", return_value=""):
+            conv.convert(io.BytesIO(b"fake pdf"), StreamInfo(extension=".pdf", mimetype="application/pdf"))
+
+        # Should use the custom analyzer for PDF (document modality)
+        call_args = conv._client.begin_analyze_binary.call_args
+        assert call_args.kwargs["analyzer_id"] == "my-doc-analyzer"
+
+    def test_document_analyzer_routes_mp3_to_prebuilt(self):
+        """Document-based analyzer should auto-route MP3 to prebuilt-audioSearch."""
+        conv = _make_converter(
+            analyzer_id="my-doc-analyzer",
+            analyzer_modality="document",
+        )
+        conv._client = MagicMock()
+        mock_result = MagicMock()
+        mock_result.contents = []
+        mock_poller = MagicMock()
+        mock_poller.result.return_value = mock_result
+
+        conv._client.begin_analyze_binary.return_value = mock_poller
+
+        with patch("markitdown.converters._cu_converter.to_llm_input", return_value=""):
+            conv.convert(io.BytesIO(b"fake audio"), StreamInfo(extension=".mp3", mimetype="audio/mpeg"))
+
+        call_args = conv._client.begin_analyze_binary.call_args
+        assert call_args.kwargs["analyzer_id"] == "prebuilt-audioSearch"
+
+    def test_document_analyzer_routes_mp4_to_prebuilt(self):
+        """Document-based analyzer should auto-route MP4 to prebuilt-videoSearch."""
+        conv = _make_converter(
+            analyzer_id="my-doc-analyzer",
+            analyzer_modality="document",
+        )
+        conv._client = MagicMock()
+        mock_result = MagicMock()
+        mock_result.contents = []
+        mock_poller = MagicMock()
+        mock_poller.result.return_value = mock_result
+
+        conv._client.begin_analyze_binary.return_value = mock_poller
+
+        with patch("markitdown.converters._cu_converter.to_llm_input", return_value=""):
+            conv.convert(io.BytesIO(b"fake video"), StreamInfo(extension=".mp4", mimetype="video/mp4"))
+
+        call_args = conv._client.begin_analyze_binary.call_args
+        assert call_args.kwargs["analyzer_id"] == "prebuilt-videoSearch"
+
+    def test_no_analyzer_id_uses_auto_routing(self):
+        """Without analyzer_id, PDF should auto-route to prebuilt-documentSearch."""
+        conv = _make_converter(analyzer_id=None, analyzer_modality=None)
+        conv._client = MagicMock()
+        mock_result = MagicMock()
+        mock_result.contents = []
+        mock_poller = MagicMock()
+        mock_poller.result.return_value = mock_result
+
+        conv._client.begin_analyze_binary.return_value = mock_poller
+
+        with patch("markitdown.converters._cu_converter.to_llm_input", return_value=""):
+            conv.convert(io.BytesIO(b"fake pdf"), StreamInfo(extension=".pdf", mimetype="application/pdf"))
+
+        call_args = conv._client.begin_analyze_binary.call_args
+        assert call_args.kwargs["analyzer_id"] == "prebuilt-documentSearch"
+
+
+# ---------------------------------------------------------------------------
+# _infer_prebuilt_modality tests
+# ---------------------------------------------------------------------------
+
+class TestInferPrebuiltModality:
+    """Test modality inference from prebuilt analyzer names."""
+
+    def test_document_prebuilts(self):
+        assert _infer_prebuilt_modality("prebuilt-documentSearch") == "document"
+        assert _infer_prebuilt_modality("prebuilt-invoice") == "document"
+        assert _infer_prebuilt_modality("prebuilt-layout") == "document"
+        assert _infer_prebuilt_modality("prebuilt-receipt") == "document"
+        assert _infer_prebuilt_modality("prebuilt-tax.us.w2") == "document"
+
+    def test_audio_prebuilts(self):
+        assert _infer_prebuilt_modality("prebuilt-audioSearch") == "audio"
+        assert _infer_prebuilt_modality("prebuilt-callCenter") == "audio"
+
+    def test_video_prebuilts(self):
+        assert _infer_prebuilt_modality("prebuilt-videoSearch") == "video"
+        assert _infer_prebuilt_modality("prebuilt-videoSynopsis") == "video"
+
+    def test_image_prebuilts_map_to_document(self):
+        assert _infer_prebuilt_modality("prebuilt-imageSearch") == "document"
+        assert _infer_prebuilt_modality("prebuilt-image") == "document"
+
+    def test_unknown_prebuilt_defaults_to_document(self):
+        assert _infer_prebuilt_modality("prebuilt-unknownNewAnalyzer") == "document"
+
+
+# ---------------------------------------------------------------------------
+# _get_modality tests
+# ---------------------------------------------------------------------------
+
+class TestGetModality:
+    """Test file type → modality mapping."""
+
+    def test_document_types(self):
+        assert _get_modality(ContentUnderstandingFileType.PDF) == "document"
+        assert _get_modality(ContentUnderstandingFileType.DOCX) == "document"
+        assert _get_modality(ContentUnderstandingFileType.JPEG) == "document"
+
+    def test_video_types(self):
+        assert _get_modality(ContentUnderstandingFileType.MP4) == "video"
+        assert _get_modality(ContentUnderstandingFileType.MOV) == "video"
+
+    def test_audio_types(self):
+        assert _get_modality(ContentUnderstandingFileType.WAV) == "audio"
+        assert _get_modality(ContentUnderstandingFileType.MP3) == "audio"
+
+
+# ---------------------------------------------------------------------------
+# convert() mock tests
+# ---------------------------------------------------------------------------
+
+class TestConvertMock:
+    """Test convert() with mocked CU SDK."""
+
+    def _run_convert(self, extension, mimetype, expected_output="mock output"):
+        conv = _make_converter()
+        conv._client = MagicMock()
+
+        mock_result = MagicMock()
+        mock_result.contents = []
+        mock_poller = MagicMock()
+        mock_poller.result.return_value = mock_result
+        conv._client.begin_analyze_binary.return_value = mock_poller
+
+        with patch(
+            "markitdown.converters._cu_converter.to_llm_input",
+            return_value=expected_output,
+        ):
+            result = conv.convert(
+                io.BytesIO(b"fake content"),
+                StreamInfo(extension=extension, mimetype=mimetype),
+            )
+        return result
+
+    def test_pdf_returns_markdown(self):
+        result = self._run_convert(".pdf", "application/pdf", "---\ncontentType: document\n---\n# Test")
+        assert "contentType: document" in result.markdown
+
+    def test_mp4_returns_markdown(self):
+        result = self._run_convert(".mp4", "video/mp4", "---\ncontentType: audioVisual\n---\nSpeaker 1: Hello")
+        assert "contentType: audioVisual" in result.markdown
+
+    def test_wav_returns_markdown(self):
+        result = self._run_convert(".wav", "audio/wav", "---\ncontentType: audioVisual\n---\nSpeaker 1: Hi")
+        assert "audioVisual" in result.markdown
+
+    def test_empty_result(self):
+        result = self._run_convert(".pdf", "application/pdf", "")
+        assert result.markdown == ""
+
+
+# ---------------------------------------------------------------------------
+# MissingDependencyException test
+# ---------------------------------------------------------------------------
+
+class TestMissingDependency:
+    """Test that MissingDependencyException is raised when CU SDK is not installed."""
+
+    def test_missing_deps_message(self):
+        """Verify the exception includes install hint."""
+        # We can't easily simulate ImportError in the module, but we can check
+        # the exception message pattern if it were raised.
+        from markitdown._exceptions import MissingDependencyException
+
+        exc = MissingDependencyException(
+            "ContentUnderstandingConverter requires the optional dependency "
+            "[az-content-understanding] (or [all]) to be installed."
+        )
+        assert "az-content-understanding" in str(exc)

From 1c70a82f937be6c3568f7115710fad1b7fd65359 Mon Sep 17 00:00:00 2001
From: chienyuanchang <ds.chienyuanchang@gmail.com>
Date: Wed, 6 May 2026 14:24:50 -0700
Subject: [PATCH 2/9] Improve MIME type detection

---
 README.md                                     |   9 +
 packages/markitdown/pyproject.toml            |   4 +-
 .../markitdown/converters/_cu_converter.py    | 111 +++++++----
 .../markitdown/tests/test_cu_converter.py     | 184 ++++++++++++++++--
 4 files changed, 249 insertions(+), 59 deletions(-)

diff --git a/README.md b/README.md
index 7d4936b36..005d7bdc0 100644
--- a/README.md
+++ b/README.md
@@ -107,6 +107,7 @@ At the moment, the following optional dependencies are available:
 * `[pdf]` Installs dependencies for PDF files
 * `[outlook]` Installs dependencies for Outlook messages
 * `[az-doc-intel]` Installs dependencies for Azure Document Intelligence
+* `[az-content-understanding]` Installs dependencies for Azure Content Understanding
 * `[audio-transcription]` Installs dependencies for audio transcription of wav and mp3 files
 * `[youtube-transcription]` Installs dependencies for fetching YouTube video transcription
 
@@ -174,6 +175,14 @@ More information about how to set up an Azure Document Intelligence Resource can
 
 Install: `pip install 'markitdown[az-content-understanding]'`
 
+| Capability | Built-in converters | Azure Document Intelligence | Azure Content Understanding |
+|------------|---------------------|-----------------------------|-----------------------------|
+| Document conversion | Offline, format-specific extraction | Cloud layout extraction | Cloud multimodal extraction |
+| Structured fields | Not available | Not exposed by this integration | YAML front matter from analyzer fields |
+| Custom analyzers | Not available | Not configurable in this integration | Supported with `cu_analyzer_id` |
+| Audio and video | Basic audio, no video | Not supported | Audio and video analyzers |
+| Cost | Local compute only | Billable Azure API calls | Billable Azure API calls |
+
 **CLI:**
 
 ```bash
diff --git a/packages/markitdown/pyproject.toml b/packages/markitdown/pyproject.toml
index 84841cd03..8366c0754 100644
--- a/packages/markitdown/pyproject.toml
+++ b/packages/markitdown/pyproject.toml
@@ -47,7 +47,7 @@ all = [
   "SpeechRecognition",
   "youtube-transcript-api~=1.0.0",
   "azure-ai-documentintelligence",
-  "azure-ai-contentunderstanding",
+  "azure-ai-contentunderstanding>=1.2.0b1",
   "azure-identity",
 ]
 pptx = ["python-pptx"]
@@ -59,7 +59,7 @@ outlook = ["olefile"]
 audio-transcription = ["pydub", "SpeechRecognition"]
 youtube-transcription = ["youtube-transcript-api"]
 az-doc-intel = ["azure-ai-documentintelligence", "azure-identity"]
-az-content-understanding = ["azure-ai-contentunderstanding", "azure-identity"]
+az-content-understanding = ["azure-ai-contentunderstanding>=1.2.0b1", "azure-identity"]
 
 [project.urls]
 Documentation = "https://github.com/microsoft/markitdown#readme"
diff --git a/packages/markitdown/src/markitdown/converters/_cu_converter.py b/packages/markitdown/src/markitdown/converters/_cu_converter.py
index 23bf3ae62..b2f6c80ba 100644
--- a/packages/markitdown/src/markitdown/converters/_cu_converter.py
+++ b/packages/markitdown/src/markitdown/converters/_cu_converter.py
@@ -83,7 +83,7 @@ class ContentUnderstandingFileType(str, Enum):
     MOV = "mov"
     AVI = "avi"
     MKV = "mkv"
-    WEBM_VIDEO = "webm-video"
+    WEBM = "webm"
     FLV = "flv"
     WMV = "wmv"
 
@@ -127,7 +127,7 @@ class ContentUnderstandingFileType(str, Enum):
     ".mov": ContentUnderstandingFileType.MOV,
     ".avi": ContentUnderstandingFileType.AVI,
     ".mkv": ContentUnderstandingFileType.MKV,
-    ".webm": ContentUnderstandingFileType.WEBM_VIDEO,
+    ".webm": ContentUnderstandingFileType.WEBM,
     ".flv": ContentUnderstandingFileType.FLV,
     ".wmv": ContentUnderstandingFileType.WMV,
     # Audio
@@ -173,19 +173,26 @@ class ContentUnderstandingFileType(str, Enum):
     ContentUnderstandingFileType.MOV: ["video/quicktime"],
     ContentUnderstandingFileType.AVI: ["video/x-msvideo"],
     ContentUnderstandingFileType.MKV: ["video/x-matroska"],
-    ContentUnderstandingFileType.WEBM_VIDEO: ["video/webm"],
+    ContentUnderstandingFileType.WEBM: ["video/webm"],
     ContentUnderstandingFileType.FLV: ["video/x-flv"],
     ContentUnderstandingFileType.WMV: ["video/x-ms-wmv"],
     # Audio
     ContentUnderstandingFileType.WAV: ["audio/wav", "audio/x-wav"],
     ContentUnderstandingFileType.MP3: ["audio/mpeg", "audio/mp3"],
-    ContentUnderstandingFileType.M4A: ["audio/mp4", "audio/m4a"],
+    ContentUnderstandingFileType.M4A: ["audio/mp4", "audio/m4a", "audio/x-m4a"],
     ContentUnderstandingFileType.FLAC: ["audio/flac", "audio/x-flac"],
     ContentUnderstandingFileType.OGG: ["audio/ogg"],
     ContentUnderstandingFileType.AAC: ["audio/aac"],
     ContentUnderstandingFileType.WMA: ["audio/x-ms-wma"],
 }
 
+_MIME_ALIASES: Dict[str, str] = {
+    "audio/x-wav": "audio/wav",
+    "audio/x-flac": "audio/flac",
+    "audio/x-m4a": "audio/mp4",
+    "video/x-m4v": "video/mp4",
+}
+
 # File type → modality category
 _DOCUMENT_TYPES = {
     ContentUnderstandingFileType.PDF,
@@ -212,7 +219,7 @@ class ContentUnderstandingFileType(str, Enum):
     ContentUnderstandingFileType.MOV,
     ContentUnderstandingFileType.AVI,
     ContentUnderstandingFileType.MKV,
-    ContentUnderstandingFileType.WEBM_VIDEO,
+    ContentUnderstandingFileType.WEBM,
     ContentUnderstandingFileType.FLV,
     ContentUnderstandingFileType.WMV,
 }
@@ -248,6 +255,62 @@ def _get_modality(file_type: ContentUnderstandingFileType) -> str:
     raise ValueError(f"Unknown file type: {file_type}")
 
 
+def _detect_file_type(
+    stream_info: StreamInfo,
+    file_types: Optional[List[ContentUnderstandingFileType]] = None,
+) -> Optional[ContentUnderstandingFileType]:
+    """Detect a supported CU file type from extension or MIME type."""
+    allowed = set(file_types) if file_types is not None else None
+
+    extension = (stream_info.extension or "").lower()
+    file_type = _EXTENSION_MAP.get(extension)
+    if file_type is not None and (allowed is None or file_type in allowed):
+        return file_type
+
+    mimetype = _clean_mime_type(stream_info.mimetype)
+    if not mimetype:
+        return None
+
+    return _detect_file_type_from_mime(mimetype, allowed)
+
+
+def _clean_mime_type(mimetype: Optional[str]) -> str:
+    return (mimetype or "").split(";", 1)[0].strip().lower()
+
+
+def _canonical_mime_type(mimetype: Optional[str]) -> str:
+    cleaned = _clean_mime_type(mimetype)
+    return _MIME_ALIASES.get(cleaned, cleaned) or "application/octet-stream"
+
+
+def _content_type_for(
+    file_type: ContentUnderstandingFileType,
+    mimetype: Optional[str],
+) -> str:
+    content_type = _canonical_mime_type(mimetype)
+    if content_type != "application/octet-stream":
+        return content_type
+
+    prefixes = _MIME_PREFIXES.get(file_type, [])
+    if not prefixes:
+        return content_type
+
+    return _canonical_mime_type(prefixes[0])
+
+
+def _detect_file_type_from_mime(
+    mimetype: str,
+    allowed: Optional[set[ContentUnderstandingFileType]],
+) -> Optional[ContentUnderstandingFileType]:
+    for candidate, prefixes in _MIME_PREFIXES.items():
+        if allowed is not None and candidate not in allowed:
+            continue
+        for prefix in prefixes:
+            if mimetype.startswith(prefix):
+                return candidate
+    return None
+
+
 # ---------------------------------------------------------------------------
 # Smart routing: base_analyzer_id → modality mapping
 # ---------------------------------------------------------------------------
@@ -377,18 +440,6 @@ def __init__(
             else:
                 credential = DefaultAzureCredential()
 
-        # Build file type lookup sets
-        self._accepted_extensions = set()
-        self._accepted_mime_prefixes: List[str] = []
-        for ft in self._file_types:
-            # Extensions
-            for ext, mapped_ft in _EXTENSION_MAP.items():
-                if mapped_ft == ft:
-                    self._accepted_extensions.add(ext)
-            # MIME prefixes
-            if ft in _MIME_PREFIXES:
-                self._accepted_mime_prefixes.extend(_MIME_PREFIXES[ft])
-
         # User agent for telemetry
         try:
             from ..__about__ import __version__
@@ -425,17 +476,7 @@ def accepts(
         **kwargs: Any,
     ) -> bool:
         """Return True if the file type is in the configured set."""
-        mimetype = (stream_info.mimetype or "").lower()
-        extension = (stream_info.extension or "").lower()
-
-        if extension in self._accepted_extensions:
-            return True
-
-        for prefix in self._accepted_mime_prefixes:
-            if mimetype.startswith(prefix):
-                return True
-
-        return False
+        return _detect_file_type(stream_info, self._file_types) is not None
 
     def convert(
         self,
@@ -446,14 +487,10 @@ def convert(
         """Convert the file using CU and return Markdown with YAML front matter."""
 
         # 1. Determine analyzer_id (smart routing: check modality)
-        extension = (stream_info.extension or "").lower()
-        file_type = _EXTENSION_MAP.get(extension)
-
-        if file_type is not None:
-            file_modality = _get_modality(file_type)
-        else:
-            # Fallback: try MIME type
-            file_modality = "document"
+        file_type = _detect_file_type(stream_info, self._file_types)
+        if file_type is None:
+            raise ValueError("Unsupported file type for Content Understanding conversion.")
+        file_modality = _get_modality(file_type)
 
         if (
             self._analyzer_id is not None
@@ -466,7 +503,7 @@ def convert(
 
         # 2. Read file bytes and determine MIME type
         file_bytes = file_stream.read()
-        content_type = stream_info.mimetype or "application/octet-stream"
+        content_type = _content_type_for(file_type, stream_info.mimetype)
 
         # 3. Call CU SDK
         poller = self._client.begin_analyze_binary(
diff --git a/packages/markitdown/tests/test_cu_converter.py b/packages/markitdown/tests/test_cu_converter.py
index 2a9fba773..b5e87ba34 100644
--- a/packages/markitdown/tests/test_cu_converter.py
+++ b/packages/markitdown/tests/test_cu_converter.py
@@ -5,7 +5,7 @@
 """
 
 import io
-from unittest.mock import MagicMock, patch, PropertyMock
+from unittest.mock import MagicMock, patch
 
 import pytest
 
@@ -14,6 +14,9 @@
     ContentUnderstandingFileType,
     _infer_prebuilt_modality,
     _get_modality,
+    _detect_file_type,
+    _canonical_mime_type,
+    _content_type_for,
     _EXTENSION_MAP,
 )
 from markitdown._stream_info import StreamInfo
@@ -29,24 +32,14 @@ def _make_converter(file_types=None, analyzer_id=None, analyzer_modality=None):
     conv._analyzer_id = analyzer_id
     conv._analyzer_modality = analyzer_modality
 
-    # Build accepted extensions/mime from file_types
+    # Set accepted file types without running SDK-dependent initialization.
     from markitdown.converters._cu_converter import (
         _ALL_FILE_TYPES,
-        _MIME_PREFIXES,
     )
 
     types = file_types if file_types is not None else _ALL_FILE_TYPES
     conv._file_types = types
 
-    conv._accepted_extensions = set()
-    conv._accepted_mime_prefixes = []
-    for ft in types:
-        for ext, mapped_ft in _EXTENSION_MAP.items():
-            if mapped_ft == ft:
-                conv._accepted_extensions.add(ext)
-        if ft in _MIME_PREFIXES:
-            conv._accepted_mime_prefixes.extend(_MIME_PREFIXES[ft])
-
     return conv
 
 
@@ -68,7 +61,9 @@ def test_accepts_supported_extensions(self, ext):
         conv = _make_converter()
         assert conv.accepts(io.BytesIO(b""), StreamInfo(extension=ext))
 
-    @pytest.mark.parametrize("ext", [".csv", ".json", ".zip", ".epub", ".py", ".rs"])
+    @pytest.mark.parametrize("ext", [
+        ".csv", ".json", ".zip", ".epub", ".py", ".rs",
+    ])
     def test_rejects_unsupported_extensions(self, ext):
         conv = _make_converter()
         assert not conv.accepts(io.BytesIO(b""), StreamInfo(extension=ext))
@@ -86,9 +81,18 @@ class TestAcceptsMime:
         "image/jpeg",
         "video/mp4",
         "audio/wav",
+        "audio/x-wav",
         "text/html",
         "audio/mpeg",
+        "audio/x-m4a",
+        "audio/x-flac",
         "video/quicktime",
+        "video/webm",
+        "video/x-m4v",
+        "video/x-flv",
+        "video/x-ms-wmv",
+        "audio/aac",
+        "audio/x-ms-wma",
     ])
     def test_accepts_supported_mimetypes(self, mime):
         conv = _make_converter()
@@ -127,6 +131,69 @@ def test_restricted_to_audio(self):
         assert conv.accepts(io.BytesIO(b""), StreamInfo(extension=".mp3"))
         assert not conv.accepts(io.BytesIO(b""), StreamInfo(extension=".pdf"))
 
+    def test_webm_value_matches_cli_input(self):
+        assert ContentUnderstandingFileType("webm") == ContentUnderstandingFileType.WEBM
+
+    def test_m4v_value_matches_cli_input(self):
+        assert ContentUnderstandingFileType("m4v") == ContentUnderstandingFileType.M4V
+
+
+# ---------------------------------------------------------------------------
+# file type detection tests
+# ---------------------------------------------------------------------------
+
+class TestDetectFileType:
+    """Test extension and MIME based file type detection."""
+
+    def test_detects_video_from_mime_without_extension(self):
+        assert (
+            _detect_file_type(StreamInfo(mimetype="video/mp4"))
+            == ContentUnderstandingFileType.MP4
+        )
+
+    def test_detects_audio_from_mime_without_extension(self):
+        assert (
+            _detect_file_type(StreamInfo(mimetype="audio/mpeg"))
+            == ContentUnderstandingFileType.MP3
+        )
+
+    def test_detects_audio_alias_from_mime_without_extension(self):
+        assert (
+            _detect_file_type(StreamInfo(mimetype="audio/x-wav"))
+            == ContentUnderstandingFileType.WAV
+        )
+
+    def test_detects_video_alias_from_mime_without_extension(self):
+        assert (
+            _detect_file_type(StreamInfo(mimetype="video/x-m4v"))
+            == ContentUnderstandingFileType.M4V
+        )
+
+    @pytest.mark.parametrize(("mimetype", "expected"), [
+        ("audio/x-wav", "audio/wav"),
+        ("audio/x-flac", "audio/flac"),
+        ("audio/x-m4a", "audio/mp4"),
+        ("video/x-m4v", "video/mp4"),
+        ("video/mp4", "video/mp4"),
+        (None, "application/octet-stream"),
+    ])
+    def test_canonical_mime_type(self, mimetype, expected):
+        assert _canonical_mime_type(mimetype) == expected
+
+    @pytest.mark.parametrize(("file_type", "mimetype", "expected"), [
+        (ContentUnderstandingFileType.PDF, None, "application/pdf"),
+        (ContentUnderstandingFileType.M4V, None, "video/mp4"),
+        (ContentUnderstandingFileType.FLAC, "audio/x-flac", "audio/flac"),
+    ])
+    def test_content_type_for(self, file_type, mimetype, expected):
+        assert _content_type_for(file_type, mimetype) == expected
+
+    def test_file_type_restriction_applies_to_mime(self):
+        assert _detect_file_type(
+            StreamInfo(mimetype="video/mp4"),
+            [ContentUnderstandingFileType.PDF],
+        ) is None
+
 
 # ---------------------------------------------------------------------------
 # Smart routing tests
@@ -150,7 +217,10 @@ def test_document_analyzer_routes_pdf_to_custom(self):
         conv._client.begin_analyze_binary.return_value = mock_poller
 
         with patch("markitdown.converters._cu_converter.to_llm_input", return_value=""):
-            conv.convert(io.BytesIO(b"fake pdf"), StreamInfo(extension=".pdf", mimetype="application/pdf"))
+            conv.convert(
+                io.BytesIO(b"fake pdf"),
+                StreamInfo(extension=".pdf", mimetype="application/pdf"),
+            )
 
         # Should use the custom analyzer for PDF (document modality)
         call_args = conv._client.begin_analyze_binary.call_args
@@ -171,7 +241,10 @@ def test_document_analyzer_routes_mp3_to_prebuilt(self):
         conv._client.begin_analyze_binary.return_value = mock_poller
 
         with patch("markitdown.converters._cu_converter.to_llm_input", return_value=""):
-            conv.convert(io.BytesIO(b"fake audio"), StreamInfo(extension=".mp3", mimetype="audio/mpeg"))
+            conv.convert(
+                io.BytesIO(b"fake audio"),
+                StreamInfo(extension=".mp3", mimetype="audio/mpeg"),
+            )
 
         call_args = conv._client.begin_analyze_binary.call_args
         assert call_args.kwargs["analyzer_id"] == "prebuilt-audioSearch"
@@ -191,7 +264,10 @@ def test_document_analyzer_routes_mp4_to_prebuilt(self):
         conv._client.begin_analyze_binary.return_value = mock_poller
 
         with patch("markitdown.converters._cu_converter.to_llm_input", return_value=""):
-            conv.convert(io.BytesIO(b"fake video"), StreamInfo(extension=".mp4", mimetype="video/mp4"))
+            conv.convert(
+                io.BytesIO(b"fake video"),
+                StreamInfo(extension=".mp4", mimetype="video/mp4"),
+            )
 
         call_args = conv._client.begin_analyze_binary.call_args
         assert call_args.kwargs["analyzer_id"] == "prebuilt-videoSearch"
@@ -208,10 +284,72 @@ def test_no_analyzer_id_uses_auto_routing(self):
         conv._client.begin_analyze_binary.return_value = mock_poller
 
         with patch("markitdown.converters._cu_converter.to_llm_input", return_value=""):
-            conv.convert(io.BytesIO(b"fake pdf"), StreamInfo(extension=".pdf", mimetype="application/pdf"))
+            conv.convert(
+                io.BytesIO(b"fake pdf"),
+                StreamInfo(extension=".pdf", mimetype="application/pdf"),
+            )
+
+        call_args = conv._client.begin_analyze_binary.call_args
+        assert call_args.kwargs["analyzer_id"] == "prebuilt-documentSearch"
+
+    @pytest.mark.parametrize(("mimetype", "expected_analyzer"), [
+        ("video/mp4", "prebuilt-videoSearch"),
+        ("video/x-m4v", "prebuilt-videoSearch"),
+        ("audio/mpeg", "prebuilt-audioSearch"),
+        ("audio/x-wav", "prebuilt-audioSearch"),
+    ])
+    def test_mime_only_input_uses_auto_routing(self, mimetype, expected_analyzer):
+        """MIME-only streams should route to the matching modality analyzer."""
+        conv = _make_converter(analyzer_id=None, analyzer_modality=None)
+        conv._client = MagicMock()
+        mock_result = MagicMock()
+        mock_result.contents = []
+        mock_poller = MagicMock()
+        mock_poller.result.return_value = mock_result
+
+        conv._client.begin_analyze_binary.return_value = mock_poller
+
+        with patch("markitdown.converters._cu_converter.to_llm_input", return_value=""):
+            conv.convert(io.BytesIO(b"fake content"), StreamInfo(mimetype=mimetype))
+
+        call_args = conv._client.begin_analyze_binary.call_args
+        assert call_args.kwargs["analyzer_id"] == expected_analyzer
+
+    def test_mime_alias_input_uses_canonical_content_type(self):
+        """Alias MIME types should be sent to CU as canonical content types."""
+        conv = _make_converter(analyzer_id=None, analyzer_modality=None)
+        conv._client = MagicMock()
+        mock_result = MagicMock()
+        mock_result.contents = []
+        mock_poller = MagicMock()
+        mock_poller.result.return_value = mock_result
+
+        conv._client.begin_analyze_binary.return_value = mock_poller
+
+        with patch("markitdown.converters._cu_converter.to_llm_input", return_value=""):
+            conv.convert(io.BytesIO(b"fake video"), StreamInfo(mimetype="video/x-m4v"))
+
+        call_args = conv._client.begin_analyze_binary.call_args
+        assert call_args.kwargs["analyzer_id"] == "prebuilt-videoSearch"
+        assert call_args.kwargs["content_type"] == "video/mp4"
+
+    def test_extension_only_input_uses_file_type_content_type(self):
+        """Extension-only inputs should send CU a matching content type."""
+        conv = _make_converter(analyzer_id=None, analyzer_modality=None)
+        conv._client = MagicMock()
+        mock_result = MagicMock()
+        mock_result.contents = []
+        mock_poller = MagicMock()
+        mock_poller.result.return_value = mock_result
+
+        conv._client.begin_analyze_binary.return_value = mock_poller
+
+        with patch("markitdown.converters._cu_converter.to_llm_input", return_value=""):
+            conv.convert(io.BytesIO(b"fake pdf"), StreamInfo(extension=".pdf"))
 
         call_args = conv._client.begin_analyze_binary.call_args
         assert call_args.kwargs["analyzer_id"] == "prebuilt-documentSearch"
+        assert call_args.kwargs["content_type"] == "application/pdf"
 
 
 # ---------------------------------------------------------------------------
@@ -293,15 +431,21 @@ def _run_convert(self, extension, mimetype, expected_output="mock output"):
         return result
 
     def test_pdf_returns_markdown(self):
-        result = self._run_convert(".pdf", "application/pdf", "---\ncontentType: document\n---\n# Test")
+        result = self._run_convert(
+            ".pdf", "application/pdf", "---\ncontentType: document\n---\n# Test"
+        )
         assert "contentType: document" in result.markdown
 
     def test_mp4_returns_markdown(self):
-        result = self._run_convert(".mp4", "video/mp4", "---\ncontentType: audioVisual\n---\nSpeaker 1: Hello")
+        result = self._run_convert(
+            ".mp4", "video/mp4", "---\ncontentType: audioVisual\n---\nSpeaker 1: Hello"
+        )
         assert "contentType: audioVisual" in result.markdown
 
     def test_wav_returns_markdown(self):
-        result = self._run_convert(".wav", "audio/wav", "---\ncontentType: audioVisual\n---\nSpeaker 1: Hi")
+        result = self._run_convert(
+            ".wav", "audio/wav", "---\ncontentType: audioVisual\n---\nSpeaker 1: Hi"
+        )
         assert "audioVisual" in result.markdown
 
     def test_empty_result(self):

From 24ba4f2361fa3a24f59f82c672aa2a1a0907109d Mon Sep 17 00:00:00 2001
From: chienyuanchang <ds.chienyuanchang@gmail.com>
Date: Wed, 6 May 2026 15:11:01 -0700
Subject: [PATCH 3/9] prebuilt-image custom analyzer route to image

---
 .../markitdown/converters/_cu_converter.py    |  21 +++-
 .../markitdown/tests/test_cu_converter.py     | 100 +++++++++++++++++-
 2 files changed, 113 insertions(+), 8 deletions(-)

diff --git a/packages/markitdown/src/markitdown/converters/_cu_converter.py b/packages/markitdown/src/markitdown/converters/_cu_converter.py
index b2f6c80ba..b3a77494f 100644
--- a/packages/markitdown/src/markitdown/converters/_cu_converter.py
+++ b/packages/markitdown/src/markitdown/converters/_cu_converter.py
@@ -206,6 +206,9 @@ class ContentUnderstandingFileType(str, Enum):
     ContentUnderstandingFileType.XML,
     ContentUnderstandingFileType.EML,
     ContentUnderstandingFileType.MSG,
+}
+
+_IMAGE_TYPES = {
     ContentUnderstandingFileType.JPEG,
     ContentUnderstandingFileType.PNG,
     ContentUnderstandingFileType.BMP,
@@ -236,6 +239,7 @@ class ContentUnderstandingFileType(str, Enum):
 
 _DEFAULT_ANALYZERS = {
     "document": "prebuilt-documentSearch",
+    "image": "prebuilt-documentSearch",
     "video": "prebuilt-videoSearch",
     "audio": "prebuilt-audioSearch",
 }
@@ -248,6 +252,8 @@ def _get_modality(file_type: ContentUnderstandingFileType) -> str:
     """Get the modality category for a file type."""
     if file_type in _DOCUMENT_TYPES:
         return "document"
+    elif file_type in _IMAGE_TYPES:
+        return "image"
     elif file_type in _VIDEO_TYPES:
         return "video"
     elif file_type in _AUDIO_TYPES:
@@ -317,7 +323,7 @@ def _detect_file_type_from_mime(
 
 _BASE_TO_MODALITY: Dict[str, str] = {
     "prebuilt-document": "document",
-    "prebuilt-image": "document",  # CU images return kind="document"
+    "prebuilt-image": "image",
     "prebuilt-audio": "audio",
     "prebuilt-video": "video",
 }
@@ -359,8 +365,8 @@ def _detect_file_type_from_mime(
     "prebuilt-mortgage.us.1003": "document",
     "prebuilt-mortgage.us.closingDisclosure": "document",
     # Image-based prebuilts
-    "prebuilt-image": "document",  # images are document modality in CU
-    "prebuilt-imageSearch": "document",
+    "prebuilt-image": "image",
+    "prebuilt-imageSearch": "image",
     # Audio-based prebuilts
     "prebuilt-audio": "audio",
     "prebuilt-audioSearch": "audio",
@@ -380,6 +386,13 @@ def _infer_prebuilt_modality(analyzer_id: str) -> str:
     return "document"
 
 
+def _is_analyzer_compatible(file_modality: str, analyzer_modality: str) -> bool:
+    """Return True when an analyzer modality can process a file modality."""
+    if analyzer_modality == "document":
+        return file_modality in {"document", "image"}
+    return file_modality == analyzer_modality
+
+
 # ---------------------------------------------------------------------------
 # Converter
 # ---------------------------------------------------------------------------
@@ -495,7 +508,7 @@ def convert(
         if (
             self._analyzer_id is not None
             and self._analyzer_modality is not None
-            and file_modality == self._analyzer_modality
+            and _is_analyzer_compatible(file_modality, self._analyzer_modality)
         ):
             analyzer_id = self._analyzer_id
         else:
diff --git a/packages/markitdown/tests/test_cu_converter.py b/packages/markitdown/tests/test_cu_converter.py
index b5e87ba34..7cd556461 100644
--- a/packages/markitdown/tests/test_cu_converter.py
+++ b/packages/markitdown/tests/test_cu_converter.py
@@ -292,6 +292,95 @@ def test_no_analyzer_id_uses_auto_routing(self):
         call_args = conv._client.begin_analyze_binary.call_args
         assert call_args.kwargs["analyzer_id"] == "prebuilt-documentSearch"
 
+    def test_no_analyzer_id_routes_image_to_document_search(self):
+        """Default image routing should still use prebuilt-documentSearch."""
+        conv = _make_converter(analyzer_id=None, analyzer_modality=None)
+        conv._client = MagicMock()
+        mock_result = MagicMock()
+        mock_result.contents = []
+        mock_poller = MagicMock()
+        mock_poller.result.return_value = mock_result
+
+        conv._client.begin_analyze_binary.return_value = mock_poller
+
+        with patch("markitdown.converters._cu_converter.to_llm_input", return_value=""):
+            conv.convert(
+                io.BytesIO(b"fake image"),
+                StreamInfo(extension=".jpg", mimetype="image/jpeg"),
+            )
+
+        call_args = conv._client.begin_analyze_binary.call_args
+        assert call_args.kwargs["analyzer_id"] == "prebuilt-documentSearch"
+
+    def test_document_analyzer_routes_image_to_custom(self):
+        """Document-based analyzers should still handle image documents."""
+        conv = _make_converter(
+            analyzer_id="my-doc-analyzer",
+            analyzer_modality="document",
+        )
+        conv._client = MagicMock()
+        mock_result = MagicMock()
+        mock_result.contents = []
+        mock_poller = MagicMock()
+        mock_poller.result.return_value = mock_result
+
+        conv._client.begin_analyze_binary.return_value = mock_poller
+
+        with patch("markitdown.converters._cu_converter.to_llm_input", return_value=""):
+            conv.convert(
+                io.BytesIO(b"fake image"),
+                StreamInfo(extension=".jpg", mimetype="image/jpeg"),
+            )
+
+        call_args = conv._client.begin_analyze_binary.call_args
+        assert call_args.kwargs["analyzer_id"] == "my-doc-analyzer"
+
+    def test_image_analyzer_routes_jpeg_to_custom(self):
+        """Image-based analyzers should be used for image files."""
+        conv = _make_converter(
+            analyzer_id="my-image-analyzer",
+            analyzer_modality="image",
+        )
+        conv._client = MagicMock()
+        mock_result = MagicMock()
+        mock_result.contents = []
+        mock_poller = MagicMock()
+        mock_poller.result.return_value = mock_result
+
+        conv._client.begin_analyze_binary.return_value = mock_poller
+
+        with patch("markitdown.converters._cu_converter.to_llm_input", return_value=""):
+            conv.convert(
+                io.BytesIO(b"fake image"),
+                StreamInfo(extension=".jpg", mimetype="image/jpeg"),
+            )
+
+        call_args = conv._client.begin_analyze_binary.call_args
+        assert call_args.kwargs["analyzer_id"] == "my-image-analyzer"
+
+    def test_image_analyzer_routes_pdf_to_document_prebuilt(self):
+        """Image-based analyzers should not claim non-image document files."""
+        conv = _make_converter(
+            analyzer_id="my-image-analyzer",
+            analyzer_modality="image",
+        )
+        conv._client = MagicMock()
+        mock_result = MagicMock()
+        mock_result.contents = []
+        mock_poller = MagicMock()
+        mock_poller.result.return_value = mock_result
+
+        conv._client.begin_analyze_binary.return_value = mock_poller
+
+        with patch("markitdown.converters._cu_converter.to_llm_input", return_value=""):
+            conv.convert(
+                io.BytesIO(b"fake pdf"),
+                StreamInfo(extension=".pdf", mimetype="application/pdf"),
+            )
+
+        call_args = conv._client.begin_analyze_binary.call_args
+        assert call_args.kwargs["analyzer_id"] == "prebuilt-documentSearch"
+
     @pytest.mark.parametrize(("mimetype", "expected_analyzer"), [
         ("video/mp4", "prebuilt-videoSearch"),
         ("video/x-m4v", "prebuilt-videoSearch"),
@@ -374,9 +463,9 @@ def test_video_prebuilts(self):
         assert _infer_prebuilt_modality("prebuilt-videoSearch") == "video"
         assert _infer_prebuilt_modality("prebuilt-videoSynopsis") == "video"
 
-    def test_image_prebuilts_map_to_document(self):
-        assert _infer_prebuilt_modality("prebuilt-imageSearch") == "document"
-        assert _infer_prebuilt_modality("prebuilt-image") == "document"
+    def test_image_prebuilts_map_to_image(self):
+        assert _infer_prebuilt_modality("prebuilt-imageSearch") == "image"
+        assert _infer_prebuilt_modality("prebuilt-image") == "image"
 
     def test_unknown_prebuilt_defaults_to_document(self):
         assert _infer_prebuilt_modality("prebuilt-unknownNewAnalyzer") == "document"
@@ -392,7 +481,10 @@ class TestGetModality:
     def test_document_types(self):
         assert _get_modality(ContentUnderstandingFileType.PDF) == "document"
         assert _get_modality(ContentUnderstandingFileType.DOCX) == "document"
-        assert _get_modality(ContentUnderstandingFileType.JPEG) == "document"
+
+    def test_image_types(self):
+        assert _get_modality(ContentUnderstandingFileType.JPEG) == "image"
+        assert _get_modality(ContentUnderstandingFileType.PNG) == "image"
 
     def test_video_types(self):
         assert _get_modality(ContentUnderstandingFileType.MP4) == "video"

From d91d5ddb16062ef41881cb91e001d97083361304 Mon Sep 17 00:00:00 2001
From: chienyuanchang <ds.chienyuanchang@gmail.com>
Date: Wed, 6 May 2026 16:07:06 -0700
Subject: [PATCH 4/9] enhance cu priority over di

---
 packages/markitdown/pyproject.toml            |  1 +
 .../markitdown/src/markitdown/__main__.py     | 19 +++---
 .../markitdown/converters/_cu_converter.py    |  7 +-
 .../markitdown/tests/test_cu_converter.py     | 65 +++++++++++++++++++
 4 files changed, 82 insertions(+), 10 deletions(-)

diff --git a/packages/markitdown/pyproject.toml b/packages/markitdown/pyproject.toml
index 8366c0754..d4c20a402 100644
--- a/packages/markitdown/pyproject.toml
+++ b/packages/markitdown/pyproject.toml
@@ -59,6 +59,7 @@ outlook = ["olefile"]
 audio-transcription = ["pydub", "SpeechRecognition"]
 youtube-transcription = ["youtube-transcript-api"]
 az-doc-intel = ["azure-ai-documentintelligence", "azure-identity"]
+# >=1.2.0b1 required for to_llm_input() helper used by ContentUnderstandingConverter
 az-content-understanding = ["azure-ai-contentunderstanding>=1.2.0b1", "azure-identity"]
 
 [project.urls]
diff --git a/packages/markitdown/src/markitdown/__main__.py b/packages/markitdown/src/markitdown/__main__.py
index d57b2ae65..ac7d2f602 100644
--- a/packages/markitdown/src/markitdown/__main__.py
+++ b/packages/markitdown/src/markitdown/__main__.py
@@ -78,21 +78,15 @@ def main():
         help="Provide a hint about the file's charset (e.g, UTF-8).",
     )
 
-    parser.add_argument(
+    cloud_group = parser.add_mutually_exclusive_group()
+    cloud_group.add_argument(
         "-d",
         "--use-docintel",
         action="store_true",
         help="Use Document Intelligence to extract text instead of offline conversion. Requires a valid Document Intelligence Endpoint.",
     )
 
-    parser.add_argument(
-        "-e",
-        "--endpoint",
-        type=str,
-        help="Document Intelligence Endpoint. Required if using Document Intelligence.",
-    )
-
-    parser.add_argument(
+    cloud_group.add_argument(
         "--use-cu",
         "--use-content-understanding",
         action="store_true",
@@ -100,6 +94,13 @@ def main():
         help="Use Azure Content Understanding to extract text. Requires --cu-endpoint.",
     )
 
+    parser.add_argument(
+        "-e",
+        "--endpoint",
+        type=str,
+        help="Document Intelligence Endpoint. Required if using Document Intelligence.",
+    )
+
     parser.add_argument(
         "--cu-endpoint",
         type=str,
diff --git a/packages/markitdown/src/markitdown/converters/_cu_converter.py b/packages/markitdown/src/markitdown/converters/_cu_converter.py
index b3a77494f..de597f946 100644
--- a/packages/markitdown/src/markitdown/converters/_cu_converter.py
+++ b/packages/markitdown/src/markitdown/converters/_cu_converter.py
@@ -474,7 +474,12 @@ def __init__(
                 self._analyzer_modality = _infer_prebuilt_modality(self._analyzer_id)
             else:
                 # Custom analyzer — one get_analyzer() call, cached
-                analyzer_info = self._client.get_analyzer(self._analyzer_id)
+                try:
+                    analyzer_info = self._client.get_analyzer(self._analyzer_id)
+                except Exception as exc:
+                    raise ValueError(
+                        f"Failed to resolve analyzer '{self._analyzer_id}': {exc}"
+                    ) from exc
                 if analyzer_info.base_analyzer_id:
                     self._analyzer_modality = _BASE_TO_MODALITY.get(
                         analyzer_info.base_analyzer_id, "document"
diff --git a/packages/markitdown/tests/test_cu_converter.py b/packages/markitdown/tests/test_cu_converter.py
index 7cd556461..3eb88bbb4 100644
--- a/packages/markitdown/tests/test_cu_converter.py
+++ b/packages/markitdown/tests/test_cu_converter.py
@@ -545,6 +545,71 @@ def test_empty_result(self):
         assert result.markdown == ""
 
 
+# ---------------------------------------------------------------------------
+# Init-time get_analyzer() error wrapping
+# ---------------------------------------------------------------------------
+
+class TestGetAnalyzerError:
+    """Test that get_analyzer() failures at init produce a clear error."""
+
+    def test_nonexistent_analyzer_raises_value_error(self):
+        """A failed get_analyzer() should raise ValueError with analyzer name."""
+        with patch(
+            "markitdown.converters._cu_converter._dependency_exc_info", None
+        ), patch(
+            "markitdown.converters._cu_converter.ContentUnderstandingClient"
+        ) as MockClient, patch(
+            "markitdown.converters._cu_converter.DefaultAzureCredential"
+        ):
+            mock_client = MagicMock()
+            mock_client.get_analyzer.side_effect = Exception("not found")
+            MockClient.return_value = mock_client
+
+            with pytest.raises(ValueError, match="Failed to resolve analyzer 'bad-id'"):
+                ContentUnderstandingConverter(endpoint="https://fake", analyzer_id="bad-id")
+
+
+# ---------------------------------------------------------------------------
+# Registration priority test
+# ---------------------------------------------------------------------------
+
+class TestRegistrationPriority:
+    """Test that CU converter is registered with higher priority than Doc Intel."""
+
+    def test_cu_registered_before_docintel(self):
+        """When both endpoints are provided, CU should appear before Doc Intel."""
+        with patch(
+            "markitdown.converters._cu_converter._dependency_exc_info", None
+        ), patch(
+            "markitdown.converters._cu_converter.ContentUnderstandingClient"
+        ), patch(
+            "markitdown.converters._cu_converter.DefaultAzureCredential"
+        ), patch(
+            "markitdown.converters._doc_intel_converter._dependency_exc_info", None
+        ), patch(
+            "markitdown.converters._doc_intel_converter.DocumentIntelligenceClient"
+        ), patch(
+            "markitdown.converters._doc_intel_converter.DefaultAzureCredential"
+        ):
+            from markitdown import MarkItDown
+            from markitdown.converters import (
+                ContentUnderstandingConverter,
+                DocumentIntelligenceConverter,
+            )
+
+            md = MarkItDown(
+                cu_endpoint="https://fake-cu",
+                docintel_endpoint="https://fake-di",
+            )
+
+            converter_types = [
+                type(reg.converter) for reg in md._converters
+            ]
+            cu_idx = converter_types.index(ContentUnderstandingConverter)
+            di_idx = converter_types.index(DocumentIntelligenceConverter)
+            assert cu_idx < di_idx, "CU should have higher priority (lower index) than Doc Intel"
+
+
 # ---------------------------------------------------------------------------
 # MissingDependencyException test
 # ---------------------------------------------------------------------------

From f5e700838e358f46a21002130fe6882535e02872 Mon Sep 17 00:00:00 2001
From: chienyuanchang <ds.chienyuanchang@gmail.com>
Date: Wed, 6 May 2026 16:33:46 -0700
Subject: [PATCH 5/9] fix: apply black formatting

---
 .../markitdown/src/markitdown/__main__.py     |   4 +-
 .../markitdown/converters/_cu_converter.py    |   8 +-
 .../markitdown/tests/test_cu_converter.py     | 204 ++++++++++++------
 3 files changed, 148 insertions(+), 68 deletions(-)

diff --git a/packages/markitdown/src/markitdown/__main__.py b/packages/markitdown/src/markitdown/__main__.py
index ac7d2f602..ccb44b64b 100644
--- a/packages/markitdown/src/markitdown/__main__.py
+++ b/packages/markitdown/src/markitdown/__main__.py
@@ -228,7 +228,9 @@ def main():
             # Parse comma-separated file types into ContentUnderstandingFileType list
             from .converters import ContentUnderstandingFileType
 
-            type_names = [t.strip().lower() for t in args.cu_file_types.split(",") if t.strip()]
+            type_names = [
+                t.strip().lower() for t in args.cu_file_types.split(",") if t.strip()
+            ]
             cu_types = []
             for name in type_names:
                 # Try matching by value (e.g., "pdf", "jpeg", "mp4")
diff --git a/packages/markitdown/src/markitdown/converters/_cu_converter.py b/packages/markitdown/src/markitdown/converters/_cu_converter.py
index de597f946..3b1380c14 100644
--- a/packages/markitdown/src/markitdown/converters/_cu_converter.py
+++ b/packages/markitdown/src/markitdown/converters/_cu_converter.py
@@ -507,7 +507,9 @@ def convert(
         # 1. Determine analyzer_id (smart routing: check modality)
         file_type = _detect_file_type(stream_info, self._file_types)
         if file_type is None:
-            raise ValueError("Unsupported file type for Content Understanding conversion.")
+            raise ValueError(
+                "Unsupported file type for Content Understanding conversion."
+            )
         file_modality = _get_modality(file_type)
 
         if (
@@ -517,7 +519,9 @@ def convert(
         ):
             analyzer_id = self._analyzer_id
         else:
-            analyzer_id = _DEFAULT_ANALYZERS.get(file_modality, "prebuilt-documentSearch")
+            analyzer_id = _DEFAULT_ANALYZERS.get(
+                file_modality, "prebuilt-documentSearch"
+            )
 
         # 2. Read file bytes and determine MIME type
         file_bytes = file_stream.read()
diff --git a/packages/markitdown/tests/test_cu_converter.py b/packages/markitdown/tests/test_cu_converter.py
index 3eb88bbb4..70c51a5d1 100644
--- a/packages/markitdown/tests/test_cu_converter.py
+++ b/packages/markitdown/tests/test_cu_converter.py
@@ -26,6 +26,7 @@
 # Helper: create a converter with accepts() working but no SDK init
 # ---------------------------------------------------------------------------
 
+
 def _make_converter(file_types=None, analyzer_id=None, analyzer_modality=None):
     """Create a converter bypassing __init__ (no SDK deps needed)."""
     conv = ContentUnderstandingConverter.__new__(ContentUnderstandingConverter)
@@ -47,23 +48,64 @@ def _make_converter(file_types=None, analyzer_id=None, analyzer_modality=None):
 # accepts() tests — extension-based
 # ---------------------------------------------------------------------------
 
+
 class TestAcceptsExtension:
     """Test accepts() for supported and unsupported file extensions."""
 
-    @pytest.mark.parametrize("ext", [
-        ".pdf", ".docx", ".pptx", ".xlsx", ".html", ".txt", ".md", ".rtf", ".xml",
-        ".eml", ".msg",
-        ".jpg", ".jpeg", ".jpe", ".png", ".bmp", ".tiff", ".heif", ".heic",
-        ".mp4", ".m4v", ".mov", ".avi", ".mkv", ".webm", ".flv", ".wmv",
-        ".wav", ".mp3", ".m4a", ".flac", ".ogg", ".aac", ".wma",
-    ])
+    @pytest.mark.parametrize(
+        "ext",
+        [
+            ".pdf",
+            ".docx",
+            ".pptx",
+            ".xlsx",
+            ".html",
+            ".txt",
+            ".md",
+            ".rtf",
+            ".xml",
+            ".eml",
+            ".msg",
+            ".jpg",
+            ".jpeg",
+            ".jpe",
+            ".png",
+            ".bmp",
+            ".tiff",
+            ".heif",
+            ".heic",
+            ".mp4",
+            ".m4v",
+            ".mov",
+            ".avi",
+            ".mkv",
+            ".webm",
+            ".flv",
+            ".wmv",
+            ".wav",
+            ".mp3",
+            ".m4a",
+            ".flac",
+            ".ogg",
+            ".aac",
+            ".wma",
+        ],
+    )
     def test_accepts_supported_extensions(self, ext):
         conv = _make_converter()
         assert conv.accepts(io.BytesIO(b""), StreamInfo(extension=ext))
 
-    @pytest.mark.parametrize("ext", [
-        ".csv", ".json", ".zip", ".epub", ".py", ".rs",
-    ])
+    @pytest.mark.parametrize(
+        "ext",
+        [
+            ".csv",
+            ".json",
+            ".zip",
+            ".epub",
+            ".py",
+            ".rs",
+        ],
+    )
     def test_rejects_unsupported_extensions(self, ext):
         conv = _make_converter()
         assert not conv.accepts(io.BytesIO(b""), StreamInfo(extension=ext))
@@ -73,36 +115,43 @@ def test_rejects_unsupported_extensions(self, ext):
 # accepts() tests — MIME-based
 # ---------------------------------------------------------------------------
 
+
 class TestAcceptsMime:
     """Test accepts() for MIME type matching."""
 
-    @pytest.mark.parametrize("mime", [
-        "application/pdf",
-        "image/jpeg",
-        "video/mp4",
-        "audio/wav",
-        "audio/x-wav",
-        "text/html",
-        "audio/mpeg",
-        "audio/x-m4a",
-        "audio/x-flac",
-        "video/quicktime",
-        "video/webm",
-        "video/x-m4v",
-        "video/x-flv",
-        "video/x-ms-wmv",
-        "audio/aac",
-        "audio/x-ms-wma",
-    ])
+    @pytest.mark.parametrize(
+        "mime",
+        [
+            "application/pdf",
+            "image/jpeg",
+            "video/mp4",
+            "audio/wav",
+            "audio/x-wav",
+            "text/html",
+            "audio/mpeg",
+            "audio/x-m4a",
+            "audio/x-flac",
+            "video/quicktime",
+            "video/webm",
+            "video/x-m4v",
+            "video/x-flv",
+            "video/x-ms-wmv",
+            "audio/aac",
+            "audio/x-ms-wma",
+        ],
+    )
     def test_accepts_supported_mimetypes(self, mime):
         conv = _make_converter()
         assert conv.accepts(io.BytesIO(b""), StreamInfo(mimetype=mime))
 
-    @pytest.mark.parametrize("mime", [
-        "text/csv",
-        "application/json",
-        "application/zip",
-    ])
+    @pytest.mark.parametrize(
+        "mime",
+        [
+            "text/csv",
+            "application/json",
+            "application/zip",
+        ],
+    )
     def test_rejects_unsupported_mimetypes(self, mime):
         conv = _make_converter()
         assert not conv.accepts(io.BytesIO(b""), StreamInfo(mimetype=mime))
@@ -112,6 +161,7 @@ def test_rejects_unsupported_mimetypes(self, mime):
 # accepts() tests — cu_file_types restriction
 # ---------------------------------------------------------------------------
 
+
 class TestAcceptsFileTypeRestriction:
     """Test that cu_file_types restricts which formats are accepted."""
 
@@ -123,10 +173,12 @@ def test_restricted_to_pdf_only(self):
         assert not conv.accepts(io.BytesIO(b""), StreamInfo(extension=".jpg"))
 
     def test_restricted_to_audio(self):
-        conv = _make_converter(file_types=[
-            ContentUnderstandingFileType.WAV,
-            ContentUnderstandingFileType.MP3,
-        ])
+        conv = _make_converter(
+            file_types=[
+                ContentUnderstandingFileType.WAV,
+                ContentUnderstandingFileType.MP3,
+            ]
+        )
         assert conv.accepts(io.BytesIO(b""), StreamInfo(extension=".wav"))
         assert conv.accepts(io.BytesIO(b""), StreamInfo(extension=".mp3"))
         assert not conv.accepts(io.BytesIO(b""), StreamInfo(extension=".pdf"))
@@ -142,6 +194,7 @@ def test_m4v_value_matches_cli_input(self):
 # file type detection tests
 # ---------------------------------------------------------------------------
 
+
 class TestDetectFileType:
     """Test extension and MIME based file type detection."""
 
@@ -169,36 +222,46 @@ def test_detects_video_alias_from_mime_without_extension(self):
             == ContentUnderstandingFileType.M4V
         )
 
-    @pytest.mark.parametrize(("mimetype", "expected"), [
-        ("audio/x-wav", "audio/wav"),
-        ("audio/x-flac", "audio/flac"),
-        ("audio/x-m4a", "audio/mp4"),
-        ("video/x-m4v", "video/mp4"),
-        ("video/mp4", "video/mp4"),
-        (None, "application/octet-stream"),
-    ])
+    @pytest.mark.parametrize(
+        ("mimetype", "expected"),
+        [
+            ("audio/x-wav", "audio/wav"),
+            ("audio/x-flac", "audio/flac"),
+            ("audio/x-m4a", "audio/mp4"),
+            ("video/x-m4v", "video/mp4"),
+            ("video/mp4", "video/mp4"),
+            (None, "application/octet-stream"),
+        ],
+    )
     def test_canonical_mime_type(self, mimetype, expected):
         assert _canonical_mime_type(mimetype) == expected
 
-    @pytest.mark.parametrize(("file_type", "mimetype", "expected"), [
-        (ContentUnderstandingFileType.PDF, None, "application/pdf"),
-        (ContentUnderstandingFileType.M4V, None, "video/mp4"),
-        (ContentUnderstandingFileType.FLAC, "audio/x-flac", "audio/flac"),
-    ])
+    @pytest.mark.parametrize(
+        ("file_type", "mimetype", "expected"),
+        [
+            (ContentUnderstandingFileType.PDF, None, "application/pdf"),
+            (ContentUnderstandingFileType.M4V, None, "video/mp4"),
+            (ContentUnderstandingFileType.FLAC, "audio/x-flac", "audio/flac"),
+        ],
+    )
     def test_content_type_for(self, file_type, mimetype, expected):
         assert _content_type_for(file_type, mimetype) == expected
 
     def test_file_type_restriction_applies_to_mime(self):
-        assert _detect_file_type(
-            StreamInfo(mimetype="video/mp4"),
-            [ContentUnderstandingFileType.PDF],
-        ) is None
+        assert (
+            _detect_file_type(
+                StreamInfo(mimetype="video/mp4"),
+                [ContentUnderstandingFileType.PDF],
+            )
+            is None
+        )
 
 
 # ---------------------------------------------------------------------------
 # Smart routing tests
 # ---------------------------------------------------------------------------
 
+
 class TestSmartRouting:
     """Test modality-aware analyzer routing."""
 
@@ -381,12 +444,15 @@ def test_image_analyzer_routes_pdf_to_document_prebuilt(self):
         call_args = conv._client.begin_analyze_binary.call_args
         assert call_args.kwargs["analyzer_id"] == "prebuilt-documentSearch"
 
-    @pytest.mark.parametrize(("mimetype", "expected_analyzer"), [
-        ("video/mp4", "prebuilt-videoSearch"),
-        ("video/x-m4v", "prebuilt-videoSearch"),
-        ("audio/mpeg", "prebuilt-audioSearch"),
-        ("audio/x-wav", "prebuilt-audioSearch"),
-    ])
+    @pytest.mark.parametrize(
+        ("mimetype", "expected_analyzer"),
+        [
+            ("video/mp4", "prebuilt-videoSearch"),
+            ("video/x-m4v", "prebuilt-videoSearch"),
+            ("audio/mpeg", "prebuilt-audioSearch"),
+            ("audio/x-wav", "prebuilt-audioSearch"),
+        ],
+    )
     def test_mime_only_input_uses_auto_routing(self, mimetype, expected_analyzer):
         """MIME-only streams should route to the matching modality analyzer."""
         conv = _make_converter(analyzer_id=None, analyzer_modality=None)
@@ -445,6 +511,7 @@ def test_extension_only_input_uses_file_type_content_type(self):
 # _infer_prebuilt_modality tests
 # ---------------------------------------------------------------------------
 
+
 class TestInferPrebuiltModality:
     """Test modality inference from prebuilt analyzer names."""
 
@@ -475,6 +542,7 @@ def test_unknown_prebuilt_defaults_to_document(self):
 # _get_modality tests
 # ---------------------------------------------------------------------------
 
+
 class TestGetModality:
     """Test file type → modality mapping."""
 
@@ -499,6 +567,7 @@ def test_audio_types(self):
 # convert() mock tests
 # ---------------------------------------------------------------------------
 
+
 class TestConvertMock:
     """Test convert() with mocked CU SDK."""
 
@@ -549,6 +618,7 @@ def test_empty_result(self):
 # Init-time get_analyzer() error wrapping
 # ---------------------------------------------------------------------------
 
+
 class TestGetAnalyzerError:
     """Test that get_analyzer() failures at init produce a clear error."""
 
@@ -566,13 +636,16 @@ def test_nonexistent_analyzer_raises_value_error(self):
             MockClient.return_value = mock_client
 
             with pytest.raises(ValueError, match="Failed to resolve analyzer 'bad-id'"):
-                ContentUnderstandingConverter(endpoint="https://fake", analyzer_id="bad-id")
+                ContentUnderstandingConverter(
+                    endpoint="https://fake", analyzer_id="bad-id"
+                )
 
 
 # ---------------------------------------------------------------------------
 # Registration priority test
 # ---------------------------------------------------------------------------
 
+
 class TestRegistrationPriority:
     """Test that CU converter is registered with higher priority than Doc Intel."""
 
@@ -602,18 +675,19 @@ def test_cu_registered_before_docintel(self):
                 docintel_endpoint="https://fake-di",
             )
 
-            converter_types = [
-                type(reg.converter) for reg in md._converters
-            ]
+            converter_types = [type(reg.converter) for reg in md._converters]
             cu_idx = converter_types.index(ContentUnderstandingConverter)
             di_idx = converter_types.index(DocumentIntelligenceConverter)
-            assert cu_idx < di_idx, "CU should have higher priority (lower index) than Doc Intel"
+            assert (
+                cu_idx < di_idx
+            ), "CU should have higher priority (lower index) than Doc Intel"
 
 
 # ---------------------------------------------------------------------------
 # MissingDependencyException test
 # ---------------------------------------------------------------------------
 
+
 class TestMissingDependency:
     """Test that MissingDependencyException is raised when CU SDK is not installed."""
 

From e4b585a83268d9897ffdd4a53f75c4431c261ec0 Mon Sep 17 00:00:00 2001
From: chienyuanchang <ds.chienyuanchang@gmail.com>
Date: Thu, 7 May 2026 13:28:21 -0700
Subject: [PATCH 6/9] update cache of known prebuilt names and improve
 README

---
 README.md                                     |   9 +
 .../markitdown/converters/_cu_converter.py    |  69 ++++---
 .../markitdown/tests/test_cu_converter.py     | 168 +++++++++++++++---
 3 files changed, 194 insertions(+), 52 deletions(-)

diff --git a/README.md b/README.md
index 005d7bdc0..dd9edffdc 100644
--- a/README.md
+++ b/README.md
@@ -175,6 +175,15 @@ More information about how to set up an Azure Document Intelligence Resource can
 
 Install: `pip install 'markitdown[az-content-understanding]'`
 
+#### When to use Content Understanding
+
+Content Understanding is ideal when you need capabilities beyond what built-in or Document Intelligence converters provide:
+
+- **Audio and video files** — CU is the only option for converting MP4, MOV, WAV, MP3, and other media files. Built-in converters have no video support and only basic audio transcription.
+- **Structured field extraction** — Custom analyzers extract domain-specific fields (invoice amounts, receipt dates, contract clauses) serialized as YAML front matter. Neither built-in nor Doc Intel integration exposes fields.
+- **Higher-quality document extraction** — Cloud-based layout analysis and OCR for scanned PDFs, complex tables, and multi-page documents.
+- **Single API for all modalities** — One `cu_endpoint` handles documents, images, audio, and video with automatic analyzer routing.
+
 | Capability | Built-in converters | Azure Document Intelligence | Azure Content Understanding |
 |------------|---------------------|-----------------------------|-----------------------------|
 | Document conversion | Offline, format-specific extraction | Cloud layout extraction | Cloud multimodal extraction |
diff --git a/packages/markitdown/src/markitdown/converters/_cu_converter.py b/packages/markitdown/src/markitdown/converters/_cu_converter.py
index 3b1380c14..cfe8fd979 100644
--- a/packages/markitdown/src/markitdown/converters/_cu_converter.py
+++ b/packages/markitdown/src/markitdown/converters/_cu_converter.py
@@ -237,7 +237,7 @@ class ContentUnderstandingFileType(str, Enum):
     ContentUnderstandingFileType.WMA,
 }
 
-_DEFAULT_ANALYZERS = {
+_PREBUILT_ANALYZERS = {
     "document": "prebuilt-documentSearch",
     "image": "prebuilt-documentSearch",
     "video": "prebuilt-videoSearch",
@@ -328,8 +328,8 @@ def _detect_file_type_from_mime(
     "prebuilt-video": "video",
 }
 
-# For prebuilt analyzers, infer modality from name without an API call
-_PREBUILT_MODALITY: Dict[str, str] = {
+# Cache of known prebuilt analyzer name → modality (avoids API call)
+_KNOWN_PREBUILT_MODALITY: Dict[str, str] = {
     # Document-based prebuilts
     "prebuilt-documentSearch": "document",
     "prebuilt-layout": "document",
@@ -378,11 +378,40 @@ def _detect_file_type_from_mime(
 }
 
 
-def _infer_prebuilt_modality(analyzer_id: str) -> str:
-    """Infer modality from a prebuilt analyzer ID without an API call."""
-    if analyzer_id in _PREBUILT_MODALITY:
-        return _PREBUILT_MODALITY[analyzer_id]
-    # Unknown prebuilt — most prebuilts are document-based
+def _resolve_analyzer_modality(client: Any, analyzer_id: str) -> str:
+    """Resolve analyzer modality from cache or via get_analyzer() fallback.
+
+    For known prebuilt-* names, returns the modality from
+    ``_KNOWN_PREBUILT_MODALITY`` without an API call.  For unknown
+    prebuilt-* names or custom analyzers, calls ``get_analyzer()``
+    to inspect ``base_analyzer_id``.
+
+    Args:
+        client: A ``ContentUnderstandingClient`` instance.
+        analyzer_id: The analyzer ID to resolve.
+
+    Returns:
+        Modality string ("document", "image", "audio", or "video").
+
+    Raises:
+        ValueError: If ``get_analyzer()`` fails.
+    """
+    # Known prebuilt — use cache, no API call
+    if analyzer_id in _KNOWN_PREBUILT_MODALITY:
+        return _KNOWN_PREBUILT_MODALITY[analyzer_id]
+
+    # Unknown prebuilt or custom analyzer — call get_analyzer()
+    try:
+        analyzer_info = client.get_analyzer(analyzer_id)
+    except Exception as exc:
+        raise ValueError(
+            f"Failed to resolve analyzer '{analyzer_id}': {exc}"
+        ) from exc
+
+    if analyzer_info.base_analyzer_id:
+        return _BASE_TO_MODALITY.get(
+            analyzer_info.base_analyzer_id, "document"
+        )
     return "document"
 
 
@@ -467,25 +496,11 @@ def __init__(
             user_agent_policy=UserAgentPolicy(user_agent=user_agent),
         )
 
-        # Smart routing: resolve analyzer modality at init
+        # Smart routing: resolve analyzer modality at init (at most one API call)
         if self._analyzer_id is not None:
-            if self._analyzer_id.startswith("prebuilt-"):
-                # Infer from name — no API call
-                self._analyzer_modality = _infer_prebuilt_modality(self._analyzer_id)
-            else:
-                # Custom analyzer — one get_analyzer() call, cached
-                try:
-                    analyzer_info = self._client.get_analyzer(self._analyzer_id)
-                except Exception as exc:
-                    raise ValueError(
-                        f"Failed to resolve analyzer '{self._analyzer_id}': {exc}"
-                    ) from exc
-                if analyzer_info.base_analyzer_id:
-                    self._analyzer_modality = _BASE_TO_MODALITY.get(
-                        analyzer_info.base_analyzer_id, "document"
-                    )
-                else:
-                    self._analyzer_modality = "document"
+            self._analyzer_modality = _resolve_analyzer_modality(
+                self._client, self._analyzer_id
+            )
 
     def accepts(
         self,
@@ -519,7 +534,7 @@ def convert(
         ):
             analyzer_id = self._analyzer_id
         else:
-            analyzer_id = _DEFAULT_ANALYZERS.get(
+            analyzer_id = _PREBUILT_ANALYZERS.get(
                 file_modality, "prebuilt-documentSearch"
             )
 
diff --git a/packages/markitdown/tests/test_cu_converter.py b/packages/markitdown/tests/test_cu_converter.py
index 70c51a5d1..2f4c5e8d4 100644
--- a/packages/markitdown/tests/test_cu_converter.py
+++ b/packages/markitdown/tests/test_cu_converter.py
@@ -5,6 +5,7 @@
 """
 
 import io
+import sys
 from unittest.mock import MagicMock, patch
 
 import pytest
@@ -12,7 +13,7 @@
 from markitdown.converters._cu_converter import (
     ContentUnderstandingConverter,
     ContentUnderstandingFileType,
-    _infer_prebuilt_modality,
+    _resolve_analyzer_modality,
     _get_modality,
     _detect_file_type,
     _canonical_mime_type,
@@ -512,30 +513,76 @@ def test_extension_only_input_uses_file_type_content_type(self):
 # ---------------------------------------------------------------------------
 
 
-class TestInferPrebuiltModality:
-    """Test modality inference from prebuilt analyzer names."""
-
-    def test_document_prebuilts(self):
-        assert _infer_prebuilt_modality("prebuilt-documentSearch") == "document"
-        assert _infer_prebuilt_modality("prebuilt-invoice") == "document"
-        assert _infer_prebuilt_modality("prebuilt-layout") == "document"
-        assert _infer_prebuilt_modality("prebuilt-receipt") == "document"
-        assert _infer_prebuilt_modality("prebuilt-tax.us.w2") == "document"
-
-    def test_audio_prebuilts(self):
-        assert _infer_prebuilt_modality("prebuilt-audioSearch") == "audio"
-        assert _infer_prebuilt_modality("prebuilt-callCenter") == "audio"
-
-    def test_video_prebuilts(self):
-        assert _infer_prebuilt_modality("prebuilt-videoSearch") == "video"
-        assert _infer_prebuilt_modality("prebuilt-videoSynopsis") == "video"
-
-    def test_image_prebuilts_map_to_image(self):
-        assert _infer_prebuilt_modality("prebuilt-imageSearch") == "image"
-        assert _infer_prebuilt_modality("prebuilt-image") == "image"
-
-    def test_unknown_prebuilt_defaults_to_document(self):
-        assert _infer_prebuilt_modality("prebuilt-unknownNewAnalyzer") == "document"
+class TestResolveAnalyzerModality:
+    """Test modality resolution from analyzer IDs."""
+
+    def test_known_document_prebuilts(self):
+        client = MagicMock()
+        assert _resolve_analyzer_modality(client, "prebuilt-documentSearch") == "document"
+        assert _resolve_analyzer_modality(client, "prebuilt-invoice") == "document"
+        assert _resolve_analyzer_modality(client, "prebuilt-layout") == "document"
+        assert _resolve_analyzer_modality(client, "prebuilt-receipt") == "document"
+        assert _resolve_analyzer_modality(client, "prebuilt-tax.us.w2") == "document"
+        # Known prebuilts should never call get_analyzer()
+        client.get_analyzer.assert_not_called()
+
+    def test_known_audio_prebuilts(self):
+        client = MagicMock()
+        assert _resolve_analyzer_modality(client, "prebuilt-audioSearch") == "audio"
+        assert _resolve_analyzer_modality(client, "prebuilt-callCenter") == "audio"
+        client.get_analyzer.assert_not_called()
+
+    def test_known_video_prebuilts(self):
+        client = MagicMock()
+        assert _resolve_analyzer_modality(client, "prebuilt-videoSearch") == "video"
+        assert _resolve_analyzer_modality(client, "prebuilt-videoSynopsis") == "video"
+        client.get_analyzer.assert_not_called()
+
+    def test_known_image_prebuilts(self):
+        client = MagicMock()
+        assert _resolve_analyzer_modality(client, "prebuilt-imageSearch") == "image"
+        assert _resolve_analyzer_modality(client, "prebuilt-image") == "image"
+        client.get_analyzer.assert_not_called()
+
+    def test_unknown_prebuilt_falls_back_to_get_analyzer(self):
+        """Unknown prebuilt-* names should call get_analyzer() for resolution."""
+        client = MagicMock()
+        mock_analyzer = MagicMock()
+        mock_analyzer.base_analyzer_id = "prebuilt-audio"
+        client.get_analyzer.return_value = mock_analyzer
+
+        result = _resolve_analyzer_modality(client, "prebuilt-newAnalyzer")
+        assert result == "audio"
+        client.get_analyzer.assert_called_once_with("prebuilt-newAnalyzer")
+
+    def test_custom_analyzer_calls_get_analyzer(self):
+        """Custom analyzers should call get_analyzer() to resolve modality."""
+        client = MagicMock()
+        mock_analyzer = MagicMock()
+        mock_analyzer.base_analyzer_id = "prebuilt-document"
+        client.get_analyzer.return_value = mock_analyzer
+
+        result = _resolve_analyzer_modality(client, "my-custom-doc-analyzer")
+        assert result == "document"
+        client.get_analyzer.assert_called_once_with("my-custom-doc-analyzer")
+
+    def test_custom_analyzer_no_base_defaults_to_document(self):
+        """Analyzer with no base_analyzer_id defaults to document."""
+        client = MagicMock()
+        mock_analyzer = MagicMock()
+        mock_analyzer.base_analyzer_id = None
+        client.get_analyzer.return_value = mock_analyzer
+
+        result = _resolve_analyzer_modality(client, "my-custom-analyzer")
+        assert result == "document"
+
+    def test_get_analyzer_failure_raises_value_error(self):
+        """Failed get_analyzer() should raise ValueError."""
+        client = MagicMock()
+        client.get_analyzer.side_effect = Exception("not found")
+
+        with pytest.raises(ValueError, match="Failed to resolve analyzer 'bad-id'"):
+            _resolve_analyzer_modality(client, "bad-id")
 
 
 # ---------------------------------------------------------------------------
@@ -613,6 +660,12 @@ def test_empty_result(self):
         result = self._run_convert(".pdf", "application/pdf", "")
         assert result.markdown == ""
 
+    def test_jpeg_returns_markdown(self):
+        result = self._run_convert(
+            ".jpg", "image/jpeg", "---\ncontentType: document\n---\n# Photo"
+        )
+        assert "contentType: document" in result.markdown
+
 
 # ---------------------------------------------------------------------------
 # Init-time get_analyzer() error wrapping
@@ -683,6 +736,71 @@ def test_cu_registered_before_docintel(self):
             ), "CU should have higher priority (lower index) than Doc Intel"
 
 
+# ---------------------------------------------------------------------------
+# CLI argument tests
+# ---------------------------------------------------------------------------
+
+
+class TestCLIArgs:
+    """Test CLI argument parsing for CU flags."""
+
+    def test_use_cu_without_endpoint_exits(self):
+        """--use-cu without --cu-endpoint should exit with error."""
+        import subprocess
+
+        result = subprocess.run(
+            [sys.executable, "-m", "markitdown", "--use-cu", "fake.pdf"],
+            capture_output=True,
+            text=True,
+        )
+        assert result.returncode != 0
+        assert "cu-endpoint" in result.stderr.lower() or "cu-endpoint" in (result.stdout or "").lower()
+
+    def test_use_cu_and_use_docintel_mutually_exclusive(self):
+        """--use-cu and --use-docintel cannot be used together."""
+        import subprocess
+
+        result = subprocess.run(
+            [
+                sys.executable, "-m", "markitdown",
+                "--use-cu", "--cu-endpoint", "https://fake",
+                "--use-docintel", "-e", "https://fake-di",
+                "fake.pdf",
+            ],
+            capture_output=True,
+            text=True,
+        )
+        assert result.returncode != 0
+
+    def test_cu_file_types_parsing(self):
+        """--cu-file-types should parse comma-separated values into enum list."""
+        from markitdown.converters import ContentUnderstandingFileType
+
+        raw = "pdf,jpeg,mp4"
+        type_names = [t.strip().lower() for t in raw.split(",") if t.strip()]
+        cu_types = [ContentUnderstandingFileType(name) for name in type_names]
+
+        assert cu_types == [
+            ContentUnderstandingFileType.PDF,
+            ContentUnderstandingFileType.JPEG,
+            ContentUnderstandingFileType.MP4,
+        ]
+
+    def test_cu_file_types_invalid_value(self):
+        """Unknown file type name should raise ValueError."""
+        from markitdown.converters import ContentUnderstandingFileType
+
+        with pytest.raises(ValueError):
+            ContentUnderstandingFileType("nonsense")
+
+    def test_cu_file_types_single_value(self):
+        """Single file type (no comma) should parse correctly."""
+        from markitdown.converters import ContentUnderstandingFileType
+
+        cu_types = [ContentUnderstandingFileType(t.strip().lower()) for t in "wav".split(",") if t.strip()]
+        assert cu_types == [ContentUnderstandingFileType.WAV]
+
+
 # ---------------------------------------------------------------------------
 # MissingDependencyException test
 # ---------------------------------------------------------------------------

From 6c7f5e78437b43bd84e70e5f6f86cb8f8170e114 Mon Sep 17 00:00:00 2001
From: chienyuanchang <ds.chienyuanchang@gmail.com>
Date: Thu, 7 May 2026 13:58:27 -0700
Subject: [PATCH 7/9] add test cases, run black

---
 README.md                                     |  2 +-
 .../markitdown/converters/_cu_converter.py    |  8 +-
 .../markitdown/tests/test_cu_converter.py     | 87 ++++++++++++++++---
 3 files changed, 76 insertions(+), 21 deletions(-)

diff --git a/README.md b/README.md
index dd9edffdc..a099e65b4 100644
--- a/README.md
+++ b/README.md
@@ -179,7 +179,7 @@ Install: `pip install 'markitdown[az-content-understanding]'`
 
 Content Understanding is ideal when you need capabilities beyond what built-in or Document Intelligence converters provide:
 
-- **Audio and video files** — CU is the only option for converting MP4, MOV, WAV, MP3, and other media files. Built-in converters have no video support and only basic audio transcription.
+- **Audio and video files** — CU is the only option for video, and the higher-quality cloud option for audio. Built-in converters have no video support and only basic audio transcription.
 - **Structured field extraction** — Custom analyzers extract domain-specific fields (invoice amounts, receipt dates, contract clauses) serialized as YAML front matter. Neither built-in nor Doc Intel integration exposes fields.
 - **Higher-quality document extraction** — Cloud-based layout analysis and OCR for scanned PDFs, complex tables, and multi-page documents.
 - **Single API for all modalities** — One `cu_endpoint` handles documents, images, audio, and video with automatic analyzer routing.
diff --git a/packages/markitdown/src/markitdown/converters/_cu_converter.py b/packages/markitdown/src/markitdown/converters/_cu_converter.py
index cfe8fd979..d3c70494f 100644
--- a/packages/markitdown/src/markitdown/converters/_cu_converter.py
+++ b/packages/markitdown/src/markitdown/converters/_cu_converter.py
@@ -404,14 +404,10 @@ def _resolve_analyzer_modality(client: Any, analyzer_id: str) -> str:
     try:
         analyzer_info = client.get_analyzer(analyzer_id)
     except Exception as exc:
-        raise ValueError(
-            f"Failed to resolve analyzer '{analyzer_id}': {exc}"
-        ) from exc
+        raise ValueError(f"Failed to resolve analyzer '{analyzer_id}': {exc}") from exc
 
     if analyzer_info.base_analyzer_id:
-        return _BASE_TO_MODALITY.get(
-            analyzer_info.base_analyzer_id, "document"
-        )
+        return _BASE_TO_MODALITY.get(analyzer_info.base_analyzer_id, "document")
     return "document"
 
 
diff --git a/packages/markitdown/tests/test_cu_converter.py b/packages/markitdown/tests/test_cu_converter.py
index 2f4c5e8d4..0cfd31aac 100644
--- a/packages/markitdown/tests/test_cu_converter.py
+++ b/packages/markitdown/tests/test_cu_converter.py
@@ -518,7 +518,9 @@ class TestResolveAnalyzerModality:
 
     def test_known_document_prebuilts(self):
         client = MagicMock()
-        assert _resolve_analyzer_modality(client, "prebuilt-documentSearch") == "document"
+        assert (
+            _resolve_analyzer_modality(client, "prebuilt-documentSearch") == "document"
+        )
         assert _resolve_analyzer_modality(client, "prebuilt-invoice") == "document"
         assert _resolve_analyzer_modality(client, "prebuilt-layout") == "document"
         assert _resolve_analyzer_modality(client, "prebuilt-receipt") == "document"
@@ -754,7 +756,10 @@ def test_use_cu_without_endpoint_exits(self):
             text=True,
         )
         assert result.returncode != 0
-        assert "cu-endpoint" in result.stderr.lower() or "cu-endpoint" in (result.stdout or "").lower()
+        assert (
+            "cu-endpoint" in result.stderr.lower()
+            or "cu-endpoint" in (result.stdout or "").lower()
+        )
 
     def test_use_cu_and_use_docintel_mutually_exclusive(self):
         """--use-cu and --use-docintel cannot be used together."""
@@ -762,9 +767,15 @@ def test_use_cu_and_use_docintel_mutually_exclusive(self):
 
         result = subprocess.run(
             [
-                sys.executable, "-m", "markitdown",
-                "--use-cu", "--cu-endpoint", "https://fake",
-                "--use-docintel", "-e", "https://fake-di",
+                sys.executable,
+                "-m",
+                "markitdown",
+                "--use-cu",
+                "--cu-endpoint",
+                "https://fake",
+                "--use-docintel",
+                "-e",
+                "https://fake-di",
                 "fake.pdf",
             ],
             capture_output=True,
@@ -797,9 +808,53 @@ def test_cu_file_types_single_value(self):
         """Single file type (no comma) should parse correctly."""
         from markitdown.converters import ContentUnderstandingFileType
 
-        cu_types = [ContentUnderstandingFileType(t.strip().lower()) for t in "wav".split(",") if t.strip()]
+        cu_types = [
+            ContentUnderstandingFileType(t.strip().lower())
+            for t in "wav".split(",")
+            if t.strip()
+        ]
         assert cu_types == [ContentUnderstandingFileType.WAV]
 
+    def test_use_cu_wires_kwargs_to_markitdown(self, capsys):
+        """--use-cu should pass CU options through to MarkItDown."""
+        import markitdown.__main__ as markitdown_cli
+
+        markitdown_instance = MagicMock()
+        markitdown_instance.convert.return_value.markdown = "converted"
+        markitdown_cls = MagicMock(return_value=markitdown_instance)
+
+        with patch.object(
+            sys,
+            "argv",
+            [
+                "markitdown",
+                "--use-cu",
+                "--cu-endpoint",
+                "https://fake-cu",
+                "--cu-analyzer",
+                "custom-analyzer",
+                "--cu-file-types",
+                "pdf,jpeg,mp4",
+                "fake.pdf",
+            ],
+        ), patch.object(markitdown_cli, "MarkItDown", markitdown_cls):
+            markitdown_cli.main()
+
+        markitdown_cls.assert_called_once_with(
+            enable_plugins=False,
+            cu_endpoint="https://fake-cu",
+            cu_analyzer_id="custom-analyzer",
+            cu_file_types=[
+                ContentUnderstandingFileType.PDF,
+                ContentUnderstandingFileType.JPEG,
+                ContentUnderstandingFileType.MP4,
+            ],
+        )
+        markitdown_instance.convert.assert_called_once_with(
+            "fake.pdf", stream_info=None, keep_data_uris=False
+        )
+        assert capsys.readouterr().out == "converted\n"
+
 
 # ---------------------------------------------------------------------------
 # MissingDependencyException test
@@ -810,13 +865,17 @@ class TestMissingDependency:
     """Test that MissingDependencyException is raised when CU SDK is not installed."""
 
     def test_missing_deps_message(self):
-        """Verify the exception includes install hint."""
-        # We can't easily simulate ImportError in the module, but we can check
-        # the exception message pattern if it were raised.
+        """Converter construction should surface the optional install hint."""
+        import markitdown.converters._cu_converter as cu_converter_module
         from markitdown._exceptions import MissingDependencyException
 
-        exc = MissingDependencyException(
-            "ContentUnderstandingConverter requires the optional dependency "
-            "[az-content-understanding] (or [all]) to be installed."
-        )
-        assert "az-content-understanding" in str(exc)
+        import_error = ImportError("No module named 'azure.ai.contentunderstanding'")
+        dependency_exc_info = (ImportError, import_error, None)
+
+        with patch.object(
+            cu_converter_module, "_dependency_exc_info", dependency_exc_info
+        ), pytest.raises(MissingDependencyException) as exc_info:
+            ContentUnderstandingConverter(endpoint="https://fake-cu")
+
+        assert "az-content-understanding" in str(exc_info.value)
+        assert exc_info.value.__cause__ is import_error

From 7a804cf34b39c1293e61199eebf25e55b39f6a2d Mon Sep 17 00:00:00 2001
From: chienyuanchang <ds.chienyuanchang@gmail.com>
Date: Thu, 7 May 2026 16:58:11 -0700
Subject: [PATCH 8/9] update README and derive content_type from the resolved
 file_type

---
 README.md                                     | 22 ++++-----
 .../markitdown/converters/_cu_converter.py    | 29 ++++++++---
 .../markitdown/tests/test_cu_converter.py     | 49 ++++++++++++++++++-
 3 files changed, 81 insertions(+), 19 deletions(-)

diff --git a/README.md b/README.md
index a099e65b4..12bbe3ad9 100644
--- a/README.md
+++ b/README.md
@@ -159,19 +159,9 @@ If no `llm_client` is provided the plugin still loads, but OCR is silently skipp
 
 See [`packages/markitdown-ocr/README.md`](packages/markitdown-ocr/README.md) for detailed documentation.
 
-### Azure Document Intelligence
-
-To use Microsoft Document Intelligence for conversion:
-
-```bash
-markitdown path-to-file.pdf -o document.md -d -e "<document_intelligence_endpoint>"
-```
-
-More information about how to set up an Azure Document Intelligence Resource can be found [here](https://learn.microsoft.com/en-us/azure/ai-services/document-intelligence/how-to-guides/create-document-intelligence-resource?view=doc-intel-4.0.0)
-
 ### Azure Content Understanding
 
-[Azure Content Understanding](https://learn.microsoft.com/azure/ai-services/content-understanding/) provides higher-quality conversion with structured field extraction (YAML front matter), multi-modal support (documents, images, audio, video), and configurable analyzers.
+[Azure Content Understanding](https://learn.microsoft.com/azure/ai-services/content-understanding/) is the next iteration of Azure Document Intelligence and is the recommended cloud option for new projects. It provides higher-quality conversion with structured field extraction (YAML front matter), multi-modal support (documents, images, audio, video), and [prebuilt](https://learn.microsoft.com/azure/ai-services/content-understanding/concepts/prebuilt-analyzers) or [custom-built](https://learn.microsoft.com/azure/ai-services/content-understanding/how-to/customize-analyzer-content-understanding-studio?tabs=portal) analyzers.
 
 Install: `pip install 'markitdown[az-content-understanding]'`
 
@@ -246,6 +236,16 @@ md = MarkItDown(
 
 More information about Azure Content Understanding can be found [here](https://learn.microsoft.com/azure/ai-services/content-understanding/).
 
+### Azure Document Intelligence
+
+To use Microsoft Document Intelligence for conversion:
+
+```bash
+markitdown path-to-file.pdf -o document.md -d -e "<document_intelligence_endpoint>"
+```
+
+More information about how to set up an Azure Document Intelligence Resource can be found [here](https://learn.microsoft.com/en-us/azure/ai-services/document-intelligence/how-to-guides/create-document-intelligence-resource?view=doc-intel-4.0.0)
+
 ### Python API
 
 Basic usage in Python:
diff --git a/packages/markitdown/src/markitdown/converters/_cu_converter.py b/packages/markitdown/src/markitdown/converters/_cu_converter.py
index d3c70494f..e4080dda7 100644
--- a/packages/markitdown/src/markitdown/converters/_cu_converter.py
+++ b/packages/markitdown/src/markitdown/converters/_cu_converter.py
@@ -293,15 +293,30 @@ def _content_type_for(
     file_type: ContentUnderstandingFileType,
     mimetype: Optional[str],
 ) -> str:
-    content_type = _canonical_mime_type(mimetype)
-    if content_type != "application/octet-stream":
-        return content_type
-
+    """Resolve the content type to send to the CU API.
+
+    Uses the resolved ``file_type`` as the source of truth so analyzer
+    routing and payload metadata stay consistent. The caller-provided
+    ``mimetype`` is only used when it is consistent with ``file_type``
+    (e.g., to preserve subtype distinctions like ``image/heic`` vs
+    ``image/heif``). When ``mimetype`` disagrees with the resolved
+    ``file_type`` (e.g., ``.pdf`` extension with ``audio/mpeg``
+    mimetype), the canonical MIME type for ``file_type`` is used.
+    """
     prefixes = _MIME_PREFIXES.get(file_type, [])
-    if not prefixes:
-        return content_type
+    canonical = _canonical_mime_type(mimetype)
+
+    # Use caller-provided MIME if it's consistent with the resolved file_type
+    if prefixes and canonical != "application/octet-stream":
+        for prefix in prefixes:
+            if canonical.startswith(prefix):
+                return canonical
+
+    # Fallback: derive from the resolved file_type (single source of truth)
+    if prefixes:
+        return _canonical_mime_type(prefixes[0])
 
-    return _canonical_mime_type(prefixes[0])
+    return canonical
 
 
 def _detect_file_type_from_mime(
diff --git a/packages/markitdown/tests/test_cu_converter.py b/packages/markitdown/tests/test_cu_converter.py
index 0cfd31aac..760f5fcc7 100644
--- a/packages/markitdown/tests/test_cu_converter.py
+++ b/packages/markitdown/tests/test_cu_converter.py
@@ -22,7 +22,6 @@
 )
 from markitdown._stream_info import StreamInfo
 
-
 # ---------------------------------------------------------------------------
 # Helper: create a converter with accepts() working but no SDK init
 # ---------------------------------------------------------------------------
@@ -248,6 +247,54 @@ def test_canonical_mime_type(self, mimetype, expected):
     def test_content_type_for(self, file_type, mimetype, expected):
         assert _content_type_for(file_type, mimetype) == expected
 
+    @pytest.mark.parametrize(
+        ("file_type", "mimetype", "expected"),
+        [
+            # Extension/file_type wins when mimetype disagrees — the
+            # resolved file_type is the single source of truth so that
+            # analyzer routing and payload metadata stay consistent.
+            (ContentUnderstandingFileType.PDF, "audio/mpeg", "application/pdf"),
+            (ContentUnderstandingFileType.MP3, "application/pdf", "audio/mpeg"),
+            (ContentUnderstandingFileType.MP4, "image/jpeg", "video/mp4"),
+            (ContentUnderstandingFileType.JPEG, "video/mp4", "image/jpeg"),
+            # Subtype distinctions are preserved when consistent
+            # (e.g., HEIC vs HEIF both map to file_type HEIF; if the
+            # caller passed image/heic explicitly, keep it).
+            (ContentUnderstandingFileType.HEIF, "image/heic", "image/heic"),
+            (ContentUnderstandingFileType.HEIF, "image/heif", "image/heif"),
+        ],
+    )
+    def test_content_type_for_resolves_conflicts_to_file_type(
+        self, file_type, mimetype, expected
+    ):
+        """When extension and mimetype disagree, file_type wins."""
+        assert _content_type_for(file_type, mimetype) == expected
+
+    def test_conflicting_extension_and_mimetype_in_convert(self):
+        """End-to-end: conflicting StreamInfo routes by extension and
+        sends a content_type consistent with the resolved file_type."""
+        conv = _make_converter()
+        conv._client = MagicMock()
+        mock_poller = MagicMock()
+        mock_poller.result.return_value = MagicMock(contents=[])
+        conv._client.begin_analyze_binary.return_value = mock_poller
+
+        with patch(
+            "markitdown.converters._cu_converter.to_llm_input",
+            return_value="ok",
+        ):
+            conv.convert(
+                io.BytesIO(b"fake"),
+                # .pdf extension but bogus audio mimetype
+                StreamInfo(extension=".pdf", mimetype="audio/mpeg"),
+            )
+
+        call_kwargs = conv._client.begin_analyze_binary.call_args.kwargs
+        # Routed by extension: document modality → prebuilt-documentSearch
+        assert call_kwargs["analyzer_id"] == "prebuilt-documentSearch"
+        # content_type derived from file_type (PDF), not the conflicting mime
+        assert call_kwargs["content_type"] == "application/pdf"
+
     def test_file_type_restriction_applies_to_mime(self):
         assert (
             _detect_file_type(

From 2ed5af7e703107b15786707d938c48d0fc9e521b Mon Sep 17 00:00:00 2001
From: chienyuanchang <ds.chienyuanchang@gmail.com>
Date: Thu, 7 May 2026 17:02:14 -0700
Subject: [PATCH 9/9] update readme

---
 README.md | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/README.md b/README.md
index 12bbe3ad9..aa2f58bb8 100644
--- a/README.md
+++ b/README.md
@@ -161,7 +161,7 @@ See [`packages/markitdown-ocr/README.md`](packages/markitdown-ocr/README.md) for
 
 ### Azure Content Understanding
 
-[Azure Content Understanding](https://learn.microsoft.com/azure/ai-services/content-understanding/) is the next iteration of Azure Document Intelligence and is the recommended cloud option for new projects. It provides higher-quality conversion with structured field extraction (YAML front matter), multi-modal support (documents, images, audio, video), and [prebuilt](https://learn.microsoft.com/azure/ai-services/content-understanding/concepts/prebuilt-analyzers) or [custom-built](https://learn.microsoft.com/azure/ai-services/content-understanding/how-to/customize-analyzer-content-understanding-studio?tabs=portal) analyzers.
+[Azure Content Understanding](https://learn.microsoft.com/azure/ai-services/content-understanding/) provides higher-quality conversion with structured field extraction (YAML front matter), multi-modal support (documents, images, audio, video), and configurable analyzers.
 
 Install: `pip install 'markitdown[az-content-understanding]'`
 
@@ -170,7 +170,7 @@ Install: `pip install 'markitdown[az-content-understanding]'`
 Content Understanding is ideal when you need capabilities beyond what built-in or Document Intelligence converters provide:
 
 - **Audio and video files** — CU is the only option for video, and the higher-quality cloud option for audio. Built-in converters have no video support and only basic audio transcription.
-- **Structured field extraction** — Custom analyzers extract domain-specific fields (invoice amounts, receipt dates, contract clauses) serialized as YAML front matter. Neither built-in nor Doc Intel integration exposes fields.
+- **Structured field extraction** — [Prebuilt](https://learn.microsoft.com/azure/ai-services/content-understanding/concepts/prebuilt-analyzers) or [custom-built](https://learn.microsoft.com/azure/ai-services/content-understanding/how-to/customize-analyzer-content-understanding-studio?tabs=portal) analyzers extract domain-specific fields (invoice amounts, receipt dates, contract clauses), which are serialized as YAML front matter. Neither the built-in converters nor the Document Intelligence integration exposes extracted fields.
 - **Higher-quality document extraction** — Cloud-based layout analysis and OCR for scanned PDFs, complex tables, and multi-page documents.
 - **Single API for all modalities** — One `cu_endpoint` handles documents, images, audio, and video with automatic analyzer routing.