microsoft · haosenwang1018 · May 6, 2026
diff --git a/packages/markitdown/src/markitdown/converter_utils/docx/pre_process.py b/packages/markitdown/src/markitdown/converter_utils/docx/pre_process.py
@@ -1,3 +1,4 @@
+import struct
 import zipfile
 from io import BytesIO
 from typing import BinaryIO
@@ -115,6 +116,63 @@ def _pre_process_math(content: bytes) -> bytes:
     return str(soup).encode()
 
 
+def _fix_zip_name_casing(input_docx: BinaryIO) -> BinaryIO:
+    """Repair a .docx zip whose local file header names differ from the
+    central directory names only in ASCII letter case (e.g. a local
+    ``customXML/item2.xml`` vs a central ``customXml/item2.xml``).
+
+    :mod:`zipfile` raises ``BadZipFile`` when it opens such an entry. The
+    central directory is treated as authoritative: the local header name
+    bytes are overwritten in memory to match it. A case-only ASCII change
+    never alters byte length, so no zip offsets need recomputing.
+
+    Returns a new ``BytesIO`` if any header was patched; otherwise the
+    original stream (which must be seekable), rewound to position 0, also
+    when the archive cannot be parsed at all. See markitdown #1812.
+    """
+    input_docx.seek(0)
+    raw = bytearray(input_docx.read())
+    try:
+        zf = zipfile.ZipFile(BytesIO(raw), mode="r")
+    except zipfile.BadZipFile:
+        # Central directory unreadable: nothing to patch, caller handles it.
+        input_docx.seek(0)
+        return input_docx
+    patched = False
+    with zf:
+        for info in zf.infolist():
+            offset = info.header_offset
+            # Local file header layout (APPNOTE): "PK\x03\x04" signature,
+            # flags at +6, name length at +26 (uint16 LE), name at +30.
+            if raw[offset : offset + 4] != b"PK\x03\x04":
+                continue  # malformed beyond what this fixer targets
+            try:
+                flags, fname_len = struct.unpack_from("<H18xH", raw, offset + 6)
+            except struct.error:
+                continue  # header truncated by the end of the data
+            name_span = slice(offset + 30, offset + 30 + fname_len)
+            local_name = bytes(raw[name_span])
+            # zipfile decodes the local name as UTF-8 if flag bit 11 is set, else
+            # as cp437, and compares it with ``orig_filename``; encode the same
+            # way so that names stored as cp437 are repaired as well.
+            encoding = "utf-8" if flags & 0x800 else "cp437"
+            try:
+                central_name = info.orig_filename.encode(encoding)
+            except UnicodeEncodeError:
+                continue
+            # Equal lower-cased bytes imply equal length, so nothing shifts.
+            if (
+                local_name != central_name
+                and local_name.lower() == central_name.lower()
+            ):
+                raw[name_span] = central_name
+                patched = True
+    if patched:
+        return BytesIO(bytes(raw))
+    input_docx.seek(0)
+    return input_docx
+
+
 def pre_process_docx(input_docx: BinaryIO) -> BinaryIO:
     """
     Pre-processes a DOCX file with provided steps.
@@ -129,6 +187,11 @@ def pre_process_docx(input_docx: BinaryIO) -> BinaryIO:
     Returns:
         BinaryIO: A binary output stream representing the processed DOCX file.
     """
+    # Repair .docx zips whose local headers disagree with the central
+    # directory only in case — Python's zipfile rejects these but most other
+    # zip tools accept them, so .docx producers occasionally emit them
+    # (markitdown #1812).
+    input_docx = _fix_zip_name_casing(input_docx)
     output_docx = BytesIO()
     # The files that need to be pre-processed from .docx
     pre_process_enable_files = [

diff --git a/packages/markitdown/tests/test_docx_zip_case_mismatch.py b/packages/markitdown/tests/test_docx_zip_case_mismatch.py
@@ -0,0 +1,127 @@
+"""Regression test for #1812.
+
+Some .docx producers emit zip files where local file header names disagree
+with the central-directory names in case only (e.g. ``customXML/item2.xml``
+locally, ``customXml/item2.xml`` centrally). Most zip tools accept this,
+but Python's :mod:`zipfile` raises ``BadZipFile``. ``pre_process_docx``
+must repair the archive before opening so .docx conversion does not crash
+on these files.
+"""
+
+from __future__ import annotations
+
+import struct
+import zipfile
+from io import BytesIO
+
+import pytest
+
+from markitdown.converter_utils.docx.pre_process import (
+    _fix_zip_name_casing,
+    pre_process_docx,
+)
+
+
+_MINIMAL_DOCUMENT_XML = (  # written as word/document.xml in the test archives
+    b'<?xml version="1.0" encoding="UTF-8" standalone="yes"?>'
+    b'<w:document xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main">'
+    b"<w:body><w:p><w:r><w:t>hello world</w:t></w:r></w:p></w:body>"
+    b"</w:document>"
+)
+
+
+def _build_minimal_docx_bytes(local_name_override: dict[str, str] | None = None) -> bytes:
+    """Return the bytes of a tiny in-memory .docx archive.
+
+    ``local_name_override`` maps a central-directory entry name to the name
+    to write into that entry's local file header instead. The replacement
+    must encode to the same number of bytes (a pure case flip); the
+    central directory itself is left untouched.
+    """
+    archive = BytesIO()
+    with zipfile.ZipFile(archive, mode="w") as writer:
+        writer.writestr("word/document.xml", _MINIMAL_DOCUMENT_XML)
+        # This is the entry the overrides are expected to target.
+        writer.writestr("customXml/item2.xml", b"<root/>")
+    data = bytearray(archive.getvalue())
+    if not local_name_override:
+        return bytes(data)
+
+    # Look up where each local file header starts, then overwrite the name
+    # stored there. Stock zipfile then rejects the entry because the local
+    # and central names disagree -- the bug being reproduced.
+    with zipfile.ZipFile(BytesIO(bytes(data)), mode="r") as reader:
+        header_at = {
+            entry.filename: entry.header_offset for entry in reader.infolist()
+        }
+    for central_name, local_name in local_name_override.items():
+        start = header_at[central_name]
+        # The local file header signature must sit at the recorded offset.
+        assert data[start : start + 4] == b"PK\x03\x04"
+        (name_len,) = struct.unpack_from("<H", data, start + 26)
+        encoded = local_name.encode("utf-8")
+        assert len(encoded) == name_len, (
+            "test setup error: replacement name must match the original "
+            "byte length so the override is purely a case-flip"
+        )
+        data[start + 30 : start + 30 + name_len] = encoded
+    return bytes(data)
+
+
+def test_setup_actually_reproduces_badzipfile():
+    """Guard the fixture: unrepaired, the crafted archive raises BadZipFile.
+
+    Should :mod:`zipfile` ever stop rejecting a local/central name mismatch,
+    the repair tests in this module would pass without exercising the fix;
+    this test fails in that situation and makes the coverage loss visible.
+    """
+    overrides = {"customXml/item2.xml": "customXML/item2.xml"}
+    broken = _build_minimal_docx_bytes(local_name_override=overrides)
+    reader = BytesIO(broken)
+    # Opening the archive and reading the entry both sit inside ``raises``.
+    with pytest.raises(zipfile.BadZipFile), zipfile.ZipFile(reader, mode="r") as zf:
+        zf.read("customXml/item2.xml")
+
+
+def test_fix_zip_name_casing_repairs_mismatched_local_header():
+    """A case-mismatched local header is rewritten to the central
+    directory name, after which the entry can be read normally."""
+    central, local = "customXml/item2.xml", "customXML/item2.xml"
+    broken = _build_minimal_docx_bytes(local_name_override={central: local})
+    repaired = _fix_zip_name_casing(BytesIO(broken))
+    # The repaired stream must open cleanly and give back the payload.
+    with zipfile.ZipFile(repaired, mode="r") as archive:
+        payload = archive.read(central)
+    assert payload == b"<root/>"
+
+
+def test_fix_zip_name_casing_passes_through_normal_archive():
+    """A well-formed .docx is passed through untouched: the very same
+    stream object comes back, rewound to position 0, bytes unchanged."""
+    good = _build_minimal_docx_bytes(local_name_override=None)
+    src = BytesIO(good)
+    src.read()  # start from EOF so the rewind is actually exercised
+    out = _fix_zip_name_casing(src)
+    assert out is src
+    assert out.tell() == 0
+    assert out.read() == good
+
+
+def test_pre_process_docx_accepts_case_mismatched_archive():
+    """End-to-end regression for #1812.
+
+    ``pre_process_docx`` must accept a .docx whose local headers differ
+    from the central directory only in case, and its output archive must
+    still carry both the document body and the formerly mismatched entry.
+    """
+    central, local = "customXml/item2.xml", "customXML/item2.xml"
+    broken = _build_minimal_docx_bytes(local_name_override={central: local})
+    processed = pre_process_docx(BytesIO(broken))
+    processed.seek(0)
+    with zipfile.ZipFile(processed, mode="r") as archive:
+        # word/document.xml is expected to be among the pre-processed files;
+        # the body text of this minimal payload should come out intact.
+        document_xml = archive.read("word/document.xml")
+        assert b"hello world" in document_xml
+        # The entry whose local header was mismatched must be readable too.
+        assert archive.read(central) == b"<root/>"