From 01cf959c2707817abc2f11600a93e2a4d5e8f939 Mon Sep 17 00:00:00 2001
From: haosenwang1018 <1293965075@qq.com>
Date: Wed, 6 May 2026 19:02:04 +0800
Subject: [PATCH] fix(docx): repair zip local headers that disagree with
 central directory in case only
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Closes #1812

Some .docx producers (notably certain legal-document systems and some
older Microsoft Word builds) emit zip files where the local file
header name and the central-directory name differ in case only — for
example, ``customXml/item2.xml`` in the central directory but
``customXML/item2.xml`` in the local file header.

Most zip tools accept this. Python's ``zipfile`` strictly validates
and raises ``BadZipFile`` mid-conversion, surfaced to the user as::

    DocxConverter threw BadZipFile with message:
      File name in directory 'customXml/item2.xml' and header
      b'customXML/item2.xml' differ.

Per APPNOTE the central directory is authoritative. Add
``_fix_zip_name_casing`` that scans local file headers, finds entries
that match the central name when lower-cased and have the same byte
length (always true for ASCII case-only mismatches), and rewrites the
header bytes in-memory to match. The patch is byte-length preserving,
so no offset recomputation is needed. Call this from the start of
``pre_process_docx`` so every code path that runs through it benefits.

If the central directory itself is unparseable we leave the stream
alone so the caller surfaces the same error the unfixed code would
have — no silent data loss.

Tests:

- ``test_setup_actually_reproduces_badzipfile`` is a guard that the
  test fixture really does trip ``BadZipFile``. It fails if Python ever
  stops validating local-vs-central name parity, since the regression
  tests below would then pass for the wrong reason.
- ``test_fix_zip_name_casing_repairs_mismatched_local_header`` and
  ``test_pre_process_docx_accepts_case_mismatched_archive`` cover the
  fix directly.
- ``test_fix_zip_name_casing_passes_through_normal_archive`` is a
  regression guard against rewriting well-formed inputs.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 .../converter_utils/docx/pre_process.py       |  63 +++++++++
 .../tests/test_docx_zip_case_mismatch.py      | 127 ++++++++++++++++++
 2 files changed, 190 insertions(+)
 create mode 100644 packages/markitdown/tests/test_docx_zip_case_mismatch.py

diff --git a/packages/markitdown/src/markitdown/converter_utils/docx/pre_process.py b/packages/markitdown/src/markitdown/converter_utils/docx/pre_process.py
index d6fa8db69..13c6e3d00 100644
--- a/packages/markitdown/src/markitdown/converter_utils/docx/pre_process.py
+++ b/packages/markitdown/src/markitdown/converter_utils/docx/pre_process.py
@@ -1,3 +1,4 @@
+import struct
 import zipfile
 from io import BytesIO
 from typing import BinaryIO
@@ -115,6 +116,63 @@ def _pre_process_math(content: bytes) -> bytes:
     return str(soup).encode()
 
 
+def _fix_zip_name_casing(input_docx: BinaryIO) -> BinaryIO:
+    """Repair .docx zips whose local file headers disagree with the central
+    directory only in case (e.g. ``customXML/item2.xml`` locally vs
+    ``customXml/item2.xml`` in the central directory).
+
+    Some .docx producers emit such files. Most zip tools accept them, but
+    Python's :mod:`zipfile` strictly validates and raises ``BadZipFile``.
+    The central directory is authoritative per APPNOTE; we patch the local
+    file header bytes in-memory to match it. The patch is byte-length
+    preserving (case-only differences in ASCII paths never change length),
+    so no offset recomputation is needed.
+
+    If nothing is patched the original stream is returned, rewound to position 0;
+    otherwise a new ``BytesIO`` of the repaired bytes. See markitdown #1812.
+    """
+    input_docx.seek(0)
+    raw = bytearray(input_docx.read())
+    patched = False
+    try:
+        zf = zipfile.ZipFile(BytesIO(raw), mode="r")
+    except zipfile.BadZipFile:
+        # Not even the central directory parses — nothing we can patch here.
+        # Let the caller see the same error the unfixed code path would.
+        input_docx.seek(0)
+        return input_docx
+    try:
+        for info in zf.infolist():
+            offset = info.header_offset
+            # Local file header signature is "PK\x03\x04". If we don't see
+            # it we have a malformed zip beyond what this fixer targets.
+            if raw[offset : offset + 4] != b"PK\x03\x04":
+                continue
+            # Per APPNOTE local file header layout:
+            #   bytes  0..3  signature
+            #   bytes 26..27 file name length (little-endian uint16)
+            #   bytes 30..   file name
+            fname_len = struct.unpack_from("<H", raw, offset + 26)[0]
+            local_name = bytes(raw[offset + 30 : offset + 30 + fname_len])
+            try:
+                central_name = info.filename.encode("utf-8")
+            except UnicodeEncodeError:
+                continue
+            if (
+                local_name != central_name
+                and local_name.lower() == central_name.lower()
+                and len(local_name) == len(central_name)
+            ):
+                raw[offset + 30 : offset + 30 + fname_len] = central_name
+                patched = True
+    finally:
+        zf.close()
+    if patched:
+        return BytesIO(bytes(raw))
+    input_docx.seek(0)
+    return input_docx
+
+
 def pre_process_docx(input_docx: BinaryIO) -> BinaryIO:
     """
     Pre-processes a DOCX file with provided steps.
@@ -129,6 +187,11 @@ def pre_process_docx(input_docx: BinaryIO) -> BinaryIO:
     Returns:
         BinaryIO: A binary output stream representing the processed DOCX file.
     """
+    # Repair .docx zips whose local headers disagree with the central
+    # directory only in case — Python's zipfile rejects these but most other
+    # zip tools accept them, so .docx producers occasionally emit them
+    # (markitdown #1812).
+    input_docx = _fix_zip_name_casing(input_docx)
     output_docx = BytesIO()
     # The files that need to be pre-processed from .docx
     pre_process_enable_files = [
diff --git a/packages/markitdown/tests/test_docx_zip_case_mismatch.py b/packages/markitdown/tests/test_docx_zip_case_mismatch.py
new file mode 100644
index 000000000..4c0e71248
--- /dev/null
+++ b/packages/markitdown/tests/test_docx_zip_case_mismatch.py
@@ -0,0 +1,127 @@
+"""Regression test for #1812.
+
+Some .docx producers emit zip files where local file header names disagree
+with the central-directory names in case only (e.g. ``customXML/item2.xml``
+locally, ``customXml/item2.xml`` centrally). Most zip tools accept this,
+but Python's :mod:`zipfile` raises ``BadZipFile``. ``pre_process_docx``
+must repair the archive before opening so .docx conversion does not crash
+on these files.
+"""
+
+from __future__ import annotations
+
+import struct
+import zipfile
+from io import BytesIO
+
+import pytest
+
+from markitdown.converter_utils.docx.pre_process import (
+    _fix_zip_name_casing,
+    pre_process_docx,
+)
+
+
+_MINIMAL_DOCUMENT_XML = (
+    b'<?xml version="1.0" encoding="UTF-8" standalone="yes"?>'
+    b'<w:document xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main">'
+    b"<w:body><w:p><w:r><w:t>hello world</w:t></w:r></w:p></w:body>"
+    b"</w:document>"
+)
+
+
+def _build_minimal_docx_bytes(local_name_override: dict[str, str] | None = None) -> bytes:
+    """Build a minimal .docx zip in memory.
+
+    If ``local_name_override`` is supplied, after writing the archive the
+    local file header bytes for the listed entry are patched in place so
+    the local name differs from the central-directory name (case only —
+    same byte length).
+    """
+    buf = BytesIO()
+    with zipfile.ZipFile(buf, mode="w") as zf:
+        zf.writestr("word/document.xml", _MINIMAL_DOCUMENT_XML)
+        # An entry whose case will be munged by the override below.
+        zf.writestr("customXml/item2.xml", b"<root/>")
+    raw = bytearray(buf.getvalue())
+    if not local_name_override:
+        return bytes(raw)
+
+    # Patch local file headers in place. The central directory remains
+    # intact, so opening the resulting bytes with stock zipfile fails — the
+    # exact bug we want to reproduce.
+    info_offsets: dict[str, int] = {}
+    with zipfile.ZipFile(BytesIO(bytes(raw)), mode="r") as zf:
+        for info in zf.infolist():
+            info_offsets[info.filename] = info.header_offset
+    for central_name, replacement_local_name in local_name_override.items():
+        offset = info_offsets[central_name]
+        # Sanity: the local file header signature must be at this offset.
+        assert raw[offset : offset + 4] == b"PK\x03\x04"
+        fname_len = struct.unpack_from("<H", raw, offset + 26)[0]
+        new_bytes = replacement_local_name.encode("utf-8")
+        assert len(new_bytes) == fname_len, (
+            "test setup error: replacement name must match the original "
+            "byte length so the override is purely a case-flip"
+        )
+        raw[offset + 30 : offset + 30 + fname_len] = new_bytes
+    return bytes(raw)
+
+
+def test_setup_actually_reproduces_badzipfile():
+    """Sanity guard: without the fix, the constructed bytes raise BadZipFile.
+
+    If this test ever starts failing (i.e. zipfile stops rejecting local-
+    vs-central name mismatches), the regression tests below would pass for
+    the wrong reason and we'd lose coverage of the original bug.
+    """
+    bad = _build_minimal_docx_bytes(
+        local_name_override={"customXml/item2.xml": "customXML/item2.xml"}
+    )
+    with pytest.raises(zipfile.BadZipFile):
+        with zipfile.ZipFile(BytesIO(bad), mode="r") as zf:
+            zf.read("customXml/item2.xml")
+
+
+def test_fix_zip_name_casing_repairs_mismatched_local_header():
+    """``_fix_zip_name_casing`` must rewrite local headers to match the
+    central directory so the archive becomes openable."""
+    bad = _build_minimal_docx_bytes(
+        local_name_override={"customXml/item2.xml": "customXML/item2.xml"}
+    )
+    fixed = _fix_zip_name_casing(BytesIO(bad))
+    # After repair the archive opens cleanly and the entry reads back.
+    with zipfile.ZipFile(fixed, mode="r") as zf:
+        assert zf.read("customXml/item2.xml") == b"<root/>"
+
+
+def test_fix_zip_name_casing_passes_through_normal_archive():
+    """A correctly-formed .docx must not be rewritten — the returned
+    stream must yield exactly the input bytes when read from the start."""
+    good = _build_minimal_docx_bytes(local_name_override=None)
+    src = BytesIO(good)
+    out = _fix_zip_name_casing(src)
+    # Either the same object or an equivalent BytesIO; either way the bytes
+    # we get back should match the input we started with.
+    out.seek(0)
+    assert out.read() == good
+
+
+def test_pre_process_docx_accepts_case_mismatched_archive():
+    """End-to-end regression for #1812: ``pre_process_docx`` must not
+    raise on a .docx whose local headers differ from the central directory
+    in case only, and the resulting archive must still contain the
+    pre-processed ``word/document.xml`` payload."""
+    bad = _build_minimal_docx_bytes(
+        local_name_override={"customXml/item2.xml": "customXML/item2.xml"}
+    )
+    out = pre_process_docx(BytesIO(bad))
+    out.seek(0)
+    with zipfile.ZipFile(out, mode="r") as zf:
+        # word/document.xml is one of the pre-processed files; it should be
+        # present and contain the original body text (no math substitution
+        # occurs on this minimal payload).
+        body = zf.read("word/document.xml")
+        assert b"hello world" in body
+        # The previously-mismatched entry survived too.
+        assert zf.read("customXml/item2.xml") == b"<root/>"