From 01cf959c2707817abc2f11600a93e2a4d5e8f939 Mon Sep 17 00:00:00 2001 From: haosenwang1018 <1293965075@qq.com> Date: Wed, 6 May 2026 19:02:04 +0800 Subject: [PATCH] fix(docx): repair zip local headers that disagree with central directory in case only MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Closes #1812 Some .docx producers (notably certain legal-document systems and some older Microsoft Word builds) emit zip files where the local file header name and the central-directory name differ in case only — for example, ``customXml/item2.xml`` in the central directory but ``customXML/item2.xml`` in the local file header. Most zip tools accept this. Python's ``zipfile`` strictly validates and raises ``BadZipFile`` mid-conversion, surfaced to the user as:: DocxConverter threw BadZipFile with message: File name in directory 'customXml/item2.xml' and header b'customXML/item2.xml' differ. Per APPNOTE the central directory is authoritative. Add ``_fix_zip_name_casing`` that scans local file headers, finds entries that match the central name when lower-cased and have the same byte length (always true for ASCII case-only mismatches), and rewrites the header bytes in-memory to match. The patch is byte-length preserving, so no offset recomputation is needed. Call this from the start of ``pre_process_docx`` so every code path that runs through it benefits. If the central directory itself is unparseable we leave the stream alone so the caller surfaces the same error the unfixed code would have — no silent data loss. Tests: - ``test_setup_actually_reproduces_badzipfile`` is a guard that the test fixture really does trip ``BadZipFile``; if Python ever stops validating local-vs-central name parity, the regression below would pass for the wrong reason. - ``test_fix_zip_name_casing_repairs_mismatched_local_header`` and ``test_pre_process_docx_accepts_case_mismatched_archive`` cover the fix directly. - ``test_fix_zip_name_casing_passes_through_normal_archive`` is a regression guard against rewriting well-formed inputs. Co-Authored-By: Claude Opus 4.7 (1M context) --- .../converter_utils/docx/pre_process.py | 63 +++++++++ .../tests/test_docx_zip_case_mismatch.py | 127 ++++++++++++++++++ 2 files changed, 190 insertions(+) create mode 100644 packages/markitdown/tests/test_docx_zip_case_mismatch.py diff --git a/packages/markitdown/src/markitdown/converter_utils/docx/pre_process.py b/packages/markitdown/src/markitdown/converter_utils/docx/pre_process.py index d6fa8db69..13c6e3d00 100644 --- a/packages/markitdown/src/markitdown/converter_utils/docx/pre_process.py +++ b/packages/markitdown/src/markitdown/converter_utils/docx/pre_process.py @@ -1,3 +1,4 @@ +import struct import zipfile from io import BytesIO from typing import BinaryIO @@ -115,6 +116,63 @@ def _pre_process_math(content: bytes) -> bytes: return str(soup).encode() +def _fix_zip_name_casing(input_docx: BinaryIO) -> BinaryIO: + """Repair .docx zips whose local file headers disagree with the central + directory only in case (e.g. ``customXML/item2.xml`` locally vs + ``customXml/item2.xml`` in the central directory). + + Some .docx producers emit such files. Most zip tools accept them, but + Python's :mod:`zipfile` strictly validates and raises ``BadZipFile``. + The central directory is authoritative per APPNOTE; we patch the local + file header bytes in-memory to match it. The patch is byte-length + preserving (case-only differences in ASCII paths never change length), + so no offset recomputation is needed. + + If no fix is required the original stream is returned unchanged + (rewound to position 0). See markitdown #1812. + """ + input_docx.seek(0) + raw = bytearray(input_docx.read()) + patched = False + try: + zf = zipfile.ZipFile(BytesIO(raw), mode="r") + except zipfile.BadZipFile: + # Not even the central directory parses — nothing we can patch here. + # Let the caller see the same error the unfixed code path would. + input_docx.seek(0) + return input_docx + try: + for info in zf.infolist(): + offset = info.header_offset + # Local file header signature is "PK\x03\x04". If we don't see + # it we have a malformed zip beyond what this fixer targets. + if raw[offset : offset + 4] != b"PK\x03\x04": + continue + # Per APPNOTE local file header layout: + # bytes 0..3 signature + # bytes 26..27 file name length (little-endian uint16) + # bytes 30.. file name + fname_len = struct.unpack_from(" BinaryIO: """ Pre-processes a DOCX file with provided steps. @@ -129,6 +187,11 @@ def pre_process_docx(input_docx: BinaryIO) -> BinaryIO: Returns: BinaryIO: A binary output stream representing the processed DOCX file. """ + # Repair .docx zips whose local headers disagree with the central + # directory only in case — Python's zipfile rejects these but most other + # zip tools accept them, so .docx producers occasionally emit them + # (markitdown #1812). + input_docx = _fix_zip_name_casing(input_docx) output_docx = BytesIO() # The files that need to be pre-processed from .docx pre_process_enable_files = [ diff --git a/packages/markitdown/tests/test_docx_zip_case_mismatch.py b/packages/markitdown/tests/test_docx_zip_case_mismatch.py new file mode 100644 index 000000000..4c0e71248 --- /dev/null +++ b/packages/markitdown/tests/test_docx_zip_case_mismatch.py @@ -0,0 +1,127 @@ +"""Regression test for #1812. + +Some .docx producers emit zip files where local file header names disagree +with the central-directory names in case only (e.g. ``customXML/item2.xml`` +locally, ``customXml/item2.xml`` centrally). Most zip tools accept this, +but Python's :mod:`zipfile` raises ``BadZipFile``. ``pre_process_docx`` +must repair the archive before opening so .docx conversion does not crash +on these files. +""" + +from __future__ import annotations + +import struct +import zipfile +from io import BytesIO + +import pytest + +from markitdown.converter_utils.docx.pre_process import ( + _fix_zip_name_casing, + pre_process_docx, +) + + +_MINIMAL_DOCUMENT_XML = ( + b'' + b'' + b"hello world" + b"" +) + + +def _build_minimal_docx_bytes(local_name_override: dict[str, str] | None = None) -> bytes: + """Build a minimal .docx zip in memory. + + If ``local_name_override`` is supplied, after writing the archive the + local file header bytes for the listed entry are patched in place so + the local name differs from the central-directory name (case only — + same byte length). + """ + buf = BytesIO() + with zipfile.ZipFile(buf, mode="w") as zf: + zf.writestr("word/document.xml", _MINIMAL_DOCUMENT_XML) + # An entry whose case will be munged by the override below. + zf.writestr("customXml/item2.xml", b"") + raw = bytearray(buf.getvalue()) + if not local_name_override: + return bytes(raw) + + # Patch local file headers in place. The central directory remains + # intact, so opening the resulting bytes with stock zipfile fails — the + # exact bug we want to reproduce. + info_offsets: dict[str, int] = {} + with zipfile.ZipFile(BytesIO(bytes(raw)), mode="r") as zf: + for info in zf.infolist(): + info_offsets[info.filename] = info.header_offset + for central_name, replacement_local_name in local_name_override.items(): + offset = info_offsets[central_name] + # Sanity: the local file header signature must be at this offset. + assert raw[offset : offset + 4] == b"PK\x03\x04" + fname_len = struct.unpack_from("" + + +def test_fix_zip_name_casing_passes_through_normal_archive(): + """A correctly-formed .docx must not be rewritten — return the same + stream rewound to the start so callers can keep using it.""" + good = _build_minimal_docx_bytes(local_name_override=None) + src = BytesIO(good) + out = _fix_zip_name_casing(src) + # Either the same object or an equivalent BytesIO; either way the bytes + # we get back should match the input we started with. + out.seek(0) + assert out.read() == good + + +def test_pre_process_docx_accepts_case_mismatched_archive(): + """End-to-end regression for #1812: ``pre_process_docx`` must not + raise on a .docx whose local headers differ from the central directory + in case only, and the resulting archive must still contain the + pre-processed ``word/document.xml`` payload.""" + bad = _build_minimal_docx_bytes( + local_name_override={"customXml/item2.xml": "customXML/item2.xml"} + ) + out = pre_process_docx(BytesIO(bad)) + out.seek(0) + with zipfile.ZipFile(out, mode="r") as zf: + # word/document.xml is one of the pre-processed files; it should be + # present and contain the original body text (no math substitution + # occurs on this minimal payload). + body = zf.read("word/document.xml") + assert b"hello world" in body + # The previously-mismatched entry survived too. + assert zf.read("customXml/item2.xml") == b""