diff --git a/packages/markitdown/src/markitdown/converter_utils/docx/pre_process.py b/packages/markitdown/src/markitdown/converter_utils/docx/pre_process.py index d6fa8db69..13c6e3d00 100644 --- a/packages/markitdown/src/markitdown/converter_utils/docx/pre_process.py +++ b/packages/markitdown/src/markitdown/converter_utils/docx/pre_process.py @@ -1,3 +1,4 @@ +import struct import zipfile from io import BytesIO from typing import BinaryIO @@ -115,6 +116,63 @@ def _pre_process_math(content: bytes) -> bytes: return str(soup).encode() +def _fix_zip_name_casing(input_docx: BinaryIO) -> BinaryIO: + """Repair .docx zips whose local file headers disagree with the central + directory only in case (e.g. ``customXML/item2.xml`` locally vs + ``customXml/item2.xml`` in the central directory). + + Some .docx producers emit such files. Most zip tools accept them, but + Python's :mod:`zipfile` strictly validates and raises ``BadZipFile``. + The central directory is authoritative per APPNOTE; we patch the local + file header bytes in-memory to match it. The patch is byte-length + preserving (case-only differences in ASCII paths never change length), + so no offset recomputation is needed. + + If no fix is required the original stream is returned unchanged + (rewound to position 0). See markitdown #1812. + """ + input_docx.seek(0) + raw = bytearray(input_docx.read()) + patched = False + try: + zf = zipfile.ZipFile(BytesIO(raw), mode="r") + except zipfile.BadZipFile: + # Not even the central directory parses — nothing we can patch here. + # Let the caller see the same error the unfixed code path would. + input_docx.seek(0) + return input_docx + try: + for info in zf.infolist(): + offset = info.header_offset + # Local file header signature is "PK\x03\x04". If we don't see + # it we have a malformed zip beyond what this fixer targets. + if raw[offset : offset + 4] != b"PK\x03\x04": + continue + # Per APPNOTE local file header layout: + # bytes 0..3 signature + # bytes 26..27 file name length (little-endian uint16) + # bytes 30.. file name + fname_len = struct.unpack_from(" BinaryIO: """ Pre-processes a DOCX file with provided steps. @@ -129,6 +187,11 @@ def pre_process_docx(input_docx: BinaryIO) -> BinaryIO: Returns: BinaryIO: A binary output stream representing the processed DOCX file. """ + # Repair .docx zips whose local headers disagree with the central + # directory only in case — Python's zipfile rejects these but most other + # zip tools accept them, so .docx producers occasionally emit them + # (markitdown #1812). + input_docx = _fix_zip_name_casing(input_docx) output_docx = BytesIO() # The files that need to be pre-processed from .docx pre_process_enable_files = [ diff --git a/packages/markitdown/tests/test_docx_zip_case_mismatch.py b/packages/markitdown/tests/test_docx_zip_case_mismatch.py new file mode 100644 index 000000000..4c0e71248 --- /dev/null +++ b/packages/markitdown/tests/test_docx_zip_case_mismatch.py @@ -0,0 +1,127 @@ +"""Regression test for #1812. + +Some .docx producers emit zip files where local file header names disagree +with the central-directory names in case only (e.g. ``customXML/item2.xml`` +locally, ``customXml/item2.xml`` centrally). Most zip tools accept this, +but Python's :mod:`zipfile` raises ``BadZipFile``. ``pre_process_docx`` +must repair the archive before opening so .docx conversion does not crash +on these files. +""" + +from __future__ import annotations + +import struct +import zipfile +from io import BytesIO + +import pytest + +from markitdown.converter_utils.docx.pre_process import ( + _fix_zip_name_casing, + pre_process_docx, +) + + +_MINIMAL_DOCUMENT_XML = ( + b'' + b'' + b"hello world" + b"" +) + + +def _build_minimal_docx_bytes(local_name_override: dict[str, str] | None = None) -> bytes: + """Build a minimal .docx zip in memory. + + If ``local_name_override`` is supplied, after writing the archive the + local file header bytes for the listed entry are patched in place so + the local name differs from the central-directory name (case only — + same byte length). + """ + buf = BytesIO() + with zipfile.ZipFile(buf, mode="w") as zf: + zf.writestr("word/document.xml", _MINIMAL_DOCUMENT_XML) + # An entry whose case will be munged by the override below. + zf.writestr("customXml/item2.xml", b"") + raw = bytearray(buf.getvalue()) + if not local_name_override: + return bytes(raw) + + # Patch local file headers in place. The central directory remains + # intact, so opening the resulting bytes with stock zipfile fails — the + # exact bug we want to reproduce. + info_offsets: dict[str, int] = {} + with zipfile.ZipFile(BytesIO(bytes(raw)), mode="r") as zf: + for info in zf.infolist(): + info_offsets[info.filename] = info.header_offset + for central_name, replacement_local_name in local_name_override.items(): + offset = info_offsets[central_name] + # Sanity: the local file header signature must be at this offset. + assert raw[offset : offset + 4] == b"PK\x03\x04" + fname_len = struct.unpack_from("" + + +def test_fix_zip_name_casing_passes_through_normal_archive(): + """A correctly-formed .docx must not be rewritten — return the same + stream rewound to the start so callers can keep using it.""" + good = _build_minimal_docx_bytes(local_name_override=None) + src = BytesIO(good) + out = _fix_zip_name_casing(src) + # Either the same object or an equivalent BytesIO; either way the bytes + # we get back should match the input we started with. + out.seek(0) + assert out.read() == good + + +def test_pre_process_docx_accepts_case_mismatched_archive(): + """End-to-end regression for #1812: ``pre_process_docx`` must not + raise on a .docx whose local headers differ from the central directory + in case only, and the resulting archive must still contain the + pre-processed ``word/document.xml`` payload.""" + bad = _build_minimal_docx_bytes( + local_name_override={"customXml/item2.xml": "customXML/item2.xml"} + ) + out = pre_process_docx(BytesIO(bad)) + out.seek(0) + with zipfile.ZipFile(out, mode="r") as zf: + # word/document.xml is one of the pre-processed files; it should be + # present and contain the original body text (no math substitution + # occurs on this minimal payload). + body = zf.read("word/document.xml") + assert b"hello world" in body + # The previously-mismatched entry survived too. + assert zf.read("customXml/item2.xml") == b""