From adf68e752dac4da39642e99180476d011fdb0ed5 Mon Sep 17 00:00:00 2001 From: yousrae2004 <183547839+yousrae2004@users.noreply.github.com> Date: Fri, 8 May 2026 19:51:41 -0700 Subject: [PATCH 1/3] feat: add DocConverter for legacy .doc files using unword parser --- .../markitdown/src/markitdown/_markitdown.py | 2 + .../src/markitdown/converters/__init__.py | 2 + .../markitdown/converters/_doc_converter.py | 73 ++++++++++++++++++ packages/markitdown/tests/_test_vectors.py | 13 ++++ packages/markitdown/tests/test_files/test.doc | Bin 0 -> 9216 bytes 5 files changed, 90 insertions(+) create mode 100644 packages/markitdown/src/markitdown/converters/_doc_converter.py create mode 100644 packages/markitdown/tests/test_files/test.doc diff --git a/packages/markitdown/src/markitdown/_markitdown.py b/packages/markitdown/src/markitdown/_markitdown.py index f342a614b..614022660 100644 --- a/packages/markitdown/src/markitdown/_markitdown.py +++ b/packages/markitdown/src/markitdown/_markitdown.py @@ -29,6 +29,7 @@ BingSerpConverter, PdfConverter, DocxConverter, + DocConverter, XlsxConverter, XlsConverter, PptxConverter, @@ -192,6 +193,7 @@ def enable_builtins(self, **kwargs) -> None: self.register_converter(YouTubeConverter()) self.register_converter(BingSerpConverter()) self.register_converter(DocxConverter()) + self.register_converter(DocConverter()) self.register_converter(XlsxConverter()) self.register_converter(XlsConverter()) self.register_converter(PptxConverter()) diff --git a/packages/markitdown/src/markitdown/converters/__init__.py b/packages/markitdown/src/markitdown/converters/__init__.py index e4437a582..56475bdb6 100644 --- a/packages/markitdown/src/markitdown/converters/__init__.py +++ b/packages/markitdown/src/markitdown/converters/__init__.py @@ -11,6 +11,7 @@ from ._bing_serp_converter import BingSerpConverter from ._pdf_converter import PdfConverter from ._docx_converter import DocxConverter +from ._doc_converter import DocConverter from ._xlsx_converter import XlsxConverter, XlsConverter from ._pptx_converter import PptxConverter from ._image_converter import ImageConverter @@ -34,6 +35,7 @@ "BingSerpConverter", "PdfConverter", "DocxConverter", + "DocConverter", "XlsxConverter", "XlsConverter", "PptxConverter", diff --git a/packages/markitdown/src/markitdown/converters/_doc_converter.py b/packages/markitdown/src/markitdown/converters/_doc_converter.py new file mode 100644 index 000000000..626894516 --- /dev/null +++ b/packages/markitdown/src/markitdown/converters/_doc_converter.py @@ -0,0 +1,73 @@ +import sys +#import re +from typing import Any, BinaryIO + +from .._stream_info import StreamInfo +from .._base_converter import DocumentConverter, DocumentConverterResult +from .._exceptions import MissingDependencyException, MISSING_DEPENDENCY_MESSAGE + +#using unword dependency +_dependency_exc_info = None +unword = None +try: + import unword as _unword + unword = _unword +except ImportError: + _dependency_exc_info = sys.exc_info() + +ACCEPTED_MIME_TYPE_PREFIXES = [ + "application/msword", "application/x-msword" +] + +ACCEPTED_FILE_EXTENSIONS = [".doc"] + + +class DocConverter(DocumentConverter): + """ + Converts DOC (Word 97-2003) files to Markdown. Uses unword package + as parser backendto extract body text with heading levels, + page breaks, and textbox contents. + """ + + def accepts( + self, + file_stream: BinaryIO, + stream_info: StreamInfo, + **kwargs: Any, + ) -> bool: + mimetype = (stream_info.mimetype or "").lower() + extension = (stream_info.extension or "").lower() + + if extension in ACCEPTED_FILE_EXTENSIONS: + return True + + for prefix in ACCEPTED_MIME_TYPE_PREFIXES: + if mimetype.startswith(prefix): + return True + + return False + + def convert( + self, + file_stream: BinaryIO, + stream_info: StreamInfo, + **kwargs: Any, # Options to pass to the converter + ) -> DocumentConverterResult: + # Check: the dependencies + if _dependency_exc_info is not None: + raise MissingDependencyException( + MISSING_DEPENDENCY_MESSAGE.format( + converter=type(self).__name__, + extension=".doc", + feature="doc", + ) + ) from _dependency_exc_info[1].with_traceback( # type: ignore[union-attr] + _dependency_exc_info[2] + ) + assert unword is not None + + # parse_doc takes raw bytes and returns a Document object + doc = unword.parse_doc(file_stream.read()) + + + return DocumentConverterResult(markdown=doc.body_text.strip(), title=None) diff --git a/packages/markitdown/tests/_test_vectors.py b/packages/markitdown/tests/_test_vectors.py index 74fa9bd0a..2aef247af 100644 --- a/packages/markitdown/tests/_test_vectors.py +++ b/packages/markitdown/tests/_test_vectors.py @@ -31,6 +31,19 @@ class FileTestVector(object): "data:image/png;base64,iVBORw0KGgoAAAANSU", ], ), + FileTestVector( + filename="test.doc", + mimetype="application/msword", + charset=None, + url=None, + must_include=[ + "93d437af-bc31-492f-a7fc-3cbc9b7c1710", + "fd0ed3e3-6373-4446-815c-4b979f6063a9", + "#Test for converting .doc files to MD format", + "Let’s test it!", + ], + must_not_include=[], + ), FileTestVector( filename="test.xlsx", mimetype="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", diff --git a/packages/markitdown/tests/test_files/test.doc b/packages/markitdown/tests/test_files/test.doc new file mode 100644 index 0000000000000000000000000000000000000000..cbba9394796411ff8e45255ee544008eae6ef9d3 GIT binary patch literal 9216 zcmeI1O>9(E6vywIPCso+nbyje@+c!9(3XBsT0w2g0Hsh`D1wS&rkw(#bOxt`LgE57 z8W)gYFq#;nF@dW{xwf z?x_}!N$=84%k=faP5oST^71DwkjqvoPq%++{hsdkuYEw@|6Je$)u0C44QjzXU>=wc z7J!9d5x5sf)?!jSnXe;V3jE-Huna5*E5HL_C0GSkgL=>a8o?US1e!q$Xa#FQ8(0V0 zK?isctOpywL*QZX2d23tT6*a~{VHn1J^fgPY941k?r7Z?P)!J}Xg z*b5#5j{}W&UoxGXso9aU(5`?5UfL9uYDYKbOXq1eYySGe!?EaC^g!I+8;ynP1JP%X zj*dj)&YsXWC|)2s?0ACm4ZJwz4ejQSj!&OUbabmH{+AGXt4g;h^Wk-6oI>_Zo=nVs zT=TaHk5cxs9jtFLGirk7Fz?#On*7#Eztx4iwV3XeroXqq>;xNo=b6!-Qtuep*5fe) zy_f%{k=s9FGBbt1Qwcvphfs{ca{2)PnY7MQF=Mf7qgolkFiec>#H)pP>p5LBj4WH zlkjZQSY=s0%O>x|U&Ch1sq5i8_0o4SK3_+_byg=eip?=&7?XlY^7bu(MfOBPOF3@ip^^Pd+EU7c$kh9WgIDV%22+X ze39~y>WVyCRzza(iBT=_*iw5MZ^=!&8O|swjbV-*q%V+ao0AJ@kZbA3jn55Y@KG~F zOF2%s(`6UxcI_QdB(XO!PrV ziX>>++v>9IifT6?>|)a(HkXZ)$-3SOScz+ zR!o)ae8U22I)P#|olmsC>D*BcLO_mt1W3ZqH;lP2nb*0g6P&oR&rjmym5MsSTP5$k zwCDRP&OUMS%%yY6t1Od#IcHgYb}1jgHQyJB!h{Kvt9zx{7>phcM%t$^Y^k@_G&*}j zqG4(W$dr+KpO~2A>>N@=`IH@$&!pEK#yd%!Q)yRsV_i zfXw|5>1+C#39Vu-hSKDJM?X51=jp#`0r`pi@OG2OT)y2R^YZc*$Xj4mS)hX$Iz+VE z!i`IiP^F$gW|+4oa;@CHG&{T?r3c{GQ!?z-=-|1Id=t+mZgMIp50Q@WY~k5L-8P=< zoLnn?wenuinWu?69`)2p4;_?u(6Wt|svYF5g1aO|$%=e!$`t+Ev`~cmJp7mw{3sr` zxk(DaA9m`63G)&`V;r7Bnt;Pk-5$j0c{?@YX1=Y^GX9s0F!?3UNo!V3&FxW!bDkpl zJpC^%VCj0M7rY661-j9>o^A@4FX>J4`Ppm%qOJnJ3HnVJfupzHY*=GD2e?EZ@R-FH zE;ia1u0FKnLd07KmY$Ay{h)>N70sq{iWo3k{Hp-QbLpE)?=~!~I`cO1@9OL41Mmtn zcu&I9J++lY6{*@P*3HJJI8R9n{G0#(X$~^~Lzv3Y1!C$CBFN$jQdhG__(x^PJd3T|+Cr}|5w@0zRF30uXukP~`XC`q{m(hv3Q$1H8U$^<{&aHgiL4BO;Y%jAP;}jNf z&SBDI1P%26hC5m?gYhq+>(sqd;Z2gKXOH$7$bSO)?njv04$j0UAGvG){r}%T0j%!^ AEdT%j literal 0 HcmV?d00001 From 5890fb53683e9cea791b699f96eb22990cfe48b7 Mon Sep 17 00:00:00 2001 From: yousrae2004 <183547839+yousrae2004@users.noreply.github.com> Date: Sat, 16 May 2026 15:36:01 -0700 Subject: [PATCH 2/3] feat: extract and save PPTX images to disk for issue #56 --- .../src/markitdown/converters/_pptx_converter.py | 13 +++++++++++-- packages/markitdown/tests/_test_vectors.py | 2 +- 2 files changed, 12 insertions(+), 3 deletions(-) diff --git a/packages/markitdown/src/markitdown/converters/_pptx_converter.py b/packages/markitdown/src/markitdown/converters/_pptx_converter.py index 360f17706..95ecbe91e 100644 --- a/packages/markitdown/src/markitdown/converters/_pptx_converter.py +++ b/packages/markitdown/src/markitdown/converters/_pptx_converter.py @@ -147,8 +147,17 @@ def get_shape_content(shape, **kwargs): b64_string = base64.b64encode(blob).decode("utf-8") md_content += f"\n![{alt_text}](data:{content_type};base64,{b64_string})\n" else: - # A placeholder name - filename = re.sub(r"\W", "", shape.name) + ".jpg" + #save image to disk to reference + content_type = shape.image.content_type or "image/png" + ext = content_type.split("/")[-1] + filename = re.sub(r"\W", "", shape.name) + "." + ext + + #write image to same directory as output + output_dir = kwargs.get("output_dir", ".") + image_path = os.path.join(output_dir, filename) + with open(image_path, "wb") as img_file: + img_file.write(shape.image.blob) + md_content += "\n![" + alt_text + "](" + filename + ")\n" # Tables diff --git a/packages/markitdown/tests/_test_vectors.py b/packages/markitdown/tests/_test_vectors.py index 2aef247af..939d71ef3 100644 --- a/packages/markitdown/tests/_test_vectors.py +++ b/packages/markitdown/tests/_test_vectors.py @@ -81,7 +81,7 @@ class FileTestVector(object): "AutoGen: Enabling Next-Gen LLM Applications via Multi-Agent Conversation", "a3f6004b-6f4f-4ea8-bee3-3741f4dc385f", # chart title "2003", # chart value - "![This phrase of the caption is Human-written.](Picture4.jpg)", + "![This phrase of the caption is Human-written.](Picture4.jpeg)", ], must_not_include=["data:image/jpeg;base64,/9j/4AAQSkZJRgABAQE"], ), From 3ad7625ae5a625646a30f4f2db3fd23c3abdf2f3 Mon Sep 17 00:00:00 2001 From: yousrae2004 <183547839+yousrae2004@users.noreply.github.com> Date: Sat, 16 May 2026 20:57:14 -0700 Subject: [PATCH 3/3] apply review suggestions for doc and pptx conversion code --- .../markitdown/converters/_doc_converter.py | 16 ++++++------- .../markitdown/converters/_pptx_converter.py | 24 +++++++++++++++---- packages/markitdown/tests/_test_vectors.py | 7 ++++-- 3 files changed, 32 insertions(+), 15 deletions(-) diff --git a/packages/markitdown/src/markitdown/converters/_doc_converter.py b/packages/markitdown/src/markitdown/converters/_doc_converter.py index 626894516..cb4a71c0c 100644 --- a/packages/markitdown/src/markitdown/converters/_doc_converter.py +++ b/packages/markitdown/src/markitdown/converters/_doc_converter.py @@ -1,10 +1,9 @@ import sys -#import re from typing import Any, BinaryIO from .._stream_info import StreamInfo from .._base_converter import DocumentConverter, DocumentConverterResult -from .._exceptions import MissingDependencyException, MISSING_DEPENDENCY_MESSAGE +from .._exceptions import MissingDependencyException, MISSING_DEPENDENCY_MESSAGE, FileConversionException #using unword dependency _dependency_exc_info = None @@ -25,7 +24,7 @@ class DocConverter(DocumentConverter): """ Converts DOC (Word 97-2003) files to Markdown. Uses unword package - as parser backendto extract body text with heading levels, + as parser backend to extract body text with heading levels, page breaks, and textbox contents. """ @@ -64,10 +63,11 @@ def convert( ) from _dependency_exc_info[1].with_traceback( # type: ignore[union-attr] _dependency_exc_info[2] ) - assert unword is not None - - # parse_doc takes raw bytes and returns a Document object - doc = unword.parse_doc(file_stream.read()) + try: + doc = unword.parse_doc(file_stream.read()) + except Exception as e: + raise FileConversionException(f"Failed to parse .doc file: {e}") from e - return DocumentConverterResult(markdown=doc.body_text.strip(), title=None) + title = getattr(doc, "title", None) or getattr(doc, "metadata", {}).get("title") + return DocumentConverterResult(markdown=doc.body_text.strip(), title=title) diff --git a/packages/markitdown/src/markitdown/converters/_pptx_converter.py b/packages/markitdown/src/markitdown/converters/_pptx_converter.py index 95ecbe91e..22abadd39 100644 --- a/packages/markitdown/src/markitdown/converters/_pptx_converter.py +++ b/packages/markitdown/src/markitdown/converters/_pptx_converter.py @@ -4,6 +4,7 @@ import io import re import html +import hashlib from typing import BinaryIO, Any from operator import attrgetter @@ -140,6 +141,8 @@ def get_shape_content(shape, **kwargs): alt_text = re.sub(r"[\r\n\[\]]", " ", alt_text) alt_text = re.sub(r"\s+", " ", alt_text).strip() + output_dir = kwargs.get("output_dir") + # If keep_data_uris is True, use base64 encoding for images if kwargs.get("keep_data_uris", False): blob = shape.image.blob @@ -148,17 +151,28 @@ def get_shape_content(shape, **kwargs): md_content += f"\n![{alt_text}](data:{content_type};base64,{b64_string})\n" else: #save image to disk to reference + blob = shape.image.blob content_type = shape.image.content_type or "image/png" - ext = content_type.split("/")[-1] - filename = re.sub(r"\W", "", shape.name) + "." + ext + ext_map = {"jpeg": "jpg", "svg+xml": "svg"} + raw_ext = content_type.split("/")[-1] + ext = ext_map.get(raw_ext, raw_ext) + + #add filename collision handling + suffix = hashlib.md5(blob).hexdigest()[:8] + safe_name = re.sub(r"\W", "", shape.name) if shape.name else "image" + filename = f"{safe_name}_{suffix}.{ext}" - #write image to same directory as output output_dir = kwargs.get("output_dir", ".") image_path = os.path.join(output_dir, filename) - with open(image_path, "wb") as img_file: - img_file.write(shape.image.blob) + + try: + with open(image_path, "wb") as img_file: + img_file.write(blob) + except OSError as e: + raise OSError(f"Failed to write image to '{image_path}'.") from e md_content += "\n![" + alt_text + "](" + filename + ")\n" + # Tables if self._is_table(shape): diff --git a/packages/markitdown/tests/_test_vectors.py b/packages/markitdown/tests/_test_vectors.py index 939d71ef3..82b824cbb 100644 --- a/packages/markitdown/tests/_test_vectors.py +++ b/packages/markitdown/tests/_test_vectors.py @@ -42,7 +42,10 @@ class FileTestVector(object): "#Test for converting .doc files to MD format", "Let’s test it!", ], - must_not_include=[], + must_not_include=[ + "d0cf11e0", + "\x00" + ], ), FileTestVector( filename="test.xlsx", @@ -81,7 +84,7 @@ class FileTestVector(object): "AutoGen: Enabling Next-Gen LLM Applications via Multi-Agent Conversation", "a3f6004b-6f4f-4ea8-bee3-3741f4dc385f", # chart title "2003", # chart value - "![This phrase of the caption is Human-written.](Picture4.jpeg)", + "![This phrase of the caption is Human-written.](Picture4_", ], must_not_include=["data:image/jpeg;base64,/9j/4AAQSkZJRgABAQE"], ),