From adf68e752dac4da39642e99180476d011fdb0ed5 Mon Sep 17 00:00:00 2001
From: yousrae2004 <183547839+yousrae2004@users.noreply.github.com>
Date: Fri, 8 May 2026 19:51:41 -0700
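Subject: [PATCH 1/3] feat: add DocConverter for legacy .doc files using unword
 parser

Add a DocConverter and register it with the built-in converters. It
accepts streams with a .doc extension or an application/msword /
application/x-msword MIME type, parses the bytes with unword.parse_doc(),
and returns the document's body_text as the Markdown result. If unword
cannot be imported, convert() raises MissingDependencyException.

Also adds a test.doc fixture and a matching test vector.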

---
 .../markitdown/src/markitdown/_markitdown.py  |   2 +
 .../src/markitdown/converters/__init__.py     |   2 +
 .../markitdown/converters/_doc_converter.py   |  73 ++++++++++++++++++
 packages/markitdown/tests/_test_vectors.py    |  13 ++++
 packages/markitdown/tests/test_files/test.doc | Bin 0 -> 9216 bytes
 5 files changed, 90 insertions(+)
 create mode 100644 packages/markitdown/src/markitdown/converters/_doc_converter.py
 create mode 100644 packages/markitdown/tests/test_files/test.doc

diff --git a/packages/markitdown/src/markitdown/_markitdown.py b/packages/markitdown/src/markitdown/_markitdown.py
index f342a614b..614022660 100644
--- a/packages/markitdown/src/markitdown/_markitdown.py
+++ b/packages/markitdown/src/markitdown/_markitdown.py
@@ -29,6 +29,7 @@
     BingSerpConverter,
     PdfConverter,
     DocxConverter,
+    DocConverter,
     XlsxConverter,
     XlsConverter,
     PptxConverter,
@@ -192,6 +193,7 @@ def enable_builtins(self, **kwargs) -> None:
             self.register_converter(YouTubeConverter())
             self.register_converter(BingSerpConverter())
             self.register_converter(DocxConverter())
+            self.register_converter(DocConverter())
             self.register_converter(XlsxConverter())
             self.register_converter(XlsConverter())
             self.register_converter(PptxConverter())
diff --git a/packages/markitdown/src/markitdown/converters/__init__.py b/packages/markitdown/src/markitdown/converters/__init__.py
index e4437a582..56475bdb6 100644
--- a/packages/markitdown/src/markitdown/converters/__init__.py
+++ b/packages/markitdown/src/markitdown/converters/__init__.py
@@ -11,6 +11,7 @@
 from ._bing_serp_converter import BingSerpConverter
 from ._pdf_converter import PdfConverter
 from ._docx_converter import DocxConverter
+from ._doc_converter import DocConverter
 from ._xlsx_converter import XlsxConverter, XlsConverter
 from ._pptx_converter import PptxConverter
 from ._image_converter import ImageConverter
@@ -34,6 +35,7 @@
     "BingSerpConverter",
     "PdfConverter",
     "DocxConverter",
+    "DocConverter",
     "XlsxConverter",
     "XlsConverter",
     "PptxConverter",
diff --git a/packages/markitdown/src/markitdown/converters/_doc_converter.py b/packages/markitdown/src/markitdown/converters/_doc_converter.py
new file mode 100644
index 000000000..626894516
--- /dev/null
+++ b/packages/markitdown/src/markitdown/converters/_doc_converter.py
@@ -0,0 +1,73 @@
+import sys
+#import re
+from typing import Any, BinaryIO
+
+from .._stream_info import StreamInfo
+from .._base_converter import DocumentConverter, DocumentConverterResult
+from .._exceptions import MissingDependencyException, MISSING_DEPENDENCY_MESSAGE
+
+#using unword dependency
+_dependency_exc_info = None
+unword = None
+try:
+    import unword as _unword
+    unword = _unword
+except ImportError:
+    _dependency_exc_info = sys.exc_info()
+
+ACCEPTED_MIME_TYPE_PREFIXES = [
+    "application/msword", "application/x-msword"
+]
+
+ACCEPTED_FILE_EXTENSIONS = [".doc"]
+
+
+class DocConverter(DocumentConverter):
+    """
+    Converts DOC (Word 97-2003) files to Markdown. Uses unword package 
+    as parser backendto extract body text with heading levels,
+    page breaks, and textbox contents.
+    """
+
+    def accepts(
+        self,
+        file_stream: BinaryIO,
+        stream_info: StreamInfo,
+        **kwargs: Any,
+    ) -> bool:
+        mimetype = (stream_info.mimetype or "").lower()
+        extension = (stream_info.extension or "").lower()
+
+        if extension in ACCEPTED_FILE_EXTENSIONS:
+            return True
+
+        for prefix in ACCEPTED_MIME_TYPE_PREFIXES:
+            if mimetype.startswith(prefix):
+                return True
+
+        return False
+
+    def convert(
+        self,
+        file_stream: BinaryIO,
+        stream_info: StreamInfo,
+        **kwargs: Any,  # Options to pass to the converter
+    ) -> DocumentConverterResult:
+        # Check: the dependencies
+        if _dependency_exc_info is not None:
+            raise MissingDependencyException(
+                MISSING_DEPENDENCY_MESSAGE.format(
+                    converter=type(self).__name__,
+                    extension=".doc",
+                    feature="doc",
+                )
+            ) from _dependency_exc_info[1].with_traceback(  # type: ignore[union-attr]
+                _dependency_exc_info[2]
+            )
+        assert unword is not None
+
+        # parse_doc takes raw bytes and returns a Document object
+        doc = unword.parse_doc(file_stream.read())
+        
+
+        return DocumentConverterResult(markdown=doc.body_text.strip(), title=None)
diff --git a/packages/markitdown/tests/_test_vectors.py b/packages/markitdown/tests/_test_vectors.py
index 74fa9bd0a..2aef247af 100644
--- a/packages/markitdown/tests/_test_vectors.py
+++ b/packages/markitdown/tests/_test_vectors.py
@@ -31,6 +31,19 @@ class FileTestVector(object):
             "data:image/png;base64,iVBORw0KGgoAAAANSU",
         ],
     ),
+    FileTestVector(
+        filename="test.doc",
+        mimetype="application/msword",
+        charset=None,
+        url=None,
+        must_include=[
+            "93d437af-bc31-492f-a7fc-3cbc9b7c1710",
+            "fd0ed3e3-6373-4446-815c-4b979f6063a9",
+            "#Test for converting .doc files to MD format",
+            "Let’s test it!",
+        ],
+        must_not_include=[],
+    ),
     FileTestVector(
         filename="test.xlsx",
         mimetype="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
diff --git a/packages/markitdown/tests/test_files/test.doc b/packages/markitdown/tests/test_files/test.doc
new file mode 100644
index 0000000000000000000000000000000000000000..cbba9394796411ff8e45255ee544008eae6ef9d3
GIT binary patch
literal 9216
zcmeI1O>9(E6vywIPCso+nbyje@+c!9(3XBsT0w2g0Hsh`D1wS&rkw(#bOxt`LgE57
z8W)gYFq#;nF@d<y5Y(L;-RRCpOjwYRhzny(j6vgu1+BmTn;AO}9p<$fQgqJb|L%R~
zew=g9J@4Lo?p*wK?$r-IS@46&p!KG}j8BxBqBOomeopGW$`}uM@x~`6CQ@v&dAfrv
za3iU!=FrO|76Q$AF(?70pbXpv%0UIF1geh<IO!cU+`Ot87T9H?{Nlzon@of>W{xwf
z?x_}!N$=84%k=faP5oST^71DwkjqvoPq%++{hsdkuYEw@|6Je$)u0C44QjzXU>=wc
z7J!9d5x5sf)?!jSnXe;V3jE-Huna5*E5HL_C0GSkgL=>a8o?US1e!q$Xa#FQ8(0V0
zK?isctOpywL*QZX2<QY|pc`xi0k8>d23tT6*a~{VHn1J^fgPY941k?r7Z?P)!J}Xg
z*b5#5j{}W&UoxGXso9aU(5`?5UfL9uYDYKbOXq1eYySGe!?EaC^g!I+8;ynP1JP%X
zj*dj)&YsXWC|)2s?0ACm4ZJwz4ejQSj!&OUbabmH{+AGXt4g;h^Wk-6oI>_Zo=nVs
zT=TaHk5cxs9jtFLGirk7Fz?#On*7#Eztx4iwV3XeroXqq>;xNo=b6!-Qtuep*5fe)
zy<QXP@s`BF{@y|pr2Ofg4(*cJX|ow$Y#EOwA1cO!MvxmaVb<1za4OKU`s_7RbM}}i
z@>_f%{k=s9FGBbt1Qwcvphfs{ca{2)PnY7MQF=Mf7qgolkFiec>#H)pP>p5LBj4WH
zlkjZQSY=s0%O>x|U&Ch1sq5i8_0o4SK3_+_byg=eip?<E9ic{;{v&WI_?D~DP5PCi
zpU*K$TZyHn8vj!z)-q>=&7?XlY^7bu(MfOBPOF3@ip^^Pd+EU7c$kh9WgIDV%22+X
ze39~y>WVyCRzza(iBT=_*iw5MZ^=!&8O|swjbV-*q%V+ao0AJ@kZbA3jn55Y@KG~F
zOF2%<GI!FuaJF#L*}_P&v_j?}P1F#Y#%PzC44WlIoUl1S>s(`6UxcI_QdB(XO!PrV
ziX>>++v>9I<j8E6xd@sK)0-yWW?9q%IEam`)bvSQ7RokW&CeKZCMBVr-`&V2+ho(O
z8&|YSPO++e#q=L?%oPv(_~B6?MHBYZ0rzF+O&_|~ymDG;CKnd8ynvyLR3%R{wt#QV
zjGJa{nDK?%&Lrjb&1q|LBC7_j3dV_?S?6!6D`&j=>ifT6?>|)a(HkXZ)$-3SOScz+
zR!o)ae8U22I)P#|olmsC>D*BcLO_mt1W3ZqH;lP2nb*0g6P&oR&rjmym5MsSTP5$k
zwCDRP&OUMS%%yY6t1Od#IcHgYb}1jgHQyJB!h{Kvt9zx{7>phcM%t$^Y^k@_G&*}j
zqG4(W$dr+KpO~2A>>N@=`IH@$&!pEK#yd%!Q)y<c^mo;%NoVB9pQpa8{CP4>RsV_i
zfXw|5>1+C#39Vu-hSKDJM?X51=jp#`0r`pi@OG2OT)y2R^YZc*$Xj4mS)hX$Iz+VE
z!i`IiP^F$gW|+4oa;@CHG&{T?r3c{GQ!?z-=-|1Id=t+mZgMIp50Q@WY~k5L-8P=<
zoLnn?wenuinWu?69`)2p4;_?u(6Wt|svYF5g1aO|$%=e!$`t+Ev`~cmJp7mw{3sr`
zxk(DaA9m`63G)&`V;r7Bnt;Pk-5$j0c{?@YX1=Y^GX9s0F!?3UNo!V3&FxW!bDkpl
zJpC^%VCj0M7rY661-j9>o^A@4FX>J4`Ppm%qOJnJ3HnVJfupzHY*=GD2e?EZ@R-FH
zE;ia1u0FKnLd07KmY$Ay{h)>N70sq{iWo3k{Hp-QbLpE)?=~!~I`cO1@9OL41Mmtn
zcu&I9J++lY6{*@P*3HJJI8R9n{G0#(X$~^~Lzv3Y1!C$CBFN$jQdhG__(x^PJd<Ef
z<LP6zHQxlii5q5@ZBvpg<Z9oaDU~5z|D9+|O5OA$eN#6LO`KJNjC`g#R3_z1w#bcq
z>3T|+Cr}|5w@0zRF30uXukP~`XC`q{m(hv3Q$1H8U$^<{&aHgiL4BO;Y%jAP;}jNf
z&SBDI1P%26hC5m?gYhq+>(sqd;Z2gKXOH$7$bSO)?njv04$j0UAGvG){r}%T0j%!^
AEdT%j

literal 0
HcmV?d00001


From 5890fb53683e9cea791b699f96eb22990cfe48b7 Mon Sep 17 00:00:00 2001
From: yousrae2004 <183547839+yousrae2004@users.noreply.github.com>
Date: Sat, 16 May 2026 15:36:01 -0700
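Subject: [PATCH 2/3] feat: extract and save PPTX images to disk for issue #56

When keep_data_uris is not enabled, write each picture's image blob to
output_dir (a converter kwarg, defaulting to the current directory) and
reference that file from the generated Markdown. The file extension is
now taken from the image content type, falling back to png, instead of
always being .jpg.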

---
 .../src/markitdown/converters/_pptx_converter.py    | 13 +++++++++++--
 packages/markitdown/tests/_test_vectors.py          |  2 +-
 2 files changed, 12 insertions(+), 3 deletions(-)

diff --git a/packages/markitdown/src/markitdown/converters/_pptx_converter.py b/packages/markitdown/src/markitdown/converters/_pptx_converter.py
index 360f17706..95ecbe91e 100644
--- a/packages/markitdown/src/markitdown/converters/_pptx_converter.py
+++ b/packages/markitdown/src/markitdown/converters/_pptx_converter.py
@@ -147,8 +147,17 @@ def get_shape_content(shape, **kwargs):
                         b64_string = base64.b64encode(blob).decode("utf-8")
                         md_content += f"\n![{alt_text}](data:{content_type};base64,{b64_string})\n"
                     else:
-                        # A placeholder name
-                        filename = re.sub(r"\W", "", shape.name) + ".jpg"
+                        #save image to disk to reference
+                        content_type = shape.image.content_type or "image/png"
+                        ext = content_type.split("/")[-1]
+                        filename = re.sub(r"\W", "", shape.name) + "." + ext
+
+                        #write image to same directory as output
+                        output_dir = kwargs.get("output_dir", ".")
+                        image_path = os.path.join(output_dir, filename)
+                        with open(image_path, "wb") as img_file:
+                            img_file.write(shape.image.blob)
+
                         md_content += "\n![" + alt_text + "](" + filename + ")\n"
 
                 # Tables
diff --git a/packages/markitdown/tests/_test_vectors.py b/packages/markitdown/tests/_test_vectors.py
index 2aef247af..939d71ef3 100644
--- a/packages/markitdown/tests/_test_vectors.py
+++ b/packages/markitdown/tests/_test_vectors.py
@@ -81,7 +81,7 @@ class FileTestVector(object):
             "AutoGen: Enabling Next-Gen LLM Applications via Multi-Agent Conversation",
             "a3f6004b-6f4f-4ea8-bee3-3741f4dc385f",  # chart title
             "2003",  # chart value
-            "![This phrase of the caption is Human-written.](Picture4.jpg)",
+            "![This phrase of the caption is Human-written.](Picture4.jpeg)",
         ],
         must_not_include=["data:image/jpeg;base64,/9j/4AAQSkZJRgABAQE"],
     ),

From 3ad7625ae5a625646a30f4f2db3fd23c3abdf2f3 Mon Sep 17 00:00:00 2001
From: yousrae2004 <183547839+yousrae2004@users.noreply.github.com>
Date: Sat, 16 May 2026 20:57:14 -0700
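Subject: [PATCH 3/3] apply review suggestions for doc and pptx conversion code

DocConverter:
- raise FileConversionException when unword.parse_doc() fails
- set the result title from doc.title or doc.metadata.get("title")
- drop the commented-out import and the assert; fix a docstring typo

PptxConverter:
- map the jpeg and svg+xml content subtypes to the jpg and svg extensions
- append the first 8 hex digits of the image's MD5 to the file name, and
  use "image" as the base name when the shape has no name
- include the target path in the OSError raised when a write fails

Tests:
- require that the .doc output contains neither "d0cf11e0" nor NUL bytes
- match the PPTX image link by its "Picture4_" prefix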

---
 .../markitdown/converters/_doc_converter.py   | 16 ++++++-------
 .../markitdown/converters/_pptx_converter.py  | 24 +++++++++++++++----
 packages/markitdown/tests/_test_vectors.py    |  7 ++++--
 3 files changed, 32 insertions(+), 15 deletions(-)

diff --git a/packages/markitdown/src/markitdown/converters/_doc_converter.py b/packages/markitdown/src/markitdown/converters/_doc_converter.py
index 626894516..cb4a71c0c 100644
--- a/packages/markitdown/src/markitdown/converters/_doc_converter.py
+++ b/packages/markitdown/src/markitdown/converters/_doc_converter.py
@@ -1,10 +1,9 @@
 import sys
-#import re
 from typing import Any, BinaryIO
 
 from .._stream_info import StreamInfo
 from .._base_converter import DocumentConverter, DocumentConverterResult
-from .._exceptions import MissingDependencyException, MISSING_DEPENDENCY_MESSAGE
+from .._exceptions import MissingDependencyException, MISSING_DEPENDENCY_MESSAGE, FileConversionException
 
 #using unword dependency
 _dependency_exc_info = None
@@ -25,7 +24,7 @@
 class DocConverter(DocumentConverter):
     """
     Converts DOC (Word 97-2003) files to Markdown. Uses unword package 
-    as parser backendto extract body text with heading levels,
+    as parser backend to extract body text with heading levels,
     page breaks, and textbox contents.
     """
 
@@ -64,10 +63,11 @@ def convert(
             ) from _dependency_exc_info[1].with_traceback(  # type: ignore[union-attr]
                 _dependency_exc_info[2]
             )
-        assert unword is not None
-
-        # parse_doc takes raw bytes and returns a Document object
-        doc = unword.parse_doc(file_stream.read())
         
+        try:
+            doc = unword.parse_doc(file_stream.read())
+        except Exception as e:
+            raise FileConversionException(f"Failed to parse .doc file: {e}") from e
 
-        return DocumentConverterResult(markdown=doc.body_text.strip(), title=None)
+        title = getattr(doc, "title", None) or getattr(doc, "metadata", {}).get("title")
+        return DocumentConverterResult(markdown=doc.body_text.strip(), title=title)
diff --git a/packages/markitdown/src/markitdown/converters/_pptx_converter.py b/packages/markitdown/src/markitdown/converters/_pptx_converter.py
index 95ecbe91e..22abadd39 100644
--- a/packages/markitdown/src/markitdown/converters/_pptx_converter.py
+++ b/packages/markitdown/src/markitdown/converters/_pptx_converter.py
@@ -4,6 +4,7 @@
 import io
 import re
 import html
+import hashlib
 
 from typing import BinaryIO, Any
 from operator import attrgetter
@@ -140,6 +141,8 @@ def get_shape_content(shape, **kwargs):
                     alt_text = re.sub(r"[\r\n\[\]]", " ", alt_text)
                     alt_text = re.sub(r"\s+", " ", alt_text).strip()
 
+                    output_dir = kwargs.get("output_dir")
+
                     # If keep_data_uris is True, use base64 encoding for images
                     if kwargs.get("keep_data_uris", False):
                         blob = shape.image.blob
@@ -148,17 +151,28 @@ def get_shape_content(shape, **kwargs):
                         md_content += f"\n![{alt_text}](data:{content_type};base64,{b64_string})\n"
                     else:
                         #save image to disk to reference
+                        blob = shape.image.blob
                         content_type = shape.image.content_type or "image/png"
-                        ext = content_type.split("/")[-1]
-                        filename = re.sub(r"\W", "", shape.name) + "." + ext
+                        ext_map = {"jpeg": "jpg", "svg+xml": "svg"}
+                        raw_ext = content_type.split("/")[-1]
+                        ext = ext_map.get(raw_ext, raw_ext)
+
+                        # Suffix the name with a short hash of the image bytes to avoid file name collisions
+                        suffix = hashlib.md5(blob).hexdigest()[:8]
+                        safe_name = re.sub(r"\W", "", shape.name) if shape.name else "image"
+                        filename = f"{safe_name}_{suffix}.{ext}"
 
-                        #write image to same directory as output
                         output_dir = kwargs.get("output_dir", ".")
                         image_path = os.path.join(output_dir, filename)
-                        with open(image_path, "wb") as img_file:
-                            img_file.write(shape.image.blob)
+
+                        try:
+                            with open(image_path, "wb") as img_file:
+                                img_file.write(blob)
+                        except OSError as e:
+                            raise OSError(f"Failed to write image to '{image_path}'.") from e
 
                         md_content += "\n![" + alt_text + "](" + filename + ")\n"
+                        
 
                 # Tables
                 if self._is_table(shape):
diff --git a/packages/markitdown/tests/_test_vectors.py b/packages/markitdown/tests/_test_vectors.py
index 939d71ef3..82b824cbb 100644
--- a/packages/markitdown/tests/_test_vectors.py
+++ b/packages/markitdown/tests/_test_vectors.py
@@ -42,7 +42,10 @@ class FileTestVector(object):
             "#Test for converting .doc files to MD format",
             "Let’s test it!",
         ],
-        must_not_include=[],
+        must_not_include=[
+            "d0cf11e0",
+            "\x00"
+        ],
     ),
     FileTestVector(
         filename="test.xlsx",
@@ -81,7 +84,7 @@ class FileTestVector(object):
             "AutoGen: Enabling Next-Gen LLM Applications via Multi-Agent Conversation",
             "a3f6004b-6f4f-4ea8-bee3-3741f4dc385f",  # chart title
             "2003",  # chart value
-            "![This phrase of the caption is Human-written.](Picture4.jpeg)",
+            "![This phrase of the caption is Human-written.](Picture4_",
         ],
         must_not_include=["data:image/jpeg;base64,/9j/4AAQSkZJRgABAQE"],
     ),