Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 11 additions & 3 deletions packages/markitdown/src/markitdown/converters/_pptx_converter.py
Original file line number Diff line number Diff line change
Expand Up @@ -161,10 +161,16 @@ def get_shape_content(shape, **kwargs):

# Text areas
elif shape.has_text_frame:
# python-pptx can yield ``None`` for ``shape.text`` when a
# text frame has a run with no ``<a:t>`` child or when a
# third-party deck stores chart/SmartArt titles as
# ``None``. Coerce to "" so a single bad shape doesn't
# fail the entire conversion (#1808).
shape_text = shape.text or ""
if shape == title:
md_content += "# " + shape.text.lstrip() + "\n"
md_content += "# " + shape_text.lstrip() + "\n"
else:
md_content += shape.text + "\n"
md_content += shape_text + "\n"

# Group Shapes
if shape.shape_type == pptx.enum.shapes.MSO_SHAPE_TYPE.GROUP:
Expand Down Expand Up @@ -194,7 +200,9 @@ def get_shape_content(shape, **kwargs):
md_content += "\n\n### Notes:\n"
notes_frame = slide.notes_slide.notes_text_frame
if notes_frame is not None:
md_content += notes_frame.text
# See note above re: ``shape.text`` returning ``None``;
# the same coercion applies to notes text frames (#1808).
md_content += notes_frame.text or ""
md_content = md_content.strip()

return DocumentConverterResult(markdown=md_content.strip())
Expand Down
113 changes: 113 additions & 0 deletions packages/markitdown/tests/test_pptx_none_text.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,113 @@
"""Regression test for #1808.

``PptxConverter.convert`` performed unguarded ``+`` concatenation on
``shape.text`` and ``notes_frame.text``. python-pptx can return ``None``
for either when a text frame has a run with no ``<a:t>`` child, or when
a third-party deck represents a chart/SmartArt title text as ``None``.
A single such shape used to fail the entire .pptx conversion with::

PptxConverter threw TypeError with message:
can only concatenate str (not "NoneType") to str

The fix coerces ``shape.text`` / ``notes_frame.text`` to ``""`` before
concatenation. This pins both call sites.
"""

from __future__ import annotations

from io import BytesIO

import pytest

# Skip every test in this module when python-pptx cannot be imported;
# otherwise bind the imported module to ``pptx`` for use by the tests below.
pptx = pytest.importorskip("pptx") # markitdown[pptx] extra

from markitdown import MarkItDown


def _build_minimal_pptx_with_text(text: str) -> BytesIO:
    """Create an in-memory, single-slide deck whose body placeholder holds *text*.

    The slide is created from layout index 1 and its title is set to
    ``"Hello"``. The returned buffer is rewound to offset 0 so it can be
    handed straight to ``MarkItDown.convert_stream``.
    """
    presentation = pptx.Presentation()
    layout = presentation.slide_layouts[1]  # "Title and Content"
    new_slide = presentation.slides.add_slide(layout)

    new_slide.shapes.title.text = "Hello"
    new_slide.placeholders[1].text = text

    stream = BytesIO()
    presentation.save(stream)
    stream.seek(0)
    return stream


def test_pptx_with_none_shape_text_does_not_crash(monkeypatch):
    """Regression for #1808: a shape whose ``.text`` returns ``None`` must
    not fail the whole conversion.

    ``TextFrame.text`` is replaced class-wide with a getter that returns
    ``None`` for the body text ``"World"`` and the real value otherwise.
    The test asserts both that the title survives and that the body text is
    absent, so it cannot pass vacuously if the patch stops taking effect.
    """
    # Build the deck BEFORE monkey-patching — replacing ``text`` with a
    # getter-only property breaks the ``placeholder.text = ...`` setter.
    stream = _build_minimal_pptx_with_text("World")

    text_frame_cls = pptx.text.text.TextFrame
    original_text = text_frame_cls.text.fget

    def _none_for_world(self):  # type: ignore[no-untyped-def]
        value = original_text(self)
        return None if value == "World" else value

    monkeypatch.setattr(text_frame_cls, "text", property(_none_for_world))

    result = MarkItDown().convert_stream(stream, file_extension=".pptx")

    # Conversion completes without raising, and the title row is preserved
    # because its text does not match the value mapped to None.
    assert "# Hello" in result.text_content
    # The body shape's text resolved to None and was coerced to "". If
    # "World" shows up, the patch never reached the converter and this test
    # would not be guarding the regression at all.
    assert "World" not in result.text_content


def test_pptx_with_none_notes_text_does_not_crash(monkeypatch):
    """Regression for #1808: ``notes_frame.text`` returning ``None`` must
    not fail conversion either.

    The deck is saved with a real speaker note, then ``TextFrame.text`` is
    replaced class-wide with a getter that returns ``None`` for that note's
    text. The assertions check that the notes branch was reached and that
    the ``None`` value was treated as empty.
    """
    # Build and serialize the deck BEFORE monkey-patching: a getter-only
    # property would break the ``notes.text = ...`` assignment below.
    prs = pptx.Presentation()
    slide = prs.slides.add_slide(prs.slide_layouts[1])
    slide.shapes.title.text = "Hello"
    slide.notes_slide.notes_text_frame.text = "speaker note"
    buf = BytesIO()
    prs.save(buf)
    buf.seek(0)

    # The patch applies to every TextFrame; only frames whose text equals
    # the speaker note are mapped to None, so the title is unaffected.
    text_frame_cls = pptx.text.text.TextFrame
    original_text = text_frame_cls.text.fget

    def _none_for_speaker_note(self):  # type: ignore[no-untyped-def]
        value = original_text(self)
        return None if value == "speaker note" else value

    monkeypatch.setattr(
        text_frame_cls, "text", property(_none_for_speaker_note)
    )

    result = MarkItDown().convert_stream(buf, file_extension=".pptx")

    # Conversion completes; title still appears.
    assert "# Hello" in result.text_content
    # The notes section header proves the notes branch actually ran ...
    assert "### Notes:" in result.text_content
    # ... and the missing note text proves the accessor returned None and
    # was coerced to "" rather than the patch being silently bypassed.
    assert "speaker note" not in result.text_content


def test_pptx_normal_text_still_converts():
    """Guard the happy path: an unpatched deck with ordinary string text
    must still yield both its title heading and its body text."""
    deck = _build_minimal_pptx_with_text("body content")
    converted = MarkItDown().convert_stream(deck, file_extension=".pptx")
    markdown = converted.text_content
    for expected in ("# Hello", "body content"):
        assert expected in markdown