Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
22 changes: 21 additions & 1 deletion packages/markitdown/src/markitdown/converters/_pdf_converter.py
Original file line number Diff line number Diff line change
Expand Up @@ -278,7 +278,8 @@ def _extract_form_content_from_words(page: Any) -> str | None:
break

# If row uses 2+ of the established columns, it's a table row
info["is_table_row"] = len(aligned_columns) >= 2
info["aligned"] = len(aligned_columns)
info["is_table_row"] = info["aligned"] >= 2

# Find table regions (consecutive table rows)
table_regions: list[tuple[int, int]] = [] # (start_idx, end_idx)
Expand All @@ -298,6 +299,25 @@ def _extract_form_content_from_words(page: Any) -> str | None:
if len(row_info) > 0 and total_table_rows / len(row_info) < 0.2:
return None

# Number of columns that will be used when extracting cells
num_cols = len(global_columns)

# Extra guard: multi-column academic prose can look like a very wide,
# sparsely populated table. Real form/table pages in this converter tend to
# use a modest number of stable columns; prose pages instead produce many
# tentative columns with only a few populated per row. Reject those before
# formatting markdown tables.
table_row_fill_ratios = [
info.get("aligned", 0) / num_cols
for info in row_info
if info.get("is_table_row")
]
if num_cols > 10 and table_row_fill_ratios:
sorted_fill_ratios = sorted(table_row_fill_ratios)
median_fill_ratio = sorted_fill_ratios[len(sorted_fill_ratios) // 2]
if median_fill_ratio < 0.4:
return None

# Build output - collect table data first, then format with proper column widths
result_lines: list[str] = []
num_cols = len(global_columns)
Expand Down
77 changes: 77 additions & 0 deletions packages/markitdown/tests/test_pdf_prose_layout_detection.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,77 @@
def _make_fake_page(width: float, rows: list[list[dict]]):
class FakePage:
def __init__(self, width: float, rows: list[list[dict]]):
self.width = width
self._words = []
for i, row in enumerate(rows):
y_top = 50 + i * 12
for w in row:
self._words.append(
{
"text": w["text"],
"x0": float(w["x0"]),
"x1": float(w["x0"]) + float(w.get("w", 12)),
"top": float(y_top),
}
)

def extract_words(self, keep_blank_chars=True, x_tolerance=3, y_tolerance=3):
return list(self._words)

return FakePage(width=width, rows=rows)


def test_multicolumn_prose_falls_back_to_text_extraction():
"""Regression: wide multi-column prose should not be emitted as a table.

This page shape mimics the failure mode from issue #120: many tentative
columns are discovered across the page, but each row only uses a small
fraction of them. That is typical of two-column prose with staggered word
positions, not real form/table data.
"""

from markitdown.converters._pdf_converter import _extract_form_content_from_words

# Thirteen stable x positions across the page; each row only touches four of
# them, which should be treated as sparse multi-column prose rather than a
# dense table.
x_positions = [50, 105, 160, 215, 270, 325, 380, 435, 490, 545, 600, 655, 710]
rows = []
for i in range(10):
start = i
selected = x_positions[start : start + 4]
rows.append(
[
{"x0": selected[0], "text": f"alpha{i}"},
{"x0": selected[1], "text": f"beta{i}"},
{"x0": selected[2], "text": f"gamma{i}"},
{"x0": selected[3], "text": f"delta{i}"},
]
)

fake_page = _make_fake_page(width=760, rows=rows)

assert _extract_form_content_from_words(fake_page) is None


def test_wide_dense_table_is_still_extracted():
"""Wide but dense tables should survive the sparse-prose guard."""

from markitdown.converters._pdf_converter import _extract_form_content_from_words

x_positions = [50, 105, 160, 215, 270, 325, 380, 435, 490, 545, 600]
rows = []
for i in range(6):
rows.append(
[
{"x0": x, "text": f"c{col}_{i}"}
for col, x in enumerate(x_positions)
]
)

fake_page = _make_fake_page(width=660, rows=rows)
output = _extract_form_content_from_words(fake_page)

assert output is not None
assert "|" in output