diff --git a/README.md b/README.md index aa2f58bb8..5b5da303b 100644 --- a/README.md +++ b/README.md @@ -341,9 +341,9 @@ You can help by looking at issues or helping review PRs. Any issue or PR is welc ### Security Considerations -MarkItDown performs I/O with the privileges of the current process. Like `open()` or `requests.get()`, it will access resources that the process itself can access. +MarkItDown performs I/O with the privileges of the current process. Like `open()` or `requests.get()`, it will access resources that the process itself can access. -**Sanitize your inputs:** Do not pass untrusted input directly to MarkItDown. If any part of the input may be controlled by an untrusted user or system, such as in hosted or server-side applications, it must be validated and restricted before calling MarkItDown. Depending on your environment, this may include restricting file paths, limiting URI schemes and network destinations, and blocking access to private, loopback, link-local, or metadata-service addresses. +**Sanitize your inputs:** Do not pass untrusted input directly to MarkItDown. If any part of the input may be controlled by an untrusted user or system, such as in hosted or server-side applications, it must be validated and restricted before calling MarkItDown. Depending on your environment, this may include restricting file paths, limiting URI schemes and network destinations, and blocking access to private, loopback, link-local, or metadata-service addresses. **Call only the conversion method you need:** Prefer the narrowest conversion API that fits your use case. MarkItDown's `convert()` method is intentionally permissive and can handle local files, remote URIs, and byte streams. If your application only needs to read local files, call `convert_local()` instead. If you need more control over URI fetching, call `requests.get()` yourself and pass the response object to `convert_response()`. 
For maximum control, open a stream to the input you want converted and call `convert_stream()`. diff --git a/packages/markitdown/src/markitdown/converters/_html_converter.py b/packages/markitdown/src/markitdown/converters/_html_converter.py index 029b27f57..6e3517b72 100644 --- a/packages/markitdown/src/markitdown/converters/_html_converter.py +++ b/packages/markitdown/src/markitdown/converters/_html_converter.py @@ -51,7 +51,13 @@ def convert( # Parse the stream encoding = "utf-8" if stream_info.charset is None else stream_info.charset - soup = BeautifulSoup(file_stream, "html.parser", from_encoding=encoding) + try: + soup = BeautifulSoup(file_stream, "html.parser", from_encoding=encoding) + except (UnicodeDecodeError, LookupError): + # If the declared (or default) charset fails, let BeautifulSoup + # auto-detect the encoding from the raw bytes. + file_stream.seek(0) + soup = BeautifulSoup(file_stream, "html.parser") # Remove javascript and style blocks for script in soup(["script", "style"]): diff --git a/packages/markitdown/src/markitdown/converters/_rtf_converter.py b/packages/markitdown/src/markitdown/converters/_rtf_converter.py new file mode 100644 index 000000000..bf05904c6 --- /dev/null +++ b/packages/markitdown/src/markitdown/converters/_rtf_converter.py @@ -0,0 +1,357 @@ +"""RTF (Rich Text Format) to Markdown converter. + +Parses RTF control words and groups to extract styled text, tables, and +Unicode escapes, producing Markdown output. +""" + +import re +from typing import Any, BinaryIO, Optional + +from .._base_converter import DocumentConverter, DocumentConverterResult +from .._stream_info import StreamInfo + +ACCEPTED_MIME_TYPE_PREFIXES = [ + "text/rtf", + "application/rtf", +] + +ACCEPTED_FILE_EXTENSIONS = [ + ".rtf", +] + +# Mapping of RTF font-charset identifiers to Python codec names. 
_CHARSET_MAP = {
    0: "cp1252",  # ANSI
    1: "cp1252",  # Default
    2: "cp1252",  # Symbol (Python has no "symbol" codec; decode as ANSI)
    77: "mac-roman",
    128: "cp932",  # Shift-JIS
    129: "cp949",  # Hangul
    130: "johab",
    134: "gbk",  # GB2312 (decoded with its cp936 superset)
    136: "cp950",  # Big5
    161: "cp1253",  # Greek
    162: "cp1254",  # Turkish
    163: "cp1258",  # Vietnamese
    177: "cp1255",  # Hebrew
    178: "cp1256",  # Arabic
    186: "cp1257",  # Baltic
    204: "cp1251",  # Russian
    222: "cp874",  # Thai
    238: "cp1250",  # Eastern European
    255: "cp437",  # OEM
}

# ``\ansicpgN`` code pages whose Python codec name is not simply ``cpN``.
_CODE_PAGE_ALIASES = {
    10000: "mac-roman",
    65001: "utf-8",
}

# Control words that stand for exactly one literal character.
_SPECIAL_CHARS = {
    "tab": "\t",
    "emdash": "\u2014",
    "endash": "\u2013",
    "emspace": "\u2003",
    "enspace": "\u2002",
    "bullet": "\u2022",
    "lquote": "\u2018",
    "rquote": "\u2019",
    "ldblquote": "\u201c",
    "rdblquote": "\u201d",
}

# Control symbols that stand for exactly one literal character.
_SYMBOL_CHARS = {
    "\\\\": "\\",
    "\\{": "{",
    "\\}": "}",
    "\\~": "\u00a0",  # non-breaking space
    "\\_": "\u2011",  # non-breaking hyphen
}

# Control words that end a paragraph or a line, mapped to the text they emit.
_BREAKS = {
    "par": "\n\n",
    "sect": "\n\n",
    "page": "\n\n",
    "line": "\n",
}


class _RTFToken:
    """Represents a single RTF token produced by the lexer.

    ``kind`` is one of the class-level constants below.  ``value`` holds the
    raw source text of the token (empty for group delimiters).
    """

    __slots__ = ("kind", "value")

    # Token kinds
    GROUP_START = "group_start"
    GROUP_END = "group_end"
    CONTROL_WORD = "control_word"
    CONTROL_SYMBOL = "control_symbol"
    TEXT = "text"

    def __init__(self, kind: str, value: str = ""):
        self.kind = kind
        self.value = value

    def __repr__(self) -> str:
        return f"_RTFToken({self.kind!r}, {self.value!r})"


# Pre-compiled patterns used by the lexer.
_CTRL_WORD_RE = re.compile(r"\\([a-zA-Z]+)(-?\d+)? ?")
_CTRL_SYMBOL_RE = re.compile(r"\\([^a-zA-Z\r\n])")
_HEX_ESCAPE_RE = re.compile(r"\\'([0-9a-fA-F]{2})")
_UNICODE_RE = re.compile(r"\\u(-?\d+)[?]?")
_TEXT_RE = re.compile(r"[^{}\\\r\n]+")


def _tokenize(rtf: str):
    """Yield ``_RTFToken`` objects for the RTF source string *rtf*.

    Bare CR/LF characters are formatting noise in RTF and are dropped.  A
    backslash immediately followed by a line break is reported as a ``\\par``
    control word.  The payload of ``\\binN`` is skipped without being
    tokenized, because it is raw data that may contain braces or backslashes.
    """
    pos = 0
    length = len(rtf)
    while pos < length:
        ch = rtf[pos]
        if ch == "{":
            yield _RTFToken(_RTFToken.GROUP_START)
            pos += 1
        elif ch == "}":
            yield _RTFToken(_RTFToken.GROUP_END)
            pos += 1
        elif ch == "\\":
            # Hex escape \'XX must be tried before the generic control symbol.
            m = _HEX_ESCAPE_RE.match(rtf, pos)
            if m:
                yield _RTFToken(_RTFToken.CONTROL_SYMBOL, m.group(0))
                pos = m.end()
                continue
            # Control word \word[-]N, including \uN (its fallback characters
            # are handled by the parser, which knows the current \uc value).
            m = _CTRL_WORD_RE.match(rtf, pos)
            if m:
                yield _RTFToken(_RTFToken.CONTROL_WORD, m.group(0))
                pos = m.end()
                if m.group(1) == "bin" and m.group(2) is not None:
                    pos += max(int(m.group(2)), 0)
                continue
            m = _CTRL_SYMBOL_RE.match(rtf, pos)
            if m:
                yield _RTFToken(_RTFToken.CONTROL_SYMBOL, m.group(0))
                pos = m.end()
                continue
            if pos + 1 < length:
                # Only CR or LF can follow here: backslash-newline means \par.
                yield _RTFToken(_RTFToken.CONTROL_WORD, "\\par")
                pos += 3 if rtf.startswith("\r\n", pos + 1) else 2
            else:
                # Lone trailing backslash (malformed) - skip it.
                pos += 1
        elif ch in "\r\n":
            pos += 1
        else:
            m = _TEXT_RE.match(rtf, pos)
            yield _RTFToken(_RTFToken.TEXT, m.group(0))
            pos = m.end()


def _parse_control_word(token_value: str) -> tuple[str, Optional[int]]:
    """Return ``(word, param)`` from a control-word token value.

    *param* is ``None`` when the control word has no numeric parameter.
    """
    m = _CTRL_WORD_RE.match(token_value)
    if not m:
        return token_value.lstrip("\\").rstrip(), None
    word = m.group(1)
    param = int(m.group(2)) if m.group(2) is not None else None
    return word, param


def _decode_bytes(data: bytes, charset: str) -> str:
    """Decode *data* with *charset* without ever raising.

    Undecodable bytes become U+FFFD.  An unknown codec name falls back to
    cp1252.
    """
    try:
        return data.decode(charset)
    except UnicodeDecodeError:
        return data.decode(charset, errors="replace")
    except LookupError:
        return data.decode("cp1252", errors="replace")


def _decode_hex_escape(token_value: str, charset: str = "cp1252") -> str:
    """Decode a single RTF hex escape (``\\'XX``) using *charset*.

    Returns an empty string when *token_value* contains no hex escape.
    """
    m = _HEX_ESCAPE_RE.search(token_value)
    if not m:
        return ""
    return _decode_bytes(bytes([int(m.group(1), 16)]), charset)


def _char_from_code(code: int) -> str:
    """Return the character for *code*, or U+FFFD if it is not a valid scalar."""
    if 0 <= code <= 0x10FFFF and not 0xD800 <= code <= 0xDFFF:
        return chr(code)
    return "\ufffd"


def _decode_unicode_escape(token_value: str) -> str:
    """Decode an RTF Unicode escape (``\\uN``) to a Python character.

    Negative values are signed 16-bit code units and are wrapped by adding
    65536.  A value that is a lone UTF-16 surrogate, or out of range, yields
    U+FFFD.
    """
    m = _UNICODE_RE.search(token_value)
    if not m:
        return ""
    code = int(m.group(1))
    if code < 0:
        code += 0x10000
    return _char_from_code(code)


class _RTFState:
    """Formatting state that is saved and restored at group boundaries.

    Table contents are deliberately not kept here: rows routinely span several
    groups, so they are accumulated by ``_MarkdownWriter`` instead.
    """

    __slots__ = (
        "bold",
        "italic",
        "underline",
        "charset",
        "skip_group",
        "in_fonttbl",
        "uc",
    )

    def __init__(self):
        self.bold = False
        self.italic = False
        self.underline = False  # tracked only; Markdown has no underline
        self.charset: str = "cp1252"
        self.skip_group = False
        self.in_fonttbl = False
        self.uc = 1  # fallback characters to skip after each \uN

    def copy(self) -> "_RTFState":
        new = _RTFState()
        new.bold = self.bold
        new.italic = self.italic
        new.underline = self.underline
        new.charset = self.charset
        new.skip_group = self.skip_group
        new.in_fonttbl = self.in_fonttbl
        new.uc = self.uc
        return new


# Destination control words whose group contents should be skipped entirely.
# "fonttbl" is not listed because the font table is parsed for charsets, and
# "field" is not listed because its \fldrslt child holds the visible text.
_SKIP_DESTINATIONS = frozenset([
    "colortbl", "stylesheet", "info", "pict",
    "header", "footer", "headerl", "headerr", "headerf",
    "footerl", "footerr", "footerf", "footnote",
    "fldinst", "xe", "tc", "rxe",
    "filetbl", "listtable", "listoverridetable", "revtbl", "rsidtbl",
    "generator",
])


def _style_marker(state: _RTFState) -> str:
    """Return the Markdown emphasis marker for *state* (``""`` when plain)."""
    if state.bold and state.italic:
        return "***"
    if state.bold:
        return "**"
    if state.italic:
        return "*"
    return ""


def _wrap_inline(text: str, marker: str) -> str:
    """Wrap *text* in *marker*, keeping surrounding whitespace outside it."""
    core = text.strip()
    if not marker or not core:
        return text
    lead = text[: len(text) - len(text.lstrip())]
    trail = text[len(text.rstrip()):]
    return f"{lead}{marker}{core}{marker}{trail}"


class _RunBuffer:
    """Accumulates text runs, merging neighbours that share an emphasis marker."""

    __slots__ = ("_runs",)

    def __init__(self):
        # Each entry is [marker, [text, text, ...]].
        self._runs: list[list] = []

    def add(self, text: str, marker: str = "") -> None:
        """Append *text*; extend the previous run when the marker matches."""
        if not text:
            return
        if self._runs and self._runs[-1][0] == marker:
            self._runs[-1][1].append(text)
        else:
            self._runs.append([marker, [text]])

    def render(self) -> str:
        """Return all runs as one string with emphasis markers applied."""
        return "".join(
            _wrap_inline("".join(parts), marker) for marker, parts in self._runs
        )


def _format_cell(text: str) -> str:
    """Normalise *text* for use inside a single-line Markdown table cell."""
    return re.sub(r"\s+", " ", text).strip().replace("|", "\\|")


class _MarkdownWriter:
    """Collects converted text and emits paragraphs and tables in document order."""

    def __init__(self):
        self._body = _RunBuffer()
        self._cell = _RunBuffer()
        self._row: list[str] = []
        self._rows: list[list[str]] = []
        self._in_table = False

    def write(self, text: str, marker: str = "") -> None:
        """Add inline *text*, routed to the open table cell when inside a row."""
        if self._in_table:
            self._cell.add(text, marker)
            return
        if self._rows:
            if not text.strip(" \t"):
                # Whitespace between two rows must not split the table.
                return
            self._flush_table()
        self._body.add(text, marker)

    def line_break(self, text: str) -> None:
        """Add a paragraph or line break (a plain space inside a table cell)."""
        if self._in_table:
            self._cell.add(" ")
            return
        if self._rows:
            self._flush_table()
        self._body.add(text)

    def begin_row(self) -> None:
        """Mark following text as table content (``\\trowd`` / ``\\intbl``).

        Cells already collected are kept, since ``\\trowd`` may be repeated
        just before ``\\row``.
        """
        self._in_table = True

    def end_cell(self) -> None:
        """Close the current cell (``\\cell``)."""
        self._in_table = True
        self._row.append(_format_cell(self._cell.render()))
        self._cell = _RunBuffer()

    def end_row(self) -> None:
        """Close the current row (``\\row``), keeping any unterminated cell text."""
        leftover = _format_cell(self._cell.render())
        if leftover:
            self._row.append(leftover)
        self._cell = _RunBuffer()
        if self._row:
            self._rows.append(self._row)
            self._row = []
        self._in_table = False

    def _flush_table(self) -> None:
        """Render the pending rows as a Markdown table at the current position."""
        width = max(len(row) for row in self._rows)
        lines: list[str] = []
        for index, row in enumerate(self._rows):
            cells = row + [""] * (width - len(row))
            lines.append("| " + " | ".join(cells) + " |")
            if index == 0:
                lines.append("| " + " | ".join(["---"] * width) + " |")
        self._rows = []
        self._body.add("\n\n" + "\n".join(lines) + "\n\n")

    def finish(self) -> str:
        """Return the final Markdown with blank-line runs collapsed."""
        self.end_row()
        if self._rows:
            self._flush_table()
        markdown = re.sub(r"\n{3,}", "\n\n", self._body.render())
        return markdown.strip()


def _rtf_to_markdown(rtf: str) -> str:
    """Convert raw RTF text to a Markdown string.

    Bold and italic become Markdown emphasis; underline is tracked but not
    rendered.  ``\\par`` produces a blank line and ``\\line`` a newline.  Table
    rows (``\\trowd``/``\\intbl`` ... ``\\cell`` ... ``\\row``) become a
    Markdown table placed where they occur, with the first row as the header.

    Hex escapes are decoded with the code page selected by ``\\ansicpg`` or
    by the current font's ``\\fcharset``; consecutive escapes are decoded
    together so double-byte encodings work.  ``\\uN`` escapes honour ``\\ucN``
    when skipping their fallback characters, and UTF-16 surrogate pairs are
    combined.  Groups introduced by ``\\*`` or by a word in
    ``_SKIP_DESTINATIONS`` are ignored.
    """
    writer = _MarkdownWriter()
    state = _RTFState()
    state_stack: list[_RTFState] = []
    default_charset = state.charset
    font_charsets: dict[int, str] = {}
    table_font = 0  # font number being defined while inside \fonttbl
    pending_bytes = bytearray()  # hex-escaped bytes awaiting decoding
    high_surrogate: Optional[int] = None  # \uN high surrogate awaiting its pair
    fallback_to_skip = 0  # fallback characters still owed to the last \uN

    for token in _tokenize(rtf):
        kind = token.kind
        value = token.value
        hex_match = (
            _HEX_ESCAPE_RE.match(value)
            if kind == _RTFToken.CONTROL_SYMBOL
            else None
        )

        # Decode buffered bytes before anything can change charset or style.
        if pending_bytes and hex_match is None:
            writer.write(
                _decode_bytes(bytes(pending_bytes), state.charset),
                _style_marker(state),
            )
            pending_bytes.clear()

        if kind == _RTFToken.GROUP_START:
            fallback_to_skip = 0
            state_stack.append(state.copy())
            continue
        if kind == _RTFToken.GROUP_END:
            fallback_to_skip = 0
            if state_stack:
                state = state_stack.pop()
            continue
        if state.skip_group:
            continue

        word: Optional[str] = None
        param: Optional[int] = None
        if kind == _RTFToken.CONTROL_WORD:
            word, param = _parse_control_word(value)

        if state.in_fonttbl:
            # Inside the font table only the font -> charset mapping matters.
            if word == "f" and param is not None:
                table_font = param
            elif word == "fcharset" and param != 1 and param in _CHARSET_MAP:
                # Charset 1 means "default": leave the font on the document
                # code page.
                font_charsets[table_font] = _CHARSET_MAP[param]
            elif value == "\\*":
                state.skip_group = True
            continue

        if fallback_to_skip:
            if kind == _RTFToken.TEXT:
                if len(value) <= fallback_to_skip:
                    fallback_to_skip -= len(value)
                    continue
                value = value[fallback_to_skip:]
                fallback_to_skip = 0
            elif kind == _RTFToken.CONTROL_SYMBOL:
                # A hex escape or symbol counts as one fallback character.
                fallback_to_skip -= 1
                continue
            else:
                # Never swallow a control word; just stop skipping.
                fallback_to_skip = 0

        marker = _style_marker(state)

        if high_surrogate is not None and word != "u":
            # The high surrogate was never completed by a low surrogate.
            high_surrogate = None
            writer.write("\ufffd", marker)

        if kind == _RTFToken.TEXT:
            writer.write(value, marker)
            continue

        if kind == _RTFToken.CONTROL_SYMBOL:
            if hex_match is not None:
                pending_bytes.append(int(hex_match.group(1), 16))
            elif value == "\\*":
                # Ignorable destination: skip the rest of this group.
                state.skip_group = True
            elif value in _SYMBOL_CHARS:
                writer.write(_SYMBOL_CHARS[value], marker)
            # Other symbols (optional hyphen \-, malformed \') emit nothing.
            continue

        # Remaining tokens are control words.
        if word in _SKIP_DESTINATIONS:
            state.skip_group = True
        elif word == "fonttbl":
            state.in_fonttbl = True
        elif word == "u":
            if param is not None:
                code = param + 0x10000 if param < 0 else param
                if 0xD800 <= code <= 0xDBFF:
                    if high_surrogate is not None:
                        writer.write("\ufffd", marker)
                    high_surrogate = code
                elif 0xDC00 <= code <= 0xDFFF:
                    if high_surrogate is None:
                        writer.write("\ufffd", marker)
                    else:
                        combined = (
                            0x10000
                            + ((high_surrogate - 0xD800) << 10)
                            + (code - 0xDC00)
                        )
                        high_surrogate = None
                        writer.write(chr(combined), marker)
                else:
                    if high_surrogate is not None:
                        high_surrogate = None
                        writer.write("\ufffd", marker)
                    writer.write(_char_from_code(code), marker)
                fallback_to_skip = state.uc
        elif word == "uc":
            if param is not None:
                state.uc = max(param, 0)
        elif word == "b":
            state.bold = param != 0
        elif word == "i":
            state.italic = param != 0
        elif word == "ul":
            state.underline = param != 0
        elif word == "ulnone":
            state.underline = False
        elif word == "plain":
            state.bold = False
            state.italic = False
            state.underline = False
        elif word in _BREAKS:
            writer.line_break(_BREAKS[word])
        elif word in _SPECIAL_CHARS:
            writer.write(_SPECIAL_CHARS[word], marker)
        elif word == "trowd" or word == "intbl":
            writer.begin_row()
        elif word == "cell":
            writer.end_cell()
        elif word == "row":
            writer.end_row()
        elif word == "nestcell" or word == "nestrow":
            # Nested tables are flattened; keep their cells apart.
            writer.write(" ", marker)
        elif word == "ansicpg":
            if param is not None:
                default_charset = _CODE_PAGE_ALIASES.get(param, f"cp{param}")
                state.charset = default_charset
        elif word == "mac":
            default_charset = state.charset = "mac-roman"
        elif word == "pc":
            default_charset = state.charset = "cp437"
        elif word == "pca":
            default_charset = state.charset = "cp850"
        elif word == "f":
            if param is not None:
                state.charset = font_charsets.get(param, default_charset)
        elif word == "fcharset":
            if param is not None and param in _CHARSET_MAP:
                state.charset = _CHARSET_MAP[param]

    if pending_bytes:
        writer.write(
            _decode_bytes(bytes(pending_bytes), state.charset),
            _style_marker(state),
        )
    if high_surrogate is not None:
        writer.write("\ufffd", _style_marker(state))

    return writer.finish()
"\n\n", md) + return md.strip() + + +class RtfConverter(DocumentConverter): + """Convert RTF documents to Markdown.""" + + def accepts( + self, + file_stream: BinaryIO, + stream_info: StreamInfo, + **kwargs: Any, + ) -> bool: + mimetype = (stream_info.mimetype or "").lower() + extension = (stream_info.extension or "").lower() + + if extension in ACCEPTED_FILE_EXTENSIONS: + return True + + for prefix in ACCEPTED_MIME_TYPE_PREFIXES: + if mimetype.startswith(prefix): + return True + + return False + + def convert( + self, + file_stream: BinaryIO, + stream_info: StreamInfo, + **kwargs: Any, + ) -> DocumentConverterResult: + raw_bytes = file_stream.read() + + # Try UTF-8 first, then fall back to latin-1 which never fails. + charset = stream_info.charset + if charset: + try: + rtf_text = raw_bytes.decode(charset) + except (UnicodeDecodeError, LookupError): + rtf_text = raw_bytes.decode("latin-1") + else: + try: + rtf_text = raw_bytes.decode("utf-8") + except UnicodeDecodeError: + rtf_text = raw_bytes.decode("latin-1") + + markdown = _rtf_to_markdown(rtf_text) + return DocumentConverterResult(markdown=markdown) diff --git a/packages/markitdown/tests/test_rtf_converter.py b/packages/markitdown/tests/test_rtf_converter.py new file mode 100644 index 000000000..98854f73c --- /dev/null +++ b/packages/markitdown/tests/test_rtf_converter.py @@ -0,0 +1,172 @@ +"""Tests for the RTF converter.""" + +import io +import pytest + +from markitdown.converters._rtf_converter import RtfConverter, _rtf_to_markdown +from markitdown._base_converter import DocumentConverterResult +from markitdown._stream_info import StreamInfo + + +@pytest.fixture +def converter(): + return RtfConverter() + + +def _make_stream_info(extension=".rtf", mimetype="text/rtf", charset=None): + return StreamInfo(extension=extension, mimetype=mimetype, charset=charset) + + +# ---- Basic acceptance tests ---- + +class TestRtfConverterAccepts: + def test_accepts_rtf_extension(self, converter): + stream = io.BytesIO(b"") 
+ info = _make_stream_info(extension=".rtf") + assert converter.accepts(stream, info) is True + + def test_accepts_rtf_mimetype(self, converter): + stream = io.BytesIO(b"") + info = _make_stream_info(extension="", mimetype="text/rtf") + assert converter.accepts(stream, info) is True + + def test_accepts_application_rtf_mimetype(self, converter): + stream = io.BytesIO(b"") + info = _make_stream_info(extension="", mimetype="application/rtf") + assert converter.accepts(stream, info) is True + + def test_rejects_txt_extension(self, converter): + stream = io.BytesIO(b"") + info = _make_stream_info(extension=".txt", mimetype="") + assert converter.accepts(stream, info) is False + + def test_rejects_html_mimetype(self, converter): + stream = io.BytesIO(b"") + info = _make_stream_info(extension="", mimetype="text/html") + assert converter.accepts(stream, info) is False + + +# ---- Conversion tests ---- + +MINIMAL_RTF = rb"{\rtf1 Hello World}" +BOLD_RTF = rb"{\rtf1 {\b Bold Text}}" +ITALIC_RTF = rb"{\rtf1 {\i Italic Text}}" +UNDERLINE_RTF = rb"{\rtf1 {\ul Underline Text}}" +BOLD_ITALIC_RTF = rb"{\rtf1 {\b\i Bold Italic}}" +PARA_RTF = rb"{\rtf1 First\par Second}" +UNICODE_RTF = rb"{\rtf1 Euro sign: \u8364?}" +HEX_RTF = rb"{\rtf1 caf\'e9}" +TABLE_RTF = rb"{\rtf1 \trowd Cell1\cell Cell2\cell\row}" + + +class TestRtfConversion: + def test_minimal_rtf(self, converter): + result = converter.convert(io.BytesIO(MINIMAL_RTF), _make_stream_info()) + assert isinstance(result, DocumentConverterResult) + assert "Hello World" in result.markdown + + def test_bold_text(self, converter): + result = converter.convert(io.BytesIO(BOLD_RTF), _make_stream_info()) + assert "**Bold Text**" in result.markdown + + def test_italic_text(self, converter): + result = converter.convert(io.BytesIO(ITALIC_RTF), _make_stream_info()) + assert "*Italic Text*" in result.markdown + + def test_underline_text(self, converter): + result = converter.convert(io.BytesIO(UNDERLINE_RTF), _make_stream_info()) + assert 
"Underline Text" in result.markdown + + def test_bold_italic_text(self, converter): + result = converter.convert(io.BytesIO(BOLD_ITALIC_RTF), _make_stream_info()) + assert "***Bold Italic***" in result.markdown + + def test_paragraph_break(self, converter): + result = converter.convert(io.BytesIO(PARA_RTF), _make_stream_info()) + assert "First" in result.markdown + assert "Second" in result.markdown + + def test_unicode_escape(self, converter): + result = converter.convert(io.BytesIO(UNICODE_RTF), _make_stream_info()) + assert "\u20ac" in result.markdown # Euro sign € + + def test_hex_escape(self, converter): + result = converter.convert(io.BytesIO(HEX_RTF), _make_stream_info()) + assert "café" in result.markdown + + def test_table_conversion(self, converter): + result = converter.convert(io.BytesIO(TABLE_RTF), _make_stream_info()) + assert "Cell1" in result.markdown + assert "Cell2" in result.markdown + assert "|" in result.markdown + + def test_empty_rtf(self, converter): + result = converter.convert(io.BytesIO(rb"{\rtf1 }"), _make_stream_info()) + assert result.markdown == "" + + def test_skip_fonttbl_group(self, converter): + rtf = rb"{\rtf1 {\fonttbl{\f0 Times New Roman;}}Hello}" + result = converter.convert(io.BytesIO(rtf), _make_stream_info()) + # Font table content should not appear + assert "Times New Roman" not in result.markdown + assert "Hello" in result.markdown + + def test_skip_colortbl_group(self, converter): + rtf = rb"{\rtf1 {\colortbl;\red255\green0\blue0;}Colored text}" + result = converter.convert(io.BytesIO(rtf), _make_stream_info()) + assert "red255" not in result.markdown + assert "Colored text" in result.markdown + + def test_latin1_fallback_encoding(self, converter): + # Create RTF with bytes that are valid latin-1 but not valid UTF-8 + rtf = b"{\\rtf1 caf\\'\xe9}" # é in latin-1 + result = converter.convert(io.BytesIO(rtf), _make_stream_info()) + assert "caf" in result.markdown + + def test_charset_specified_in_stream_info(self, 
converter): + info = _make_stream_info(charset="latin-1") + result = converter.convert(io.BytesIO(MINIMAL_RTF), info) + assert "Hello World" in result.markdown + + def test_nested_groups(self, converter): + rtf = rb"{\rtf1 {Outer {Inner text} Outer2}}" + result = converter.convert(io.BytesIO(rtf), _make_stream_info()) + assert "Outer" in result.markdown + assert "Inner text" in result.markdown + assert "Outer2" in result.markdown + + def test_nonbreaking_space(self, converter): + rtf = rb"{\rtf1 word\~word}" + result = converter.convert(io.BytesIO(rtf), _make_stream_info()) + assert "word\u00a0word" in result.markdown + + def test_tab_control_word(self, converter): + rtf = rb"{\rtf1 col1\tab col2}" + result = converter.convert(io.BytesIO(rtf), _make_stream_info()) + assert "col1\tcol2" in result.markdown + + +# ---- Low-level function tests ---- + +class TestRtfToMarkdown: + def test_plain_text(self): + assert _rtf_to_markdown("{\\rtf1 plain text}") == "plain text" + + def test_multiple_paragraphs(self): + md = _rtf_to_markdown("{\\rtf1 A\\par B\\par C}") + assert "A" in md + assert "B" in md + assert "C" in md + + def test_style_reset_with_plain(self): + md = _rtf_to_markdown("{\\rtf1 \\b bold\\plain normal}") + assert "**bold**" in md + assert "normal" in md + # "normal" should not be bold + assert "**normal**" not in md + + def test_negative_unicode_codepoint(self): + # RTF uses negative numbers for codepoints > 32767 + md = _rtf_to_markdown("{\\rtf1 \\u-10179?}") + # -10179 + 65536 = 55357 which is a surrogate, chr() may produce \ufffd + assert len(md) > 0