From 83f30333be961a25157c01b609f191e0fc4f32ad Mon Sep 17 00:00:00 2001 From: guyua9 Date: Tue, 28 Apr 2026 18:55:53 +0800 Subject: [PATCH] fix: parse charset parameters case-insensitively --- .../markitdown/src/markitdown/_markitdown.py | 5 +++-- .../markitdown/src/markitdown/_uri_utils.py | 4 ++-- packages/markitdown/tests/test_module_misc.py | 18 ++++++++++++++++++ 3 files changed, 23 insertions(+), 4 deletions(-) diff --git a/packages/markitdown/src/markitdown/_markitdown.py b/packages/markitdown/src/markitdown/_markitdown.py index f342a614b..7e25e4928 100644 --- a/packages/markitdown/src/markitdown/_markitdown.py +++ b/packages/markitdown/src/markitdown/_markitdown.py @@ -480,8 +480,9 @@ def convert_response( parts = response.headers["content-type"].split(";") mimetype = parts.pop(0).strip() for part in parts: - if part.strip().startswith("charset="): - _charset = part.split("=")[1].strip() + key, sep, value = part.strip().partition("=") + if sep and key.strip().lower() == "charset": + _charset = value.strip() if len(_charset) > 0: charset = _charset diff --git a/packages/markitdown/src/markitdown/_uri_utils.py b/packages/markitdown/src/markitdown/_uri_utils.py index 603da63e9..e3c42808d 100644 --- a/packages/markitdown/src/markitdown/_uri_utils.py +++ b/packages/markitdown/src/markitdown/_uri_utils.py @@ -43,9 +43,9 @@ def parse_data_uri(uri: str) -> Tuple[str | None, Dict[str, str], bytes]: # Handle key=value pairs in the middle if "=" in part: key, value = part.split("=", 1) - attributes[key] = value + attributes[key.lower()] = value elif len(part) > 0: - attributes[part] = "" + attributes[part.lower()] = "" content = base64.b64decode(data) if is_base64 else unquote_to_bytes(data) diff --git a/packages/markitdown/tests/test_module_misc.py b/packages/markitdown/tests/test_module_misc.py index 4d62e4919..837c7ad3e 100644 --- a/packages/markitdown/tests/test_module_misc.py +++ b/packages/markitdown/tests/test_module_misc.py @@ -219,6 +219,13 @@ def test_data_uris() -> None: assert attributes["charset"] == "utf-8" assert data == b"Hello, World!" + data_uri = "data:text/plain;Charset=utf-8,Hello%2C%20World%21" + mime_type, attributes, data = parse_data_uri(data_uri) + assert mime_type == "text/plain" + assert len(attributes) == 1 + assert attributes["charset"] == "utf-8" + assert data == b"Hello, World!" + def test_file_uris() -> None: # Test file URI with an empty host @@ -252,6 +259,17 @@ def test_file_uris() -> None: assert path == "/path/to/file.txt" +def test_response_content_type_charset_is_case_insensitive() -> None: + response = MagicMock() + response.headers = {"content-type": "text/plain; Charset=UTF-8"} + response.url = "https://example.com/test.txt" + response.iter_content.return_value = [b"Hello, World!"] + + result = MarkItDown().convert_response(response) + + assert result.text_content == "Hello, World!" + + def test_docx_comments() -> None: # Test DOCX processing, with comments and setting style_map on init markitdown_with_style_map = MarkItDown(style_map="comment-reference => ")