diff --git a/packages/markitdown/src/markitdown/__main__.py b/packages/markitdown/src/markitdown/__main__.py
index 6085ad6bb..e527a0c6f 100644
--- a/packages/markitdown/src/markitdown/__main__.py
+++ b/packages/markitdown/src/markitdown/__main__.py
@@ -207,15 +207,22 @@ def _handle_output(args, result: DocumentConverterResult):
             f.write(result.markdown)
     else:
         # Handle stdout encoding errors more gracefully
-        print(
-            result.markdown.encode(sys.stdout.encoding, errors="replace").decode(
-                sys.stdout.encoding
-            )
-        )
+        _print_safely(result.markdown)
+
+
+def _print_safely(text: str) -> None:
+    """Print text to stdout, replacing characters stdout cannot encode."""
+    try:
+        print(text)
+    except UnicodeEncodeError:
+        # Narrow codecs (e.g. the cp1252 "charmap" codec on Windows) reject
+        # some characters; degrade those to "?" instead of crashing the CLI.
+        encoding = sys.stdout.encoding or "utf-8"
+        print(text.encode(encoding, errors="replace").decode(encoding))
 
 
 def _exit_with_error(message: str):
-    print(message)
+    _print_safely(message)
     sys.exit(1)
 
 
diff --git a/packages/markitdown/tests/test_module_misc.py b/packages/markitdown/tests/test_module_misc.py
index 4d62e4919..7c93c9834 100644
--- a/packages/markitdown/tests/test_module_misc.py
+++ b/packages/markitdown/tests/test_module_misc.py
@@ -432,6 +432,74 @@ def test_exceptions() -> None:
     assert type(exc_info.value.attempts[0].converter).__name__ == "PptxConverter"
 
 
+def test_unicode_encoding_in_cli() -> None:
+    """Test that the CLI never raises UnicodeEncodeError on non-ASCII output.
+
+    Regression test for issue #1802: converting documents that contain Unicode
+    characters crashed on Windows systems where stdout defaults to a narrow
+    codec such as cp1252 ("charmap") instead of UTF-8.
+    """
+    import os
+    import subprocess
+    import sys
+    import tempfile
+
+    # Test content with various Unicode characters (CJK, emoji, etc.)
+    unicode_test_content = "Hello 世界 🌍 你好"
+
+    # Create a simple HTML file with Unicode content
+    html_content = f"""<!DOCTYPE html>
+<html>
+<head><title>Unicode Test</title></head>
+<body>
+<h1>{unicode_test_content}</h1>
+<p>Testing Unicode encoding: αβγδ εζηθ</p>
+</body>
+</html>
+"""
+
+    def run_cli(stdout_encoding, *cli_args):
+        # Pin the child's stdout codec so the result does not depend on the
+        # platform or locale the test suite happens to run under.
+        return subprocess.run(
+            [sys.executable, "-m", "markitdown", *cli_args],
+            capture_output=True,
+            text=True,
+            encoding=stdout_encoding,
+            env={**os.environ, "PYTHONIOENCODING": stdout_encoding},
+        )
+
+    # The temporary directory is removed even if an assertion below fails
+    with tempfile.TemporaryDirectory() as tmp_dir:
+        html_file = os.path.join(tmp_dir, "unicode.html")
+        output_file = os.path.join(tmp_dir, "unicode.md")
+        with open(html_file, "w", encoding="utf-8") as f:
+            f.write(html_content)
+
+        # UTF-8 stdout: Unicode characters must round-trip unchanged
+        result = run_cli("utf-8", html_file)
+        assert result.returncode == 0, f"CLI failed: {result.stderr}"
+        assert "Hello" in result.stdout
+        assert "世界" in result.stdout
+
+        # Narrow stdout codec: unencodable characters may be replaced, but
+        # the CLI must still succeed instead of raising UnicodeEncodeError
+        result = run_cli("ascii", html_file)
+        assert result.returncode == 0, f"CLI failed: {result.stderr}"
+        assert "Hello" in result.stdout
+        assert "UnicodeEncodeError" not in result.stderr
+
+        # Test file output with explicit UTF-8 encoding
+        result = run_cli("utf-8", "-o", output_file, html_file)
+        assert result.returncode == 0, f"CLI file output failed: {result.stderr}"
+
+        # Verify the output file contains Unicode characters
+        with open(output_file, "r", encoding="utf-8") as f:
+            output_content = f.read()
+        assert "Hello" in output_content
+        assert "世界" in output_content
+
+
 @pytest.mark.skipif(
     skip_exiftool,
     reason="do not run if exiftool is not installed",