diff --git a/packages/markitdown/src/markitdown/__main__.py b/packages/markitdown/src/markitdown/__main__.py index 6085ad6bb..fdc21c779 100644 --- a/packages/markitdown/src/markitdown/__main__.py +++ b/packages/markitdown/src/markitdown/__main__.py @@ -110,6 +110,12 @@ def main(): help="Keep data URIs (like base64-encoded images) in the output. By default, data URIs are truncated.", ) + parser.add_argument( + "--include-comments", + action="store_true", + help="Include Word comment references in output.", + ) + parser.add_argument("filename", nargs="?") args = parser.parse_args() @@ -191,10 +197,14 @@ def main(): sys.stdin.buffer, stream_info=stream_info, keep_data_uris=args.keep_data_uris, + include_comments=args.include_comments, ) else: result = markitdown.convert( - args.filename, stream_info=stream_info, keep_data_uris=args.keep_data_uris + args.filename, + stream_info=stream_info, + keep_data_uris=args.keep_data_uris, + include_comments=args.include_comments, ) _handle_output(args, result) diff --git a/packages/markitdown/src/markitdown/converters/_docx_converter.py b/packages/markitdown/src/markitdown/converters/_docx_converter.py index 3975107b1..d66e56ab2 100644 --- a/packages/markitdown/src/markitdown/converters/_docx_converter.py +++ b/packages/markitdown/src/markitdown/converters/_docx_converter.py @@ -75,7 +75,13 @@ def convert( _dependency_exc_info[2] ) - style_map = kwargs.get("style_map", None) + style_map = kwargs.pop("style_map", None) + include_comments = kwargs.pop("include_comments", False) + + if include_comments: + comment_style = "comment-reference => sup" + style_map = (style_map + "\n" + comment_style) if style_map else comment_style + pre_process_stream = pre_process_docx(file_stream) return self._html_converter.convert_string( mammoth.convert_to_html(pre_process_stream, style_map=style_map).value,