HTML2PDF/HTML2PDF.py at main · SebastianZzzz/HTML2PDF · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
import argparse
import asyncio
import os
from pathlib import Path
from playwright.async_api import async_playwright

async def convert_single(input_source: str, output_path: str, is_url: bool) -> bool:
    """Convert a single local HTML file or an online URL to a PDF.

    A headless Chromium instance is launched, the page is loaded until the
    network is idle, and it is printed as an A4 PDF with 1 cm margins and
    background graphics enabled.

    Args:
        input_source: URL to load when ``is_url`` is true, otherwise a path
            to a local HTML file (relative paths are made absolute).
        output_path: Destination path of the generated PDF.
        is_url: Whether ``input_source`` is a URL rather than a local file.

    Returns:
        ``True`` when the PDF was written, ``False`` when loading or printing
        failed (the error is printed, not raised).
    """
    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=True)

        try:
            page = await browser.new_page()

            if is_url:
                target = input_source
                print(f"[*] Loading URL: {input_source}")
            else:
                abs_path = os.path.abspath(input_source)
                # as_uri() percent-encodes characters such as '#', '?', '%'
                # and spaces, and produces a valid file URL for Windows drive
                # paths; plain "file://" + path does neither.
                target = Path(abs_path).as_uri()
                print(f"[*] Loading local file: {abs_path}")

            await page.goto(target, wait_until="networkidle")

            await page.pdf(
                path=output_path,
                format="A4",
                print_background=True,
                margin={"top": "1cm", "right": "1cm", "bottom": "1cm", "left": "1cm"},
            )
            print(f"[+] Successfully saved PDF to: {output_path}")
            return True

        except Exception as e:
            # Best-effort CLI behaviour: report the failure instead of raising,
            # but let the caller know through the return value.
            print(f"[-] Error during conversion: {e}")
            return False
        finally:
            await browser.close()

def _unique_pdf_name(html_file: Path, input_root: Path, taken: set) -> str:
    """Return a PDF file name for ``html_file`` not yet used in this batch run.

    The bare stem is preferred, so files whose names do not clash keep the
    plain ``<stem>.pdf`` name. On a clash the path relative to ``input_root``
    is flattened with underscores, and a numeric suffix is the last resort.
    Names are compared case-insensitively so the result is also safe on
    case-insensitive file systems. ``taken`` is updated in place.
    """
    relative_stem = html_file.relative_to(input_root).with_suffix("")
    flattened = "_".join(relative_stem.parts)

    for base in (html_file.stem, flattened):
        if base.casefold() not in taken:
            break
    else:
        counter = 2
        base = f"{flattened}_{counter}"
        while base.casefold() in taken:
            counter += 1
            base = f"{flattened}_{counter}"

    taken.add(base.casefold())
    return f"{base}.pdf"


async def convert_batch(input_dir: str, output_dir: str) -> bool:
    """Batch-convert every HTML file under a directory to PDF.

    ``input_dir`` is searched recursively for ``.html`` / ``.htm`` files
    (extension matched case-insensitively). All PDFs are written directly
    into ``output_dir``, which is created if needed. A single headless
    Chromium instance is reused, with a fresh page per file, and one failing
    file does not stop the batch.

    Args:
        input_dir: Directory to search for HTML files.
        output_dir: Directory that receives the generated PDFs.

    Returns:
        ``False`` if ``input_dir`` is not a directory or at least one file
        failed to convert; ``True`` otherwise (including when no HTML files
        were found).
    """
    input_path = Path(input_dir)
    output_path = Path(output_dir)

    if not input_path.exists() or not input_path.is_dir():
        print(f"[-] Input directory not found: '{input_dir}'")
        return False

    output_path.mkdir(parents=True, exist_ok=True)
    # Check the name first so only matching entries are stat()-ed; is_file()
    # keeps directories that merely end in ".html" out of the batch.
    html_files = sorted(
        entry
        for entry in input_path.rglob("*")
        if entry.name.lower().endswith((".html", ".htm")) and entry.is_file()
    )

    if not html_files:
        print(f"[!] No HTML files found in '{input_dir}'.")
        return True

    print(f"[*] Found {len(html_files)} HTML files. Starting batch conversion...\n")

    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=True)
        success_count = 0
        error_count = 0
        # The search is recursive but the output is flat, so equal stems from
        # different folders (or foo.html next to foo.htm) would otherwise
        # overwrite each other's PDF.
        used_names = set()

        try:
            for html_file in html_files:
                # as_uri() escapes '#', '?', '%', spaces and handles Windows
                # drive letters, unlike "file://" + path.
                file_url = html_file.resolve().as_uri()
                pdf_filename = _unique_pdf_name(html_file, input_path, used_names)
                abs_output_path = output_path / pdf_filename

                print(f"[*] Processing: {html_file.name}")
                page = await browser.new_page()

                try:
                    await page.goto(file_url, wait_until="networkidle")
                    await page.pdf(
                        path=str(abs_output_path),
                        format="A4",
                        print_background=True,
                        margin={"top": "1cm", "right": "1cm", "bottom": "1cm", "left": "1cm"},
                    )
                    print(f"  [+] Success -> {pdf_filename}")
                    success_count += 1
                except Exception as e:
                    # Keep going: one broken file must not abort the batch.
                    print(f"  [-] Failed: {e}")
                    error_count += 1
                finally:
                    await page.close()
        finally:
            await browser.close()

    print(f"\n[=] Batch completed! Success: {success_count}, Failed: {error_count}.")
    print(f"[=] Files saved to: {output_path.resolve()}")
    return error_count == 0

def main(argv=None):
    """Parse command-line arguments and run the requested conversion.

    Exactly one of ``-u/--url``, ``-f/--file`` or ``-b/--batch`` must be
    given, together with ``-o/--output``.

    Args:
        argv: Argument list to parse; ``None`` (the default) makes argparse
            read ``sys.argv[1:]``.

    Raises:
        SystemExit: With status 1 when the local input file does not exist or
            the conversion reports failure, and with status 2 (raised by
            argparse) for invalid or empty arguments.
    """
    parser = argparse.ArgumentParser(description="HTML2PDF: A robust HTML/URL to PDF converter using Playwright.")

    # Mutually exclusive group: the user must pick URL, single-file or batch mode.
    group = parser.add_mutually_exclusive_group(required=True)
    group.add_argument("-u", "--url", help="Convert a single online URL to PDF.")
    group.add_argument("-f", "--file", help="Convert a single local HTML file to PDF.")
    group.add_argument("-b", "--batch", help="Batch convert all HTML files in a directory.")

    parser.add_argument("-o", "--output", required=True, help="Output PDF file path (for -u/-f) or Output directory (for -b).")

    args = parser.parse_args(argv)

    if args.url:
        succeeded = asyncio.run(convert_single(args.url, args.output, is_url=True))
    elif args.file:
        if not os.path.exists(args.file):
            print(f"[-] Local file not found: {args.file}")
            succeeded = False
        else:
            succeeded = asyncio.run(convert_single(args.file, args.output, is_url=False))
    elif args.batch:
        succeeded = asyncio.run(convert_batch(args.batch, args.output))
    else:
        # argparse accepts an empty string as a value (e.g. -u ""), which is
        # falsy and would otherwise fall through without doing anything.
        parser.error("the value given to -u/--url, -f/--file or -b/--batch must not be empty")

    # Compare against False explicitly: a converter that returns nothing
    # (None) is treated as "no failure reported".
    if succeeded is False:
        raise SystemExit(1)

# Run the command-line interface only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()