浏览代码

Initial commit

Josh Rosario 8 月之前
当前提交
41b2b962dc
共有 3 个文件被更改,包括 58 次插入0 次删除
  1. 2 0
      .gitignore
  2. 53 0
      main.py
  3. 3 0
      requirements.txt

+ 2 - 0
.gitignore

@@ -0,0 +1,2 @@
+venv
+input

+ 53 - 0
main.py

@@ -0,0 +1,53 @@
+import sys
+import os
+from bs4 import BeautifulSoup
+
+def sanitize_filename(title: str) -> str:
+    return "".join(c if c.isalnum() or c in (' ', '-', '_') else '_' for c in title).strip()[:100]
+
+def convert_chat_html_to_markdown(html_path: str) -> str:
+    with open(html_path, "r", encoding="utf-8") as f:
+        soup = BeautifulSoup(f, "html.parser")
+
+    title = soup.title.string.strip() if soup.title else "chatgpt_conversation"
+    filename = sanitize_filename(title) + ".md"
+
+    # Find the main chat container
+    main_content = soup.find("main")
+    if not main_content:
+        raise ValueError("Could not find <main> in HTML. Is this a valid saved ChatGPT conversation?")
+
+    # Each message: role in h3, content in a sibling div
+    h3s = main_content.find_all("h3")
+    prose_divs = main_content.find_all("div", class_="prose")
+
+    if len(h3s) != len(prose_divs):
+        print("Warning: Number of roles and messages doesn't match. Continuing anyway...")
+
+    messages = []
+    for role_elem, content_elem in zip(h3s, prose_divs):
+        role = role_elem.get_text(strip=True)
+        content = content_elem.get_text(separator="\n", strip=True)
+        role_prefix = "**ChatGPT:**" if "chatgpt" in role.lower() else "**You:**"
+        messages.append(f"{role_prefix}\n\n{content}")
+
+    markdown = f"# {title}\n\n" + "\n\n---\n\n".join(messages)
+    return filename, markdown
+
+if __name__ == "__main__":
+    if len(sys.argv) != 2:
+        print("Usage: python html_to_markdown.py <path_to_saved_html>")
+        sys.exit(1)
+
+    input_html = sys.argv[1]
+    if not os.path.isfile(input_html):
+        print(f"File not found: {input_html}")
+        sys.exit(1)
+
+    output_name, markdown_text = convert_chat_html_to_markdown(input_html)
+    output_path = os.path.join(os.path.dirname(input_html), output_name)
+
+    with open(output_path, "w", encoding="utf-8") as f:
+        f.write(markdown_text)
+
+    print(f"Markdown saved to: {output_path}")

+ 3 - 0
requirements.txt

@@ -0,0 +1,3 @@
+beautifulsoup4==4.13.4
+soupsieve==2.7
+typing_extensions==4.13.2