|
@@ -0,0 +1,53 @@
|
|
|
|
|
+import sys
|
|
|
|
|
+import os
|
|
|
|
|
+from bs4 import BeautifulSoup
|
|
|
|
|
+
|
|
|
|
|
def sanitize_filename(title: str) -> str:
    """Turn an arbitrary title into a safe file-name stem.

    Every character that is not alphanumeric, a space, a hyphen or an
    underscore is replaced with ``_``. The result is limited to 100
    characters and carries no leading or trailing spaces.

    Args:
        title: Free-form text, e.g. the ``<title>`` of an HTML page.

    Returns:
        The sanitized stem (without extension), or ``"untitled"`` when
        nothing usable remains (empty or whitespace-only input).
    """
    allowed_punctuation = " -_"
    cleaned = "".join(
        ch if ch.isalnum() or ch in allowed_punctuation else "_"
        for ch in title
    )
    # Strip again after truncating: the cut at 100 characters can land right
    # after a space, and trailing spaces in file names are trouble on some
    # platforms.
    cleaned = cleaned.strip()[:100].strip()
    # An empty stem would yield a hidden ".md" file once the extension is added.
    return cleaned or "untitled"
|
|
|
|
|
+
|
|
|
|
|
def convert_chat_html_to_markdown(html_path: str) -> tuple[str, str]:
    """Convert a saved ChatGPT conversation HTML file to Markdown.

    The page title becomes the Markdown heading and, once sanitized, the
    output file name. Messages are read from the ``<main>`` element: each
    ``<h3>`` is treated as a speaker label and each ``<div class="prose">``
    as a message body, paired up in document order.

    Args:
        html_path: Path to the saved HTML file; it is read as UTF-8.

    Returns:
        A ``(filename, markdown)`` tuple: the suggested ``.md`` file name
        and the full Markdown text.

    Raises:
        ValueError: If the document contains no ``<main>`` element.
    """
    default_title = "chatgpt_conversation"

    with open(html_path, "r", encoding="utf-8") as f:
        soup = BeautifulSoup(f, "html.parser")

    # ``soup.title.string`` is None for an empty <title> or one with nested
    # markup, so collect the text with get_text() instead, and fall back to
    # the default name when the title is missing or blank.
    title = soup.title.get_text(strip=True) if soup.title is not None else ""
    title = title or default_title
    filename = sanitize_filename(title) + ".md"

    # Find the main chat container.
    main_content = soup.find("main")
    if main_content is None:
        raise ValueError("Could not find <main> in HTML. Is this a valid saved ChatGPT conversation?")

    # Each message: role in an h3, content in a div with class "prose".
    h3s = main_content.find_all("h3")
    prose_divs = main_content.find_all("div", class_="prose")

    if len(h3s) != len(prose_divs):
        # zip() below stops at the shorter list, so surplus items are dropped.
        print(
            "Warning: Number of roles and messages doesn't match. Continuing anyway...",
            file=sys.stderr,
        )

    messages = []
    for role_elem, content_elem in zip(h3s, prose_divs):
        role = role_elem.get_text(strip=True)
        content = content_elem.get_text(separator="\n", strip=True)
        # Any label that does not mention "chatgpt" is attributed to the user.
        role_prefix = "**ChatGPT:**" if "chatgpt" in role.lower() else "**You:**"
        messages.append(f"{role_prefix}\n\n{content}")

    markdown = f"# {title}\n\n" + "\n\n---\n\n".join(messages)
    return filename, markdown
|
|
|
|
|
+
|
|
|
|
|
if __name__ == "__main__":
    # Command-line entry point: convert one saved HTML file and write the
    # Markdown next to it. Failures are reported through sys.exit(message),
    # which prints the message to stderr and exits with status 1.
    if len(sys.argv) != 2:
        sys.exit("Usage: python html_to_markdown.py <path_to_saved_html>")

    input_html = sys.argv[1]
    if not os.path.isfile(input_html):
        sys.exit(f"File not found: {input_html}")

    try:
        output_name, markdown_text = convert_chat_html_to_markdown(input_html)
    except ValueError as exc:
        # Raised when the page has no <main>; UnicodeDecodeError (non-UTF-8
        # input) is a ValueError subclass and is reported the same way.
        sys.exit(f"Conversion failed: {exc}")

    # The output lands in the same directory as the input file.
    output_path = os.path.join(os.path.dirname(input_html), output_name)

    with open(output_path, "w", encoding="utf-8") as f:
        f.write(markdown_text)

    print(f"Markdown saved to: {output_path}")
|