main.py 1.9 KB

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253
import os
import sys
from typing import Tuple

from bs4 import BeautifulSoup
  4. def sanitize_filename(title: str) -> str:
  5. return "".join(c if c.isalnum() or c in (' ', '-', '_') else '_' for c in title).strip()[:100]
  6. def convert_chat_html_to_markdown(html_path: str) -> str:
  7. with open(html_path, "r", encoding="utf-8") as f:
  8. soup = BeautifulSoup(f, "html.parser")
  9. title = soup.title.string.strip() if soup.title else "chatgpt_conversation"
  10. filename = sanitize_filename(title) + ".md"
  11. # Find the main chat container
  12. main_content = soup.find("main")
  13. if not main_content:
  14. raise ValueError("Could not find <main> in HTML. Is this a valid saved ChatGPT conversation?")
  15. # Each message: role in h3, content in a sibling div
  16. h3s = main_content.find_all("h3")
  17. prose_divs = main_content.find_all("div", class_="prose")
  18. if len(h3s) != len(prose_divs):
  19. print("Warning: Number of roles and messages doesn't match. Continuing anyway...")
  20. messages = []
  21. for role_elem, content_elem in zip(h3s, prose_divs):
  22. role = role_elem.get_text(strip=True)
  23. content = content_elem.get_text(separator="\n", strip=True)
  24. role_prefix = "**ChatGPT:**" if "chatgpt" in role.lower() else "**You:**"
  25. messages.append(f"{role_prefix}\n\n{content}")
  26. markdown = f"# {title}\n\n" + "\n\n---\n\n".join(messages)
  27. return filename, markdown
  28. if __name__ == "__main__":
  29. if len(sys.argv) != 2:
  30. print("Usage: python html_to_markdown.py <path_to_saved_html>")
  31. sys.exit(1)
  32. input_html = sys.argv[1]
  33. if not os.path.isfile(input_html):
  34. print(f"File not found: {input_html}")
  35. sys.exit(1)
  36. output_name, markdown_text = convert_chat_html_to_markdown(input_html)
  37. output_path = os.path.join(os.path.dirname(input_html), output_name)
  38. with open(output_path, "w", encoding="utf-8") as f:
  39. f.write(markdown_text)
  40. print(f"Markdown saved to: {output_path}")