main.py 4.7 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148
  1. import sys
  2. import os
  3. from bs4 import BeautifulSoup, NavigableString
  4. def sanitize_filename(title: str) -> str:
  5. return "".join(c if c.isalnum() or c in (' ', '-', '_') else '_' for c in title).strip()[:100]
  6. def get_text_with_formatting(element) -> str:
  7. result = []
  8. for content in element.descendants:
  9. if isinstance(content, NavigableString):
  10. result.append(str(content))
  11. elif content.name in ("strong", "b"):
  12. inner = "".join(content.strings)
  13. result.append(f"**{inner}**")
  14. elif content.name in ("em", "i"):
  15. inner = "".join(content.strings)
  16. result.append(f"*{inner}*")
  17. elif content.name == "code":
  18. inner = "".join(content.strings)
  19. result.append(f"`{inner}`")
  20. # Skip tags already replaced (like <a>, handled before calling this)
  21. elif content.name in ("a", "img"):
  22. continue
  23. return " ".join(result).strip()
  24. def extract_markdown_from_prose(prose: BeautifulSoup) -> str:
  25. md_lines = []
  26. for element in prose.children:
  27. if isinstance(element, NavigableString):
  28. text = element.strip()
  29. if text:
  30. md_lines.append(text)
  31. continue
  32. tag = element.name
  33. # Headings
  34. if tag.startswith("h") and tag[1:].isdigit():
  35. level = int(tag[1:])
  36. md_lines.append(f"{'#' * level} {get_text_with_formatting(element)}")
  37. # Paragraphs with possible links or inline formatting
  38. elif tag == "p":
  39. for link in element.find_all("a"):
  40. href = link.get("href", "#")
  41. link_text = link.get_text(strip=True)
  42. link.replace_with(f"[{link_text}]({href})")
  43. for code in element.find_all("code"):
  44. code_text = code.get_text()
  45. code.replace_with(f"`{code_text}`")
  46. md_lines.append(get_text_with_formatting(element))
  47. # Lists
  48. elif tag == "ul":
  49. for li in element.find_all("li"):
  50. md_lines.append(f"- {li.get_text(strip=True)}")
  51. elif tag == "ol":
  52. for idx, li in enumerate(element.find_all("li"), start=1):
  53. md_lines.append(f"{idx}. {li.get_text(strip=True)}")
  54. # Preformatted code blocks
  55. elif tag == "pre":
  56. code = element.find("code")
  57. if code:
  58. lang_class = code.get("class", [])
  59. language = ""
  60. for cls in lang_class:
  61. if cls.startswith("language-"):
  62. language = cls.replace("language-", "")
  63. break
  64. code_text = code.get_text()
  65. md_lines.append(f"```{language}\n{code_text.strip()}\n```")
  66. # Images
  67. elif tag == "img":
  68. src = element.get("src", "")
  69. alt = element.get("alt", "")
  70. if src:
  71. md_lines.append(f"![{alt}]({src})")
  72. # Inline code
  73. elif tag == "code":
  74. code_text = get_text_with_formatting(element)
  75. md_lines.append(f"`{code_text}`")
  76. # Fallback
  77. else:
  78. text = get_text_with_formatting(element)
  79. if text:
  80. md_lines.append(text)
  81. return "\n\n".join(md_lines)
  82. def convert_chat_html_to_markdown(html_path: str) -> str:
  83. with open(html_path, "r", encoding="utf-8") as f:
  84. soup = BeautifulSoup(f, "html.parser")
  85. title = soup.title.string.strip() if soup.title else "chatgpt_conversation"
  86. filename = sanitize_filename(title) + ".md"
  87. main = soup.find("main")
  88. if not main:
  89. raise ValueError("Could not find <main> in HTML. Is this a valid saved ChatGPT conversation?")
  90. h3_tags = main.find_all("h3")
  91. prose_blocks = main.find_all("div", class_="prose")
  92. messages = []
  93. for h3, prose in zip(h3_tags, prose_blocks):
  94. role = h3.get_text(strip=True)
  95. if "chatgpt" in role.lower():
  96. prefix = "**ChatGPT:**"
  97. else:
  98. prefix = "**You:**"
  99. body = extract_markdown_from_prose(prose)
  100. messages.append(f"{prefix}\n\n{body}")
  101. markdown = f"# {title}\n\n" + "\n\n---\n\n".join(messages)
  102. return filename, markdown
  103. if __name__ == "__main__":
  104. if len(sys.argv) != 2:
  105. print("Usage: python html_to_markdown.py <path_to_saved_html>")
  106. sys.exit(1)
  107. input_html = sys.argv[1]
  108. if not os.path.isfile(input_html):
  109. print(f"File not found: {input_html}")
  110. sys.exit(1)
  111. output_name, markdown_text = convert_chat_html_to_markdown(input_html)
  112. output_path = os.path.join("output", output_name)
  113. with open(output_path, "w", encoding="utf-8") as f:
  114. f.write(markdown_text)
  115. print(f"Markdown saved to: {output_path}")