main.py 5.1 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165
  1. import sys
  2. import os
  3. from bs4 import BeautifulSoup, NavigableString
  4. def sanitize_filename(title: str) -> str:
  5. return "".join(c if c.isalnum() or c in (' ', '-', '_') else '_' for c in title).strip()[:100]
  6. def get_text_with_formatting(element) -> str:
  7. def walk(node):
  8. if isinstance(node, NavigableString):
  9. return str(node)
  10. elif node.name in ("strong", "b"):
  11. return f"**{''.join(walk(c) for c in node.children)}**"
  12. elif node.name in ("em", "i"):
  13. return f"*{''.join(walk(c) for c in node.children)}*"
  14. elif node.name == "code":
  15. return f"`{''.join(walk(c) for c in node.children)}`"
  16. elif node.name == "a":
  17. href = node.get("href", "#")
  18. label = ''.join(walk(c) for c in node.children)
  19. return f"[{label}]({href})"
  20. elif node.name == "img":
  21. alt = node.get("alt", "")
  22. src = node.get("src", "")
  23. return f"![{alt}]({src})"
  24. else:
  25. return ''.join(walk(c) for c in node.children)
  26. return walk(element).strip()
  27. def extract_markdown_from_conversation(conversation: BeautifulSoup) -> str:
  28. md_lines = []
  29. for element in conversation.children:
  30. if isinstance(element, NavigableString):
  31. text = element.strip().replace("\n", " \n") # Preserve single spacing
  32. if text:
  33. md_lines.append(text)
  34. continue
  35. tag = element.name
  36. # Headings
  37. if tag.startswith("h") and tag[1:].isdigit():
  38. level = int(tag[1:])
  39. md_lines.append(f"{'#' * level} {get_text_with_formatting(element)}")
  40. # Paragraphs with possible links or inline formatting
  41. elif tag == "p":
  42. for link in element.find_all("a"):
  43. href = link.get("href", "#")
  44. link_text = link.get_text(strip=True)
  45. link.replace_with(f"[{link_text}]({href})")
  46. for code in element.find_all("code"):
  47. code_text = code.get_text()
  48. code.replace_with(f"`{code_text}`")
  49. md_lines.append(get_text_with_formatting(element))
  50. # Preformatted code blocks
  51. elif tag == "pre":
  52. code = element.find("code")
  53. if code:
  54. lang_class = code.get("class", [])
  55. language = ""
  56. for cls in lang_class:
  57. if cls.startswith("language-"):
  58. language = cls.replace("language-", "")
  59. break
  60. code_text = code.get_text()
  61. md_lines.append(f"```{language}\n{code_text.strip()}\n```")
  62. # Images
  63. elif tag == "img":
  64. src = element.get("src", "")
  65. alt = element.get("alt", "")
  66. if src:
  67. md_lines.append(f"![{alt}]({src})")
  68. # Inline code
  69. elif tag == "code":
  70. code_text = get_text_with_formatting(element)
  71. md_lines.append(f"`{code_text}`")
  72. # Fallback
  73. else:
  74. text = get_text_with_formatting(element)
  75. if text:
  76. md_lines.append(text)
  77. return "\n\n".join(md_lines)
  78. def convert_chat_html_to_markdown(html_path: str) -> str:
  79. with open(html_path, "r", encoding="utf-8") as f:
  80. soup = BeautifulSoup(f, "html.parser")
  81. title = soup.title.string.strip() if soup.title else "chatgpt_conversation"
  82. filename = sanitize_filename(title) + ".md"
  83. main = soup.find("main")
  84. if not main:
  85. raise ValueError("Could not find <main> in HTML. Is this a valid saved ChatGPT conversation?")
  86. messages = []
  87. conversation = main.find_all("div", attrs={'data-message-author-role': True})
  88. for conversation_turn in conversation:
  89. # ChatGPT response div class="prose"
  90. # User response div class="whitespace-pre-wrap"
  91. content = conversation_turn.find("div", class_=["prose", "whitespace-pre-wrap"])
  92. body = extract_markdown_from_conversation(content).strip()
  93. if not body:
  94. continue
  95. role = conversation_turn.get_attribute_list('data-message-author-role')
  96. if role[0] == "user":
  97. message = f"**You:**\n\n{body}"
  98. else:
  99. message = f"**ChatGPT**\n\n{body}"
  100. messages.append(message)
  101. markdown = f"# {title}\n\n" + "\n\n---\n\n".join(messages)
  102. return filename, markdown
  103. def main(args):
  104. if len(args) != 2:
  105. print("Usage: python html_to_markdown.py <path_to_saved_html>")
  106. sys.exit(1)
  107. input_html = args[1]
  108. if not os.path.isfile(input_html):
  109. input_html_ext = os.path.join("input", input_html)
  110. if not os.path.isfile(input_html_ext):
  111. print(f"File not found: {input_html}")
  112. sys.exit(1)
  113. else:
  114. input_html = input_html_ext
  115. output_name, markdown_text = convert_chat_html_to_markdown(input_html)
  116. os.makedirs("output", exist_ok=True)
  117. output_path = os.path.join("output", output_name)
  118. with open(output_path, "w", encoding="utf-8") as f:
  119. f.write(markdown_text)
  120. print(f"Markdown saved to: {output_path}")
# Run the converter only when this file is executed as a script (not when it
# is imported); the full argument vector is handed to main().
if __name__ == "__main__":
    main(sys.argv)