# main.py (6.8 KB)
  1. import sys
  2. import os
  3. from datetime import datetime, timezone
  4. from bs4 import BeautifulSoup, NavigableString
  5. def sanitize_filename(title: str) -> str:
  6. return "".join(c if c.isalnum() or c in (' ', '-', '_') else '_' for c in title).strip()[:100]
  7. def get_text_with_formatting(element) -> str:
  8. def walk(node):
  9. if isinstance(node, NavigableString):
  10. return str(node)
  11. elif node.name in ("strong", "b"):
  12. return f"**{''.join(walk(c) for c in node.children)}**"
  13. elif node.name in ("em", "i"):
  14. return f"*{''.join(walk(c) for c in node.children)}*"
  15. elif node.name == "code":
  16. return f"`{''.join(walk(c) for c in node.children)}`"
  17. elif node.name == "a":
  18. href = node.get("href", "#")
  19. label = ''.join(walk(c) for c in node.children)
  20. return f"[{label}]({href})"
  21. elif node.name == "img":
  22. alt = node.get("alt", "")
  23. src = node.get("src", "")
  24. return f"![{alt}]({src})"
  25. else:
  26. return ''.join(walk(c) for c in node.children)
  27. return walk(element).strip()
  28. def extract_markdown_from_conversation(conversation: BeautifulSoup) -> str:
  29. md_lines = []
  30. for element in conversation.children:
  31. if isinstance(element, NavigableString):
  32. text = element.strip().replace("\n", " \n") # Preserve single spacing
  33. text = text.replace("<", "\<").replace(">", "\>") # Escape HTML tags that aren't in Code blocks
  34. if text:
  35. md_lines.append(text)
  36. continue
  37. tag = element.name
  38. # Headings
  39. if tag.startswith("h") and tag[1:].isdigit():
  40. level = int(tag[1:])
  41. md_lines.append(f"{'#' * level} {get_text_with_formatting(element)}")
  42. # Paragraphs with possible links or inline formatting
  43. elif tag == "p":
  44. for link in element.find_all("a"):
  45. href = link.get("href", "#")
  46. link_text = link.get_text(strip=True)
  47. link.replace_with(f"[{link_text}]({href})")
  48. for code in element.find_all("code"):
  49. code_text = code.get_text()
  50. code.replace_with(f"`{code_text}`")
  51. md_lines.append(get_text_with_formatting(element))
  52. # Preformatted code blocks
  53. elif tag == "pre":
  54. code = element.find("code")
  55. if code:
  56. lang_class = code.get("class", [])
  57. language = ""
  58. for cls in lang_class:
  59. if cls.startswith("language-"):
  60. language = cls.replace("language-", "")
  61. break
  62. code_text = code.get_text()
  63. md_lines.append(f"```{language}\n{code_text.strip()}\n```")
  64. # Images
  65. elif tag == "img":
  66. src = element.get("src", "")
  67. alt = element.get("alt", "")
  68. if src:
  69. md_lines.append(f"![{alt}]({src})")
  70. # Inline code
  71. elif tag == "code":
  72. code_text = get_text_with_formatting(element)
  73. md_lines.append(f"`{code_text}`")
  74. # Tables
  75. elif tag == "table":
  76. def extract_rows(section):
  77. return section.find_all("tr") if section else []
  78. thead = element.find("thead")
  79. tbody = element.find("tbody")
  80. tfoot = element.find("tfoot")
  81. rows = extract_rows(thead) + extract_rows(tbody) + extract_rows(element) + extract_rows(tfoot)
  82. seen = set()
  83. filtered_rows = []
  84. for tr in rows:
  85. if tr not in seen:
  86. seen.add(tr)
  87. filtered_rows.append(tr)
  88. if not filtered_rows:
  89. continue
  90. table_lines = []
  91. for row_idx, tr in enumerate(filtered_rows):
  92. cells = tr.find_all(["th", "td"])
  93. row = [get_text_with_formatting(cell).strip() for cell in cells]
  94. line = "| " + " | ".join(row) + " |"
  95. table_lines.append(line)
  96. # After the first row, add separator (assuming it's the header)
  97. if row_idx == 0:
  98. separator = "| " + " | ".join(["---"] * len(row)) + " |"
  99. table_lines.insert(1, separator)
  100. md_lines.append("\n".join(table_lines))
  101. # Fallback
  102. else:
  103. text = get_text_with_formatting(element)
  104. if text:
  105. md_lines.append(text)
  106. return "\n\n".join(md_lines)
  107. def convert_chat_html_to_markdown(html_path: str) -> str:
  108. with open(html_path, "r", encoding="utf-8") as f:
  109. soup = BeautifulSoup(f, "html.parser")
  110. title = soup.title.string.strip() if soup.title else "chatgpt_conversation"
  111. filename = sanitize_filename(title) + ".md"
  112. main = soup.find("main")
  113. if not main:
  114. raise ValueError("Could not find <main> in HTML. Is this a valid saved ChatGPT conversation?")
  115. messages = []
  116. conversation = main.find_all("div", attrs={'data-message-author-role': True})
  117. for conversation_turn in conversation:
  118. # ChatGPT response div class="prose"
  119. # User response div class="whitespace-pre-wrap"
  120. content = conversation_turn.find("div", class_=["prose", "whitespace-pre-wrap"])
  121. body = extract_markdown_from_conversation(content).strip()
  122. if not body:
  123. continue
  124. role = conversation_turn.get_attribute_list('data-message-author-role')
  125. if role[0] == "user":
  126. message = f"**You:**\n\n{body}"
  127. else:
  128. message = f"**ChatGPT**\n\n{body}"
  129. messages.append(message)
  130. dd_trace_time = int(int(soup.find("meta", {"name":"dd-trace-time"}).attrs["content"]) / 1000) # UTC time
  131. timestamp = datetime.fromtimestamp(dd_trace_time).strftime("%c")
  132. header = f"*ChatGPT conversation saved {timestamp} converted to Markdown*"
  133. markdown = f"# {title}\n\n{header}\n\n---\n\n" + "\n\n---\n\n".join(messages)
  134. return filename, markdown
  135. def main(args):
  136. if len(args) != 2:
  137. print("Usage: python html_to_markdown.py <path_to_saved_html>")
  138. sys.exit(1)
  139. input_html = args[1]
  140. if not os.path.isfile(input_html):
  141. input_html_ext = os.path.join("input", input_html)
  142. if not os.path.isfile(input_html_ext):
  143. print(f"File not found: {input_html}")
  144. sys.exit(1)
  145. else:
  146. input_html = input_html_ext
  147. output_name, markdown_text = convert_chat_html_to_markdown(input_html)
  148. os.makedirs("output", exist_ok=True)
  149. output_path = os.path.join("output", output_name)
  150. with open(output_path, "w", encoding="utf-8") as f:
  151. f.write(markdown_text)
  152. print(f"Markdown saved to: {output_path}")
# Run the converter only when this file is executed as a script; main()
# receives the full argument vector, including the program name.
if __name__ == "__main__":
    main(sys.argv)