| 1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253 |
- import sys
- import os
- from bs4 import BeautifulSoup
def sanitize_filename(title: str, max_length: int = 100) -> str:
    """Turn an arbitrary title into a string that is safe to use as a file name.

    Alphanumeric characters, spaces, hyphens and underscores are kept; every
    other character is replaced with an underscore. Surrounding whitespace is
    removed and the result is limited to ``max_length`` characters.

    Args:
        title: The raw title text.
        max_length: Maximum number of characters in the result.

    Returns:
        The sanitized name, or ``"untitled"`` if nothing remains after
        sanitizing (e.g. an empty or whitespace-only title).
    """
    allowed_extra = (" ", "-", "_")
    cleaned = "".join(
        c if c.isalnum() or c in allowed_extra else "_" for c in title
    )
    # Trim again after truncating: cutting at max_length can expose a
    # trailing space, which some filesystems silently drop or reject.
    cleaned = cleaned.strip()[:max_length].rstrip()
    # An empty name would make the caller produce a bare ".md" file.
    return cleaned or "untitled"
def convert_chat_html_to_markdown(html_path: str) -> "tuple[str, str]":
    """Convert a saved ChatGPT conversation HTML file to Markdown.

    Speaker roles are read from the ``<h3>`` elements inside ``<main>`` and
    message bodies from the ``<div class="prose">`` elements inside it; the
    two lists are paired positionally.

    Args:
        html_path: Path to the saved HTML file (read as UTF-8).

    Returns:
        A ``(filename, markdown)`` tuple: the suggested output file name
        (sanitized page title plus ``.md``) and the Markdown text.

    Raises:
        ValueError: If the document contains no ``<main>`` element.
    """
    default_title = "chatgpt_conversation"

    with open(html_path, "r", encoding="utf-8") as f:
        soup = BeautifulSoup(f, "html.parser")

    # ``Tag.string`` is None for an empty <title> or one with nested markup,
    # so read the text with get_text() and fall back when nothing is left.
    title_tag = soup.title
    title = title_tag.get_text().strip() if title_tag is not None else ""
    if not title:
        title = default_title
    filename = sanitize_filename(title) + ".md"

    # Find the main chat container
    main_content = soup.find("main")
    if main_content is None:
        raise ValueError("Could not find <main> in HTML. Is this a valid saved ChatGPT conversation?")

    # Each message: role in h3, content in a sibling div
    h3s = main_content.find_all("h3")
    prose_divs = main_content.find_all("div", class_="prose")
    if len(h3s) != len(prose_divs):
        # Best effort: zip() below pairs up to the shorter list and drops
        # the unmatched remainder.
        print("Warning: Number of roles and messages doesn't match. Continuing anyway...")

    messages = []
    for role_elem, content_elem in zip(h3s, prose_divs):
        role = role_elem.get_text(strip=True)
        content = content_elem.get_text(separator="\n", strip=True)
        # Any heading that does not mention ChatGPT is treated as the user.
        role_prefix = "**ChatGPT:**" if "chatgpt" in role.lower() else "**You:**"
        messages.append(f"{role_prefix}\n\n{content}")

    markdown = f"# {title}\n\n" + "\n\n---\n\n".join(messages)
    return filename, markdown
if __name__ == "__main__":
    # Command-line entry point: expects exactly one argument, the path to
    # the saved HTML file.
    cli_args = sys.argv[1:]
    if len(cli_args) != 1:
        print("Usage: python html_to_markdown.py <path_to_saved_html>")
        sys.exit(1)

    (source_path,) = cli_args
    if not os.path.isfile(source_path):
        print(f"File not found: {source_path}")
        sys.exit(1)

    md_name, md_body = convert_chat_html_to_markdown(source_path)

    # The Markdown file is written to the same directory as the input file.
    target_dir = os.path.dirname(source_path)
    target_path = os.path.join(target_dir, md_name)
    with open(target_path, "w", encoding="utf-8") as out_file:
        out_file.write(md_body)

    print(f"Markdown saved to: {target_path}")
|