"""Convert a saved ChatGPT conversation HTML page into a Markdown file."""

import os
import sys
from typing import Tuple

from bs4 import BeautifulSoup

# Title used when the HTML page has no usable <title> text.
_DEFAULT_TITLE = "chatgpt_conversation"


def sanitize_filename(title: str) -> str:
    """Return *title* reduced to characters that are safe in a file name.

    Alphanumeric characters, spaces, hyphens and underscores are kept; every
    other character is replaced by an underscore. Surrounding whitespace is
    stripped and the result is capped at 100 characters.
    """
    safe = "".join(c if c.isalnum() or c in " -_" else "_" for c in title)
    return safe.strip()[:100]


def _extract_title(soup) -> str:
    """Return the stripped page title, or the default if missing or empty."""
    # ``.string`` is None when <title> is empty or contains nested markup,
    # so it must not be dereferenced unconditionally.
    raw = soup.title.string if soup.title else None
    return (raw or "").strip() or _DEFAULT_TITLE


def _format_message(role_elem, content_elem) -> str:
    """Render one role heading / content element pair as a Markdown section."""
    role = role_elem.get_text(strip=True)
    content = content_elem.get_text(separator="\n", strip=True)
    # Any heading that does not mention "chatgpt" is attributed to the user.
    role_prefix = "**ChatGPT:**" if "chatgpt" in role.lower() else "**You:**"
    return f"{role_prefix}\n\n{content}"


def convert_chat_html_to_markdown(html_path: str) -> Tuple[str, str]:
    """Parse a saved chat HTML file and render it as Markdown.

    Args:
        html_path: Path to the HTML file; it is read as UTF-8.

    Returns:
        A ``(filename, markdown)`` pair: the suggested output file name
        (the sanitized page title plus ``.md``) and the Markdown text.

    Raises:
        ValueError: If the document contains no ``<main>`` element.
    """
    with open(html_path, "r", encoding="utf-8") as f:
        soup = BeautifulSoup(f, "html.parser")

    title = _extract_title(soup)
    filename = sanitize_filename(title) + ".md"

    # Find the main chat container.
    main_content = soup.find("main")
    if main_content is None:
        raise ValueError(
            "Could not find <main> in HTML. "
            "Is this a valid saved ChatGPT conversation?"
        )

    # Each message: role in an h3, content in a div with class "prose".
    h3s = main_content.find_all("h3")
    prose_divs = main_content.find_all("div", class_="prose")

    if len(h3s) != len(prose_divs):
        # Best effort: zip() below pairs up as many as both lists provide.
        print(
            "Warning: Number of roles and messages doesn't match. "
            "Continuing anyway...",
            file=sys.stderr,
        )

    messages = [
        _format_message(role_elem, content_elem)
        for role_elem, content_elem in zip(h3s, prose_divs)
    ]

    markdown = f"# {title}\n\n" + "\n\n---\n\n".join(messages)
    return filename, markdown


def main() -> None:
    """Convert the HTML file named on the command line.

    The Markdown file is written next to the input file. Exits with status 1
    on a wrong argument count, a missing input file, or an HTML document
    without a ``<main>`` element.
    """
    if len(sys.argv) != 2:
        print("Usage: python html_to_markdown.py <input.html>", file=sys.stderr)
        sys.exit(1)

    input_html = sys.argv[1]
    if not os.path.isfile(input_html):
        print(f"File not found: {input_html}", file=sys.stderr)
        sys.exit(1)

    try:
        output_name, markdown_text = convert_chat_html_to_markdown(input_html)
    except ValueError as err:
        print(f"Error: {err}", file=sys.stderr)
        sys.exit(1)

    output_path = os.path.join(os.path.dirname(input_html), output_name)
    with open(output_path, "w", encoding="utf-8") as f:
        f.write(markdown_text)

    print(f"Markdown saved to: {output_path}")


if __name__ == "__main__":
    main()