"""Convert a saved ChatGPT conversation HTML page into a Markdown file.

Usage: python html_to_markdown.py <input.html>

The result is written to ``output/<sanitized page title>.md``.
"""

import os
import sys
from typing import Tuple

from bs4 import BeautifulSoup, NavigableString

# Title used when the page has no usable <title> element.
_DEFAULT_TITLE = "chatgpt_conversation"


def sanitize_filename(title: str) -> str:
    """Return *title* reduced to characters that are safe in a file name.

    Every character that is not alphanumeric, a space, ``-`` or ``_`` is
    replaced with ``_``. The result is stripped of surrounding whitespace
    and then truncated to 100 characters.
    """
    cleaned = "".join(
        c if c.isalnum() or c in (" ", "-", "_") else "_" for c in title
    )
    return cleaned.strip()[:100]


def get_text_with_formatting(element) -> str:
    """Render *element* and its descendants as inline Markdown.

    Handles bold (``strong``/``b``), italics (``em``/``i``), inline
    ``code``, links and images; any other tag contributes only the
    rendered text of its children. The final string is stripped of
    leading and trailing whitespace.
    """

    def walk(node) -> str:
        if isinstance(node, NavigableString):
            return str(node)

        name = node.name
        if name == "img":
            alt = node.get("alt", "")
            src = node.get("src", "")
            return f"![{alt}]({src})"

        inner = "".join(walk(child) for child in node.children)
        if name in ("strong", "b"):
            return f"**{inner}**"
        if name in ("em", "i"):
            return f"*{inner}*"
        if name == "code":
            return f"`{inner}`"
        if name == "a":
            href = node.get("href", "#")
            return f"[{inner}]({href})"
        return inner

    # The original had a second, unreachable ``return`` here that referenced
    # an undefined name; it has been removed.
    return walk(element).strip()


def extract_markdown_from_prose(prose: BeautifulSoup) -> str:
    """Convert the direct children of a prose container to Markdown.

    Each top-level child (heading, paragraph, ``pre`` block, image, inline
    code, bare text, or anything else) becomes one Markdown block; blocks
    are joined with blank lines.

    Note: links and inline code inside ``<p>`` children are replaced in the
    parsed tree by their Markdown text, so *prose* is modified in place.
    """
    md_lines = []

    for element in prose.children:
        if isinstance(element, NavigableString):
            text = element.strip()
            if text:
                md_lines.append(text)
            continue

        tag = element.name

        # Headings: h1, h2, ... map to the matching number of '#'.
        if tag.startswith("h") and tag[1:].isdigit():
            level = int(tag[1:])
            md_lines.append(
                f"{'#' * level} {get_text_with_formatting(element)}"
            )

        # Paragraphs with possible links or inline formatting.
        elif tag == "p":
            # Links are flattened first so that their label is plain,
            # whitespace-stripped text.
            for link in element.find_all("a"):
                href = link.get("href", "#")
                link_text = link.get_text(strip=True)
                link.replace_with(f"[{link_text}]({href})")
            for code in element.find_all("code"):
                code.replace_with(f"`{code.get_text()}`")
            md_lines.append(get_text_with_formatting(element))

        # Preformatted code blocks.
        elif tag == "pre":
            code = element.find("code")
            language = ""
            if code:
                for cls in code.get("class", []):
                    if cls.startswith("language-"):
                        language = cls.replace("language-", "")
                        break
                code_text = code.get_text()
            else:
                # A <pre> without a nested <code> still carries content;
                # emit it as an unlabelled fence instead of dropping it.
                code_text = element.get_text()
            md_lines.append(f"```{language}\n{code_text.strip()}\n```")

        # Images (skipped when there is no source to point at).
        elif tag == "img":
            src = element.get("src", "")
            alt = element.get("alt", "")
            if src:
                md_lines.append(f"![{alt}]({src})")

        # Inline code appearing as a direct child.
        elif tag == "code":
            # get_text_with_formatting already wraps <code> in backticks;
            # wrapping again would produce doubled backticks.
            md_lines.append(get_text_with_formatting(element))

        # Fallback: keep whatever inline text the element renders to.
        else:
            text = get_text_with_formatting(element)
            if text:
                md_lines.append(text)

    return "\n\n".join(md_lines)


def convert_chat_html_to_markdown(html_path: str) -> Tuple[str, str]:
    """Parse the HTML file at *html_path* and build its Markdown transcript.

    Messages are formed by pairing, in document order, each ``<h3>`` inside
    ``<main>`` with a ``<div class="prose">``; the heading text decides
    whether the message is labelled as ChatGPT or as the user. Messages
    whose body renders to nothing are skipped.

    Returns:
        A ``(filename, markdown)`` tuple: the suggested output file name
        (sanitized page title plus ``.md``) and the Markdown document.

    Raises:
        ValueError: if the document has no ``<main>`` element.
    """
    with open(html_path, "r", encoding="utf-8") as f:
        soup = BeautifulSoup(f, "html.parser")

    # ``Tag.string`` is None for an empty <title> or one with nested markup,
    # so read the text via get_text() and fall back to the default title.
    raw_title = soup.title.get_text(strip=True) if soup.title else ""
    title = raw_title or _DEFAULT_TITLE
    filename = sanitize_filename(title) + ".md"

    main = soup.find("main")
    if not main:
        raise ValueError(
            "Could not find <main> in HTML. "
            "Is this a valid saved ChatGPT conversation?"
        )

    h3_tags = main.find_all("h3")
    prose_blocks = main.find_all("div", class_="prose")

    messages = []
    for h3, prose in zip(h3_tags, prose_blocks):
        role = h3.get_text(strip=True)
        prefix = "**ChatGPT:**" if "chatgpt" in role.lower() else "**You:**"

        body = extract_markdown_from_prose(prose).strip()
        if not body:
            continue
        messages.append(f"{prefix}\n\n{body}")

    markdown = f"# {title}\n\n" + "\n\n---\n\n".join(messages)
    return filename, markdown


def main() -> None:
    """Command-line entry point: convert one HTML file into ``output/``."""
    if len(sys.argv) != 2:
        print("Usage: python html_to_markdown.py <input.html>")
        sys.exit(1)

    input_html = sys.argv[1]
    if not os.path.isfile(input_html):
        print(f"File not found: {input_html}")
        sys.exit(1)

    output_name, markdown_text = convert_chat_html_to_markdown(input_html)

    os.makedirs("output", exist_ok=True)
    output_path = os.path.join("output", output_name)
    with open(output_path, "w", encoding="utf-8") as f:
        f.write(markdown_text)

    print(f"Markdown saved to: {output_path}")


if __name__ == "__main__":
    main()