"""Convert a saved ChatGPT conversation HTML page into a Markdown file."""

import os
import sys
from typing import List, Tuple

from bs4 import BeautifulSoup, NavigableString

# Markdown delimiter wrapped around the rendered content of each inline tag.
_INLINE_MARKERS = {"strong": "**", "b": "**", "em": "*", "i": "*"}

# Title (and file stem) used when the page has no usable <title>.
_DEFAULT_TITLE = "chatgpt_conversation"

# Directory, relative to the working directory, that receives the output.
_OUTPUT_DIR = "output"


def sanitize_filename(title: str) -> str:
    """Return *title* reduced to characters that are safe in a file name.

    Every character that is not alphanumeric, a space, ``-`` or ``_`` is
    replaced by ``_``. The result is stripped of surrounding whitespace and
    truncated to 100 characters.
    """
    allowed_punctuation = (" ", "-", "_")
    cleaned = "".join(
        c if c.isalnum() or c in allowed_punctuation else "_" for c in title
    )
    return cleaned.strip()[:100]


def _render_inline(node) -> str:
    """Render one parse-tree node (string or tag) as inline Markdown."""
    if isinstance(node, NavigableString):
        return str(node)

    name = node.name
    if name == "img":
        src = node.get("src", "")
        return f"![{node.get('alt', '')}]({src})" if src else ""
    if name == "code":
        # Code content is taken verbatim; nested markup must not be
        # translated into Markdown markers inside the backticks.
        return f"`{node.get_text()}`"

    # Recurse over direct children only, so every text node is visited
    # exactly once (walking ``descendants`` would emit wrapped text twice).
    inner = "".join(_render_inline(child) for child in node.children)
    if name == "a":
        return f"[{inner.strip()}]({node.get('href', '#')})"

    marker = _INLINE_MARKERS.get(name)
    if marker and inner.strip():
        return f"{marker}{inner}{marker}"
    return inner


def get_text_with_formatting(element) -> str:
    """Return the content of *element* as inline Markdown.

    ``<strong>``/``<b>`` become ``**bold**``, ``<em>``/``<i>`` become
    ``*italic*``, ``<code>`` becomes inline code, ``<a>`` becomes a Markdown
    link and ``<img>`` a Markdown image. Any other tag contributes only the
    rendering of its children. The tree is not modified.

    Args:
        element: A parsed tag whose children should be rendered.

    Returns:
        The rendered text with surrounding whitespace stripped.
    """
    rendered = "".join(_render_inline(child) for child in element.children)
    return rendered.strip()


def _list_lines(list_tag, depth: int = 0) -> List[str]:
    """Return one Markdown line per item of a ``<ul>``/``<ol>`` tag.

    Only direct ``<li>`` children are numbered at this level; lists nested
    inside an item are rendered recursively with four extra spaces of
    indentation per level, so no item is emitted twice.
    """
    ordered = list_tag.name == "ol"
    indent = "    " * depth
    lines: List[str] = []
    items = list_tag.find_all("li", recursive=False)
    for idx, li in enumerate(items, start=1):
        nested_lists = []
        inline_parts = []
        for child in li.children:
            # Strings have no usable ``name``; getattr keeps this uniform.
            if getattr(child, "name", None) in ("ul", "ol"):
                nested_lists.append(child)
            else:
                inline_parts.append(_render_inline(child))
        # Collapse whitespace so the item stays on a single Markdown line.
        text = " ".join("".join(inline_parts).split())
        bullet = f"{idx}." if ordered else "-"
        lines.append(f"{indent}{bullet} {text}")
        for nested in nested_lists:
            lines.extend(_list_lines(nested, depth + 1))
    return lines


def extract_markdown_from_prose(prose: BeautifulSoup) -> str:
    """Convert the direct children of a prose container into Markdown.

    Handles headings, paragraphs, ordered and unordered lists (including
    nested ones), ``<pre>`` code blocks, images and bare ``<code>`` tags;
    any other tag falls back to its inline-formatted text.

    Args:
        prose: The parsed container whose children are the message body.

    Returns:
        The Markdown blocks joined by blank lines.
    """
    md_lines: List[str] = []

    for element in prose.children:
        # Bare text directly inside the container.
        if isinstance(element, NavigableString):
            text = element.strip()
            if text:
                md_lines.append(text)
            continue

        tag = element.name

        # Headings
        if tag.startswith("h") and tag[1:].isdigit():
            level = int(tag[1:])
            md_lines.append(
                f"{'#' * level} {get_text_with_formatting(element)}"
            )

        # Paragraphs: links, code and emphasis are rendered inline.
        elif tag == "p":
            text = get_text_with_formatting(element)
            if text:
                md_lines.append(text)

        # Lists
        elif tag in ("ul", "ol"):
            md_lines.extend(_list_lines(element))

        # Preformatted code blocks
        elif tag == "pre":
            code = element.find("code")
            language = ""
            if code is not None:
                for cls in code.get("class", []):
                    if cls.startswith("language-"):
                        language = cls[len("language-"):]
                        break
            # A <pre> without an inner <code> still carries the code text.
            source = code if code is not None else element
            code_text = source.get_text()
            md_lines.append(f"```{language}\n{code_text.strip()}\n```")

        # Images
        elif tag == "img":
            image = _render_inline(element)
            if image:
                md_lines.append(image)

        # Inline code appearing as its own block
        elif tag == "code":
            md_lines.append(f"`{element.get_text().strip()}`")

        # Fallback
        else:
            text = get_text_with_formatting(element)
            if text:
                md_lines.append(text)

    return "\n\n".join(md_lines)


def convert_chat_html_to_markdown(html_path: str) -> Tuple[str, str]:
    """Convert the conversation HTML file at *html_path* to Markdown.

    Args:
        html_path: Path of the HTML file, read as UTF-8.

    Returns:
        A ``(filename, markdown)`` tuple: the suggested ``.md`` file name
        derived from the page title, and the full Markdown document.

    Raises:
        ValueError: If the document contains no ``<main>`` element.
    """
    with open(html_path, "r", encoding="utf-8") as f:
        soup = BeautifulSoup(f, "html.parser")

    # An empty <title>, or one containing child tags, has no usable
    # ``.string``; fall back to the default instead of raising.
    title = soup.title.get_text(strip=True) if soup.title else ""
    if not title:
        title = _DEFAULT_TITLE
    filename = (sanitize_filename(title) or _DEFAULT_TITLE) + ".md"

    main = soup.find("main")
    if not main:
        raise ValueError(
            "Could not find <main> in HTML. "
            "Is this a valid saved ChatGPT conversation?"
        )

    h3_tags = main.find_all("h3")
    prose_blocks = main.find_all("div", class_="prose")

    # NOTE(review): headings and prose blocks are paired purely by position;
    # this assumes every message has exactly one of each - confirm against
    # real exports.
    messages = []
    for h3, prose in zip(h3_tags, prose_blocks):
        role = h3.get_text(strip=True)
        prefix = "**ChatGPT:**" if "chatgpt" in role.lower() else "**You:**"
        body = extract_markdown_from_prose(prose)
        messages.append(f"{prefix}\n\n{body}")

    markdown = f"# {title}\n\n" + "\n\n---\n\n".join(messages)
    return filename, markdown


def main() -> None:
    """Command-line entry point: convert one HTML file into ``output/``."""
    if len(sys.argv) != 2:
        print("Usage: python html_to_markdown.py <input.html>")
        sys.exit(1)

    input_html = sys.argv[1]
    if not os.path.isfile(input_html):
        print(f"File not found: {input_html}")
        sys.exit(1)

    try:
        output_name, markdown_text = convert_chat_html_to_markdown(input_html)
    except ValueError as exc:
        print(f"Error: {exc}")
        sys.exit(1)

    # The output directory may not exist on a first run.
    os.makedirs(_OUTPUT_DIR, exist_ok=True)
    output_path = os.path.join(_OUTPUT_DIR, output_name)
    with open(output_path, "w", encoding="utf-8") as f:
        f.write(markdown_text)

    print(f"Markdown saved to: {output_path}")


if __name__ == "__main__":
    main()