| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148 |
- import sys
- import os
- from bs4 import BeautifulSoup, NavigableString
def sanitize_filename(title: str) -> str:
    """Turn a conversation title into a string that is safe to use as a file name.

    Every character that is not alphanumeric, a space, a hyphen or an
    underscore is replaced with an underscore. Surrounding whitespace is
    removed and the result is limited to 100 characters.

    Args:
        title: Arbitrary title text.

    Returns:
        The sanitized name without an extension. It is empty when *title*
        is empty or consists only of whitespace.
    """
    cleaned = "".join(
        c if c.isalnum() or c in (" ", "-", "_") else "_" for c in title
    )
    # Strip once more after truncating: cutting at 100 characters can expose
    # a space in the middle of the title as a trailing space, which is
    # awkward in file names (and silently dropped by some file systems).
    return cleaned.strip()[:100].rstrip()
def get_text_with_formatting(element) -> str:
    """Render the inline content of *element* as Markdown text.

    The children of *element* are walked recursively, so every piece of text
    is emitted exactly once:

    * ``<strong>`` / ``<b>`` content is wrapped in ``**``.
    * ``<em>`` / ``<i>`` content is wrapped in ``*``.
    * ``<code>`` content is wrapped in backticks, using its raw text.
    * Any other tag (``<a>``, ``<span>``, ...) contributes only the rendered
      text of its children; tags without children such as ``<img>``
      contribute nothing.

    Fragments are concatenated without a separator, so the spacing of the
    source HTML is preserved. *element* itself is never wrapped, only its
    contents are.

    Args:
        element: A BeautifulSoup tag whose contents should be rendered.

    Returns:
        The Markdown text with leading and trailing whitespace removed.
    """
    markers = {"strong": "**", "b": "**", "em": "*", "i": "*"}

    def render(node) -> str:
        if isinstance(node, NavigableString):
            return str(node)
        if node.name == "code":
            # Code spans are literal: use the raw text and do not apply
            # emphasis markers to anything nested inside them.
            inner = "".join(node.strings)
            return f"`{inner}`" if inner else ""
        inner = "".join(render(child) for child in node.children)
        marker = markers.get(node.name)
        # Do not wrap empty or whitespace-only content; "****" is noise.
        if marker is None or not inner.strip():
            return inner
        return f"{marker}{inner}{marker}"

    return "".join(render(child) for child in element.children).strip()
def _list_to_markdown(list_tag, depth: int = 0) -> list:
    """Return one Markdown line per item of *list_tag*, indenting nested lists."""
    ordered = list_tag.name == "ol"
    indent = "    " * depth
    lines = []
    # Only direct <li> children belong to this list; deeper ones are handled
    # by the recursive call for the nested list that owns them.
    for idx, li in enumerate(list_tag.find_all("li", recursive=False), start=1):
        # Detach nested lists first so their text is not merged into this item.
        nested_lists = [
            sub.extract() for sub in li.find_all(["ul", "ol"], recursive=False)
        ]
        marker = f"{idx}." if ordered else "-"
        lines.append(f"{indent}{marker} {li.get_text(strip=True)}")
        for sub in nested_lists:
            lines.extend(_list_to_markdown(sub, depth + 1))
    return lines


def extract_markdown_from_prose(prose: BeautifulSoup) -> str:
    """Convert the direct children of a message body into Markdown.

    Each top-level child of *prose* becomes one or more Markdown blocks:
    headings, paragraphs (with links and inline code), bullet and numbered
    lists (nested lists are indented), fenced code blocks, images and inline
    code. Any other tag falls back to its formatted text. Bare text nodes
    are kept when they are not blank.

    Note that the tree under *prose* is modified while converting: links and
    inline code inside paragraphs are replaced by their Markdown text, and
    nested lists are detached from their parent items.

    Args:
        prose: The tag holding the rendered message content.

    Returns:
        The Markdown blocks joined by blank lines.
    """
    md_lines = []
    for element in prose.children:
        if isinstance(element, NavigableString):
            text = element.strip()
            if text:
                md_lines.append(text)
            continue

        tag = element.name

        # Headings
        if tag.startswith("h") and tag[1:].isdigit():
            level = int(tag[1:])
            md_lines.append(f"{'#' * level} {get_text_with_formatting(element)}")

        # Paragraphs with possible links or inline formatting
        elif tag == "p":
            # Replace links and code spans by plain strings first, so the
            # formatter below sees them as ordinary text.
            for link in element.find_all("a"):
                href = link.get("href", "#")
                link_text = link.get_text(strip=True)
                link.replace_with(f"[{link_text}]({href})")
            for code in element.find_all("code"):
                code_text = code.get_text()
                code.replace_with(f"`{code_text}`")
            md_lines.append(get_text_with_formatting(element))

        # Lists
        elif tag in ("ul", "ol"):
            md_lines.extend(_list_to_markdown(element))

        # Preformatted code blocks
        elif tag == "pre":
            code = element.find("code")
            language = ""
            if code is not None:
                for cls in code.get("class", []):
                    if cls.startswith("language-"):
                        language = cls[len("language-"):]
                        break
            # A <pre> without a <code> child still carries content; use its
            # own text instead of dropping the block.
            source = code if code is not None else element
            code_text = source.get_text()
            md_lines.append(f"```{language}\n{code_text.strip()}\n```")

        # Images
        elif tag == "img":
            src = element.get("src", "")
            alt = element.get("alt", "")
            if src:
                md_lines.append(f"![{alt}]({src})")

        # Inline code
        elif tag == "code":
            code_text = get_text_with_formatting(element)
            md_lines.append(f"`{code_text}`")

        # Fallback
        else:
            text = get_text_with_formatting(element)
            if text:
                md_lines.append(text)

    return "\n\n".join(md_lines)
def convert_chat_html_to_markdown(html_path: str) -> "tuple[str, str]":
    """Convert a saved ChatGPT conversation page into a Markdown document.

    The page is expected to contain a ``<main>`` element in which each
    message consists of an ``<h3>`` naming the speaker and a
    ``<div class="prose">`` holding the message body. Headings and bodies are
    paired in document order.

    Args:
        html_path: Path to the saved HTML file (read as UTF-8).

    Returns:
        A ``(filename, markdown)`` tuple: the suggested output file name
        (sanitized page title plus ``.md``) and the Markdown text.

    Raises:
        ValueError: If the page has no ``<main>`` element.
    """
    with open(html_path, "r", encoding="utf-8") as f:
        soup = BeautifulSoup(f, "html.parser")

    # The <title> may be missing or empty, and ``.string`` is None when it
    # contains nested markup, so read its text and fall back to a default.
    title = soup.title.get_text(strip=True) if soup.title else ""
    if not title:
        title = "chatgpt_conversation"
    filename = sanitize_filename(title) + ".md"

    main = soup.find("main")
    if main is None:
        raise ValueError("Could not find <main> in HTML. Is this a valid saved ChatGPT conversation?")

    prose_blocks = main.find_all("div", class_="prose")
    # An <h3> inside a message body is message content (a level-3 heading),
    # not a speaker label; counting it would shift the pairing below.
    h3_tags = [
        h3
        for h3 in main.find_all("h3")
        if h3.find_parent("div", class_="prose") is None
    ]

    messages = []
    for h3, prose in zip(h3_tags, prose_blocks):
        role = h3.get_text(strip=True)
        prefix = "**ChatGPT:**" if "chatgpt" in role.lower() else "**You:**"
        body = extract_markdown_from_prose(prose)
        messages.append(f"{prefix}\n\n{body}")

    markdown = f"# {title}\n\n" + "\n\n---\n\n".join(messages)
    return filename, markdown
if __name__ == "__main__":
    # Command-line entry point: convert one saved HTML page and write the
    # result to output/<sanitized title>.md.
    if len(sys.argv) != 2:
        print("Usage: python html_to_markdown.py <path_to_saved_html>")
        sys.exit(1)

    input_html = sys.argv[1]
    if not os.path.isfile(input_html):
        print(f"File not found: {input_html}")
        sys.exit(1)

    output_name, markdown_text = convert_chat_html_to_markdown(input_html)

    # Create the output directory on first use; opening the file below would
    # otherwise fail with FileNotFoundError.
    os.makedirs("output", exist_ok=True)
    output_path = os.path.join("output", output_name)
    with open(output_path, "w", encoding="utf-8") as f:
        f.write(markdown_text)
    print(f"Markdown saved to: {output_path}")
|