|
@@ -1,10 +1,106 @@
|
|
|
import sys
|
|
import sys
|
|
|
import os
|
|
import os
|
|
|
-from bs4 import BeautifulSoup
|
|
|
|
|
|
|
+from bs4 import BeautifulSoup, NavigableString
|
|
|
|
|
|
|
|
def sanitize_filename(title: str) -> str:
    """Turn a conversation title into a filesystem-friendly name.

    Alphanumeric characters, spaces, hyphens and underscores are kept; every
    other character is replaced by an underscore. The result is trimmed of
    surrounding whitespace and limited to 100 characters.

    Args:
        title: The raw title text.

    Returns:
        The sanitized name, at most 100 characters long, without leading or
        trailing whitespace. May be empty if *title* is empty or blank.
    """
    allowed_punctuation = (" ", "-", "_")
    cleaned = "".join(
        c if c.isalnum() or c in allowed_punctuation else "_" for c in title
    )
    # Truncation can cut right after a space, so trim the right side again;
    # otherwise the name (and the file built from it) ends in whitespace.
    return cleaned.strip()[:100].rstrip()
|
|
|
|
|
|
|
|
|
|
def get_text_with_formatting(element) -> str:
    """Render the inline content of *element* as Markdown text.

    The element's children are walked recursively. Text nodes are emitted
    unchanged, ``<strong>``/``<b>`` become ``**bold**``, ``<em>``/``<i>``
    become ``*italic*`` and ``<code>`` becomes a backtick span. Any other tag
    (``<a>`` included) contributes only the rendering of its own children, so
    childless tags such as ``<img>`` produce nothing.

    Recursing over ``.children`` instead of iterating ``.descendants`` ensures
    each piece of text is emitted exactly once; the text inside a formatting
    tag is not repeated as a bare string after its wrapped form. Fragments are
    concatenated without a separator so the document's own spacing is kept.

    Args:
        element: A BeautifulSoup tag whose children should be rendered.

    Returns:
        The rendered text with leading and trailing whitespace removed.
    """
    emphasis_markers = {"strong": "**", "b": "**", "em": "*", "i": "*"}

    def render(node) -> str:
        if isinstance(node, NavigableString):
            return str(node)

        if node.name == "code":
            # Code spans are taken verbatim; nested markup is not interpreted.
            raw = "".join(node.strings)
            return f"`{raw}`"

        inner = "".join(render(child) for child in node.children)
        marker = emphasis_markers.get(node.name)
        # Skip the markers for empty emphasis so no stray "****" is produced.
        if marker and inner.strip():
            return f"{marker}{inner}{marker}"
        return inner

    return "".join(render(child) for child in element.children).strip()
|
|
|
|
|
+
|
|
|
|
|
def _list_to_markdown(list_tag, depth: int = 0) -> list:
    """Return one Markdown line per item of a <ul>/<ol>, indenting nested lists."""
    ordered = list_tag.name == "ol"
    indent = "    " * depth
    lines = []

    # recursive=False: only this list's own items, so nested items are not
    # emitted twice and do not disturb the numbering of an ordered list.
    for idx, li in enumerate(list_tag.find_all("li", recursive=False), start=1):
        pieces = []
        nested_lists = []
        for child in li.children:
            if isinstance(child, NavigableString):
                pieces.append(str(child))
            elif child.name in ("ul", "ol"):
                nested_lists.append(child)
            else:
                pieces.append(child.get_text())

        # Collapse whitespace instead of get_text(strip=True), which strips
        # every fragment and glues words together across inline tags.
        text = " ".join("".join(pieces).split())
        marker = f"{idx}." if ordered else "-"
        lines.append(f"{indent}{marker} {text}")

        for nested in nested_lists:
            lines.extend(_list_to_markdown(nested, depth + 1))

    return lines


def extract_markdown_from_prose(prose: BeautifulSoup) -> str:
    """Convert the direct children of a message container into Markdown.

    Each direct child of *prose* is rendered according to its tag: headings
    (``h1``..), paragraphs, ``ul``/``ol`` lists, ``pre`` code blocks, ``img``
    and top-level ``code``; anything else falls back to its inline text.
    Bare text nodes are kept when non-blank.

    Note: links and inline code inside paragraphs are replaced in place with
    their Markdown text, so the passed-in tree is modified.

    Args:
        prose: The parsed element whose children make up one message body.

    Returns:
        The rendered blocks joined by blank lines.
    """
    md_lines = []

    for element in prose.children:
        if isinstance(element, NavigableString):
            text = element.strip()
            if text:
                md_lines.append(text)
            continue

        tag = element.name

        # Headings
        if tag.startswith("h") and tag[1:].isdigit():
            level = int(tag[1:])
            md_lines.append(f"{'#' * level} {get_text_with_formatting(element)}")

        # Paragraphs with possible links or inline formatting
        elif tag == "p":
            for link in element.find_all("a"):
                href = link.get("href", "#")
                # Collapse whitespace rather than strip each fragment, so
                # "<a>read <b>this</b></a>" stays "read this".
                link_text = " ".join(link.get_text().split())
                link.replace_with(f"[{link_text}]({href})")

            for code in element.find_all("code"):
                code_text = code.get_text()
                code.replace_with(f"`{code_text}`")

            md_lines.append(get_text_with_formatting(element))

        # Lists (one entry per item; nested lists are indented)
        elif tag in ("ul", "ol"):
            md_lines.extend(_list_to_markdown(element))

        # Preformatted code blocks
        elif tag == "pre":
            code = element.find("code")
            language = ""
            if code:
                for cls in code.get("class", []):
                    if cls.startswith("language-"):
                        language = cls[len("language-"):]
                        break
            # A <pre> without an inner <code> still carries content; use its
            # own text instead of dropping the block.
            source = code if code else element
            # Strip only newlines so the first line's indentation survives.
            code_text = source.get_text().strip("\n")
            md_lines.append(f"```{language}\n{code_text}\n```")

        # Images
        elif tag == "img":
            src = element.get("src", "")
            alt = element.get("alt", "")
            if src:
                md_lines.append(f"![{alt}]({src})")

        # Inline code
        elif tag == "code":
            code_text = get_text_with_formatting(element)
            md_lines.append(f"`{code_text}`")

        # Fallback
        else:
            text = get_text_with_formatting(element)
            if text:
                md_lines.append(text)

    return "\n\n".join(md_lines)
|
|
|
|
|
+
|
|
|
def convert_chat_html_to_markdown(html_path: str) -> tuple:
    """Convert a saved chat HTML page into a Markdown document.

    The page title becomes the document heading and the basis of the output
    filename. Inside ``<main>``, every ``<h3>`` is treated as a speaker label
    and paired positionally with the ``<div class="prose">`` blocks; if the
    two counts differ, the unpaired trailing elements are ignored.

    Args:
        html_path: Path to the saved HTML file (read as UTF-8).

    Returns:
        A ``(filename, markdown)`` tuple: the suggested ``.md`` filename and
        the full Markdown text.

    Raises:
        ValueError: If the document has no ``<main>`` element.
    """
    with open(html_path, "r", encoding="utf-8") as f:
        soup = BeautifulSoup(f, "html.parser")

    # get_text() is used instead of .string: .string is None for an empty
    # <title> or one with nested tags, which would make .strip() fail.
    title_text = soup.title.get_text(strip=True) if soup.title else ""
    title = title_text or "chatgpt_conversation"
    filename = sanitize_filename(title) + ".md"

    main_tag = soup.find("main")
    if main_tag is None:
        raise ValueError("Could not find <main> in HTML. Is this a valid saved ChatGPT conversation?")

    h3_tags = main_tag.find_all("h3")
    prose_blocks = main_tag.find_all("div", class_="prose")

    messages = []
    for h3, prose in zip(h3_tags, prose_blocks):
        role = h3.get_text(strip=True)
        if "chatgpt" in role.lower():
            prefix = "**ChatGPT:**"
        else:
            prefix = "**You:**"

        body = extract_markdown_from_prose(prose)
        messages.append(f"{prefix}\n\n{body}")

    markdown = f"# {title}\n\n" + "\n\n---\n\n".join(messages)
    return filename, markdown
|