10 місяців тому · d572cb3084
--- a/main.py
+++ b/main.py
@@ -1,10 +1,106 @@
 
				 import sys
			
 
				 import os
			
 
				-from bs4 import BeautifulSoup
			
 
				+from bs4 import BeautifulSoup, NavigableString
			
 
				 
			
 
				 def sanitize_filename(title: str) -> str:
				     """Turn a conversation title into a filesystem-friendly file stem.

				     Letters, digits, spaces, hyphens and underscores are kept; every other
				     character is replaced with ``_``. The result is capped at 100 characters.

				     Args:
				         title: Raw title text.

				     Returns:
				         The sanitized stem without an extension. It is empty when ``title``
				         is empty or consists only of whitespace.
				     """
				     safe = "".join(c if c.isalnum() or c in " -_" else "_" for c in title)
				     # Strip again after truncating: the cut can land right after a space,
				     # which would otherwise leave the stem with trailing whitespace.
				     return safe.strip()[:100].rstrip()
			
 
				 
			
 
				def get_text_with_formatting(element) -> str:
				    """Render the inline content of ``element`` as Markdown text.

				    The children of ``element`` are walked recursively:

				    * ``<strong>`` / ``<b>`` become ``**bold**``
				    * ``<em>`` / ``<i>`` become ``*italic*``
				    * ``<code>`` becomes a backtick-quoted span of its plain text
				    * ``<img>`` contributes nothing
				    * any other tag (including ``<a>``) contributes only its rendered
				      children

				    ``element`` itself is never wrapped, only its contents are rendered.

				    Args:
				        element: A BeautifulSoup tag whose children should be rendered.

				    Returns:
				        The Markdown string with leading and trailing whitespace removed.
				    """
				    emphasis = {"strong": "**", "b": "**", "em": "*", "i": "*"}

				    def render(node) -> str:
				        if isinstance(node, NavigableString):
				            return str(node)
				        if node.name == "img":
				            return ""
				        if node.name == "code":
				            # Code is emitted verbatim: emphasis markers inside a code
				            # span would show up as literal characters.
				            code_text = "".join(node.strings)
				            return f"`{code_text}`"

				        # Recurse through direct children instead of scanning
				        # ``descendants``: a flat scan visits the strings inside a
				        # formatted tag a second time and duplicates their text.
				        inner = "".join(render(child) for child in node.children)
				        marker = emphasis.get(node.name)
				        core = inner.strip()
				        if marker is None or not core:
				            return inner

				        # Markdown ignores emphasis whose markers touch whitespace, so
				        # any padding is kept outside the markers.
				        lead = inner[: len(inner) - len(inner.lstrip())]
				        trail = inner[len(inner.rstrip()):]
				        return f"{lead}{marker}{core}{marker}{trail}"

				    # Fragments already carry their own spacing, so they are joined
				    # without a separator.
				    return "".join(render(child) for child in element.children).strip()
			
 
				+
			
 
				def _render_list_items(list_tag, depth: int = 0) -> list:
				    """Return one Markdown line per ``<li>`` of ``list_tag``, nested lists included."""
				    ordered = list_tag.name == "ol"
				    indent = "    " * depth
				    lines = []

				    # recursive=False: nested <li> elements are rendered by the recursive
				    # call below; a recursive search here would emit them a second time.
				    for idx, li in enumerate(list_tag.find_all("li", recursive=False), start=1):
				        own_text = []
				        sublists = []
				        for child in li.children:
				            if isinstance(child, NavigableString):
				                own_text.append(str(child))
				            elif child.name in ("ul", "ol"):
				                sublists.append(child)
				            else:
				                own_text.append(child.get_text())

				        # Collapse whitespace on the joined text; stripping each fragment
				        # separately would glue words across inline tags together.
				        text = " ".join("".join(own_text).split())
				        marker = f"{idx}. " if ordered else "- "
				        lines.append(f"{indent}{marker}{text}")

				        for sublist in sublists:
				            lines.extend(_render_list_items(sublist, depth + 1))

				    return lines


				def extract_markdown_from_prose(prose: BeautifulSoup) -> str:
				    """Convert the direct children of a message container to Markdown.

				    Handled children: bare text, headings (``h1``-``h6``), paragraphs,
				    ``ul``/``ol`` lists (with nesting), ``pre`` code blocks, ``img`` and
				    ``code``. Anything else is reduced to its inline text.

				    Note:
				        ``prose`` is modified in place: inside paragraphs, ``<a>`` and
				        ``<code>`` tags are replaced by their Markdown text.

				    Args:
				        prose: The element whose children make up one message body.

				    Returns:
				        The Markdown blocks joined by blank lines.
				    """
				    md_lines = []

				    for element in prose.children:
				        if isinstance(element, NavigableString):
				            text = element.strip()
				            if text:
				                md_lines.append(text)
				            continue

				        tag = element.name

				        # Headings: only the six real HTML levels, so names such as
				        # "h0" or "h10" fall through to the generic branch.
				        if len(tag) == 2 and tag[0] == "h" and tag[1] in "123456":
				            level = int(tag[1])
				            md_lines.append(f"{'#' * level} {get_text_with_formatting(element)}")

				        # Paragraphs with possible links or inline formatting
				        elif tag == "p":
				            # Inline code goes first so a code span inside a link keeps
				            # its backticks in the link text.
				            for code in element.find_all("code"):
				                code.replace_with(f"`{code.get_text()}`")

				            for link in element.find_all("a"):
				                href = link.get("href", "#")
				                link_text = " ".join(link.get_text().split())
				                link.replace_with(f"[{link_text}]({href})")

				            md_lines.append(get_text_with_formatting(element))

				        # Lists: one entry per item, as before; nested lists are indented.
				        elif tag in ("ul", "ol"):
				            md_lines.extend(_render_list_items(element))

				        # Preformatted code blocks
				        elif tag == "pre":
				            code = element.find("code")
				            language = ""
				            if code is not None:
				                for cls in code.get("class", []):
				                    if cls.startswith("language-"):
				                        language = cls[len("language-"):]
				                        break
				            # A <pre> without <code> is still a code block; use its text.
				            source = code if code is not None else element
				            # Trim only surrounding newlines so the first line keeps its
				            # indentation.
				            code_text = source.get_text().strip("\r\n")
				            # Lengthen the fence until it cannot be closed by backticks
				            # inside the code itself.
				            fence = "```"
				            while fence in code_text:
				                fence += "`"
				            md_lines.append(f"{fence}{language}\n{code_text}\n{fence}")

				        # Images
				        elif tag == "img":
				            src = element.get("src", "")
				            alt = element.get("alt", "")
				            if src:
				                md_lines.append(f"![{alt}]({src})")

				        # Inline code: plain text only, formatting markers do not apply.
				        elif tag == "code":
				            md_lines.append(f"`{element.get_text().strip()}`")

				        # Fallback
				        else:
				            text = get_text_with_formatting(element)
				            if text:
				                md_lines.append(text)

				    return "\n\n".join(md_lines)
			
 
				+
			
 
				 def convert_chat_html_to_markdown(html_path: str) -> "tuple[str, str]":
				     """Convert a saved ChatGPT conversation HTML page to Markdown.

				     Role labels are taken from the ``<h3>`` tags inside ``<main>`` and message
				     bodies from the ``div.prose`` blocks; the two sequences are paired by
				     position, so surplus entries on either side are dropped.

				     Args:
				         html_path: Path of the HTML file, read as UTF-8.

				     Returns:
				         A ``(filename, markdown)`` pair: the suggested ``.md`` file name
				         derived from the page title, and the Markdown document.

				     Raises:
				         ValueError: If the document has no ``<main>`` element.
				     """
				     with open(html_path, "r", encoding="utf-8") as f:
				         soup = BeautifulSoup(f, "html.parser")

				     # ``.string`` is None for an empty <title> or one holding nested tags,
				     # so read the text instead and fall back when nothing usable is left.
				     title = soup.title.get_text().strip() if soup.title else ""
				     if not title:
				         title = "chatgpt_conversation"
				     filename = sanitize_filename(title) + ".md"

				     main_tag = soup.find("main")
				     if not main_tag:
				         raise ValueError("Could not find <main> in HTML. Is this a valid saved ChatGPT conversation?")

				     h3_tags = main_tag.find_all("h3")
				     prose_blocks = main_tag.find_all("div", class_="prose")

				     messages = []
				     for h3, prose in zip(h3_tags, prose_blocks):
				         role = h3.get_text(strip=True)
				         # Any role label that does not mention ChatGPT is treated as the user.
				         prefix = "**ChatGPT:**" if "chatgpt" in role.lower() else "**You:**"
				         body = extract_markdown_from_prose(prose)
				         messages.append(f"{prefix}\n\n{body}")

				     markdown = f"# {title}\n\n" + "\n\n---\n\n".join(messages)
				     return filename, markdown