10 bulan lalu · 9b995cee7b
--- a/main.py
+++ b/main.py
@@ -6,29 +6,36 @@ def sanitize_filename(title: str) -> str:
 
				     return "".join(c if c.isalnum() or c in (' ', '-', '_') else '_' for c in title).strip()[:100]
			
 
				 
			
 
				 def get_text_with_formatting(element) -> str:
			
 
				-    result = []
			
 
				+    def walk(node):
			
 
				+        if isinstance(node, NavigableString):
			
 
				+            return str(node)
			
 
				 
			
 
				-    for content in element.descendants:
			
 
				-        if isinstance(content, NavigableString):
			
 
				-            result.append(str(content))
			
 
				+        elif node.name in ("strong", "b"):
			
 
				+            return f"**{''.join(walk(c) for c in node.children)}**"
			
 
				 
			
 
				-        elif content.name in ("strong", "b"):
			
 
				-            inner = "".join(content.strings)
			
 
				-            result.append(f"**{inner}**")
			
 
				+        elif node.name in ("em", "i"):
			
 
				+            return f"*{''.join(walk(c) for c in node.children)}*"
			
 
				 
			
 
				-        elif content.name in ("em", "i"):
			
 
				-            inner = "".join(content.strings)
			
 
				-            result.append(f"*{inner}*")
			
 
				+        elif node.name == "code":
			
 
				+            return f"`{''.join(walk(c) for c in node.children)}`"
			
 
				 
			
 
				-        elif content.name == "code":
			
 
				-            inner = "".join(content.strings)
			
 
				-            result.append(f"`{inner}`")
			
 
				+        elif node.name == "a":
			
 
				+            href = node.get("href", "#")
			
 
				+            label = ''.join(walk(c) for c in node.children)
			
 
				+            return f"[{label}]({href})"
			
 
				 
			
 
				-        # Skip tags already replaced (like <a>, handled before calling this)
			
 
				-        elif content.name in ("a", "img"):
			
 
				-            continue
			
 
				+        elif node.name == "img":
			
 
				+            alt = node.get("alt", "")
			
 
				+            src = node.get("src", "")
			
 
				+            return f"![{alt}]({src})"
			
 
				+
			
 
				+        else:
			
 
				+            return ''.join(walk(c) for c in node.children)
			
 
				+
			
 
				+    return walk(element).strip()
			
 
				 
			
 
				-    return " ".join(result).strip()
			
 
				+
			
 
				+    return "".join(result).strip()
			
 
				 
			
 
				 def extract_markdown_from_prose(prose: BeautifulSoup) -> str:
			
 
				     md_lines = []
			
@@ -60,14 +67,6 @@ def extract_markdown_from_prose(prose: BeautifulSoup) -> str:
 
				 
			
 
				             md_lines.append(get_text_with_formatting(element))
			
 
				 
			
 
				-        # Lists
			
 
				-        elif tag == "ul":
			
 
				-            for li in element.find_all("li"):
			
 
				-                md_lines.append(f"- {li.get_text(strip=True)}")
			
 
				-        elif tag == "ol":
			
 
				-            for idx, li in enumerate(element.find_all("li"), start=1):
			
 
				-                md_lines.append(f"{idx}. {li.get_text(strip=True)}")
			
 
				-
			
 
				         # Preformatted code blocks
			
 
				         elif tag == "pre":
			
 
				             code = element.find("code")
			
@@ -123,7 +122,9 @@ def convert_chat_html_to_markdown(html_path: str) -> str:
 
				         else:
			
 
				             prefix = "**You:**"
			
 
				 
			
 
				-        body = extract_markdown_from_prose(prose)
			
 
				+        body = extract_markdown_from_prose(prose).strip()
			
 
				+        if not body:
			
 
				+            continue
			
 
				         messages.append(f"{prefix}\n\n{body}")
			
 
				 
			
 
				     markdown = f"# {title}\n\n" + "\n\n---\n\n".join(messages)
			
@@ -140,6 +141,7 @@ if __name__ == "__main__":
 
				         sys.exit(1)
			
 
				 
			
 
				     output_name, markdown_text = convert_chat_html_to_markdown(input_html)
			
 
				+    os.makedirs("output", exist_ok=True)
			
 
				     output_path = os.path.join("output", output_name)
			
 
				 
			
 
				     with open(output_path, "w", encoding="utf-8") as f: