Jelajahi Sumber

Improved formatting

Josh Rosario 8 bulan lalu
induk
commit
9b995cee7b
1 file diubah dengan 28 penambahan dan 26 penghapusan
  1. 28 26
      main.py

+ 28 - 26
main.py

@@ -6,29 +6,36 @@ def sanitize_filename(title: str) -> str:
     return "".join(c if c.isalnum() or c in (' ', '-', '_') else '_' for c in title).strip()[:100]
 
 def get_text_with_formatting(element) -> str:
-    result = []
+    def walk(node):
+        if isinstance(node, NavigableString):
+            return str(node)
 
-    for content in element.descendants:
-        if isinstance(content, NavigableString):
-            result.append(str(content))
+        elif node.name in ("strong", "b"):
+            return f"**{''.join(walk(c) for c in node.children)}**"
 
-        elif content.name in ("strong", "b"):
-            inner = "".join(content.strings)
-            result.append(f"**{inner}**")
+        elif node.name in ("em", "i"):
+            return f"*{''.join(walk(c) for c in node.children)}*"
 
-        elif content.name in ("em", "i"):
-            inner = "".join(content.strings)
-            result.append(f"*{inner}*")
+        elif node.name == "code":
+            return f"`{''.join(walk(c) for c in node.children)}`"
 
-        elif content.name == "code":
-            inner = "".join(content.strings)
-            result.append(f"`{inner}`")
+        elif node.name == "a":
+            href = node.get("href", "#")
+            label = ''.join(walk(c) for c in node.children)
+            return f"[{label}]({href})"
 
-        # Skip tags already replaced (like <a>, handled before calling this)
-        elif content.name in ("a", "img"):
-            continue
+        elif node.name == "img":
+            alt = node.get("alt", "")
+            src = node.get("src", "")
+            return f"![{alt}]({src})"
+
+        else:
+            return ''.join(walk(c) for c in node.children)
+
+    return walk(element).strip()
 
-    return " ".join(result).strip()
+
+    return "".join(result).strip()
 
 def extract_markdown_from_prose(prose: BeautifulSoup) -> str:
     md_lines = []
@@ -60,14 +67,6 @@ def extract_markdown_from_prose(prose: BeautifulSoup) -> str:
 
             md_lines.append(get_text_with_formatting(element))
 
-        # Lists
-        elif tag == "ul":
-            for li in element.find_all("li"):
-                md_lines.append(f"- {li.get_text(strip=True)}")
-        elif tag == "ol":
-            for idx, li in enumerate(element.find_all("li"), start=1):
-                md_lines.append(f"{idx}. {li.get_text(strip=True)}")
-
         # Preformatted code blocks
         elif tag == "pre":
             code = element.find("code")
@@ -123,7 +122,9 @@ def convert_chat_html_to_markdown(html_path: str) -> str:
         else:
             prefix = "**You:**"
 
-        body = extract_markdown_from_prose(prose)
+        body = extract_markdown_from_prose(prose).strip()
+        if not body:
+            continue
         messages.append(f"{prefix}\n\n{body}")
 
     markdown = f"# {title}\n\n" + "\n\n---\n\n".join(messages)
@@ -140,6 +141,7 @@ if __name__ == "__main__":
         sys.exit(1)
 
     output_name, markdown_text = convert_chat_html_to_markdown(input_html)
+    os.makedirs("output", exist_ok=True)
     output_path = os.path.join("output", output_name)
 
     with open(output_path, "w", encoding="utf-8") as f: