|
|
@@ -6,29 +6,36 @@ def sanitize_filename(title: str) -> str:
|
|
|
return "".join(c if c.isalnum() or c in (' ', '-', '_') else '_' for c in title).strip()[:100]
|
|
|
|
|
|
def get_text_with_formatting(element) -> str:
|
|
|
- result = []
|
|
|
+ def walk(node):
|
|
|
+ if isinstance(node, NavigableString):
|
|
|
+ return str(node)
|
|
|
|
|
|
- for content in element.descendants:
|
|
|
- if isinstance(content, NavigableString):
|
|
|
- result.append(str(content))
|
|
|
+ elif node.name in ("strong", "b"):
|
|
|
+ return f"**{''.join(walk(c) for c in node.children)}**"
|
|
|
|
|
|
- elif content.name in ("strong", "b"):
|
|
|
- inner = "".join(content.strings)
|
|
|
- result.append(f"**{inner}**")
|
|
|
+ elif node.name in ("em", "i"):
|
|
|
+ return f"*{''.join(walk(c) for c in node.children)}*"
|
|
|
|
|
|
- elif content.name in ("em", "i"):
|
|
|
- inner = "".join(content.strings)
|
|
|
- result.append(f"*{inner}*")
|
|
|
+ elif node.name == "code":
|
|
|
+ return f"`{''.join(walk(c) for c in node.children)}`"
|
|
|
|
|
|
- elif content.name == "code":
|
|
|
- inner = "".join(content.strings)
|
|
|
- result.append(f"`{inner}`")
|
|
|
+ elif node.name == "a":
|
|
|
+ href = node.get("href", "#")
|
|
|
+ label = ''.join(walk(c) for c in node.children)
|
|
|
+ return f"[{label}]({href})"
|
|
|
|
|
|
- # Skip tags already replaced (like <a>, handled before calling this)
|
|
|
- elif content.name in ("a", "img"):
|
|
|
- continue
|
|
|
+ elif node.name == "img":
|
|
|
+ alt = node.get("alt", "")
|
|
|
+ src = node.get("src", "")
|
|
|
+ return f"![{alt}]({src})"
|
|
|
+
|
|
|
+ else:
|
|
|
+ return ''.join(walk(c) for c in node.children)
|
|
|
+
|
|
|
+ return walk(element).strip()
|
|
|
|
|
|
- return " ".join(result).strip()
|
|
|
+
|
|
|
+ return "".join(result).strip()
|
|
|
|
|
|
def extract_markdown_from_prose(prose: BeautifulSoup) -> str:
|
|
|
md_lines = []
|
|
|
@@ -60,14 +67,6 @@ def extract_markdown_from_prose(prose: BeautifulSoup) -> str:
|
|
|
|
|
|
md_lines.append(get_text_with_formatting(element))
|
|
|
|
|
|
- # Lists
|
|
|
- elif tag == "ul":
|
|
|
- for li in element.find_all("li"):
|
|
|
- md_lines.append(f"- {li.get_text(strip=True)}")
|
|
|
- elif tag == "ol":
|
|
|
- for idx, li in enumerate(element.find_all("li"), start=1):
|
|
|
- md_lines.append(f"{idx}. {li.get_text(strip=True)}")
|
|
|
-
|
|
|
# Preformatted code blocks
|
|
|
elif tag == "pre":
|
|
|
code = element.find("code")
|
|
|
@@ -123,7 +122,9 @@ def convert_chat_html_to_markdown(html_path: str) -> str:
|
|
|
else:
|
|
|
prefix = "**You:**"
|
|
|
|
|
|
- body = extract_markdown_from_prose(prose)
|
|
|
+ body = extract_markdown_from_prose(prose).strip()
|
|
|
+ if not body:
|
|
|
+ continue
|
|
|
messages.append(f"{prefix}\n\n{body}")
|
|
|
|
|
|
markdown = f"# {title}\n\n" + "\n\n---\n\n".join(messages)
|
|
|
@@ -140,6 +141,7 @@ if __name__ == "__main__":
|
|
|
sys.exit(1)
|
|
|
|
|
|
output_name, markdown_text = convert_chat_html_to_markdown(input_html)
|
|
|
+ os.makedirs("output", exist_ok=True)
|
|
|
output_path = os.path.join("output", output_name)
|
|
|
|
|
|
with open(output_path, "w", encoding="utf-8") as f:
|