|
|
@@ -2,9 +2,11 @@ import sys
|
|
|
import os
|
|
|
from bs4 import BeautifulSoup, NavigableString
|
|
|
|
|
|
+
|
|
|
def sanitize_filename(title: str) -> str:
    """Return a filesystem-friendly version of *title*.

    Every character that is not alphanumeric (per ``str.isalnum``) and is
    not a space, hyphen, or underscore is replaced with an underscore.
    Surrounding whitespace is removed and the result is limited to 100
    characters.

    Args:
        title: Arbitrary text to turn into a file name stem.

    Returns:
        The sanitized name, at most 100 characters long, with no leading
        or trailing whitespace. An empty or whitespace-only title yields
        an empty string.
    """
    allowed_punctuation = (" ", "-", "_")
    cleaned = "".join(
        c if c.isalnum() or c in allowed_punctuation else "_" for c in title
    )
    # Strip first so leading whitespace does not eat into the 100-character
    # budget, then strip again because the cut itself can land right after
    # a space and leave trailing whitespace in the name.
    return cleaned.strip()[:100].strip()
|
|
|
|
|
|
+
|
|
|
def get_text_with_formatting(element) -> str:
|
|
|
def walk(node):
|
|
|
if isinstance(node, NavigableString):
|
|
|
@@ -35,14 +37,12 @@ def get_text_with_formatting(element) -> str:
|
|
|
return walk(element).strip()
|
|
|
|
|
|
|
|
|
- return "".join(result).strip()
|
|
|
-
|
|
|
-def extract_markdown_from_prose(prose: BeautifulSoup) -> str:
|
|
|
+def extract_markdown_from_conversation(conversation: BeautifulSoup) -> str:
|
|
|
md_lines = []
|
|
|
|
|
|
- for element in prose.children:
|
|
|
+ for element in conversation.children:
|
|
|
if isinstance(element, NavigableString):
|
|
|
- text = element.strip()
|
|
|
+ text = element.strip().replace("\n", " \n") # Preserve single spacing
|
|
|
if text:
|
|
|
md_lines.append(text)
|
|
|
continue
|
|
|
@@ -100,6 +100,7 @@ def extract_markdown_from_prose(prose: BeautifulSoup) -> str:
|
|
|
|
|
|
return "\n\n".join(md_lines)
|
|
|
|
|
|
+
|
|
|
def convert_chat_html_to_markdown(html_path: str) -> str:
|
|
|
with open(html_path, "r", encoding="utf-8") as f:
|
|
|
soup = BeautifulSoup(f, "html.parser")
|
|
|
@@ -111,36 +112,48 @@ def convert_chat_html_to_markdown(html_path: str) -> str:
|
|
|
if not main:
|
|
|
raise ValueError("Could not find <main> in HTML. Is this a valid saved ChatGPT conversation?")
|
|
|
|
|
|
- h3_tags = main.find_all("h3")
|
|
|
- prose_blocks = main.find_all("div", class_="prose")
|
|
|
-
|
|
|
messages = []
|
|
|
- for h3, prose in zip(h3_tags, prose_blocks):
|
|
|
- role = h3.get_text(strip=True)
|
|
|
- if "chatgpt" in role.lower():
|
|
|
- prefix = "**ChatGPT:**"
|
|
|
- else:
|
|
|
- prefix = "**You:**"
|
|
|
-
|
|
|
- body = extract_markdown_from_prose(prose).strip()
|
|
|
+ conversation = main.find_all("div", attrs={'data-message-author-role': True})
|
|
|
+ for conversation_turn in conversation:
|
|
|
+
|
|
|
+ # ChatGPT response div class="prose"
|
|
|
+ # User response div class="whitespace-pre-wrap"
|
|
|
+ content = conversation_turn.find("div", class_=["prose", "whitespace-pre-wrap"])
|
|
|
+ body = extract_markdown_from_conversation(content).strip()
|
|
|
if not body:
|
|
|
continue
|
|
|
- messages.append(f"{prefix}\n\n{body}")
|
|
|
+
|
|
|
+ # # Preserve single spacing
|
|
|
+ # body = body.replace("\n", " ") # Doesn't work
|
|
|
+
|
|
|
+ role = conversation_turn.get_attribute_list('data-message-author-role')
|
|
|
+ if role[0] == "user":
|
|
|
+ message = f"**You:**\n\n{body}"
|
|
|
+ else:
|
|
|
+ message = f"**ChatGPT**\n\n{body}"
|
|
|
+ messages.append(message)
|
|
|
|
|
|
markdown = f"# {title}\n\n" + "\n\n---\n\n".join(messages)
|
|
|
return filename, markdown
|
|
|
|
|
|
-if __name__ == "__main__":
|
|
|
- if len(sys.argv) != 2:
|
|
|
+
|
|
|
+
|
|
|
+def main(args):
|
|
|
+ if len(args) != 2:
|
|
|
print("Usage: python html_to_markdown.py <path_to_saved_html>")
|
|
|
sys.exit(1)
|
|
|
|
|
|
- input_html = sys.argv[1]
|
|
|
+ input_html = args[1]
|
|
|
if not os.path.isfile(input_html):
|
|
|
- print(f"File not found: {input_html}")
|
|
|
- sys.exit(1)
|
|
|
+ input_html_ext = os.path.join("input", input_html)
|
|
|
+ if not os.path.isfile(input_html_ext):
|
|
|
+ print(f"File not found: {input_html}")
|
|
|
+ sys.exit(1)
|
|
|
+ else:
|
|
|
+ input_html = input_html_ext
|
|
|
|
|
|
output_name, markdown_text = convert_chat_html_to_markdown(input_html)
|
|
|
+
|
|
|
os.makedirs("output", exist_ok=True)
|
|
|
output_path = os.path.join("output", output_name)
|
|
|
|
|
|
@@ -148,3 +161,8 @@ if __name__ == "__main__":
|
|
|
f.write(markdown_text)
|
|
|
|
|
|
print(f"Markdown saved to: {output_path}")
|
|
|
+
|
|
|
+
|
|
|
+
|
|
|
+if __name__ == "__main__":
|
|
|
+ main(sys.argv)
|