Browse Source

Fixes #1; Preserve new-lines

Josh Rosario 8 months ago
parent
commit
e3ddc37d51
1 changed file with 40 additions and 22 deletions
  1. 40 22
      main.py

+ 40 - 22
main.py

@@ -2,9 +2,11 @@ import sys
 import os
 from bs4 import BeautifulSoup, NavigableString
 
+
 def sanitize_filename(title: str) -> str:
     return "".join(c if c.isalnum() or c in (' ', '-', '_') else '_' for c in title).strip()[:100]
 
+
 def get_text_with_formatting(element) -> str:
     def walk(node):
         if isinstance(node, NavigableString):
@@ -35,14 +37,12 @@ def get_text_with_formatting(element) -> str:
     return walk(element).strip()
 
 
-    return "".join(result).strip()
-
-def extract_markdown_from_prose(prose: BeautifulSoup) -> str:
+def extract_markdown_from_conversation(conversation: BeautifulSoup) -> str:
     md_lines = []
 
-    for element in prose.children:
+    for element in conversation.children:
         if isinstance(element, NavigableString):
-            text = element.strip()
+            text = element.strip().replace("\n", "  \n") # Preserve single spacing
             if text:
                 md_lines.append(text)
             continue
@@ -100,6 +100,7 @@ def extract_markdown_from_prose(prose: BeautifulSoup) -> str:
 
     return "\n\n".join(md_lines)
 
+
 def convert_chat_html_to_markdown(html_path: str) -> str:
     with open(html_path, "r", encoding="utf-8") as f:
         soup = BeautifulSoup(f, "html.parser")
@@ -111,36 +112,48 @@ def convert_chat_html_to_markdown(html_path: str) -> str:
     if not main:
         raise ValueError("Could not find <main> in HTML. Is this a valid saved ChatGPT conversation?")
 
-    h3_tags = main.find_all("h3")
-    prose_blocks = main.find_all("div", class_="prose")
-
     messages = []
-    for h3, prose in zip(h3_tags, prose_blocks):
-        role = h3.get_text(strip=True)
-        if "chatgpt" in role.lower():
-            prefix = "**ChatGPT:**"
-        else:
-            prefix = "**You:**"
-
-        body = extract_markdown_from_prose(prose).strip()
+    conversation = main.find_all("div", attrs={'data-message-author-role': True})
+    for conversation_turn in conversation:
+              
+        # ChatGPT response div class="prose"
+        # User response div class="whitespace-pre-wrap"
+        content = conversation_turn.find("div", class_=["prose", "whitespace-pre-wrap"])
+        body = extract_markdown_from_conversation(content).strip()
         if not body:
             continue
-        messages.append(f"{prefix}\n\n{body}")
+
+        # # Preserve single spacing
+        # body = body.replace("\n", "  ") # Doesn't work
+
+        role = conversation_turn.get_attribute_list('data-message-author-role')  
+        if role[0] == "user":
+            message = f"**You:**\n\n{body}"
+        else:
+            message = f"**ChatGPT**\n\n{body}"
+        messages.append(message)
 
     markdown = f"# {title}\n\n" + "\n\n---\n\n".join(messages)
     return filename, markdown
 
-if __name__ == "__main__":
-    if len(sys.argv) != 2:
+
+
+def main(args):
+    if len(args) != 2:
         print("Usage: python html_to_markdown.py <path_to_saved_html>")
         sys.exit(1)
 
-    input_html = sys.argv[1]
+    input_html = args[1]
     if not os.path.isfile(input_html):
-        print(f"File not found: {input_html}")
-        sys.exit(1)
+        input_html_ext = os.path.join("input", input_html)
+        if not os.path.isfile(input_html_ext):
+            print(f"File not found: {input_html}")
+            sys.exit(1)
+        else:
+            input_html = input_html_ext
 
     output_name, markdown_text = convert_chat_html_to_markdown(input_html)
+    
     os.makedirs("output", exist_ok=True)
     output_path = os.path.join("output", output_name)
 
@@ -148,3 +161,8 @@ if __name__ == "__main__":
         f.write(markdown_text)
 
     print(f"Markdown saved to: {output_path}")
+
+
+
+if __name__ == "__main__":
+    main(sys.argv)