# josh / ChatGPT-to-Markdown

							import sys
import os
from datetime import datetime, timezone
from bs4 import BeautifulSoup, NavigableString


def sanitize_filename(title: str) -> str:
    """Turn a conversation title into text that is safe to use as a file name.

    Alphanumeric characters, spaces, hyphens and underscores are kept; every
    other character is replaced by an underscore. Surrounding whitespace is
    removed and the result is limited to at most 100 characters.

    Args:
        title: The raw title text.

    Returns:
        The sanitized name (without extension). It is empty when *title* is
        empty or consists only of whitespace.
    """
    cleaned = "".join(
        c if c.isalnum() or c in (" ", "-", "_") else "_" for c in title
    )
    # Cutting at 100 characters can land right after a space, so strip again
    # after truncating; otherwise the file name could end in whitespace.
    return cleaned.strip()[:100].rstrip()


def get_text_with_formatting(element) -> str:
    """Render *element* and everything below it as inline Markdown.

    Text nodes are emitted unchanged. ``<strong>``/``<b>`` become ``**...**``,
    ``<em>``/``<i>`` become ``*...*``, ``<code>`` becomes backtick-wrapped
    text, ``<a>`` becomes ``[label](href)`` (``href`` defaults to ``#``) and
    ``<img>`` becomes ``![alt](src)``. Any other tag contributes only the
    rendered text of its children. The final string is stripped of leading
    and trailing whitespace.
    """
    # Tags that are rendered by putting the same marker on both sides of
    # their rendered content.
    markers = {
        "strong": "**",
        "b": "**",
        "em": "*",
        "i": "*",
        "code": "`",
    }

    def render(node):
        # Plain text: nothing to convert.
        if isinstance(node, NavigableString):
            return str(node)

        tag_name = node.name

        # Images are built from attributes only; their children are not used.
        if tag_name == "img":
            alt_text = node.get("alt", "")
            source = node.get("src", "")
            return f"![{alt_text}]({source})"

        inner = "".join(render(child) for child in node.children)

        if tag_name in markers:
            marker = markers[tag_name]
            return f"{marker}{inner}{marker}"

        if tag_name == "a":
            target = node.get("href", "#")
            return f"[{inner}]({target})"

        # Unknown tag: keep only the content.
        return inner

    rendered = render(element)
    return rendered.strip()


def _table_to_markdown(table) -> str:
    """Render a ``<table>`` tag as a Markdown pipe table ("" if it has no rows)."""

    def rows_of(section):
        return section.find_all("tr") if section is not None else []

    # Header rows first, then the first body, then every remaining row in
    # document order (the search on the table itself also returns the rows
    # already collected, hence the de-duplication below).
    candidates = (
        rows_of(table.find("thead"))
        + rows_of(table.find("tbody"))
        + rows_of(table)
        + rows_of(table.find("tfoot"))
    )

    # De-duplicate by object identity. bs4 tags compare (and hash) by their
    # markup, so putting the tags themselves into a set would silently drop
    # distinct rows that happen to have identical content.
    seen_ids = set()
    rows = []
    for tr in candidates:
        if id(tr) not in seen_ids:
            seen_ids.add(id(tr))
            rows.append(tr)

    table_lines = []
    for row_idx, tr in enumerate(rows):
        cells = [
            # A raw pipe or line break inside a cell would break the row.
            get_text_with_formatting(cell)
            .strip()
            .replace("\n", " ")
            .replace("|", "\\|")
            for cell in tr.find_all(["th", "td"])
        ]
        table_lines.append("| " + " | ".join(cells) + " |")

        # The first row is treated as the header, so the separator follows it.
        if row_idx == 0:
            table_lines.append("| " + " | ".join(["---"] * len(cells)) + " |")

    return "\n".join(table_lines)


def extract_markdown_from_conversation(conversation: BeautifulSoup) -> str:
    """Convert the direct children of a message container into Markdown.

    Each direct child of *conversation* becomes one Markdown block:

    * bare text: line breaks become Markdown hard breaks and ``<`` / ``>``
      are backslash-escaped;
    * ``h1``..``hN``: ``#`` headings of the matching level;
    * ``p``: inline Markdown (links and inline code are rewritten in place,
      which mutates the parsed tree);
    * ``pre`` containing ``code``: a fenced code block, using the
      ``language-*`` class as the info string when present;
    * ``img`` with a ``src``: an image reference;
    * ``code``: backtick-wrapped inline code;
    * ``table``: a pipe table whose first row is the header;
    * anything else: its inline text, if non-empty.

    Args:
        conversation: The parsed container whose children are rendered.
            ``None`` is accepted and yields an empty string.

    Returns:
        The blocks joined by blank lines.
    """
    if conversation is None:
        return ""

    md_lines = []

    for element in conversation.children:
        if isinstance(element, NavigableString):
            text = element.strip().replace("\n", "  \n")  # Preserve single spacing
            # Escape HTML tags that aren't in code blocks. "\\<" is the same
            # runtime value as the invalid escape "\<", without the warning.
            text = text.replace("<", "\\<").replace(">", "\\>")
            if text:
                md_lines.append(text)
            continue

        tag = element.name

        # Headings
        if tag.startswith("h") and tag[1:].isdigit():
            level = int(tag[1:])
            md_lines.append(f"{'#' * level} {get_text_with_formatting(element)}")

        # Paragraphs with possible links or inline formatting
        elif tag == "p":
            # Links and inline code are swapped for ready-made Markdown
            # strings first, so their plain text is used as-is.
            for link in element.find_all("a"):
                href = link.get("href", "#")
                link_text = link.get_text(strip=True)
                link.replace_with(f"[{link_text}]({href})")

            for code in element.find_all("code"):
                code_text = code.get_text()
                code.replace_with(f"`{code_text}`")

            md_lines.append(get_text_with_formatting(element))

        # Preformatted code blocks (a <pre> without <code> is dropped)
        elif tag == "pre":
            code = element.find("code")
            if code:
                language = ""
                for cls in code.get("class", []):
                    if cls.startswith("language-"):
                        language = cls.replace("language-", "")
                        break
                code_text = code.get_text()
                md_lines.append(f"```{language}\n{code_text.strip()}\n```")

        # Images
        elif tag == "img":
            src = element.get("src", "")
            alt = element.get("alt", "")
            if src:
                md_lines.append(f"![{alt}]({src})")

        # Inline code
        elif tag == "code":
            code_text = get_text_with_formatting(element)
            md_lines.append(f"`{code_text}`")

        # Tables
        elif tag == "table":
            table_md = _table_to_markdown(element)
            if table_md:
                md_lines.append(table_md)

        # Fallback: any other tag is flattened to its inline text
        else:
            text = get_text_with_formatting(element)
            if text:
                md_lines.append(text)

    return "\n\n".join(md_lines)


def convert_chat_html_to_markdown(html_path: str) -> tuple:
    """Convert a saved ChatGPT conversation page into a Markdown document.

    Args:
        html_path: Path of the saved HTML file (read as UTF-8).

    Returns:
        A ``(filename, markdown)`` pair: the suggested output file name
        (sanitized page title plus ``.md``) and the Markdown text.

    Raises:
        ValueError: If the page contains no ``<main>`` element.
    """
    with open(html_path, "r", encoding="utf-8") as f:
        soup = BeautifulSoup(f, "html.parser")

    # An empty <title>, or one with nested markup, has no single string, so
    # read its text and fall back to the default name when nothing is left.
    title = soup.title.get_text().strip() if soup.title is not None else ""
    if not title:
        title = "chatgpt_conversation"
    filename = sanitize_filename(title) + ".md"

    main_tag = soup.find("main")
    if main_tag is None:
        raise ValueError("Could not find <main> in HTML. Is this a valid saved ChatGPT conversation?")

    messages = []
    turns = main_tag.find_all("div", attrs={'data-message-author-role': True})
    for conversation_turn in turns:
        # ChatGPT response div class="prose"
        # User response div class="whitespace-pre-wrap"
        content = conversation_turn.find("div", class_=["prose", "whitespace-pre-wrap"])
        if content is None:
            # Turn without a recognizable content container: nothing to render.
            continue

        body = extract_markdown_from_conversation(content).strip()
        if not body:
            continue

        role = conversation_turn.get('data-message-author-role')
        if role == "user":
            message = f"**You:**\n\n{body}"
        else:
            message = f"**ChatGPT**\n\n{body}"
        messages.append(message)

    # The dd-trace-time meta tag is read as an epoch value in milliseconds
    # and shown in the local time zone. It is optional: when it is missing
    # or unusable the header simply omits the save time.
    timestamp = None
    trace_meta = soup.find("meta", {"name": "dd-trace-time"})
    if trace_meta is not None:
        try:
            saved_seconds = int(trace_meta.get("content", "")) // 1000
            timestamp = datetime.fromtimestamp(saved_seconds).strftime("%c")
        except (ValueError, OverflowError, OSError):
            timestamp = None

    if timestamp:
        header = f"*ChatGPT conversation saved {timestamp} converted to Markdown*"
    else:
        header = "*ChatGPT conversation converted to Markdown*"

    markdown = f"# {title}\n\n{header}\n\n---\n\n" + "\n\n---\n\n".join(messages)
    return filename, markdown


def main(args):
    """Command-line entry point: convert one saved HTML file to Markdown.

    *args* is the full argument list including the program name, so exactly
    one user argument (the HTML path) is expected. The path is tried as
    given and then inside the ``input`` directory. The result is written to
    the ``output`` directory, which is created when missing. A wrong
    argument count or a missing file prints a message and exits with
    status 1.
    """
    if len(args) != 2:
        print("Usage: python html_to_markdown.py <path_to_saved_html>")
        sys.exit(1)

    requested = args[1]

    # Try the path as given first, then relative to the "input" directory.
    candidates = (requested, os.path.join("input", requested))
    source_path = next((path for path in candidates if os.path.isfile(path)), None)
    if source_path is None:
        print(f"File not found: {requested}")
        sys.exit(1)

    output_name, markdown_text = convert_chat_html_to_markdown(source_path)

    os.makedirs("output", exist_ok=True)
    destination = os.path.join("output", output_name)
    with open(destination, "w", encoding="utf-8") as out_file:
        out_file.write(markdown_text)

    print(f"Markdown saved to: {destination}")


# Run the converter only when this file is executed as a script. The full
# argument list (program name included) is handed to main(), which expects
# exactly one extra argument: the path of the saved HTML file.
if __name__ == "__main__":
    main(sys.argv)