# anonymizer/bs.py

import sys
import re
import json
from markdownify import markdownify as md

def convert_html_to_markdown(input_file, output_file) -> None:
    """Convert a JSON export of HTML pages into one Markdown document.

    The input file must hold a JSON array of objects. For every object the
    ``title``, ``pageID``, ``pageLink`` and ``content`` keys are read. The
    HTML in ``content`` is converted with markdownify using ``strip=['a']``
    and Markdown image links (``![alt](url)``) are removed from the result.
    Each entry is emitted as a heading, the page ID and link, the converted
    body and a ``---`` separator.

    Metadata that is missing or JSON ``null`` is rendered as ``N/A``.
    Content that is missing, ``null`` or empty produces an empty body and is
    not passed to the converter.

    Args:
        input_file: Path of the UTF-8 encoded JSON file to read.
        output_file: Path of the Markdown file to write (UTF-8, overwritten).
    """
    with open(input_file, 'r', encoding='utf-8') as f:
        data = json.load(f)

    # Compiled once: the pattern is the same for every entry.
    image_link = re.compile(r'!\[.*?\]\(.*?\)')

    def field(item, key):
        # dict.get's default only covers absent keys; a JSON null arrives
        # as None and would otherwise be printed as the text "None".
        value = item.get(key)
        return "N/A" if value is None else value

    markdown_output = []
    for item in data:
        html_content = item.get("content")
        if html_content:
            # Convert HTML content to Markdown, then drop image links.
            markdown_content = md(html_content, strip=['a'])
            markdown_content = image_link.sub('', markdown_content)
        else:
            # None/empty content: nothing to convert (the converter would
            # raise on None).
            markdown_content = ""

        # Prepend other attributes
        markdown_output.append(f"# {field(item, 'title')}\n\n")
        markdown_output.append(f"**Page ID:** {field(item, 'pageID')}\n")
        markdown_output.append(f"**Page Link:** {field(item, 'pageLink')}\n\n")
        markdown_output.append(markdown_content)
        markdown_output.append("\n---\n\n")  # Separator between entries

    # Write the Markdown content to the output file in a single call.
    with open(output_file, 'w', encoding='utf-8') as f:
        f.write("".join(markdown_output))

if __name__ == "__main__":
    # Command-line entry point: bs.py <input_json_file> <output_md_file>
    if len(sys.argv) != 3:
        # Report misuse on stderr and exit non-zero so shells and calling
        # scripts can tell the conversion did not run.
        print("Usage: python bs.py <input_json_file> <output_md_file>", file=sys.stderr)
        sys.exit(2)

    input_file = sys.argv[1]
    output_file = sys.argv[2]
    convert_html_to_markdown(input_file, output_file)
    print(f"HTML content converted to Markdown. Output written to {output_file}")