"""Convert a JSON export of HTML pages into one Markdown document."""

import json
import re
import sys

from markdownify import markdownify as md

# Markdown image syntax, e.g. ![alt](url). Compiled once because it is applied
# to every entry in the input.
_IMAGE_LINK_RE = re.compile(r'!\[.*?\]\(.*?\)')


def convert_html_to_markdown(input_file, output_file):
    """Read page records from a JSON file and write them out as Markdown.

    The input file must contain a JSON array of objects. Each object may
    provide the keys ``title``, ``pageID``, ``pageLink`` and ``content``
    (an HTML string). Missing ``title``/``pageID``/``pageLink`` values are
    rendered as ``N/A``; a missing or null ``content`` is treated as empty.

    Every entry is rendered as a level-1 heading with the title, the page ID
    and page link in bold-labelled lines, the converted body, and a ``---``
    separator.

    Args:
        input_file: Path of the UTF-8 encoded JSON file to read.
        output_file: Path of the Markdown file to write (overwritten if it
            already exists).

    Raises:
        OSError: If either file cannot be opened.
        json.JSONDecodeError: If the input file is not valid JSON.
    """
    with open(input_file, 'r', encoding='utf-8') as f:
        data = json.load(f)

    markdown_output = []
    for item in data:
        title = item.get("title", "N/A")
        page_id = item.get("pageID", "N/A")
        page_link = item.get("pageLink", "N/A")
        # `or ""` also covers an explicit JSON null, which .get() would
        # otherwise hand to the converter as None.
        html_content = item.get("content") or ""

        # strip=['a'] uses markdownify's `strip` option so anchor tags are
        # not turned into Markdown links.
        markdown_content = md(html_content, strip=['a'])

        # Drop image references (![alt](url)) from the converted text.
        markdown_content = _IMAGE_LINK_RE.sub('', markdown_content)

        # Metadata header first, then the body, then a separator so that
        # consecutive entries stay visually distinct.
        markdown_output.append(f"# {title}\n\n")
        markdown_output.append(f"**Page ID:** {page_id}\n")
        markdown_output.append(f"**Page Link:** {page_link}\n\n")
        markdown_output.append(markdown_content)
        markdown_output.append("\n---\n\n")

    # Single write of the joined parts rather than many small writes.
    with open(output_file, 'w', encoding='utf-8') as f:
        f.write("".join(markdown_output))


def main(argv=None):
    """Command-line entry point.

    Args:
        argv: Argument vector including the program name; defaults to
            ``sys.argv``.

    Returns:
        Process exit status: 0 on success, 1 when the arguments are wrong.
    """
    argv = sys.argv if argv is None else argv
    if len(argv) != 3:
        # Usage errors go to stderr and yield a non-zero status so that
        # shells and calling scripts can detect the failure.
        print("Usage: python bs.py <input_file> <output_file>", file=sys.stderr)
        return 1

    input_file = argv[1]
    output_file = argv[2]
    convert_html_to_markdown(input_file, output_file)
    print(f"HTML content converted to Markdown. Output written to {output_file}")
    return 0


if __name__ == "__main__":
    sys.exit(main())