42 lines
1.5 KiB
Python

import sys
import re
import json
from markdownify import markdownify as md
def convert_html_to_markdown(input_file, output_file):
    """Convert a JSON export of HTML pages into one Markdown document.

    The input file must hold JSON: either a list of page objects or a single
    page object. For every page the keys ``title``, ``pageID``, ``pageLink``
    and ``content`` (an HTML string) are read. Missing metadata keys are
    rendered as ``N/A``; missing or null content yields an empty body.

    Each page is written as a level-1 heading with the title, the page ID and
    link, the converted body, and a ``---`` separator. ``<a>`` tags are passed
    to markdownify's ``strip`` option, and Markdown image references
    (``![...](...)``) are removed from the converted text.

    Args:
        input_file: Path of the UTF-8 encoded JSON file to read.
        output_file: Path of the Markdown file to write; it is overwritten.
    """
    # Read the JSON content from the input file
    with open(input_file, 'r', encoding='utf-8') as f:
        data = json.load(f)

    # A lone JSON object is treated as a one-page export; iterating a dict
    # directly would walk its keys and fail on ``.get``.
    pages = [data] if isinstance(data, dict) else data

    # Compiled once: the pattern is the same for every page.
    image_link = re.compile(r'!\[.*?\]\(.*?\)')

    sections = []
    for page in pages:
        title = page.get("title", "N/A")
        page_id = page.get("pageID", "N/A")
        page_link = page.get("pageLink", "N/A")
        # ``or ""`` also covers an explicit JSON null, which ``.get`` with a
        # default would hand to the converter as None.
        html_content = page.get("content") or ""

        # Convert HTML content to Markdown
        body = md(html_content, strip=['a'])
        # Remove unwanted image links (e.g., ![](...) )
        body = image_link.sub('', body)
        # Trim trailing whitespace so the separator spacing below is uniform.
        body = body.rstrip()

        # The blank line before ``---`` is required: a text line directly
        # followed by ``---`` is parsed as a setext heading, not a rule.
        sections.append(
            f"# {title}\n\n"
            f"**Page ID:** {page_id}\n"
            f"**Page Link:** {page_link}\n\n"
            f"{body}\n\n---\n\n"
        )

    # Write the Markdown content to the output file
    with open(output_file, 'w', encoding='utf-8') as f:
        f.write("".join(sections))
if __name__ == "__main__":
    # Expect exactly two arguments: the input JSON path and the output
    # Markdown path.
    if len(sys.argv) != 3:
        # sys.exit with a string prints it to stderr and exits with status 1,
        # so calling shells and scripts can detect the misuse.
        sys.exit("Usage: python bs.py <input_json_file> <output_md_file>")

    input_file = sys.argv[1]
    output_file = sys.argv[2]
    convert_html_to_markdown(input_file, output_file)
    print(f"HTML content converted to Markdown. Output written to {output_file}")