# File-viewer metadata: 42 lines, 1.5 KiB, Python
import sys
|
|
import re
|
|
import json
|
|
from markdownify import markdownify as md
|
|
|
|
def convert_html_to_markdown(input_file, output_file):
    """Convert a JSON export of HTML pages into a single Markdown document.

    The input file must hold a JSON array of objects. For every object a
    header is written from its ``title``, ``pageID`` and ``pageLink`` values
    (``"N/A"`` when a key is absent), followed by its ``content`` HTML
    converted to Markdown with image references removed. Entries are
    separated by a horizontal rule.

    Args:
        input_file: Path of the UTF-8 encoded JSON file to read.
        output_file: Path of the Markdown file to write as UTF-8; an
            existing file is overwritten.
    """
    # Read the JSON content from the input file
    with open(input_file, 'r', encoding='utf-8') as f:
        data = json.load(f)

    # Matches Markdown image syntax, e.g. ![alt text](path/to/image.png).
    # Compiled once because it is applied to every entry.
    image_pattern = re.compile(r'!\[.*?\]\(.*?\)')

    markdown_output = []
    for item in data:
        title = item.get("title", "N/A")
        page_id = item.get("pageID", "N/A")
        page_link = item.get("pageLink", "N/A")
        # A JSON null would come back as None; treat it like missing content
        # so the converter always receives a string.
        html_content = item.get("content") or ""

        # Convert HTML content to Markdown; <a> tags are passed in markdownify's
        # `strip` list so they are not rendered as Markdown links.
        markdown_content = md(html_content, strip=['a'])
        # Remove unwanted image links
        markdown_content = image_pattern.sub('', markdown_content)

        # Prepend other attributes
        markdown_output.append(f"# {title}\n\n")
        markdown_output.append(f"**Page ID:** {page_id}\n")
        markdown_output.append(f"**Page Link:** {page_link}\n\n")
        # Trailing whitespace is normalised and a blank line is forced before
        # the rule: a "---" line directly under text would be parsed as a
        # setext heading underline rather than as a separator.
        markdown_output.append(markdown_content.rstrip())
        markdown_output.append("\n\n---\n\n")  # Separator between entries

    # Write the Markdown content to the output file
    with open(output_file, 'w', encoding='utf-8') as f:
        f.write("".join(markdown_output))
|
|
|
|
if __name__ == "__main__":
    # Command-line entry point: expects exactly an input JSON path and an
    # output Markdown path.
    if len(sys.argv) != 3:
        # sys.exit() with a string prints it to stderr and exits with status 1,
        # so a usage error is distinguishable from a successful run.
        sys.exit("Usage: python bs.py <input_json_file> <output_md_file>")

    input_file = sys.argv[1]
    output_file = sys.argv[2]
    convert_html_to_markdown(input_file, output_file)
    print(f"HTML content converted to Markdown. Output written to {output_file}")
|