Add requirements.txt with essential dependencies for NLP and data anonymization
This commit is contained in:
parent
d20cf39e4a
commit
c81ce7cc57
79
.gitignore
vendored
Normal file
79
.gitignore
vendored
Normal file
@ -0,0 +1,79 @@
|
|||||||
|
# Operating System Files
|
||||||
|
.DS_Store
|
||||||
|
Thumbs.db
|
||||||
|
.localized
|
||||||
|
|
||||||
|
# IDE and Editor Files
|
||||||
|
.vscode/
|
||||||
|
.idea/
|
||||||
|
*.iml
|
||||||
|
*.ipr
|
||||||
|
*.iws
|
||||||
|
.project
|
||||||
|
.classpath
|
||||||
|
.settings/
|
||||||
|
|
||||||
|
# Build Artifacts
|
||||||
|
build/
|
||||||
|
dist/
|
||||||
|
target/
|
||||||
|
*.log
|
||||||
|
*.tmp
|
||||||
|
*.bak
|
||||||
|
*.swp
|
||||||
|
*.swo
|
||||||
|
|
||||||
|
# Python
|
||||||
|
__pycache__/
|
||||||
|
*.pyc
|
||||||
|
*.pyd
|
||||||
|
*.pyo
|
||||||
|
venv/
|
||||||
|
.venv/
|
||||||
|
env/
|
||||||
|
.env/
|
||||||
|
pip-log.txt
|
||||||
|
.Python
|
||||||
|
.pytest_cache/
|
||||||
|
.mypy_cache/
|
||||||
|
.ruff_cache/
|
||||||
|
|
||||||
|
# Node.js
|
||||||
|
node_modules/
|
||||||
|
npm-debug.log*
|
||||||
|
yarn-debug.log*
|
||||||
|
yarn-error.log*
|
||||||
|
.pnpm-store/
|
||||||
|
|
||||||
|
# Java and packaged archives
|
||||||
|
*.class
|
||||||
|
*.jar
|
||||||
|
*.war
|
||||||
|
*.ear
|
||||||
|
*.zip
|
||||||
|
*.tar.gz
|
||||||
|
|
||||||
|
# C/C++
|
||||||
|
*.o
|
||||||
|
*.obj
|
||||||
|
*.exe
|
||||||
|
*.dll
|
||||||
|
*.lib
|
||||||
|
*.so
|
||||||
|
*.dylib
|
||||||
|
*.a
|
||||||
|
*.out
|
||||||
|
|
||||||
|
# Ruby
|
||||||
|
.bundle/
|
||||||
|
vendor/bundle/
|
||||||
|
|
||||||
|
# macOS
|
||||||
|
.DS_Store
|
||||||
|
.AppleDouble
|
||||||
|
.LSOverride
|
||||||
|
|
||||||
|
# Windows
|
||||||
|
Thumbs.db
|
||||||
|
ehthumbs.db
|
||||||
|
Desktop.ini
|
||||||
38
bs.py
38
bs.py
@ -1,24 +1,38 @@
|
|||||||
import sys
|
import sys
|
||||||
from bs4 import BeautifulSoup
|
import json
|
||||||
|
from markdownify import markdownify as md
|
||||||
|
|
||||||
def remove_html_tags(input_file, output_file):
|
def convert_html_to_markdown(input_file, output_file):
|
||||||
# Read the HTML content from the input file
|
# Read the JSON content from the input file
|
||||||
with open(input_file, 'r', encoding='utf-8') as f:
|
with open(input_file, 'r', encoding='utf-8') as f:
|
||||||
html_content = f.read()
|
data = json.load(f)
|
||||||
|
|
||||||
# Use BeautifulSoup to parse and extract text
|
markdown_output = []
|
||||||
soup = BeautifulSoup(html_content, 'html.parser')
|
for item in data:
|
||||||
text = soup.get_text()
|
title = item.get("title", "N/A")
|
||||||
|
page_id = item.get("pageID", "N/A")
|
||||||
|
page_link = item.get("pageLink", "N/A")
|
||||||
|
html_content = item.get("content", "")
|
||||||
|
|
||||||
# Write the plain text to the output file
|
# Convert HTML content to Markdown
|
||||||
|
markdown_content = md(html_content)
|
||||||
|
|
||||||
|
# Emit the title, page ID, and page link as a header ahead of the converted content
|
||||||
|
markdown_output.append(f"# {title}\n\n")
|
||||||
|
markdown_output.append(f"**Page ID:** {page_id}\n")
|
||||||
|
markdown_output.append(f"**Page Link:** {page_link}\n\n")
|
||||||
|
markdown_output.append(markdown_content)
|
||||||
|
markdown_output.append("\n---\n\n") # Separator between entries
|
||||||
|
|
||||||
|
# Write the Markdown content to the output file
|
||||||
with open(output_file, 'w', encoding='utf-8') as f:
|
with open(output_file, 'w', encoding='utf-8') as f:
|
||||||
f.write(text)
|
f.write("".join(markdown_output))
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
if len(sys.argv) != 3:
|
if len(sys.argv) != 3:
|
||||||
print("Usage: python remove_html_tags.py <input_file> <output_file>")
|
print("Usage: python bs.py <input_json_file> <output_md_file>")
|
||||||
else:
|
else:
|
||||||
input_file = sys.argv[1]
|
input_file = sys.argv[1]
|
||||||
output_file = sys.argv[2]
|
output_file = sys.argv[2]
|
||||||
remove_html_tags(input_file, output_file)
|
convert_html_to_markdown(input_file, output_file)
|
||||||
print(f"HTML tags removed. Output written to {output_file}")
|
print(f"HTML content converted to Markdown. Output written to {output_file}")
|
||||||
|
|||||||
152
data/HUB_DCR_html.json
Normal file
152
data/HUB_DCR_html.json
Normal file
File diff suppressed because one or more lines are too long
2130
data/HUB_DCR_output.md
Normal file
2130
data/HUB_DCR_output.md
Normal file
File diff suppressed because it is too large
Load Diff
4
requirements.txt
Normal file
4
requirements.txt
Normal file
@ -0,0 +1,4 @@
|
|||||||
|
spacy
|
||||||
|
presidio-analyzer
|
||||||
|
presidio-anonymizer
|
||||||
|
beautifulsoup4
|
||||||
Loading…
x
Reference in New Issue
Block a user