Add requirements.txt with essential dependencies for NLP and data anonymization
This commit is contained in:
parent
d20cf39e4a
commit
c81ce7cc57
79
.gitignore
vendored
Normal file
79
.gitignore
vendored
Normal file
@@ -0,0 +1,79 @@
|
||||
# Operating System Files
|
||||
.DS_Store
|
||||
Thumbs.db
|
||||
.localized
|
||||
|
||||
# IDE and Editor Files
|
||||
.vscode/
|
||||
.idea/
|
||||
*.iml
|
||||
*.ipr
|
||||
*.iws
|
||||
.project
|
||||
.classpath
|
||||
.settings/
|
||||
|
||||
# Build Artifacts
|
||||
build/
|
||||
dist/
|
||||
target/
|
||||
*.log
|
||||
*.tmp
|
||||
*.bak
|
||||
*.swp
|
||||
*.swo
|
||||
|
||||
# Python
|
||||
__pycache__/
|
||||
*.pyc
|
||||
*.pyd
|
||||
*.pyo
|
||||
venv/
|
||||
.venv/
|
||||
env/
|
||||
.env/
|
||||
pip-log.txt
|
||||
.Python
|
||||
.pytest_cache/
|
||||
.mypy_cache/
|
||||
.ruff_cache/
|
||||
|
||||
# Node.js
|
||||
node_modules/
|
||||
npm-debug.log*
|
||||
yarn-debug.log*
|
||||
yarn-error.log*
|
||||
.pnpm-store/
|
||||
|
||||
# Java
|
||||
*.class
|
||||
*.jar
|
||||
*.war
|
||||
*.ear
|
||||
*.zip
|
||||
*.tar.gz
|
||||
|
||||
# C/C++
|
||||
*.o
|
||||
*.obj
|
||||
*.exe
|
||||
*.dll
|
||||
*.lib
|
||||
*.so
|
||||
*.dylib
|
||||
*.a
|
||||
*.out
|
||||
|
||||
# Ruby
|
||||
.bundle/
|
||||
vendor/bundle/
|
||||
|
||||
# macOS
|
||||
.DS_Store
|
||||
.AppleDouble
|
||||
.LSOverride
|
||||
|
||||
# Windows
|
||||
Thumbs.db
|
||||
ehthumbs.db
|
||||
Desktop.ini
|
||||
38
bs.py
38
bs.py
@@ -1,24 +1,38 @@
|
||||
import sys
|
||||
from bs4 import BeautifulSoup
|
||||
import json
|
||||
from markdownify import markdownify as md
|
||||
|
||||
def convert_html_to_markdown(input_file, output_file):
    """Convert a JSON export of HTML pages into a single Markdown file.

    Reads *input_file* (a JSON array of page objects), converts each
    page's HTML ``content`` to Markdown via ``markdownify``, prepends the
    page's title, ID and link as a header, and writes all entries —
    separated by horizontal rules — to *output_file*.

    Args:
        input_file: Path to the JSON input. Each item may carry "title",
            "pageID", "pageLink" and "content" keys; missing keys fall
            back to "N/A" (or "" for content).
        output_file: Path of the Markdown file to write (overwritten).
    """
    # Read the JSON content from the input file
    with open(input_file, 'r', encoding='utf-8') as f:
        data = json.load(f)

    markdown_output = []
    for item in data:
        title = item.get("title", "N/A")
        page_id = item.get("pageID", "N/A")
        page_link = item.get("pageLink", "N/A")
        html_content = item.get("content", "")

        # Convert HTML content to Markdown
        markdown_content = md(html_content)

        # Prepend other attributes
        markdown_output.append(f"# {title}\n\n")
        markdown_output.append(f"**Page ID:** {page_id}\n")
        markdown_output.append(f"**Page Link:** {page_link}\n\n")
        markdown_output.append(markdown_content)
        markdown_output.append("\n---\n\n")  # Separator between entries

    # Write the Markdown content to the output file
    with open(output_file, 'w', encoding='utf-8') as f:
        f.write("".join(markdown_output))
|
||||
|
||||
if __name__ == "__main__":
    # CLI entry point: expects exactly two arguments (input JSON, output MD).
    if len(sys.argv) != 3:
        print("Usage: python bs.py <input_json_file> <output_md_file>")
    else:
        input_file = sys.argv[1]
        output_file = sys.argv[2]
        convert_html_to_markdown(input_file, output_file)
        print(f"HTML content converted to Markdown. Output written to {output_file}")
|
||||
|
||||
152
data/HUB_DCR_html.json
Normal file
152
data/HUB_DCR_html.json
Normal file
File diff suppressed because one or more lines are too long
2130
data/HUB_DCR_output.md
Normal file
2130
data/HUB_DCR_output.md
Normal file
File diff suppressed because it is too large
Load Diff
4
requirements.txt
Normal file
4
requirements.txt
Normal file
@@ -0,0 +1,4 @@
|
||||
spacy
|
||||
presidio-analyzer
|
||||
presidio-anonymizer
|
||||
beautifulsoup4
|
||||
Loading…
x
Reference in New Issue
Block a user