Add requirements.txt with essential dependencies for NLP and data anonymization

This commit is contained in:
Ireneusz Bachanowicz 2025-07-27 20:46:06 +02:00
parent d20cf39e4a
commit c81ce7cc57
10 changed files with 2391 additions and 12 deletions

79
.gitignore vendored Normal file
View File

@ -0,0 +1,79 @@
# Operating System Files
.DS_Store
Thumbs.db
.localized
# IDE and Editor Files
.vscode/
.idea/
*.iml
*.ipr
*.iws
.project
.classpath
.settings/
# Build Artifacts
build/
dist/
target/
*.log
*.tmp
*.bak
*.swp
*.swo
# Python
__pycache__/
*.pyc
*.pyd
*.pyo
venv/
.venv/
env/
.env/
pip-log.txt
.Python
.pytest_cache/
.mypy_cache/
.ruff_cache/
# Node.js
node_modules/
npm-debug.log*
yarn-debug.log*
yarn-error.log*
.pnpm-store/
# Java
*.class
*.jar
*.war
*.ear
*.zip
*.tar.gz
# C/C++
*.o
*.obj
*.exe
*.dll
*.lib
*.so
*.dylib
*.a
*.out
# Ruby
.bundle/
vendor/bundle/
# macOS
.DS_Store
.AppleDouble
.LSOverride
# Windows
Thumbs.db
ehthumbs.db
Desktop.ini

38
bs.py
View File

@ -1,24 +1,38 @@
import sys
import json

from bs4 import BeautifulSoup  # retained from earlier revision; listed in requirements.txt
from markdownify import markdownify as md


def convert_html_to_markdown(input_file, output_file):
    """Convert a JSON export of HTML pages into a single Markdown file.

    Expects *input_file* to contain a JSON list of objects, each with the
    optional keys "title", "pageID", "pageLink", and "content" (an HTML
    string).  Each entry becomes a Markdown section: a level-1 heading
    from the title, the page metadata, the HTML body converted to
    Markdown, and a horizontal rule separating entries.

    Raises FileNotFoundError if *input_file* does not exist and
    json.JSONDecodeError if it is not valid JSON.
    """
    # Read the JSON content from the input file
    with open(input_file, 'r', encoding='utf-8') as f:
        data = json.load(f)

    markdown_output = []
    for item in data:
        # Missing metadata falls back to "N/A" rather than aborting the run.
        title = item.get("title", "N/A")
        page_id = item.get("pageID", "N/A")
        page_link = item.get("pageLink", "N/A")
        html_content = item.get("content", "")

        # Convert the HTML content of this page to Markdown
        markdown_content = md(html_content)

        # Prepend the page attributes before the converted body
        markdown_output.append(f"# {title}\n\n")
        markdown_output.append(f"**Page ID:** {page_id}\n")
        markdown_output.append(f"**Page Link:** {page_link}\n\n")
        markdown_output.append(markdown_content)
        markdown_output.append("\n---\n\n")  # Separator between entries

    # Write the accumulated Markdown to the output file in a single pass
    with open(output_file, 'w', encoding='utf-8') as f:
        f.write("".join(markdown_output))


if __name__ == "__main__":
    if len(sys.argv) != 3:
        print("Usage: python bs.py <input_json_file> <output_md_file>")
        sys.exit(1)  # fix: report misuse with a nonzero exit status
    input_file = sys.argv[1]
    output_file = sys.argv[2]
    convert_html_to_markdown(input_file, output_file)
    print(f"HTML content converted to Markdown. Output written to {output_file}")

152
data/HUB_DCR_html.json Normal file

File diff suppressed because one or more lines are too long

2130
data/HUB_DCR_output.md Normal file

File diff suppressed because it is too large Load Diff

4
requirements.txt Normal file
View File

@ -0,0 +1,4 @@
spacy
presidio-analyzer
presidio-anonymizer
beautifulsoup4
markdownify