Add requirements.txt with essential dependencies for NLP and data anonymization

This commit is contained in:
Ireneusz Bachanowicz 2025-07-27 20:46:06 +02:00
parent d20cf39e4a
commit c81ce7cc57
10 changed files with 2391 additions and 12 deletions

79
.gitignore vendored Normal file
View File

@ -0,0 +1,79 @@
# Operating System Files
.DS_Store
Thumbs.db
.localized
# IDE and Editor Files
.vscode/
.idea/
*.iml
*.ipr
*.iws
.project
.classpath
.settings/
# Build Artifacts
build/
dist/
target/
*.log
*.tmp
*.bak
*.swp
*.swo
# Python
__pycache__/
*.pyc
*.pyd
*.pyo
venv/
.venv/
env/
.env/
pip-log.txt
.Python
.pytest_cache/
.mypy_cache/
.ruff_cache/
# Node.js
node_modules/
npm-debug.log*
yarn-debug.log*
yarn-error.log*
.pnpm-store/
# Java
*.class
*.jar
*.war
*.ear
*.zip
*.tar.gz
# C/C++
*.o
*.obj
*.exe
*.dll
*.lib
*.so
*.dylib
*.a
*.out
# Ruby
.bundle/
vendor/bundle/
# macOS
.DS_Store
.AppleDouble
.LSOverride
# Windows
Thumbs.db
ehthumbs.db
Desktop.ini

38
bs.py
View File

@ -1,24 +1,38 @@
import sys
import json

from bs4 import BeautifulSoup  # retained from earlier revision; listed in requirements.txt
from markdownify import markdownify as md


def convert_html_to_markdown(input_file, output_file):
    """Convert a JSON export of HTML pages into a single Markdown file.

    Expects *input_file* to contain a JSON list of objects, each with the
    optional keys "title", "pageID", "pageLink", and "content" (an HTML
    string).  Each entry becomes a Markdown section: a level-1 heading
    from the title, the page metadata, the HTML body converted to
    Markdown, and a horizontal rule separating entries.

    Raises FileNotFoundError if *input_file* does not exist and
    json.JSONDecodeError if it is not valid JSON.
    """
    # Read the JSON content from the input file
    with open(input_file, 'r', encoding='utf-8') as f:
        data = json.load(f)

    markdown_output = []
    for item in data:
        # Missing metadata falls back to "N/A" rather than aborting the run.
        title = item.get("title", "N/A")
        page_id = item.get("pageID", "N/A")
        page_link = item.get("pageLink", "N/A")
        html_content = item.get("content", "")

        # Convert the HTML content of this page to Markdown
        markdown_content = md(html_content)

        # Prepend the page attributes before the converted body
        markdown_output.append(f"# {title}\n\n")
        markdown_output.append(f"**Page ID:** {page_id}\n")
        markdown_output.append(f"**Page Link:** {page_link}\n\n")
        markdown_output.append(markdown_content)
        markdown_output.append("\n---\n\n")  # Separator between entries

    # Write the accumulated Markdown to the output file in a single pass
    with open(output_file, 'w', encoding='utf-8') as f:
        f.write("".join(markdown_output))


if __name__ == "__main__":
    if len(sys.argv) != 3:
        print("Usage: python bs.py <input_json_file> <output_md_file>")
        sys.exit(1)  # fix: report misuse with a nonzero exit status
    input_file = sys.argv[1]
    output_file = sys.argv[2]
    convert_html_to_markdown(input_file, output_file)
    print(f"HTML content converted to Markdown. Output written to {output_file}")

152
data/HUB_DCR_html.json Normal file

File diff suppressed because one or more lines are too long

2130
data/HUB_DCR_output.md Normal file

File diff suppressed because it is too large Load Diff

4
requirements.txt Normal file
View File

@ -0,0 +1,4 @@
spacy
presidio-analyzer
presidio-anonymizer
beautifulsoup4
markdownify