anonymizer/strip.py
Ireneusz Bachanowicz d20cf39e4a first commit
2025-07-14 17:12:50 +02:00

62 lines
2.3 KiB
Python

import json
from pathlib import Path

from bs4 import BeautifulSoup
from presidio_analyzer import AnalyzerEngine
from presidio_anonymizer import AnonymizerEngine
# Shared Presidio engines, created once when the module is imported and
# reused by every redact_text() call.
# NOTE(review): engine construction presumably loads NLP models and is slow;
# confirm before moving it into a per-call scope.
analyzer = AnalyzerEngine()
anonymizer = AnonymizerEngine()
def redact_text(text_content):
    """Return ``text_content`` with detected PII replaced by Presidio.

    Args:
        text_content: The string to scan. It is analyzed as English text.

    Returns:
        The anonymized string. Input that is empty or consists only of
        whitespace is returned unchanged without invoking the engines.
    """
    # Blank input cannot contain PII; returning early avoids a full analyzer
    # pass for the many whitespace-only strings found in HTML documents.
    if not text_content.strip():
        return text_content

    # Analyze the text for PII
    results = analyzer.analyze(text=text_content, language='en')
    # Anonymize the detected PII
    anonymized_result = anonymizer.anonymize(
        text=text_content, analyzer_results=results
    )
    return anonymized_result.text
def process_json_file(file_path):
    """Redact PII from every string value in a JSON file.

    The document is loaded, every string value at any nesting depth inside
    dicts and lists is passed through ``redact_text``, and the result is
    written next to the input file as ``redacted_<original file name>``.
    Dictionary keys and non-string values are left untouched.

    Args:
        file_path: Path of the UTF-8 encoded JSON file to process.
    """
    source_path = Path(file_path)
    # Prefix only the file name. Prefixing the whole path string would turn
    # "data/in.json" into "redacted_data/in.json", i.e. point into a
    # directory that normally does not exist.
    output_path = source_path.with_name(f"redacted_{source_path.name}")

    with open(source_path, 'r', encoding='utf-8') as f:
        data = json.load(f)

    # Recursive function to redact strings in JSON
    def redact_json_values(obj):
        if isinstance(obj, dict):
            return {k: redact_json_values(v) for k, v in obj.items()}
        if isinstance(obj, list):
            return [redact_json_values(elem) for elem in obj]
        if isinstance(obj, str):
            return redact_text(obj)
        # Numbers, booleans and null carry no free text to redact.
        return obj

    redacted_data = redact_json_values(data)

    with open(output_path, 'w', encoding='utf-8') as f:
        json.dump(redacted_data, f, indent=2)
    print(f"Redacted JSON saved to {output_path}")
def process_html_file(file_path):
    """Redact PII from the text and string attributes of an HTML file.

    The file is parsed with BeautifulSoup's ``html.parser``. Every text node
    whose parent is not ``script``, ``style``, ``head``, ``meta`` or the
    document root is passed through ``redact_text``, as is every attribute
    whose value is a single string. The result is written next to the input
    file as ``redacted_<original file name>``.

    Args:
        file_path: Path of the UTF-8 encoded HTML file to process.
    """
    source_path = Path(file_path)
    # Prefix only the file name so that inputs located in another directory
    # do not produce an output path inside a non-existent directory.
    output_path = source_path.with_name(f"redacted_{source_path.name}")

    with open(source_path, 'r', encoding='utf-8') as f:
        soup = BeautifulSoup(f, 'html.parser')

    skipped_parents = {'script', 'style', 'head', 'meta', '[document]'}

    # Find all text nodes (excluding script and style)
    for text_node in soup.find_all(string=True):
        if text_node.parent.name in skipped_parents:
            continue
        original_text = str(text_node)
        if not original_text.strip():
            # Whitespace-only nodes cannot hold PII; leave them in place.
            continue
        redacted_text = redact_text(original_text)
        # find_all(string=True) also yields comments, CDATA sections, etc.
        # Rebuild the replacement with the node's own class so that, for
        # example, an HTML comment stays a comment instead of turning into
        # visible page text.
        text_node.replace_with(type(text_node)(redacted_text))

    # Attributes such as 'alt' or 'title' may carry PII as well. Only
    # single-string values are handled; list-valued attributes are left
    # untouched.
    for tag in soup.find_all(True):  # Iterate through all tags
        for attr, value in tag.attrs.items():
            if isinstance(value, str) and value.strip():
                # Only existing keys are reassigned, so the dict keeps its
                # size while being iterated.
                tag.attrs[attr] = redact_text(value)

    with open(output_path, 'w', encoding='utf-8') as f:
        f.write(str(soup))
    print(f"Redacted HTML saved to {output_path}")
# Example usage:
# process_json_file("your_data.json")
# process_html_file("your_webpage.html")