62 lines
2.3 KiB
Python
62 lines
2.3 KiB
Python
import json
|
|
from bs4 import BeautifulSoup
|
|
from presidio_analyzer import AnalyzerEngine
|
|
from presidio_anonymizer import AnonymizerEngine
|
|
|
|
# Shared Presidio analyzer engine, built once at import time; redact_text()
# uses it to detect PII spans in a piece of text.
analyzer = AnalyzerEngine()
|
|
# Shared Presidio anonymizer engine, built once at import time; redact_text()
# uses it to replace the spans reported by the analyzer.
anonymizer = AnonymizerEngine()
|
|
|
|
def redact_text(text_content):
    """Return a copy of ``text_content`` with detected PII anonymized.

    The text is scanned as English (``language='en'``) by the module-level
    ``analyzer``; every finding is then handed to the module-level
    ``anonymizer``. No custom operators are passed, so the anonymizer's
    default replacement behaviour applies.

    Args:
        text_content: The string to scan and redact.

    Returns:
        The anonymized text as a plain string.
    """
    # Step 1: locate PII spans in the input.
    findings = analyzer.analyze(text=text_content, language='en')

    # Step 2: rewrite those spans and hand back only the resulting text.
    return anonymizer.anonymize(
        text=text_content,
        analyzer_results=findings,
    ).text
|
|
|
|
def process_json_file(file_path):
    """Redact PII from every string value in a JSON file and save a copy.

    The file is parsed, each string value (at any nesting depth inside
    objects and arrays) is passed through ``redact_text``, and the result is
    written next to the input file as ``redacted_<original file name>``.
    Object keys and non-string values (numbers, booleans, ``null``) are
    left untouched.

    Args:
        file_path: Path of the UTF-8 encoded JSON file to read.

    Raises:
        OSError: If the input cannot be opened or the output cannot be written.
        json.JSONDecodeError: If the input is not valid JSON.
    """
    import os  # local import so this function does not depend on file-level edits

    with open(file_path, 'r', encoding='utf-8') as f:
        data = json.load(f)

    def redact_json_values(obj):
        """Recursively rebuild ``obj`` with every string value redacted."""
        if isinstance(obj, dict):
            return {k: redact_json_values(v) for k, v in obj.items()}
        if isinstance(obj, list):
            return [redact_json_values(elem) for elem in obj]
        if isinstance(obj, str):
            return redact_text(obj)
        # Numbers, booleans and None cannot carry free-text PII.
        return obj

    redacted_data = redact_json_values(data)

    # Prefix only the file name. Prefixing the whole path would turn
    # "data/in.json" into "redacted_data/in.json", i.e. a different (and
    # usually missing) directory. For a bare file name the result is the
    # same as before: "redacted_<name>" in the current directory.
    directory, filename = os.path.split(file_path)
    output_path = os.path.join(directory, f"redacted_{filename}")

    with open(output_path, 'w', encoding='utf-8') as f:
        json.dump(redacted_data, f, indent=2)
    print(f"Redacted JSON saved to {output_path}")
|
|
|
|
def process_html_file(file_path):
    """Redact PII from the text and string attributes of an HTML file.

    The document is parsed with BeautifulSoup's ``html.parser``. Every text
    node whose parent is not ``script``, ``style``, ``head``, ``meta`` or
    the document root is passed through ``redact_text``; afterwards every
    string-valued tag attribute (``alt``, ``title``, ``href``, ...) is
    redacted as well. Multi-valued attributes that BeautifulSoup exposes as
    lists (for example ``class``) are left unchanged. The result is written
    next to the input file as ``redacted_<original file name>``.

    Args:
        file_path: Path of the UTF-8 encoded HTML file to read.

    Raises:
        OSError: If the input cannot be opened or the output cannot be written.
    """
    import os  # local import so this function does not depend on file-level edits

    with open(file_path, 'r', encoding='utf-8') as f:
        soup = BeautifulSoup(f, 'html.parser')

    # Parents whose direct text is code/metadata rather than page content.
    skipped_parents = {'script', 'style', 'head', 'meta', '[document]'}

    # find_all() returns a materialized list, so replacing nodes while
    # looping over it is safe.
    for text_node in soup.find_all(string=True):
        if text_node.parent.name in skipped_parents:
            continue

        original_text = str(text_node)
        if not original_text.strip():
            # Whitespace-only nodes cannot contain PII; skip the analyzer call.
            continue

        redacted_text = redact_text(original_text)
        # Rebuild the node with its original string class. Passing a plain
        # str would turn special nodes such as comments or CDATA sections
        # into ordinary text, making e.g. "<!-- note -->" show up as visible
        # page content in the output.
        text_node.replace_with(type(text_node)(redacted_text))

    # Attributes (alt, title, href, ...) can carry PII too.
    for tag in soup.find_all(True):
        # Iterate over a snapshot because values are reassigned in the loop.
        for attr, value in list(tag.attrs.items()):
            if isinstance(value, str):
                tag.attrs[attr] = redact_text(value)

    # Prefix only the file name. Prefixing the whole path would turn
    # "site/page.html" into "redacted_site/page.html", i.e. a different (and
    # usually missing) directory. For a bare file name the result is the
    # same as before: "redacted_<name>" in the current directory.
    directory, filename = os.path.split(file_path)
    output_path = os.path.join(directory, f"redacted_{filename}")

    with open(output_path, 'w', encoding='utf-8') as f:
        f.write(str(soup))
    print(f"Redacted HTML saved to {output_path}")
|
|
|
|
# Example usage:
# process_json_file("your_data.json")
# process_html_file("your_webpage.html")