"""Redact PII from JSON and HTML files using Presidio."""

import json
from pathlib import Path

from bs4 import BeautifulSoup
from presidio_analyzer import AnalyzerEngine
from presidio_anonymizer import AnonymizerEngine

# Both engines are created once at import time and shared by every call below.
analyzer = AnalyzerEngine()
anonymizer = AnonymizerEngine()

# Text nodes whose direct parent has one of these names are left untouched.
_SKIPPED_PARENTS = frozenset({"script", "style", "head", "meta", "[document]"})


def _redacted_path(file_path):
    """Return the sibling path of *file_path* whose file name is prefixed with ``redacted_``."""
    source = Path(file_path)
    # Prefix only the final component: prefixing the whole string would turn
    # "data/in.json" into "redacted_data/in.json", i.e. a different directory.
    return source.with_name(f"redacted_{source.name}")


def _redact_json_values(obj):
    """Return a copy of *obj* with every string value passed through ``redact_text``.

    Dicts and lists are walked recursively; dict keys and non-string scalars
    are returned unchanged.
    """
    if isinstance(obj, dict):
        return {key: _redact_json_values(value) for key, value in obj.items()}
    if isinstance(obj, list):
        return [_redact_json_values(item) for item in obj]
    if isinstance(obj, str):
        return redact_text(obj)
    return obj


def redact_text(text_content):
    """Return *text_content* with the PII found by the analyzer anonymized.

    Args:
        text_content: The text to scan. It is analyzed with ``language='en'``.

    Returns:
        The anonymized text. Empty or whitespace-only input is returned
        unchanged without calling the engines.
    """
    # Blank input has nothing to redact; HTML documents contain many such nodes.
    if not text_content or text_content.isspace():
        return text_content

    # Analyze the text for PII
    results = analyzer.analyze(text=text_content, language="en")

    # Anonymize the detected PII
    anonymized_result = anonymizer.anonymize(
        text=text_content, analyzer_results=results
    )
    return anonymized_result.text


def process_json_file(file_path):
    """Redact every string value in a JSON file and save the result.

    The output is written next to the input, with ``redacted_`` prefixed to
    the file name, and the output path is printed.

    Args:
        file_path: Path of the UTF-8 encoded JSON file to read.
    """
    with open(file_path, "r", encoding="utf-8") as f:
        data = json.load(f)

    redacted_data = _redact_json_values(data)

    output_path = _redacted_path(file_path)
    with open(output_path, "w", encoding="utf-8") as f:
        json.dump(redacted_data, f, indent=2)
    print(f"Redacted JSON saved to {output_path}")


def process_html_file(file_path):
    """Redact text nodes and string attributes in an HTML file and save the result.

    The output is written next to the input, with ``redacted_`` prefixed to
    the file name, and the output path is printed.

    Args:
        file_path: Path of the UTF-8 encoded HTML file to read.
    """
    with open(file_path, "r", encoding="utf-8") as f:
        soup = BeautifulSoup(f, "html.parser")

    # Find all text nodes (excluding script and style)
    for text_node in soup.find_all(string=True):
        if text_node.parent.name in _SKIPPED_PARENTS:
            continue
        original_text = str(text_node)
        redacted_text = redact_text(original_text)
        if redacted_text != original_text:
            # Rebuild the node with its own class: replacing with a plain str
            # would turn e.g. a Comment into ordinary visible text.
            text_node.replace_with(type(text_node)(redacted_text))

    # Attributes can carry PII too (e.g. 'alt', 'title'). Every string-valued
    # attribute is redacted; list-valued ones such as 'class' are left as-is.
    for tag in soup.find_all(True):
        # Iterate over a snapshot because values are reassigned in the loop.
        for attr, value in list(tag.attrs.items()):
            if isinstance(value, str):
                tag.attrs[attr] = redact_text(value)

    output_path = _redacted_path(file_path)
    with open(output_path, "w", encoding="utf-8") as f:
        f.write(str(soup))
    print(f"Redacted HTML saved to {output_path}")


# Example usage:
# process_json_file("your_data.json")
# process_html_file("your_webpage.html")