# anonymizer/strip.py

import json
from pathlib import Path

from bs4 import BeautifulSoup
from presidio_analyzer import AnalyzerEngine
from presidio_anonymizer import AnonymizerEngine

# Module-level engine instances, created once when the module is imported and
# reused by every redact_text() call: `analyzer` detects PII spans and
# `anonymizer` rewrites the text using those spans.
analyzer = AnalyzerEngine()
anonymizer = AnonymizerEngine()

def redact_text(text_content):
    """Return *text_content* with detected PII anonymized.

    The text is scanned by the module-level ``analyzer`` (language ``'en'``)
    and the findings are handed to the module-level ``anonymizer``; the
    ``text`` attribute of the anonymizer's result is returned.

    Args:
        text_content: The string to scan and redact.

    Returns:
        The anonymized text produced by the anonymizer engine.
    """
    findings = analyzer.analyze(text=text_content, language='en')
    return anonymizer.anonymize(
        text=text_content,
        analyzer_results=findings,
    ).text

def process_json_file(file_path):
    """Redact PII from every string value in a JSON file.

    Loads the JSON document at *file_path*, passes every string value (at any
    nesting depth inside dicts and lists) through ``redact_text``, and writes
    the result next to the input file as ``redacted_<original file name>``,
    indented by two spaces. Dictionary keys and non-string values are copied
    through unchanged.

    Args:
        file_path: Path of the JSON file to read.

    Raises:
        OSError: If the input cannot be opened or the output cannot be written.
        json.JSONDecodeError: If the input is not valid JSON.
    """
    with open(file_path, 'r', encoding='utf-8') as f:
        data = json.load(f)

    def redact_json_values(obj):
        """Return a copy of *obj* with every nested string value redacted."""
        if isinstance(obj, dict):
            # Only values are redacted; keys are kept as-is.
            return {k: redact_json_values(v) for k, v in obj.items()}
        if isinstance(obj, list):
            return [redact_json_values(elem) for elem in obj]
        if isinstance(obj, str):
            return redact_text(obj)
        # Numbers, booleans and None carry no text to redact.
        return obj

    redacted_data = redact_json_values(data)

    # Prefix the file *name*, not the whole path: "data/in.json" must become
    # "data/redacted_in.json", not "redacted_data/in.json" (a directory that
    # normally does not exist). Bare file names produce the same output path
    # as before.
    source_path = Path(file_path)
    output_path = source_path.with_name(f"redacted_{source_path.name}")

    with open(output_path, 'w', encoding='utf-8') as f:
        json.dump(redacted_data, f, indent=2)
    print(f"Redacted JSON saved to {output_path}")

def process_html_file(file_path):
    """Redact PII from the text and string attributes of an HTML file.

    Parses *file_path* with BeautifulSoup's ``html.parser``, passes text
    strings and string-valued tag attributes through ``redact_text``, and
    writes the serialized document next to the input file as
    ``redacted_<original file name>``.

    Strings whose direct parent is ``script``, ``style``, ``head``, ``meta``
    or the document root are left untouched. Multi-valued attributes (stored
    by BeautifulSoup as lists rather than ``str``) are not redacted.

    Args:
        file_path: Path of the HTML file to read.

    Raises:
        OSError: If the input cannot be opened or the output cannot be written.
    """
    with open(file_path, 'r', encoding='utf-8') as f:
        soup = BeautifulSoup(f, 'html.parser')

    skipped_parents = {'script', 'style', 'head', 'meta', '[document]'}

    # find_all() returns a materialized result list, so replacing nodes while
    # looping over it is safe.
    for text_node in soup.find_all(string=True):
        if text_node.parent.name in skipped_parents:
            continue
        original_text = str(text_node)
        # Whitespace-only strings (indentation between tags) cannot contain
        # PII; skip them to avoid a needless analyzer call per node.
        if not original_text.strip():
            continue
        redacted_text = redact_text(original_text)
        if redacted_text != original_text:
            # Rebuild the string with the node's own class. Replacing with a
            # plain str would turn Comment/CData nodes into ordinary text and
            # make hidden comments visible in the rendered page.
            replacement = soup.new_string(redacted_text, type(text_node))
            text_node.replace_with(replacement)

    # Attribute values (alt, title, href, ...) can carry PII as well.
    for tag in soup.find_all(True):
        # Only existing keys are reassigned, so the dict's size never changes
        # during iteration.
        for attr, value in tag.attrs.items():
            if isinstance(value, str) and value.strip():
                tag.attrs[attr] = redact_text(value)

    # Prefix the file *name*, not the whole path: "site/page.html" must become
    # "site/redacted_page.html", not "redacted_site/page.html". Bare file
    # names produce the same output path as before.
    source_path = Path(file_path)
    output_path = source_path.with_name(f"redacted_{source_path.name}")

    with open(output_path, 'w', encoding='utf-8') as f:
        f.write(str(soup))
    print(f"Redacted HTML saved to {output_path}")

# Example usage:
# process_json_file("your_data.json")
# process_html_file("your_webpage.html")