154 lines
5.7 KiB
Python
154 lines
5.7 KiB
Python
import sys
|
|
import spacy
|
|
from presidio_analyzer import AnalyzerEngine
|
|
from presidio_analyzer.nlp_engine import SpacyNlpEngine, NlpEngineProvider
|
|
from presidio_anonymizer import AnonymizerEngine
|
|
|
|
def anonymize_file(input_path, output_path):
    """Anonymize PII in a text file using Presidio.

    Reads ``input_path`` as UTF-8 text, detects PII entities (names, emails,
    phone numbers, ...) with a Presidio ``AnalyzerEngine`` backed by a spaCy
    model, replaces them with an ``AnonymizerEngine``, and writes the result
    to ``output_path``.

    Parameters
    ----------
    input_path : str
        Path of the UTF-8 text file to anonymize.
    output_path : str
        Path the anonymized text is written to (UTF-8).

    Returns
    -------
    None
        Failures are reported on stdout; the function returns early on
        read/analysis errors instead of raising.
    """
    analyzer = _create_analyzer()
    anonymizer = AnonymizerEngine()

    # Read the whole input file up front; chunking (if needed) happens later.
    try:
        with open(input_path, 'r', encoding='utf-8') as f:
            text = f.read()
    except Exception as e:
        print(f"Error reading input file: {str(e)}")
        return

    final_text = _anonymize_text(text, analyzer, anonymizer)
    if final_text is None:
        # Single-pass analysis/anonymization failed; error already printed.
        return

    # Write output.
    try:
        with open(output_path, 'w', encoding='utf-8') as f:
            f.write(final_text)
        print(f"Anonymized content written to {output_path}")
    except Exception as e:
        print(f"Error writing output file: {str(e)}")


def _create_analyzer():
    """Build an AnalyzerEngine, falling back through progressively simpler setups.

    Order: NlpEngineProvider with en_core_web_lg -> direct SpacyNlpEngine with
    whichever English model loads -> Presidio's default AnalyzerEngine.
    """
    try:
        # Method 1: Using NlpEngineProvider (recommended).
        nlp_configuration = {
            "nlp_engine_name": "spacy",
            "models": [{"lang_code": "en", "model_name": "en_core_web_lg"}],
        }
        nlp_engine = NlpEngineProvider(nlp_configuration=nlp_configuration).create_engine()

        # Raise spaCy's max_length so large documents don't trip its limit.
        if hasattr(nlp_engine, 'nlp') and 'en' in nlp_engine.nlp:
            nlp_engine.nlp['en'].max_length = 3_000_000

        return AnalyzerEngine(nlp_engine=nlp_engine, supported_languages=["en"])
    except Exception as e:
        print(f"Error with NlpEngineProvider, trying alternative method: {str(e)}")

    # Method 2: Direct SpacyNlpEngine initialization (fallback).
    try:
        # Load a spaCy model directly, preferring the large one.
        try:
            model_name = "en_core_web_lg"
            nlp = spacy.load(model_name)
        except OSError:
            print("en_core_web_lg model not found. Trying en_core_web_sm...")
            model_name = "en_core_web_sm"
            nlp = spacy.load(model_name)
        nlp.max_length = 3_000_000

        # BUGFIX: advertise the model actually loaded. The original always
        # declared "en_core_web_lg" even when en_core_web_sm was in use.
        nlp_engine = SpacyNlpEngine(
            models=[{"lang_code": "en", "model_name": model_name}]
        )
        # Override the engine's pipeline with our pre-configured one so the
        # raised max_length takes effect.
        nlp_engine.nlp = {"en": nlp}
        return AnalyzerEngine(nlp_engine=nlp_engine, supported_languages=["en"])
    except Exception as e2:
        print(f"Error with fallback method: {str(e2)}")
        print("Falling back to default analyzer...")
        return AnalyzerEngine()


def _anonymize_text(text, analyzer, anonymizer):
    """Analyze and anonymize ``text``, splitting very large inputs into chunks.

    Returns the anonymized text, or None when single-pass processing fails.
    NOTE(review): fixed-offset chunking can split a PII entity across a chunk
    boundary, leaving that entity undetected — a best-effort trade-off kept
    from the original implementation.
    """
    max_chunk_size = 1_000_000  # 1MB chunks

    if len(text) <= max_chunk_size:
        # Process entire text at once.
        try:
            print("Analyzing text for PII...")
            results = analyzer.analyze(text=text, language='en')
            print(f"Found {len(results)} PII entities")

            print("Anonymizing text...")
            return anonymizer.anonymize(text=text, analyzer_results=results).text
        except Exception as e:
            print(f"Error during analysis/anonymization: {str(e)}")
            return None

    print(f"Text is large ({len(text)} chars), processing in chunks...")
    chunks = [text[i:i + max_chunk_size] for i in range(0, len(text), max_chunk_size)]
    anonymized_chunks = []

    for i, chunk in enumerate(chunks):
        print(f"Processing chunk {i+1}/{len(chunks)}...")
        try:
            results = analyzer.analyze(text=chunk, language='en')
            anonymized_result = anonymizer.anonymize(text=chunk, analyzer_results=results)
            anonymized_chunks.append(anonymized_result.text)
        except Exception as e:
            print(f"Error processing chunk {i+1}: {str(e)}")
            anonymized_chunks.append(chunk)  # Keep original if anonymization fails

    return ''.join(anonymized_chunks)
|
|
|
|
def check_requirements():
    """Verify the Presidio/spaCy stack is importable and an English model exists.

    Returns True when every required package plus at least one English spaCy
    model is available; otherwise prints installation instructions and
    returns False.
    """
    try:
        import presidio_analyzer
        import presidio_anonymizer
        import spacy
    except ImportError as e:
        print(f"✗ Missing required package: {str(e)}")
        print("Install with: pip install presidio-analyzer presidio-anonymizer spacy")
        return False

    print("✓ All required packages are installed")

    # Prefer the large English model; accept the small one with a warning.
    try:
        spacy.load("en_core_web_lg")
    except OSError:
        pass
    else:
        print("✓ en_core_web_lg model is available")
        return True

    try:
        spacy.load("en_core_web_sm")
    except OSError:
        print("✗ No English spaCy models found. Install with:")
        print("  python -m spacy download en_core_web_lg")
        return False

    print("⚠ Only en_core_web_sm model is available (en_core_web_lg recommended)")
    return True
|
|
|
|
if __name__ == "__main__":
    # Require exactly two positional arguments: input path and output path.
    if len(sys.argv) != 3:
        print("Usage: python anonymize_pii.py <input_file> <output_file>")
        print("\nThis script anonymizes PII (Personally Identifiable Information) in text files.")
        print("It detects and replaces entities like names, emails, phone numbers, etc.")
        sys.exit(1)

    # Bail out early when packages/models are missing.
    if not check_requirements():
        sys.exit(1)

    input_file, output_file = sys.argv[1], sys.argv[2]
    print(f"Processing: {input_file} -> {output_file}")
    anonymize_file(input_file, output_file)