import sys

import spacy
from presidio_analyzer import AnalyzerEngine
from presidio_analyzer.nlp_engine import SpacyNlpEngine, NlpEngineProvider
from presidio_anonymizer import AnonymizerEngine

# Chunk size (chars) for splitting very large inputs. NOTE(review): a PII
# entity that straddles a chunk boundary can be missed -- best-effort trade-off.
MAX_CHUNK_SIZE = 1_000_000
# spaCy refuses texts longer than nlp.max_length; raise the cap for big files.
SPACY_MAX_LENGTH = 3_000_000


def _build_analyzer():
    """Build an AnalyzerEngine, trying progressively simpler configurations.

    Order of attempts:
      1. NlpEngineProvider configured for en_core_web_lg (recommended path).
      2. Direct SpacyNlpEngine, falling back to en_core_web_sm if the large
         model is not installed.
      3. A default AnalyzerEngine with no custom NLP engine.

    Never raises; always returns a usable AnalyzerEngine.
    """
    # Method 1: NlpEngineProvider (recommended).
    try:
        nlp_configuration = {
            "nlp_engine_name": "spacy",
            "models": [{"lang_code": "en", "model_name": "en_core_web_lg"}],
        }
        nlp_engine = NlpEngineProvider(nlp_configuration=nlp_configuration).create_engine()
        # Raise spaCy's text-length cap so large documents don't error out.
        if hasattr(nlp_engine, 'nlp') and 'en' in nlp_engine.nlp:
            nlp_engine.nlp['en'].max_length = SPACY_MAX_LENGTH
        return AnalyzerEngine(nlp_engine=nlp_engine, supported_languages=["en"])
    except Exception as e:
        print(f"Error with NlpEngineProvider, trying alternative method: {str(e)}")

    # Method 2: direct SpacyNlpEngine initialization (fallback).
    try:
        # Load the spaCy model directly first.
        try:
            model_name = "en_core_web_lg"
            nlp = spacy.load(model_name)
        except OSError:
            print("en_core_web_lg model not found. Trying en_core_web_sm...")
            model_name = "en_core_web_sm"
            nlp = spacy.load(model_name)
        nlp.max_length = SPACY_MAX_LENGTH
        # BUGFIX: register the model that was actually loaded (the original
        # always declared en_core_web_lg even after falling back to the
        # small model).
        nlp_engine = SpacyNlpEngine(
            models=[{"lang_code": "en", "model_name": model_name}]
        )
        # Override the engine's model with our pre-configured instance.
        nlp_engine.nlp = {"en": nlp}
        return AnalyzerEngine(nlp_engine=nlp_engine, supported_languages=["en"])
    except Exception as e2:
        print(f"Error with fallback method: {str(e2)}")
        print("Falling back to default analyzer...")
        return AnalyzerEngine()


def _anonymize_text(analyzer, anonymizer, text):
    """Detect and anonymize PII in *text*; return the anonymized string.

    Inputs longer than MAX_CHUNK_SIZE are processed chunk by chunk; a chunk
    whose processing fails is kept verbatim (best-effort). Returns None when
    whole-text processing fails, preserving the original early-return
    behavior of the caller.
    """
    if len(text) > MAX_CHUNK_SIZE:
        print(f"Text is large ({len(text)} chars), processing in chunks...")
        chunks = [text[i:i + MAX_CHUNK_SIZE]
                  for i in range(0, len(text), MAX_CHUNK_SIZE)]
        anonymized_chunks = []
        for i, chunk in enumerate(chunks):
            print(f"Processing chunk {i+1}/{len(chunks)}...")
            try:
                results = analyzer.analyze(text=chunk, language='en')
                anonymized_result = anonymizer.anonymize(
                    text=chunk, analyzer_results=results
                )
                anonymized_chunks.append(anonymized_result.text)
            except Exception as e:
                print(f"Error processing chunk {i+1}: {str(e)}")
                # Keep the original chunk if anonymization fails.
                anonymized_chunks.append(chunk)
        return ''.join(anonymized_chunks)

    # Small enough: process the entire text at once.
    try:
        print("Analyzing text for PII...")
        results = analyzer.analyze(text=text, language='en')
        print(f"Found {len(results)} PII entities")
        print("Anonymizing text...")
        anonymized_result = anonymizer.anonymize(text=text, analyzer_results=results)
        return anonymized_result.text
    except Exception as e:
        print(f"Error during analysis/anonymization: {str(e)}")
        return None


def anonymize_file(input_path, output_path):
    """Anonymize PII in a text file using Presidio.

    Reads *input_path* (UTF-8), detects and replaces PII entities (names,
    emails, phone numbers, etc.), and writes the result to *output_path*.
    All errors are reported to stdout; the function always returns None.
    """
    analyzer = _build_analyzer()
    anonymizer = AnonymizerEngine()

    # Read input file.
    try:
        with open(input_path, 'r', encoding='utf-8') as f:
            text = f.read()
    except Exception as e:
        print(f"Error reading input file: {str(e)}")
        return

    final_text = _anonymize_text(analyzer, anonymizer, text)
    if final_text is None:
        return

    # Write output.
    try:
        with open(output_path, 'w', encoding='utf-8') as f:
            f.write(final_text)
        print(f"Anonymized content written to {output_path}")
    except Exception as e:
        print(f"Error writing output file: {str(e)}")


def check_requirements():
    """Check if required models and packages are installed.

    Returns True when the Presidio packages and at least one English spaCy
    model are available, False otherwise.
    """
    try:
        import presidio_analyzer  # noqa: F401 -- availability probe only
        import presidio_anonymizer  # noqa: F401 -- availability probe only
        import spacy
        print("✓ All required packages are installed")

        # Prefer the large model; the small one works but is less accurate.
        try:
            spacy.load("en_core_web_lg")
            print("✓ en_core_web_lg model is available")
        except OSError:
            try:
                spacy.load("en_core_web_sm")
                print("⚠ Only en_core_web_sm model is available (en_core_web_lg recommended)")
            except OSError:
                print("✗ No English spaCy models found. Install with:")
                print("  python -m spacy download en_core_web_lg")
                return False
        return True
    except ImportError as e:
        print(f"✗ Missing required package: {str(e)}")
        print("Install with: pip install presidio-analyzer presidio-anonymizer spacy")
        return False


if __name__ == "__main__":
    if len(sys.argv) != 3:
        # BUGFIX: the usage line had lost its argument placeholders even
        # though exactly two arguments are required below.
        print("Usage: python anonymize_pii.py <input_file> <output_file>")
        print("\nThis script anonymizes PII (Personally Identifiable Information) in text files.")
        print("It detects and replaces entities like names, emails, phone numbers, etc.")
        sys.exit(1)

    if not check_requirements():
        sys.exit(1)

    input_file = sys.argv[1]
    output_file = sys.argv[2]
    print(f"Processing: {input_file} -> {output_file}")
    anonymize_file(input_file, output_file)