anonymizer/anonymize_pii.py
Ireneusz Bachanowicz d20cf39e4a first commit
2025-07-14 17:12:50 +02:00

154 lines
5.7 KiB
Python

import sys
import spacy
from presidio_analyzer import AnalyzerEngine
from presidio_analyzer.nlp_engine import SpacyNlpEngine, NlpEngineProvider
from presidio_anonymizer import AnonymizerEngine
# Largest piece of text handed to the analyzer in a single call.
_MAX_CHUNK_SIZE = 1_000_000
# Value assigned to the spaCy pipeline's ``max_length``; kept well above the
# chunk size so a single chunk never hits spaCy's document-length limit.
_SPACY_MAX_LENGTH = 3_000_000


def _build_analyzer():
    """Create an English Presidio ``AnalyzerEngine``, trying three strategies.

    1. ``NlpEngineProvider`` configured for ``en_core_web_lg``.
    2. A spaCy model loaded directly (``en_core_web_lg``, else
       ``en_core_web_sm``) and injected into a ``SpacyNlpEngine``.
    3. ``AnalyzerEngine()`` with library defaults.

    Failures of strategies 1 and 2 are printed and the next one is tried.
    """
    try:
        nlp_configuration = {
            "nlp_engine_name": "spacy",
            "models": [{"lang_code": "en", "model_name": "en_core_web_lg"}],
        }
        nlp_engine = NlpEngineProvider(nlp_configuration=nlp_configuration).create_engine()
        # Raise the pipeline's length limit so large inputs are accepted.
        if hasattr(nlp_engine, 'nlp') and 'en' in nlp_engine.nlp:
            nlp_engine.nlp['en'].max_length = _SPACY_MAX_LENGTH
        return AnalyzerEngine(nlp_engine=nlp_engine, supported_languages=["en"])
    except Exception as e:
        print(f"Error with NlpEngineProvider, trying alternative method: {str(e)}")

    try:
        model_name = "en_core_web_lg"
        try:
            nlp = spacy.load(model_name)
        except OSError:
            print("en_core_web_lg model not found. Trying en_core_web_sm...")
            model_name = "en_core_web_sm"
            nlp = spacy.load(model_name)
        nlp.max_length = _SPACY_MAX_LENGTH
        # Configure the engine with the model that was actually loaded, so the
        # engine's configuration never names a model that is not installed.
        nlp_engine = SpacyNlpEngine(
            models=[{"lang_code": "en", "model_name": model_name}]
        )
        # Override the engine's pipeline with the pre-configured one.
        nlp_engine.nlp = {"en": nlp}
        return AnalyzerEngine(nlp_engine=nlp_engine, supported_languages=["en"])
    except Exception as e2:
        print(f"Error with fallback method: {str(e2)}")
        print("Falling back to default analyzer...")
        return AnalyzerEngine()


def _split_text(text, max_chunk_size):
    """Split ``text`` into pieces of at most ``max_chunk_size`` characters.

    Cuts are placed just after a newline (or, failing that, a space) found in
    the second half of each window, so that a token such as an e-mail address
    or phone number is not sliced in two at a chunk boundary. If the window
    holds no such separator the cut falls at the hard limit. Joining the
    returned pieces reproduces ``text`` exactly.
    """
    pieces = []
    total = len(text)
    start = 0
    while start < total:
        end = min(start + max_chunk_size, total)
        if end < total:
            # Only look in the second half of the window so chunks cannot
            # degenerate into many tiny pieces.
            floor = start + max_chunk_size // 2
            cut = text.rfind("\n", floor, end)
            if cut == -1:
                cut = text.rfind(" ", floor, end)
            if cut != -1:
                end = cut + 1  # keep the separator with the current piece
        pieces.append(text[start:end])
        start = end
    return pieces


def _write_output(output_path, text):
    """Write ``text`` to ``output_path`` as UTF-8, printing success or the error."""
    try:
        with open(output_path, 'w', encoding='utf-8') as f:
            f.write(text)
    except Exception as e:
        print(f"Error writing output file: {str(e)}")
    else:
        print(f"Anonymized content written to {output_path}")


def anonymize_file(input_path, output_path):
    """Anonymize PII in a UTF-8 text file using Presidio.

    The input is read first so that an unreadable path fails before the NLP
    model is loaded. Texts longer than ``_MAX_CHUNK_SIZE`` characters are
    processed in chunks split on whitespace; a chunk whose processing fails is
    written through unchanged (best effort) and a warning is printed.

    Args:
        input_path: Path of the text file to read.
        output_path: Path the anonymized text is written to.

    Returns:
        None. Read, analysis and write errors are printed, not raised.
    """
    try:
        with open(input_path, 'r', encoding='utf-8') as f:
            text = f.read()
    except Exception as e:
        print(f"Error reading input file: {str(e)}")
        return

    if not text:
        # Nothing to analyze: skip loading the model and emit an empty output.
        _write_output(output_path, text)
        return

    analyzer = _build_analyzer()
    anonymizer = AnonymizerEngine()

    if len(text) > _MAX_CHUNK_SIZE:
        print(f"Text is large ({len(text)} chars), processing in chunks...")
        chunks = _split_text(text, _MAX_CHUNK_SIZE)
        anonymized_chunks = []
        failed_chunks = 0
        for number, chunk in enumerate(chunks, start=1):
            print(f"Processing chunk {number}/{len(chunks)}...")
            try:
                results = analyzer.analyze(text=chunk, language='en')
                anonymized_result = anonymizer.anonymize(text=chunk, analyzer_results=results)
                anonymized_chunks.append(anonymized_result.text)
            except Exception as e:
                print(f"Error processing chunk {number}: {str(e)}")
                anonymized_chunks.append(chunk)  # Keep original if anonymization fails
                failed_chunks += 1
        if failed_chunks:
            # The output is still produced, but the user must know it is not clean.
            print(
                f"Warning: {failed_chunks} chunk(s) could not be anonymized and were "
                "written unchanged; the output may still contain PII."
            )
        final_text = ''.join(anonymized_chunks)
    else:
        try:
            print("Analyzing text for PII...")
            results = analyzer.analyze(text=text, language='en')
            print(f"Found {len(results)} PII entities")
            print("Anonymizing text...")
            anonymized_result = anonymizer.anonymize(text=text, analyzer_results=results)
            final_text = anonymized_result.text
        except Exception as e:
            print(f"Error during analysis/anonymization: {str(e)}")
            return

    _write_output(output_path, final_text)
def check_requirements():
    """Check that the required packages and an English spaCy model are installed.

    Package availability is probed by importing them. Model availability is
    probed with ``spacy.util.is_package`` rather than ``spacy.load`` so the
    (large) model is not loaded into memory just to be discarded again.

    Returns:
        bool: ``True`` when the packages import and either ``en_core_web_lg``
        or ``en_core_web_sm`` is installed; ``False`` otherwise. A diagnostic
        line is printed for every outcome.
    """
    try:
        # Imported only to verify they are installed.
        import presidio_analyzer  # noqa: F401
        import presidio_anonymizer  # noqa: F401
        from spacy.util import is_package
    except ImportError as e:
        print(f"✗ Missing required package: {str(e)}")
        print("Install with: pip install presidio-analyzer presidio-anonymizer spacy")
        return False
    print("✓ All required packages are installed")

    # Check for spaCy models, preferring the large one.
    if is_package("en_core_web_lg"):
        print("✓ en_core_web_lg model is available")
        return True
    if is_package("en_core_web_sm"):
        print("⚠ Only en_core_web_sm model is available (en_core_web_lg recommended)")
        return True
    print("✗ No English spaCy models found. Install with:")
    print(" python -m spacy download en_core_web_lg")
    return False
if __name__ == "__main__":
    # Script entry point: expects exactly two positional arguments.
    cli_args = sys.argv[1:]
    if len(cli_args) != 2:
        usage_lines = (
            "Usage: python anonymize_pii.py <input_file> <output_file>",
            "\nThis script anonymizes PII (Personally Identifiable Information) in text files.",
            "It detects and replaces entities like names, emails, phone numbers, etc.",
        )
        for usage_line in usage_lines:
            print(usage_line)
        sys.exit(1)

    # Abort before doing any work when packages or models are missing.
    requirements_ok = check_requirements()
    if not requirements_ok:
        sys.exit(1)

    input_file, output_file = cli_args
    print(f"Processing: {input_file} -> {output_file}")
    anonymize_file(input_file, output_file)