154 lines
5.7 KiB
Python
154 lines
5.7 KiB
Python
import sys
|
|
import spacy
|
|
from presidio_analyzer import AnalyzerEngine
|
|
from presidio_analyzer.nlp_engine import SpacyNlpEngine, NlpEngineProvider
|
|
from presidio_anonymizer import AnonymizerEngine
|
|
|
|
def anonymize_file(input_path, output_path):
    """Anonymize PII in a text file using Presidio.

    Reads ``input_path`` as UTF-8 text, detects PII entities (names, emails,
    phone numbers, ...) with a Presidio ``AnalyzerEngine`` backed by a spaCy
    model, replaces them with an ``AnonymizerEngine``, and writes the result
    to ``output_path``.

    Parameters
    ----------
    input_path : str
        Path of the UTF-8 text file to anonymize.
    output_path : str
        Path the anonymized text is written to (UTF-8).

    Returns
    -------
    None
        Failures are reported on stdout; the function returns early on
        read/analysis errors instead of raising.
    """
    analyzer = _create_analyzer()
    anonymizer = AnonymizerEngine()

    # Read the whole input file up front; chunking (if needed) happens later.
    try:
        with open(input_path, 'r', encoding='utf-8') as f:
            text = f.read()
    except Exception as e:
        print(f"Error reading input file: {str(e)}")
        return

    final_text = _anonymize_text(text, analyzer, anonymizer)
    if final_text is None:
        # Single-pass analysis/anonymization failed; error already printed.
        return

    # Write output.
    try:
        with open(output_path, 'w', encoding='utf-8') as f:
            f.write(final_text)
        print(f"Anonymized content written to {output_path}")
    except Exception as e:
        print(f"Error writing output file: {str(e)}")


def _create_analyzer():
    """Build an AnalyzerEngine, falling back through progressively simpler setups.

    Order: NlpEngineProvider with en_core_web_lg -> direct SpacyNlpEngine with
    whichever English model loads -> Presidio's default AnalyzerEngine.
    """
    try:
        # Method 1: Using NlpEngineProvider (recommended).
        nlp_configuration = {
            "nlp_engine_name": "spacy",
            "models": [{"lang_code": "en", "model_name": "en_core_web_lg"}],
        }
        nlp_engine = NlpEngineProvider(nlp_configuration=nlp_configuration).create_engine()

        # Raise spaCy's max_length so large documents don't trip its limit.
        if hasattr(nlp_engine, 'nlp') and 'en' in nlp_engine.nlp:
            nlp_engine.nlp['en'].max_length = 3_000_000

        return AnalyzerEngine(nlp_engine=nlp_engine, supported_languages=["en"])
    except Exception as e:
        print(f"Error with NlpEngineProvider, trying alternative method: {str(e)}")

    # Method 2: Direct SpacyNlpEngine initialization (fallback).
    try:
        # Load a spaCy model directly, preferring the large one.
        try:
            model_name = "en_core_web_lg"
            nlp = spacy.load(model_name)
        except OSError:
            print("en_core_web_lg model not found. Trying en_core_web_sm...")
            model_name = "en_core_web_sm"
            nlp = spacy.load(model_name)
        nlp.max_length = 3_000_000

        # BUGFIX: advertise the model actually loaded. The original always
        # declared "en_core_web_lg" even when en_core_web_sm was in use.
        nlp_engine = SpacyNlpEngine(
            models=[{"lang_code": "en", "model_name": model_name}]
        )
        # Override the engine's pipeline with our pre-configured one so the
        # raised max_length takes effect.
        nlp_engine.nlp = {"en": nlp}
        return AnalyzerEngine(nlp_engine=nlp_engine, supported_languages=["en"])
    except Exception as e2:
        print(f"Error with fallback method: {str(e2)}")
        print("Falling back to default analyzer...")
        return AnalyzerEngine()


def _anonymize_text(text, analyzer, anonymizer):
    """Analyze and anonymize ``text``, splitting very large inputs into chunks.

    Returns the anonymized text, or None when single-pass processing fails.
    NOTE(review): fixed-offset chunking can split a PII entity across a chunk
    boundary, leaving that entity undetected — a best-effort trade-off kept
    from the original implementation.
    """
    max_chunk_size = 1_000_000  # 1MB chunks

    if len(text) <= max_chunk_size:
        # Process entire text at once.
        try:
            print("Analyzing text for PII...")
            results = analyzer.analyze(text=text, language='en')
            print(f"Found {len(results)} PII entities")

            print("Anonymizing text...")
            return anonymizer.anonymize(text=text, analyzer_results=results).text
        except Exception as e:
            print(f"Error during analysis/anonymization: {str(e)}")
            return None

    print(f"Text is large ({len(text)} chars), processing in chunks...")
    chunks = [text[i:i + max_chunk_size] for i in range(0, len(text), max_chunk_size)]
    anonymized_chunks = []

    for i, chunk in enumerate(chunks):
        print(f"Processing chunk {i+1}/{len(chunks)}...")
        try:
            results = analyzer.analyze(text=chunk, language='en')
            anonymized_result = anonymizer.anonymize(text=chunk, analyzer_results=results)
            anonymized_chunks.append(anonymized_result.text)
        except Exception as e:
            print(f"Error processing chunk {i+1}: {str(e)}")
            anonymized_chunks.append(chunk)  # Keep original if anonymization fails

    return ''.join(anonymized_chunks)
|
|
|
|
def check_requirements():
    """Verify the Presidio/spaCy stack is importable and an English model exists.

    Returns True when every required package plus at least one English spaCy
    model is available; otherwise prints installation instructions and
    returns False.
    """
    try:
        import presidio_analyzer
        import presidio_anonymizer
        import spacy
    except ImportError as e:
        print(f"✗ Missing required package: {str(e)}")
        print("Install with: pip install presidio-analyzer presidio-anonymizer spacy")
        return False

    print("✓ All required packages are installed")

    # Prefer the large English model; accept the small one with a warning.
    try:
        spacy.load("en_core_web_lg")
    except OSError:
        pass
    else:
        print("✓ en_core_web_lg model is available")
        return True

    try:
        spacy.load("en_core_web_sm")
    except OSError:
        print("✗ No English spaCy models found. Install with:")
        print("  python -m spacy download en_core_web_lg")
        return False

    print("⚠ Only en_core_web_sm model is available (en_core_web_lg recommended)")
    return True
|
|
|
|
if __name__ == "__main__":
    # Require exactly two positional arguments: input path and output path.
    if len(sys.argv) != 3:
        print("Usage: python anonymize_pii.py <input_file> <output_file>")
        print("\nThis script anonymizes PII (Personally Identifiable Information) in text files.")
        print("It detects and replaces entities like names, emails, phone numbers, etc.")
        sys.exit(1)

    # Bail out early when packages/models are missing.
    if not check_requirements():
        sys.exit(1)

    input_file, output_file = sys.argv[1], sys.argv[2]
    print(f"Processing: {input_file} -> {output_file}")
    anonymize_file(input_file, output_file)