first commit
commit d20cf39e4a
3458  HUB_dummy-data_test_clean.json  Normal file
File diff suppressed because one or more lines are too long
3458  HUB_nohtml.txt  Normal file
File diff suppressed because one or more lines are too long
154  anonymize_pii.py  Normal file
@@ -0,0 +1,154 @@
import sys
import spacy
from presidio_analyzer import AnalyzerEngine
from presidio_analyzer.nlp_engine import SpacyNlpEngine, NlpEngineProvider
from presidio_anonymizer import AnonymizerEngine

def anonymize_file(input_path, output_path):
    """
    Anonymize PII in a text file using Presidio
    """
    try:
        # Method 1: Using NlpEngineProvider (recommended)
        nlp_configuration = {
            "nlp_engine_name": "spacy",
            "models": [{"lang_code": "en", "model_name": "en_core_web_lg"}],
        }

        nlp_engine = NlpEngineProvider(nlp_configuration=nlp_configuration).create_engine()

        # Set max_length for handling large documents
        if hasattr(nlp_engine, 'nlp') and 'en' in nlp_engine.nlp:
            nlp_engine.nlp['en'].max_length = 3_000_000

        # Initialize AnalyzerEngine
        analyzer = AnalyzerEngine(nlp_engine=nlp_engine, supported_languages=["en"])

    except Exception as e:
        print(f"Error with NlpEngineProvider, trying alternative method: {str(e)}")

        # Method 2: Direct SpacyNlpEngine initialization (fallback)
        try:
            # Load spaCy model directly first
            try:
                nlp = spacy.load("en_core_web_lg")
                nlp.max_length = 3_000_000
            except OSError:
                print("en_core_web_lg model not found. Trying en_core_web_sm...")
                nlp = spacy.load("en_core_web_sm")
                nlp.max_length = 3_000_000

            # Create SpacyNlpEngine with the loaded model
            nlp_engine = SpacyNlpEngine(
                models=[{
                    "lang_code": "en",
                    "model_name": "en_core_web_lg"
                }]
            )

            # Override the model with our pre-configured one
            nlp_engine.nlp = {"en": nlp}

            analyzer = AnalyzerEngine(nlp_engine=nlp_engine, supported_languages=["en"])

        except Exception as e2:
            print(f"Error with fallback method: {str(e2)}")
            print("Falling back to default analyzer...")
            analyzer = AnalyzerEngine()

    # Initialize anonymizer
    anonymizer = AnonymizerEngine()

    # Read input file
    try:
        with open(input_path, 'r', encoding='utf-8') as f:
            text = f.read()
    except Exception as e:
        print(f"Error reading input file: {str(e)}")
        return

    # Check if text is too long and split if necessary
    max_chunk_size = 1_000_000  # 1MB chunks
    if len(text) > max_chunk_size:
        print(f"Text is large ({len(text)} chars), processing in chunks...")
        chunks = [text[i:i+max_chunk_size] for i in range(0, len(text), max_chunk_size)]
        anonymized_chunks = []

        for i, chunk in enumerate(chunks):
            print(f"Processing chunk {i+1}/{len(chunks)}...")
            try:
                results = analyzer.analyze(text=chunk, language='en')
                anonymized_result = anonymizer.anonymize(text=chunk, analyzer_results=results)
                anonymized_chunks.append(anonymized_result.text)
            except Exception as e:
                print(f"Error processing chunk {i+1}: {str(e)}")
                anonymized_chunks.append(chunk)  # Keep original if anonymization fails

        final_text = ''.join(anonymized_chunks)
    else:
        # Process entire text at once
        try:
            print("Analyzing text for PII...")
            results = analyzer.analyze(text=text, language='en')
            print(f"Found {len(results)} PII entities")

            print("Anonymizing text...")
            anonymized_result = anonymizer.anonymize(text=text, analyzer_results=results)
            final_text = anonymized_result.text

        except Exception as e:
            print(f"Error during analysis/anonymization: {str(e)}")
            return

    # Write output
    try:
        with open(output_path, 'w', encoding='utf-8') as f:
            f.write(final_text)
        print(f"Anonymized content written to {output_path}")

    except Exception as e:
        print(f"Error writing output file: {str(e)}")

def check_requirements():
    """Check if required models and packages are installed"""
    try:
        import presidio_analyzer
        import presidio_anonymizer
        import spacy
        print("✓ All required packages are installed")

        # Check for spaCy models
        try:
            spacy.load("en_core_web_lg")
            print("✓ en_core_web_lg model is available")
        except OSError:
            try:
                spacy.load("en_core_web_sm")
                print("⚠ Only en_core_web_sm model is available (en_core_web_lg recommended)")
            except OSError:
                print("✗ No English spaCy models found. Install with:")
                print("  python -m spacy download en_core_web_lg")
                return False

        return True

    except ImportError as e:
        print(f"✗ Missing required package: {str(e)}")
        print("Install with: pip install presidio-analyzer presidio-anonymizer spacy")
        return False

if __name__ == "__main__":
    if len(sys.argv) != 3:
        print("Usage: python anonymize_pii.py <input_file> <output_file>")
        print("\nThis script anonymizes PII (Personally Identifiable Information) in text files.")
        print("It detects and replaces entities like names, emails, phone numbers, etc.")
        sys.exit(1)

    if not check_requirements():
        sys.exit(1)

    input_file = sys.argv[1]
    output_file = sys.argv[2]

    print(f"Processing: {input_file} -> {output_file}")
    anonymize_file(input_file, output_file)
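For a quick sanity check of the analyze-then-anonymize pipeline that anonymize_pii.py builds, the same two Presidio engines can be run on a short string. This is a minimal sketch assuming Presidio's default anonymization operators (which replace each detected span with its entity type); the sample sentence and expected output are illustrative only and are not part of this commit:

    from presidio_analyzer import AnalyzerEngine
    from presidio_anonymizer import AnonymizerEngine

    # Build the default engines (no custom spaCy configuration) and run the
    # same analyze -> anonymize sequence the script applies to each chunk.
    analyzer = AnalyzerEngine()
    anonymizer = AnonymizerEngine()

    sample = "Contact Jane Doe at jane.doe@example.com or 555-123-4567."  # made-up PII
    results = analyzer.analyze(text=sample, language="en")
    print(anonymizer.anonymize(text=sample, analyzer_results=results).text)
    # Expected shape (exact entity labels depend on the spaCy model):
    # "Contact <PERSON> at <EMAIL_ADDRESS> or <PHONE_NUMBER>."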
24  bs.py  Normal file
@@ -0,0 +1,24 @@
import sys
from bs4 import BeautifulSoup

def remove_html_tags(input_file, output_file):
    # Read the HTML content from the input file
    with open(input_file, 'r', encoding='utf-8') as f:
        html_content = f.read()

    # Use BeautifulSoup to parse and extract text
    soup = BeautifulSoup(html_content, 'html.parser')
    text = soup.get_text()

    # Write the plain text to the output file
    with open(output_file, 'w', encoding='utf-8') as f:
        f.write(text)

if __name__ == "__main__":
    if len(sys.argv) != 3:
        print("Usage: python bs.py <input_file> <output_file>")
    else:
        input_file = sys.argv[1]
        output_file = sys.argv[2]
        remove_html_tags(input_file, output_file)
        print(f"HTML tags removed. Output written to {output_file}")
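One thing worth noting about bs.py: soup.get_text() with no arguments joins text nodes with no separator, so text from adjacent tags can run together. A hedged variant using BeautifulSoup's standard separator/strip keyword arguments (the sample HTML below is invented, not from this repository):

    from bs4 import BeautifulSoup

    # Same extraction bs.py performs, but with an explicit separator so adjacent
    # elements land on their own lines and surrounding whitespace is trimmed.
    html_content = "<html><body><h1>Title</h1><p>First paragraph.</p><p>Second.</p></body></html>"
    soup = BeautifulSoup(html_content, 'html.parser')
    print(soup.get_text(separator='\n', strip=True))
    # Title
    # First paragraph.
    # Second.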
16  custom payload JIRA.json  Normal file
File diff suppressed because one or more lines are too long
16  custom_output.json  Normal file
File diff suppressed because one or more lines are too long
3458  output.txt  Normal file
File diff suppressed because one or more lines are too long
62  strip.py  Normal file
@@ -0,0 +1,62 @@
import json
from bs4 import BeautifulSoup
from presidio_analyzer import AnalyzerEngine
from presidio_anonymizer import AnonymizerEngine

analyzer = AnalyzerEngine()
anonymizer = AnonymizerEngine()

def redact_text(text_content):
    # Analyze the text for PII
    results = analyzer.analyze(text=text_content, language='en')
    # Anonymize the detected PII
    anonymized_result = anonymizer.anonymize(text=text_content, analyzer_results=results)
    return anonymized_result.text

def process_json_file(file_path):
    with open(file_path, 'r', encoding='utf-8') as f:
        data = json.load(f)

    # Recursive function to redact strings in JSON
    def redact_json_values(obj):
        if isinstance(obj, dict):
            return {k: redact_json_values(v) for k, v in obj.items()}
        elif isinstance(obj, list):
            return [redact_json_values(elem) for elem in obj]
        elif isinstance(obj, str):
            return redact_text(obj)
        else:
            return obj

    redacted_data = redact_json_values(data)

    with open(f"redacted_{file_path}", 'w', encoding='utf-8') as f:
        json.dump(redacted_data, f, indent=2)
    print(f"Redacted JSON saved to redacted_{file_path}")

def process_html_file(file_path):
    with open(file_path, 'r', encoding='utf-8') as f:
        soup = BeautifulSoup(f, 'html.parser')

    # Find all text nodes (excluding script and style)
    for text_node in soup.find_all(string=True):
        if text_node.parent.name not in ['script', 'style', 'head', 'meta', '[document]']:
            original_text = str(text_node)
            redacted_text = redact_text(original_text)
            text_node.replace_with(redacted_text)  # Replace the text in the soup object

    # You might also want to check attributes for PII
    # Example: check 'alt' attributes, 'title' attributes, etc.
    for tag in soup.find_all(True):  # Iterate through all tags
        for attr, value in tag.attrs.items():
            if isinstance(value, str):
                redacted_attr_value = redact_text(value)
                tag.attrs[attr] = redacted_attr_value

    with open(f"redacted_{file_path}", 'w', encoding='utf-8') as f:
        f.write(str(soup))
    print(f"Redacted HTML saved to redacted_{file_path}")

# Example usage:
# process_json_file("your_data.json")
# process_html_file("your_webpage.html")
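If straight placeholder substitution is not the desired behaviour, Presidio's AnonymizerEngine.anonymize() also accepts an operators mapping. Below is a minimal sketch of how strip.py's redact_text() could mask every detection with one fixed token, assuming the documented OperatorConfig API; the "[REDACTED]" token and the sample input are illustrative choices, not part of this commit:

    from presidio_analyzer import AnalyzerEngine
    from presidio_anonymizer import AnonymizerEngine
    from presidio_anonymizer.entities import OperatorConfig

    analyzer = AnalyzerEngine()
    anonymizer = AnonymizerEngine()

    def redact_text_masked(text_content):
        # Same analysis step as strip.py, but the DEFAULT operator replaces every
        # detected entity with the fixed string "[REDACTED]" instead of its type.
        results = analyzer.analyze(text=text_content, language='en')
        return anonymizer.anonymize(
            text=text_content,
            analyzer_results=results,
            operators={"DEFAULT": OperatorConfig("replace", {"new_value": "[REDACTED]"})},
        ).text

    print(redact_text_masked("Call Alice Smith on 555-0100."))  # made-up sample input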