first commit

Ireneusz Bachanowicz 2025-07-14 17:12:50 +02:00
commit d20cf39e4a
8 changed files with 10646 additions and 0 deletions

File diff suppressed because one or more lines are too long

3458 HUB_nohtml.txt Normal file

File diff suppressed because one or more lines are too long

154 anonymize_pii.py Normal file

@@ -0,0 +1,154 @@
import sys

import spacy
from presidio_analyzer import AnalyzerEngine
from presidio_analyzer.nlp_engine import SpacyNlpEngine, NlpEngineProvider
from presidio_anonymizer import AnonymizerEngine


def anonymize_file(input_path, output_path):
    """
    Anonymize PII in a text file using Presidio.
    """
    try:
        # Method 1: Using NlpEngineProvider (recommended)
        nlp_configuration = {
            "nlp_engine_name": "spacy",
            "models": [{"lang_code": "en", "model_name": "en_core_web_lg"}],
        }
        nlp_engine = NlpEngineProvider(nlp_configuration=nlp_configuration).create_engine()

        # Raise spaCy's max_length so large documents can be processed
        if hasattr(nlp_engine, 'nlp') and 'en' in nlp_engine.nlp:
            nlp_engine.nlp['en'].max_length = 3_000_000

        # Initialize AnalyzerEngine
        analyzer = AnalyzerEngine(nlp_engine=nlp_engine, supported_languages=["en"])
    except Exception as e:
        print(f"Error with NlpEngineProvider, trying alternative method: {e}")
        # Method 2: Direct SpacyNlpEngine initialization (fallback)
        try:
            # Load the spaCy model directly first
            try:
                nlp = spacy.load("en_core_web_lg")
            except OSError:
                print("en_core_web_lg model not found. Trying en_core_web_sm...")
                nlp = spacy.load("en_core_web_sm")
            nlp.max_length = 3_000_000

            # Create SpacyNlpEngine, then override its model with the
            # pre-configured one loaded above
            nlp_engine = SpacyNlpEngine(
                models=[{"lang_code": "en", "model_name": "en_core_web_lg"}]
            )
            nlp_engine.nlp = {"en": nlp}
            analyzer = AnalyzerEngine(nlp_engine=nlp_engine, supported_languages=["en"])
        except Exception as e2:
            print(f"Error with fallback method: {e2}")
            print("Falling back to default analyzer...")
            analyzer = AnalyzerEngine()

    # Initialize anonymizer
    anonymizer = AnonymizerEngine()

    # Read input file
    try:
        with open(input_path, 'r', encoding='utf-8') as f:
            text = f.read()
    except Exception as e:
        print(f"Error reading input file: {e}")
        return

    # Check whether the text is too long and split it if necessary.
    # Note: splitting at fixed offsets can cut a PII entity in half at a
    # chunk boundary, so entities straddling two chunks may be missed.
    max_chunk_size = 1_000_000  # ~1M characters per chunk
    if len(text) > max_chunk_size:
        print(f"Text is large ({len(text)} chars), processing in chunks...")
        chunks = [text[i:i + max_chunk_size] for i in range(0, len(text), max_chunk_size)]
        anonymized_chunks = []
        for i, chunk in enumerate(chunks):
            print(f"Processing chunk {i + 1}/{len(chunks)}...")
            try:
                results = analyzer.analyze(text=chunk, language='en')
                anonymized_result = anonymizer.anonymize(text=chunk, analyzer_results=results)
                anonymized_chunks.append(anonymized_result.text)
            except Exception as e:
                print(f"Error processing chunk {i + 1}: {e}")
                anonymized_chunks.append(chunk)  # Keep the original if anonymization fails
        final_text = ''.join(anonymized_chunks)
    else:
        # Process the entire text at once
        try:
            print("Analyzing text for PII...")
            results = analyzer.analyze(text=text, language='en')
            print(f"Found {len(results)} PII entities")
            print("Anonymizing text...")
            anonymized_result = anonymizer.anonymize(text=text, analyzer_results=results)
            final_text = anonymized_result.text
        except Exception as e:
            print(f"Error during analysis/anonymization: {e}")
            return

    # Write output
    try:
        with open(output_path, 'w', encoding='utf-8') as f:
            f.write(final_text)
        print(f"Anonymized content written to {output_path}")
    except Exception as e:
        print(f"Error writing output file: {e}")


def check_requirements():
    """Check whether the required packages and spaCy models are installed."""
    try:
        import presidio_analyzer  # noqa: F401
        import presidio_anonymizer  # noqa: F401
        import spacy
        print("✓ All required packages are installed")

        # Check for an English spaCy model
        try:
            spacy.load("en_core_web_lg")
            print("✓ en_core_web_lg model is available")
        except OSError:
            try:
                spacy.load("en_core_web_sm")
                print("⚠ Only en_core_web_sm model is available (en_core_web_lg recommended)")
            except OSError:
                print("✗ No English spaCy models found. Install with:")
                print("  python -m spacy download en_core_web_lg")
                return False
        return True
    except ImportError as e:
        print(f"✗ Missing required package: {e}")
        print("Install with: pip install presidio-analyzer presidio-anonymizer spacy")
        return False


if __name__ == "__main__":
    if len(sys.argv) != 3:
        print("Usage: python anonymize_pii.py <input_file> <output_file>")
        print("\nThis script anonymizes PII (Personally Identifiable Information) in text files.")
        print("It detects and replaces entities such as names, email addresses, and phone numbers.")
        sys.exit(1)

    if not check_requirements():
        sys.exit(1)

    input_file = sys.argv[1]
    output_file = sys.argv[2]
    print(f"Processing: {input_file} -> {output_file}")
    anonymize_file(input_file, output_file)
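
A note on the output format: by default Presidio replaces each detected entity with its type label, e.g. <PERSON> or <EMAIL_ADDRESS>. If uniform redaction is preferred, the anonymizer accepts per-entity operators. A minimal sketch, assuming Presidio's documented OperatorConfig API (the sample text is made up for illustration):

    from presidio_analyzer import AnalyzerEngine
    from presidio_anonymizer import AnonymizerEngine
    from presidio_anonymizer.entities import OperatorConfig

    analyzer = AnalyzerEngine()
    anonymizer = AnonymizerEngine()

    sample = "Contact Jane Doe at jane.doe@example.com"  # hypothetical input
    results = analyzer.analyze(text=sample, language='en')

    # "DEFAULT" applies the operator to every entity type detected
    redacted = anonymizer.anonymize(
        text=sample,
        analyzer_results=results,
        operators={"DEFAULT": OperatorConfig("replace", {"new_value": "[REDACTED]"})},
    )
    print(redacted.text)  # e.g. Contact [REDACTED] at [REDACTED]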

24 bs.py Normal file

@@ -0,0 +1,24 @@
import sys

from bs4 import BeautifulSoup


def remove_html_tags(input_file, output_file):
    # Read the HTML content from the input file
    with open(input_file, 'r', encoding='utf-8') as f:
        html_content = f.read()

    # Use BeautifulSoup to parse the markup and extract the plain text
    soup = BeautifulSoup(html_content, 'html.parser')
    text = soup.get_text()

    # Write the plain text to the output file
    with open(output_file, 'w', encoding='utf-8') as f:
        f.write(text)


if __name__ == "__main__":
    if len(sys.argv) != 3:
        print("Usage: python bs.py <input_file> <output_file>")
    else:
        input_file = sys.argv[1]
        output_file = sys.argv[2]
        remove_html_tags(input_file, output_file)
        print(f"HTML tags removed. Output written to {output_file}")
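
Worth noting: soup.get_text() with no arguments concatenates adjacent text nodes directly, so words from neighbouring tags can run together in the output. A small sketch of the alternative, using BeautifulSoup's documented separator and strip parameters (toy markup for illustration):

    from bs4 import BeautifulSoup

    html = "<p>Hello</p><p>world</p>"  # hypothetical input
    soup = BeautifulSoup(html, 'html.parser')

    print(repr(soup.get_text()))                            # 'Helloworld' - runs together
    print(repr(soup.get_text(separator='\n', strip=True)))  # 'Hello\nworld'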

16 custom payload JIRA.json Normal file

File diff suppressed because one or more lines are too long

16 custom_output.json Normal file

File diff suppressed because one or more lines are too long

3458 output.txt Normal file

File diff suppressed because one or more lines are too long

62 strip.py Normal file

@@ -0,0 +1,62 @@
import json

from bs4 import BeautifulSoup
from presidio_analyzer import AnalyzerEngine
from presidio_anonymizer import AnonymizerEngine

analyzer = AnalyzerEngine()
anonymizer = AnonymizerEngine()


def redact_text(text_content):
    # Analyze the text for PII
    results = analyzer.analyze(text=text_content, language='en')
    # Anonymize the detected PII
    anonymized_result = anonymizer.anonymize(text=text_content, analyzer_results=results)
    return anonymized_result.text


def process_json_file(file_path):
    with open(file_path, 'r', encoding='utf-8') as f:
        data = json.load(f)

    # Recursively redact every string value in the JSON structure
    def redact_json_values(obj):
        if isinstance(obj, dict):
            return {k: redact_json_values(v) for k, v in obj.items()}
        elif isinstance(obj, list):
            return [redact_json_values(elem) for elem in obj]
        elif isinstance(obj, str):
            return redact_text(obj)
        else:
            return obj

    redacted_data = redact_json_values(data)

    with open(f"redacted_{file_path}", 'w', encoding='utf-8') as f:
        json.dump(redacted_data, f, indent=2)
    print(f"Redacted JSON saved to redacted_{file_path}")


def process_html_file(file_path):
    with open(file_path, 'r', encoding='utf-8') as f:
        soup = BeautifulSoup(f, 'html.parser')

    # Find all text nodes (excluding script, style, and document metadata)
    for text_node in soup.find_all(string=True):
        if text_node.parent.name not in ['script', 'style', 'head', 'meta', '[document]']:
            original_text = str(text_node)
            redacted_text = redact_text(original_text)
            text_node.replace_with(redacted_text)  # Replace the text in the soup object

    # Attributes can also carry PII ('alt', 'title', etc.), so redact
    # every string-valued attribute as well
    for tag in soup.find_all(True):  # Iterate through all tags
        for attr, value in tag.attrs.items():
            if isinstance(value, str):
                tag.attrs[attr] = redact_text(value)

    with open(f"redacted_{file_path}", 'w', encoding='utf-8') as f:
        f.write(str(soup))
    print(f"Redacted HTML saved to redacted_{file_path}")

# Example usage:
# process_json_file("your_data.json")
# process_html_file("your_webpage.html")
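
One caveat with the f"redacted_{file_path}" output naming: when file_path includes directories, the prefix lands on the directory name (redacted_data/file.json) rather than on the file name. A small standard-library sketch that keeps the output next to the input (redacted_path is a hypothetical helper name, not part of the script above):

    from pathlib import Path

    def redacted_path(file_path):
        # Prefix the file name, not the whole path
        p = Path(file_path)
        return str(p.with_name(f"redacted_{p.name}"))

    print(redacted_path("data/file.json"))  # data/redacted_file.json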