first commit
commit d20cf39e4a
3458  HUB_dummy-data_test_clean.json  Normal file
File diff suppressed because one or more lines are too long
3458  HUB_nohtml.txt  Normal file
File diff suppressed because one or more lines are too long
154  anonymize_pii.py  Normal file
@@ -0,0 +1,154 @@
import sys
import spacy
from presidio_analyzer import AnalyzerEngine
from presidio_analyzer.nlp_engine import SpacyNlpEngine, NlpEngineProvider
from presidio_anonymizer import AnonymizerEngine

def anonymize_file(input_path, output_path):
    """
    Anonymize PII in a text file using Presidio
    """
    try:
        # Method 1: Using NlpEngineProvider (recommended)
        nlp_configuration = {
            "nlp_engine_name": "spacy",
            "models": [{"lang_code": "en", "model_name": "en_core_web_lg"}],
        }

        nlp_engine = NlpEngineProvider(nlp_configuration=nlp_configuration).create_engine()

        # Set max_length for handling large documents
        if hasattr(nlp_engine, 'nlp') and 'en' in nlp_engine.nlp:
            nlp_engine.nlp['en'].max_length = 3_000_000

        # Initialize AnalyzerEngine
        analyzer = AnalyzerEngine(nlp_engine=nlp_engine, supported_languages=["en"])

    except Exception as e:
        print(f"Error with NlpEngineProvider, trying alternative method: {str(e)}")

        # Method 2: Direct SpacyNlpEngine initialization (fallback)
        try:
            # Load spaCy model directly first
            try:
                nlp = spacy.load("en_core_web_lg")
                nlp.max_length = 3_000_000
            except OSError:
                print("en_core_web_lg model not found. Trying en_core_web_sm...")
                nlp = spacy.load("en_core_web_sm")
                nlp.max_length = 3_000_000

            # Create SpacyNlpEngine with the loaded model
            nlp_engine = SpacyNlpEngine(
                models=[{
                    "lang_code": "en",
                    "model_name": "en_core_web_lg"
                }]
            )

            # Override the model with our pre-configured one
            nlp_engine.nlp = {"en": nlp}

            analyzer = AnalyzerEngine(nlp_engine=nlp_engine, supported_languages=["en"])

        except Exception as e2:
            print(f"Error with fallback method: {str(e2)}")
            print("Falling back to default analyzer...")
            analyzer = AnalyzerEngine()

    # Initialize anonymizer
    anonymizer = AnonymizerEngine()

    # Read input file
    try:
        with open(input_path, 'r', encoding='utf-8') as f:
            text = f.read()
    except Exception as e:
        print(f"Error reading input file: {str(e)}")
        return

    # Check if text is too long and split if necessary
    max_chunk_size = 1_000_000  # 1MB chunks
    if len(text) > max_chunk_size:
        print(f"Text is large ({len(text)} chars), processing in chunks...")
        chunks = [text[i:i+max_chunk_size] for i in range(0, len(text), max_chunk_size)]
        anonymized_chunks = []

        for i, chunk in enumerate(chunks):
            print(f"Processing chunk {i+1}/{len(chunks)}...")
            try:
                results = analyzer.analyze(text=chunk, language='en')
                anonymized_result = anonymizer.anonymize(text=chunk, analyzer_results=results)
                anonymized_chunks.append(anonymized_result.text)
            except Exception as e:
                print(f"Error processing chunk {i+1}: {str(e)}")
                anonymized_chunks.append(chunk)  # Keep original if anonymization fails

        final_text = ''.join(anonymized_chunks)
    else:
        # Process entire text at once
        try:
            print("Analyzing text for PII...")
            results = analyzer.analyze(text=text, language='en')
            print(f"Found {len(results)} PII entities")

            print("Anonymizing text...")
            anonymized_result = anonymizer.anonymize(text=text, analyzer_results=results)
            final_text = anonymized_result.text

        except Exception as e:
            print(f"Error during analysis/anonymization: {str(e)}")
            return

    # Write output
    try:
        with open(output_path, 'w', encoding='utf-8') as f:
            f.write(final_text)
        print(f"Anonymized content written to {output_path}")

    except Exception as e:
        print(f"Error writing output file: {str(e)}")

def check_requirements():
    """Check if required models and packages are installed"""
    try:
        import presidio_analyzer
        import presidio_anonymizer
        import spacy
        print("✓ All required packages are installed")

        # Check for spaCy models
        try:
            spacy.load("en_core_web_lg")
            print("✓ en_core_web_lg model is available")
        except OSError:
            try:
                spacy.load("en_core_web_sm")
                print("⚠ Only en_core_web_sm model is available (en_core_web_lg recommended)")
            except OSError:
                print("✗ No English spaCy models found. Install with:")
                print("  python -m spacy download en_core_web_lg")
                return False

        return True

    except ImportError as e:
        print(f"✗ Missing required package: {str(e)}")
        print("Install with: pip install presidio-analyzer presidio-anonymizer spacy")
        return False

if __name__ == "__main__":
    if len(sys.argv) != 3:
        print("Usage: python anonymize_pii.py <input_file> <output_file>")
        print("\nThis script anonymizes PII (Personally Identifiable Information) in text files.")
        print("It detects and replaces entities like names, emails, phone numbers, etc.")
        sys.exit(1)

    if not check_requirements():
        sys.exit(1)

    input_file = sys.argv[1]
    output_file = sys.argv[2]

    print(f"Processing: {input_file} -> {output_file}")
    anonymize_file(input_file, output_file)
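For a quick sanity check of the analyze-then-anonymize pipeline that anonymize_pii.py builds, the same two Presidio engines can be run on a short string. This is a minimal sketch assuming Presidio's default anonymization operators (which replace each detected span with its entity type); the sample sentence and expected output are illustrative only and are not part of this commit:

    from presidio_analyzer import AnalyzerEngine
    from presidio_anonymizer import AnonymizerEngine

    # Build the default engines (no custom spaCy configuration) and run the
    # same analyze -> anonymize sequence the script applies to each chunk.
    analyzer = AnalyzerEngine()
    anonymizer = AnonymizerEngine()

    sample = "Contact Jane Doe at jane.doe@example.com or 555-123-4567."  # made-up PII
    results = analyzer.analyze(text=sample, language="en")
    print(anonymizer.anonymize(text=sample, analyzer_results=results).text)
    # Expected shape (exact entity labels depend on the spaCy model):
    # "Contact <PERSON> at <EMAIL_ADDRESS> or <PHONE_NUMBER>."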
24  bs.py  Normal file
@@ -0,0 +1,24 @@
import sys
from bs4 import BeautifulSoup

def remove_html_tags(input_file, output_file):
    # Read the HTML content from the input file
    with open(input_file, 'r', encoding='utf-8') as f:
        html_content = f.read()

    # Use BeautifulSoup to parse and extract text
    soup = BeautifulSoup(html_content, 'html.parser')
    text = soup.get_text()

    # Write the plain text to the output file
    with open(output_file, 'w', encoding='utf-8') as f:
        f.write(text)

if __name__ == "__main__":
    if len(sys.argv) != 3:
        print("Usage: python bs.py <input_file> <output_file>")
    else:
        input_file = sys.argv[1]
        output_file = sys.argv[2]
        remove_html_tags(input_file, output_file)
        print(f"HTML tags removed. Output written to {output_file}")
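One thing worth noting about bs.py: soup.get_text() with no arguments joins text nodes with no separator, so text from adjacent tags can run together. A hedged variant using BeautifulSoup's standard separator/strip keyword arguments (the sample HTML below is invented, not from this repository):

    from bs4 import BeautifulSoup

    # Same extraction bs.py performs, but with an explicit separator so adjacent
    # elements land on their own lines and surrounding whitespace is trimmed.
    html_content = "<html><body><h1>Title</h1><p>First paragraph.</p><p>Second.</p></body></html>"
    soup = BeautifulSoup(html_content, 'html.parser')
    print(soup.get_text(separator='\n', strip=True))
    # Title
    # First paragraph.
    # Second.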
16  custom payload JIRA.json  Normal file
File diff suppressed because one or more lines are too long
16  custom_output.json  Normal file
File diff suppressed because one or more lines are too long
3458  output.txt  Normal file
File diff suppressed because one or more lines are too long
62  strip.py  Normal file
@@ -0,0 +1,62 @@
import json
from bs4 import BeautifulSoup
from presidio_analyzer import AnalyzerEngine
from presidio_anonymizer import AnonymizerEngine

analyzer = AnalyzerEngine()
anonymizer = AnonymizerEngine()

def redact_text(text_content):
    # Analyze the text for PII
    results = analyzer.analyze(text=text_content, language='en')
    # Anonymize the detected PII
    anonymized_result = anonymizer.anonymize(text=text_content, analyzer_results=results)
    return anonymized_result.text

def process_json_file(file_path):
    with open(file_path, 'r', encoding='utf-8') as f:
        data = json.load(f)

    # Recursive function to redact strings in JSON
    def redact_json_values(obj):
        if isinstance(obj, dict):
            return {k: redact_json_values(v) for k, v in obj.items()}
        elif isinstance(obj, list):
            return [redact_json_values(elem) for elem in obj]
        elif isinstance(obj, str):
            return redact_text(obj)
        else:
            return obj

    redacted_data = redact_json_values(data)

    with open(f"redacted_{file_path}", 'w', encoding='utf-8') as f:
        json.dump(redacted_data, f, indent=2)
    print(f"Redacted JSON saved to redacted_{file_path}")

def process_html_file(file_path):
    with open(file_path, 'r', encoding='utf-8') as f:
        soup = BeautifulSoup(f, 'html.parser')

    # Find all text nodes (excluding script and style)
    for text_node in soup.find_all(string=True):
        if text_node.parent.name not in ['script', 'style', 'head', 'meta', '[document]']:
            original_text = str(text_node)
            redacted_text = redact_text(original_text)
            text_node.replace_with(redacted_text)  # Replace the text in the soup object

    # You might also want to check attributes for PII
    # Example: check 'alt' attributes, 'title' attributes, etc.
    for tag in soup.find_all(True):  # Iterate through all tags
        for attr, value in tag.attrs.items():
            if isinstance(value, str):
                redacted_attr_value = redact_text(value)
                tag.attrs[attr] = redacted_attr_value

    with open(f"redacted_{file_path}", 'w', encoding='utf-8') as f:
        f.write(str(soup))
    print(f"Redacted HTML saved to redacted_{file_path}")

# Example usage:
# process_json_file("your_data.json")
# process_html_file("your_webpage.html")
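If straight placeholder substitution is not the desired behaviour, Presidio's AnonymizerEngine.anonymize() also accepts an operators mapping. Below is a minimal sketch of how strip.py's redact_text() could mask every detection with one fixed token, assuming the documented OperatorConfig API; the "[REDACTED]" token and the sample input are illustrative choices, not part of this commit:

    from presidio_analyzer import AnalyzerEngine
    from presidio_anonymizer import AnonymizerEngine
    from presidio_anonymizer.entities import OperatorConfig

    analyzer = AnalyzerEngine()
    anonymizer = AnonymizerEngine()

    def redact_text_masked(text_content):
        # Same analysis step as strip.py, but the DEFAULT operator replaces every
        # detected entity with the fixed string "[REDACTED]" instead of its type.
        results = analyzer.analyze(text=text_content, language='en')
        return anonymizer.anonymize(
            text=text_content,
            analyzer_results=results,
            operators={"DEFAULT": OperatorConfig("replace", {"new_value": "[REDACTED]"})},
        ).text

    print(redact_text_masked("Call Alice Smith on 555-0100."))  # made-up sample input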