#!/usr/bin/env python3
"""Resume analysis tool.

Extracts text from a resume (PDF via pdfminer, otherwise UTF-8 text),
sends it to an OpenRouter-hosted LLM for summarization, writes the raw
response to disk and records the processing run in MongoDB.
"""

import sys
import os
import argparse
import json
import logging
from datetime import datetime, timezone
import uuid
from typing import Optional, Any, Dict
import time

from dotenv import load_dotenv
import pymongo
from pdfminer.high_level import extract_text

from openrouter_client import initialize_openrouter_client, OpenRouterError, OpenRouterResponse

# Load environment variables from a local .env file, if present.
load_dotenv()

# Configuration — required settings abort the process early with a clear message.
OPENROUTER_API_KEY = os.getenv("OPENROUTER_API_KEY")
if not OPENROUTER_API_KEY:
    # Use logger here if possible, but it might not be configured yet.
    # Consider raising the error later or logging after basicConfig.
    print("ERROR: OPENROUTER_API_KEY environment variable is required", file=sys.stderr)
    sys.exit(1)

OPENROUTER_MODEL_NAME = os.getenv("OPENROUTER_MODEL_NAME")
if not OPENROUTER_MODEL_NAME:
    print("ERROR: OPENROUTER_MODEL_NAME environment variable is required", file=sys.stderr)
    sys.exit(1)

# Optional settings with defaults. MONGODB_URI / MONGODB_DATABASE may be unset
# in mockup mode (insertion is skipped there).
MAX_TOKENS = int(os.getenv("MAX_TOKENS", 500))
USE_MOCKUP = os.getenv("USE_MOCKUP", "false").lower() == "true"
MOCKUP_FILE_PATH = os.getenv("MOCKUP_FILE_PATH")
MONGODB_URI = os.getenv("MONGODB_URI")
MONGODB_DATABASE = os.getenv("MONGODB_DATABASE")

MONGO_COLLECTION_NAME = "cv_processing_collection"

# Logging setup
LOG_LEVEL = os.getenv("LOG_LEVEL", "DEBUG").upper()

logging.basicConfig(
    level=LOG_LEVEL,
    format="[%(asctime)s] [%(name)s] [%(levelname)s] %(message)s",
    datefmt="%Y-%m-%dT%H:%M:%S%z",
)
logger = logging.getLogger(__name__)  # Define logger earlier

# Global variable to hold the client instance (lazy singleton, see
# get_opernrouter_client). NOTE(review): name carries a typo ("opern");
# kept because it is referenced elsewhere in the file.
_opernrouter_client_instance = None
|
|
|
|
def get_opernrouter_client():
    """Return the process-wide OpenRouter client, creating it on first use.

    Lazy singleton: the client is built exactly once and cached in the
    module-level ``_opernrouter_client_instance``. Initialization errors
    are logged and re-raised for the caller to handle.
    """
    global _opernrouter_client_instance

    # Fast path: already initialized.
    if _opernrouter_client_instance is not None:
        logger.debug("Returning existing OpenRouter client instance.")
        return _opernrouter_client_instance

    logger.info("Initializing OpenRouter client for the first time...")
    logger.debug(f"Using model: {OPENROUTER_MODEL_NAME}")
    logger.debug("API Key present and valid format: %s", bool(OPENROUTER_API_KEY and OPENROUTER_API_KEY.startswith("sk-or-v1-")))
    try:
        _opernrouter_client_instance = initialize_openrouter_client(
            api_key=OPENROUTER_API_KEY,
            model_name=OPENROUTER_MODEL_NAME
        )
        logger.info(f"Successfully initialized OpenRouter client with model: {OPENROUTER_MODEL_NAME}")
    except ValueError as e:
        # Configuration problems (bad key/model) bubble up unchanged.
        logger.error(f"Configuration error during client initialization: {e}")
        raise
    except Exception as e:
        logger.error(f"Failed to initialize OpenRouter client: {e}", exc_info=True)
        raise
    return _opernrouter_client_instance
|
|
|
|
|
|
def get_mongo_collection():
    """Connect to MongoDB and return the CV processing collection.

    A fresh client is created on every call; pooling is left to pymongo.
    """
    return pymongo.MongoClient(MONGODB_URI)[MONGODB_DATABASE][MONGO_COLLECTION_NAME]
|
|
|
|
|
|
def parse_arguments():
    """Parse the command line.

    Returns the parsed ``argparse.Namespace``, or ``None`` when the script
    was invoked without any arguments (in which case help is printed).
    """
    description = """This tool analyzes resumes using the OpenRouter API. Parameters are required to run the analysis.

Required Environment Variables:
- OPENROUTER_API_KEY: Your OpenRouter API key
- OPENROUTER_MODEL_NAME: OpenRouter model to use (e.g. google/gemma-7b-it)
- MONGODB_URI: MongoDB connection string (optional for mockup mode)
- MAX_TOKENS: Maximum tokens for response (default: 500)"""
    epilog = """Examples:
Analyze a resume: resume_analysis.py -f my_resume.pdf
Test with mockup data: resume_analysis.py -f test.pdf -m

Note: Make sure your OpenRouter API key and model name are properly configured in the .env file."""

    parser = argparse.ArgumentParser(
        formatter_class=argparse.RawDescriptionHelpFormatter,
        description=description,
        usage="resume_analysis.py [-h] [-f FILE] [-m]",
        epilog=epilog,
    )
    parser.add_argument("-f", "--file", help="Path to the resume file to analyze (PDF or text)")
    parser.add_argument("-m", "--mockup", action="store_true", help="Use mockup response instead of calling LLM API")

    # No arguments at all: show usage instead of failing later.
    if len(sys.argv) == 1:
        parser.print_help()
        return None
    return parser.parse_args()
|
|
|
|
|
|
def load_resume_text(args):
    """Load the resume text to analyze.

    In mockup mode (``args.mockup``) a canned string is returned without
    touching the filesystem. Otherwise the file named by ``args.file`` is
    read: PDFs go through pdfminer's ``extract_text``; anything else is
    read as UTF-8 text.

    Args:
        args: Parsed CLI namespace with ``mockup`` and ``file`` attributes.

    Returns:
        The resume text as a string.

    Raises:
        FileNotFoundError: if ``args.file`` is unset or does not exist.
    """
    if args.mockup:
        return "Mockup resume text"

    # Raise instead of calling sys.exit() here: main() already has a
    # FileNotFoundError handler which was previously unreachable because
    # this function exited the process itself.
    if not args.file or not os.path.exists(args.file):
        logger.error(f"File not found: {args.file}")
        raise FileNotFoundError(f"File not found: {args.file}")

    start_file_read_time = time.time()
    if args.file.lower().endswith(".pdf"):
        logger.debug(f"Using pdfminer to extract text from PDF: {args.file}")
        resume_text = extract_text(args.file)
    else:
        # Explicitly specify utf-8 encoding for text files.
        with open(args.file, "r", encoding="utf-8") as f:
            resume_text = f.read()
    file_read_time = time.time() - start_file_read_time
    logger.debug(f"File read time: {file_read_time:.2f} seconds")
    return resume_text
|
|
|
|
|
|
def analyze_resume_with_llm(resume_text, use_mockup):
    """Run the LLM analysis over the resume text, logging elapsed time."""
    started = time.time()
    result = call_llm_api(resume_text, use_mockup)
    llm_api_time = time.time() - started
    logger.debug(f"LLM API call time: {llm_api_time:.2f} seconds")
    return result
|
|
|
|
|
|
def store_llm_response(response, use_mockup, input_file_path):
    """Persist the raw LLM response to disk (thin wrapper around write_llm_response)."""
    write_llm_response(response, use_mockup, input_file_path)
|
|
|
|
|
|
def save_processing_data(resume_text, summary, response, args, processing_id, use_mockup, cv_collection):
    """Record one processing run in MongoDB (thin wrapper around insert_processing_data)."""
    insert_processing_data(
        resume_text, summary, response, args, processing_id, use_mockup, cv_collection
    )
|
|
|
|
|
|
def get_cv_summary_from_response(response):
    """Extract the JSON summary from an LLM response.

    Args:
        response: Response object with a ``choices`` list of
            ``{"message": {"content": ...}}`` dicts, or ``None``.

    Returns:
        The parsed summary dict, or an ``{"error": ...}`` dict when the
        response is missing, has no choices, or its content is not valid JSON.
    """
    # Guard against an EMPTY choices list too: the original indexed
    # choices[0] after only hasattr(), raising IndexError on [].
    if not (response and hasattr(response, "choices") and response.choices):
        return {"error": "No response from LLM"}

    message_content = response.choices[0]['message']['content']
    try:
        return json.loads(message_content)
    except json.JSONDecodeError as e:
        logger.error(f"Failed to parse LLM response: {e}")
        return {"error": "Invalid JSON response from LLM"}
|
|
|
|
|
|
def main():
    """Entry point: parse args, load the resume, run the analysis, persist results."""
    args = parse_arguments()
    if args is None:
        # No CLI arguments: help was already printed by parse_arguments().
        return
    use_mockup = args.mockup  # Decide whether to use the mockup based on the -m flag

    try:
        resume_text = load_resume_text(args)
    except FileNotFoundError as e:
        logger.error(f"File error: {e}")
        sys.exit(1)
    except Exception as e:
        logger.error(f"Error loading resume text: {e}")
        sys.exit(1)

    response = analyze_resume_with_llm(resume_text, use_mockup)
    store_llm_response(response, use_mockup, args.file)

    # Only connect to MongoDB when the run will actually be persisted:
    # insert_processing_data skips insertion in mockup mode and MONGODB_URI
    # is documented as optional for mockup runs.
    cv_collection = None if use_mockup else get_mongo_collection()
    processing_id = str(uuid.uuid4())
    summary = get_cv_summary_from_response(response)
    save_processing_data(resume_text, summary, response, args, processing_id, use_mockup, cv_collection)

    logger.info(f"Resume analysis completed. Processing ID: {processing_id}")
|
|
|
|
|
|
def load_mockup_response(mockup_file_path: str) -> dict:
    """Load a canned LLM response from a JSON file.

    Guarantees an ``llm_stats`` key (zeroed token counts) so downstream
    token accounting never KeyErrors.

    Args:
        mockup_file_path: Path to the mockup JSON file (MOCKUP_FILE_PATH).

    Raises:
        FileNotFoundError: if the path is unset/None or does not exist.
            (Previously an unset MOCKUP_FILE_PATH crashed with TypeError
            inside os.path.exists(None).)
    """
    logger.debug(f"Loading mockup response from: {mockup_file_path}")
    if not mockup_file_path or not os.path.exists(mockup_file_path):
        raise FileNotFoundError(f"Mockup file not found at: {mockup_file_path}")
    # utf-8 explicitly: default open() encoding is platform-dependent.
    with open(mockup_file_path, "r", encoding="utf-8") as f:
        response = json.load(f)
    response.setdefault(
        "llm_stats", {"input_tokens": 0, "output_tokens": 0, "total_tokens": 0}
    )
    return response
|
|
|
|
|
|
def call_llm_api(text: str, use_mockup: bool) -> Optional[OpenRouterResponse]:
    """Call OpenRouter API to analyze resume text.

    Args:
        text: Resume text sent as the user message.
        use_mockup: When True, short-circuit to the canned response from
            MOCKUP_FILE_PATH instead of calling the API.

    Returns:
        The API response, or None on any failure (missing/empty prompt
        file, client initialization failure, empty choices, API error).
        NOTE(review): the mockup branch returns load_mockup_response()'s
        plain dict, not an OpenRouterResponse — confirm callers tolerate this.
    """
    if use_mockup:
        logger.debug("Using mockup response.")
        return load_mockup_response(MOCKUP_FILE_PATH)

    # The system prompt file is expected to live beside this script.
    prompt_path = os.path.join(os.path.dirname(__file__), "prompt.txt")
    logger.debug(f"Loading system prompt from: {prompt_path}")

    try:
        # Load system prompt; both errors below are handled by the
        # matching except clauses at the bottom of this try block.
        if not os.path.exists(prompt_path):
            raise FileNotFoundError(f"System prompt file not found: {prompt_path}")

        with open(prompt_path, "r") as prompt_file:
            system_content = prompt_file.read()

        if not system_content.strip():
            raise ValueError("System prompt file is empty")

        # Prepare messages: system prompt + resume text as the user turn.
        messages = [
            {"role": "system", "content": system_content},
            {"role": "user", "content": text}
        ]

        logger.debug("Prepared messages for API call:")
        logger.debug(f"System message length: {len(system_content)} chars")
        logger.debug(f"User message length: {len(text)} chars")

        # Call OpenRouter API
        logger.info(f"Calling OpenRouter API with model: {OPENROUTER_MODEL_NAME}")
        logger.debug(f"Max tokens set to: {MAX_TOKENS}")

        # Get the client instance (initializes on first call). Failures here
        # are handled locally so they don't fall through to the generic
        # handlers below.
        try:
            client = get_opernrouter_client()
        except Exception as e:
            logger.error(f"Failed to get OpenRouter client: {e}")
            return None  # Cannot proceed without a client

        response = client.create_chat_completion(
            messages=messages,
            max_tokens=MAX_TOKENS
        )

        # Validate response: an empty choices list is treated as a failure.
        if not response.choices:
            logger.warning("API response contains no choices")
            return None

        # Log response details
        logger.info("Successfully received API response")
        logger.debug(f"Response model: {response.model}")
        logger.debug(f"Token usage: {response.usage}")
        logger.debug(f"Number of choices: {len(response.choices)}")

        return response

    except FileNotFoundError as e:
        logger.error(f"File error: {e}")
        return None
    except OpenRouterError as e:
        logger.error(f"OpenRouter API error: {e}", exc_info=True)
        if hasattr(e, 'response'):
            logger.error(f"Error response: {e.response}")
        return None
    except Exception as e:
        # Catch-all boundary: log with traceback and signal failure to caller.
        logger.error(f"Unexpected error during API call: {e}", exc_info=True)
        return None
|
|
|
|
|
|
def write_llm_response(
    response: Optional[OpenRouterResponse], use_mockup: bool, input_file_path: Optional[str] = None
) -> None:
    """Write the raw LLM response to a timestamped JSON file.

    The file is created alongside ``input_file_path`` (or in the CWD when
    no input path is given) and named
    ``<base>_llm_response_<timestamp>_<uuid>.json`` so repeated runs never
    collide.

    No-ops (with a log message) in mockup mode, when there is no response,
    or when the response has no choices. Write failures are logged, never
    raised.
    """
    if use_mockup:
        logger.debug("Using mockup response; no LLM message to write.")
        return

    if response is None:
        logger.warning("No response to write")
        return

    if not response.choices:
        logger.warning("No choices in LLM response")
        logger.debug(f"Response object: {response.raw_response}")
        return

    try:
        # Get output directory and base filename from the input path.
        output_dir = os.path.dirname(input_file_path) if input_file_path else "."
        base_filename = (
            os.path.splitext(os.path.basename(input_file_path))[0]
            if input_file_path
            else "default"
        )

        # Generate unique file path: timestamp + fresh UUID.
        processing_id = str(uuid.uuid4())
        now = datetime.now()
        timestamp_str = now.strftime("%Y%m%d_%H%M%S")
        file_path = os.path.join(
            output_dir, f"{base_filename}_llm_response_{timestamp_str}_{processing_id}"
        ) + ".json"

        # Only the JSON-friendly parts of the response object.
        serializable_response = {
            "choices": response.choices,
            "usage": response.usage,
            "model": response.model,
            "raw_response": response.raw_response
        }

        # Write response to file. utf-8 explicitly: the default open()
        # encoding is platform-dependent.
        with open(file_path, "w", encoding="utf-8") as f:
            json.dump(serializable_response, f, indent=2)
        logger.debug(f"LLM response written to {file_path}")

    except IOError as e:
        logger.error(f"Failed to write LLM response to file: {e}")
    except Exception as e:
        logger.error(f"Unexpected error while writing response: {e}", exc_info=True)
|
|
|
|
|
|
|
|
def insert_processing_data(
    text_content: str,
    summary: dict,
    response: Optional[OpenRouterResponse],
    args: argparse.Namespace,
    processing_id: str,
    use_mockup: bool,
    cv_collection,
) -> None:
    """Build one processing record and insert it into MongoDB.

    Mockup runs are never persisted. Insertion failures are logged, not
    raised.
    """
    if use_mockup:
        logger.debug("Using mockup; skipping MongoDB insertion.")
        return

    logger.debug("Preparing processing data for MongoDB insertion.")

    # Token accounting falls back to zeros when the response has no usage.
    usage = response.usage if (response and response.usage) else {}
    usage_data = {
        "input_tokens": usage.get("prompt_tokens", 0),
        "output_tokens": usage.get("completion_tokens", 0),
        "total_tokens": usage.get("total_tokens", 0),
    }

    processing_data = {
        "processing_id": processing_id,
        "timestamp": datetime.now(timezone.utc).isoformat(),
        "text_content": text_content,
        "summary": summary,
        "model": response.model if response else None,
        **usage_data,
        "raw_response": response.raw_response if response else None,
    }

    try:
        cv_collection.insert_one(processing_data)
        logger.debug(f"Successfully inserted processing data for ID: {processing_id}")
        logger.debug(f"Token usage - Input: {usage_data['input_tokens']}, "
                     f"Output: {usage_data['output_tokens']}, "
                     f"Total: {usage_data['total_tokens']}")
    except Exception as e:
        logger.error(f"Failed to insert processing data into MongoDB: {e}", exc_info=True)
|
|
|
|
|
|
if __name__ == "__main__":
    # Script entry point: run only when executed directly, not on import.
    main()
|