CV/my-app/utils/resume_analysis.py

276 lines
9.5 KiB
Python
Executable File

#!/usr/bin/env python3
import sys
import os
import argparse
import json
import logging
from datetime import datetime, timezone
import uuid
from typing import Optional, Any
import time
from dotenv import load_dotenv
import pymongo
import openai
from pdfminer.high_level import extract_text
# Populate the process environment from a .env file (if any) before any
# setting below is read.
load_dotenv()

# --- Settings taken from the environment -----------------------------------
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")
MODEL_NAME = os.environ.get("MODEL_NAME")
# Completion length cap passed to the API; falls back to 500 when unset.
MAX_TOKENS = int(os.environ.get("MAX_TOKENS", 500))
# True only when the variable is the word "true" (compared case-insensitively).
USE_MOCKUP = os.environ.get("USE_MOCKUP", "false").lower() == "true"
MOCKUP_FILE_PATH = os.environ.get("MOCKUP_FILE_PATH")
MONGODB_URI = os.environ.get("MONGODB_URI")
MONGODB_DATABASE = os.environ.get("MONGODB_DATABASE")
# Collection name is fixed in code rather than read from the environment.
MONGO_COLLECTION_NAME = "cv_processing_collection"

# Hand the API key to the openai module-level client.
openai.api_key = OPENAI_API_KEY

# --- Logging ----------------------------------------------------------------
# Level name comes from LOG_LEVEL (upper-cased); DEBUG when the variable is unset.
LOG_LEVEL = os.environ.get("LOG_LEVEL", "DEBUG").upper()
logging.basicConfig(
    format="[%(asctime)s] [%(name)s] [%(levelname)s] %(message)s",
    datefmt="%Y-%m-%dT%H:%M:%S%z",
    level=LOG_LEVEL,
)
def get_mongo_collection():
    """Build a MongoDB client and hand back the CV-processing collection.

    A new ``pymongo.MongoClient`` is created from ``MONGODB_URI`` on every
    call; the ``MONGODB_DATABASE`` database is selected on it and its
    ``MONGO_COLLECTION_NAME`` collection is returned.
    """
    database = pymongo.MongoClient(MONGODB_URI)[MONGODB_DATABASE]
    return database[MONGO_COLLECTION_NAME]
# Module-level logger, named after this module via __name__; every function
# in this file logs through it.
logger = logging.getLogger(__name__)
def _read_resume_text(path: str) -> str:
    """Return the text of the resume stored at *path*.

    Files whose name ends in ".pdf" (case-insensitive) are run through
    pdfminer's ``extract_text``; anything else is read as UTF-8 text.
    """
    if path.lower().endswith(".pdf"):
        logger.debug(f"Using pdfminer to extract text from PDF: {path}")
        return extract_text(path)
    # Explicit UTF-8 so the result does not depend on the platform default.
    with open(path, "r", encoding="utf-8") as f:
        return f.read()


def _first_message_content(response: Any) -> Optional[Any]:
    """Return the first choice's message content, or None if there is none.

    Handles both response shapes this script produces: the object returned by
    the OpenAI client (attribute access) and the plain dict loaded from a
    mockup file (key access).
    """
    if not response:
        return None
    if isinstance(response, dict):
        choices = response.get("choices")
        if not isinstance(choices, list) or not choices:
            return None
        first = choices[0]
        message = first.get("message") if isinstance(first, dict) else None
        return message.get("content") if isinstance(message, dict) else None
    choices = getattr(response, "choices", None)
    if not choices:
        return None
    return choices[0].message.content


def _summary_from_response(response: Any) -> Any:
    """Parse the model's JSON answer; return an ``{"error": ...}`` dict on failure."""
    content = _first_message_content(response)
    if content is None:
        return {"error": "No response from OpenAI"}
    try:
        return json.loads(content)
    except (json.JSONDecodeError, TypeError) as e:
        # TypeError covers content that is not str/bytes (json.loads rejects it).
        logger.error(f"Failed to parse OpenAI response: {e}")
        return {"error": "Invalid JSON response from OpenAI"}


def main():
    """Command-line entry point: analyze one resume and persist the result.

    Steps:
      1. Parse ``-f/--file`` and ``-m/--mockup``.
      2. Obtain the resume text (a fixed placeholder in mockup mode).
      3. Call ``call_openai_api`` and parse the answer into a summary.
      4. Pass everything to ``insert_processing_data`` and
         ``write_openai_response``.

    Exits with status 1 after printing help when run without arguments, and
    with status 1 when the given file does not exist. Exits through
    ``parser.error`` when no file is given outside mockup mode.
    """
    parser = argparse.ArgumentParser(
        formatter_class=argparse.RawDescriptionHelpFormatter,
        description=(
            "This tool analyzes resumes using OpenAI's API. "
            "Parameters are required to run the analysis.\n"
            "Required Environment Variables:\n"
            "- OPENAI_API_KEY: Your OpenAI API key\n"
            "- MODEL_NAME: OpenAI model to use (e.g. gpt-3.5-turbo)\n"
            "- MONGODB_URI: MongoDB connection string (optional for mockup mode)"
        ),
        usage="resume_analysis.py [-h] [-f FILE] [-m]",
        epilog=(
            "Examples:\n"
            "Analyze a resume: resume_analysis.py -f my_resume.pdf\n"
            "Test with mockup data: resume_analysis.py -f test.pdf -m"
        ),
    )
    parser.add_argument(
        "-f", "--file", help="Path to the resume file to analyze (PDF or text)"
    )
    parser.add_argument(
        "-m",
        "--mockup",
        action="store_true",
        help="Use mockup response instead of calling OpenAI API",
    )

    # If no arguments provided, show help and exit
    if len(sys.argv) == 1:
        parser.print_help()
        sys.exit(1)
    args = parser.parse_args()

    # Only the -m flag decides mockup mode; the USE_MOCKUP setting is not
    # consulted by this function.
    use_mockup = args.mockup

    # Load the resume text from the provided file or use mockup
    if use_mockup:
        resume_text = "Mockup resume text"
    else:
        if not args.file:
            # Without this guard os.path.exists(None) raises TypeError.
            parser.error("a resume file is required unless -m/--mockup is given")
        if not os.path.exists(args.file):
            logger.error(f"File not found: {args.file}")
            sys.exit(1)
        start_file_read_time = time.time()
        resume_text = _read_resume_text(args.file)
        file_read_time = time.time() - start_file_read_time
        logger.debug(f"File read time: {file_read_time:.2f} seconds")

    # Call the OpenAI API with the resume text
    start_time = time.time()
    response = call_openai_api(resume_text, use_mockup)
    openai_api_time = time.time() - start_time
    logger.debug(f"OpenAI API call time: {openai_api_time:.2f} seconds")

    summary = _summary_from_response(response)

    # insert_processing_data never touches the collection in mockup mode, so
    # no MongoDB client is created there (MongoDB settings are optional then).
    cv_collection = None if use_mockup else get_mongo_collection()

    # Measure MongoDB insertion time
    start_mongo_time = time.time()
    insert_processing_data(
        resume_text,
        summary,
        response,
        args,
        str(uuid.uuid4()),
        use_mockup,
        cv_collection,
    )
    mongo_insert_time = time.time() - start_mongo_time
    logger.debug(f"MongoDB insert time: {mongo_insert_time:.2f} seconds")

    write_openai_response(response, use_mockup, args.file)
def load_mockup_response(mockup_file_path: str) -> dict:
    """Load a canned response from a JSON file for mockup runs.

    Args:
        mockup_file_path: Path to a UTF-8 JSON file whose top-level value is
            an object.

    Returns:
        The parsed object. An ``"openai_stats"`` entry with zeroed token
        counts is added when the file does not provide one.

    Raises:
        FileNotFoundError: If the path is empty/``None`` or does not exist.
        ValueError: If the file is not valid JSON (``json.JSONDecodeError``)
            or its top-level value is not an object.
    """
    logger.debug(f"Loading mockup response from: {mockup_file_path}")
    # An unset path (None) would make os.path.exists raise TypeError; report
    # it as the same "not found" condition instead.
    if not mockup_file_path or not os.path.exists(mockup_file_path):
        raise FileNotFoundError(f"Mockup file not found at: {mockup_file_path}")
    with open(mockup_file_path, "r", encoding="utf-8") as f:
        response = json.load(f)
    if not isinstance(response, dict):
        raise ValueError(
            f"Mockup file must contain a JSON object, got {type(response).__name__}"
        )
    response.setdefault(
        "openai_stats", {"input_tokens": 0, "output_tokens": 0, "total_tokens": 0}
    )
    return response
def call_openai_api(text: str, use_mockup: bool) -> Optional[Any]:
    """Obtain the model's analysis of a resume.

    Args:
        text: Resume text, sent as the user message.
        use_mockup: When true, no API call is made; the result of
            ``load_mockup_response(MOCKUP_FILE_PATH)`` (a dict) is returned.

    Returns:
        The object returned by ``openai.chat.completions.create``, the mockup
        dict in mockup mode, or ``None`` when anything fails. Failures are
        logged with a traceback rather than raised (deliberate best-effort).
    """
    try:
        if use_mockup:
            logger.debug("Using mockup response instead of calling OpenAI API.")
            return load_mockup_response(MOCKUP_FILE_PATH)

        logger.debug("Calling OpenAI API.")
        # The system prompt lives in prompt.txt next to this script. Read it
        # as UTF-8 explicitly so non-ASCII prompt text does not depend on the
        # platform's default encoding.
        prompt_path = os.path.join(os.path.dirname(__file__), "prompt.txt")
        with open(prompt_path, "r", encoding="utf-8") as prompt_file:
            system_content = prompt_file.read()

        response = openai.chat.completions.create(
            model=MODEL_NAME,
            messages=[
                {"role": "system", "content": system_content},
                {"role": "user", "content": text},
            ],
            max_tokens=MAX_TOKENS,
        )
        logger.debug(f"OpenAI API response: {response}")
        return response
    except Exception as e:
        # Callers treat None as "no response"; keep the traceback in the log.
        logger.error(f"Error during OpenAI API call: {e}", exc_info=True)
        return None
def write_openai_response(
    response: Any, use_mockup: bool, input_file_path: Optional[str] = None
) -> None:
    """Write the OpenAI response to a JSON file next to the input file.

    The file is named ``<input stem>_openai_response_<uuid4>.json`` and is
    placed in the input file's directory; with no input path it is
    ``default_openai_response_<uuid4>.json`` in the current directory.

    Args:
        response: Object with ``choices``, ``model`` and (optionally)
            ``usage`` attributes, as returned by the OpenAI client.
        use_mockup: When true nothing is written.
        input_file_path: Path of the analyzed resume, used to derive the
            output location and name.

    Nothing is written when ``response`` is falsy or has no choices. Failures
    while writing the file are logged, not raised.
    """
    if use_mockup:
        logger.debug("Using mockup response; no OpenAI message to write.")
        return

    if not (response and response.choices):
        logger.warning("No choices in OpenAI response to extract message from.")
        logger.debug(f"Response object: {response}")
        return

    message_content = response.choices[0].message.content
    logger.debug(f"Raw OpenAI message content: {message_content}")

    if input_file_path:
        output_dir = os.path.dirname(input_file_path)
        base_filename = os.path.splitext(os.path.basename(input_file_path))[0]
    else:
        output_dir, base_filename = ".", "default"
    file_path = os.path.join(
        output_dir, f"{base_filename}_openai_response_{uuid.uuid4()}.json"
    )

    # usage may be absent or None on a response; record zero counts then
    # instead of failing with AttributeError.
    usage = getattr(response, "usage", None)
    serializable_response = {
        "choices": [
            {
                "message": {
                    "content": choice.message.content,
                    "role": choice.message.role,
                },
                "finish_reason": choice.finish_reason,
                "index": choice.index,
            }
            for choice in response.choices
        ],
        "openai_stats": {
            "input_tokens": getattr(usage, "prompt_tokens", 0),
            "output_tokens": getattr(usage, "completion_tokens", 0),
            "total_tokens": getattr(usage, "total_tokens", 0),
        },
        "model": response.model,
    }

    try:
        with open(file_path, "w", encoding="utf-8") as f:
            json.dump(serializable_response, f, indent=2)
    except OSError as e:  # IOError is an alias of OSError
        logger.error(f"Failed to write OpenAI response to file: {e}")
    else:
        logger.debug(f"OpenAI response written to {file_path}")
def insert_processing_data(
    text_content: str,
    summary: dict,
    response: Any,
    args: argparse.Namespace,
    processing_id: str,
    use_mockup: bool,
    cv_collection,
) -> None:
    """Insert one processing record into MongoDB.

    Args:
        text_content: Resume text that was analyzed; stored as-is.
        summary: Parsed analysis (or an error dict); stored as-is.
        response: OpenAI response object; only ``choices`` and ``usage`` are
            read, to obtain token counts.
        args: Parsed command-line arguments (currently unused).
        processing_id: Identifier stored with the record.
        use_mockup: When true nothing is inserted and ``cv_collection`` is
            not touched.
        cv_collection: Object exposing ``insert_one(document)``.

    Token counts are stored as 0 when the response is missing, has no
    choices, or carries no usage data. Insert failures are logged, not raised.
    """
    logger.debug("Inserting processing data into MongoDB.")
    if use_mockup:
        logger.debug("Using mockup; skipping MongoDB insertion.")
        return

    input_tokens = output_tokens = total_tokens = 0
    if response and response.choices:
        # usage can be None on an otherwise valid response; keep the zeros.
        usage = getattr(response, "usage", None)
        if usage is None:
            logger.warning("Response has no usage data; storing zero token counts.")
        else:
            input_tokens = usage.prompt_tokens
            output_tokens = usage.completion_tokens
            total_tokens = usage.total_tokens
    else:
        logger.error("Invalid response format or missing usage data.")

    processing_data = {
        "processing_id": processing_id,
        "timestamp": datetime.now(timezone.utc).isoformat(),
        "text_content": text_content,
        "summary": summary,
        "input_tokens": input_tokens,
        "output_tokens": output_tokens,
        "total_tokens": total_tokens,
    }
    try:
        cv_collection.insert_one(processing_data)
    except Exception as e:
        logger.error(
            f"Failed to insert processing data into MongoDB: {e}", exc_info=True
        )
    else:
        logger.debug(f"Inserted processing data for ID: {processing_id}")
# Run main() only when this file is executed as a script, not when it is
# imported as a module.
if __name__ == "__main__":
    main()