diff --git a/my-app/utils/resume_analysis.py b/my-app/utils/resume_analysis.py index 0b4f0ce..b35439a 100755 --- a/my-app/utils/resume_analysis.py +++ b/my-app/utils/resume_analysis.py @@ -36,17 +36,21 @@ LOG_LEVEL = os.getenv("LOG_LEVEL", "DEBUG").upper() logging.basicConfig( level=LOG_LEVEL, - format='[%(asctime)s] [%(name)s] [%(levelname)s] %(message)s', - datefmt='%Y-%m-%dT%H:%M:%S%z' + format="[%(asctime)s] [%(name)s] [%(levelname)s] %(message)s", + datefmt="%Y-%m-%dT%H:%M:%S%z", ) + def get_mongo_collection(): """Initialize and return MongoDB collection.""" mongo_client = pymongo.MongoClient(MONGODB_URI) db = mongo_client[MONGODB_DATABASE] return db[MONGO_COLLECTION_NAME] + + logger = logging.getLogger(__name__) + def main(): """Main function to process the resume.""" parser = argparse.ArgumentParser( @@ -60,10 +64,14 @@ Required Environment Variables: usage="resume_analysis.py [-h] [-f FILE] [-m]", epilog="""Examples: Analyze a resume: resume_analysis.py -f my_resume.pdf - Test with mockup data: resume_analysis.py -f test.pdf -m""" + Test with mockup data: resume_analysis.py -f test.pdf -m""", + ) + parser.add_argument( + "-f", "--file", help="Path to the resume file to analyze (PDF or text)" + ) + parser.add_argument( + "-m", "--mockup", action="store_true", help="Use mockup response instead of calling OpenAI API" ) - parser.add_argument('-f', '--file', help='Path to the resume file to analyze (PDF or text)') - parser.add_argument('-m', '--mockup', action='store_true', help='Use mockup response instead of calling OpenAI API') # If no arguments provided, show help and exit if len(sys.argv) == 1: @@ -84,8 +92,14 @@ Required Environment Variables: sys.exit(1) start_file_read_time = time.time() - with open(args.file, 'r') as f: - resume_text = f.read() + if args.file.lower().endswith(".pdf"): + logger.debug(f"Using pdfminer to extract text from PDF: {args.file}") + resume_text = extract_text(args.file) + else: + with open( + args.file, "r", encoding="utf-8" + ) as f: # Explicitly specify utf-8 encoding for text files + resume_text = f.read() file_read_time = time.time() - start_file_read_time logger.debug(f"File read time: {file_read_time:.2f} seconds") @@ -94,15 +108,34 @@ Required Environment Variables: response = call_openai_api(resume_text, use_mockup) openai_api_time = time.time() - start_time logger.debug(f"OpenAI API call time: {openai_api_time:.2f} seconds") + # Initialize MongoDB collection only when needed cv_collection = get_mongo_collection() # Measure MongoDB insertion time start_mongo_time = time.time() - cost = insert_processing_data(resume_text, {}, response, args, str(uuid.uuid4()), use_mockup, cv_collection) + if response and response.choices: + message_content = response.choices[0].message.content + try: + summary = json.loads(message_content) + except json.JSONDecodeError as e: + logger.error(f"Failed to parse OpenAI response: {e}") + summary = {"error": "Invalid JSON response from OpenAI"} + else: + summary = {"error": "No response from OpenAI"} + insert_processing_data( + resume_text, + summary, + response, + args, + str(uuid.uuid4()), + use_mockup, + cv_collection, + ) mongo_insert_time = time.time() - start_mongo_time logger.debug(f"MongoDB insert time: {mongo_insert_time:.2f} seconds") - write_openai_response(response, use_mockup, args.file, cost) + write_openai_response(response, use_mockup, args.file) + def load_mockup_response(mockup_file_path: str) -> dict: """Load mockup response from a JSON file.""" @@ -111,9 +144,12 @@ def load_mockup_response(mockup_file_path: str) -> dict: raise FileNotFoundError(f"Mockup file not found at: {mockup_file_path}") with open(mockup_file_path, "r") as f: response = json.load(f) - response.setdefault("openai_stats", {"prompt_tokens": 0, "completion_tokens": 0, "total_tokens": 0}) + response.setdefault( + "openai_stats", {"input_tokens": 0, "output_tokens": 0, "total_tokens": 0} + ) return response + def call_openai_api(text: str, use_mockup: bool) -> Optional[Any]: """Call OpenAI API to analyze resume text.""" logger.debug("Calling OpenAI API.") @@ -128,9 +164,9 @@ def call_openai_api(text: str, use_mockup: bool) -> Optional[Any]: model=MODEL_NAME, messages=[ {"role": "system", "content": system_content}, - {"role": "user", "content": text} + {"role": "user", "content": text}, ], - max_tokens=MAX_TOKENS + max_tokens=MAX_TOKENS, ) logger.debug(f"OpenAI API response: {response}") return response @@ -138,40 +174,49 @@ def call_openai_api(text: str, use_mockup: bool) -> Optional[Any]: logger.error(f"Error during OpenAI API call: {e}", exc_info=True) return None -def write_openai_response(response: Any, use_mockup: bool, input_file_path: str = None, cost: float = 0) -> None: # Add cost argument + +def write_openai_response( + response: Any, use_mockup: bool, input_file_path: str = None +) -> None: """Write raw OpenAI response to a file.""" if use_mockup: logger.debug("Using mockup response; no OpenAI message to write.") return - if response and response.choices: # Changed from hasattr to direct attribute access + if response and response.choices: # Changed from hasattr to direct attribute access message_content = response.choices[0].message.content logger.debug(f"Raw OpenAI message content: {message_content}") - output_dir = os.path.dirname(input_file_path) if input_file_path else '.' - base_filename = os.path.splitext(os.path.basename(input_file_path))[0] if input_file_path else "default" + output_dir = os.path.dirname(input_file_path) if input_file_path else "." + base_filename = ( + os.path.splitext(os.path.basename(input_file_path))[0] + if input_file_path + else "default" + ) processing_id = str(uuid.uuid4()) - file_path = os.path.join(output_dir, f"{base_filename}_openai_response_{processing_id}") + ".json" + file_path = os.path.join( + output_dir, f"{base_filename}_openai_response_{processing_id}" + ) + ".json" try: - serializable_response = { # Create a serializable dictionary + serializable_response = { # Create a serializable dictionary "choices": [ { "message": { "content": choice.message.content, - "role": choice.message.role + "role": choice.message.role, }, "finish_reason": choice.finish_reason, - "index": choice.index - } for choice in response.choices + "index": choice.index, + } + for choice in response.choices ], "openai_stats": { - "prompt_tokens": response.usage.prompt_tokens, - "completion_tokens": response.usage.completion_tokens, - "total_tokens": response.usage.total_tokens - }, - "cost": cost, # Include cost in the output JSON - "model": response.model + "input_tokens": response.usage.prompt_tokens, + "output_tokens": response.usage.completion_tokens, + "total_tokens": response.usage.total_tokens, + }, + "model": response.model, } with open(file_path, "w") as f: - json.dump(serializable_response, f, indent=2) # Dump the serializable dictionary + json.dump(serializable_response, f, indent=2) # Dump the serializable dictionary logger.debug(f"OpenAI response written to {file_path}") except IOError as e: logger.error(f"Failed to write OpenAI response to file: {e}") @@ -179,21 +224,22 @@ def write_openai_response(response: Any, use_mockup: bool, input_file_path: str logger.warning("No choices in OpenAI response to extract message from.") logger.debug(f"Response object: {response}") -def insert_processing_data(text_content: str, summary: dict, response: Any, args: argparse.Namespace, processing_id: str, use_mockup: bool, cv_collection) -> None: + +def insert_processing_data( + text_content: str, + summary: dict, + response: Any, + args: argparse.Namespace, + processing_id: str, + use_mockup: bool, + cv_collection, +) -> None: """Insert processing data into MongoDB.""" logger.debug("Inserting processing data into MongoDB.") if not use_mockup: if response and response.choices: message_content = response.choices[0].message.content - try: - openai_stats_content = json.loads(message_content) - openai_stats = openai_stats_content.get("openai_stats", {}) - cost = openai_stats.get("cost", 0) - except json.JSONDecodeError: - logger.error("Failed to decode JSON from message content for openai_stats.") - openai_stats = {} - cost = 0 - + openai_stats = summary.get("openai_stats", {}) usage = response.usage input_tokens = usage.prompt_tokens output_tokens = usage.completion_tokens @@ -201,34 +247,29 @@ def insert_processing_data(text_content: str, summary: dict, response: Any, args else: logger.error("Invalid response format or missing usage data.") input_tokens = output_tokens = total_tokens = 0 - cost = 0 openai_stats = {} usage = {} - processing_data = { "processing_id": processing_id, "timestamp": datetime.now(timezone.utc).isoformat(), "text_content": text_content, "summary": summary, - "usage_prompt_tokens": input_tokens, # Renamed to avoid collision - "usage_completion_tokens": output_tokens, # Renamed to avoid collision - "usage_total_tokens": total_tokens, # Renamed to avoid collision - "openai_stats_input_tokens": openai_stats.get("input_tokens"), - "openai_stats_output_tokens": openai_stats.get("output_tokens"), - "openai_stats_total_tokens": openai_stats.get("total_tokens"), - "cost": cost + "input_tokens": input_tokens, + "output_tokens": output_tokens, + "total_tokens": total_tokens, } try: cv_collection.insert_one(processing_data) logger.debug(f"Inserted processing data for ID: {processing_id}") - return cost # Return the cost except Exception as e: - logger.error(f"Failed to insert processing data into MongoDB: {e}", exc_info=True) + logger.error( + f"Failed to insert processing data into MongoDB: {e}", exc_info=True + ) else: logger.debug("Using mockup; skipping MongoDB insertion.") - return 0 # Return 0 for mockup mode + if __name__ == "__main__": main()