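Coś tam lepiej — minor improvements: reformat code, extract PDF text via pdfminer, parse the OpenAI response into a summary before the MongoDB insert, rename token stats keys to input_tokens/output_tokens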

2025-03-14 01:46:55 +01:00 · 2025-03-14 01:46:55 +01:00 · 159f78ccb5
commit 159f78ccb5
parent aadf1fe94c
1 changed file with 91 additions and 50 deletions
--- a/my-app/utils/resume_analysis.py
+++ b/my-app/utils/resume_analysis.py
@@ -36,17 +36,21 @@ LOG_LEVEL = os.getenv("LOG_LEVEL", "DEBUG").upper()
 logging.basicConfig(
    level=LOG_LEVEL,
-    format='[%(asctime)s] [%(name)s] [%(levelname)s] %(message)s',
+    format="[%(asctime)s] [%(name)s] [%(levelname)s] %(message)s",
-    datefmt='%Y-%m-%dT%H:%M:%S%z'
+    datefmt="%Y-%m-%dT%H:%M:%S%z",
 )
 def get_mongo_collection():
    """Initialize and return MongoDB collection."""
    mongo_client = pymongo.MongoClient(MONGODB_URI)
    db = mongo_client[MONGODB_DATABASE]
    return db[MONGO_COLLECTION_NAME]
 logger = logging.getLogger(__name__)
 def main():
    """Main function to process the resume."""
    parser = argparse.ArgumentParser(
@@ -60,10 +64,14 @@ Required Environment Variables:
        usage="resume_analysis.py [-h] [-f FILE] [-m]",
        epilog="""Examples:
  Analyze a resume:        resume_analysis.py -f my_resume.pdf
-  Test with mockup data:   resume_analysis.py -f test.pdf -m"""
+  Test with mockup data:   resume_analysis.py -f test.pdf -m""",
    )
    parser.add_argument(
        "-f", "--file", help="Path to the resume file to analyze (PDF or text)"
    )
    parser.add_argument(
        "-m", "--mockup", action="store_true", help="Use mockup response instead of calling OpenAI API"
    )
    parser.add_argument('-f', '--file', help='Path to the resume file to analyze (PDF or text)')
    parser.add_argument('-m', '--mockup', action='store_true', help='Use mockup response instead of calling OpenAI API')
    # If no arguments provided, show help and exit
    if len(sys.argv) == 1:
@@ -84,8 +92,14 @@ Required Environment Variables:
            sys.exit(1)
        start_file_read_time = time.time()
-        with open(args.file, 'r') as f:
+        if args.file.lower().endswith(".pdf"):
-            resume_text = f.read()
+            logger.debug(f"Using pdfminer to extract text from PDF: {args.file}")
            resume_text = extract_text(args.file)
        else:
            with open(
                args.file, "r", encoding="utf-8"
            ) as f:  # Explicitly specify utf-8 encoding for text files
                resume_text = f.read()
        file_read_time = time.time() - start_file_read_time
        logger.debug(f"File read time: {file_read_time:.2f} seconds")
@@ -94,15 +108,34 @@ Required Environment Variables:
    response = call_openai_api(resume_text, use_mockup)
    openai_api_time = time.time() - start_time
    logger.debug(f"OpenAI API call time: {openai_api_time:.2f} seconds")
    # Initialize MongoDB collection only when needed
    cv_collection = get_mongo_collection()
    # Measure MongoDB insertion time
    start_mongo_time = time.time()
-    cost = insert_processing_data(resume_text, {}, response, args, str(uuid.uuid4()), use_mockup, cv_collection)
+    if response and response.choices:
        message_content = response.choices[0].message.content
        try:
            summary = json.loads(message_content)
        except json.JSONDecodeError as e:
            logger.error(f"Failed to parse OpenAI response: {e}")
            summary = {"error": "Invalid JSON response from OpenAI"}
    else:
        summary = {"error": "No response from OpenAI"}
    insert_processing_data(
        resume_text,
        summary,
        response,
        args,
        str(uuid.uuid4()),
        use_mockup,
        cv_collection,
    )
    mongo_insert_time = time.time() - start_mongo_time
    logger.debug(f"MongoDB insert time: {mongo_insert_time:.2f} seconds")
-    write_openai_response(response, use_mockup, args.file, cost)
+    write_openai_response(response, use_mockup, args.file)
 def load_mockup_response(mockup_file_path: str) -> dict:
    """Load mockup response from a JSON file."""
@@ -111,9 +144,12 @@ def load_mockup_response(mockup_file_path: str) -> dict:
        raise FileNotFoundError(f"Mockup file not found at: {mockup_file_path}")
    with open(mockup_file_path, "r") as f:
        response = json.load(f)
-    response.setdefault("openai_stats", {"prompt_tokens": 0, "completion_tokens": 0, "total_tokens": 0})
+    response.setdefault(
        "openai_stats", {"input_tokens": 0, "output_tokens": 0, "total_tokens": 0}
    )
    return response
 def call_openai_api(text: str, use_mockup: bool) -> Optional[Any]:
    """Call OpenAI API to analyze resume text."""
    logger.debug("Calling OpenAI API.")
@@ -128,9 +164,9 @@ def call_openai_api(text: str, use_mockup: bool) -> Optional[Any]:
            model=MODEL_NAME,
            messages=[
                {"role": "system", "content": system_content},
-                {"role": "user", "content": text}
+                {"role": "user", "content": text},
            ],
-            max_tokens=MAX_TOKENS
+            max_tokens=MAX_TOKENS,
        )
        logger.debug(f"OpenAI API response: {response}")
        return response
@@ -138,40 +174,49 @@ def call_openai_api(text: str, use_mockup: bool) -> Optional[Any]:
        logger.error(f"Error during OpenAI API call: {e}", exc_info=True)
        return None
-def write_openai_response(response: Any, use_mockup: bool, input_file_path: str = None, cost: float = 0) -> None: # Add cost argument
+
 def write_openai_response(
    response: Any, use_mockup: bool, input_file_path: str = None
 ) -> None:  
    """Write raw OpenAI response to a file."""
    if use_mockup:
        logger.debug("Using mockup response; no OpenAI message to write.")
        return
-    if response and response.choices: # Changed from hasattr to direct attribute access
+    if response and response.choices:  # Changed from hasattr to direct attribute access
        message_content = response.choices[0].message.content
        logger.debug(f"Raw OpenAI message content: {message_content}")
-        output_dir = os.path.dirname(input_file_path) if input_file_path else '.'
+        output_dir = os.path.dirname(input_file_path) if input_file_path else "."
-        base_filename = os.path.splitext(os.path.basename(input_file_path))[0] if input_file_path else "default"
+        base_filename = (
            os.path.splitext(os.path.basename(input_file_path))[0]
            if input_file_path
            else "default"
        )
        processing_id = str(uuid.uuid4())
-        file_path = os.path.join(output_dir, f"{base_filename}_openai_response_{processing_id}") + ".json"
+        file_path = os.path.join(
            output_dir, f"{base_filename}_openai_response_{processing_id}"
        ) + ".json"
        try:
-            serializable_response = { # Create a serializable dictionary
+            serializable_response = {  # Create a serializable dictionary
                "choices": [
                    {
                        "message": {
                            "content": choice.message.content,
-                            "role": choice.message.role
+                            "role": choice.message.role,
                        },
                        "finish_reason": choice.finish_reason,
-                        "index": choice.index
+                        "index": choice.index,
-                    } for choice in response.choices
+                    }
                    for choice in response.choices
                ],
                "openai_stats": {
-                    "prompt_tokens": response.usage.prompt_tokens,
+                    "input_tokens": response.usage.prompt_tokens,
-                    "completion_tokens": response.usage.completion_tokens,
+                    "output_tokens": response.usage.completion_tokens,
-                    "total_tokens": response.usage.total_tokens
+                    "total_tokens": response.usage.total_tokens,
-                 },
+                },
-                "cost": cost, # Include cost in the output JSON
+                "model": response.model,
                "model": response.model
            }
            with open(file_path, "w") as f:
-                json.dump(serializable_response, f, indent=2) # Dump the serializable dictionary
+                json.dump(serializable_response, f, indent=2)  # Dump the serializable dictionary
            logger.debug(f"OpenAI response written to {file_path}")
        except IOError as e:
            logger.error(f"Failed to write OpenAI response to file: {e}")
@@ -179,21 +224,22 @@ def write_openai_response(response: Any, use_mockup: bool, input_file_path: str
        logger.warning("No choices in OpenAI response to extract message from.")
        logger.debug(f"Response object: {response}")
-def insert_processing_data(text_content: str, summary: dict, response: Any, args: argparse.Namespace, processing_id: str, use_mockup: bool, cv_collection) -> None:
+
 def insert_processing_data(
    text_content: str,
    summary: dict,
    response: Any,
    args: argparse.Namespace,
    processing_id: str,
    use_mockup: bool,
    cv_collection,
 ) -> None:
    """Insert processing data into MongoDB."""
    logger.debug("Inserting processing data into MongoDB.")
    if not use_mockup:
        if response and response.choices:
            message_content = response.choices[0].message.content
-            try:
+            openai_stats = summary.get("openai_stats", {})
                openai_stats_content = json.loads(message_content)
                openai_stats = openai_stats_content.get("openai_stats", {})
                cost = openai_stats.get("cost", 0)
            except json.JSONDecodeError:
                logger.error("Failed to decode JSON from message content for openai_stats.")
                openai_stats = {}
                cost = 0
            usage = response.usage
            input_tokens = usage.prompt_tokens
            output_tokens = usage.completion_tokens
@@ -201,34 +247,29 @@ def insert_processing_data(text_content: str, summary: dict, response: Any, args
        else:
            logger.error("Invalid response format or missing usage data.")
            input_tokens = output_tokens = total_tokens = 0
            cost = 0
            openai_stats = {}
            usage = {}
        processing_data = {
            "processing_id": processing_id,
            "timestamp": datetime.now(timezone.utc).isoformat(),
            "text_content": text_content,
            "summary": summary,
-            "usage_prompt_tokens": input_tokens, # Renamed to avoid collision
+            "input_tokens": input_tokens,
-            "usage_completion_tokens": output_tokens, # Renamed to avoid collision
+            "output_tokens": output_tokens,
-            "usage_total_tokens": total_tokens, # Renamed to avoid collision
+            "total_tokens": total_tokens,
            "openai_stats_input_tokens": openai_stats.get("input_tokens"),
            "openai_stats_output_tokens": openai_stats.get("output_tokens"),
            "openai_stats_total_tokens": openai_stats.get("total_tokens"),
            "cost": cost
        }
        try:
            cv_collection.insert_one(processing_data)
            logger.debug(f"Inserted processing data for ID: {processing_id}")
            return cost # Return the cost
        except Exception as e:
-            logger.error(f"Failed to insert processing data into MongoDB: {e}", exc_info=True)
+            logger.error(
                f"Failed to insert processing data into MongoDB: {e}", exc_info=True
            )
    else:
        logger.debug("Using mockup; skipping MongoDB insertion.")
-    return 0 # Return 0 for mockup mode
+
 if __name__ == "__main__":
    main()