Extended parsing script

This commit is contained in:
Ireneusz Bachanowicz 2025-03-02 01:40:05 +01:00
parent b27ba969d8
commit 43708d2e31
8 changed files with 195 additions and 1455 deletions

1358
.gitignore vendored

File diff suppressed because it is too large Load Diff

4
my-app/.gitignore vendored
View File

@ -1,4 +1,4 @@
# See https://help.github.com/articles/ignoring-files/ for more about ignoring files.
# See https://help.github.com/articles/ignoring-files/ for more about ignoring files.
.next
# dependencies
/node_modules
@ -43,3 +43,5 @@ next-env.d.ts
# CV related files
*.cv
*.resume
/uploads/*

View File

@ -2,10 +2,13 @@ import 'core-js/features/promise/with-resolvers'; // Polyfill for Promise.withRe
import { NextResponse } from "next/server";
import fs from "fs";
import path from "path";
import * as pdfjsLib from 'pdfjs-dist';
import '../../../public/utils/pdf.worker.mjs';
import { v4 as uuidv4 } from 'uuid';
import { fileTypeFromBuffer } from 'file-type';
import escapeStringRegexp from 'escape-string-regexp';
import { exec as childProcessExec } from 'child_process';
import { promisify } from 'util';
const exec = promisify(childProcessExec);
const uploadDir = path.join(process.cwd(), "uploads", "cv");
@ -14,41 +17,6 @@ if (!fs.existsSync(uploadDir)) {
fs.mkdirSync(uploadDir, { recursive: true });
}
/**
 * Returns the text carried by a single pdf.js text-content item.
 *
 * Items without a string `str` property yield an empty string.
 */
function textOfPdfItem(item: unknown): string {
  const str = (item as { str?: unknown } | null | undefined)?.str;
  return typeof str === "string" ? str : "";
}

/**
 * Extracts the plain text of every page of a PDF file.
 *
 * Text items within a page are joined with a single space; pages are joined
 * with a newline so that words at a page boundary are not fused together.
 *
 * @param pdfPath - Path of the PDF file to read.
 * @returns The concatenated text of all pages.
 * @throws Error with the message "Error extracting text from PDF" when the
 *   file cannot be read or parsed (the underlying error is logged).
 */
async function extractTextFromPdf(pdfPath: string): Promise<string> {
  console.log("Starting extractTextFromPdf for path:", pdfPath);
  try {
    // Non-blocking read: this runs inside a request handler.
    const data = new Uint8Array(await fs.promises.readFile(pdfPath));
    const loadingTask = pdfjsLib.getDocument({ data });
    try {
      const pdf = await loadingTask.promise;
      console.log("PDF document loaded successfully. Number of pages:", pdf.numPages);

      const pageTexts: string[] = [];
      // pdf.js page numbers are 1-based.
      for (let pageNumber = 1; pageNumber <= pdf.numPages; pageNumber++) {
        const page = await pdf.getPage(pageNumber);
        const textContent = await page.getTextContent();
        pageTexts.push(textContent.items.map(textOfPdfItem).join(" "));
      }

      const fullText = pageTexts.join("\n");
      // Log only the size: the text itself is the uploader's personal data.
      console.log("Text extraction completed successfully. Characters extracted:", fullText.length);
      return fullText;
    } finally {
      // Best-effort release of the document/worker; a cleanup failure must not
      // turn a successful extraction into an error.
      await loadingTask.destroy().catch((cleanupError: unknown) => {
        console.warn("Failed to release PDF resources:", cleanupError);
      });
    }
  } catch (error) {
    console.error("Error extracting text from PDF:", error);
    throw new Error("Error extracting text from PDF");
  } finally {
    console.log("Finished extractTextFromPdf for path:", pdfPath);
  }
}
export async function POST(req: Request) {
console.log("Received request for CV file upload");
@ -56,24 +24,20 @@ export async function POST(req: Request) {
const formData = await req.formData();
const file: File | null = formData.get('cv') as unknown as File | null;
if (!file) {
console.warn("No file uploaded.");
return NextResponse.json({ message: "No file uploaded." }, { status: 400 });
}
const originalFilename = file.name;
const uniqueFilename = `${uuidv4()}-${originalFilename}`;
const newFilePath = path.join(uploadDir, uniqueFilename);
console.log(`Saving file to: ${newFilePath}`);
const fileBuffer = await file.arrayBuffer();
const type = await fileTypeFromBuffer(Buffer.from(fileBuffer));
console.log("Detected file type:", type);
if (!type || type.mime !== 'application/pdf') {
return NextResponse.json({ message: "Unsupported file type detected. Only PDF files are allowed." }, { status: 400 });
}
@ -81,35 +45,116 @@ export async function POST(req: Request) {
await fs.promises.writeFile(newFilePath, Buffer.from(fileBuffer));
console.log("File uploaded and saved successfully!");
console.log("Before PDF parsing");
const extractedText = await extractTextFromPdf(newFilePath);
console.log("After PDF parsing");
console.log("Before generating summary");
const command = `python3 utils/resume_analysis.py "${extractedText}"`;
console.log("Executing python command:", command);
console.log("Extracted Text being passed to python script:", extractedText);
console.log("Length of extractedText:", extractedText.length); // Log length
const executionResult: { stdout: string, stderr: string } = await new Promise((resolve, reject) => {
require('child_process').exec(command, (error: any, stdout: string, stderr: string) => {
if (error) {
console.error("Python script execution error:", error);
console.error("Python script stderr:", stderr);
reject({ error, stdout, stderr });
} else {
console.log("Python script executed successfully");
console.log("Python script stdout:", stdout);
resolve({ stdout, stderr });
}
});
});
const { stdout, stderr } = executionResult;
// Get the PDF file size
const pdfFileSize = fs.statSync(newFilePath).size;
console.log(`PDF file size: ${pdfFileSize} bytes`);
if (stderr) {
console.error("Error from python script (stderr):", stderr);
// Extract text from PDF using pdfminer.six
let textContent = '';
let extractedTextFilePath = '';
try {
const extractTextCommand = `pdf2txt.py "${newFilePath}"`;
const { stdout, stderr } = await exec(extractTextCommand);
textContent = stdout;
// Create extracted text file path
extractedTextFilePath = newFilePath.replace(/\.pdf$/i, ".txt");
// Write extracted text to file
fs.writeFileSync(extractedTextFilePath, textContent);
console.log(`Extracted text saved to: ${extractedTextFilePath}`);
} catch (error: any) {
console.error("Error extracting text from PDF:", error);
return NextResponse.json({ summary: "Error extracting text from PDF" }, { status: 500 });
}
const summary: string = stdout.trim();
console.log("After generating summary");
return NextResponse.json({ summary: summary }, { status: 200 });
// Execute the resume analysis script
const { spawn } = require('child_process');
const pythonProcess = spawn('python3', [path.join(process.cwd(), 'utils', 'resume_analysis.py'), "-f", extractedTextFilePath]);
let summary = '';
pythonProcess.stdout.on('data', (data: Buffer) => {
summary += data.toString();
});
pythonProcess.stderr.on('data', (data: Buffer) => {
console.error(`stderr: ${data}`);
});
let pythonProcessError = false;
let input_tokens = 0;
let output_tokens = 0;
let total_tokens = 0;
let cost = 0;
let rawOutput = "";
let openaiOutputFilePath = "";
pythonProcess.stdout.on('data', (data: Buffer) => {
const output = data.toString();
rawOutput += output;
});
pythonProcess.on('close', (code: number) => {
console.log(`child process exited with code ${code}`);
if (code !== 0) {
summary = "Error generating summary";
pythonProcessError = true;
} else {
summary = rawOutput.split("Summary: ")[1]?.split("\n--- Usage Information ---")[0] || "Error generating summary";
try {
input_tokens = parseInt(rawOutput.split("Input tokens: ")[1]?.split("\n")[0] || "0");
output_tokens = parseInt(rawOutput.split("Output tokens: ")[1]?.split("\n")[0] || "0");
total_tokens = parseInt(rawOutput.split("Total tokens: ")[1]?.split("\n")[0] || "0");
cost = parseFloat(rawOutput.split("Cost: $")[1]?.split("\n")[0] || "0");
// Create OpenAI output file path
openaiOutputFilePath = newFilePath.replace(/\.pdf$/i, "_openai.txt");
fs.writeFileSync(openaiOutputFilePath, rawOutput);
console.log(`OpenAI output saved to: ${openaiOutputFilePath}`);
} catch (e) {
console.error("Error parsing token information", e);
}
}
console.log(`--- Usage Information ---`);
console.log(`Input tokens: ${input_tokens}`);
console.log(`Output tokens: ${output_tokens}`);
console.log(`Total tokens: ${total_tokens}`);
console.log(`Cost: $${cost}`);
});
pythonProcess.stderr.on('data', (data: Buffer) => {
console.error(`stderr: ${data}`);
});
pythonProcess.on('close', (code: number) => {
console.log(`child process exited with code ${code}`);
if (code !== 0) {
summary = "Error generating summary";
pythonProcessError = true;
}
console.log(`--- Usage Information ---`);
console.log(`Input tokens: ${input_tokens}`);
console.log(`Output tokens: ${output_tokens}`);
console.log(`Total tokens: ${total_tokens}`);
console.log(`Cost: $${cost}`);
});
// Add a timeout to the python process
const timeout = setTimeout(() => {
console.error("Python process timed out");
pythonProcess.kill();
summary = "Error generating summary: Timeout";
pythonProcessError = true;
}, 10000); // 10 seconds
return new Promise((resolve) => {
pythonProcess.on('close', (code: number) => {
clearTimeout(timeout);
resolve(NextResponse.json({ summary: summary }, { status: pythonProcessError ? 500 : 200 }));
});
});
} catch (error: any) {
console.error("Error during file processing:", error);

View File

@ -83,8 +83,8 @@ export default function Home() {
return (
<div className="min-h-screen p-8 pb-20 gap-16 sm:p-20 font-[family-name:var(--font-geist-sans)] bg-gray-50">
<main className="flex flex-col sm:flex-row gap-8 row-start-2 ">
<div className="flex flex-col gap-8 w-full sm:w-1/2 items-center sm:items-start">
<main className="flex flex-col sm:flex-row gap-8 row-start-2 items-start">
<div className="flex flex-col gap-8 w-full sm:w-1/2 sm:items-start">
<h1 className="text-3xl font-bold text-gray-900">Welcome to Your CV Upgrade</h1>
<p className="text-lg text-center sm:text-left text-gray-700">
This platform is designed to help you enhance your CV and showcase your skills effectively.
@ -104,8 +104,8 @@ export default function Home() {
</div>
</div>
<div className="flex flex-col items-center mt-8">
<label className="mb-2 text-lg text-gray-800">Upload Your CV (PDF):</label>
<div className="flex flex-col items-start mt-8">
<h2 className="mb-2 text-2xl font-bold text-gray-900">Are you ready to pimp your CV?</h2>
<input
type="file"
accept=".pdf"
@ -113,35 +113,37 @@ export default function Home() {
className="hidden"
id="cv-upload"
/>
<label htmlFor="cv-upload" className="inline-flex items-center justify-center px-4 py-2 border border-blue-500 rounded-md shadow-sm text-sm font-medium text-blue-700 bg-white hover:bg-blue-50 focus:outline-none focus:ring-2 focus:ring-offset-2 focus:ring-blue-500 cursor-pointer">
<FaFileUpload className="mr-2" /> Upload CV
<div className="flex space-x-2">
<label htmlFor="cv-upload" className="inline-flex items-center justify-center px-4 py-2 border border-gray-500 rounded-md shadow-sm text-sm font-medium text-gray-700 bg-white hover:bg-gray-50 focus:outline-none focus:ring-2 focus:ring-offset-2 focus:ring-gray-500 cursor-pointer disabled:opacity-50">
Upload CV
</label>
{file && <p className="mt-2 text-sm text-gray-600">Selected file: {file.name}</p>}
<button
onClick={handleSubmit}
className="mt-4 bg-blue-600 hover:bg-blue-700 text-white font-bold py-2 px-4 rounded focus:outline-none focus:shadow-outline disabled:opacity-50"
disabled={loading}
className="inline-flex items-center justify-center px-4 py-2 border border-gray-500 rounded-md shadow-sm text-sm font-medium text-gray-700 bg-white hover:bg-gray-50 focus:outline-none focus:ring-2 focus:ring-offset-2 focus:ring-gray-500 cursor-pointer disabled:opacity-50"
disabled={loading || !file}
>
{loading ? "Summarizing..." : "Summarize CV"}
</button>
</div>
{file && <p className="mt-2 text-sm text-gray-700 font-normal flex items-center"><FaFileUpload className="mr-2 text-gray-600 text-2xl" /> Selected file: {file.name}</p>}
</div>
</div>
{/* Right Column - CV Summary Panel */}
<div className="w-full sm:w-1/2 sm:border-l sm:border-gray-200 sm:pl-8">
<div className="p-6 bg-white rounded-md shadow-md">
<div className={`${isSummaryVisible ? 'block' : 'hidden'} p-6 rounded-md`}>
{loading ? (
<div className="animate-pulse bg-gray-100 p-6">
<div className="h-4 bg-gray-300 rounded-md mb-2"></div>
<div className="h-4 bg-gray-300 rounded-md mb-2"></div>
<div className="h-4 bg-gray-300 rounded-md"></div>
<div className="animate-pulse bg-gray-100 p-6 transition-opacity duration-500" style={{ animation: 'pulse 2s cubic-bezier(0.4, 0, 0.6, 1) infinite' }}>
<div className="h-4 bg-gray-300 rounded-md mb-2"/>
<div className="h-4 bg-gray-300 rounded-md mb-2"/>
<div className="h-4 bg-gray-300 rounded-md"/>
</div>
) : (
isSummaryVisible && summary && <CvSummaryPanel summary={summary} />
summary && <CvSummaryPanel summary={summary} />
)}
</div>
</div>
</main>
<footer className=" flex flex-col items-center justify-center mt-16 p-4 border-t border-gray-200">
<footer className="flex flex-col items-center justify-center mt-16 p-4 border-t border-gray-200 absolute bottom-0 left-0 right-0 w-full">
<p className="text-center text-gray-500 text-sm mb-4">
This tool is inspired by and uses data from websites like{" "}
</p>

View File

@ -1,9 +1,15 @@
#!/usr/bin/env python3
import sys
import os
import argparse
from dotenv import load_dotenv
from openai import OpenAI
from pdfminer.high_level import extract_text
client = OpenAI()
# Load environment variables from .env file
load_dotenv()
client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
def analyze_resume(text):
response = client.chat.completions.create(
@ -12,14 +18,43 @@ def analyze_resume(text):
"role": "system",
"content": "Provide a concise summary of the resume, highlighting key skills and potential areas for improvement, in a few sentences."
},
{"role": "user", "content": text}]
{"role": "user", "content": text}],
max_tokens=200 # Add a max_tokens parameter to limit the output length
)
return response.choices[0].message.content
return response
if __name__ == "__main__":
if len(sys.argv) > 1:
parser = argparse.ArgumentParser(description="Analyze resume text using OpenAI.")
parser.add_argument("-f", "--file", help="Path to the file containing the resume text.")
args = parser.parse_args()
if args.file:
try:
with open(args.file, "r", encoding="latin-1") as f:
text_content = f.read()
except FileNotFoundError:
print(f"Error: File not found: {args.file}")
sys.exit(1)
elif len(sys.argv) > 1:
text_content = sys.argv[1]
summary = analyze_resume(text_content)
print(summary)
else:
print("Please provide text content as a command line argument.")
parser.print_help()
sys.exit(1)
response = analyze_resume(text_content)
summary = response.choices[0].message.content
# Print usage information
input_tokens = response.usage.prompt_tokens
output_tokens = response.usage.completion_tokens
total_tokens = response.usage.total_tokens
print(f"Summary: {summary}")
print(f"\n--- Usage Information ---")
print(f"Input tokens: {input_tokens}")
print(f"Output tokens: {output_tokens}")
print(f"Total tokens: {total_tokens}")
print(f"Cost: ${total_tokens * 0.000001:.6f}") # rough estimate
print("\n--- Summary from OpenAI ---")
print(f"Total tokens used: {total_tokens}")

12
package-lock.json generated
View File

@ -7,6 +7,7 @@
"dependencies": {
"@ai-sdk/google": "^1.1.17",
"ai": "^4.1.46",
"escape-string-regexp": "^5.0.0",
"zod": "^3.24.2"
}
},
@ -170,6 +171,17 @@
"resolved": "https://registry.npmjs.org/diff-match-patch/-/diff-match-patch-1.0.5.tgz",
"integrity": "sha512-IayShXAgj/QMXgB0IWmKx+rOPuGMhqm5w6jvFxmVenXKIzRqTAAsbBPT3kWQeGANj3jGgvcvv4yK6SxqYmikgw=="
},
"node_modules/escape-string-regexp": {
"version": "5.0.0",
"resolved": "https://registry.npmjs.org/escape-string-regexp/-/escape-string-regexp-5.0.0.tgz",
"integrity": "sha512-/veY75JbMK4j1yjvuUxuVsiS/hr/4iHs9FTT6cgTexxdE0Ly/glccBAkloH/DofkjRbZU3bnoj38mOmhkZ0lHw==",
"engines": {
"node": ">=12"
},
"funding": {
"url": "https://github.com/sponsors/sindresorhus"
}
},
"node_modules/eventsource-parser": {
"version": "3.0.0",
"resolved": "https://registry.npmjs.org/eventsource-parser/-/eventsource-parser-3.0.0.tgz",

View File

@ -2,6 +2,7 @@
"dependencies": {
"@ai-sdk/google": "^1.1.17",
"ai": "^4.1.46",
"escape-string-regexp": "^5.0.0",
"zod": "^3.24.2"
}
}

@ -1 +0,0 @@
Subproject commit daaad3bc174252c33eb6d185f8be21fe253cf887