Extended parsing script

This commit is contained in:
Ireneusz Bachanowicz 2025-03-02 01:40:05 +01:00
parent b27ba969d8
commit 43708d2e31
8 changed files with 195 additions and 1455 deletions

1358
.gitignore vendored

File diff suppressed because it is too large Load Diff

4
my-app/.gitignore vendored
View File

@ -1,4 +1,4 @@
# See https://help.github.com/articles/ignoring-files/ for more about ignoring files. # See https://help.github.com/articles/ignoring-files/ for more about ignoring files..next
# dependencies # dependencies
/node_modules /node_modules
@ -43,3 +43,5 @@ next-env.d.ts
# CV related files # CV related files
*.cv *.cv
*.resume *.resume
/uploads/*

View File

@ -2,10 +2,13 @@ import 'core-js/features/promise/with-resolvers'; // Polyfill for Promise.withRe
import { NextResponse } from "next/server"; import { NextResponse } from "next/server";
import fs from "fs"; import fs from "fs";
import path from "path"; import path from "path";
import * as pdfjsLib from 'pdfjs-dist';
import '../../../public/utils/pdf.worker.mjs';
import { v4 as uuidv4 } from 'uuid'; import { v4 as uuidv4 } from 'uuid';
import { fileTypeFromBuffer } from 'file-type'; import { fileTypeFromBuffer } from 'file-type';
import escapeStringRegexp from 'escape-string-regexp';
import { exec as childProcessExec } from 'child_process';
import { promisify } from 'util';
const exec = promisify(childProcessExec);
const uploadDir = path.join(process.cwd(), "uploads", "cv"); const uploadDir = path.join(process.cwd(), "uploads", "cv");
@ -14,41 +17,6 @@ if (!fs.existsSync(uploadDir)) {
fs.mkdirSync(uploadDir, { recursive: true }); fs.mkdirSync(uploadDir, { recursive: true });
} }
/**
 * Extracts the plain text of every page of a PDF file using pdf.js.
 *
 * Text items within a page are joined with a single space; pages are joined
 * with a newline so that the last word of one page does not run into the
 * first word of the next.
 *
 * @param pdfPath - Path of the PDF file to read.
 * @returns The concatenated text of all pages (empty string for a PDF with no text items).
 * @throws Error with the message "Error extracting text from PDF" when the
 *   file cannot be read or parsed. The underlying failure is logged before
 *   the error is rethrown.
 */
async function extractTextFromPdf(pdfPath: string): Promise<string> {
  console.log("Starting extractTextFromPdf for path:", pdfPath);
  try {
    // Non-blocking read: this runs inside a request handler, so avoid
    // stalling the event loop with readFileSync.
    const data = new Uint8Array(await fs.promises.readFile(pdfPath));
    const pdf = await pdfjsLib.getDocument({ data }).promise;
    console.log("PDF document loaded successfully. Number of pages:", pdf.numPages);

    const pageTexts: string[] = [];
    // pdf.js page numbers are 1-based.
    for (let pageNumber = 1; pageNumber <= pdf.numPages; pageNumber++) {
      const page = await pdf.getPage(pageNumber);
      const textContent = await page.getTextContent();
      const pageText = textContent.items
        .map((item: unknown) => {
          // Not every item carries a `str` value; those contribute an empty string.
          const str = (item as { str?: unknown }).str;
          return typeof str === "string" ? str : "";
        })
        .join(" ");
      pageTexts.push(pageText);
    }

    const fullText = pageTexts.join("\n");
    // Log only the size: the extracted text is CV content (personal data)
    // and must not be written to the server log.
    console.log("Text extraction completed successfully. Characters extracted:", fullText.length);
    return fullText;
  } catch (error: unknown) {
    console.error("Error extracting text from PDF:", error);
    throw new Error("Error extracting text from PDF");
  } finally {
    console.log("Finished extractTextFromPdf for path:", pdfPath);
  }
}
export async function POST(req: Request) { export async function POST(req: Request) {
console.log("Received request for CV file upload"); console.log("Received request for CV file upload");
@ -56,24 +24,20 @@ export async function POST(req: Request) {
const formData = await req.formData(); const formData = await req.formData();
const file: File | null = formData.get('cv') as unknown as File | null; const file: File | null = formData.get('cv') as unknown as File | null;
if (!file) { if (!file) {
console.warn("No file uploaded."); console.warn("No file uploaded.");
return NextResponse.json({ message: "No file uploaded." }, { status: 400 }); return NextResponse.json({ message: "No file uploaded." }, { status: 400 });
} }
const originalFilename = file.name; const originalFilename = file.name;
const uniqueFilename = `${uuidv4()}-${originalFilename}`; const uniqueFilename = `${uuidv4()}-${originalFilename}`;
const newFilePath = path.join(uploadDir, uniqueFilename); const newFilePath = path.join(uploadDir, uniqueFilename);
console.log(`Saving file to: ${newFilePath}`); console.log(`Saving file to: ${newFilePath}`);
const fileBuffer = await file.arrayBuffer(); const fileBuffer = await file.arrayBuffer();
const type = await fileTypeFromBuffer(Buffer.from(fileBuffer)); const type = await fileTypeFromBuffer(Buffer.from(fileBuffer));
console.log("Detected file type:", type); console.log("Detected file type:", type);
if (!type || type.mime !== 'application/pdf') { if (!type || type.mime !== 'application/pdf') {
return NextResponse.json({ message: "Unsupported file type detected. Only PDF files are allowed." }, { status: 400 }); return NextResponse.json({ message: "Unsupported file type detected. Only PDF files are allowed." }, { status: 400 });
} }
@ -81,35 +45,116 @@ export async function POST(req: Request) {
await fs.promises.writeFile(newFilePath, Buffer.from(fileBuffer)); await fs.promises.writeFile(newFilePath, Buffer.from(fileBuffer));
console.log("File uploaded and saved successfully!"); console.log("File uploaded and saved successfully!");
console.log("Before PDF parsing"); // Get the PDF file size
const extractedText = await extractTextFromPdf(newFilePath); const pdfFileSize = fs.statSync(newFilePath).size;
console.log("After PDF parsing"); console.log(`PDF file size: ${pdfFileSize} bytes`);
console.log("Before generating summary");
const command = `python3 utils/resume_analysis.py "${extractedText}"`; // Extract text from PDF using pdfminer.six
console.log("Executing python command:", command); let textContent = '';
console.log("Extracted Text being passed to python script:", extractedText); let extractedTextFilePath = '';
console.log("Length of extractedText:", extractedText.length); // Log length try {
const executionResult: { stdout: string, stderr: string } = await new Promise((resolve, reject) => { const extractTextCommand = `pdf2txt.py "${newFilePath}"`;
require('child_process').exec(command, (error: any, stdout: string, stderr: string) => { const { stdout, stderr } = await exec(extractTextCommand);
if (error) { textContent = stdout;
console.error("Python script execution error:", error);
console.error("Python script stderr:", stderr); // Create extracted text file path
reject({ error, stdout, stderr }); extractedTextFilePath = newFilePath.replace(/\.pdf$/i, ".txt");
} else {
console.log("Python script executed successfully"); // Write extracted text to file
console.log("Python script stdout:", stdout); fs.writeFileSync(extractedTextFilePath, textContent);
resolve({ stdout, stderr }); console.log(`Extracted text saved to: ${extractedTextFilePath}`);
} catch (error: any) {
console.error("Error extracting text from PDF:", error);
return NextResponse.json({ summary: "Error extracting text from PDF" }, { status: 500 });
}
// Execute the resume analysis script
const { spawn } = require('child_process');
const pythonProcess = spawn('python3', [path.join(process.cwd(), 'utils', 'resume_analysis.py'), "-f", extractedTextFilePath]);
let summary = '';
pythonProcess.stdout.on('data', (data: Buffer) => {
summary += data.toString();
});
pythonProcess.stderr.on('data', (data: Buffer) => {
console.error(`stderr: ${data}`);
});
let pythonProcessError = false;
let input_tokens = 0;
let output_tokens = 0;
let total_tokens = 0;
let cost = 0;
let rawOutput = "";
let openaiOutputFilePath = "";
pythonProcess.stdout.on('data', (data: Buffer) => {
const output = data.toString();
rawOutput += output;
});
pythonProcess.on('close', (code: number) => {
console.log(`child process exited with code ${code}`);
if (code !== 0) {
summary = "Error generating summary";
pythonProcessError = true;
} else {
summary = rawOutput.split("Summary: ")[1]?.split("\n--- Usage Information ---")[0] || "Error generating summary";
try {
input_tokens = parseInt(rawOutput.split("Input tokens: ")[1]?.split("\n")[0] || "0");
output_tokens = parseInt(rawOutput.split("Output tokens: ")[1]?.split("\n")[0] || "0");
total_tokens = parseInt(rawOutput.split("Total tokens: ")[1]?.split("\n")[0] || "0");
cost = parseFloat(rawOutput.split("Cost: $")[1]?.split("\n")[0] || "0");
// Create OpenAI output file path
openaiOutputFilePath = newFilePath.replace(/\.pdf$/i, "_openai.txt");
fs.writeFileSync(openaiOutputFilePath, rawOutput);
console.log(`OpenAI output saved to: ${openaiOutputFilePath}`);
} catch (e) {
console.error("Error parsing token information", e);
} }
}
console.log(`--- Usage Information ---`);
console.log(`Input tokens: ${input_tokens}`);
console.log(`Output tokens: ${output_tokens}`);
console.log(`Total tokens: ${total_tokens}`);
console.log(`Cost: $${cost}`);
});
pythonProcess.stderr.on('data', (data: Buffer) => {
console.error(`stderr: ${data}`);
});
pythonProcess.on('close', (code: number) => {
console.log(`child process exited with code ${code}`);
if (code !== 0) {
summary = "Error generating summary";
pythonProcessError = true;
}
console.log(`--- Usage Information ---`);
console.log(`Input tokens: ${input_tokens}`);
console.log(`Output tokens: ${output_tokens}`);
console.log(`Total tokens: ${total_tokens}`);
console.log(`Cost: $${cost}`);
});
// Add a timeout to the python process
const timeout = setTimeout(() => {
console.error("Python process timed out");
pythonProcess.kill();
summary = "Error generating summary: Timeout";
pythonProcessError = true;
}, 10000); // 10 seconds
return new Promise((resolve) => {
pythonProcess.on('close', (code: number) => {
clearTimeout(timeout);
resolve(NextResponse.json({ summary: summary }, { status: pythonProcessError ? 500 : 200 }));
}); });
}); });
const { stdout, stderr } = executionResult;
if (stderr) {
console.error("Error from python script (stderr):", stderr);
}
const summary: string = stdout.trim();
console.log("After generating summary");
return NextResponse.json({ summary: summary }, { status: 200 });
} catch (error: any) { } catch (error: any) {
console.error("Error during file processing:", error); console.error("Error during file processing:", error);

View File

@ -83,8 +83,8 @@ export default function Home() {
return ( return (
<div className="min-h-screen p-8 pb-20 gap-16 sm:p-20 font-[family-name:var(--font-geist-sans)] bg-gray-50"> <div className="min-h-screen p-8 pb-20 gap-16 sm:p-20 font-[family-name:var(--font-geist-sans)] bg-gray-50">
<main className="flex flex-col sm:flex-row gap-8 row-start-2 "> <main className="flex flex-col sm:flex-row gap-8 row-start-2 items-start">
<div className="flex flex-col gap-8 w-full sm:w-1/2 items-center sm:items-start"> <div className="flex flex-col gap-8 w-full sm:w-1/2 sm:items-start">
<h1 className="text-3xl font-bold text-gray-900">Welcome to Your CV Upgrade</h1> <h1 className="text-3xl font-bold text-gray-900">Welcome to Your CV Upgrade</h1>
<p className="text-lg text-center sm:text-left text-gray-700"> <p className="text-lg text-center sm:text-left text-gray-700">
This platform is designed to help you enhance your CV and showcase your skills effectively. This platform is designed to help you enhance your CV and showcase your skills effectively.
@ -104,8 +104,8 @@ export default function Home() {
</div> </div>
</div> </div>
<div className="flex flex-col items-center mt-8"> <div className="flex flex-col items-start mt-8">
<label className="mb-2 text-lg text-gray-800">Upload Your CV (PDF):</label> <h2 className="mb-2 text-2xl font-bold text-gray-900">Are you ready to pimp your CV?</h2>
<input <input
type="file" type="file"
accept=".pdf" accept=".pdf"
@ -113,35 +113,37 @@ export default function Home() {
className="hidden" className="hidden"
id="cv-upload" id="cv-upload"
/> />
<label htmlFor="cv-upload" className="inline-flex items-center justify-center px-4 py-2 border border-blue-500 rounded-md shadow-sm text-sm font-medium text-blue-700 bg-white hover:bg-blue-50 focus:outline-none focus:ring-2 focus:ring-offset-2 focus:ring-blue-500 cursor-pointer"> <div className="flex space-x-2">
<FaFileUpload className="mr-2" /> Upload CV <label htmlFor="cv-upload" className="inline-flex items-center justify-center px-4 py-2 border border-gray-500 rounded-md shadow-sm text-sm font-medium text-gray-700 bg-white hover:bg-gray-50 focus:outline-none focus:ring-2 focus:ring-offset-2 focus:ring-gray-500 cursor-pointer disabled:opacity-50">
</label> Upload CV
{file && <p className="mt-2 text-sm text-gray-600">Selected file: {file.name}</p>} </label>
<button <button
onClick={handleSubmit} onClick={handleSubmit}
className="mt-4 bg-blue-600 hover:bg-blue-700 text-white font-bold py-2 px-4 rounded focus:outline-none focus:shadow-outline disabled:opacity-50" className="inline-flex items-center justify-center px-4 py-2 border border-gray-500 rounded-md shadow-sm text-sm font-medium text-gray-700 bg-white hover:bg-gray-50 focus:outline-none focus:ring-2 focus:ring-offset-2 focus:ring-gray-500 cursor-pointer disabled:opacity-50"
disabled={loading} disabled={loading || !file}
> >
{loading ? "Summarizing..." : "Summarize CV"} {loading ? "Summarizing..." : "Summarize CV"}
</button> </button>
</div>
{file && <p className="mt-2 text-sm text-gray-700 font-normal flex items-center"><FaFileUpload className="mr-2 text-gray-600 text-2xl" /> Selected file: {file.name}</p>}
</div> </div>
</div> </div>
{/* Right Column - CV Summary Panel */} {/* Right Column - CV Summary Panel */}
<div className="w-full sm:w-1/2 sm:border-l sm:border-gray-200 sm:pl-8"> <div className="w-full sm:w-1/2 sm:border-l sm:border-gray-200 sm:pl-8">
<div className="p-6 bg-white rounded-md shadow-md"> <div className={`${isSummaryVisible ? 'block' : 'hidden'} p-6 rounded-md`}>
{loading ? ( {loading ? (
<div className="animate-pulse bg-gray-100 p-6"> <div className="animate-pulse bg-gray-100 p-6 transition-opacity duration-500" style={{ animation: 'pulse 2s cubic-bezier(0.4, 0, 0.6, 1) infinite' }}>
<div className="h-4 bg-gray-300 rounded-md mb-2"></div> <div className="h-4 bg-gray-300 rounded-md mb-2"/>
<div className="h-4 bg-gray-300 rounded-md mb-2"></div> <div className="h-4 bg-gray-300 rounded-md mb-2"/>
<div className="h-4 bg-gray-300 rounded-md"></div> <div className="h-4 bg-gray-300 rounded-md"/>
</div> </div>
) : ( ) : (
isSummaryVisible && summary && <CvSummaryPanel summary={summary} /> summary && <CvSummaryPanel summary={summary} />
)} )}
</div> </div>
</div> </div>
</main> </main>
<footer className=" flex flex-col items-center justify-center mt-16 p-4 border-t border-gray-200"> <footer className="flex flex-col items-center justify-center mt-16 p-4 border-t border-gray-200 absolute bottom-0 left-0 right-0 w-full">
<p className="text-center text-gray-500 text-sm mb-4"> <p className="text-center text-gray-500 text-sm mb-4">
This tool is inspired by and uses data from websites like{" "} This tool is inspired by and uses data from websites like{" "}
</p> </p>

View File

@ -1,9 +1,15 @@
#!/usr/bin/env python3 #!/usr/bin/env python3
import sys import sys
import os
import argparse
from dotenv import load_dotenv
from openai import OpenAI from openai import OpenAI
from pdfminer.high_level import extract_text from pdfminer.high_level import extract_text
client = OpenAI() # Load environment variables from .env file
load_dotenv()
client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
def analyze_resume(text): def analyze_resume(text):
response = client.chat.completions.create( response = client.chat.completions.create(
@ -12,14 +18,43 @@ def analyze_resume(text):
"role": "system", "role": "system",
"content": "Provide a concise summary of the resume, highlighting key skills and potential areas for improvement, in a few sentences." "content": "Provide a concise summary of the resume, highlighting key skills and potential areas for improvement, in a few sentences."
}, },
{"role": "user", "content": text}] {"role": "user", "content": text}],
max_tokens=200 # Add a max_tokens parameter to limit the output length
) )
return response.choices[0].message.content return response
if __name__ == "__main__": if __name__ == "__main__":
if len(sys.argv) > 1: parser = argparse.ArgumentParser(description="Analyze resume text using OpenAI.")
parser.add_argument("-f", "--file", help="Path to the file containing the resume text.")
args = parser.parse_args()
if args.file:
try:
with open(args.file, "r", encoding="latin-1") as f:
text_content = f.read()
except FileNotFoundError:
print(f"Error: File not found: {args.file}")
sys.exit(1)
elif len(sys.argv) > 1:
text_content = sys.argv[1] text_content = sys.argv[1]
summary = analyze_resume(text_content)
print(summary)
else: else:
print("Please provide text content as a command line argument.") parser.print_help()
sys.exit(1)
response = analyze_resume(text_content)
summary = response.choices[0].message.content
# Print usage information
input_tokens = response.usage.prompt_tokens
output_tokens = response.usage.completion_tokens
total_tokens = response.usage.total_tokens
print(f"Summary: {summary}")
print(f"\n--- Usage Information ---")
print(f"Input tokens: {input_tokens}")
print(f"Output tokens: {output_tokens}")
print(f"Total tokens: {total_tokens}")
print(f"Cost: ${total_tokens * 0.000001:.6f}") # rough estimate
print("\n--- Summary from OpenAI ---")
print(f"Total tokens used: {total_tokens}")

12
package-lock.json generated
View File

@ -7,6 +7,7 @@
"dependencies": { "dependencies": {
"@ai-sdk/google": "^1.1.17", "@ai-sdk/google": "^1.1.17",
"ai": "^4.1.46", "ai": "^4.1.46",
"escape-string-regexp": "^5.0.0",
"zod": "^3.24.2" "zod": "^3.24.2"
} }
}, },
@ -170,6 +171,17 @@
"resolved": "https://registry.npmjs.org/diff-match-patch/-/diff-match-patch-1.0.5.tgz", "resolved": "https://registry.npmjs.org/diff-match-patch/-/diff-match-patch-1.0.5.tgz",
"integrity": "sha512-IayShXAgj/QMXgB0IWmKx+rOPuGMhqm5w6jvFxmVenXKIzRqTAAsbBPT3kWQeGANj3jGgvcvv4yK6SxqYmikgw==" "integrity": "sha512-IayShXAgj/QMXgB0IWmKx+rOPuGMhqm5w6jvFxmVenXKIzRqTAAsbBPT3kWQeGANj3jGgvcvv4yK6SxqYmikgw=="
}, },
"node_modules/escape-string-regexp": {
"version": "5.0.0",
"resolved": "https://registry.npmjs.org/escape-string-regexp/-/escape-string-regexp-5.0.0.tgz",
"integrity": "sha512-/veY75JbMK4j1yjvuUxuVsiS/hr/4iHs9FTT6cgTexxdE0Ly/glccBAkloH/DofkjRbZU3bnoj38mOmhkZ0lHw==",
"engines": {
"node": ">=12"
},
"funding": {
"url": "https://github.com/sponsors/sindresorhus"
}
},
"node_modules/eventsource-parser": { "node_modules/eventsource-parser": {
"version": "3.0.0", "version": "3.0.0",
"resolved": "https://registry.npmjs.org/eventsource-parser/-/eventsource-parser-3.0.0.tgz", "resolved": "https://registry.npmjs.org/eventsource-parser/-/eventsource-parser-3.0.0.tgz",

View File

@ -2,6 +2,7 @@
"dependencies": { "dependencies": {
"@ai-sdk/google": "^1.1.17", "@ai-sdk/google": "^1.1.17",
"ai": "^4.1.46", "ai": "^4.1.46",
"escape-string-regexp": "^5.0.0",
"zod": "^3.24.2" "zod": "^3.24.2"
} }
} }

@ -1 +0,0 @@
Subproject commit daaad3bc174252c33eb6d185f8be21fe253cf887