Extended parsing script
This commit is contained in:
parent
b27ba969d8
commit
43708d2e31
1358
.gitignore
vendored
1358
.gitignore
vendored
File diff suppressed because it is too large
Load Diff
4
my-app/.gitignore
vendored
4
my-app/.gitignore
vendored
@ -1,4 +1,4 @@
|
||||
# See https://help.github.com/articles/ignoring-files/ for more about ignoring files.
|
||||
# See https://help.github.com/articles/ignoring-files/ for more about ignoring files..next
|
||||
|
||||
# dependencies
|
||||
/node_modules
|
||||
@ -43,3 +43,5 @@ next-env.d.ts
|
||||
# CV related files
|
||||
*.cv
|
||||
*.resume
|
||||
|
||||
/uploads/*
|
||||
@ -2,10 +2,13 @@ import 'core-js/features/promise/with-resolvers'; // Polyfill for Promise.withRe
|
||||
import { NextResponse } from "next/server";
|
||||
import fs from "fs";
|
||||
import path from "path";
|
||||
import * as pdfjsLib from 'pdfjs-dist';
|
||||
import '../../../public/utils/pdf.worker.mjs';
|
||||
import { v4 as uuidv4 } from 'uuid';
|
||||
import { fileTypeFromBuffer } from 'file-type';
|
||||
import escapeStringRegexp from 'escape-string-regexp';
|
||||
import { exec as childProcessExec } from 'child_process';
|
||||
import { promisify } from 'util';
|
||||
|
||||
const exec = promisify(childProcessExec);
|
||||
|
||||
const uploadDir = path.join(process.cwd(), "uploads", "cv");
|
||||
|
||||
@ -14,41 +17,6 @@ if (!fs.existsSync(uploadDir)) {
|
||||
fs.mkdirSync(uploadDir, { recursive: true });
|
||||
}
|
||||
|
||||
async function extractTextFromPdf(pdfPath: string): Promise<string> {
|
||||
console.log("Starting extractTextFromPdf for path:", pdfPath);
|
||||
try {
|
||||
console.log("Reading PDF file:", pdfPath);
|
||||
const data = new Uint8Array(fs.readFileSync(pdfPath));
|
||||
console.log("PDF file read successfully. Starting loading document...");
|
||||
const loadingTask = pdfjsLib.getDocument({ data });
|
||||
console.log("Loading task initiated. Waiting for promise...");
|
||||
const pdf = await loadingTask.promise; // Await the PDF loading
|
||||
console.log("PDF document loaded successfully. Number of pages:", pdf.numPages);
|
||||
let fullText = "";
|
||||
const processPages = async () => {
|
||||
for (let i = 1; i <= pdf.numPages; i++) {
|
||||
console.log("Processing page:", i);
|
||||
const page = await pdf.getPage(i);
|
||||
console.log("Page", i, "loaded. Getting text content...");
|
||||
const textContent = await page.getTextContent();
|
||||
console.log("Text content for page", i, "obtained. Processing items...");
|
||||
fullText += textContent.items.map((item: any) => item.str ? item.str : '').join(" ");
|
||||
console.log("Text from page", i, "added to fullText.");
|
||||
}
|
||||
console.log("Text extraction completed successfully.");
|
||||
console.log("Parsed PDF Text before return:", fullText); // Added log here
|
||||
return fullText;
|
||||
};
|
||||
return await processPages(); // Await the page processing
|
||||
} catch (error) {
|
||||
console.error("Error extracting text from PDF:", error);
|
||||
throw new Error("Error extracting text from PDF");
|
||||
} finally {
|
||||
console.log("Finished extractTextFromPdf for path:", pdfPath);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
export async function POST(req: Request) {
|
||||
console.log("Received request for CV file upload");
|
||||
|
||||
@ -56,24 +24,20 @@ export async function POST(req: Request) {
|
||||
const formData = await req.formData();
|
||||
const file: File | null = formData.get('cv') as unknown as File | null;
|
||||
|
||||
|
||||
if (!file) {
|
||||
console.warn("No file uploaded.");
|
||||
return NextResponse.json({ message: "No file uploaded." }, { status: 400 });
|
||||
}
|
||||
|
||||
|
||||
const originalFilename = file.name;
|
||||
const uniqueFilename = `${uuidv4()}-${originalFilename}`;
|
||||
const newFilePath = path.join(uploadDir, uniqueFilename);
|
||||
console.log(`Saving file to: ${newFilePath}`);
|
||||
|
||||
|
||||
const fileBuffer = await file.arrayBuffer();
|
||||
const type = await fileTypeFromBuffer(Buffer.from(fileBuffer));
|
||||
console.log("Detected file type:", type);
|
||||
|
||||
|
||||
if (!type || type.mime !== 'application/pdf') {
|
||||
return NextResponse.json({ message: "Unsupported file type detected. Only PDF files are allowed." }, { status: 400 });
|
||||
}
|
||||
@ -81,35 +45,116 @@ export async function POST(req: Request) {
|
||||
await fs.promises.writeFile(newFilePath, Buffer.from(fileBuffer));
|
||||
console.log("File uploaded and saved successfully!");
|
||||
|
||||
console.log("Before PDF parsing");
|
||||
const extractedText = await extractTextFromPdf(newFilePath);
|
||||
console.log("After PDF parsing");
|
||||
console.log("Before generating summary");
|
||||
const command = `python3 utils/resume_analysis.py "${extractedText}"`;
|
||||
console.log("Executing python command:", command);
|
||||
console.log("Extracted Text being passed to python script:", extractedText);
|
||||
console.log("Length of extractedText:", extractedText.length); // Log length
|
||||
const executionResult: { stdout: string, stderr: string } = await new Promise((resolve, reject) => {
|
||||
require('child_process').exec(command, (error: any, stdout: string, stderr: string) => {
|
||||
if (error) {
|
||||
console.error("Python script execution error:", error);
|
||||
console.error("Python script stderr:", stderr);
|
||||
reject({ error, stdout, stderr });
|
||||
} else {
|
||||
console.log("Python script executed successfully");
|
||||
console.log("Python script stdout:", stdout);
|
||||
resolve({ stdout, stderr });
|
||||
}
|
||||
});
|
||||
});
|
||||
const { stdout, stderr } = executionResult;
|
||||
// Get the PDF file size
|
||||
const pdfFileSize = fs.statSync(newFilePath).size;
|
||||
console.log(`PDF file size: ${pdfFileSize} bytes`);
|
||||
|
||||
if (stderr) {
|
||||
console.error("Error from python script (stderr):", stderr);
|
||||
// Extract text from PDF using pdfminer.six
|
||||
let textContent = '';
|
||||
let extractedTextFilePath = '';
|
||||
try {
|
||||
const extractTextCommand = `pdf2txt.py "${newFilePath}"`;
|
||||
const { stdout, stderr } = await exec(extractTextCommand);
|
||||
textContent = stdout;
|
||||
|
||||
// Create extracted text file path
|
||||
extractedTextFilePath = newFilePath.replace(/\.pdf$/i, ".txt");
|
||||
|
||||
// Write extracted text to file
|
||||
fs.writeFileSync(extractedTextFilePath, textContent);
|
||||
console.log(`Extracted text saved to: ${extractedTextFilePath}`);
|
||||
|
||||
} catch (error: any) {
|
||||
console.error("Error extracting text from PDF:", error);
|
||||
return NextResponse.json({ summary: "Error extracting text from PDF" }, { status: 500 });
|
||||
}
|
||||
const summary: string = stdout.trim();
|
||||
console.log("After generating summary");
|
||||
return NextResponse.json({ summary: summary }, { status: 200 });
|
||||
|
||||
// Execute the resume analysis script
|
||||
const { spawn } = require('child_process');
|
||||
const pythonProcess = spawn('python3', [path.join(process.cwd(), 'utils', 'resume_analysis.py'), "-f", extractedTextFilePath]);
|
||||
|
||||
let summary = '';
|
||||
pythonProcess.stdout.on('data', (data: Buffer) => {
|
||||
summary += data.toString();
|
||||
});
|
||||
|
||||
pythonProcess.stderr.on('data', (data: Buffer) => {
|
||||
console.error(`stderr: ${data}`);
|
||||
});
|
||||
|
||||
let pythonProcessError = false;
|
||||
let input_tokens = 0;
|
||||
let output_tokens = 0;
|
||||
let total_tokens = 0;
|
||||
let cost = 0;
|
||||
let rawOutput = "";
|
||||
let openaiOutputFilePath = "";
|
||||
|
||||
pythonProcess.stdout.on('data', (data: Buffer) => {
|
||||
const output = data.toString();
|
||||
rawOutput += output;
|
||||
});
|
||||
|
||||
pythonProcess.on('close', (code: number) => {
|
||||
console.log(`child process exited with code ${code}`);
|
||||
if (code !== 0) {
|
||||
summary = "Error generating summary";
|
||||
pythonProcessError = true;
|
||||
} else {
|
||||
summary = rawOutput.split("Summary: ")[1]?.split("\n--- Usage Information ---")[0] || "Error generating summary";
|
||||
try {
|
||||
input_tokens = parseInt(rawOutput.split("Input tokens: ")[1]?.split("\n")[0] || "0");
|
||||
output_tokens = parseInt(rawOutput.split("Output tokens: ")[1]?.split("\n")[0] || "0");
|
||||
total_tokens = parseInt(rawOutput.split("Total tokens: ")[1]?.split("\n")[0] || "0");
|
||||
cost = parseFloat(rawOutput.split("Cost: $")[1]?.split("\n")[0] || "0");
|
||||
|
||||
// Create OpenAI output file path
|
||||
openaiOutputFilePath = newFilePath.replace(/\.pdf$/i, "_openai.txt");
|
||||
fs.writeFileSync(openaiOutputFilePath, rawOutput);
|
||||
console.log(`OpenAI output saved to: ${openaiOutputFilePath}`);
|
||||
|
||||
} catch (e) {
|
||||
console.error("Error parsing token information", e);
|
||||
}
|
||||
}
|
||||
console.log(`--- Usage Information ---`);
|
||||
console.log(`Input tokens: ${input_tokens}`);
|
||||
console.log(`Output tokens: ${output_tokens}`);
|
||||
console.log(`Total tokens: ${total_tokens}`);
|
||||
console.log(`Cost: $${cost}`);
|
||||
});
|
||||
|
||||
pythonProcess.stderr.on('data', (data: Buffer) => {
|
||||
console.error(`stderr: ${data}`);
|
||||
});
|
||||
|
||||
pythonProcess.on('close', (code: number) => {
|
||||
console.log(`child process exited with code ${code}`);
|
||||
if (code !== 0) {
|
||||
summary = "Error generating summary";
|
||||
pythonProcessError = true;
|
||||
}
|
||||
console.log(`--- Usage Information ---`);
|
||||
console.log(`Input tokens: ${input_tokens}`);
|
||||
console.log(`Output tokens: ${output_tokens}`);
|
||||
console.log(`Total tokens: ${total_tokens}`);
|
||||
console.log(`Cost: $${cost}`);
|
||||
});
|
||||
|
||||
// Add a timeout to the python process
|
||||
const timeout = setTimeout(() => {
|
||||
console.error("Python process timed out");
|
||||
pythonProcess.kill();
|
||||
summary = "Error generating summary: Timeout";
|
||||
pythonProcessError = true;
|
||||
}, 10000); // 10 seconds
|
||||
|
||||
return new Promise((resolve) => {
|
||||
pythonProcess.on('close', (code: number) => {
|
||||
clearTimeout(timeout);
|
||||
resolve(NextResponse.json({ summary: summary }, { status: pythonProcessError ? 500 : 200 }));
|
||||
});
|
||||
});
|
||||
|
||||
} catch (error: any) {
|
||||
console.error("Error during file processing:", error);
|
||||
|
||||
@ -83,8 +83,8 @@ export default function Home() {
|
||||
|
||||
return (
|
||||
<div className="min-h-screen p-8 pb-20 gap-16 sm:p-20 font-[family-name:var(--font-geist-sans)] bg-gray-50">
|
||||
<main className="flex flex-col sm:flex-row gap-8 row-start-2 ">
|
||||
<div className="flex flex-col gap-8 w-full sm:w-1/2 items-center sm:items-start">
|
||||
<main className="flex flex-col sm:flex-row gap-8 row-start-2 items-start">
|
||||
<div className="flex flex-col gap-8 w-full sm:w-1/2 sm:items-start">
|
||||
<h1 className="text-3xl font-bold text-gray-900">Welcome to Your CV Upgrade</h1>
|
||||
<p className="text-lg text-center sm:text-left text-gray-700">
|
||||
This platform is designed to help you enhance your CV and showcase your skills effectively.
|
||||
@ -104,8 +104,8 @@ export default function Home() {
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<div className="flex flex-col items-center mt-8">
|
||||
<label className="mb-2 text-lg text-gray-800">Upload Your CV (PDF):</label>
|
||||
<div className="flex flex-col items-start mt-8">
|
||||
<h2 className="mb-2 text-2xl font-bold text-gray-900">Are you ready to pimp your CV?</h2>
|
||||
<input
|
||||
type="file"
|
||||
accept=".pdf"
|
||||
@ -113,35 +113,37 @@ export default function Home() {
|
||||
className="hidden"
|
||||
id="cv-upload"
|
||||
/>
|
||||
<label htmlFor="cv-upload" className="inline-flex items-center justify-center px-4 py-2 border border-blue-500 rounded-md shadow-sm text-sm font-medium text-blue-700 bg-white hover:bg-blue-50 focus:outline-none focus:ring-2 focus:ring-offset-2 focus:ring-blue-500 cursor-pointer">
|
||||
<FaFileUpload className="mr-2" /> Upload CV
|
||||
<div className="flex space-x-2">
|
||||
<label htmlFor="cv-upload" className="inline-flex items-center justify-center px-4 py-2 border border-gray-500 rounded-md shadow-sm text-sm font-medium text-gray-700 bg-white hover:bg-gray-50 focus:outline-none focus:ring-2 focus:ring-offset-2 focus:ring-gray-500 cursor-pointer disabled:opacity-50">
|
||||
Upload CV
|
||||
</label>
|
||||
{file && <p className="mt-2 text-sm text-gray-600">Selected file: {file.name}</p>}
|
||||
<button
|
||||
onClick={handleSubmit}
|
||||
className="mt-4 bg-blue-600 hover:bg-blue-700 text-white font-bold py-2 px-4 rounded focus:outline-none focus:shadow-outline disabled:opacity-50"
|
||||
disabled={loading}
|
||||
className="inline-flex items-center justify-center px-4 py-2 border border-gray-500 rounded-md shadow-sm text-sm font-medium text-gray-700 bg-white hover:bg-gray-50 focus:outline-none focus:ring-2 focus:ring-offset-2 focus:ring-gray-500 cursor-pointer disabled:opacity-50"
|
||||
disabled={loading || !file}
|
||||
>
|
||||
{loading ? "Summarizing..." : "Summarize CV"}
|
||||
</button>
|
||||
</div>
|
||||
{file && <p className="mt-2 text-sm text-gray-700 font-normal flex items-center"><FaFileUpload className="mr-2 text-gray-600 text-2xl" /> Selected file: {file.name}</p>}
|
||||
</div>
|
||||
</div>
|
||||
{/* Right Column - CV Summary Panel */}
|
||||
<div className="w-full sm:w-1/2 sm:border-l sm:border-gray-200 sm:pl-8">
|
||||
<div className="p-6 bg-white rounded-md shadow-md">
|
||||
<div className={`${isSummaryVisible ? 'block' : 'hidden'} p-6 rounded-md`}>
|
||||
{loading ? (
|
||||
<div className="animate-pulse bg-gray-100 p-6">
|
||||
<div className="h-4 bg-gray-300 rounded-md mb-2"></div>
|
||||
<div className="h-4 bg-gray-300 rounded-md mb-2"></div>
|
||||
<div className="h-4 bg-gray-300 rounded-md"></div>
|
||||
<div className="animate-pulse bg-gray-100 p-6 transition-opacity duration-500" style={{ animation: 'pulse 2s cubic-bezier(0.4, 0, 0.6, 1) infinite' }}>
|
||||
<div className="h-4 bg-gray-300 rounded-md mb-2"/>
|
||||
<div className="h-4 bg-gray-300 rounded-md mb-2"/>
|
||||
<div className="h-4 bg-gray-300 rounded-md"/>
|
||||
</div>
|
||||
) : (
|
||||
isSummaryVisible && summary && <CvSummaryPanel summary={summary} />
|
||||
summary && <CvSummaryPanel summary={summary} />
|
||||
)}
|
||||
</div>
|
||||
</div>
|
||||
</main>
|
||||
<footer className=" flex flex-col items-center justify-center mt-16 p-4 border-t border-gray-200">
|
||||
<footer className="flex flex-col items-center justify-center mt-16 p-4 border-t border-gray-200 absolute bottom-0 left-0 right-0 w-full">
|
||||
<p className="text-center text-gray-500 text-sm mb-4">
|
||||
This tool is inspired by and uses data from websites like{" "}
|
||||
</p>
|
||||
|
||||
@ -1,9 +1,15 @@
|
||||
#!/usr/bin/env python3
|
||||
import sys
|
||||
import os
|
||||
import argparse
|
||||
from dotenv import load_dotenv
|
||||
from openai import OpenAI
|
||||
from pdfminer.high_level import extract_text
|
||||
|
||||
client = OpenAI()
|
||||
# Load environment variables from .env file
|
||||
load_dotenv()
|
||||
|
||||
client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
|
||||
|
||||
def analyze_resume(text):
|
||||
response = client.chat.completions.create(
|
||||
@ -12,14 +18,43 @@ def analyze_resume(text):
|
||||
"role": "system",
|
||||
"content": "Provide a concise summary of the resume, highlighting key skills and potential areas for improvement, in a few sentences."
|
||||
},
|
||||
{"role": "user", "content": text}]
|
||||
{"role": "user", "content": text}],
|
||||
max_tokens=200 # Add a max_tokens parameter to limit the output length
|
||||
)
|
||||
return response.choices[0].message.content
|
||||
return response
|
||||
|
||||
if __name__ == "__main__":
|
||||
if len(sys.argv) > 1:
|
||||
parser = argparse.ArgumentParser(description="Analyze resume text using OpenAI.")
|
||||
parser.add_argument("-f", "--file", help="Path to the file containing the resume text.")
|
||||
args = parser.parse_args()
|
||||
|
||||
if args.file:
|
||||
try:
|
||||
with open(args.file, "r", encoding="latin-1") as f:
|
||||
text_content = f.read()
|
||||
except FileNotFoundError:
|
||||
print(f"Error: File not found: {args.file}")
|
||||
sys.exit(1)
|
||||
elif len(sys.argv) > 1:
|
||||
text_content = sys.argv[1]
|
||||
summary = analyze_resume(text_content)
|
||||
print(summary)
|
||||
else:
|
||||
print("Please provide text content as a command line argument.")
|
||||
parser.print_help()
|
||||
sys.exit(1)
|
||||
|
||||
response = analyze_resume(text_content)
|
||||
summary = response.choices[0].message.content
|
||||
|
||||
# Print usage information
|
||||
input_tokens = response.usage.prompt_tokens
|
||||
output_tokens = response.usage.completion_tokens
|
||||
total_tokens = response.usage.total_tokens
|
||||
|
||||
print(f"Summary: {summary}")
|
||||
print(f"\n--- Usage Information ---")
|
||||
print(f"Input tokens: {input_tokens}")
|
||||
print(f"Output tokens: {output_tokens}")
|
||||
print(f"Total tokens: {total_tokens}")
|
||||
print(f"Cost: ${total_tokens * 0.000001:.6f}") # rough estimate
|
||||
|
||||
print("\n--- Summary from OpenAI ---")
|
||||
print(f"Total tokens used: {total_tokens}")
|
||||
|
||||
12
package-lock.json
generated
12
package-lock.json
generated
@ -7,6 +7,7 @@
|
||||
"dependencies": {
|
||||
"@ai-sdk/google": "^1.1.17",
|
||||
"ai": "^4.1.46",
|
||||
"escape-string-regexp": "^5.0.0",
|
||||
"zod": "^3.24.2"
|
||||
}
|
||||
},
|
||||
@ -170,6 +171,17 @@
|
||||
"resolved": "https://registry.npmjs.org/diff-match-patch/-/diff-match-patch-1.0.5.tgz",
|
||||
"integrity": "sha512-IayShXAgj/QMXgB0IWmKx+rOPuGMhqm5w6jvFxmVenXKIzRqTAAsbBPT3kWQeGANj3jGgvcvv4yK6SxqYmikgw=="
|
||||
},
|
||||
"node_modules/escape-string-regexp": {
|
||||
"version": "5.0.0",
|
||||
"resolved": "https://registry.npmjs.org/escape-string-regexp/-/escape-string-regexp-5.0.0.tgz",
|
||||
"integrity": "sha512-/veY75JbMK4j1yjvuUxuVsiS/hr/4iHs9FTT6cgTexxdE0Ly/glccBAkloH/DofkjRbZU3bnoj38mOmhkZ0lHw==",
|
||||
"engines": {
|
||||
"node": ">=12"
|
||||
},
|
||||
"funding": {
|
||||
"url": "https://github.com/sponsors/sindresorhus"
|
||||
}
|
||||
},
|
||||
"node_modules/eventsource-parser": {
|
||||
"version": "3.0.0",
|
||||
"resolved": "https://registry.npmjs.org/eventsource-parser/-/eventsource-parser-3.0.0.tgz",
|
||||
|
||||
@ -2,6 +2,7 @@
|
||||
"dependencies": {
|
||||
"@ai-sdk/google": "^1.1.17",
|
||||
"ai": "^4.1.46",
|
||||
"escape-string-regexp": "^5.0.0",
|
||||
"zod": "^3.24.2"
|
||||
}
|
||||
}
|
||||
|
||||
@ -1 +0,0 @@
|
||||
Subproject commit daaad3bc174252c33eb6d185f8be21fe253cf887
|
||||
Loading…
x
Reference in New Issue
Block a user