|
import os |
|
import PyPDF2 |
|
from transformers import BertTokenizer, BertModel |
|
from transformers import LongformerModel, LongformerTokenizer |
|
from transformers import BigBirdModel, BigBirdTokenizer |
|
import numpy as np |
|
import gradio as gr |
|
from pathlib import Path |
|
import torch |
|
|
|
|
|
|
|
# Every architecture keeps its own tokenizer/model pair.  Binding all three
# pairs to the same two names would leave only the last pair reachable, and
# every embedding function would then silently run the same model.
bert_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
bert_model = BertModel.from_pretrained('bert-base-uncased')

bigbird_tokenizer = BigBirdTokenizer.from_pretrained('google/bigbird-roberta-base')
bigbird_model = BigBirdModel.from_pretrained('google/bigbird-roberta-base')

longformer_tokenizer = LongformerTokenizer.from_pretrained('allenai/longformer-base-4096')
longformer_model = LongformerModel.from_pretrained('allenai/longformer-base-4096')

# Backward-compatible generic names for any code that still refers to
# `tokenizer` / `model`; they point at the Longformer pair.
tokenizer = longformer_tokenizer
model = longformer_model


def _first_token_embedding(text_tokenizer, text_model, text, max_length):
    """Encode *text* and return the first-token vector of the last hidden state.

    Args:
        text_tokenizer: Tokenizer matching ``text_model``.
        text_model: Encoder whose output exposes ``last_hidden_state``.
        text: String (or batch of strings) to embed.
        max_length: Token limit; longer inputs are truncated to this length.

    Returns:
        NumPy array holding, for each input in the batch, the hidden vector at
        sequence position 0 (the batch dimension is kept).
    """
    inputs = text_tokenizer(
        text,
        return_tensors='pt',
        truncation=True,
        padding=True,
        max_length=max_length,
    )

    # Inference only: skip gradient tracking so `.numpy()` works directly and
    # no autograd graph is kept alive.
    with torch.no_grad():
        outputs = text_model(**inputs)

    return outputs.last_hidden_state[:, 0, :].numpy()


def get_longformer_embedding(text):
    """Return the Longformer first-token embedding of *text*.

    Inputs longer than 4096 tokens are truncated.
    """
    return _first_token_embedding(longformer_tokenizer, longformer_model, text, 4096)


def get_bigbird_embedding(text):
    """Return the BigBird first-token embedding of *text*.

    Inputs longer than 4096 tokens are truncated.
    """
    return _first_token_embedding(bigbird_tokenizer, bigbird_model, text, 4096)


def get_bert_embedding(text):
    """Return the BERT first-token embedding of *text*.

    Inputs longer than 512 tokens are truncated.
    """
    return _first_token_embedding(bert_tokenizer, bert_model, text, 512)
|
def process_folder(file):
    """Describe the contents of the folder that holds *file*.

    Args:
        file: Object whose ``name`` attribute is the path of a file.

    Returns:
        A string of the form ``"Files in folder: a, b, c"`` naming every entry
        of the containing directory, sorted by name.

    Raises:
        OSError: If the containing directory cannot be listed.
    """
    # A bare file name has an empty dirname; list the current directory then,
    # because os.listdir('') raises FileNotFoundError.
    folder_path = os.path.dirname(file.name) or os.curdir

    # os.listdir returns entries in arbitrary order; sort for a stable result.
    files = sorted(os.listdir(folder_path))

    return f"Files in folder: {', '.join(files)}"
|
|
|
|
|
def extract_text_from_pdf(pdf_file):
    """Return the concatenated text of every page in a PDF.

    Args:
        pdf_file: Location of the PDF.  Anything ``open()`` accepts (str,
            bytes, ``os.PathLike``, file descriptor) is used as is; any other
            object is expected to expose the on-disk path in its ``name``
            attribute (presumably the upload wrapper handed over by the UI
            layer - confirm against the installed Gradio version).

    Returns:
        The text of all pages joined in page order.  Pages for which the
        extractor yields nothing contribute an empty string.
    """
    pdf_path = pdf_file
    # Path objects also have a `.name` (the final component only), so only
    # fall back to `.name` for values that open() could not take directly.
    if not isinstance(pdf_file, (str, bytes, os.PathLike, int)):
        pdf_path = getattr(pdf_file, 'name', pdf_file)

    with open(pdf_path, 'rb') as file:
        reader = PyPDF2.PdfReader(file)
        # Extraction has to finish while the file is still open.
        return ''.join(page.extract_text() or '' for page in reader.pages)
|
|
|
def calculate_cosine(embedding1, embedding2):
    """Return the cosine similarity of two 1-D vectors.

    Args:
        embedding1: First vector (array-like of numbers).
        embedding2: Second vector, same length as ``embedding1``.

    Returns:
        ``dot(a, b) / (|a| * |b|)``, or ``0.0`` when either vector has zero
        length, since the angle is undefined there and the plain formula would
        yield NaN.
    """
    magnitude1 = np.linalg.norm(embedding1)
    magnitude2 = np.linalg.norm(embedding2)
    denominator = magnitude1 * magnitude2

    # Guard the 0/0 case instead of propagating NaN into the report.
    if denominator == 0:
        return 0.0

    return np.dot(embedding1, embedding2) / denominator
|
def _resolve_upload_path(upload):
    """Return the filesystem path behind an uploaded-file value."""
    # Path-like values are already usable; other objects are expected to carry
    # their on-disk location in `.name`.
    if isinstance(upload, (str, bytes, os.PathLike)):
        return upload
    return getattr(upload, 'name', upload)


def foo(files, JD):
    """Score every resume against the job description with each model.

    Args:
        files: Iterable of uploaded resume PDFs (paths, or objects exposing the
            path as ``.name``).  May be ``None`` or empty when nothing was
            uploaded.
        JD: Uploaded job-description PDF, same forms as the entries of
            ``files``.  May be ``None`` when nothing was uploaded.

    Returns:
        A text report with one section per resume listing the BERT, Longformer
        and BigBird cosine similarities (four decimals), or a short prompt
        when an upload is missing.
    """
    # Missing uploads would otherwise surface as a TypeError from iterating or
    # opening None.
    if JD is None:
        return "Please upload a job description."
    if not files:
        return "Please upload at least one resume."

    # Report label -> embedding function, in the order the report lists them.
    embedders = (
        ("Bert", get_bert_embedding),
        ("Longformer", get_longformer_embedding),
        ("BigBird", get_bigbird_embedding),
    )

    text_jd = extract_text_from_pdf(_resolve_upload_path(JD))
    # The job description is embedded once per model and reused for every resume.
    jd_embeddings = [embed(text_jd).flatten() for _, embed in embedders]

    sim = []
    for d in files:
        resume_path = _resolve_upload_path(d)
        text = extract_text_from_pdf(resume_path)

        section = [f"\nFile: {resume_path}\n"]
        for (label, embed), jd_embedding in zip(embedders, jd_embeddings):
            similarity = calculate_cosine(embed(text).flatten(), jd_embedding)
            section.append(f"{label} Similarity: {similarity:.4f}\n")
        sim.append("".join(section))

    return "\n".join(sim)
|
|
|
|
|
# Gradio front end: two upload widgets feed `foo`, and its text report is
# shown in a textbox once the button is clicked.
with gr.Blocks() as func:
    resume_upload = gr.File(file_count="multiple", label="Upload Resume Files")
    jd_upload = gr.File(label="Upload Job Description")
    inputs = [resume_upload, jd_upload]

    outputs = gr.Textbox(label="Similarity Scores")

    show = gr.Button(value="Calculate Similarity")
    show.click(fn=foo, inputs=inputs, outputs=outputs)

# Start the web server for the interface defined above.
func.launch()