# ahmadfareedsukhera's picture
# Update app.py
# 6b9950a verified
import os
import PyPDF2
from transformers import BertTokenizer, BertModel
from transformers import LongformerModel, LongformerTokenizer
from transformers import BigBirdModel, BigBirdTokenizer
import numpy as np
import gradio as gr
from pathlib import Path
import torch
# Every checkpoint gets its own tokenizer/model pair. Binding all of them to
# the same two names would leave only the last-loaded checkpoint reachable,
# and every embedding function would then silently run that one model.

# Load BERT tokenizer and model
bert_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
bert_model = BertModel.from_pretrained('bert-base-uncased')

# Load the BigBird model and tokenizer
bigbird_tokenizer = BigBirdTokenizer.from_pretrained('google/bigbird-roberta-base')
bigbird_model = BigBirdModel.from_pretrained('google/bigbird-roberta-base')

# Load the Longformer model and tokenizer
longformer_tokenizer = LongformerTokenizer.from_pretrained('allenai/longformer-base-4096')
longformer_model = LongformerModel.from_pretrained('allenai/longformer-base-4096')

# Backward-compatible aliases: the generic module-level names keep pointing
# at the Longformer pair for any code that still reads them.
tokenizer = longformer_tokenizer
model = longformer_model


def _first_token_embedding(text, text_tokenizer, text_model, max_length):
    """Encode ``text`` and return the hidden state of its first token.

    Args:
        text: The text to embed.
        text_tokenizer: Tokenizer matching ``text_model``.
        text_model: Model whose output exposes ``last_hidden_state``.
        max_length: Token limit; longer inputs are truncated to it.

    Returns:
        A NumPy array holding ``last_hidden_state[:, 0, :]``, i.e. the
        first-position vector for each sequence in the batch.
    """
    inputs = text_tokenizer(
        text,
        return_tensors='pt',
        truncation=True,
        padding=True,
        max_length=max_length,
    )
    # Inference only: skip gradient tracking so the result can be converted
    # to NumPy directly.
    with torch.no_grad():
        outputs = text_model(**inputs)
    # The first position is the [CLS]/<s> token, used as the aggregate
    # representation of the whole input.
    return outputs.last_hidden_state[:, 0, :].numpy()


def get_longformer_embedding(text):
    """Return the Longformer first-token embedding of ``text`` (up to 4096 tokens)."""
    return _first_token_embedding(text, longformer_tokenizer, longformer_model, 4096)


def get_bigbird_embedding(text):
    """Return the BigBird first-token embedding of ``text`` (up to 4096 tokens)."""
    return _first_token_embedding(text, bigbird_tokenizer, bigbird_model, 4096)


def get_bert_embedding(text):
    """Return the BERT first-token embedding of ``text`` (up to 512 tokens)."""
    return _first_token_embedding(text, bert_tokenizer, bert_model, 512)
def process_folder(file):
    """List the entries of the directory that contains ``file``.

    Args:
        file: A path (``str`` or ``os.PathLike``), or an object whose
            ``name`` attribute holds the path, such as an upload handle.

    Returns:
        A string of the form ``"Files in folder: a, b"`` with the entry
        names in sorted order.

    Raises:
        OSError: If the directory cannot be listed.
    """
    if isinstance(file, (str, os.PathLike)):
        path = os.fspath(file)
    else:
        path = file.name
    # A bare filename has an empty dirname; fall back to the current
    # directory instead of letting os.listdir('') fail.
    folder_path = os.path.dirname(path) or os.curdir
    # os.listdir returns entries in arbitrary order; sort for stable output.
    files = sorted(os.listdir(folder_path))
    return f"Files in folder: {', '.join(files)}"
# Function to extract text from a PDF
def extract_text_from_pdf(pdf_file):
    """Return the concatenated text of every page in a PDF.

    Args:
        pdf_file: A filesystem path (``str``, ``bytes`` or ``os.PathLike``),
            or an object whose ``name`` attribute holds the path, such as an
            upload handle.

    Returns:
        The text of all pages joined in page order. Pages for which
        ``extract_text()`` yields nothing contribute an empty string.
    """
    # Real paths are used as-is. This check comes first because
    # pathlib.Path also has a ``name`` attribute, but it is only the final
    # path component.
    if isinstance(pdf_file, (str, bytes, os.PathLike)):
        path = pdf_file
    else:
        path = pdf_file.name
    with open(path, 'rb') as file:
        reader = PyPDF2.PdfReader(file)
        # Join while the file is still open: pages are read lazily from it.
        return ''.join(page.extract_text() or '' for page in reader.pages)
def calculate_cosine(embedding1, embedding2):
    """Return the cosine similarity of two 1-D embedding vectors.

    Args:
        embedding1: First vector (array-like of numbers).
        embedding2: Second vector, same length as ``embedding1``.

    Returns:
        ``dot(e1, e2) / (|e1| * |e2|)``, or ``0.0`` when either vector has
        zero magnitude, where the cosine is undefined.
    """
    dot_product = np.dot(embedding1, embedding2)
    denominator = np.linalg.norm(embedding1) * np.linalg.norm(embedding2)
    # Guard against division by zero, which would otherwise produce NaN and
    # a RuntimeWarning for an all-zero embedding.
    if denominator == 0:
        return 0.0
    return dot_product / denominator
def foo(files, JD):
    """Score each uploaded resume against the job description.

    For every resume, the text is embedded with each model and compared to
    the job description's embedding from the same model via cosine
    similarity.

    Args:
        files: Uploaded resume PDFs (a list), or ``None``/empty when nothing
            was uploaded.
        JD: The uploaded job-description PDF, or ``None`` when missing.

    Returns:
        One text report per resume, joined with newlines, or a short prompt
        asking for the missing upload.
    """
    # The click handler receives None for empty upload slots; answer with a
    # prompt instead of failing while iterating or opening None.
    if JD is None:
        return "Please upload a job description."
    if not files:
        return "Please upload at least one resume file."

    # (report label, embedding function) pairs, in reporting order.
    embedders = (
        ("Bert", get_bert_embedding),
        ("Longformer", get_longformer_embedding),
        ("BigBird", get_bigbird_embedding),
    )

    # The job description is embedded once per model and reused for every
    # resume. flatten() turns each embedding into a 1-D vector.
    text_jd = extract_text_from_pdf(JD)
    jd_embeddings = [embed(text_jd).flatten() for _, embed in embedders]

    sim = []
    for d in files:
        text = extract_text_from_pdf(d)
        # Uploads usually carry their path in ``name``; plain paths are
        # shown as they are.
        report = [f"\nFile: {getattr(d, 'name', d)}\n"]
        for (label, embed), jd_embedding in zip(embedders, jd_embeddings):
            similarity = calculate_cosine(embed(text).flatten(), jd_embedding)
            report.append(f"{label} Similarity: {similarity:.4f}\n")
        sim.append("".join(report))
    return "\n".join(sim)  # Join the list into a single string for Gradio output
# Build the UI: two upload slots, a result box, and a button wired to foo.
with gr.Blocks() as func:
    resume_upload = gr.File(file_count="multiple", label="Upload Resume Files")
    jd_upload = gr.File(label="Upload Job Description")
    # Order matches foo's parameters: (files, JD).
    inputs = [resume_upload, jd_upload]
    outputs = gr.Textbox(label="Similarity Scores")
    show = gr.Button(value="Calculate Similarity")
    show.click(fn=foo, inputs=inputs, outputs=outputs)

# Start the web server for the interface.
func.launch()