Spaces:

ahmadfareedsukhera
/

Cosine-Similarity

Sleeping

App Files Files Community

Cosine-Similarity / app.py

ahmadfareedsukhera

Update app.py

6b9950a verified 3 months ago

raw

history blame contribute delete

4.88 kB

	import os
	import PyPDF2
	from transformers import BertTokenizer, BertModel
	from transformers import LongformerModel, LongformerTokenizer
	from transformers import BigBirdModel, BigBirdTokenizer
	import numpy as np
	import gradio as gr
	from pathlib import Path
	import torch


	# Each model keeps its own tokenizer/model pair.  Binding all three to the
	# same `tokenizer` / `model` names leaves only the last one loaded reachable,
	# which made every "per-model" embedding below come from Longformer.
	# NOTE: all three models now stay resident in memory at the same time.

	# Load BERT tokenizer and model
	bert_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
	bert_model = BertModel.from_pretrained('bert-base-uncased')

	# Load the BigBird model and tokenizer
	bigbird_tokenizer = BigBirdTokenizer.from_pretrained('google/bigbird-roberta-base')
	bigbird_model = BigBirdModel.from_pretrained('google/bigbird-roberta-base')

	# Load the Longformer model and tokenizer
	longformer_tokenizer = LongformerTokenizer.from_pretrained('allenai/longformer-base-4096')
	longformer_model = LongformerModel.from_pretrained('allenai/longformer-base-4096')

	# Backward-compatible aliases: the generic module-level names used to end up
	# bound to the Longformer pair, so keep them pointing there.
	tokenizer = longformer_tokenizer
	model = longformer_model


	def _first_token_embedding(text, text_tokenizer, text_model, max_length):
	    """Encode *text* and return the hidden state of its first token.

	    Args:
	        text: The string to embed.
	        text_tokenizer: Tokenizer matching ``text_model``.
	        text_model: Encoder whose output exposes ``last_hidden_state``.
	        max_length: Token limit; longer inputs are truncated to this length.

	    Returns:
	        A 2-D NumPy array holding, for each encoded sequence, the final-layer
	        hidden state at position 0 (the [CLS]-style token).
	    """
	    encoded = text_tokenizer(
	        text,
	        return_tensors='pt',
	        truncation=True,
	        padding=True,
	        max_length=max_length,
	    )

	    # Inference only: skip gradient tracking so `.numpy()` works directly.
	    with torch.no_grad():
	        result = text_model(**encoded)

	    # Use the first token's embedding as the aggregate representation.
	    return result.last_hidden_state[:, 0, :].numpy()


	def get_longformer_embedding(text):
	    """Return the first-token embedding of *text* from Longformer.

	    Input is truncated to 4096 tokens.
	    """
	    return _first_token_embedding(
	        text, longformer_tokenizer, longformer_model, max_length=4096
	    )


	def get_bigbird_embedding(text):
	    """Return the first-token embedding of *text* from BigBird.

	    Input is truncated to 4096 tokens.
	    """
	    return _first_token_embedding(
	        text, bigbird_tokenizer, bigbird_model, max_length=4096
	    )


	def get_bert_embedding(text):
	    """Return the [CLS] embedding of *text* from BERT.

	    Input is truncated to 512 tokens.
	    """
	    return _first_token_embedding(
	        text, bert_tokenizer, bert_model, max_length=512
	    )
	def process_folder(file):
	    """List the entries of the directory that contains an uploaded file.

	    Args:
	        file: Either a path (``str`` or ``os.PathLike``) or an object whose
	            ``name`` attribute holds the path of the selected file.

	    Returns:
	        A string of the form ``"Files in folder: a, b, c"`` with the entry
	        names sorted alphabetically.

	    Raises:
	        OSError: If the containing directory cannot be listed.
	    """
	    if isinstance(file, os.PathLike):
	        selected_path = os.fspath(file)
	    else:
	        # Upload wrappers expose the path as `.name`; a plain string is
	        # taken to be the path itself.
	        selected_path = getattr(file, "name", file)

	    # A bare filename has an empty dirname; os.listdir("") would raise, so
	    # fall back to the current directory.
	    folder_path = os.path.dirname(selected_path) or "."

	    # os.listdir returns entries in arbitrary order; sort for stable output.
	    files = sorted(os.listdir(folder_path))
	    return f"Files in folder: {', '.join(files)}"

	# Function to extract text from a PDF
	def extract_text_from_pdf(pdf_file):
	    """Return the concatenated text of every page of a PDF on disk.

	    Args:
	        pdf_file: Path of the PDF; it is opened in binary mode.

	    Returns:
	        The page texts joined in page order with no separator.  A page for
	        which the reader yields no text contributes an empty string.
	    """
	    with open(pdf_file, 'rb') as handle:
	        pdf = PyPDF2.PdfReader(handle)
	        # Extract while the file is still open: pages are read from it.
	        page_texts = [page.extract_text() or '' for page in pdf.pages]
	    return ''.join(page_texts)

	def calculate_cosine(embedding1, embedding2):
	    """Return the cosine similarity of two embedding vectors.

	    Both inputs are flattened first, so a ``(1, hidden)`` array is accepted
	    as well as a 1-D vector.

	    Args:
	        embedding1: Array-like of numbers.
	        embedding2: Array-like of numbers with the same number of elements.

	    Returns:
	        The similarity as a Python ``float``.  If either vector has zero
	        magnitude the similarity is undefined and ``0.0`` is returned instead
	        of dividing by zero.

	    Raises:
	        ValueError: If the flattened inputs differ in length.
	    """
	    vec1 = np.asarray(embedding1).ravel()
	    vec2 = np.asarray(embedding2).ravel()

	    denominator = np.linalg.norm(vec1) * np.linalg.norm(vec2)
	    if denominator == 0:
	        # Avoid NaN (and a RuntimeWarning) for all-zero embeddings.
	        return 0.0

	    return float(np.dot(vec1, vec2) / denominator)
	def _upload_path(upload):
	    """Return the filesystem path behind an uploaded-file value."""
	    if isinstance(upload, os.PathLike):
	        return os.fspath(upload)
	    # Upload wrappers expose the path as `.name`; a plain string is taken
	    # to be the path itself.
	    return getattr(upload, "name", upload)


	def foo(files, JD):
	    """Score each resume PDF against a job-description PDF.

	    For every resume, the text is embedded with BERT, Longformer and BigBird
	    and compared (cosine similarity) with the job description's embedding
	    from the same model.

	    Args:
	        files: Iterable of uploaded resume files (paths or objects with a
	            ``name`` path attribute).  May be ``None`` or empty.
	        JD: The uploaded job-description file, or ``None``.

	    Returns:
	        A single string with one report per resume, suitable for a text box.
	        If no resumes or no job description were supplied, a short message
	        asking for them is returned instead.
	    """
	    # Nothing uploaded yet: report it instead of failing on None.
	    if not files or JD is None:
	        return "Please upload at least one resume and a job description."

	    # (label shown in the report, embedding function), in report order.
	    scorers = (
	        ("Bert", get_bert_embedding),
	        ("Longformer", get_longformer_embedding),
	        ("BigBird", get_bigbird_embedding),
	    )

	    # Embed the job description once per model; it is the same for all resumes.
	    jd_text = extract_text_from_pdf(_upload_path(JD))
	    jd_embeddings = {
	        label: embed(jd_text).flatten()  # flatten so both sides are 1-D
	        for label, embed in scorers
	    }

	    reports = []
	    for resume in files:
	        resume_path = _upload_path(resume)
	        resume_text = extract_text_from_pdf(resume_path)

	        report_lines = [f"\nFile: {resume_path}\n"]
	        for label, embed in scorers:
	            score = calculate_cosine(
	                embed(resume_text).flatten(), jd_embeddings[label]
	            )
	            report_lines.append(f"{label} Similarity: {score:.4f}\n")
	        reports.append("".join(report_lines))

	    # Join the per-resume reports into a single string for Gradio output.
	    return "\n".join(reports)


	# Build the UI: two upload widgets, a results box, and a trigger button.
	with gr.Blocks() as func:
	    resume_upload = gr.File(file_count="multiple", label="Upload Resume Files")
	    jd_upload = gr.File(label="Upload Job Description")
	    inputs = [resume_upload, jd_upload]
	    outputs = gr.Textbox(label="Similarity Scores")
	    show = gr.Button(value="Calculate Similarity")
	    # Clicking the button runs `foo` on the uploads and shows its result.
	    show.click(fn=foo, inputs=inputs, outputs=outputs)

	# Start the web app.
	func.launch()