Spaces:

LeoWalker
/

ResumeParser

Running

App Files Files Community

ResumeParser / app.py

LeoWalker

app.py: Extended the available models to Llama3, Gemma, and Mistral. Still don't have gemini working.

d5e81f7 7 months ago

raw

history blame

6.07 kB

	from dotenv import load_dotenv
	import io
	import streamlit as st
	import streamlit.components.v1 as components
	import base64

	from langchain.prompts import PromptTemplate
	from langchain_core.output_parsers import PydanticOutputParser
	from langchain_anthropic import ChatAnthropic
	from langchain_openai import ChatOpenAI
	from langchain_groq import ChatGroq
	from langchain_google_genai import ChatGoogleGenerativeAI
	from langchain_core.exceptions import OutputParserException
	from pydantic import ValidationError
	from langchain_core.pydantic_v1 import BaseModel, Field
	from resume_template import Resume
	from json import JSONDecodeError
	import PyPDF2
	import json
	import time
	import os


	# Configure LangChain through environment variables: switch the V2 tracing
	# flag on and set the project name to 'Resume_Project'.
	os.environ.update(
	    LANGCHAIN_TRACING_V2='true',
	    LANGCHAIN_PROJECT='Resume_Project',
	)

	# Load additional variables from a .env file via python-dotenv.
	load_dotenv()

	def pdf_to_string(file):
	    """
	    Convert a PDF file to a string.

	    The text of every page is extracted in page order and concatenated
	    without any separator. The file object is closed before this function
	    returns, including when reading or text extraction raises.

	    Parameters:
	    file (io.BytesIO): A file-like object representing the PDF file.

	    Returns:
	    str: The extracted text from the PDF.
	    """
	    try:
	        pdf_reader = PyPDF2.PdfReader(file)
	        # `or ''` keeps the join safe if a page yields no text object
	        # (NOTE(review): confirm whether the installed PyPDF2 can return None).
	        return ''.join(page.extract_text() or '' for page in pdf_reader.pages)
	    finally:
	        # Callers hand over ownership of the stream; release it on every path.
	        file.close()


	def extract_resume_fields(full_text, model):
	    """
	    Analyze a resume text and extract structured information using a specified language model.

	    The prompt is read from "prompts/resume_extraction.prompt", filled with the
	    resume text and the parser's format instructions, sent to the selected
	    model, and the reply is parsed into a Resume object. Parsing failures are
	    retried; at most two attempts are made in total.

	    Parameters:
	    full_text (str): The text content of the resume.
	    model (str): A key of the module-level llm_dict. Any other value is
	        passed to ChatOpenAI as the model name.

	    Returns:
	    Resume: The structured information parsed from the model output.

	    Raises:
	    OutputParserException, ValidationError: If the model output still cannot
	        be parsed on the final attempt.
	    """
	    # The Resume object is imported from the local resume_template file

	    with open("prompts/resume_extraction.prompt", "r", encoding="utf-8") as f:
	        template = f.read()

	    parser = PydanticOutputParser(pydantic_object=Resume)

	    prompt_template = PromptTemplate(
	        template=template,
	        input_variables=["resume"],
	        partial_variables={"response_template": parser.get_format_instructions()},
	    )

	    # Build the ChatOpenAI fallback only when the name is not a known key;
	    # passing it as the dict.get default would construct a client on every call.
	    llm = llm_dict.get(model)
	    if llm is None:
	        llm = ChatOpenAI(temperature=0, model=model)

	    chain = prompt_template | llm | parser

	    max_attempts = 2
	    for attempt in range(1, max_attempts + 1):
	        try:
	            output = chain.invoke(full_text)
	        except (OutputParserException, ValidationError):
	            if attempt == max_attempts:
	                # Out of retries: propagate with the original traceback.
	                raise
	            print(f"Parsing error occurred. Retrying (attempt {attempt + 1}/{max_attempts})...")
	        else:
	            print(output)
	            return output

	def display_extracted_fields(obj, section_title=None, indent=0):
	    """
	    Write the fields of a parsed resume object to the Streamlit page.

	    Parameters:
	    obj: An object that iterates as (field name, field value) pairs.
	    section_title (str, optional): Shown as a subheader before the fields
	        when it is truthy.
	    indent (int): Number of leading spaces put in front of each written
	        line; nested models are written with indent + 1.

	    Fields whose name is one of the known resume sections get a heading
	    line followed by their content: a nested model is rendered recursively,
	    a list is rendered item by item (plain items as "- item"), and anything
	    else is written as text. Every other field is written on a single
	    "Name: value" line.
	    """
	    section_fields = (
	        "personal_details", "education", "work_experience", "projects",
	        "skills", "certifications", "publications", "awards",
	        "additional_sections",
	    )

	    if section_title:
	        st.subheader(section_title)

	    pad = " " * indent
	    child_pad = " " * (indent + 1)

	    for field_name, field_value in obj:
	        label = field_name.replace('_', ' ').title()

	        # Simple field: one line, then move on.
	        if field_name not in section_fields:
	            st.write(pad + f"{label}: " + str(field_value))
	            continue

	        st.write(pad + f"{label}:")
	        if isinstance(field_value, BaseModel):
	            display_extracted_fields(field_value, None, indent + 1)
	        elif isinstance(field_value, list):
	            for entry in field_value:
	                if isinstance(entry, BaseModel):
	                    display_extracted_fields(entry, None, indent + 1)
	                else:
	                    st.write(child_pad + "- " + str(entry))
	        else:
	            st.write(child_pad + str(field_value))

	def get_json_download_link(json_str, download_name):
	    """
	    Build an HTML anchor that downloads the given JSON document as a file.

	    The JSON is parsed and re-serialized with 4-space indentation, then
	    embedded in the link as a base64 data URI.

	    Parameters:
	    json_str (str): A JSON document.
	    download_name (str): File name for the download, without the ".json"
	        extension. It is HTML-escaped before being placed in the attribute.

	    Returns:
	    str: An "<a>" element as an HTML string.

	    Raises:
	    json.JSONDecodeError: If json_str is not valid JSON.
	    """
	    from html import escape

	    # Round-trip through the parser to validate and pretty-print the document.
	    json_str_formatted = json.dumps(json.loads(json_str), indent=4)

	    b64 = base64.b64encode(json_str_formatted.encode()).decode()
	    # Escape quotes and angle brackets so the name cannot break out of the
	    # download attribute and inject markup.
	    safe_name = escape(f"{download_name}.json", quote=True)
	    href = (
	        f'<a href="data:application/json;base64,{b64}" download="{safe_name}">'
	        'Click here to download the JSON file</a>'
	    )
	    return href

	st.set_page_config(layout="wide")
	st.title("Resume Parser")

	# Display label -> chat model client offered in the model pickers.
	llm_dict = {
	    "GPT 3.5 turbo": ChatOpenAI(temperature=0, model_name="gpt-3.5-turbo"),
	    "Anthropic Sonnet": ChatAnthropic(model_name="claude-3-sonnet-20240229"),
	}
	# The remaining models are all served through ChatGroq.
	llm_dict.update(
	    (label, ChatGroq(model_name=groq_model))
	    for label, groq_model in (
	        ("Llama 3", "llama3-70b-8192"),
	        ("Gemma", "gemma-7b-it"),
	        ("Mistral", "mixtral-8x7b-32768"),
	    )
	)
	# Not enabled yet:
	# "Gemini 1.5 Pro": ChatGoogleGenerativeAI(model_name="gemini-1.5-pro-latest"),



	def _extract_and_render(resume_text, model_name, heading):
	    """
	    Run one model over the resume text, then show its timing and fields.

	    Parameters:
	    resume_text (str): Text extracted from the uploaded PDF.
	    model_name (str): Model label chosen in the select box.
	    heading (str): Subheader shown above the extracted fields.

	    Returns:
	    The object returned by extract_resume_fields, or None when the model
	    output could not be parsed (an error message is shown instead).
	    """
	    start_time = time.time()
	    try:
	        fields = extract_resume_fields(resume_text, model_name)
	    except (OutputParserException, ValidationError) as exc:
	        # Report the failure in this column only, so the other model's
	        # result is still rendered.
	        st.error(f"{model_name} did not return a parsable resume: {exc}")
	        return None
	    elapsed_time = time.time() - start_time
	    st.write(f"Extraction completed in {elapsed_time:.2f} seconds")
	    display_extracted_fields(fields, heading)
	    return fields


	uploaded_file = st.file_uploader("Upload a PDF file", type="pdf")
	model_names = list(llm_dict)
	col1, col2 = st.columns(2)

	# Two pickers side by side so two models can be compared on the same resume.
	with col1:
	    selected_model1 = st.selectbox("Select Model 1", model_names, index=model_names.index("Llama 3"))

	with col2:
	    selected_model2 = st.selectbox("Select Model 2", model_names, index=model_names.index("Mistral"))

	if uploaded_file is not None:
	    text = pdf_to_string(uploaded_file)

	    if st.button("Extract Resume Fields"):
	        col1, col2 = st.columns(2)

	        with col1:
	            extracted_fields1 = _extract_and_render(
	                text, selected_model1, "Extracted Resume Fields (Model 1)"
	            )

	        with col2:
	            extracted_fields2 = _extract_and_render(
	                text, selected_model2, "Extracted Resume Fields (Model 2)"
	            )