nickmuchi's picture
Update functions.py
f8e57a3 verified
import whisper
import os
import random
import openai
from openai import OpenAI
import yt_dlp
from pytube import YouTube, extract
import pandas as pd
import plotly_express as px
import nltk
import plotly.graph_objects as go
from optimum.onnxruntime import ORTModelForSequenceClassification
from transformers import pipeline, AutoTokenizer, AutoModelForSequenceClassification, AutoModelForSeq2SeqLM, AutoModelForTokenClassification
from sentence_transformers import SentenceTransformer, CrossEncoder, util
import streamlit as st
import en_core_web_lg
import validators
import re
import itertools
import numpy as np
from bs4 import BeautifulSoup
import base64, time
from annotated_text import annotated_text
import pickle, math
import torch
from pydub import AudioSegment
from langchain.docstore.document import Document
from langchain.embeddings import HuggingFaceEmbeddings, HuggingFaceBgeEmbeddings, HuggingFaceInstructEmbeddings
from langchain.vectorstores import FAISS
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.chat_models import ChatOpenAI
from langchain.chains import QAGenerationChain
from langchain.callbacks import StreamlitCallbackHandler
from langchain.agents import OpenAIFunctionsAgent, AgentExecutor
from langchain.agents.agent_toolkits import create_retriever_tool
from langchain.agents.openai_functions_agent.agent_token_buffer_memory import (
AgentTokenBufferMemory,
)
from langchain.prompts import MessagesPlaceholder
from langchain.prompts.chat import (
ChatPromptTemplate,
SystemMessagePromptTemplate,
AIMessagePromptTemplate,
HumanMessagePromptTemplate,
)
from langchain.schema import (
AIMessage,
HumanMessage,
SystemMessage
)
from langchain.prompts import PromptTemplate
from langsmith import Client
client = Client()
openai_audio = OpenAI()
nltk.download('punkt')
from nltk import sent_tokenize
OPEN_AI_KEY = os.environ.get('OPEN_AI_KEY')
time_str = time.strftime("%d%m%Y-%H%M%S")
HTML_WRAPPER = """<div style="overflow-x: auto; border: 1px solid #e6e9ef; border-radius: 0.25rem; padding: 1rem;
margin-bottom: 2.5rem">{}</div> """
###################### Functions #######################################################################################
#load all required models and cache
@st.cache_resource
def load_models():
'''Load and cache all the models to be used'''
q_model = ORTModelForSequenceClassification.from_pretrained("nickmuchi/quantized-optimum-finbert-tone")
ner_model = AutoModelForTokenClassification.from_pretrained("xlm-roberta-large-finetuned-conll03-english")
q_tokenizer = AutoTokenizer.from_pretrained("nickmuchi/quantized-optimum-finbert-tone")
ner_tokenizer = AutoTokenizer.from_pretrained("xlm-roberta-large-finetuned-conll03-english")
sent_pipe = pipeline("text-classification",model=q_model, tokenizer=q_tokenizer)
sum_pipe = pipeline("summarization",model="philschmid/flan-t5-base-samsum",clean_up_tokenization_spaces=True)
ner_pipe = pipeline("ner", model=ner_model, tokenizer=ner_tokenizer, grouped_entities=True)
cross_encoder = CrossEncoder('cross-encoder/ms-marco-MiniLM-L-12-v2') #cross-encoder/ms-marco-MiniLM-L-12-v2
sbert = SentenceTransformer('all-MiniLM-L6-v2')
return sent_pipe, sum_pipe, ner_pipe, cross_encoder, sbert
@st.cache_data
def load_asr_model(model_name):
'''Load the open source whisper model in cases where the API is not working'''
model = whisper.load_model(model_name)
return model
@st.cache_resource
def get_spacy():
nlp = en_core_web_lg.load()
return nlp
nlp = get_spacy()
sent_pipe, sum_pipe, ner_pipe, cross_encoder, sbert = load_models()
@st.cache_data
def get_yt_audio(url):
'''Get YT video from given URL link'''
yt = YouTube(url)
title = yt.title
# Get the first available audio stream and download it
audio_stream = yt.streams.filter(only_audio=True).first().download()
return audio_stream, title
@st.cache_data
def get_yt_audio_dl(url):
'''Back up for when pytube is down'''
temp_audio_file = os.path.join('output', 'audio')
ydl_opts = {
'format': 'bestaudio/best',
'postprocessors': [{
'key': 'FFmpegExtractAudio',
'preferredcodec': 'mp3',
'preferredquality': '192',
}],
'outtmpl': temp_audio_file,
'quiet': True,
}
with yt_dlp.YoutubeDL(ydl_opts) as ydl:
info = ydl.extract_info(url, download=False)
title = info.get('title', None)
ydl.download([url])
#with open(temp_audio_file+'.mp3', 'rb') as file:
audio_file = os.path.join('output', 'audio.mp3')
return audio_file, title
@st.cache_data
def load_whisper_api(audio):
'''Transcribe YT audio to text using Open AI API'''
file = open(audio, "rb")
transcript = openai_audio.audio.transcriptions.create(model="whisper-1", file=file,response_format="text")
return transcript
@st.cache_data
def transcribe_yt_video(link, py_tube=True):
'''Transcribe YouTube video'''
if py_tube:
audio_file, title = get_yt_audio(link)
print(f'audio_file:{audio_file}')
st.session_state['audio'] = audio_file
print(f"audio_file_session_state:{st.session_state['audio'] }")
#Get size of audio file
audio_size = round(os.path.getsize(st.session_state['audio'])/(1024*1024),1)
#Check if file is > 24mb, if not then use Whisper API
if audio_size <= 25:
st.info("`Transcribing YT audio...`")
#Use whisper API
results = load_whisper_api(st.session_state['audio'])
else:
st.warning('File size larger than 24mb, applying chunking and transcription',icon="⚠️")
song = AudioSegment.from_file(st.session_state['audio'], format='mp3')
# PyDub handles time in milliseconds
twenty_minutes = 20 * 60 * 1000
chunks = song[::twenty_minutes]
transcriptions = []
video_id = extract.video_id(link)
print(video_id)
for i, chunk in enumerate(chunks):
chunk.export(f'output/chunk_{i}_{video_id}.mp4', format='mp3')
transcriptions.append(load_whisper_api(f'output/chunk_{i}_{video_id}.mp3'))
results = ','.join(transcriptions)
print(results)
else:
audio_file, title = get_yt_audio_dl(link)
print(f'audio_file:{audio_file}')
st.session_state['audio'] = audio_file
print(f"audio_file_session_state:{st.session_state['audio'] }")
#Get size of audio file
audio_size = round(os.path.getsize(st.session_state['audio'])/(1024*1024),1)
#Check if file is > 24mb, if not then use Whisper API
if audio_size <= 25:
st.info("`Transcribing YT audio...`")
#Use whisper API
results = load_whisper_api(st.session_state['audio'])
else:
st.warning('File size larger than 24mb, applying chunking and transcription',icon="⚠️")
song = AudioSegment.from_file(st.session_state['audio'], format='mp3')
# PyDub handles time in milliseconds
twenty_minutes = 20 * 60 * 1000
chunks = song[::twenty_minutes]
transcriptions = []
video_id = extract.video_id(link)
for i, chunk in enumerate(chunks):
chunk.export(f'output/chunk_{i}_{video_id}.mp3', format='mp3')
transcriptions.append(load_whisper_api(f'output/chunk_{i}_{video_id}.mp3'))
results = ','.join(transcriptions)
st.info("`YT Video transcription process complete...`")
return results, title
@st.cache_data
def inference(link, upload):
'''Convert Youtube video or Audio upload to text'''
try:
if validators.url(link):
st.info("`Downloading YT audio...`")
results, title = transcribe_yt_video(link)
return results, title
elif _upload:
#Get size of audio file
audio_size = round(os.path.getsize(_upload)/(1024*1024),1)
#Check if file is > 24mb, if not then use Whisper API
if audio_size <= 25:
st.info("`Transcribing uploaded audio...`")
#Use whisper API
results = load_whisper_api(_upload)['text']
else:
st.write('File size larger than 24mb, applying chunking and transcription')
song = AudioSegment.from_file(_upload)
# PyDub handles time in milliseconds
twenty_minutes = 20 * 60 * 1000
chunks = song[::twenty_minutes]
transcriptions = []
st.info("`Transcribing uploaded audio...`")
for i, chunk in enumerate(chunks):
chunk.export(f'output/chunk_{i}.mp4', format='mp4')
transcriptions.append(load_whisper_api(f'output/chunk_{i}.mp4')['text'])
results = ','.join(transcriptions)
st.info("`Uploaded audio transcription process complete...`")
return results, "Transcribed Earnings Audio"
except Exception as e:
st.error(f'''PyTube Error: {e},
Using yt_dlp module, might take longer than expected''',icon="🚨")
results, title = transcribe_yt_video(link, py_tube=False)
# results = _asr_model.transcribe(st.session_state['audio'], task='transcribe', language='en')
return results, title
@st.cache_resource
def send_feedback(run_id, score):
client.create_feedback(run_id, "user_score", score=score)
@st.cache_data
def clean_text(text):
'''Clean all text after inference'''
text = text.encode("ascii", "ignore").decode() # unicode
text = re.sub(r"https*\S+", " ", text) # url
text = re.sub(r"@\S+", " ", text) # mentions
text = re.sub(r"#\S+", " ", text) # hastags
text = re.sub(r"\s{2,}", " ", text) # over spaces
return text
@st.cache_data
def chunk_long_text(text,threshold,window_size=3,stride=2):
'''Preprocess text and chunk for sentiment analysis'''
#Convert cleaned text into sentences
sentences = sent_tokenize(text)
out = []
#Limit the length of each sentence to a threshold
for chunk in sentences:
if len(chunk.split()) < threshold:
out.append(chunk)
else:
words = chunk.split()
num = int(len(words)/threshold)
for i in range(0,num*threshold+1,threshold):
out.append(' '.join(words[i:threshold+i]))
passages = []
#Combine sentences into a window of size window_size
for paragraph in [out]:
for start_idx in range(0, len(paragraph), stride):
end_idx = min(start_idx+window_size, len(paragraph))
passages.append(" ".join(paragraph[start_idx:end_idx]))
return passages
@st.cache_data
def sentiment_pipe(earnings_text):
'''Determine the sentiment of the text'''
earnings_sentences = chunk_long_text(earnings_text,150,1,1)
earnings_sentiment = sent_pipe(earnings_sentences)
return earnings_sentiment, earnings_sentences
@st.cache_data
def chunk_and_preprocess_text(text, model_name= 'philschmid/flan-t5-base-samsum'):
'''Chunk and preprocess text for summarization'''
tokenizer = AutoTokenizer.from_pretrained(model_name)
sentences = sent_tokenize(text)
# initialize
length = 0
chunk = ""
chunks = []
count = -1
for sentence in sentences:
count += 1
combined_length = len(tokenizer.tokenize(sentence)) + length # add the no. of sentence tokens to the length counter
if combined_length <= tokenizer.max_len_single_sentence: # if it doesn't exceed
chunk += sentence + " " # add the sentence to the chunk
length = combined_length # update the length counter
# if it is the last sentence
if count == len(sentences) - 1:
chunks.append(chunk) # save the chunk
else:
chunks.append(chunk) # save the chunk
# reset
length = 0
chunk = ""
# take care of the overflow sentence
chunk += sentence + " "
length = len(tokenizer.tokenize(sentence))
return chunks
@st.cache_data
def summarize_text(text_to_summarize,max_len,min_len):
'''Summarize text with HF model'''
summarized_text = sum_pipe(text_to_summarize,
max_length=max_len,
min_length=min_len,
do_sample=False,
early_stopping=True,
num_beams=4)
summarized_text = ' '.join([summ['summary_text'] for summ in summarized_text])
return summarized_text
@st.cache_data
def get_all_entities_per_sentence(text):
doc = nlp(''.join(text))
sentences = list(doc.sents)
entities_all_sentences = []
for sentence in sentences:
entities_this_sentence = []
# SPACY ENTITIES
for entity in sentence.ents:
entities_this_sentence.append(str(entity))
# XLM ENTITIES
entities_xlm = [entity["word"] for entity in ner_pipe(str(sentence))]
for entity in entities_xlm:
entities_this_sentence.append(str(entity))
entities_all_sentences.append(entities_this_sentence)
return entities_all_sentences
@st.cache_data
def get_all_entities(text):
all_entities_per_sentence = get_all_entities_per_sentence(text)
return list(itertools.chain.from_iterable(all_entities_per_sentence))
@st.cache_data
def get_and_compare_entities(article_content,summary_output):
all_entities_per_sentence = get_all_entities_per_sentence(article_content)
entities_article = list(itertools.chain.from_iterable(all_entities_per_sentence))
all_entities_per_sentence = get_all_entities_per_sentence(summary_output)
entities_summary = list(itertools.chain.from_iterable(all_entities_per_sentence))
matched_entities = []
unmatched_entities = []
for entity in entities_summary:
if any(entity.lower() in substring_entity.lower() for substring_entity in entities_article):
matched_entities.append(entity)
elif any(
np.inner(sbert.encode(entity, show_progress_bar=False),
sbert.encode(art_entity, show_progress_bar=False)) > 0.9 for
art_entity in entities_article):
matched_entities.append(entity)
else:
unmatched_entities.append(entity)
matched_entities = list(dict.fromkeys(matched_entities))
unmatched_entities = list(dict.fromkeys(unmatched_entities))
matched_entities_to_remove = []
unmatched_entities_to_remove = []
for entity in matched_entities:
for substring_entity in matched_entities:
if entity != substring_entity and entity.lower() in substring_entity.lower():
matched_entities_to_remove.append(entity)
for entity in unmatched_entities:
for substring_entity in unmatched_entities:
if entity != substring_entity and entity.lower() in substring_entity.lower():
unmatched_entities_to_remove.append(entity)
matched_entities_to_remove = list(dict.fromkeys(matched_entities_to_remove))
unmatched_entities_to_remove = list(dict.fromkeys(unmatched_entities_to_remove))
for entity in matched_entities_to_remove:
matched_entities.remove(entity)
for entity in unmatched_entities_to_remove:
unmatched_entities.remove(entity)
return matched_entities, unmatched_entities
@st.cache_data
def highlight_entities(article_content,summary_output):
markdown_start_red = "<mark class=\"entity\" style=\"background: rgb(238, 135, 135);\">"
markdown_start_green = "<mark class=\"entity\" style=\"background: rgb(121, 236, 121);\">"
markdown_end = "</mark>"
matched_entities, unmatched_entities = get_and_compare_entities(article_content,summary_output)
for entity in matched_entities:
summary_output = re.sub(f'({entity})(?![^rgb\(]*\))',markdown_start_green + entity + markdown_end,summary_output)
for entity in unmatched_entities:
summary_output = re.sub(f'({entity})(?![^rgb\(]*\))',markdown_start_red + entity + markdown_end,summary_output)
print("")
print("")
soup = BeautifulSoup(summary_output, features="html.parser")
return HTML_WRAPPER.format(soup)
def summary_downloader(raw_text):
'''Download the summary generated'''
b64 = base64.b64encode(raw_text.encode()).decode()
new_filename = "new_text_file_{}_.txt".format(time_str)
st.markdown("#### Download Summary as a File ###")
href = f'<a href="data:file/txt;base64,{b64}" download="{new_filename}">Click to Download!!</a>'
st.markdown(href,unsafe_allow_html=True)
@st.cache_data
def generate_eval(raw_text, N, chunk):
# Generate N questions from context of chunk chars
# IN: text, N questions, chunk size to draw question from in the doc
# OUT: eval set as JSON list
# raw_text = ','.join(raw_text)
update = st.empty()
ques_update = st.empty()
update.info("`Generating sample questions ...`")
n = len(raw_text)
starting_indices = [random.randint(0, n-chunk) for _ in range(N)]
sub_sequences = [raw_text[i:i+chunk] for i in starting_indices]
chain = QAGenerationChain.from_llm(ChatOpenAI(temperature=0))
eval_set = []
for i, b in enumerate(sub_sequences):
try:
qa = chain.run(b)
eval_set.append(qa)
ques_update.info(f"Creating Question: {i+1}")
except Exception as e:
print(e)
st.warning(f'Error in generating Question: {i+1}...', icon="⚠️")
continue
eval_set_full = list(itertools.chain.from_iterable(eval_set))
update.empty()
ques_update.empty()
return eval_set_full
@st.cache_resource
def create_prompt_and_llm():
'''Create prompt'''
llm = ChatOpenAI(temperature=0, streaming=True, model="gpt-4o")
message = SystemMessage(
content=(
"You are a helpful chatbot who is tasked with answering questions acuurately about earnings call transcript provided. "
"Unless otherwise explicitly stated, it is probably fair to assume that questions are about the earnings call transcript. "
"If there is any ambiguity, you probably assume they are about that."
"Do not use any information not provided in the earnings context and remember you are a to speak like a finance expert."
"If you don't know the answer, just say 'There is no relevant answer in the given earnings call transcript'"
"don't try to make up an answer"
)
)
prompt = OpenAIFunctionsAgent.create_prompt(
system_message=message,
extra_prompt_messages=[MessagesPlaceholder(variable_name="history")],
)
return prompt, llm
@st.cache_resource
def gen_embeddings(embedding_model):
'''Generate embeddings for given model'''
if 'hkunlp' in embedding_model:
embeddings = HuggingFaceInstructEmbeddings(model_name=embedding_model,
query_instruction='Represent the Financial question for retrieving supporting paragraphs: ',
embed_instruction='Represent the Financial paragraph for retrieval: ')
elif 'mpnet' in embedding_model:
embeddings = HuggingFaceEmbeddings(model_name=embedding_model)
elif 'FlagEmbedding' in embedding_model:
encode_kwargs = {'normalize_embeddings': True}
embeddings = HuggingFaceBgeEmbeddings(model_name=embedding_model,
encode_kwargs = encode_kwargs
)
return embeddings
@st.cache_data
def create_vectorstore(corpus, title, embedding_model, chunk_size=1000, overlap=50):
'''Process text for Semantic Search'''
text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size,chunk_overlap=overlap)
texts = text_splitter.split_text(corpus)
embeddings = gen_embeddings(embedding_model)
vectorstore = FAISS.from_texts(texts, embeddings, metadatas=[{"source": i} for i in range(len(texts))])
return vectorstore
@st.cache_resource
def create_memory_and_agent(_docsearch):
'''Embed text and generate semantic search scores'''
#create vectorstore
vectorstore = _docsearch.as_retriever(search_kwargs={"k": 4})
#create retriever tool
tool = create_retriever_tool(
vectorstore,
"earnings_call_search",
"Searches and returns documents using the earnings context provided as a source, relevant to the user input question.",
)
tools = [tool]
prompt,llm = create_prompt_and_llm()
agent = OpenAIFunctionsAgent(llm=llm, tools=tools, prompt=prompt)
agent_executor = AgentExecutor(
agent=agent,
tools=tools,
verbose=True,
return_intermediate_steps=True,
)
memory = AgentTokenBufferMemory(llm=llm)
return memory, agent_executor
@st.cache_data
def gen_sentiment(text):
'''Generate sentiment of given text'''
return sent_pipe(text)[0]['label']
@st.cache_data
def gen_annotated_text(df):
'''Generate annotated text'''
tag_list=[]
for row in df.itertuples():
label = row[2]
text = row[1]
if label == 'Positive':
tag_list.append((text,label,'#8fce00'))
elif label == 'Negative':
tag_list.append((text,label,'#f44336'))
else:
tag_list.append((text,label,'#000000'))
return tag_list
def display_df_as_table(model,top_k,score='score'):
'''Display the df with text and scores as a table'''
df = pd.DataFrame([(hit[score],passages[hit['corpus_id']]) for hit in model[0:top_k]],columns=['Score','Text'])
df['Score'] = round(df['Score'],2)
return df
def make_spans(text,results):
results_list = []
for i in range(len(results)):
results_list.append(results[i]['label'])
facts_spans = []
facts_spans = list(zip(sent_tokenizer(text),results_list))
return facts_spans
##Fiscal Sentiment by Sentence
def fin_ext(text):
results = remote_clx(sent_tokenizer(text))
return make_spans(text,results)
## Knowledge Graphs code
def get_article(url):
article = Article(url)
article.download()
article.parse()
return article