lively06
commit4
b9e94ef
raw
history blame contribute delete
No virus
7.14 kB
import re
from collections import Counter
from difflib import SequenceMatcher
from functools import lru_cache

import bert_score
import matplotlib.pyplot as plt
import nltk
import PyPDF2 as pdf
import streamlit as st
from nltk.sentiment import SentimentIntensityAnalyzer
from rouge_score import rouge_scorer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from transformers import T5ForConditionalGeneration, T5Tokenizer
# Fetch the 'vader_lexicon' NLTK resource at script start.
# Presumably this is the lexicon SentimentIntensityAnalyzer (used in
# predict_sentiment) loads -- TODO confirm.
nltk.download('vader_lexicon')
# Set the browser-tab title and favicon for the app.
# NOTE(review): Streamlit expects set_page_config to run before any other
# st.* command, so keep this ahead of the rest of the UI code -- confirm
# against the installed Streamlit version.
st.set_page_config(page_title="Streamlit Sentiment App", page_icon="static/res/favicon.png")
# Load the pretrained "t5-base" model and its tokenizer as module-level
# globals; generate_summary reads both.
# NOTE(review): these are loaded eagerly at module level, not on first use --
# consider caching if start-up cost becomes a problem.
model = T5ForConditionalGeneration.from_pretrained("t5-base")
tokenizer = T5Tokenizer.from_pretrained("t5-base")
def extract_text(uploaded_file):
    """Return the concatenated text of every page in an uploaded PDF.

    Args:
        uploaded_file: A file-like object accepted by ``pdf.PdfReader``
            (for example a Streamlit upload), or a falsy value when
            nothing has been uploaded.

    Returns:
        The text of all pages joined in page order with no separator, or
        ``""`` when ``uploaded_file`` is falsy.
    """
    if not uploaded_file:
        return ""
    reader = pdf.PdfReader(uploaded_file)
    # ``or ""`` is a defensive guard: a page that yields no text (e.g. a
    # scanned image) must not break the join.
    return "".join(page.extract_text() or "" for page in reader.pages)
def calculate_similarity(text1, text2):
    """Return the cosine similarity of the two texts' word-count vectors.

    Args:
        text1: First document as a string.
        text2: Second document as a string.

    Returns:
        The cosine similarity between the bag-of-words count vectors of
        the two texts. Returns ``0.0`` when neither text contains a token,
        because there is nothing to compare.
    """
    # Same pattern CountVectorizer uses by default (words of 2+ characters);
    # it is passed explicitly below so the pre-check and the vectorizer agree.
    token_pattern = r"(?u)\b\w\w+\b"
    # CountVectorizer raises ValueError ("empty vocabulary") when no document
    # yields a token, e.g. two scanned PDFs with no text layer.
    if not any(re.search(token_pattern, text) for text in (text1, text2)):
        return 0.0
    counts = CountVectorizer(token_pattern=token_pattern).fit_transform([text1, text2])
    # Entry [0][1] of the pairwise matrix is text1 versus text2.
    return cosine_similarity(counts.toarray())[0][1]
def bert_similarity(text1, text2):
    """Return the BERTScore F1 of ``text1`` scored against ``text2``.

    Each text is wrapped in a one-element list and passed to
    ``bert_score.score`` with ``lang="en"``; only the F1 component of the
    returned (precision, recall, F1) triple is used.
    """
    first_batch = [text1]
    second_batch = [text2]
    scores = bert_score.score(first_batch, second_batch, lang="en", verbose=True)
    f1_tensor = scores[2]
    return f1_tensor.item()
def rouge_similarity(text1, text2):
    """Return the ROUGE-L F-measure between ``text1`` and ``text2``.

    A stemming ``RougeScorer`` restricted to the ``rougeL`` metric scores
    the pair, and the F-measure of that single result is returned.
    """
    rouge_l = rouge_scorer.RougeScorer(['rougeL'], use_stemmer=True)
    result = rouge_l.score(text1, text2)['rougeL']
    return result.fmeasure
def highlight_similarity(text1, text2):
    """List the character runs that ``text1`` and ``text2`` have in common.

    Args:
        text1: First document as a string.
        text2: Second document as a string.

    Returns:
        A string with one entry per matching block found by
        ``SequenceMatcher``: the run as it appears in ``text1``, a newline,
        the same run as it appears in ``text2``, then a blank line. Returns
        ``""`` when the texts share nothing.
    """
    matcher = SequenceMatcher(None, text1, text2)
    entries = []
    for block in matcher.get_matching_blocks():
        # difflib always terminates the list with a zero-length sentinel
        # block; emitting it would only append stray blank lines.
        if block.size == 0:
            continue
        from_first = text1[block.a:block.a + block.size]
        from_second = text2[block.b:block.b + block.size]
        entries.append(f"{from_first}\n{from_second}\n\n")
    return "".join(entries)
def generate_summary(text):
    """Summarize ``text`` with the module-level T5 model.

    Args:
        text: The document to summarize.

    Returns:
        The generated summary as plain text, with the tokenizer's special
        tokens removed.
    """
    # The input is prefixed with "summarize: " and truncated to at most
    # 1000 tokens.
    input_ids = tokenizer.encode(
        "summarize: " + text,
        return_tensors="pt",
        max_length=1000,
        truncation=True,
    )
    # Beam search over 4 beams, producing between 100 and 1000 tokens.
    output_ids = model.generate(
        input_ids,
        max_length=1000,
        min_length=100,
        length_penalty=2.0,
        num_beams=4,
        early_stopping=True,
    )
    # skip_special_tokens keeps markers such as <pad> and </s> out of the
    # text shown to the user.
    return tokenizer.decode(output_ids[0], skip_special_tokens=True)
@lru_cache(maxsize=1)
def _get_sentiment_analyzer():
    """Build the sentiment analyzer once and reuse it on later calls."""
    return SentimentIntensityAnalyzer()


def predict_sentiment(text, threshold_positive, threshold_negative):
    """Classify ``text`` as "Positive", "Negative" or "Neutral".

    Args:
        text: The text to score.
        threshold_positive: Compound score at or above which the text is
            "Positive"; converted with ``float``.
        threshold_negative: Compound score at or below which the text is
            "Negative"; converted with ``float``.

    Returns:
        "Positive", "Negative" or "Neutral". The positive threshold is
        checked first.
    """
    # This function is called once per sentence, so the analyzer is shared
    # instead of being constructed on every call.
    scores = _get_sentiment_analyzer().polarity_scores(text)
    compound = scores.get("compound", 0)
    # Convert both thresholds up front so a bad value fails regardless of
    # which branch would have been taken.
    positive_cutoff = float(threshold_positive)
    negative_cutoff = float(threshold_negative)
    if compound >= positive_cutoff:
        return "Positive"
    if compound <= negative_cutoff:
        return "Negative"
    return "Neutral"
def _render_similarity():
    """Render the two-PDF similarity workflow and its sidebar help."""
    uploaded_file1 = st.file_uploader("Choose a PDF file 1", type="pdf")
    uploaded_file2 = st.file_uploader("Choose a PDF file 2", type="pdf")

    st.sidebar.title("Similarity Metrics")
    st.sidebar.write("**Cosine Similarity**:")
    st.sidebar.write("Measures how similar the two documents are based on their content.")
    st.sidebar.write("**BERT Score**:")
    st.sidebar.write("Provides a similarity measure based on contextual embeddings of the documents.")
    st.sidebar.write("**ROUGE Score**:")
    st.sidebar.write("Evaluates the overlap in n-grams between the two documents.")

    similarity_metric = st.selectbox(
        "Select Similarity Metric",
        ["Cosine Similarity", "BERT Score", "ROUGE Score"],
    )

    # The button is only rendered once both files are present.
    if not (uploaded_file1 and uploaded_file2):
        return
    if not st.button("Check Similarity"):
        return

    text1 = extract_text(uploaded_file1)
    text2 = extract_text(uploaded_file2)

    if similarity_metric == "Cosine Similarity":
        similarity = calculate_similarity(text1, text2)
        st.write(f"The similarity between the two files is {similarity:.2f}.")
    elif similarity_metric == "BERT Score":
        bert_similarity_score = bert_similarity(text1, text2)
        st.write(f"The BERT similarity score between the two files is {bert_similarity_score:.2f}.")
    elif similarity_metric == "ROUGE Score":
        rouge_similarity_score = rouge_similarity(text1, text2)
        st.write(f"The ROUGE similarity score between the two files is {rouge_similarity_score:.2f}.")

    st.write("Highlighted Similarity:")
    st.write(highlight_similarity(text1, text2))


def _render_summary():
    """Render the single-PDF summarization workflow."""
    uploaded_file = st.file_uploader("Choose a PDF file", type="pdf")
    if uploaded_file and st.button("Generate Summary"):
        summary = generate_summary(extract_text(uploaded_file))
        st.write("Summary:")
        st.write(summary)


def _render_sentiment():
    """Render the per-sentence sentiment workflow and its pie chart."""
    threshold_positive = st.number_input("Threshold for Positive Sentiment:", value=0.05, step=0.01)
    threshold_negative = st.number_input("Threshold for Negative Sentiment:", value=-0.05, step=0.01)
    # Restrict uploads to PDFs, like the other uploaders; the file is handed
    # straight to PdfReader below.
    uploaded_file = st.file_uploader("Upload PDF Document", type="pdf")
    if not uploaded_file:
        return

    counts = Counter()
    for page in pdf.PdfReader(uploaded_file).pages:
        # Sentences are split on "." page by page; ``or ""`` guards against
        # a page that yields no text.
        for raw_sentence in (page.extract_text() or "").split("."):
            sentence = raw_sentence.strip()
            if sentence:
                counts[predict_sentiment(sentence, threshold_positive, threshold_negative)] += 1

    labels = ["Positive", "Negative", "Neutral"]
    sizes = [counts[label] for label in labels]

    st.write("Positive Sentences:", sizes[0])
    st.write("Negative Sentences:", sizes[1])
    st.write("Neutral Sentences:", sizes[2])

    # With no sentences at all, every wedge would be zero and the pie chart
    # has nothing meaningful to draw.
    if not any(sizes):
        st.warning("No sentences with extractable text were found in this document.")
        return

    fig, ax = plt.subplots()
    ax.pie(sizes, labels=labels, autopct="%1.1f%%", startangle=90)
    ax.axis("equal")
    ax.set_title("Sentiment Distribution")
    st.pyplot(fig)
    # pyplot keeps every figure it creates alive until closed; without this
    # each rerun would leave one more figure behind.
    plt.close(fig)


def main():
    """Render the app: intro text, mode selector, and the selected workflow."""
    st.title("Text Analysis App")
    st.write("This app checks the similarity between two PDF files using different similarity metrics or generates a summary for a single document or does the sentiment analysis.")
    st.write("Upload PDF files, select an option from the dropdown menu, and proceed accordingly.")

    option = st.selectbox("Select Option", ["Check Similarity", "Generate Summary", "Sentiment Analysis"])

    if option == "Check Similarity":
        _render_similarity()
    elif option == "Generate Summary":
        _render_summary()
    elif option == "Sentiment Analysis":
        _render_sentiment()
if __name__ == "__main__":
main()