"""Streamlit app for multilingual text analysis.

For the text entered by the user the app:

1. detects the language with ``langdetect``;
2. embeds every whitespace-separated word with a multilingual
   SentenceTransformer model and plots a 2-D t-SNE projection;
3. fits an LDA topic model and lists the top words of each topic;
4. reports the cosine similarity between the input and a fixed example text.
"""

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import streamlit as st
import torch
from langdetect import DetectorFactory, detect
from langdetect.lang_detect_exception import LangDetectException
from sentence_transformers import SentenceTransformer, util
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.manifold import TSNE

# Kept ahead of every other Streamlit call, exactly as in the original script.
st.set_page_config(page_title="Multilingual Text Analysis System", layout="wide")


@st.cache_resource
def load_model():
    """Load the multilingual sentence-embedding model.

    Decorated with ``st.cache_resource`` so the model object is shared across
    script reruns instead of being reloaded on every interaction.
    """
    return SentenceTransformer('distiluse-base-multilingual-cased-v1')


# Fix the langdetect seed so repeated detections of the same text agree.
DetectorFactory.seed = 0
multi_embedding_model = load_model()


class WordEmbeddingAgent:
    """Thin wrapper that turns words into embedding vectors."""

    def __init__(self, model):
        self.model = model

    def get_embeddings(self, words):
        """Return the result of ``model.encode`` for ``words``."""
        return self.model.encode(words)


class SimilarityAgent:
    """Computes the cosine similarity between two texts."""

    def __init__(self, model):
        self.model = model

    def compute_similarity(self, text1, text2) -> float:
        """Return the cosine similarity of the two texts' embeddings."""
        embedding1 = self.model.encode(text1, convert_to_tensor=True)
        embedding2 = self.model.encode(text2, convert_to_tensor=True)
        return util.pytorch_cos_sim(embedding1, embedding2).item()


class TopicModelingAgent:
    """Fits an LDA topic model on a bag-of-words representation of texts."""

    def __init__(self, n_components=5):
        self.lda_model = LatentDirichletAllocation(
            n_components=n_components, random_state=42
        )

    def fit_transform(self, texts, lang):
        """Vectorize ``texts``, fit the LDA model and return its output.

        Args:
            texts: Documents to model.
            lang: Language code; English stop words are removed only when it
                is ``'en'``.

        Returns:
            A ``(document_topic_matrix, vectorizer)`` tuple.

        Raises:
            ValueError: If no vocabulary can be built even without
                document-frequency pruning (propagated from scikit-learn).
        """
        stop_words = 'english' if lang == 'en' else None
        try:
            # Preferred setup: drop terms that are too rare or too common.
            vectorizer = CountVectorizer(
                max_df=0.9, min_df=2, stop_words=stop_words
            )
            dtm = vectorizer.fit_transform(texts)
        except ValueError:
            # On tiny corpora the thresholds above cannot be satisfied (for
            # two documents max_df=0.9 allows at most 1.8 documents while
            # min_df demands 2), so fall back to keeping every term.
            vectorizer = CountVectorizer(stop_words=stop_words)
            dtm = vectorizer.fit_transform(texts)
        self.lda_model.fit(dtm)
        return self.lda_model.transform(dtm), vectorizer

    def get_topics(self, vectorizer, num_words=5):
        """Return ``{topic_index: top_words}`` for the fitted LDA model.

        Words are listed from highest to lowest topic weight, and at most
        ``num_words`` are returned per topic.
        """
        # Look the vocabulary up once rather than once per selected word.
        feature_names = vectorizer.get_feature_names_out()
        topics = {}
        for idx, weights in enumerate(self.lda_model.components_):
            # Reverse before slicing: descending order, and num_words == 0
            # yields an empty list (a ``[-0:]`` slice would return everything).
            top_indices = weights.argsort()[::-1][:num_words]
            topics[idx] = [feature_names[i] for i in top_indices]
        return topics


def detect_language(text) -> str:
    """Return the detected language code of ``text``, or ``"unknown"``.

    Only langdetect's own failure is handled; any other exception propagates.
    """
    try:
        return detect(text)
    except LangDetectException:
        return "unknown"


@st.cache_data
def tsne_visualization(embeddings, words) -> pd.DataFrame:
    """Project embeddings to 2-D with t-SNE.

    Args:
        embeddings: One embedding vector per word.
        words: Labels for the rows of ``embeddings``.

    Returns:
        A DataFrame with columns ``x``, ``y`` and ``word``.
    """
    points = np.asarray(embeddings)
    n_samples = points.shape[0]
    if n_samples < 2:
        # Nothing to project for zero or one sample; place it at the origin.
        coords = np.zeros((n_samples, 2))
    else:
        # scikit-learn requires perplexity < n_samples, and its default of 30
        # fails on short inputs, so clamp it to the sample count.
        perplexity = min(30.0, float(n_samples - 1))
        tsne = TSNE(n_components=2, random_state=42, perplexity=perplexity)
        coords = tsne.fit_transform(points)
    df = pd.DataFrame(coords, columns=['x', 'y'])
    df['word'] = list(words)
    return df


def _render_sidebar() -> None:
    """Draw the static "About" sidebar."""
    st.sidebar.title("About")
    st.sidebar.info(
        "This app performs multilingual text analysis using various NLP techniques."
    )


def _render_embedding_plot(embedding_agent, words) -> None:
    """Embed ``words`` and draw their annotated t-SNE scatter plot."""
    with st.spinner("Generating word embeddings..."):
        embeddings = embedding_agent.get_embeddings(words)
        st.success("Word Embeddings Generated.")

    with st.spinner("Creating t-SNE visualization..."):
        tsne_df = tsne_visualization(embeddings, words)
        fig, ax = plt.subplots()
        ax.scatter(tsne_df['x'], tsne_df['y'])
        for word, x, y in zip(tsne_df['word'], tsne_df['x'], tsne_df['y']):
            ax.annotate(word, (x, y))
        st.pyplot(fig)
        # Close the figure so figures do not pile up across script reruns.
        plt.close(fig)


def _render_topics(topic_modeling_agent, user_input, lang) -> None:
    """Fit the topic model on the input and list the top words per topic."""
    with st.spinner("Extracting topics..."):
        texts = [user_input, "Another text to improve topic modeling."]
        try:
            _, vectorizer = topic_modeling_agent.fit_transform(texts, lang)
        except ValueError as err:
            st.warning(f"Topic extraction skipped: {err}")
            return
        topics = topic_modeling_agent.get_topics(vectorizer)
        st.subheader("Topics Extracted:")
        for topic_id, topic_words in topics.items():
            st.write(f"Topic {topic_id}: {', '.join(topic_words)}")


def _render_similarity(similarity_agent, user_input, lang) -> None:
    """Show the similarity between the input and a fixed example text."""
    with st.spinner("Computing similarity..."):
        if lang != 'en':
            text2 = "Otro texto de ejemplo para comparación de similitud."
        else:
            text2 = "Another example text for similarity comparison."
        similarity_score = similarity_agent.compute_similarity(user_input, text2)
        st.write(f"Similarity Score with example text: {similarity_score:.4f}")


def main() -> None:
    """Render the page and run the analysis pipeline on the entered text."""
    st.title("Multilingual Text Analysis System")
    # The sidebar is independent of the analysis, so draw it first; that way
    # it is present even when one of the guard clauses below returns early.
    _render_sidebar()

    user_input = st.text_area("Enter your text here:")
    analyze_clicked = st.button("Analyze")

    # As before, analysis starts on a click or as soon as there is input.
    if not (analyze_clicked or user_input):
        return
    # Whitespace-only input contains no words and cannot be analysed.
    if not user_input.strip():
        st.warning("Please enter some text to analyze.")
        return

    lang = detect_language(user_input)
    st.write(f"Detected language: {lang}")

    embedding_agent = WordEmbeddingAgent(multi_embedding_model)
    similarity_agent = SimilarityAgent(multi_embedding_model)
    topic_modeling_agent = TopicModelingAgent()

    _render_embedding_plot(embedding_agent, user_input.split())
    _render_topics(topic_modeling_agent, user_input, lang)
    _render_similarity(similarity_agent, user_input, lang)


# The original ran its UI code at module level; invoke the entry point
# unconditionally so executing the script behaves the same way.
main()