Heraali committed on
Commit 48a63b6
1 Parent(s): e3c75e5

Upload 2 files

Files changed (2)
  1. app.py +62 -69
  2. requirements.txt +3 -6
app.py CHANGED
@@ -3,47 +3,47 @@ from transformers import AutoModelForQuestionAnswering, AutoTokenizer, pipeline
  from sentence_transformers import SentenceTransformer, util
  import gradio as gr
  import json
- from fuzzywuzzy import fuzz

- # Load pre-trained BERT QA model and tokenizer from Hugging Face model hub
- model_name = "bert-large-uncased-whole-word-masking-finetuned-squad"
+ # Load the lightweight BERT-based QA model optimized for CPU
+ model_name = "distilbert-base-uncased-distilled-squad"  # Efficient for CPU
  model = AutoModelForQuestionAnswering.from_pretrained(model_name)
  tokenizer = AutoTokenizer.from_pretrained(model_name)

- # Dynamically handle device (CPU only)
- device = -1  # Force CPU usage by setting device to -1
-
- # Initialize the QA pipeline with the correct device
+ # Initialize pipeline for CPU usage
+ device = -1  # Force CPU
  qa_pipeline = pipeline("question-answering", model=model, tokenizer=tokenizer, device=device)

- # Load the knowledge base from JSON file
+ # Load Sentence-BERT for semantic search
+ embedding_model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
+
+ # Load knowledge base and expanded QA dataset
  with open('knowledge_base.json', 'r') as f:
      knowledge_base = json.load(f)

- # Load the expanded QA dataset
  with open('expanded_qa_dataset.json', 'r') as f:
      expanded_qa_dataset = json.load(f)

- # Load Sentence-BERT model for semantic search
- embedding_model = SentenceTransformer('all-MiniLM-L6-v2')
+ # Function to create embeddings for the expanded QA dataset
+ def create_qa_dataset_embeddings(expanded_qa_dataset):
+     qa_embeddings = []
+     questions = []
+     for item in expanded_qa_dataset:
+         questions.append(item['question'])
+         qa_embeddings.append(embedding_model.encode(item['question'], convert_to_tensor=True))
+     return qa_embeddings, questions
+
+ # Create QA dataset embeddings
+ qa_embeddings, qa_questions = create_qa_dataset_embeddings(expanded_qa_dataset)

  # Function to create embeddings for the knowledge base content
  def create_knowledge_base_embeddings(knowledge_base):
      embeddings = []
      for entry in knowledge_base:
          if 'title' in entry:
-             content = entry['title'] + ' ' + ' '.join(
+             content = entry['title'] + ' '.join(
                  [c.get('text', '') for c in entry.get('content', [])] +
-                 [
-                     ' '.join(step['details']) if isinstance(step['details'], list) else step['details']
-                     for c in entry.get('content', []) if 'steps' in c
-                     for step in c['steps']
-                 ] +
-                 [
-                     faq['question'] + ' ' + faq['answer']
-                     for c in entry.get('content', []) if 'faq' in c
-                     for faq in c['faq']
-                 ]
+                 [' '.join(step['details']) for c in entry.get('content', []) if 'steps' in c for step in c['steps']] +
+                 [faq['question'] + ' ' + faq['answer'] for c in entry.get('content', []) if 'faq' in c for faq in c['faq']]
              )
              embeddings.append(embedding_model.encode(content, convert_to_tensor=True))
      return embeddings
@@ -51,68 +51,61 @@ def create_knowledge_base_embeddings(knowledge_base):
  # Create knowledge base embeddings
  knowledge_base_embeddings = create_knowledge_base_embeddings(knowledge_base)

- # Function to retrieve the best context using semantic similarity
- def get_dynamic_context_semantic(question, knowledge_base, knowledge_base_embeddings, threshold=0.5):
-     # Create embedding for the question
+ # Semantic search on expanded QA dataset
+ def search_expanded_qa(question):
      question_embedding = embedding_model.encode(question, convert_to_tensor=True)
+     cosine_scores = util.pytorch_cos_sim(question_embedding, torch.stack(qa_embeddings))
+
+     best_match_idx = torch.argmax(cosine_scores).item()
+     best_match_score = cosine_scores[0, best_match_idx].item()
+
+     return expanded_qa_dataset[best_match_idx]['answer'], best_match_score

-     # Calculate cosine similarity between the question and knowledge base entries
+ # Semantic search on knowledge base
+ def search_knowledge_base(question):
+     question_embedding = embedding_model.encode(question, convert_to_tensor=True)
      cosine_scores = util.pytorch_cos_sim(question_embedding, torch.stack(knowledge_base_embeddings))

-     # Get the index of the highest score (most similar context)
      best_match_idx = torch.argmax(cosine_scores).item()
      best_match_score = cosine_scores[0, best_match_idx].item()

-     if best_match_score > threshold:  # Set a threshold for semantic similarity
-         best_match_entry = knowledge_base[best_match_idx]
-
-         # Check if FAQ section exists and prioritize FAQ answers
-         for content_item in best_match_entry['content']:
-             if 'faq' in content_item:
-                 for faq in content_item['faq']:
-                     if fuzz.token_sort_ratio(faq['question'].lower(), question.lower()) > 80:
-                         return faq['answer']
-
-         # If no FAQ is found, check for steps
-         for content_item in best_match_entry['content']:
-             if 'steps' in content_item:
-                 step_details = [step['details'] for step in content_item['steps']]
-                 return "\n".join(step_details)
-
-         # Fallback to regular text
-         for content_item in best_match_entry['content']:
-             if 'text' in content_item:
-                 return content_item['text']
-
-     return "Lo siento, no encontré una respuesta adecuada a tu pregunta."
-
- # Use fuzzy matching to find the closest match in the expanded QA dataset
- def get_answer_from_expanded_qa(question, expanded_qa_dataset, threshold=80):
-     for item in expanded_qa_dataset:
-         # Use fuzzy matching to find close matches
-         if fuzz.token_sort_ratio(item['question'].lower(), question.lower()) > threshold:
-             return item['answer']
-     return None
-
- # Answer function for the Gradio app
+     # Retrieve content from best matched knowledge base entry
+     best_match_entry = knowledge_base[best_match_idx]
+     for content_item in best_match_entry['content']:
+         if 'faq' in content_item:
+             for faq in content_item['faq']:
+                 if faq['question'].lower() in question.lower():
+                     return faq['answer'], best_match_score
+         if 'steps' in content_item:
+             step_details = [step['details'] for step in content_item['steps']]
+             return "\n".join(step_details), best_match_score
+         if 'text' in content_item:
+             return content_item['text'], best_match_score
+
+     return "Lo siento, no encontré una respuesta adecuada para tu pregunta.", best_match_score
+
+ # Answer function: search both datasets and return the best match
  def answer_question(question):
-     # Check if the question matches any entry in the expanded QA dataset
-     direct_answer = get_answer_from_expanded_qa(question, expanded_qa_dataset)
-     if direct_answer:
-         return direct_answer
+     # Search expanded QA dataset
+     qa_answer, qa_score = search_expanded_qa(question)
+
+     # Search knowledge base
+     kb_answer, kb_score = search_knowledge_base(question)

-     # If no direct answer found, use the knowledge base with semantic search
-     context = get_dynamic_context_semantic(question, knowledge_base, knowledge_base_embeddings, threshold=0.45)
-     return context
+     # Compare scores and return the best answer
+     if qa_score >= kb_score:
+         return qa_answer
+     else:
+         return kb_answer

- # Gradio interface setup
+ # Gradio interface
  interface = gr.Interface(
      fn=answer_question,
      inputs="text",
      outputs="text",
      title="OCN Customer Support Chatbot",
-     description="Ask questions and get answers from the OCN knowledge base."
+     description="Ask questions and get answers from the OCN knowledge base and expanded QA dataset."
  )

- # Launch the Gradio interface
+ # Launch the interface
  interface.launch(share=True)
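
Note: the semantic-search pattern this commit adopts (embed once, rank all candidates by cosine similarity) replaces the fuzzywuzzy matching it removes. Below is a minimal sketch of that pattern under stated assumptions: the two example questions are hypothetical stand-ins for the entries in expanded_qa_dataset.json, while encode, util.pytorch_cos_sim, torch.stack, and torch.argmax are the same calls app.py makes (app.py assumes torch is imported near the top of the file, outside these hunks).

import torch
from sentence_transformers import SentenceTransformer, util

embedding_model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

# Hypothetical stand-ins for the question entries in expanded_qa_dataset.json
questions = ["How do I reset my password?", "How do I cancel my subscription?"]
question_embeddings = torch.stack(
    [embedding_model.encode(q, convert_to_tensor=True) for q in questions]
)

# Embed the query once, then rank every candidate by cosine similarity
query_embedding = embedding_model.encode("help resetting my password", convert_to_tensor=True)
cosine_scores = util.pytorch_cos_sim(query_embedding, question_embeddings)  # shape (1, N)
best_idx = torch.argmax(cosine_scores).item()
print(questions[best_idx], cosine_scores[0, best_idx].item())

Because search_expanded_qa and search_knowledge_base both return raw cosine scores, answer_question can compare them directly; unlike the removed get_dynamic_context_semantic, the new code applies no minimum-similarity threshold, so the best match is returned even when its score is low.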
requirements.txt CHANGED
@@ -1,7 +1,4 @@
- transformers==4.30.2
- torch==2.0.1
+ transformers==4.26.1
+ torch==1.13.1
  sentence-transformers==2.2.2
- fuzzywuzzy==0.18.0
- scikit-learn==1.3.0
- gradio==3.16.2
- numpy==1.24.3
+ gradio==3.8.2
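
The dropped pins track the code change: fuzzywuzzy goes away with the fuzzy-matching path removed from app.py, and scikit-learn and numpy are no longer pinned explicitly (they may still arrive transitively, e.g. via sentence-transformers). A quick sanity check, a sketch rather than part of the commit, that the remaining pins import together with the versions requirements.txt expects:

import gradio
import sentence_transformers
import torch
import transformers

# Versions printed should match the pins in requirements.txt
print(transformers.__version__)           # expected: 4.26.1
print(torch.__version__)                  # expected: 1.13.1
print(sentence_transformers.__version__)  # expected: 2.2.2
print(gradio.__version__)                 # expected: 3.8.2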