Spaces:
Running
Running
Upload 2 files
Browse files- app.py +62 -69
- requirements.txt +3 -6
app.py
CHANGED
@@ -3,47 +3,47 @@ from transformers import AutoModelForQuestionAnswering, AutoTokenizer, pipeline
|
|
3 |
from sentence_transformers import SentenceTransformer, util
|
4 |
import gradio as gr
|
5 |
import json
|
6 |
-
from fuzzywuzzy import fuzz
|
7 |
|
8 |
-
# Load
|
9 |
-
model_name = "
|
10 |
model = AutoModelForQuestionAnswering.from_pretrained(model_name)
|
11 |
tokenizer = AutoTokenizer.from_pretrained(model_name)
|
12 |
|
13 |
-
#
|
14 |
-
device = -1 # Force CPU
|
15 |
-
|
16 |
-
# Initialize the QA pipeline with the correct device
|
17 |
qa_pipeline = pipeline("question-answering", model=model, tokenizer=tokenizer, device=device)
|
18 |
|
19 |
-
# Load
|
|
|
|
|
|
|
20 |
with open('knowledge_base.json', 'r') as f:
|
21 |
knowledge_base = json.load(f)
|
22 |
|
23 |
-
# Load the expanded QA dataset
|
24 |
with open('expanded_qa_dataset.json', 'r') as f:
|
25 |
expanded_qa_dataset = json.load(f)
|
26 |
|
27 |
-
#
|
28 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
29 |
|
30 |
# Function to create embeddings for the knowledge base content
|
31 |
def create_knowledge_base_embeddings(knowledge_base):
|
32 |
embeddings = []
|
33 |
for entry in knowledge_base:
|
34 |
if 'title' in entry:
|
35 |
-
content = entry['title'] + ' '
|
36 |
[c.get('text', '') for c in entry.get('content', [])] +
|
37 |
-
[
|
38 |
-
|
39 |
-
for c in entry.get('content', []) if 'steps' in c
|
40 |
-
for step in c['steps']
|
41 |
-
] +
|
42 |
-
[
|
43 |
-
faq['question'] + ' ' + faq['answer']
|
44 |
-
for c in entry.get('content', []) if 'faq' in c
|
45 |
-
for faq in c['faq']
|
46 |
-
]
|
47 |
)
|
48 |
embeddings.append(embedding_model.encode(content, convert_to_tensor=True))
|
49 |
return embeddings
|
@@ -51,68 +51,61 @@ def create_knowledge_base_embeddings(knowledge_base):
|
|
51 |
# Create knowledge base embeddings
|
52 |
knowledge_base_embeddings = create_knowledge_base_embeddings(knowledge_base)
|
53 |
|
54 |
-
#
|
55 |
-
def
|
56 |
-
# Create embedding for the question
|
57 |
question_embedding = embedding_model.encode(question, convert_to_tensor=True)
|
|
|
|
|
|
|
|
|
|
|
|
|
58 |
|
59 |
-
|
|
|
|
|
60 |
cosine_scores = util.pytorch_cos_sim(question_embedding, torch.stack(knowledge_base_embeddings))
|
61 |
|
62 |
-
# Get the index of the highest score (most similar context)
|
63 |
best_match_idx = torch.argmax(cosine_scores).item()
|
64 |
best_match_score = cosine_scores[0, best_match_idx].item()
|
65 |
|
66 |
-
|
67 |
-
|
68 |
-
|
69 |
-
|
70 |
-
|
71 |
-
|
72 |
-
|
73 |
-
|
74 |
-
|
75 |
-
|
76 |
-
|
77 |
-
|
78 |
-
|
79 |
-
|
80 |
-
|
81 |
-
|
82 |
-
# Fallback to regular text
|
83 |
-
for content_item in best_match_entry['content']:
|
84 |
-
if 'text' in content_item:
|
85 |
-
return content_item['text']
|
86 |
-
|
87 |
-
return "Lo siento, no encontré una respuesta adecuada a tu pregunta."
|
88 |
-
|
89 |
-
# Use fuzzy matching to find the closest match in the expanded QA dataset
|
90 |
-
def get_answer_from_expanded_qa(question, expanded_qa_dataset, threshold=80):
|
91 |
-
for item in expanded_qa_dataset:
|
92 |
-
# Use fuzzy matching to find close matches
|
93 |
-
if fuzz.token_sort_ratio(item['question'].lower(), question.lower()) > threshold:
|
94 |
-
return item['answer']
|
95 |
-
return None
|
96 |
-
|
97 |
-
# Answer function for the Gradio app
|
98 |
def answer_question(question):
|
99 |
-
#
|
100 |
-
|
101 |
-
|
102 |
-
|
|
|
103 |
|
104 |
-
#
|
105 |
-
|
106 |
-
|
|
|
|
|
107 |
|
108 |
-
# Gradio interface
|
109 |
interface = gr.Interface(
|
110 |
fn=answer_question,
|
111 |
inputs="text",
|
112 |
outputs="text",
|
113 |
title="OCN Customer Support Chatbot",
|
114 |
-
description="Ask questions and get answers from the OCN knowledge base."
|
115 |
)
|
116 |
|
117 |
-
# Launch the
|
118 |
interface.launch(share=True)
|
|
|
3 |
from sentence_transformers import SentenceTransformer, util
|
4 |
import gradio as gr
|
5 |
import json
|
|
|
6 |
|
7 |
+
# Extractive question-answering model: a distilled BERT checkpoint fine-tuned
# on SQuAD, chosen because it is light enough to run on CPU.
model_name = "distilbert-base-uncased-distilled-squad"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForQuestionAnswering.from_pretrained(model_name)

# device = -1 forces the pipeline onto the CPU.
# NOTE(review): qa_pipeline is not referenced by the code visible in this
# file -- confirm whether it is still needed before paying its load cost.
device = -1
qa_pipeline = pipeline(
    "question-answering",
    model=model,
    tokenizer=tokenizer,
    device=device,
)

# Sentence-BERT encoder used to embed both the stored data and user questions
# for semantic (cosine-similarity) search.
embedding_model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
|
18 |
+
|
19 |
+
# Load the knowledge base and the expanded QA dataset.
# The encoding is given explicitly: without it, open() falls back to the
# platform's locale encoding, which can mis-decode (or fail on) non-ASCII
# text such as accented characters in the stored answers.
with open('knowledge_base.json', 'r', encoding='utf-8') as f:
    knowledge_base = json.load(f)

with open('expanded_qa_dataset.json', 'r', encoding='utf-8') as f:
    expanded_qa_dataset = json.load(f)
|
25 |
|
26 |
+
# Function to create embeddings for the expanded QA dataset
|
27 |
+
def create_qa_dataset_embeddings(expanded_qa_dataset):
    """Embed the question of every item in the expanded QA dataset.

    Args:
        expanded_qa_dataset: Iterable of dicts, each with a 'question' key.

    Returns:
        A tuple ``(qa_embeddings, questions)``: a list with one embedding
        tensor per item, and the list of question strings in the same order.

    Raises:
        KeyError: If an item has no 'question' key.
    """
    questions = [item['question'] for item in expanded_qa_dataset]
    if not questions:
        # Nothing to encode; keep the (embeddings, questions) return shape.
        return [], questions

    # Encode all questions in one batched call instead of one model call per
    # question; the result rows are in the same order as `questions`.
    encoded = embedding_model.encode(questions, convert_to_tensor=True)

    # Split the batch back into one tensor per question so callers can still
    # torch.stack() the returned list.
    return list(encoded), questions
|
34 |
+
|
35 |
+
# Create QA dataset embeddings
|
36 |
+
qa_embeddings, qa_questions = create_qa_dataset_embeddings(expanded_qa_dataset)
|
37 |
|
38 |
# Function to create embeddings for the knowledge base content
|
39 |
def _entry_text(entry):
    """Flatten one knowledge-base entry into a single string for embedding."""
    sections = entry.get('content', [])

    # The title is one more space-joined part, so it no longer runs straight
    # into the first body text without a separator.
    parts = [entry.get('title', '')]
    parts.extend(section.get('text', '') for section in sections)

    for section in sections:
        for step in section.get('steps', []):
            details = step['details']
            # NOTE(review): other code in this file treats 'details' as a
            # plain string, so both shapes are accepted here. Joining a plain
            # string would insert a space between every character.
            parts.append(details if isinstance(details, str) else ' '.join(details))

    for section in sections:
        for faq in section.get('faq', []):
            parts.append(faq['question'] + ' ' + faq['answer'])

    return ' '.join(parts)


def create_knowledge_base_embeddings(knowledge_base):
    """Embed every knowledge-base entry as one vector.

    Each entry is flattened to "title + texts + step details + FAQ pairs".
    Every entry is embedded, including ones without a 'title', so that
    position ``i`` of the result always corresponds to ``knowledge_base[i]``;
    skipping entries would shift the indices used to look entries up again.

    Args:
        knowledge_base: List of entry dicts with optional 'title' and
            'content' keys.

    Returns:
        A list with one embedding tensor per entry, in input order.
    """
    texts = [_entry_text(entry) for entry in knowledge_base]
    if not texts:
        return []

    # One batched encode call; rows come back in the same order as `texts`.
    encoded = embedding_model.encode(texts, convert_to_tensor=True)
    return list(encoded)
|
|
|
51 |
# Create knowledge base embeddings
|
52 |
knowledge_base_embeddings = create_knowledge_base_embeddings(knowledge_base)
|
53 |
|
54 |
+
# Semantic search on expanded QA dataset
|
55 |
+
def search_expanded_qa(question):
    """Find the expanded-QA item whose question is most similar to *question*.

    Returns:
        A tuple ``(answer, score)``: the 'answer' of the best-matching item
        and its cosine similarity to the query as a Python float.
    """
    query_vector = embedding_model.encode(question, convert_to_tensor=True)

    # One row of similarities: the query against every stored question.
    similarities = util.pytorch_cos_sim(query_vector, torch.stack(qa_embeddings))

    top_index = torch.argmax(similarities).item()
    top_score = similarities[0, top_index].item()

    matched_item = expanded_qa_dataset[top_index]
    return matched_item['answer'], top_score
|
63 |
|
64 |
+
# Semantic search on knowledge base
|
65 |
+
def _step_lines(steps):
    """Collect the 'details' of each step as a flat list of strings."""
    lines = []
    for step in steps:
        details = step['details']
        # NOTE(review): 'details' is handled as a string here but as a list
        # of strings elsewhere in this file; accept both so joining never
        # raises TypeError on a list.
        if isinstance(details, str):
            lines.append(details)
        else:
            lines.extend(details)
    return lines


def search_knowledge_base(question):
    """Find the knowledge-base entry most similar to *question*.

    The best entry's content items are scanned in order and the first usable
    answer wins: an FAQ whose question text appears inside the user's
    question, otherwise the item's steps (one per line), otherwise its text.

    Returns:
        A tuple ``(answer, score)``: the answer string (or a fixed apology
        message when the entry holds nothing usable) and the cosine
        similarity of the best entry as a Python float.
    """
    question_embedding = embedding_model.encode(question, convert_to_tensor=True)
    cosine_scores = util.pytorch_cos_sim(question_embedding, torch.stack(knowledge_base_embeddings))

    best_match_idx = torch.argmax(cosine_scores).item()
    best_match_score = cosine_scores[0, best_match_idx].item()

    # NOTE(review): assumes knowledge_base_embeddings[i] was built from
    # knowledge_base[i]; confirm the embedding step does not skip entries.
    best_match_entry = knowledge_base[best_match_idx]
    asked = question.lower()

    # .get: an entry without 'content' falls through to the apology message
    # instead of raising KeyError.
    for content_item in best_match_entry.get('content', []):
        for faq in content_item.get('faq', []):
            if faq['question'].lower() in asked:
                return faq['answer'], best_match_score
        if 'steps' in content_item:
            return "\n".join(_step_lines(content_item['steps'])), best_match_score
        if 'text' in content_item:
            return content_item['text'], best_match_score

    return "Lo siento, no encontré una respuesta adecuada para tu pregunta.", best_match_score
|
86 |
+
|
87 |
+
# Answer function: search both datasets and return the best match
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
88 |
def answer_question(question):
    """Answer *question* from whichever data source matches it best.

    Both the expanded QA dataset and the knowledge base are searched; the
    answer with the higher similarity score is returned, and a tie goes to
    the expanded QA dataset.
    """
    qa_answer, qa_score = search_expanded_qa(question)
    kb_answer, kb_score = search_knowledge_base(question)

    return qa_answer if qa_score >= kb_score else kb_answer
|
100 |
|
101 |
+
# Gradio interface
|
102 |
# Gradio UI: a single text box wired to answer_question, with a text output.
interface = gr.Interface(
    title="OCN Customer Support Chatbot",
    description="Ask questions and get answers from the OCN knowledge base and expanded QA dataset.",
    fn=answer_question,
    inputs="text",
    outputs="text",
)

# Start the app.
# NOTE(review): share=True requests a public share link; confirm it is
# needed (and honoured) on the platform hosting this app.
interface.launch(share=True)
|
requirements.txt
CHANGED
@@ -1,7 +1,4 @@
|
|
1 |
-
transformers==4.
|
2 |
-
torch==
|
3 |
sentence-transformers==2.2.2
|
4 |
-
|
5 |
-
scikit-learn==1.3.0
|
6 |
-
gradio==3.16.2
|
7 |
-
numpy==1.24.3
|
|
|
1 |
+
transformers==4.26.1
|
2 |
+
torch==1.13.1
|
3 |
sentence-transformers==2.2.2
|
4 |
+
gradio==3.8.2
|
|
|
|
|
|