Heraali committed on
Commit 48a63b6
1 Parent(s): e3c75e5

Upload 2 files

Files changed (2)
  1. app.py +62 -69
  2. requirements.txt +3 -6
app.py CHANGED
@@ -3,47 +3,47 @@ from transformers import AutoModelForQuestionAnswering, AutoTokenizer, pipeline
  from sentence_transformers import SentenceTransformer, util
  import gradio as gr
  import json
- from fuzzywuzzy import fuzz

- # Load pre-trained BERT QA model and tokenizer from Hugging Face model hub
- model_name = "bert-large-uncased-whole-word-masking-finetuned-squad"
+ # Load the lightweight BERT-based QA model optimized for CPU
+ model_name = "distilbert-base-uncased-distilled-squad"  # Efficient for CPU
  model = AutoModelForQuestionAnswering.from_pretrained(model_name)
  tokenizer = AutoTokenizer.from_pretrained(model_name)

- # Dynamically handle device (CPU only)
- device = -1  # Force CPU usage by setting device to -1
-
- # Initialize the QA pipeline with the correct device
+ # Initialize pipeline for CPU usage
+ device = -1  # Force CPU
  qa_pipeline = pipeline("question-answering", model=model, tokenizer=tokenizer, device=device)

- # Load the knowledge base from JSON file
+ # Load Sentence-BERT for semantic search
+ embedding_model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
+
+ # Load knowledge base and expanded QA dataset
  with open('knowledge_base.json', 'r') as f:
      knowledge_base = json.load(f)

- # Load the expanded QA dataset
  with open('expanded_qa_dataset.json', 'r') as f:
      expanded_qa_dataset = json.load(f)

- # Load Sentence-BERT model for semantic search
- embedding_model = SentenceTransformer('all-MiniLM-L6-v2')
+ # Function to create embeddings for the expanded QA dataset
+ def create_qa_dataset_embeddings(expanded_qa_dataset):
+     qa_embeddings = []
+     questions = []
+     for item in expanded_qa_dataset:
+         questions.append(item['question'])
+         qa_embeddings.append(embedding_model.encode(item['question'], convert_to_tensor=True))
+     return qa_embeddings, questions
+
+ # Create QA dataset embeddings
+ qa_embeddings, qa_questions = create_qa_dataset_embeddings(expanded_qa_dataset)

  # Function to create embeddings for the knowledge base content
  def create_knowledge_base_embeddings(knowledge_base):
      embeddings = []
      for entry in knowledge_base:
          if 'title' in entry:
-             content = entry['title'] + ' ' + ' '.join(
+             content = entry['title'] + ' '.join(
                  [c.get('text', '') for c in entry.get('content', [])] +
-                 [
-                     ' '.join(step['details']) if isinstance(step['details'], list) else step['details']
-                     for c in entry.get('content', []) if 'steps' in c
-                     for step in c['steps']
-                 ] +
-                 [
-                     faq['question'] + ' ' + faq['answer']
-                     for c in entry.get('content', []) if 'faq' in c
-                     for faq in c['faq']
-                 ]
+                 [' '.join(step['details']) for c in entry.get('content', []) if 'steps' in c for step in c['steps']] +
+                 [faq['question'] + ' ' + faq['answer'] for c in entry.get('content', []) if 'faq' in c for faq in c['faq']]
              )
              embeddings.append(embedding_model.encode(content, convert_to_tensor=True))
      return embeddings
@@ -51,68 +51,61 @@ def create_knowledge_base_embeddings(knowledge_base):
  # Create knowledge base embeddings
  knowledge_base_embeddings = create_knowledge_base_embeddings(knowledge_base)

- # Function to retrieve the best context using semantic similarity
- def get_dynamic_context_semantic(question, knowledge_base, knowledge_base_embeddings, threshold=0.5):
-     # Create embedding for the question
+ # Semantic search on expanded QA dataset
+ def search_expanded_qa(question):
      question_embedding = embedding_model.encode(question, convert_to_tensor=True)
+     cosine_scores = util.pytorch_cos_sim(question_embedding, torch.stack(qa_embeddings))
+
+     best_match_idx = torch.argmax(cosine_scores).item()
+     best_match_score = cosine_scores[0, best_match_idx].item()
+
+     return expanded_qa_dataset[best_match_idx]['answer'], best_match_score

-     # Calculate cosine similarity between the question and knowledge base entries
+ # Semantic search on knowledge base
+ def search_knowledge_base(question):
+     question_embedding = embedding_model.encode(question, convert_to_tensor=True)
      cosine_scores = util.pytorch_cos_sim(question_embedding, torch.stack(knowledge_base_embeddings))

-     # Get the index of the highest score (most similar context)
      best_match_idx = torch.argmax(cosine_scores).item()
      best_match_score = cosine_scores[0, best_match_idx].item()

-     if best_match_score > threshold:  # Set a threshold for semantic similarity
-         best_match_entry = knowledge_base[best_match_idx]
-
-         # Check if FAQ section exists and prioritize FAQ answers
-         for content_item in best_match_entry['content']:
-             if 'faq' in content_item:
-                 for faq in content_item['faq']:
-                     if fuzz.token_sort_ratio(faq['question'].lower(), question.lower()) > 80:
-                         return faq['answer']
-
-         # If no FAQ is found, check for steps
-         for content_item in best_match_entry['content']:
-             if 'steps' in content_item:
-                 step_details = [step['details'] for step in content_item['steps']]
-                 return "\n".join(step_details)
-
-         # Fallback to regular text
-         for content_item in best_match_entry['content']:
-             if 'text' in content_item:
-                 return content_item['text']
-
-     return "Lo siento, no encontré una respuesta adecuada a tu pregunta."
-
- # Use fuzzy matching to find the closest match in the expanded QA dataset
- def get_answer_from_expanded_qa(question, expanded_qa_dataset, threshold=80):
-     for item in expanded_qa_dataset:
-         # Use fuzzy matching to find close matches
-         if fuzz.token_sort_ratio(item['question'].lower(), question.lower()) > threshold:
-             return item['answer']
-     return None
-
- # Answer function for the Gradio app
+     # Retrieve content from best matched knowledge base entry
+     best_match_entry = knowledge_base[best_match_idx]
+     for content_item in best_match_entry['content']:
+         if 'faq' in content_item:
+             for faq in content_item['faq']:
+                 if faq['question'].lower() in question.lower():
+                     return faq['answer'], best_match_score
+         if 'steps' in content_item:
+             step_details = [step['details'] for step in content_item['steps']]
+             return "\n".join(step_details), best_match_score
+         if 'text' in content_item:
+             return content_item['text'], best_match_score
+
+     return "Lo siento, no encontré una respuesta adecuada para tu pregunta.", best_match_score
+
+ # Answer function: search both datasets and return the best match
  def answer_question(question):
-     # Check if the question matches any entry in the expanded QA dataset
-     direct_answer = get_answer_from_expanded_qa(question, expanded_qa_dataset)
-     if direct_answer:
-         return direct_answer
+     # Search expanded QA dataset
+     qa_answer, qa_score = search_expanded_qa(question)
+
+     # Search knowledge base
+     kb_answer, kb_score = search_knowledge_base(question)

-     # If no direct answer found, use the knowledge base with semantic search
-     context = get_dynamic_context_semantic(question, knowledge_base, knowledge_base_embeddings, threshold=0.45)
-     return context
+     # Compare scores and return the best answer
+     if qa_score >= kb_score:
+         return qa_answer
+     else:
+         return kb_answer

- # Gradio interface setup
+ # Gradio interface
  interface = gr.Interface(
      fn=answer_question,
      inputs="text",
      outputs="text",
      title="OCN Customer Support Chatbot",
-     description="Ask questions and get answers from the OCN knowledge base."
+     description="Ask questions and get answers from the OCN knowledge base and expanded QA dataset."
  )

- # Launch the Gradio interface
+ # Launch the interface
  interface.launch(share=True)
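
Note: the semantic-search pattern this commit adopts (embed once, rank all candidates by cosine similarity) replaces the fuzzywuzzy matching it removes. Below is a minimal sketch of that pattern under stated assumptions: the two example questions are hypothetical stand-ins for the entries in expanded_qa_dataset.json, while encode, util.pytorch_cos_sim, torch.stack, and torch.argmax are the same calls app.py makes (app.py assumes torch is imported near the top of the file, outside these hunks).

import torch
from sentence_transformers import SentenceTransformer, util

embedding_model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

# Hypothetical stand-ins for the question entries in expanded_qa_dataset.json
questions = ["How do I reset my password?", "How do I cancel my subscription?"]
question_embeddings = torch.stack(
    [embedding_model.encode(q, convert_to_tensor=True) for q in questions]
)

# Embed the query once, then rank every candidate by cosine similarity
query_embedding = embedding_model.encode("help resetting my password", convert_to_tensor=True)
cosine_scores = util.pytorch_cos_sim(query_embedding, question_embeddings)  # shape (1, N)
best_idx = torch.argmax(cosine_scores).item()
print(questions[best_idx], cosine_scores[0, best_idx].item())

Because search_expanded_qa and search_knowledge_base both return raw cosine scores, answer_question can compare them directly; unlike the removed get_dynamic_context_semantic, the new code applies no minimum-similarity threshold, so the best match is returned even when its score is low.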
requirements.txt CHANGED
@@ -1,7 +1,4 @@
- transformers==4.30.2
- torch==2.0.1
+ transformers==4.26.1
+ torch==1.13.1
  sentence-transformers==2.2.2
- fuzzywuzzy==0.18.0
- scikit-learn==1.3.0
- gradio==3.16.2
- numpy==1.24.3
+ gradio==3.8.2
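
The dropped pins track the code change: fuzzywuzzy goes away with the fuzzy-matching path removed from app.py, and scikit-learn and numpy are no longer pinned explicitly (they may still arrive transitively, e.g. via sentence-transformers). A quick sanity check, a sketch rather than part of the commit, that the remaining pins import together with the versions requirements.txt expects:

import gradio
import sentence_transformers
import torch
import transformers

# Versions printed should match the pins in requirements.txt
print(transformers.__version__)           # expected: 4.26.1
print(torch.__version__)                  # expected: 1.13.1
print(sentence_transformers.__version__)  # expected: 2.2.2
print(gradio.__version__)                 # expected: 3.8.2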