Uhhy committed on
Commit d6a8693
1 Parent(s): e17bba9

Update app.py

Files changed (1)
  1. app.py +71 -37
app.py CHANGED
@@ -6,50 +6,68 @@ from tqdm import tqdm
 import uvicorn
 from dotenv import load_dotenv
 from difflib import SequenceMatcher
-import threading
+import re

+# Load environment variables
 load_dotenv()

+# Initialize the FastAPI application
 app = FastAPI()

+# Global dictionary that stores the loaded models
+global_data = {
+    'models': []
+}
+
 # Model configuration
 model_configs = [
     {"repo_id": "Ffftdtd5dtft/gpt2-xl-Q2_K-GGUF", "filename": "gpt2-xl-q2_k.gguf"},
     {"repo_id": "Ffftdtd5dtft/Meta-Llama-3.1-8B-Instruct-Q2_K-GGUF", "filename": "meta-llama-3.1-8b-instruct-q2_k.gguf"},
     {"repo_id": "Ffftdtd5dtft/gemma-2-9b-it-Q2_K-GGUF", "filename": "gemma-2-9b-it-q2_k.gguf"},
+    {"repo_id": "Ffftdtd5dtft/gemma-2-27b-Q2_K-GGUF", "filename": "gemma-2-27b-q2_k.gguf"},
     {"repo_id": "Ffftdtd5dtft/Phi-3-mini-128k-instruct-Q2_K-GGUF", "filename": "phi-3-mini-128k-instruct-q2_k.gguf"},
     {"repo_id": "Ffftdtd5dtft/Meta-Llama-3.1-8B-Q2_K-GGUF", "filename": "meta-llama-3.1-8b-q2_k.gguf"},
+    {"repo_id": "Ffftdtd5dtft/Qwen2-7B-Instruct-Q2_K-GGUF", "filename": "qwen2-7b-instruct-q2_k.gguf"},
     {"repo_id": "Ffftdtd5dtft/starcoder2-3b-Q2_K-GGUF", "filename": "starcoder2-3b-q2_k.gguf"},
-    {"repo_id": "Ffftdtd5dtft/Qwen2-1.5B-Instruct-Q2_K-GGUF", "filename": "qwen2-1.5b-instruct-q2_k.gguf"},
+    {"repo_id": "Ffftdtd5dtft/Qwen2-1.5B-Instruct-Q2_K-GGUF", "filename": "qwen2-1.5b-instruct-q2_k.gguf"}
 ]

-def load_model(model_config):
-    print(f"Cargando modelo {model_config['repo_id']}...")
-    return Llama.from_pretrained(repo_id=model_config['repo_id'], filename=model_config['filename'])
-
-def load_all_models():
-    print("Iniciando carga de modelos...")
-    with ThreadPoolExecutor(max_workers=len(model_configs)) as executor:
-        futures = [executor.submit(load_model, config) for config in model_configs]
-        models = []
-        for future in tqdm(as_completed(futures), total=len(model_configs), desc="Cargando modelos", unit="modelo"):
-            try:
-                model = future.result()
-                models.append(model)
-                print(f"Modelo cargado exitosamente: {model_configs[len(models)-1]['repo_id']}")
-            except Exception as e:
-                print(f"Error al cargar el modelo: {e}")
-    print("Todos los modelos han sido cargados.")
-    return models
-
-llms = load_all_models()
-
+# Class to manage the models
+class ModelManager:
+    def __init__(self):
+        self.models = []
+
+    def load_model(self, model_config):
+        print(f"Cargando modelo {model_config['repo_id']}...")
+        return Llama.from_pretrained(repo_id=model_config['repo_id'], filename=model_config['filename'])
+
+    def load_all_models(self):
+        print("Iniciando carga de modelos...")
+        with ThreadPoolExecutor(max_workers=len(model_configs)) as executor:
+            futures = [executor.submit(self.load_model, config) for config in model_configs]
+            models = []
+            for future in tqdm(as_completed(futures), total=len(model_configs), desc="Cargando modelos", unit="modelo"):
+                try:
+                    model = future.result()
+                    models.append(model)
+                    print(f"Modelo cargado exitosamente: {model_configs[len(models)-1]['repo_id']}")
+                except Exception as e:
+                    print(f"Error al cargar el modelo: {e}")
+        print("Todos los modelos han sido cargados.")
+        return models
+
+# Instantiate the ModelManager and load all models at startup
+model_manager = ModelManager()
+global_data['models'] = model_manager.load_all_models()
+
+# Request model for the chat endpoint
 class ChatRequest(BaseModel):
     message: str
     top_k: int = 50
     top_p: float = 0.95
     temperature: float = 0.7

+# Function to generate chat responses
 def generate_chat_response(request, llm):
     try:
         user_input = normalize_input(request.message)
@@ -67,32 +85,48 @@ def generate_chat_response(request, llm)
 def normalize_input(input_text):
     return input_text.strip()

-def filter_duplicates(responses):
+def remove_duplicates(text):
+    # Remove specific repetitive patterns
+    text = re.sub(r'(Hello there, how are you\? \[/INST\]){2,}', 'Hello there, how are you? [/INST]', text)
+    text = re.sub(r'(How are you\? \[/INST\]){2,}', 'How are you? [/INST]', text)
+
+    # Remove the [/INST] marker
+    text = text.replace('[/INST]', '')
+
+    # Generalized line-level duplicate removal
+    lines = text.split('\n')
+    unique_lines = list(dict.fromkeys(lines))
+    return '\n'.join(unique_lines).strip()
+
+def remove_repetitive_responses(responses):
+    # Filter out repetitive responses
     seen = set()
     unique_responses = []
     for response in responses:
-        lines = response.split('\n')
-        unique_lines = set()
-        for line in lines:
-            if line not in seen:
-                seen.add(line)
-                unique_lines.add(line)
-        unique_responses.append('\n'.join(unique_lines))
+        normalized_response = remove_duplicates(response)
+        if normalized_response not in seen:
+            seen.add(normalized_response)
+            unique_responses.append(normalized_response)
     return unique_responses

 def select_best_response(responses):
     print("Filtrando respuestas...")
-    unique_responses = filter_duplicates(responses)
-    unique_responses = list(set(unique_responses))
+    responses = remove_repetitive_responses(responses)
+    responses = [remove_duplicates(response) for response in responses]
+    unique_responses = list(set(responses))
     coherent_responses = filter_by_coherence(unique_responses)
     best_response = filter_by_similarity(coherent_responses)
     return best_response

 def filter_by_coherence(responses):
-    # Implement a coherence filter here if needed
+    # Sort responses by length and similarity for basic coherence
+    print("Ordenando respuestas por coherencia...")
+    responses.sort(key=len, reverse=True)
     return responses

 def filter_by_similarity(responses):
+    # Select the most coherent, unique response
+    print("Filtrando respuestas por similitud...")
     responses.sort(key=len, reverse=True)
     best_response = responses[0]
     for i in range(1, len(responses)):
@@ -103,7 +137,7 @@ def filter_by_similarity(responses)
     return best_response

 def worker_function(llm, request, progress_bar):
-    print(f"Generando respuesta con el modelo...")
+    print(f"Generando respuesta con el modelo {llm}...")
     response = generate_chat_response(request, llm)
     progress_bar.update(1)
     return response
@@ -116,11 +150,11 @@ async def generate_chat(request: ChatRequest)
     print(f"Procesando solicitud: {request.message}")

     responses = []
-    num_models = len(llms)
+    num_models = len(global_data['models'])

     with tqdm(total=num_models, desc="Generando respuestas", unit="modelo") as progress_bar:
         with ThreadPoolExecutor(max_workers=num_models) as executor:
-            futures = [executor.submit(worker_function, llm, request, progress_bar) for llm in llms]
+            futures = [executor.submit(worker_function, llm, request, progress_bar) for llm in global_data['models']]
             for future in as_completed(futures):
                 try:
                     response = future.result()
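For reference, a minimal client call against the updated endpoint might look like the sketch below. The route path and port are assumptions (the @app decorator is outside the hunks shown here); the payload fields mirror the ChatRequest model above (message, top_k, top_p, temperature).

    import requests  # assumes the requests package is installed

    # Assumed route and port; adjust to wherever the FastAPI app is served.
    ENDPOINT = "http://localhost:8000/chat"

    payload = {
        "message": "Hello there, how are you?",
        "top_k": 50,          # defaults taken from ChatRequest
        "top_p": 0.95,
        "temperature": 0.7,
    }

    response = requests.post(ENDPOINT, json=payload)
    print(response.json())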
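A quick, self-contained check of the remove_duplicates helper added in this commit; the sample input is invented for illustration.

    import re

    def remove_duplicates(text):
        # Same logic as the helper added in this commit
        text = re.sub(r'(Hello there, how are you\? \[/INST\]){2,}', 'Hello there, how are you? [/INST]', text)
        text = re.sub(r'(How are you\? \[/INST\]){2,}', 'How are you? [/INST]', text)
        text = text.replace('[/INST]', '')
        lines = text.split('\n')
        unique_lines = list(dict.fromkeys(lines))  # order-preserving de-duplication
        return '\n'.join(unique_lines).strip()

    sample = "Hello there, how are you? [/INST]Hello there, how are you? [/INST]\nFine, thanks.\nFine, thanks."
    print(remove_duplicates(sample))
    # Expected output:
    # Hello there, how are you?
    # Fine, thanks.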