Uhhy committed on
Commit 6fc515c
1 parent: a17dc9a

Update app.py

Files changed (1): app.py +13 -12
app.py CHANGED
@@ -20,18 +20,25 @@ model_configs = [
     {"repo_id": "Ffftdtd5dtft/gemma-2-27b-Q2_K-GGUF", "filename": "gemma-2-27b-q2_k.gguf"},
 ]
 
-# Load a model
 def load_model(model_config):
+    print(f"Loading model {model_config['repo_id']}...")
     return Llama.from_pretrained(repo_id=model_config['repo_id'], filename=model_config['filename'])
 
-# Load all models simultaneously
 def load_all_models():
+    print("Starting model load...")
     with ThreadPoolExecutor(max_workers=len(model_configs)) as executor:
         futures = [executor.submit(load_model, config) for config in model_configs]
-        models = [future.result() for future in as_completed(futures)]
+        models = []
+        for future in tqdm(as_completed(futures), total=len(model_configs), desc="Loading models", unit="model"):
+            try:
+                model = future.result()
+                models.append(model)
+                print(f"Model loaded successfully: {model_configs[len(models)-1]['repo_id']}")
+            except Exception as e:
+                print(f"Error loading model: {e}")
+    print("All models have been loaded.")
     return models
 
-# Load models into memory
 llms = load_all_models()
 
 class ChatRequest(BaseModel):
@@ -40,7 +47,6 @@ class ChatRequest(BaseModel):
     top_p: float = 0.95
     temperature: float = 0.7
 
-# Function to generate chat responses
 def generate_chat_response(request, llm):
     try:
         user_input = normalize_input(request.message)
@@ -72,13 +78,10 @@ def filter_duplicates(responses):
     return unique_responses
 
 def select_best_response(responses):
-    # Remove duplicate responses
+    print("Filtering responses...")
     unique_responses = filter_duplicates(responses)
-    # Deduplicate responses
     unique_responses = list(set(unique_responses))
-    # Filter coherent responses
     coherent_responses = filter_by_coherence(unique_responses)
-    # Select the best response
     best_response = filter_by_similarity(coherent_responses)
     return best_response
 
@@ -97,6 +100,7 @@ def filter_by_similarity(responses):
     return best_response
 
 def worker_function(llm, request, progress_bar):
+    print("Generating a response with the model...")
     response = generate_chat_response(request, llm)
     progress_bar.update(1)
     return response
@@ -111,9 +115,7 @@ async def generate_chat(request: ChatRequest):
     responses = []
     num_models = len(llms)
 
-    # Create progress bar
     with tqdm(total=num_models, desc="Generating responses", unit="model") as progress_bar:
-        # Run models in parallel
         with ThreadPoolExecutor(max_workers=num_models) as executor:
             futures = [executor.submit(worker_function, llm, request, progress_bar) for llm in llms]
             for future in as_completed(futures):
@@ -123,7 +125,6 @@ async def generate_chat(request: ChatRequest):
                 except Exception as exc:
                     print(f"Error generating response: {exc}")
 
-    # Select the best response
     best_response = select_best_response(responses)
 
     print(f"Best response selected: {best_response}")
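The core of this commit is the loading loop: each Llama.from_pretrained call is submitted to a ThreadPoolExecutor, and as_completed is wrapped in tqdm so the progress bar advances as each future resolves. Below is a minimal, self-contained sketch of that pattern; slow_load and the config list are illustrative stand-ins (not from app.py) for Llama.from_pretrained and model_configs.

# Minimal sketch of the concurrent-loading pattern introduced by this commit.
# slow_load stands in for Llama.from_pretrained; configs mirrors model_configs.
import time
import random
from concurrent.futures import ThreadPoolExecutor, as_completed
from tqdm import tqdm

configs = [{"repo_id": f"org/model-{i}"} for i in range(4)]

def slow_load(config):
    time.sleep(random.uniform(0.1, 0.5))  # simulate a slow download/load
    return f"model<{config['repo_id']}>"

def load_all(configs):
    models = []
    with ThreadPoolExecutor(max_workers=len(configs)) as executor:
        # Map each future back to its config so the log line names the model
        # that actually finished, regardless of completion order.
        future_to_config = {executor.submit(slow_load, c): c for c in configs}
        for future in tqdm(as_completed(future_to_config), total=len(configs),
                           desc="Loading models", unit="model"):
            config = future_to_config[future]
            try:
                models.append(future.result())
                print(f"Loaded: {config['repo_id']}")
            except Exception as e:
                print(f"Failed to load {config['repo_id']}: {e}")
    return models

if __name__ == "__main__":
    print(load_all(configs))

The future-to-config map is one way to attribute results: as_completed yields futures in completion order, not submission order, so indexing the config list by position (as the committed code does with model_configs[len(models)-1]) can attach a success message to the wrong repo_id.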
 
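For reference, a hypothetical client call against the endpoint this file serves. The route path and port are assumptions, since the @app.post decorator falls outside the hunks shown; the message field is inferred from request.message, and the top_p and temperature defaults come from ChatRequest in the diff above.

# Hypothetical client call; the URL and route are assumptions, as the
# @app.post decorator is not visible in the hunks shown in this diff.
import requests

payload = {
    "message": "Hello!",   # inferred from request.message in generate_chat_response
    "top_p": 0.95,         # default declared on ChatRequest
    "temperature": 0.7,    # default declared on ChatRequest
}
resp = requests.post("http://localhost:8000/generate_chat", json=payload)
print(resp.json())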