Luke Stanley committed on
Commit 9475016
1 Parent(s): 976ea17

Ensure N_GPU_LAYERS is int

Files changed (1)
  1. utils.py +25 -1
utils.py CHANGED
```diff
@@ -19,7 +19,7 @@ from llama_cpp import Llama, LlamaGrammar, json_schema_to_gbnf
 URL = "http://localhost:5834/v1/chat/completions"
 in_memory_llm = None
 
-N_GPU_LAYERS = env.get("N_GPU_LAYERS", -1) # Default to -1, which means use all layers if available
+N_GPU_LAYERS = int(env.get("N_GPU_LAYERS", 20)) # Default to 20; set -1 to use all layers if available
 CONTEXT_SIZE = int(env.get("CONTEXT_SIZE", 4096))
 LLM_MODEL_PATH = env.get("LLM_MODEL_PATH", None)
 USE_HTTP_SERVER = env.get("USE_HTTP_SERVER", "false").lower() == "true"
```
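The `int(...)` coercion matters because values read from the process environment are always strings: an exported `N_GPU_LAYERS=35` would otherwise reach `llama_cpp` as `"35"` and fail where an integer is expected. A minimal sketch of the behaviour (variable names mirror utils.py; the assertion is only illustrative):

```python
from os import environ as env

# Environment values are always str, even when they look numeric:
# N_GPU_LAYERS=35 in the shell arrives here as the string "35".
raw = env.get("N_GPU_LAYERS", 20)   # str if exported, the int fallback otherwise
layers = int(raw)                   # int("35") -> 35; int(20) is a harmless no-op

assert isinstance(layers, int)
```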
```diff
@@ -147,3 +147,27 @@ def query_ai_prompt(prompt, replacements, model_class, in_memory=True):
         return llm_stream_sans_network(prompt, model_class)
     else:
         return llm_streaming(prompt, model_class)
+
+
+def llm_stream_sans_network_simple(
+    prompt: str, json_schema: str
+):
+    grammar = LlamaGrammar.from_json_schema(json_schema)
+
+    stream = in_memory_llm(
+        prompt,
+        max_tokens=MAX_TOKENS,
+        temperature=TEMPERATURE,
+        grammar=grammar,
+        stream=True
+    )
+
+    output_text = ""
+    for chunk in stream:
+        result = chunk["choices"][0]
+        print(result["text"], end='', flush=True)
+        output_text = output_text + result["text"]
+        #yield result["text"]
+
+    print('\n')
+    return output_text
```
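A minimal usage sketch for the new helper, assuming `in_memory_llm` has already been initialised from `LLM_MODEL_PATH` elsewhere in utils.py; the schema below is a hypothetical example, and `LlamaGrammar.from_json_schema` constrains decoding so the streamed output should parse as JSON matching it:

```python
import json

from utils import llm_stream_sans_network_simple

# Hypothetical schema: force the model to emit {"answer": "..."}.
schema = json.dumps({
    "type": "object",
    "properties": {"answer": {"type": "string"}},
    "required": ["answer"],
})

raw = llm_stream_sans_network_simple(
    "Answer briefly, as JSON: what does n_gpu_layers control?",
    schema,
)
result = json.loads(raw)  # the grammar constrained generation to valid JSON
print(result["answer"])
```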