Luke Stanley committed
Commit 233efeb
1 Parent(s): feeb679

RunPod Mixtral JSON output test

Files changed (2):
  1. runpod.dockerfile  +9 -0
  2. runpod_handler.py  +80 -4
runpod.dockerfile CHANGED
@@ -10,6 +10,15 @@ ENV HF_HOME="/runpod-volume/.cache/huggingface/"
 RUN python3.11 -m pip install --upgrade pip && \
     python3.11 -m pip install runpod==1.6.0
 
+RUN python3.11 -m pip install pytest cmake \
+    scikit-build setuptools pydantic-settings \
+    huggingface_hub hf_transfer \
+    pydantic pydantic_settings \
+    llama-cpp-python
+
+# Install llama-cpp-python (build with CUDA)
+ENV CMAKE_ARGS="-DLLAMA_CUBLAS=on"
+RUN python3.11 -m pip install llama-cpp-python --upgrade --no-cache-dir --force-reinstall
 ADD runpod_handler.py .
 
 CMD python3.11 -u /runpod_handler.py
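
Note on the build: by default pip compiles llama-cpp-python as a CPU-only build, so the CMAKE_ARGS="-DLLAMA_CUBLAS=on" force-reinstall is the step that actually produces a CUDA-enabled wheel. A minimal smoke test for the built image, assuming it is run with GPU access; the script name and model path below are placeholders, not part of this commit:

    # cuda_check.py (hypothetical) -- run inside the built image.
    # With verbose=True, llama.cpp logs at load time whether layers were
    # offloaded to the GPU (look for "offloaded ... layers to GPU").
    from llama_cpp import Llama

    llm = Llama(
        model_path="/runpod-volume/example.gguf",  # placeholder GGUF path
        n_gpu_layers=-1,  # offload every layer if the CUDA build took effect
        n_ctx=512,
        verbose=True,
    )
    print(llm("Q: What is 2+2? A:", max_tokens=8)["choices"][0]["text"])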
runpod_handler.py CHANGED
@@ -1,9 +1,82 @@
-""" Example handler file. """
-
+import json
+from os import environ as env
+from typing import Any, Dict, Union
+from llama_cpp import Llama, LlamaGrammar
+from pydantic import BaseModel, Field
 import runpod
 
+
 # If your handler runs inference on a model, load the model here.
 # You will want models to be loaded into memory before starting serverless.
+from huggingface_hub import hf_hub_download
+small_repo = "TheBloke/phi-2-GGUF"
+small_model = "phi-2.Q2_K.gguf"
+big_repo = "TheBloke/Mixtral-8x7B-Instruct-v0.1-GGUF"
+big_model = "mixtral-8x7b-instruct-v0.1.Q4_K_M.gguf"
+LLM_MODEL_PATH = hf_hub_download(
+    repo_id=big_repo,
+    filename=big_model,
+)
+print(f"Model downloaded to {LLM_MODEL_PATH}")
+
+
+
+in_memory_llm = None
+
+N_GPU_LAYERS = int(env.get("N_GPU_LAYERS", -1))  # Default to -1, which means offload all layers if a GPU is available
+CONTEXT_SIZE = int(env.get("CONTEXT_SIZE", 2048))
+USE_HTTP_SERVER = env.get("USE_HTTP_SERVER", "false").lower() == "true"
+MAX_TOKENS = int(env.get("MAX_TOKENS", 1000))
+TEMPERATURE = float(env.get("TEMPERATURE", 0.3))
+
+class Movie(BaseModel):
+    title: str = Field(..., title="The title of the movie")
+    year: int = Field(..., title="The year the movie was released")
+    director: str = Field(..., title="The director of the movie")
+    genre: str = Field(..., title="The genre of the movie")
+    plot: str = Field(..., title="Plot summary of the movie")
+
+JSON_EXAMPLE_MOVIE = """
+{ "title": "The Matrix", "year": 1999, "director": "The Wachowskis", "genre": "Science Fiction", "plot": "Programmer realises he lives in a simulation and plays a key role." }
+"""
+
+if in_memory_llm is None:
+    print("Loading model into memory. If you didn't want this, set the USE_HTTP_SERVER environment variable to 'true'.")
+    in_memory_llm = Llama(model_path=LLM_MODEL_PATH, n_ctx=CONTEXT_SIZE, n_gpu_layers=N_GPU_LAYERS, verbose=True)
+
+def llm_stream_sans_network(
+    prompt: str, pydantic_model_class=Movie, return_pydantic_object=False
+) -> Union[str, Dict[str, Any]]:
+    schema = pydantic_model_class.model_json_schema()
+
+    # An optional "example" field in the schema is not needed for grammar generation
+    if "example" in schema:
+        del schema["example"]
+
+    json_schema = json.dumps(schema)
+    grammar = LlamaGrammar.from_json_schema(json_schema)
+
+    stream = in_memory_llm(
+        prompt,
+        max_tokens=MAX_TOKENS,
+        temperature=TEMPERATURE,
+        grammar=grammar,
+        stream=True
+    )
+
+    output_text = ""
+    for chunk in stream:
+        result = chunk["choices"][0]
+        print(result["text"], end='', flush=True)
+        output_text = output_text + result["text"]
+
+    print('\n')
+
+    if return_pydantic_object:
+        model_object = pydantic_model_class.model_validate_json(output_text)
+        return model_object
+    else:
+        return output_text
 
 
 def handler(job):
@@ -12,7 +85,10 @@ def handler(job):
 
     name = job_input.get('name', 'World')
 
-    return f"Hello, {name}!"
-
+    # return f"Hello, {name}!"
+    return llm_stream_sans_network(
+        f"""You need to output JSON objects describing movies.
+For example, for the movie called: `The Matrix`: Output: {JSON_EXAMPLE_MOVIE}
+Instruct: Output the JSON object for the movie: `{name}` Output: """)
 
 runpod.serverless.start({"handler": handler})
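
Once the worker is deployed, the handler can be exercised end to end through RunPod's serverless API. A sketch of a client call; ENDPOINT_ID and API_KEY are placeholders for values this commit does not include:

    # invoke.py (hypothetical client, not part of this commit).
    import requests

    ENDPOINT_ID = "your-endpoint-id"  # placeholder
    API_KEY = "your-runpod-api-key"   # placeholder

    resp = requests.post(
        f"https://api.runpod.ai/v2/{ENDPOINT_ID}/runsync",
        headers={"Authorization": f"Bearer {API_KEY}"},
        json={"input": {"name": "Blade Runner"}},
        timeout=600,  # Mixtral on a cold worker can take a while
    )
    print(resp.json())  # "output" should hold the grammar-constrained JSON string

Because the grammar is generated from Movie's JSON schema, the returned text should always parse with Movie.model_validate_json(...), the same check the handler performs when return_pydantic_object=True.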