Luke Stanley committed on
Commit
976ea17
1 Parent(s): 233efeb

Expose json typed LLM interface for RunPod

Browse files
Files changed (4) hide show
  1. docker-compose.yml +11 -0
  2. runpod.dockerfile +12 -2
  3. runpod_handler.py +22 -75
  4. test.sh +28 -0
docker-compose.yml ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ version: '3.8'
2
+ services:
3
+ runpod:
4
+ build:
5
+ context: .
6
+ dockerfile: runpod.dockerfile
7
+ volumes:
8
+ - ./.cache:/runpod-volume/.cache
9
+ - ./test.sh:/test.sh
10
+ command: /test.sh
11
+ entrypoint: /usr/bin/python3
runpod.dockerfile CHANGED
@@ -15,10 +15,20 @@ RUN python3.11 -m pip install pytest cmake \
15
  huggingface_hub hf_transfer \
16
  pydantic pydantic_settings \
17
  llama-cpp-python
18
-
19
  # Install llama-cpp-python (build with cuda)
20
  ENV CMAKE_ARGS="-DLLAMA_CUBLAS=on"
21
- RUN python3.11 -m pip install llama-cpp-python --upgrade --no-cache-dir --force-reinstall
 
22
  ADD runpod_handler.py .
23
 
 
 
 
 
 
 
 
 
24
  CMD python3.11 -u /runpod_handler.py
 
 
15
  huggingface_hub hf_transfer \
16
  pydantic pydantic_settings \
17
  llama-cpp-python
18
+
19
  # Install llama-cpp-python (build with cuda)
20
  ENV CMAKE_ARGS="-DLLAMA_CUBLAS=on"
21
+ RUN python3.11 -m pip install git+https://github.com/lukestanley/llama-cpp-python.git@expose_json_grammar_convert_function --upgrade --no-cache-dir --force-reinstall
22
# Install jq. The two steps are chained with && so that a failing
# `apt-get update` aborts the build instead of being silently ignored.
RUN apt-get update && apt-get install -y jq
23
  ADD runpod_handler.py .
24
 
25
+ ADD chill.py .
26
+ ADD utils.py .
27
+ ADD promptObjects.py .
28
+
29
+ #ENV REPO_ID="TheBloke/phi-2-GGUF"
30
+ #ENV MODEL_FILE="phi-2.Q2_K.gguf"
31
+ ENV N_GPU_LAYERS=-1
32
+ ENV CONTEXT_SIZE=2048
33
  CMD python3.11 -u /runpod_handler.py
34
+
runpod_handler.py CHANGED
@@ -1,34 +1,7 @@
1
- import json
2
  from os import environ as env
3
- from typing import Any, Dict, Union
4
- from llama_cpp import Llama, LlamaGrammar
5
  from pydantic import BaseModel, Field
6
- import runpod
7
-
8
-
9
- # If your handler runs inference on a model, load the model here.
10
- # You will want models to be loaded into memory before starting serverless.
11
- from huggingface_hub import hf_hub_download
12
- small_repo = "TheBloke/phi-2-GGUF"
13
- small_model="phi-2.Q2_K.gguf"
14
- big_repo = "TheBloke/Mixtral-8x7B-Instruct-v0.1-GGUF"
15
- big_model = "mixtral-8x7b-instruct-v0.1.Q4_K_M.gguf"
16
- LLM_MODEL_PATH =hf_hub_download(
17
- repo_id=big_repo,
18
- filename=big_model,
19
- )
20
- print(f"Model downloaded to {LLM_MODEL_PATH}")
21
-
22
-
23
-
24
- in_memory_llm = None
25
-
26
- N_GPU_LAYERS = env.get("N_GPU_LAYERS", -1) # Default to -1, which means use all layers if available
27
- CONTEXT_SIZE = int(env.get("CONTEXT_SIZE", 2048))
28
- USE_HTTP_SERVER = env.get("USE_HTTP_SERVER", "false").lower() == "true"
29
- MAX_TOKENS = int(env.get("MAX_TOKENS", 1000))
30
- TEMPERATURE = float(env.get("TEMPERATURE", 0.3))
31
-
32
  class Movie(BaseModel):
33
  title: str = Field(..., title="The title of the movie")
34
  year: int = Field(..., title="The year the movie was released")
@@ -36,17 +9,7 @@ class Movie(BaseModel):
36
  genre: str = Field(..., title="The genre of the movie")
37
  plot: str = Field(..., title="Plot summary of the movie")
38
 
39
- JSON_EXAMPLE_MOVIE = """
40
- { "title": "The Matrix", "year": 1999, "director": "The Wachowskis", "genre": "Science Fiction", "plot":"Prgrammer realises he lives in simulation and plays key role."
41
- """
42
-
43
- if in_memory_llm is None:
44
- print("Loading model into memory. If you didn't want this, set the USE_HTTP_SERVER environment variable to 'true'.")
45
- in_memory_llm = Llama(model_path=LLM_MODEL_PATH, n_ctx=CONTEXT_SIZE, n_gpu_layers=N_GPU_LAYERS, verbose=True)
46
-
47
- def llm_stream_sans_network(
48
- prompt: str, pydantic_model_class=Movie, return_pydantic_object=False
49
- ) -> Union[str, Dict[str, Any]]:
50
  schema = pydantic_model_class.model_json_schema()
51
 
52
  # Optional example field from schema, is not needed for the grammar generation
@@ -54,41 +17,25 @@ def llm_stream_sans_network(
54
  del schema["example"]
55
 
56
  json_schema = json.dumps(schema)
57
- grammar = LlamaGrammar.from_json_schema(json_schema)
58
-
59
- stream = in_memory_llm(
60
- prompt,
61
- max_tokens=MAX_TOKENS,
62
- temperature=TEMPERATURE,
63
- grammar=grammar,
64
- stream=True
65
- )
66
-
67
- output_text = ""
68
- for chunk in stream:
69
- result = chunk["choices"][0]
70
- print(result["text"], end='', flush=True)
71
- output_text = output_text + result["text"]
72
-
73
- print('\n')
74
-
75
- if return_pydantic_object:
76
- model_object = pydantic_model_class.model_validate_json(output_text)
77
- return model_object
78
- else:
79
- return output_text
80
-
81
-
82
  def handler(job):
83
  """ Handler function that will be used to process jobs. """
84
  job_input = job['input']
85
-
86
- name = job_input.get('name', 'World')
87
-
88
- #return f"Hello, {name}!"
89
- return llm_stream_sans_network(
90
- f"""You need to output JSON objects describing movies.
91
- For example for the movie called: `The Matrix`: Output: {JSON_EXAMPLE_MOVIE}
92
- Instruct: Output the JSON object for the movie: `{name}` Output: """)
93
-
94
- runpod.serverless.start({"handler": handler})
 
 
 
 
 
1
+ import runpod
2
  from os import environ as env
3
+ import json
 
4
  from pydantic import BaseModel, Field
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5
  class Movie(BaseModel):
6
  title: str = Field(..., title="The title of the movie")
7
  year: int = Field(..., title="The year the movie was released")
 
9
  genre: str = Field(..., title="The genre of the movie")
10
  plot: str = Field(..., title="Plot summary of the movie")
11
 
12
+ def pydantic_model_to_json_schema(pydantic_model_class):
 
 
 
 
 
 
 
 
 
 
13
  schema = pydantic_model_class.model_json_schema()
14
 
15
  # Optional example field from schema, is not needed for the grammar generation
 
17
  del schema["example"]
18
 
19
  json_schema = json.dumps(schema)
20
+ return json_schema
21
+ default_schema_example = """{ "title": ..., "year": ..., "director": ..., "genre": ..., "plot":...}"""
22
+ default_schema = pydantic_model_to_json_schema(Movie)
23
+ default_prompt = f"Instruct: \nOutput a JSON object in this format: {default_schema_example} for the following movie: The Matrix\nOutput:\n"
24
+ from utils import llm_stream_sans_network_simple
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
25
def handler(job):
    """Process one RunPod job and return the schema-constrained LLM output.

    ``prompt`` and ``schema`` are read from ``job['input']``; when a key is
    absent the module-level ``default_prompt`` / ``default_schema`` is used.
    Both are then passed to ``llm_stream_sans_network_simple``.

    Args:
        job: Job mapping; must contain an ``'input'`` mapping.

    Returns:
        A string of the form ``model:<MODEL_FILE>``, a newline, then the
        output returned by ``llm_stream_sans_network_simple``.

    Raises:
        KeyError: If ``job`` has no ``'input'`` key.
    """
    job_input = job['input']
    # Only used to label the response; this function does not load the model.
    filename = env.get("MODEL_FILE", "mixtral-8x7b-instruct-v0.1.Q4_K_M.gguf")
    prompt = job_input.get('prompt', default_prompt)
    schema = job_input.get('schema', default_schema)
    # default_schema is a JSON *string*. Callers of a JSON API will often send
    # the schema as a plain object, so serialise anything that is not already
    # a string to keep a single representation downstream.
    if not isinstance(schema, str):
        schema = json.dumps(schema)
    print("got this input", str(job_input))
    print("prompt", prompt)
    print("schema", schema)
    output = llm_stream_sans_network_simple(prompt, schema)
    return f"model:{filename}\n{output}"
37
+
38
+ runpod.serverless.start({
39
+ "handler": handler,
40
+ #"return_aggregate_stream": True
41
+ })
test.sh ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#!/usr/bin/env python3
"""Smoke-test runpod_handler.py locally with a sample movie job.

Builds a RunPod-style job payload (JSON schema plus prompt), prints it, and
runs ``python3.11 runpod_handler.py --test_input <payload>``. The script
exits with the handler process's return code so a failing run is visible
to the caller.
"""
import json
import subprocess
import sys

# JSON schema describing the object the model is asked to produce.
MOVIE_SCHEMA = {
    "properties": {
        "title": {"title": "The title of the movie", "type": "string"},
        "year": {"title": "The year the movie was released", "type": "integer"},
        "director": {"title": "The director of the movie", "type": "string"},
        "genre": {"title": "The genre of the movie", "type": "string"},
        "plot": {"title": "Plot summary of the movie", "type": "string"},
    },
    "required": ["title", "year", "director", "genre", "plot"],
    "title": "Movie",
    "type": "object",
}


def build_job_input(movie, schema=MOVIE_SCHEMA):
    """Return the JSON job payload (as a string) asking about *movie*.

    The schema is embedded as a JSON-encoded string inside the payload,
    alongside the prompt, under the top-level ``"input"`` key.
    """
    prompt = (
        'Instruct: Output a JSON object in this format: '
        '{ "title": ..., "year": ..., "director": ..., "genre": ..., "plot":...}'
        ' for the following movie: ' + movie + "\nOutput:\n"
    )
    return json.dumps({"input": {"schema": json.dumps(schema), "prompt": prompt}})


def build_command(json_input):
    """Return the argv list that runs the handler with *json_input*.

    An argument list (no shell) is used so that quotes or other shell
    metacharacters inside the JSON cannot break or alter the command.
    """
    return ["python3.11", "runpod_handler.py", "--test_input", json_input]


def main():
    """Print the payload, run the handler, and return its exit status."""
    json_input = build_job_input("Toy Story")
    # Flush so the payload appears before the child process's own output.
    print(json_input, flush=True)
    return subprocess.run(build_command(json_input)).returncode


if __name__ == "__main__":
    sys.exit(main())