Spaces:
Paused
Paused
lock vllm v0.4.3
Browse files
- Dockerfile +1 -1
- api_server.py +1 -5
Dockerfile
CHANGED
@@ -14,7 +14,7 @@ RUN pip3 install "torch==2.1.1"
|
|
14 |
# This build is slow but NVIDIA does not provide binaries. Increase MAX_JOBS as needed.
|
15 |
# RUN pip3 install "git+https://github.com/stanford-futuredata/megablocks.git"
|
16 |
RUN pip3 install -U openai
|
17 |
-
RUN pip3 install
|
18 |
RUN pip3 install -U pydantic
|
19 |
RUN pip3 install -U aioprometheus
|
20 |
|
|
|
14 |
# This build is slow but NVIDIA does not provide binaries. Increase MAX_JOBS as needed.
|
15 |
# RUN pip3 install "git+https://github.com/stanford-futuredata/megablocks.git"
|
16 |
RUN pip3 install -U openai
|
17 |
+
RUN pip3 install vllm==0.4.3
|
18 |
RUN pip3 install -U pydantic
|
19 |
RUN pip3 install -U aioprometheus
|
20 |
|
api_server.py
CHANGED
@@ -29,7 +29,6 @@ from vllm.entrypoints.openai.serving_completion import OpenAIServingCompletion
|
|
29 |
from vllm.entrypoints.openai.serving_embedding import OpenAIServingEmbedding
|
30 |
from vllm.logger import init_logger
|
31 |
from vllm.usage.usage_lib import UsageContext
|
32 |
-
from vllm.utils import FlexibleArgumentParser
|
33 |
|
34 |
TIMEOUT_KEEP_ALIVE = 5 # seconds
|
35 |
|
@@ -60,11 +59,8 @@ async def lifespan(app: fastapi.FastAPI):
|
|
60 |
|
61 |
app = fastapi.FastAPI(lifespan=lifespan)
|
62 |
|
63 |
-
|
64 |
def parse_args():
|
65 |
-
parser_text = FlexibleArgumentParser(
|
66 |
-
description="vLLM OpenAI-Compatible RESTful API server.")
|
67 |
-
parser = make_arg_parser(parser_text)
|
68 |
return parser.parse_args()
|
69 |
|
70 |
|
|
|
29 |
from vllm.entrypoints.openai.serving_embedding import OpenAIServingEmbedding
|
30 |
from vllm.logger import init_logger
|
31 |
from vllm.usage.usage_lib import UsageContext
|
|
|
32 |
|
33 |
TIMEOUT_KEEP_ALIVE = 5 # seconds
|
34 |
|
|
|
59 |
|
60 |
app = fastapi.FastAPI(lifespan=lifespan)
|
61 |
|
|
|
62 |
def parse_args():
|
63 |
+
parser = make_arg_parser()
|
|
|
|
|
64 |
return parser.parse_args()
|
65 |
|
66 |
|