-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathDockerfile.vllm
More file actions
45 lines (42 loc) · 1.81 KB
/
Dockerfile.vllm
File metadata and controls
45 lines (42 loc) · 1.81 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
# syntax=docker/dockerfile:1
# ============================================================
# vLLM Inference Server — Cloud Run GPU Service
# Base: vllm/vllm-openai (OpenAI-compatible API)
# GPU: NVIDIA L4 (23 GB VRAM — fits most 7-8B models in BF16)
#
# Exposes an OpenAI-compatible API at /v1.
# Model weights are loaded from GCS (mounted at /hf-cache) so
# cold starts use cached weights instead of re-downloading.
# ============================================================

# Pin the base image tag — `:latest` makes builds non-reproducible and can
# silently pull a breaking vLLM release between deploys (hadolint DL3007).
# Bump this tag deliberately when upgrading vLLM; for strict reproducibility,
# pin by digest (@sha256:…) as well.
FROM vllm/vllm-openai:v0.8.5

# Runtime configuration, grouped into a single ENV instruction (one logical
# unit of related settings; also one fewer layer per variable).
#   HF_HOME / VLLM_CACHE_ROOT — GCS bucket is mounted at /hf-cache at runtime
#     via --add-volume in `gcloud run deploy` (see deploy.yml), so HF and vLLM
#     caches survive cold starts.
#   TOKENIZERS_PARALLELISM=false — silences tokenizers fork warnings.
#   VLLM_WORKER_MULTIPROC_METHOD=spawn — fork is unsafe after CUDA init.
#   HF_HUB_DISABLE_XET=1 — disable HF Xet transfer backend.
#   PORT=8080 — default; Cloud Run injects its own PORT at runtime.
ENV HF_HOME=/hf-cache \
    VLLM_CACHE_ROOT=/hf-cache/vllm_cache \
    TOKENIZERS_PARALLELISM=false \
    VLLM_WORKER_MULTIPROC_METHOD=spawn \
    HF_HUB_DISABLE_XET=1 \
    PORT=8080

# EXPOSE is documentation only (Cloud Run routes traffic to $PORT regardless),
# but it records the service contract for operators and tooling.
EXPOSE 8080

# NOTE(review): no USER directive — container runs as root. The vllm-openai
# base image's GPU setup may require it on Cloud Run; confirm whether a
# non-root user works before hardening.

# ── Customize the CMD for your model ─────────────────────────────────────────
# vllm/vllm-openai ENTRYPOINT is: vllm serve
# CMD below supplies only the arguments (exec form — no shell, proper signal
# delivery). First positional arg is the HuggingFace model ID.
#
# Key flags to tune:
#   --served-model-name       name exposed by /v1/models (must match VLLM_MODEL_NAME in demo)
#   --max-model-len           context window in tokens
#   --max-num-seqs            max concurrent in-flight sequences (L4 sweet spot ~16 for 8B)
#   --dtype                   bfloat16 for Llama/Qwen/Mistral; float16 for older models
#   --tensor-parallel-size    use >1 for multi-GPU (not needed for L4 single-GPU)
#
# For gated models (e.g. Llama 3), pass --hf-token via env var in deploy.yml.
CMD [ \
    "Qwen/Qwen2.5-3B-Instruct", \
    "--served-model-name", "qwen2.5-3b-instruct", \
    "--port", "8080", \
    "--host", "0.0.0.0", \
    "--max-model-len", "8192", \
    "--max-num-seqs", "32", \
    "--dtype", "bfloat16", \
    "--enable-chunked-prefill", \
    "--enforce-eager" \
]