-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathDockerfile.vllm
More file actions
45 lines (42 loc) · 1.81 KB
/
Dockerfile.vllm
File metadata and controls
45 lines (42 loc) · 1.81 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
# syntax=docker/dockerfile:1
# ============================================================
# vLLM Inference Server — Cloud Run GPU Service
# Base: vllm/vllm-openai (OpenAI-compatible API)
# GPU: NVIDIA L4 (23 GB VRAM — fits most 7-8B models in BF16)
#
# Exposes an OpenAI-compatible API at /v1.
# Model weights are loaded from GCS (mounted at /hf-cache) so
# cold starts use cached weights instead of re-downloading.
# ============================================================

# Pin the base image tag — `:latest` makes builds non-reproducible and can
# silently pull a breaking vLLM release between deploys (hadolint DL3007).
# Bump this tag deliberately when upgrading vLLM; for strict reproducibility,
# pin by digest (@sha256:…) as well.
FROM vllm/vllm-openai:v0.8.5

# Runtime configuration, grouped into a single ENV instruction (one logical
# unit of related settings; also one fewer layer per variable).
#   HF_HOME / VLLM_CACHE_ROOT — GCS bucket is mounted at /hf-cache at runtime
#     via --add-volume in `gcloud run deploy` (see deploy.yml), so HF and vLLM
#     caches survive cold starts.
#   TOKENIZERS_PARALLELISM=false — silences tokenizers fork warnings.
#   VLLM_WORKER_MULTIPROC_METHOD=spawn — fork is unsafe after CUDA init.
#   HF_HUB_DISABLE_XET=1 — disable HF Xet transfer backend.
#   PORT=8080 — default; Cloud Run injects its own PORT at runtime.
ENV HF_HOME=/hf-cache \
    VLLM_CACHE_ROOT=/hf-cache/vllm_cache \
    TOKENIZERS_PARALLELISM=false \
    VLLM_WORKER_MULTIPROC_METHOD=spawn \
    HF_HUB_DISABLE_XET=1 \
    PORT=8080

# EXPOSE is documentation only (Cloud Run routes traffic to $PORT regardless),
# but it records the service contract for operators and tooling.
EXPOSE 8080

# NOTE(review): no USER directive — container runs as root. The vllm-openai
# base image's GPU setup may require it on Cloud Run; confirm whether a
# non-root user works before hardening.

# ── Customize the CMD for your model ─────────────────────────────────────────
# vllm/vllm-openai ENTRYPOINT is: vllm serve
# CMD below supplies only the arguments (exec form — no shell, proper signal
# delivery). First positional arg is the HuggingFace model ID.
#
# Key flags to tune:
#   --served-model-name       name exposed by /v1/models (must match VLLM_MODEL_NAME in demo)
#   --max-model-len           context window in tokens
#   --max-num-seqs            max concurrent in-flight sequences (L4 sweet spot ~16 for 8B)
#   --dtype                   bfloat16 for Llama/Qwen/Mistral; float16 for older models
#   --tensor-parallel-size    use >1 for multi-GPU (not needed for L4 single-GPU)
#
# For gated models (e.g. Llama 3), pass --hf-token via env var in deploy.yml.
CMD [ \
    "Qwen/Qwen2.5-3B-Instruct", \
    "--served-model-name", "qwen2.5-3b-instruct", \
    "--port", "8080", \
    "--host", "0.0.0.0", \
    "--max-model-len", "8192", \
    "--max-num-seqs", "32", \
    "--dtype", "bfloat16", \
    "--enable-chunked-prefill", \
    "--enforce-eager" \
]