forked from thu-pacman/chitu
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathDockerfile
More file actions
176 lines (145 loc) · 7 KB
/
Dockerfile
File metadata and controls
176 lines (145 loc) · 7 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
#####################################
# Base Image Stage
#
# Starts from the pytorch/pytorch 2.8.0 / CUDA 12.9 / cuDNN 9 devel image and
# declares the build arguments and environment used by the instructions below.
FROM pytorch/pytorch:2.8.0-cuda12.9-cudnn9-devel AS base

# Run every shell-form instruction through bash.
SHELL ["/bin/bash", "-c"]

# Build-time knobs (override with --build-arg).
ARG torch_cuda_arch_list='7.0 7.5 8.0 8.6 8.9 9.0+PTX'
ARG optional_deps='flash_attn,flash_mla,flashinfer'
ARG chitu_setup_jobs=''
ARG enable_cython='true'
ARG enable_test='false'
ARG FLASH_ATTENTION_FORCE_BUILD="TRUE"

# MAX_JOBS is derived from CHITU_SETUP_JOBS, so it has to live in a separate
# ENV instruction that comes after the one defining CHITU_SETUP_JOBS.
ENV CHITU_SETUP_JOBS=${chitu_setup_jobs}
ENV MAX_JOBS=${CHITU_SETUP_JOBS}

# Fail the build early when a boolean-like ARG holds anything other than
# 'true' or 'false'.
RUN case "${enable_cython}" in \
        true|false) ;; \
        *) echo "ARG enable_cython must either be 'true' or 'false'"; exit 1 ;; \
    esac
RUN case "${enable_test}" in \
        true|false) ;; \
        *) echo "ARG enable_test must either be 'true' or 'false'"; exit 1 ;; \
    esac

# Required for non-interactive apt install
ENV DEBIAN_FRONTEND=noninteractive \
    TZ=Etc/UTC
ENV TORCH_CUDA_ARCH_LIST=${torch_cuda_arch_list}
# Install git, curl, the gcc-10/g++-10 compilers and the NUMA / ibverbs / RDMA
# packages.
# `apt-get` is used instead of `apt` (whose command-line interface is not
# intended for scripts), and the package lists are removed in the same layer
# so they are not stored in the image.
RUN apt-get update -y && \
    apt-get install -y \
        curl \
        g++-10 \
        gcc-10 \
        git \
        ibverbs-providers \
        libibverbs-dev \
        libibverbs1 \
        libnuma-dev \
        rdma-core && \
    rm -rf /var/lib/apt/lists/*
# Download prometheus
#
# Installs the prometheus and promtool v3.9.1 binaries into /usr/local/bin.
# The archive matching `uname -m` (amd64 or arm64) is selected; any other
# architecture aborts the build.
#
# Source selection:
# - If both the `tos_id` and `tos_key` build secrets are present and non-empty,
#   tosutil is downloaded and used to fetch the archive from TOS.
# - Otherwise the archive is downloaded from the GitHub release page.
#
# curl runs with --fail so that an HTTP error aborts the build instead of
# saving an error page as the tool or archive.
RUN --mount=type=secret,id=tos_id \
    --mount=type=secret,id=tos_key \
    mkdir -p /workspace/prometheus && \
    case "$(uname -m)" in \
        x86_64|amd64) \
            GITHUB_URL="https://github.com/prometheus/prometheus/releases/download/v3.9.1/prometheus-3.9.1.linux-amd64.tar.gz" && \
            TOS_URL="tos://out-deliver/prometheus-3.9.1.linux-amd64.tar" && \
            TOOL_URL="https://tos-tools.tos-cn-beijing.volces.com/linux/tosutil" \
            ;; \
        aarch64|arm64) \
            GITHUB_URL="https://github.com/prometheus/prometheus/releases/download/v3.9.1/prometheus-3.9.1.linux-arm64.tar.gz" && \
            TOS_URL="tos://out-deliver/prometheus-3.9.1.linux-arm64.tar" && \
            TOOL_URL="https://m645b3e1bb36e-mrap.mrap.accesspoint.tos-global.volces.com/linux/arm64/tosutil" \
            ;; \
        *) \
            echo "Unsupported arch: $(uname -m)" >&2 && exit 1 \
            ;; \
    esac && \
    if [ -s /run/secrets/tos_id ] && [ -s /run/secrets/tos_key ]; then \
        echo "tos_id and tos_key exist, download prometheus from TOS" && \
        tos_id="$(cat /run/secrets/tos_id)" && \
        tos_key="$(cat /run/secrets/tos_key)" && \
        curl -fSL --retry 3 --retry-delay 5 "${TOOL_URL}" --output /tmp/tosutil && \
        chmod a+x /tmp/tosutil && \
        /tmp/tosutil cp -u -r -p=8 -j=8 -threshold=104857600 -k "${tos_key}" -i "${tos_id}" \
            -e tos-cn-beijing.volces.com -re out-deliver.tos-cn-beijing.volces.com "${TOS_URL}" /workspace && \
        tar -xf /workspace/prometheus-*.tar --strip-components=1 -C /workspace/prometheus && \
        rm -rf /workspace/prometheus-*.tar && \
        rm -rf /tmp/tosutil; \
    else \
        echo "Download prometheus from GitHub" && \
        curl -fL --retry 3 --retry-delay 5 -o /workspace/prometheus.tar.gz "${GITHUB_URL}" && \
        tar -xzvf /workspace/prometheus.tar.gz --strip-components=1 -C /workspace/prometheus && \
        rm -rf /workspace/prometheus.tar.gz; \
    fi && \
    cp /workspace/prometheus/prometheus /usr/local/bin && \
    cp /workspace/prometheus/promtool /usr/local/bin/ && \
    rm -rf /workspace/prometheus && \
    prometheus --version
# Upgrade pip itself (kept below 25.3) from the TUNA PyPI mirror, with a
# BuildKit cache mounted at pip's cache directory.
RUN --mount=type=cache,target=/root/.cache/pip \
    pip install --upgrade --index-url https://pypi.tuna.tsinghua.edu.cn/simple "pip<25.3"
# Optional test tooling, installed only when ARG enable_test is 'true'.
# NOTE: Always apt-get update before apt-get install to avoid out-dated docker cache
# NOTE: Test dependencies include:
# - pytest is for test/pytest (for all platforms).
# - aiohttp is for service tests (for all platforms).
# - matplotlib is for benchmarks/op_bench (for platforms with triton).
# NOTE(review): lark-oapi is installed as well; its consumer is not documented
# here - confirm and add it to the list above.
# `apt-get` replaces `apt` (not intended for scripts), and the package lists
# are removed in the same layer so they are not stored in the image.
RUN if [ "${enable_test}" = "true" ]; then \
        apt-get update -y && \
        apt-get install -y expect vim tmux telnet htop lsof strace iputils-ping && \
        rm -rf /var/lib/apt/lists/* && \
        pip install -i https://pypi.tuna.tsinghua.edu.cn/simple pytest aiohttp lark-oapi matplotlib; \
    fi
# Build-time dependencies are installed unconditionally: other dependencies
# may fail to build when some of these are missing.
COPY ./requirements-build.txt ./requirements-build-deep_ep-cu12.txt /tmp/

# Every already-installed package except setuptools is passed as a constraint
# (pinned to its current version) while installing the build requirements.
RUN --mount=type=cache,target=/root/.cache/pip \
    pip install \
        --index-url https://pypi.tuna.tsinghua.edu.cn/simple \
        --requirement /tmp/requirements-build.txt \
        --constraint <(pip list --format freeze | grep -v "setuptools")

# The deep_ep build requirements are installed only when `optional_deps`
# contains the substring "deep_ep".
RUN case "${optional_deps}" in \
        *deep_ep*) \
            pip install \
                --index-url https://pypi.tuna.tsinghua.edu.cn/simple \
                --requirement /tmp/requirements-build-deep_ep-cu12.txt \
                --constraint <(pip list --format freeze | grep -v "setuptools") \
            ;; \
    esac

ENV FLASH_MLA_DISABLE_SM100=1
#####################################
# Dependency Resolver Stage
#
# Exists solely to generate /tmp/requirements.txt. Since the whole build
# context is copied in, any change in the source code may trigger a rebuild of
# this stage, but this stage runs fast.
FROM base AS dependency_resolver
WORKDIR /workspace/chitu
COPY . ./
RUN ./gen_tmp_requirements_txt.py "${optional_deps}" >/tmp/requirements.txt
#####################################
# Dependency Installer Stage
#
# Installs everything listed in requirements.txt. Some of the dependencies may
# require compilation, so this stage may take a long time, but it is rebuilt
# only when the requirements.txt file changes, or when the source of the
# dependencies changes.
FROM base AS dependency_installer
WORKDIR /workspace/chitu
COPY --from=dependency_resolver /tmp/requirements.txt /tmp/requirements.txt

# Don't use `--mount=type=cache,target=/root/.cache/pip` here, because some dependencies
# compile at install time, and the compile results are environment dependent.
#
# ./third_party and ./csrc/cpuinfer are bind-mounted writable from the build
# context for the duration of the install. Installed packages are passed as
# constraints, except pillow, fsspec, numpy, transformers and pytest.
RUN --mount=type=bind,source=./third_party,target=./third_party,rw \
    --mount=type=bind,source=./csrc/cpuinfer,target=./csrc/cpuinfer,rw \
    pip install \
        --no-build-isolation \
        --index-url https://pypi.tuna.tsinghua.edu.cn/simple \
        --requirement /tmp/requirements.txt \
        --constraint <(pip list --format freeze | grep -v -e "pillow" -e "fsspec" -e "numpy" -e "transformers" -e "pytest")
#####################################
# Wheel Build Stage
#
# This stage builds the wheel file of chitu and leaves it in /tmp/.
FROM dependency_installer AS wheel_builder
ARG enable_cython
WORKDIR /workspace/chitu
COPY . ./

# Build the chitu wheel; the script receives the enable_cython flag.
RUN ./script/build_for_dist.sh "${enable_cython}"

# Copy the wheel to /tmp/ (this fails if no wheel was produced), list /tmp/ in
# the build log, then clear the source tree.
RUN cp dist/*.whl /tmp/ && \
    ls -al /tmp/ && \
    rm -rf /workspace/chitu/*
#####################################
# Build Stage
#
# This stage builds chitu: it installs the wheel produced by the wheel_builder
# stage and copies the test, script and benchmarks directories.
FROM dependency_installer AS build

# The wheel is bind-mounted from wheel_builder instead of being copied in, so
# it is never written into a layer of this image (deleting a copied file in a
# later RUN does not shrink the image). /tmp is emptied in the same RUN.
# Installed packages are passed as constraints, except pillow, fsspec,
# flash-mla / flash_mla, numpy, transformers and pytest.
RUN --mount=type=bind,from=wheel_builder,source=/tmp,target=/mnt/chitu_wheels \
    pip install -i https://pypi.tuna.tsinghua.edu.cn/simple /mnt/chitu_wheels/*.whl \
        -c <(pip list --format freeze | grep -v -e "pillow" -e "fsspec" -e "flash-mla" -e "flash_mla" -e "numpy" -e "transformers" -e "pytest") && \
    rm -rf /tmp/*

COPY ./test ./test
COPY ./script ./script
COPY ./benchmarks ./benchmarks

# These are optimization flags for NCCL, but according to our tests, they only make things
# worse, so we set them to 0.
ENV NCCL_GRAPH_MIXING_SUPPORT=0
ENV NCCL_GRAPH_REGISTER=0