# Build the docker image from the repo dir:
#   docker build -f docker/Dockerfile.rocm -t verl-rocm:03.04.2015 .
#   docker images   # lists the image you just built

# Support - Training: fsdp; Inference: vllm
# FROM rocm/vllm:rocm6.2_mi300_ubuntu20.04_py3.9_vllm_0.6.4

# Support - Training: fsdp; Inference: vllm, sglang
FROM lmsysorg/sglang:v0.4.6.post5-rocm630

# Set working directory (disabled; the base image's working directory is used)
# WORKDIR $PWD/app

# GPU architectures to build for.
ENV PYTORCH_ROCM_ARCH="gfx90a;gfx942"

# Extra compiler flags: select the same AMD GPU targets for hipcc and define
# __HIP_PLATFORM_AMD__ for HIP, C and C++ compilation.
ENV HIPCC_COMPILE_FLAGS_APPEND="--amdgpu-target=gfx90a;gfx942 -D__HIP_PLATFORM_AMD__" \
    CFLAGS="-D__HIP_PLATFORM_AMD__" \
    CXXFLAGS="-D__HIP_PLATFORM_AMD__"

# Install vllm: remove any vllm already present in the base image, then build
# v0.6.3 from source. The checkout is deleted in the same layer so the source
# tree does not persist in the image.
RUN pip uninstall -y vllm && \
    rm -rf vllm && \
    git clone -b v0.6.3 https://github.com/vllm-project/vllm.git && \
    cd vllm && \
    MAX_JOBS=$(nproc) python3 setup.py install && \
    cd .. && \
    rm -rf vllm

# Copy the entire project directory
# NOTE(review): every later layer is rebuilt whenever any file in the build
# context changes; consider moving this below the dependency install.
COPY . .

# Install dependencies.
# tensordict is installed with --no-deps so it does not pull in its own
# dependency set. --no-cache-dir keeps pip's download/wheel cache out of the
# image layer.
RUN pip install --no-cache-dir "tensordict==0.6.2" --no-deps && \
    pip install --no-cache-dir \
        accelerate \
        codetiming \
        datasets \
        dill \
        hydra-core \
        liger-kernel \
        numpy \
        pandas \
        peft \
        "pyarrow>=15.0.0" \
        pylatexenc \
        "ray[data,train,tune,serve]<2.45.0" \
        torchdata \
        transformers \
        wandb \
        orjson \
        pybind11

# Install verl in editable mode from a fresh upstream clone.
# NOTE(review): this installs the GitHub default branch, not the sources
# brought in by COPY above - confirm that is intended.
RUN git clone https://github.com/volcengine/verl.git && \
    cd verl && \
    pip install --no-cache-dir -e .

# Install torch_memory_saver
RUN pip install --no-cache-dir git+https://github.com/ExtremeViscent/torch_memory_saver.git --no-deps