Dockerfile.rocm_verl-0.3.0.post1

#  Build the docker in the repo dir:
# docker build -f docker/Dockerfile.rocm -t verl-rocm:03.04.2015 .
# docker images # you can find your built docker


# Support - Traing: fsdp; Inference: vllm
# FROM rocm/vllm:rocm6.2_mi300_ubuntu20.04_py3.9_vllm_0.6.4
# Support - Traing: fsdp; Inference: vllm, sglang
FROM lmsysorg/sglang:v0.4.6.post5-rocm630

# Set working directory
# WORKDIR $PWD/app

# Set environment variables
ENV PYTORCH_ROCM_ARCH="gfx90a;gfx942"

ENV HIPCC_COMPILE_FLAGS_APPEND="--amdgpu-target=gfx90a;gfx942 -D__HIP_PLATFORM_AMD__"
ENV CFLAGS="-D__HIP_PLATFORM_AMD__"
ENV CXXFLAGS="-D__HIP_PLATFORM_AMD__"

# Install vllm
RUN pip uninstall -y vllm && \
    rm -rf vllm && \
    git clone -b v0.6.3 https://github.com/vllm-project/vllm.git && \
    cd vllm && \
    MAX_JOBS=$(nproc) python3 setup.py install && \
    cd .. && \
    rm -rf vllm

# Copy the entire project directory
COPY . .

# Install dependencies
RUN pip install "tensordict==0.6.2" --no-deps && \
    pip install accelerate \
    codetiming \
    datasets \
    dill \
    hydra-core \
    liger-kernel \
    numpy \
    pandas \
    peft \
    "pyarrow>=15.0.0" \
    pylatexenc \
    "ray[data,train,tune,serve]<2.45.0" \
    torchdata \
    transformers \
    wandb \
    orjson \
    pybind11
    
RUN git clone https://github.com/volcengine/verl.git && \
    cd verl && \
    pip install -e . 

# Install torch_memory_saver
RUN pip install git+https://github.com/ExtremeViscent/torch_memory_saver.git --no-deps