build.sh 7.85 KB
Newer Older
lijian6's avatar
lijian6 committed
1
#!/bin/bash
lishen's avatar
lishen committed
2
# set -eux
lijian6's avatar
lijian6 committed
3

lijian6's avatar
lijian6 committed
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
# Location of the amd_comgr CMake package, exported for child processes.
export amd_comgr_DIR=${ROCM_PATH}/lib64/cmake

# Candidate clang resource directories, in ascending version order.
llvm15_path=${ROCM_PATH}/llvm/lib/clang/15.0.0
llvm17_path=${ROCM_PATH}/llvm/lib/clang/17.0.0
llvm18_path=${ROCM_PATH}/llvm/lib/clang/18

# Pick the clang resource directory used at link time. Every existing
# candidate is reported and the last (highest) one wins. A pre-existing
# llvm_path value is kept when no candidate exists.
for llvm_candidate in "${llvm15_path}" "${llvm17_path}" "${llvm18_path}"; do
    if [ -d "${llvm_candidate}" ]; then
        # The last path component is the version label (15.0.0, 17.0.0, 18).
        echo "llvm version is ${llvm_candidate##*/}"
        llvm_path=${llvm_candidate}
    fi
done
unset llvm_candidate

# The link step builds paths from llvm_path, so say so early when it is empty.
if [ -z "${llvm_path:-}" ]; then
    echo "Warning: no clang resource directory (15.0.0, 17.0.0, 18) found under '${ROCM_PATH}/llvm/lib/clang'" >&2
fi

# Directory containing this script; quoted so paths with spaces work.
src_path=$(dirname "$(realpath "$0")")

lijian6's avatar
lijian6 committed
24
25
26
# Object files are written to build_/; create it unless it is already there.
[ -d "build_" ] || mkdir -p build_
lijian6's avatar
lijian6 committed
27

lijian6's avatar
lijian6 committed
28
29
# Ask python3's sysconfig for the interpreter header directory ...
PYTHON_INCLUDE="$(
    python3 -c "from sysconfig import get_paths; print(get_paths()['include'])"
)"
# ... and for the platform-specific site-packages directory.
PYTHON_PLATLIB="$(
    python3 -c "from sysconfig import get_paths; print(get_paths()['platlib'])"
)"
lijian6's avatar
lijian6 committed
30

lijian6's avatar
lijian6 committed
31
32
# Build switches. All default to OFF and are turned ON from the command line.
USE_NVSHMEM=OFF
USE_ROCSHMEM=OFF
ROCM_DISABLE_CTX=OFF
ROCM_DISABLE_MULTIQP=OFF

#######################################
# Parse command-line arguments into the build switches above.
# Globals:   USE_ROCSHMEM, USE_NVSHMEM, ROCM_DISABLE_CTX,
#            ROCM_DISABLE_MULTIQP (written)
# Arguments: any of: rocshmem | nvshmem | dushmem |
#            ROCM_DISABLE_CTX=ON | ROCM_DISABLE_MULTIQP=ON
# Outputs:   usage text on stderr for an unrecognised argument
# Returns:   0 on success, 1 on an unrecognised argument
#######################################
parse_build_args() {
    local arg
    for arg in "$@"; do
        case "$arg" in
            rocshmem)
                USE_ROCSHMEM=ON
                ;;
            nvshmem|dushmem)
                USE_NVSHMEM=ON
                ;;
            ROCM_DISABLE_CTX=ON)
                ROCM_DISABLE_CTX=ON
                ;;
            ROCM_DISABLE_MULTIQP=ON)
                ROCM_DISABLE_MULTIQP=ON
                ;;
            *)
                echo "Usage: ./build.sh rocshmem [ROCM_DISABLE_CTX=ON] [ROCM_DISABLE_MULTIQP=ON] / ./build.sh dushmem" >&2
                return 1
                ;;
        esac
    done
}

# NOTE(review): running with no backend argument is accepted here and the
# script carries on with neither backend selected — confirm that is intended.
parse_build_args "$@" || exit 1

lishen's avatar
lishen committed
57
58
59
60
#######################################
# Print an --offload-arch flag for a GPU reported by rocm_agent_enumerator.
# Arguments: none
# Outputs:   "--offload-arch=<gfxNNN>" on stdout, or nothing when the tool
#            is missing or reports no GPU agent
# Returns:   0 always
#######################################
detect_offload_arch() {
    local arch

    # Without the enumerator there is nothing to detect.
    command -v rocm_agent_enumerator >/dev/null 2>&1 || return 0

    # gfx000 is dropped because it is not a usable GPU offload target.
    # Of the remaining names the greatest in byte order is taken; LC_ALL=C
    # keeps that choice independent of the caller's locale.
    arch=$(rocm_agent_enumerator 2>/dev/null \
        | grep -E '^gfx[0-9]+' \
        | grep -vx 'gfx000' \
        | LC_ALL=C sort -r \
        | head -n1)

    if [ -n "$arch" ]; then
        echo "--offload-arch=$arch"
    fi
    return 0
}
DETECTED_ARCH=$(detect_offload_arch)
echo "Current $DETECTED_ARCH"
lishen's avatar
lishen committed
68

lijian6's avatar
lijian6 committed
69
70
# Report the effective build switches, one NAME=value line each.
for setting in USE_NVSHMEM USE_ROCSHMEM ROCM_DISABLE_CTX ROCM_DISABLE_MULTIQP; do
    echo "${setting}=${!setting}"
done
unset setting
73

lijian6's avatar
lijian6 committed
74
75
76
77
78
79
80
81
82
83
84
85
86
# -------------------------- With rocSHMEM -------------------------- #
#######################################
# Build the rocSHMEM submodule by running its gda_mlx5 build config script
# inside third-party/rocshmem/build (relative to the current directory).
# Globals:   src_path (read) - directory to return to afterwards
# Arguments: none
# Outputs:   progress on stdout, errors on stderr
# Returns:   0 on success; 1 if a directory cannot be entered or the build
#            script fails
#######################################
build_rocshmem()
{
    local source_dir="third-party/rocshmem"
    local build_dir="build"

    # Stop here if the source tree is missing, rather than creating and
    # using a stray build/ directory in the caller's working directory.
    cd "$source_dir" || {
        echo "错误: 无法进入源码目录 '$source_dir'" >&2
        return 1
    }
    mkdir -p "$build_dir"
    cd "$build_dir" || {
        echo "错误: 无法进入构建目录 '$build_dir'" >&2
        cd "$src_path"
        return 1
    }
    echo "cd third-party/rocshmem/build"

    # Only report success when the build script actually succeeded.
    if ! bash ../scripts/build_configs/gda_mlx5; then
        echo "错误: 编译rocshmem失败" >&2
        cd "$src_path"
        return 1
    fi

    echo "编译rocshmem成功"
    cd "$src_path"
}

if [[ "$USE_ROCSHMEM" == "ON" ]]; then
    # Fetch the rocSHMEM submodule when its sources are not checked out yet.
    if [[ ! -d "third-party/rocshmem/src/" ]]; then
        echo "download submodule..."
        if ! git submodule update --init third-party/rocshmem; then
            echo "Error: failed to fetch submodule third-party/rocshmem" >&2
            exit 1
        fi
    fi

    mkdir -p third-party/rocshmem_install

    # Do not go on to compile against a rocSHMEM that failed to build.
    if ! build_rocshmem; then
        echo "Error: rocSHMEM build failed" >&2
        exit 1
    fi

    # NOTE(review): build_rocshmem runs its build script without arguments, so
    # nothing visible here tells it to install into this prefix — confirm.
    SHMEM_INSTALL_PREFIX=$(pwd)/third-party/rocshmem_install

    # Default compile flags; a non-empty COMPILE_OPTIONS from the environment
    # wins. The single-quoted pieces keep their inner double quotes in the value.
    COMPILE_OPTIONS=${COMPILE_OPTIONS:= -fPIC -D__HIP_PLATFORM_AMD__=1 -DUSE_ROCM=1 -DHIPBLAS_V2 -DCUDA_HAS_FP16=1 -O3 -fgpu-rdc -DTORCH_API_INCLUDE_EXTENSION_H '-DPYBIND11_COMPILER_TYPE="_gcc"' '-DPYBIND11_STDLIB="_libstdcpp"' '-DPYBIND11_BUILD_ABI="_cxxabi1014"' -DTORCH_EXTENSION_NAME=deep_ep_cpp -D_GLIBCXX_USE_CXX11_ABI=1 ${DETECTED_ARCH}  -std=c++17 -Wno-return-type}

    # Optional feature switches are prepended as preprocessor defines.
    if [[ "$ROCM_DISABLE_CTX" == "ON" ]]; then
        COMPILE_OPTIONS="-DROCM_DISABLE_CTX $COMPILE_OPTIONS"
    fi
    if [[ "$ROCM_DISABLE_MULTIQP" == "ON" ]]; then
        COMPILE_OPTIONS="-DROCM_DISABLE_MULTIQP $COMPILE_OPTIONS"
    fi

    # Link against the static rocSHMEM library unless the caller supplied
    # SHMEM_LINK_OPTIONS.
    SHMEM_LINK_OPTIONS=${SHMEM_LINK_OPTIONS:="-Wl,-rpath,${SHMEM_INSTALL_PREFIX}/lib/ -l:librocshmem.a"}
fi
# -------------------------- rocSHMEM END -------------------------- #
# -------------------------- With duSHMEM -------------------------- #
lijian6's avatar
lijian6 committed
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
#######################################
# Configure, build and install duSHMEM from third-party/dushmem-hip
# (relative to the current directory) using cmake and make.
# Globals:   src_path (read), ROCM_PATH (read),
#            CMAKE_PREFIX_PATH, NVSHMEM_PREFIX (exported)
# Arguments: none
# Outputs:   progress on stdout, errors on stderr
# Returns:   0 on success; 1 if a directory cannot be entered, the
#            environment script cannot be sourced, or any build step fails
#######################################
build_dushmem()
{
    local source_dir="third-party/dushmem-hip"
    local build_dir="build"

    cd "$source_dir" || {
        echo "错误: 无法进入源码目录 '$source_dir'" >&2
        return 1
    }
    # "./" keeps source from searching PATH for a same-named file.
    source ./env.build.sh || {
        echo "错误: 无法加载 '$source_dir/env.build.sh'" >&2
        cd "$src_path"
        return 1
    }
    export CMAKE_PREFIX_PATH=${ROCM_PATH}/lib/cmake/amd_comgr:${ROCM_PATH}/lib64/cmake/amd_comgr:${CMAKE_PREFIX_PATH:-}
    export NVSHMEM_PREFIX=$src_path/third-party/dushmem_install

    mkdir -p "$build_dir"
    cd "$build_dir" || {
        echo "错误: 无法进入构建目录 '$build_dir'" >&2
        cd "$src_path"
        return 1
    }
    echo "cd third-party/dushmem-hip/build"

    # Stop at the first failing step instead of reporting success regardless.
    if ! { cmake ../ && make -j64 && make install; }; then
        echo "错误: 编译dushmem-hip失败" >&2
        cd "$src_path"
        return 1
    fi

    echo "编译dushmem-hip成功"
    cd "$src_path"
}
136
if [[ "$USE_NVSHMEM" == "ON" ]]; then
    # The in-tree duSHMEM build below is disabled; the installation under
    # ${ROCM_PATH}/dushmem is used instead.
    # if [ ! -d "third-party/dushmem-hip/src/" ]; then
    #     echo "download submodule..."
    #     git submodule update --init third-party/dushmem-hip
    # fi

    # if [ ! -d "third-party/dushmem_install" ]; then
    #     mkdir -p third-party/dushmem_install
    # fi
    # build_dushmem
    # SHMEM_INSTALL_PREFIX=$(pwd)/third-party/dushmem_install

    # An empty ROCM_PATH would silently turn the prefix into "/dushmem".
    if [[ -z "${ROCM_PATH:-}" ]]; then
        echo "Error: ROCM_PATH must be set to locate duSHMEM" >&2
        exit 1
    fi
    SHMEM_INSTALL_PREFIX=${ROCM_PATH}/dushmem

    # Default compile flags; a non-empty COMPILE_OPTIONS from the environment
    # wins. The single-quoted pieces keep their inner double quotes in the value.
    COMPILE_OPTIONS=${COMPILE_OPTIONS:= -fPIC -DFORCE_DUSHMEM_API -DHIP_ENABLE_WARP_SYNC_BUILTINS -D__HIP_PLATFORM_AMD__=1 -DUSE_ROCM=1 -DHIPBLAS_V2 -DCUDA_HAS_FP16=1 -O3 -fgpu-rdc -DTORCH_API_INCLUDE_EXTENSION_H '-DPYBIND11_COMPILER_TYPE="_gcc"' '-DPYBIND11_STDLIB="_libstdcpp"' '-DPYBIND11_BUILD_ABI="_cxxabi1014"' -DTORCH_EXTENSION_NAME=deep_ep_cpp -D_GLIBCXX_USE_CXX11_ABI=1 ${DETECTED_ARCH} -std=c++17 -Wno-return-type}

    # Unlike COMPILE_OPTIONS, the link options are always overwritten here.
    SHMEM_LINK_OPTIONS="-Wl,-rpath,${SHMEM_INSTALL_PREFIX}/lib/ -l:libdushmem_device.a -ldushmem_host"
fi
lijian6's avatar
lijian6 committed
151
# -------------------------- duSHMEM END -------------------------- #
lishen's avatar
lishen committed
152

153
# Header search paths handed to hipcc. A non-empty INCLUDE_PATHS from the
# environment is kept as-is; otherwise the default list below is used.
if [ -z "${INCLUDE_PATHS}" ]; then
    INCLUDE_PATHS="-Icsrc/ -I${SHMEM_INSTALL_PREFIX}/include/ -I/opt/mpi/include -I${PYTHON_PLATLIB}/torch/include -I${PYTHON_PLATLIB}/torch/include/torch/csrc/api/include -I${PYTHON_PLATLIB}/torch/include/TH -I${PYTHON_PLATLIB}/torch/include/THC -I${PYTHON_PLATLIB}/torch/include/THH -I/opt/dtk/include -I${PYTHON_INCLUDE}"
fi
154

lishen's avatar
lishen committed
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
# Source files to compile (relative paths).
SOURCES=()
SOURCES+=("csrc/kernels/runtime.cu")
SOURCES+=("csrc/kernels/layout.cu")
SOURCES+=("csrc/kernels/intranode.cu")
SOURCES+=("csrc/kernels/internode.cu")
SOURCES+=("csrc/kernels/internode_ll.cu")
SOURCES+=("csrc/deep_ep.cu")

# Object file list, initially empty.
OBJECTS=()

#######################################
# Compile one source file with hipcc unless its object file is up to date.
# Globals:   INCLUDE_PATHS, COMPILE_OPTIONS (read; split into words on purpose)
# Arguments: $1 - source file, $2 - object file to produce
# Outputs:   progress on stdout, compile failure on stderr
# Returns:   0 if the object is up to date or was compiled, 1 if hipcc failed
#######################################
compile_if_stale() {
    local src=$1
    local obj=$2

    # Up to date means: the object exists and the source is not newer than it.
    if [[ -f "$obj" && ! "$src" -nt "$obj" ]]; then
        echo "Skipping $src (up to date)"
        return 0
    fi

    echo "Compiling $src -> $obj"
    # shellcheck disable=SC2086 — both variables hold several flags each
    if ! hipcc ${INCLUDE_PATHS} -c "$src" -o "$obj" ${COMPILE_OPTIONS}; then
        echo "Error: failed to compile $src" >&2
        # Remove any partial output so the next run compiles this file again.
        rm -f -- "$obj"
        return 1
    fi
}

# Compile every source file and abort on the first failure, so a stale or
# missing object is never linked and packaged.
for src in "${SOURCES[@]}"; do
    # Objects are flattened into build_/ under the source's base name.
    obj="build_/$(basename "${src%.cu}.o")"
    OBJECTS+=("$obj")
    compile_if_stale "$src" "$obj" || exit 1
done

# Link stage: the extension module that the link step produces.
# NOTE(review): the cpython-310 / x86_64 tag in the file name is fixed here —
# confirm it matches the python3 interpreter used for the build.
OUTPUT="deep_ep/deep_ep_cpp.cpython-310-x86_64-linux-gnu.so"

# Decide whether to relink: always when the output is missing, otherwise
# only when at least one object file is newer than it.
need_link=false
if [[ -f "$OUTPUT" ]]; then
    for obj in "${OBJECTS[@]}"; do
        if [[ "$obj" -nt "$OUTPUT" ]]; then
            need_link=true
            break
        fi
    done
else
    need_link=true
fi
198

lishen's avatar
lishen committed
199
200
201
202
203
204
205
#######################################
# Link all object files into the shared extension module with hipcc.
# Globals:   OBJECTS, OUTPUT, SHMEM_INSTALL_PREFIX, DETECTED_ARCH, llvm_path,
#            PYTHON_PLATLIB, SHMEM_LINK_OPTIONS (read)
# Arguments: none
# Returns:   exit status of hipcc
#######################################
link_extension() {
    # DETECTED_ARCH and SHMEM_LINK_OPTIONS stay unquoted on purpose: the first
    # may be empty and the second holds several flags.
    # shellcheck disable=SC2206
    local -a link_cmd=(
        hipcc
        -Wno-unused-result -Wsign-compare -DNDEBUG -g -fwrapv -O2 -Wall -g
        -fstack-protector-strong -Wformat -Werror=format-security -g -fwrapv -O2
        -shared -Wl,-O1 -Wl,-Bsymbolic-functions
        "${OBJECTS[@]}"
        "-L${SHMEM_INSTALL_PREFIX}/lib/" -L/opt/mpi/lib -L/opt/dtk/hip/lib
        -L/usr/lib/x86_64-linux-gnu -lhipblaslt -lamdhip64
        -o "$OUTPUT"
        -Wl,-rpath,/opt/dtk/lib -fgpu-rdc --hip-link ${DETECTED_ARCH}
        -shared "-Wl,-soname,$(basename "$OUTPUT")"
        "-L${llvm_path}/include/../lib/linux" -lclang_rt.builtins-x86_64
        /opt/dtk/hip/lib/libgalaxyhip.so
        "${llvm_path}/lib/linux/libclang_rt.builtins-x86_64.a"
        /opt/hyhal/lib/libhsa-runtime64.so
        "-L${PYTHON_PLATLIB}/torch/lib" -L/opt/dtk/lib -L/opt/dtk/hip/lib
        -L/usr/local/lib
        -lc10 -ltorch -ltorch_cpu -ltorch_python -lamdhip64 -lc10_hip
        -ltorch_hip -lrocm-core -lrocm_smi64
        ${SHMEM_LINK_OPTIONS}
        -fgpu-rdc --hip-link -lamdhip64 -lhsa-runtime64 -l:libmpi.so
        -Wl,-rpath,/opt/mpi/lib/ -libverbs -lmlx5
    )
    "${link_cmd[@]}"
}

if [[ "$need_link" == true ]]; then
    echo "Linking -> $OUTPUT"
    # Only claim success when the linker actually succeeded.
    if ! link_extension; then
        echo "Error: failed to link $OUTPUT" >&2
        exit 1
    fi
    echo "Successfully built $OUTPUT"
else
    echo "Skipping linking ($OUTPUT is up to date)"
fi
lijian6's avatar
lijian6 committed
206
207
208
209

# build whl
echo "Using Python: $(command -v python3)"
python3 --version

#######################################
# Build a wheel with setup.py for the given SHMEM backend.
# Uses python3, the same interpreter that is reported above and that the
# script queries for its include and site-packages paths.
# Arguments: $1 - value for setup.py's --shmem option (nv or rocm)
# Returns:   exit status of setup.py
#######################################
build_wheel() {
    python3 setup.py bdist_wheel --shmem="$1"
}

if [[ "$USE_NVSHMEM" == "ON" ]]; then
    build_wheel nv || {
        echo "Error: wheel build failed (--shmem=nv)" >&2
        exit 1
    }
fi
if [[ "$USE_ROCSHMEM" == "ON" ]]; then
    build_wheel rocm || {
        echo "Error: wheel build failed (--shmem=rocm)" >&2
        exit 1
    }
fi

echo "✅ Build complete:"
ls -lh dist/