Commit 43596386 authored by Po Yen Chen's avatar Po Yen Chen
Browse files

Merge branch 'feature/add-splitkv-instance' into...

Merge branch 'feature/add-splitkv-instance' into feature/support-vllm-kcache-layout-add-splitkv-instance
parents 250399cd af07d650
We'd love for you to contribute to our source code!
Some helpful links:
- [Code of Conduct guidelines](https://www.contributor-covenant.org/version/2/1/code_of_conduct/code_of_conduct.txt)
- [New issue guidelines](https://github.com/rocm/composable_kernel/blob/develop/.github/ISSUE_TEMPLATE.md)
- [Submitting a pull request guidelines](https://github.com/rocm/composable_kernel/blob/develop/.github/PULL_REQUEST_TEMPLATE.md)
- [Maintainers](https://github.com/rocm/composable_kernel/blob/develop/CONTRIBUTORS.md)
- [General information](https://github.com/rocm/composable_kernel/blob/develop/README.md)
- [ROCm documentation](https://rocm.docs.amd.com/en/latest/how-to/llm-fine-tuning-optimization/optimizing-with-composable-kernel.html)
\ No newline at end of file
When creating an issue, please check if a similar issue already exists.
### When reporting a bug, please include:
- [ ] A descriptive title
- [ ] An isolated way to reproduce the behavior (preferably a docker container with a repro)
- [ ] ROCm version, clang version, Composable Kernel commit pin
- [ ] Environment variables
- [ ] The behavior you expect to see, and the behavior you actually see
### When requesting a feature, please include:
- [ ] A descriptive title
- [ ] A detailed description of the problem you are trying to solve
- [ ] An overview of the suggested solution
- [ ] An explanation of why the solution is an improvement
\ No newline at end of file
## Proposed changes
Please describe the motivation behind the pull request, whether it enables a new feature or fixes a bug. If there are associated pull requests or issues, please link them to the pull request.
## Checklist
Please put an `x` into the boxes that apply. You can also fill these out after creating the PR. If you're not sure, please don't hesitate to ask.
- [ ] I have added tests relevant to the introduced functionality, and the unit tests are passing locally
- [ ] I have added inline documentation that helps the maintainers understand the motivation
- [ ] I have removed the stale documentation which is no longer relevant after this pull request
- [ ] (If this change is user-facing) I have added release notes which provide the end users with a brief summary of the improvement from this pull request
- [ ] I have run `clang-format` on all changed files
- [ ] Any dependent changes have been merged
## Discussion
If this is a relatively large or complex change, feel free to start a discussion by explaining why you chose the solution you did and what alternatives you considered.
...@@ -185,13 +185,22 @@ if (SUPPORTED_GPU_TARGETS MATCHES "gfx9") ...@@ -185,13 +185,22 @@ if (SUPPORTED_GPU_TARGETS MATCHES "gfx9")
add_definitions(-DCK_USE_XDL) add_definitions(-DCK_USE_XDL)
endif() endif()
if (SUPPORTED_GPU_TARGETS MATCHES "gfx94") if (SUPPORTED_GPU_TARGETS MATCHES "gfx94")
message("Enabling FP8 gemms in ckProfiler") message("Enabling FP8 gemms on native architectures")
add_definitions(-DCK_USE_GFX94) add_definitions(-DCK_USE_GFX94)
endif() endif()
if (SUPPORTED_GPU_TARGETS MATCHES "gfx11" OR SUPPORTED_GPU_TARGETS MATCHES "gfx12") if (SUPPORTED_GPU_TARGETS MATCHES "gfx11" OR SUPPORTED_GPU_TARGETS MATCHES "gfx12")
message("Enabling WMMA instances") message("Enabling WMMA instances")
add_definitions(-DCK_USE_WMMA) add_definitions(-DCK_USE_WMMA)
endif() endif()
if (SUPPORTED_GPU_TARGETS MATCHES "gfx12")
add_definitions(-DCK_USE_OCP_FP8)
set(CK_USE_OCP_FP8 "ON")
endif()
if (SUPPORTED_GPU_TARGETS MATCHES "gfx90a" OR SUPPORTED_GPU_TARGETS MATCHES "gfx94")
add_definitions(-DCK_USE_FNUZ_FP8)
set(CK_USE_FNUZ_FP8 "ON")
endif()
option(CK_USE_FP8_ON_UNSUPPORTED_ARCH "Enable FP8 GEMM instances on older architectures" OFF) option(CK_USE_FP8_ON_UNSUPPORTED_ARCH "Enable FP8 GEMM instances on older architectures" OFF)
if(CK_USE_FP8_ON_UNSUPPORTED_ARCH AND (SUPPORTED_GPU_TARGETS MATCHES "gfx90a" OR SUPPORTED_GPU_TARGETS MATCHES "gfx908")) if(CK_USE_FP8_ON_UNSUPPORTED_ARCH AND (SUPPORTED_GPU_TARGETS MATCHES "gfx90a" OR SUPPORTED_GPU_TARGETS MATCHES "gfx908"))
add_definitions(-DCK_USE_FP8_ON_UNSUPPORTED_ARCH) add_definitions(-DCK_USE_FP8_ON_UNSUPPORTED_ARCH)
......
[Back to the main page](./README.md)
# Composable Kernel Developers and Contributors # Composable Kernel Developers and Contributors
This is the list of developers and contributors to Composable Kernel library This is the list of developers and contributors to Composable Kernel library
......
FROM ubuntu:20.04 FROM ubuntu:22.04
ARG DEBIAN_FRONTEND=noninteractive ARG DEBIAN_FRONTEND=noninteractive
ARG ROCMVERSION=6.2 ARG ROCMVERSION=6.3
ARG compiler_version="" ARG compiler_version=""
ARG compiler_commit="" ARG compiler_commit=""
ARG CK_SCCACHE="" ARG CK_SCCACHE=""
...@@ -13,17 +13,12 @@ RUN set -xe && \ ...@@ -13,17 +13,12 @@ RUN set -xe && \
apt-get update && apt-get install -y --allow-unauthenticated apt-utils wget gnupg2 curl && \ apt-get update && apt-get install -y --allow-unauthenticated apt-utils wget gnupg2 curl && \
curl -fsSL https://repo.radeon.com/rocm/rocm.gpg.key | gpg --dearmor -o /etc/apt/trusted.gpg.d/rocm-keyring.gpg curl -fsSL https://repo.radeon.com/rocm/rocm.gpg.key | gpg --dearmor -o /etc/apt/trusted.gpg.d/rocm-keyring.gpg
RUN if [ "$ROCMVERSION" != "6.3" ]; then \ RUN if [ "$ROCMVERSION" != "6.4" ]; then \
sh -c "wget https://repo.radeon.com/amdgpu-install/$ROCMVERSION/ubuntu/focal/amdgpu-install_6.2.60200-1_all.deb --no-check-certificate" && \ sh -c "wget https://repo.radeon.com/amdgpu-install/$ROCMVERSION/ubuntu/focal/amdgpu-install_6.3.60300-1_all.deb --no-check-certificate" && \
apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --allow-unauthenticated ./amdgpu-install_6.2.60200-1_all.deb && \ apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --allow-unauthenticated ./amdgpu-install_6.3.60300-1_all.deb && \
wget -qO - http://repo.radeon.com/rocm/rocm.gpg.key | apt-key add - && \ wget -qO - http://repo.radeon.com/rocm/rocm.gpg.key | apt-key add - && \
sh -c "echo deb [arch=amd64 signed-by=/etc/apt/trusted.gpg.d/rocm-keyring.gpg] $DEB_ROCM_REPO focal main > /etc/apt/sources.list.d/rocm.list" && \ sh -c "echo deb [arch=amd64 signed-by=/etc/apt/trusted.gpg.d/rocm-keyring.gpg] $DEB_ROCM_REPO focal main > /etc/apt/sources.list.d/rocm.list" && \
sh -c 'echo deb [arch=amd64 signed-by=/etc/apt/trusted.gpg.d/rocm-keyring.gpg] https://repo.radeon.com/amdgpu/$ROCMVERSION/ubuntu focal main > /etc/apt/sources.list.d/amdgpu.list'; \ sh -c 'echo deb [arch=amd64 signed-by=/etc/apt/trusted.gpg.d/rocm-keyring.gpg] https://repo.radeon.com/amdgpu/$ROCMVERSION/ubuntu focal main > /etc/apt/sources.list.d/amdgpu.list'; \
elif [ "$ROCMVERSION" = "6.3" ] && [ "$compiler_version" = "rc1" ]; then \
sh -c "wget http://artifactory-cdn.amd.com/artifactory/list/amdgpu-deb/amdgpu-install-internal_6.3-20.04-1_all.deb --no-check-certificate" && \
apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install dialog libpopt0 rsync && DEBIAN_FRONTEND=noninteractive apt-get install ./amdgpu-install-internal_6.3-20.04-1_all.deb && \
sh -c 'echo deb [arch=amd64 trusted=yes] http://compute-artifactory.amd.com/artifactory/list/rocm-release-archive-20.04-deb/ 6.3 rel-20 > /etc/apt/sources.list.d/rocm-build.list' && \
amdgpu-repo --amdgpu-build=2074281; \
fi fi
RUN sh -c "echo deb http://mirrors.kernel.org/ubuntu focal main universe | tee -a /etc/apt/sources.list" && \ RUN sh -c "echo deb http://mirrors.kernel.org/ubuntu focal main universe | tee -a /etc/apt/sources.list" && \
...@@ -53,6 +48,7 @@ RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --allow- ...@@ -53,6 +48,7 @@ RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --allow-
libnuma-dev \ libnuma-dev \
libpthread-stubs0-dev \ libpthread-stubs0-dev \
llvm-amdgpu \ llvm-amdgpu \
mpich \
net-tools \ net-tools \
pkg-config \ pkg-config \
python \ python \
...@@ -68,6 +64,7 @@ RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --allow- ...@@ -68,6 +64,7 @@ RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --allow-
nano \ nano \
zlib1g-dev \ zlib1g-dev \
zip \ zip \
libzstd-dev \
openssh-server \ openssh-server \
clang-format-12 \ clang-format-12 \
kmod && \ kmod && \
...@@ -75,7 +72,7 @@ RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --allow- ...@@ -75,7 +72,7 @@ RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --allow-
rm -rf /var/lib/apt/lists/* && \ rm -rf /var/lib/apt/lists/* && \
rm -rf amdgpu-install* && \ rm -rf amdgpu-install* && \
# Remove unnecessary rocm components that take a lot of space # Remove unnecessary rocm components that take a lot of space
apt-get remove -y rocblas rocfft rocsparse composablekernel-dev apt-get remove -y rocblas rocfft rocsparse composablekernel-dev hipblaslt
# Update the cmake to version 3.27.5 # Update the cmake to version 3.27.5
RUN pip install --upgrade cmake==3.27.5 && \ RUN pip install --upgrade cmake==3.27.5 && \
...@@ -97,7 +94,7 @@ RUN pip install --upgrade cmake==3.27.5 && \ ...@@ -97,7 +94,7 @@ RUN pip install --upgrade cmake==3.27.5 && \
dpkg -i dumb-init_*.deb && rm dumb-init_*.deb && \ dpkg -i dumb-init_*.deb && rm dumb-init_*.deb && \
# Install packages for processing the performance results # Install packages for processing the performance results
pip3 install --upgrade pip && \ pip3 install --upgrade pip && \
pip3 install sqlalchemy==1.4.46 pymysql pandas==2.0.3 setuptools-rust sshtunnel==0.4.0 && \ pip3 install sqlalchemy==2.0.36 pymysql pandas==2.2.3 setuptools-rust sshtunnel==0.4.0 && \
# Add render group # Add render group
groupadd -f render && \ groupadd -f render && \
# Install the new rocm-cmake version # Install the new rocm-cmake version
......
ARG BASE_DOCKER="rocm/composable_kernel:ck_ub20.04_rocm6.2" ARG BASE_DOCKER="rocm/composable_kernel:ck_ub22.04_rocm6.3"
FROM $BASE_DOCKER FROM $BASE_DOCKER
ARG compiler_version="" ARG compiler_version=""
ARG compiler_commit="" ARG compiler_commit=""
......
...@@ -38,13 +38,14 @@ def getBaseDockerImageName(){ ...@@ -38,13 +38,14 @@ def getBaseDockerImageName(){
img = "${params.USE_CUSTOM_DOCKER}" img = "${params.USE_CUSTOM_DOCKER}"
} }
else{ else{
if (params.ROCMVERSION != "6.3"){ def ROCM_numeric = "${params.ROCMVERSION}" as float
img = "${env.CK_DOCKERHUB}:ck_ub20.04_rocm${params.ROCMVERSION}" if ( ROCM_numeric < 6.4 ){
} img = "${env.CK_DOCKERHUB}:ck_ub22.04_rocm${params.ROCMVERSION}"
else{ }
img = "${env.CK_DOCKERHUB_PRIVATE}:ck_ub20.04_rocm${params.ROCMVERSION}" else{
img = "${env.CK_DOCKERHUB_PRIVATE}:ck_ub22.04_rocm${params.ROCMVERSION}"
}
} }
}
return img return img
} }
...@@ -329,10 +330,8 @@ def cmake_build(Map conf=[:]){ ...@@ -329,10 +330,8 @@ def cmake_build(Map conf=[:]){
try{ try{
archiveArtifacts "perf_fmha_fwd_*.log" archiveArtifacts "perf_fmha_fwd_*.log"
archiveArtifacts "perf_fmha_bwd_*.log" archiveArtifacts "perf_fmha_bwd_*.log"
stash name: "perf_fmha_fwd_gfx942.log" stash includes: "perf_fmha_**_gfx942.log", name: "perf_fmha_log_gfx942"
stash name: "perf_fmha_bwd_gfx942.log" stash includes: "perf_fmha_**_gfx90a.log", name: "perf_fmha_log_gfx90a"
stash name: "perf_fmha_fwd_gfx90a.log"
stash name: "perf_fmha_bwd_gfx90a.log"
} }
catch(Exception err){ catch(Exception err){
echo "could not locate the requested artifacts: ${err.getMessage()}. will skip the stashing." echo "could not locate the requested artifacts: ${err.getMessage()}. will skip the stashing."
...@@ -358,7 +357,7 @@ def buildHipClangJob(Map conf=[:]){ ...@@ -358,7 +357,7 @@ def buildHipClangJob(Map conf=[:]){
def prefixpath = conf.get("prefixpath", "/opt/rocm") def prefixpath = conf.get("prefixpath", "/opt/rocm")
// Jenkins is complaining about the render group // Jenkins is complaining about the render group
def dockerOpts="--device=/dev/kfd --device=/dev/dri --group-add video --group-add render --cap-add=SYS_PTRACE --security-opt seccomp=unconfined" def dockerOpts="-u root --device=/dev/kfd --device=/dev/dri --group-add video --group-add render --cap-add=SYS_PTRACE --security-opt seccomp=unconfined"
if (conf.get("enforce_xnack_on", false)) { if (conf.get("enforce_xnack_on", false)) {
dockerOpts = dockerOpts + " --env HSA_XNACK=1 " dockerOpts = dockerOpts + " --env HSA_XNACK=1 "
} }
...@@ -378,7 +377,7 @@ def buildHipClangJob(Map conf=[:]){ ...@@ -378,7 +377,7 @@ def buildHipClangJob(Map conf=[:]){
gitStatusWrapper(credentialsId: "${env.ck_git_creds}", gitHubContext: "Jenkins - ${variant}", account: 'ROCm', repo: 'composable_kernel') { gitStatusWrapper(credentialsId: "${env.ck_git_creds}", gitHubContext: "Jenkins - ${variant}", account: 'ROCm', repo: 'composable_kernel') {
withDockerContainer(image: image, args: dockerOpts + ' -v=/var/jenkins/:/var/jenkins') { withDockerContainer(image: image, args: dockerOpts + ' -v=/var/jenkins/:/var/jenkins') {
timeout(time: 48, unit: 'HOURS') timeout(time: 20, unit: 'HOURS')
{ {
cmake_build(conf) cmake_build(conf)
} }
...@@ -407,128 +406,6 @@ def buildHipClangJobAndReboot(Map conf=[:]){ ...@@ -407,128 +406,6 @@ def buildHipClangJobAndReboot(Map conf=[:]){
} }
} }
// Runs the ckProfiler performance tests inside a docker container on the current node.
//
// conf keys read here:
//   prefixpath       - install prefix passed to the docker build args (default "/opt/rocm")
//   enforce_xnack_on - when true, adds "--env HSA_XNACK=1" to the docker options
// conf is also forwarded unchanged to getDockerImage().
//
// Returns the first element of the pair returned by getDockerImage(conf).
def runCKProfiler(Map conf=[:]){
show_node_info()
env.HSA_ENABLE_SDMA=0
checkout scm
// Initial image name; it is overwritten below by the second value returned from getDockerImage(conf).
def image = getDockerImageName()
def prefixpath = conf.get("prefixpath", "/opt/rocm")
// Jenkins is complaining about the render group
def dockerOpts="--device=/dev/kfd --device=/dev/dri --group-add video --group-add render --cap-add=SYS_PTRACE --security-opt seccomp=unconfined"
if (conf.get("enforce_xnack_on", false)) {
dockerOpts = dockerOpts + " --env HSA_XNACK=1 "
}
// Look up the numeric ids of the host's video and render groups and add them to the container as well.
// NOTE(review): the sh output is not trimmed, so a trailing newline may end up inside dockerOpts — confirm this is harmless.
def video_id = sh(returnStdout: true, script: 'getent group video | cut -d: -f3')
def render_id = sh(returnStdout: true, script: 'getent group render | cut -d: -f3')
dockerOpts = dockerOpts + " --group-add=${video_id} --group-add=${render_id} "
echo "Docker flags: ${dockerOpts}"
// NOTE(review): dockerArgs is built here but not referenced again within this function.
def dockerArgs = "--build-arg PREFIX=${prefixpath} --build-arg compiler_version='${params.COMPILER_VERSION}' --build-arg compiler_commit='${params.COMPILER_COMMIT}' --build-arg ROCMVERSION='${params.ROCMVERSION}' "
// The current stage name is used as part of the GitHub status context.
def variant = env.STAGE_NAME
def retimage
gitStatusWrapper(credentialsId: "${env.ck_git_creds}", gitHubContext: "Jenkins - ${variant}", account: 'ROCm', repo: 'composable_kernel') {
try {
(retimage, image) = getDockerImage(conf)
// Sanity check: rocminfo output must mention a "gfx" device within 5 minutes, otherwise fail.
withDockerContainer(image: image, args: dockerOpts) {
timeout(time: 5, unit: 'MINUTES'){
sh 'rocminfo | tee rocminfo.log'
if ( !runShell('grep -n "gfx" rocminfo.log') ){
throw new Exception ("GPU not found")
}
else{
echo "GPU is OK"
}
}
}
}
// Only interruptions are caught here; they are logged and re-thrown.
catch (org.jenkinsci.plugins.workflow.steps.FlowInterruptedException e){
echo "The job was cancelled or aborted"
throw e
}
// Main run: same docker options plus a /var/jenkins bind mount, limited to 24 hours.
withDockerContainer(image: image, args: dockerOpts + ' -v=/var/jenkins/:/var/jenkins') {
timeout(time: 24, unit: 'HOURS')
{
sh """
rm -rf build
mkdir build
"""
// Restore the ckProfiler archive stashed under the name 'ckProfiler.tar.gz' and unpack it into build/.
dir("build"){
unstash 'ckProfiler.tar.gz'
sh 'tar -xvf ckProfiler.tar.gz'
}
dir("script"){
if (params.RUN_FULL_QA){
// Full QA mode: run the full performance script, then archive and stash every log it is expected to produce.
sh "./run_full_performance_tests.sh 0 QA_${params.COMPILER_VERSION} ${env.BRANCH_NAME} ${NODE_NAME}"
archiveArtifacts "perf_gemm.log"
archiveArtifacts "perf_resnet50_N256.log"
archiveArtifacts "perf_resnet50_N4.log"
archiveArtifacts "perf_batched_gemm.log"
archiveArtifacts "perf_grouped_gemm.log"
archiveArtifacts "perf_grouped_conv_fwd.log"
archiveArtifacts "perf_grouped_conv_bwd_data.log"
archiveArtifacts "perf_grouped_conv_bwd_weight.log"
archiveArtifacts "perf_gemm_bilinear.log"
archiveArtifacts "perf_reduction.log"
archiveArtifacts "perf_splitK_gemm.log"
archiveArtifacts "perf_onnx_gemm.log"
archiveArtifacts "perf_mixed_gemm.log"
// stash perf files to master
// NOTE(review): these stash calls pass only `name:` with no `includes:` filter; presumably each one
// stashes every file in the current directory rather than just the named log — verify against the stash step docs.
stash name: "perf_gemm.log"
stash name: "perf_resnet50_N256.log"
stash name: "perf_resnet50_N4.log"
stash name: "perf_batched_gemm.log"
stash name: "perf_grouped_gemm.log"
stash name: "perf_grouped_conv_fwd.log"
stash name: "perf_grouped_conv_bwd_data.log"
stash name: "perf_grouped_conv_bwd_weight.log"
stash name: "perf_gemm_bilinear.log"
stash name: "perf_reduction.log"
stash name: "perf_splitK_gemm.log"
stash name: "perf_onnx_gemm.log"
stash name: "perf_mixed_gemm.log"
//we will process results on the master node
}
else{
// Regular CI mode: run the standard performance script and keep only the three logs listed below.
sh "./run_performance_tests.sh 0 CI_${params.COMPILER_VERSION} ${env.BRANCH_NAME} ${NODE_NAME}"
archiveArtifacts "perf_gemm.log"
archiveArtifacts "perf_resnet50_N256.log"
archiveArtifacts "perf_resnet50_N4.log"
// stash perf files to master
stash name: "perf_gemm.log"
stash name: "perf_resnet50_N256.log"
stash name: "perf_resnet50_N4.log"
//we will process the results on the master node
}
}
}
}
}
return retimage
}
// Wrapper around runCKProfiler(conf): logs and re-throws any failure, and
// afterwards calls reboot() unless conf["no_reboot"] is set to a true value.
def runPerfTest(Map conf=[:]){
    try {
        runCKProfiler(conf)
    }
    catch (failure) {
        // Report the failure in the build log, then propagate it to the caller.
        echo "throwing error exception in performance tests"
        echo 'Exception occurred: ' + failure.toString()
        throw failure
    }
    finally {
        // Runs on both success and failure.
        def rebootWanted = !conf.get("no_reboot", false)
        if (rebootWanted) {
            reboot()
        }
    }
}
def Build_CK(Map conf=[:]){ def Build_CK(Map conf=[:]){
show_node_info() show_node_info()
...@@ -549,7 +426,7 @@ def Build_CK(Map conf=[:]){ ...@@ -549,7 +426,7 @@ def Build_CK(Map conf=[:]){
def prefixpath = conf.get("prefixpath", "/opt/rocm") def prefixpath = conf.get("prefixpath", "/opt/rocm")
// Jenkins is complaining about the render group // Jenkins is complaining about the render group
def dockerOpts="--device=/dev/kfd --device=/dev/dri --group-add video --group-add render --cap-add=SYS_PTRACE --security-opt seccomp=unconfined" def dockerOpts="-u root --device=/dev/kfd --device=/dev/dri --group-add video --group-add render --cap-add=SYS_PTRACE --security-opt seccomp=unconfined"
if (conf.get("enforce_xnack_on", false)) { if (conf.get("enforce_xnack_on", false)) {
dockerOpts = dockerOpts + " --env HSA_XNACK=1 " dockerOpts = dockerOpts + " --env HSA_XNACK=1 "
} }
...@@ -572,7 +449,7 @@ def Build_CK(Map conf=[:]){ ...@@ -572,7 +449,7 @@ def Build_CK(Map conf=[:]){
try { try {
(retimage, image) = getDockerImage(conf) (retimage, image) = getDockerImage(conf)
withDockerContainer(image: image, args: dockerOpts) { withDockerContainer(image: image, args: dockerOpts) {
timeout(time: 5, unit: 'MINUTES'){ timeout(time: 2, unit: 'MINUTES'){
sh 'rocminfo | tee rocminfo.log' sh 'rocminfo | tee rocminfo.log'
if ( !runShell('grep -n "gfx" rocminfo.log') ){ if ( !runShell('grep -n "gfx" rocminfo.log') ){
throw new Exception ("GPU not found") throw new Exception ("GPU not found")
...@@ -588,36 +465,95 @@ def Build_CK(Map conf=[:]){ ...@@ -588,36 +465,95 @@ def Build_CK(Map conf=[:]){
throw e throw e
} }
withDockerContainer(image: image, args: dockerOpts + ' -v=/var/jenkins/:/var/jenkins') { withDockerContainer(image: image, args: dockerOpts + ' -v=/var/jenkins/:/var/jenkins') {
timeout(time: 24, unit: 'HOURS') timeout(time: 20, unit: 'HOURS')
{ {
//check whether to run performance tests on this node //check whether to run performance tests on this node
def do_perf_tests = 0 def arch_type = 0
sh 'rocminfo | tee rocminfo.log' sh 'rocminfo | tee rocminfo.log'
if ( runShell('grep -n "gfx1030" rocminfo.log') || runShell('grep -n "gfx1101" rocminfo.log') || runShell('grep -n "gfx1201" rocminfo.log') || runShell('grep -n "gfx942" rocminfo.log') ){ if ( runShell('grep -n "gfx90a" rocminfo.log') ){
do_perf_tests = 1 arch_type = 1
echo "Stash profiler and run performance tests" }
else if ( runShell('grep -n "gfx942" rocminfo.log') ) {
arch_type = 2
}
else if ( runShell('grep -n "gfx1030" rocminfo.log') ) {
arch_type = 3
}
else if ( runShell('grep -n "gfx1101" rocminfo.log') ) {
arch_type = 4
}
else if ( runShell('grep -n "gfx1201" rocminfo.log') ) {
arch_type = 5
} }
cmake_build(conf) cmake_build(conf)
dir("build"){ dir("build"){
//run tests and examples if (params.RUN_FULL_QA && arch_type == 1 ){
//sh 'make -j check' // build deb packages for all gfx9 targets on gfx90a system and prepare to export
if (params.RUN_PERFORMANCE_TESTS && do_perf_tests == 0 ){ echo "Build ckProfiler package"
//we only need the ckProfiler to run the performance tests, so we pack and stash it
//do not stash profiler on nodes where we don't need to run performance tests
sh 'tar -zcvf ckProfiler.tar.gz bin/ckProfiler'
stash name: "ckProfiler.tar.gz"
}
if (params.RUN_FULL_QA && do_perf_tests == 0 ){
// build deb packages for all gfx9 targets and prepare to export
sh 'make -j package' sh 'make -j package'
archiveArtifacts artifacts: 'composablekernel-ckprofiler_*.deb' archiveArtifacts artifacts: 'composablekernel-ckprofiler_*.deb'
archiveArtifacts artifacts: 'composablekernel-tests_*.deb'
sh 'mv composablekernel-ckprofiler_*.deb ckprofiler_0.2.0_amd64.deb' sh 'mv composablekernel-ckprofiler_*.deb ckprofiler_0.2.0_amd64.deb'
stash name: "ckprofiler_0.2.0_amd64.deb" stash includes: "ckprofiler_0.2.0_amd64.deb", name: "ckprofiler_0.2.0_amd64.deb"
}
}
// run performance tests, stash the logs, results will be processed on the master node
dir("script"){
if (params.RUN_PERFORMANCE_TESTS){
if (params.RUN_FULL_QA && arch_type == 1){
// run full tests on gfx90a
echo "Run full performance tests"
sh "./run_full_performance_tests.sh 0 QA_${params.COMPILER_VERSION} ${env.BRANCH_NAME} ${NODE_NAME}"
archiveArtifacts "perf_gemm.log"
archiveArtifacts "perf_resnet50_N256.log"
archiveArtifacts "perf_resnet50_N4.log"
archiveArtifacts "perf_batched_gemm.log"
archiveArtifacts "perf_grouped_gemm.log"
archiveArtifacts "perf_grouped_conv_fwd.log"
archiveArtifacts "perf_grouped_conv_bwd_data.log"
archiveArtifacts "perf_grouped_conv_bwd_weight.log"
archiveArtifacts "perf_gemm_bilinear.log"
archiveArtifacts "perf_reduction.log"
archiveArtifacts "perf_splitK_gemm.log"
archiveArtifacts "perf_onnx_gemm.log"
archiveArtifacts "perf_mixed_gemm.log"
stash includes: "perf_**.log", name: "perf_log"
}
else if ( arch_type == 1 ){
// run standard tests on gfx90a
echo "Run performance tests"
sh "./run_performance_tests.sh 0 CI_${params.COMPILER_VERSION} ${env.BRANCH_NAME} ${NODE_NAME}"
archiveArtifacts "perf_gemm.log"
archiveArtifacts "perf_onnx_gemm.log"
archiveArtifacts "perf_resnet50_N256.log"
archiveArtifacts "perf_resnet50_N4.log"
stash includes: "perf_**.log", name: "perf_log"
}
// disable performance tests on gfx1030 for now.
//else if ( arch_type == 3){
// run basic tests on gfx1030
// echo "Run gemm performance tests"
// sh "./run_gemm_performance_tests.sh 0 CI_${params.COMPILER_VERSION} ${env.BRANCH_NAME} ${NODE_NAME} gfx10"
// archiveArtifacts "perf_onnx_gemm_gfx10.log"
// stash includes: "perf_onnx_gemm_gfx10.log", name: "perf_log_gfx10"
//}
else if ( arch_type == 4){
// run basic tests on gfx11
echo "Run gemm performance tests"
sh "./run_gemm_performance_tests.sh 0 CI_${params.COMPILER_VERSION} ${env.BRANCH_NAME} ${NODE_NAME} gfx11"
archiveArtifacts "perf_onnx_gemm_gfx11.log"
stash includes: "perf_onnx_gemm_gfx11.log", name: "perf_log_gfx11"
}
else if ( arch_type == 5 ){
// run basic tests on gfx12
echo "Run gemm performance tests"
sh "./run_gemm_performance_tests.sh 0 CI_${params.COMPILER_VERSION} ${env.BRANCH_NAME} ${NODE_NAME} gfx12"
archiveArtifacts "perf_onnx_gemm_gfx12.log"
stash includes: "perf_onnx_gemm_gfx12.log", name: "perf_log_gfx12"
}
} }
} }
if (params.hipTensor_test && do_perf_tests == 0 ){ if (params.hipTensor_test && arch_type == 1 ){
//build and test hipTensor // build and test hipTensor on gfx90a node
sh """#!/bin/bash sh """#!/bin/bash
rm -rf "${params.hipTensor_branch}".zip rm -rf "${params.hipTensor_branch}".zip
rm -rf hipTensor-"${params.hipTensor_branch}" rm -rf hipTensor-"${params.hipTensor_branch}"
...@@ -630,11 +566,9 @@ def Build_CK(Map conf=[:]){ ...@@ -630,11 +566,9 @@ def Build_CK(Map conf=[:]){
ls -ltr ls -ltr
CC=hipcc CXX=hipcc cmake -Bbuild . -D CMAKE_PREFIX_PATH="${env.WORKSPACE}/install" CC=hipcc CXX=hipcc cmake -Bbuild . -D CMAKE_PREFIX_PATH="${env.WORKSPACE}/install"
cmake --build build -- -j cmake --build build -- -j
ctest --test-dir build
""" """
} }
dir("hipTensor-${params.hipTensor_branch}/build"){
sh 'ctest'
}
} }
} }
} }
...@@ -684,15 +618,13 @@ def process_results(Map conf=[:]){ ...@@ -684,15 +618,13 @@ def process_results(Map conf=[:]){
} }
withDockerContainer(image: image, args: dockerOpts + ' -v=/var/jenkins/:/var/jenkins') { withDockerContainer(image: image, args: dockerOpts + ' -v=/var/jenkins/:/var/jenkins') {
timeout(time: 1, unit: 'HOURS'){ timeout(time: 15, unit: 'MINUTES'){
try{ try{
dir("script"){ dir("script"){
if (params.RUN_CK_TILE_FMHA_TESTS){ if (params.RUN_CK_TILE_FMHA_TESTS){
try{ try{
unstash "perf_fmha_fwd_gfx942.log" unstash "perf_fmha_log_gfx942"
unstash "perf_fmha_bwd_gfx942.log" unstash "perf_fmha_log_gfx90a"
unstash "perf_fmha_fwd_gfx90a.log"
unstash "perf_fmha_bwd_gfx90a.log"
} }
catch(Exception err){ catch(Exception err){
echo "could not locate the FMHA performance logs: ${err.getMessage()}." echo "could not locate the FMHA performance logs: ${err.getMessage()}."
...@@ -702,26 +634,26 @@ def process_results(Map conf=[:]){ ...@@ -702,26 +634,26 @@ def process_results(Map conf=[:]){
// unstash perf files to master // unstash perf files to master
unstash "ckprofiler_0.2.0_amd64.deb" unstash "ckprofiler_0.2.0_amd64.deb"
sh "sshpass -p ${env.ck_deb_pw} scp -o StrictHostKeyChecking=no ckprofiler_0.2.0_amd64.deb ${env.ck_deb_user}@${env.ck_deb_ip}:/var/www/html/composable_kernel/" sh "sshpass -p ${env.ck_deb_pw} scp -o StrictHostKeyChecking=no ckprofiler_0.2.0_amd64.deb ${env.ck_deb_user}@${env.ck_deb_ip}:/var/www/html/composable_kernel/"
unstash "perf_gemm.log" unstash "perf_log"
unstash "perf_resnet50_N256.log" try{
unstash "perf_resnet50_N4.log" unstash "perf_log_gfx11"
unstash "perf_batched_gemm.log" unstash "perf_log_gfx12"
unstash "perf_grouped_gemm.log" }
unstash "perf_grouped_conv_fwd.log" catch(Exception err){
unstash "perf_grouped_conv_bwd_data.log" echo "could not locate the GEMM gfx11/gfx12 performance logs: ${err.getMessage()}."
unstash "perf_grouped_conv_bwd_weight.log" }
unstash "perf_gemm_bilinear.log"
unstash "perf_reduction.log"
unstash "perf_splitK_gemm.log"
unstash "perf_onnx_gemm.log"
unstash "perf_mixed_gemm.log"
sh "./process_qa_data.sh" sh "./process_qa_data.sh"
} }
else{ else{
// unstash perf files to master // unstash perf files to master
unstash "perf_gemm.log" unstash "perf_log"
unstash "perf_resnet50_N256.log" try{
unstash "perf_resnet50_N4.log" unstash "perf_log_gfx11"
unstash "perf_log_gfx12"
}
catch(Exception err){
echo "could not locate the GEMM gfx11/gfx12 performance logs: ${err.getMessage()}."
}
sh "./process_perf_data.sh" sh "./process_perf_data.sh"
} }
} }
...@@ -739,10 +671,10 @@ def process_results(Map conf=[:]){ ...@@ -739,10 +671,10 @@ def process_results(Map conf=[:]){
} }
//launch develop branch daily at 23:00 UT in FULL_QA mode and at 19:00 UT with latest staging compiler version //launch develop branch daily at 23:00 UT in FULL_QA mode and at 19:00 UT with latest staging compiler version
CRON_SETTINGS = BRANCH_NAME == "develop" ? '''0 23 * * * % RUN_FULL_QA=true;ROCMVERSION=6.2;RUN_CK_TILE_FMHA_TESTS=true;RUN_CK_TILE_GEMM_TESTS=true CRON_SETTINGS = BRANCH_NAME == "develop" ? '''0 23 * * * % RUN_FULL_QA=true;ROCMVERSION=6.3;RUN_CK_TILE_FMHA_TESTS=true;RUN_CK_TILE_GEMM_TESTS=true
0 21 * * * % ROCMVERSION=6.2;hipTensor_test=true;RUN_CODEGEN_TESTS=true 0 21 * * * % ROCMVERSION=6.3;hipTensor_test=true;RUN_CODEGEN_TESTS=true
0 19 * * * % BUILD_DOCKER=true;DL_KERNELS=true;COMPILER_VERSION=amd-staging;BUILD_COMPILER=/llvm-project/build/bin/clang++;BUILD_GFX12=true;USE_SCCACHE=false;NINJA_BUILD_TRACE=true 0 19 * * * % BUILD_DOCKER=true;DL_KERNELS=true;COMPILER_VERSION=amd-staging;BUILD_COMPILER=/llvm-project/build/bin/clang++;USE_SCCACHE=false;NINJA_BUILD_TRACE=true
0 17 * * * % BUILD_DOCKER=true;DL_KERNELS=true;COMPILER_VERSION=amd-mainline;BUILD_COMPILER=/llvm-project/build/bin/clang++;BUILD_GFX12=true;USE_SCCACHE=false;NINJA_BUILD_TRACE=true 0 17 * * * % BUILD_DOCKER=true;DL_KERNELS=true;COMPILER_VERSION=amd-mainline;BUILD_COMPILER=/llvm-project/build/bin/clang++;USE_SCCACHE=false;NINJA_BUILD_TRACE=true
0 15 * * * % BUILD_INSTANCES_ONLY=true;RUN_PERFORMANCE_TESTS=false;USE_SCCACHE=false 0 15 * * * % BUILD_INSTANCES_ONLY=true;RUN_PERFORMANCE_TESTS=false;USE_SCCACHE=false
0 13 * * * % BUILD_LEGACY_OS=true''' : "" 0 13 * * * % BUILD_LEGACY_OS=true''' : ""
...@@ -765,8 +697,8 @@ pipeline { ...@@ -765,8 +697,8 @@ pipeline {
description: 'If you want to use a custom docker image, please specify it here (default: leave blank).') description: 'If you want to use a custom docker image, please specify it here (default: leave blank).')
string( string(
name: 'ROCMVERSION', name: 'ROCMVERSION',
defaultValue: '6.2', defaultValue: '6.3',
description: 'Specify which ROCM version to use: 6.2 (default).') description: 'Specify which ROCM version to use: 6.3 (default).')
string( string(
name: 'COMPILER_VERSION', name: 'COMPILER_VERSION',
defaultValue: '', defaultValue: '',
...@@ -829,8 +761,8 @@ pipeline { ...@@ -829,8 +761,8 @@ pipeline {
description: "Test building instances for various architectures simultaneously (default: OFF)") description: "Test building instances for various architectures simultaneously (default: OFF)")
booleanParam( booleanParam(
name: "BUILD_GFX12", name: "BUILD_GFX12",
defaultValue: false, defaultValue: true,
description: "Build CK and run tests on gfx12 (default: OFF)") description: "Build CK and run tests on gfx12 (default: ON)")
booleanParam( booleanParam(
name: "NINJA_BUILD_TRACE", name: "NINJA_BUILD_TRACE",
defaultValue: false, defaultValue: false,
...@@ -1240,29 +1172,6 @@ pipeline { ...@@ -1240,29 +1172,6 @@ pipeline {
} }
} }
} }
stage("Performance Tests")
{
parallel
{
stage("Run ckProfiler: gfx90a")
{
when {
beforeAgent true
expression { params.RUN_PERFORMANCE_TESTS.toBoolean() && !params.BUILD_LEGACY_OS.toBoolean() }
}
options { retry(1) }
agent{ label rocmnode("gfx90a")}
environment{
setup_args = "NO_CK_BUILD"
}
steps{
runPerfTest(setup_args:setup_args, config_targets: "ckProfiler", no_reboot:true, build_type: 'Release')
cleanWs()
}
}
}
}
stage("Process Performance Test Results") stage("Process Performance Test Results")
{ {
parallel parallel
......
...@@ -26,23 +26,15 @@ The current CK library is structured into four layers: ...@@ -26,23 +26,15 @@ The current CK library is structured into four layers:
## General information ## General information
To build our documentation locally, use the following code: * [CK supported operations](include/ck/README.md)
* [CK Tile supported operations](include/ck_tile/README.md)
``` bash * [CK wrapper](client_example/25_wrapper/README.md)
cd docs * [CK codegen](codegen/README.md)
pip3 install -r sphinx/requirements.txt * [CK profiler](profiler/README.md)
python3 -m sphinx -T -E -b html -d _build/doctrees -D language=en . _build/html * [Examples (Custom use of CK supported operations)](example/README.md)
``` * [Client examples (Use of CK supported operations with instance factory)](client_example/README.md)
* [Terminology](/TERMINOLOGY.md)
You can find a list of our developers and contributors on our [Contributors](/CONTRIBUTORS.md) page. * [Contributors](/CONTRIBUTORS.md)
```note
If you use CK, cite us as follows:
* [Realizing Tensor Operators Using Coordinate Transformations and Tile Based Programming](???):
This paper will be available on arXiv soon.
* [CITATION.cff](/CITATION.cff)
```
CK is released under the **[MIT license](/LICENSE)**. CK is released under the **[MIT license](/LICENSE)**.
...@@ -137,6 +129,14 @@ Docker images are available on [DockerHub](https://hub.docker.com/r/rocm/composa ...@@ -137,6 +129,14 @@ Docker images are available on [DockerHub](https://hub.docker.com/r/rocm/composa
You can find instructions for running ckProfiler in [profiler](/profiler). You can find instructions for running ckProfiler in [profiler](/profiler).
* Build our documentation locally:
``` bash
cd docs
pip3 install -r sphinx/requirements.txt
python3 -m sphinx -T -E -b html -d _build/doctrees -D language=en . _build/html
```
Note the `-j` option for building with multiple threads in parallel, which speeds up the build significantly. Note the `-j` option for building with multiple threads in parallel, which speeds up the build significantly.
However, `-j` launches an unlimited number of threads, which can cause the build to run out of memory and However, `-j` launches an unlimited number of threads, which can cause the build to run out of memory and
crash. On average, you should expect each thread to use ~2 GB of RAM. crash. On average, you should expect each thread to use ~2 GB of RAM.
......
[Back to the main page](./README.md)
# Composable Kernel terminology
\ No newline at end of file
[Back to the main page](../../README.md)
# Composable Kernel wrapper GEMM tutorial # Composable Kernel wrapper GEMM tutorial
This tutorial demonstrates how to implement matrix multiplication using Composable Kernel (CK) This tutorial demonstrates how to implement matrix multiplication using Composable Kernel (CK) wrapper. We present the base version of GEMM without most of the available optimizations; however, it's worth noting that CK has kernels with different optimizations.
wrapper. We present the base version of GEMM without most of the available optimizations; however,
it's worth noting that CK has kernels with different optimizations.
To implement these optimizations, you can use the CK wrapper or directly use available instances in To implement these optimizations, you can use the CK wrapper or directly use available instances in CK. You can also refer to the [optimized GEMM example](https://github.com/ROCm/composable_kernel/blob/develop/client_example/25_wrapper/wrapper_optimized_gemm.cpp), that uses CK wrapper based on the [`gridwise_gemm_xdlops_v2r3`](https://github.com/ROCm/composable_kernel/blob/develop/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_v2r3.hpp) implementation.
CK. You can also refer to the
[optimized GEMM example](https://github.com/ROCm/composable_kernel/blob/develop/client_example/25_wrapper/wrapper_optimized_gemm.cpp),
that uses CK wrapper based on the
[`gridwise_gemm_xdlops_v2r3`](https://github.com/ROCm/composable_kernel/blob/develop/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_v2r3.hpp) implementation.
The kernel definition should look similar to: The kernel definition should look similar to:
......
...@@ -56,6 +56,14 @@ if (GPU_TARGETS) ...@@ -56,6 +56,14 @@ if (GPU_TARGETS)
add_definitions(-DCK_USE_WMMA) add_definitions(-DCK_USE_WMMA)
set(CK_USE_WMMA "ON") set(CK_USE_WMMA "ON")
endif() endif()
if (GPU_TARGETS MATCHES "gfx12")
add_definitions(-DCK_USE_OCP_FP8)
set(CK_USE_OCP_FP8 "ON")
endif()
if (GPU_TARGETS MATCHES "gfx90a" OR GPU_TARGETS MATCHES "gfx94")
add_definitions(-DCK_USE_FNUZ_FP8)
set(CK_USE_FNUZ_FP8 "ON")
endif()
else() else()
add_definitions(-DCK_USE_WMMA -DCK_USE_XDL) add_definitions(-DCK_USE_WMMA -DCK_USE_XDL)
set(CK_USE_XDL "ON") set(CK_USE_XDL "ON")
......
[Back to the main page](../README.md)
# Composable Kernel client examples
## ##
Client application links to CK library, and therefore CK library needs to be installed before building client applications. Client application links to CK library, and therefore CK library needs to be installed before building client applications.
......
[Back to the main page](../README.md)
# Composable Kernel codegen
\ No newline at end of file
...@@ -4,6 +4,7 @@ ...@@ -4,6 +4,7 @@
#include <hip/hip_runtime_api.h> #include <hip/hip_runtime_api.h>
#include <memory> #include <memory>
#include <string> #include <string>
#include <stdexcept>
namespace rtc { namespace rtc {
......
rocm-docs-core==1.9.2 rocm-docs-core==1.11.0
sphinxcontrib-bibtex==2.6.3 sphinxcontrib-bibtex==2.6.3
...@@ -103,7 +103,7 @@ requests==2.32.3 ...@@ -103,7 +103,7 @@ requests==2.32.3
# via # via
# pygithub # pygithub
# sphinx # sphinx
rocm-docs-core==1.9.2 rocm-docs-core==1.11.0
# via -r requirements.in # via -r requirements.in
six==1.16.0 six==1.16.0
# via pybtex # via pybtex
......
...@@ -76,7 +76,7 @@ struct ProblemSizeSplitK final ...@@ -76,7 +76,7 @@ struct ProblemSizeSplitK final
struct ExecutionConfig final struct ExecutionConfig final
{ {
// 0 - no verification, 1 - CPU, 2 - GPU, 3 - CPU + GPU // 0 - no verification, 1 - CPU, 2 - GPU, 3 - CPU + GPU
int do_verification = 3; int do_verification = 1;
int init_method = 2; int init_method = 2;
bool time_kernel = false; bool time_kernel = false;
}; };
......
...@@ -143,8 +143,8 @@ bool run_gemm(const ProblemType& problem_size, const ExecutionConfig& config) ...@@ -143,8 +143,8 @@ bool run_gemm(const ProblemType& problem_size, const ExecutionConfig& config)
switch(config.init_method) switch(config.init_method)
{ {
case 0: case 0:
ck::utils::FillConstant<ADataType>{static_cast<ADataType>(1.f)}(a_m_k); ck::utils::FillConstant<ADataType>{ck::type_convert<ADataType>(1.f)}(a_m_k);
ck::utils::FillConstant<BDataType>{static_cast<BDataType>(1.f)}(b_k_n); ck::utils::FillConstant<BDataType>{ck::type_convert<BDataType>(1.f)}(b_k_n);
break; break;
case 1: case 1:
ck::utils::FillUniformDistributionIntegerValue<ADataType>{-5.f, 5.f}(a_m_k); ck::utils::FillUniformDistributionIntegerValue<ADataType>{-5.f, 5.f}(a_m_k);
......
...@@ -186,15 +186,15 @@ bool run_grouped_gemm(const ProblemSize& problem_size, const ExecutionConfig& co ...@@ -186,15 +186,15 @@ bool run_grouped_gemm(const ProblemSize& problem_size, const ExecutionConfig& co
b_tensors[i].GenerateTensorValue(GeneratorTensor_3<BDataType>{-0.5, 0.5}); b_tensors[i].GenerateTensorValue(GeneratorTensor_3<BDataType>{-0.5, 0.5});
for(int j = 0; j < NumDMatrices; ++j) for(int j = 0; j < NumDMatrices; ++j)
{ {
d_tensors[i][j].GenerateTensorValue(GeneratorTensor_3<ADataType>{0.0, 1.0}); d_tensors[i][j].GenerateTensorValue(GeneratorTensor_3<DDataType>{0.0, 1.0});
} }
break; break;
default: default:
a_tensors[i].GenerateTensorValue(GeneratorTensor_Sequential<0>{}); a_tensors[i].GenerateTensorValue(GeneratorTensor_Sequential<ADataType, 0>{});
b_tensors[i].GenerateTensorValue(GeneratorTensor_Sequential<1>{}); b_tensors[i].GenerateTensorValue(GeneratorTensor_Sequential<BDataType, 1>{});
for(int j = 0; j < NumDMatrices; ++j) for(int j = 0; j < NumDMatrices; ++j)
{ {
d_tensors[i][j].GenerateTensorValue(GeneratorTensor_Sequential<0>{}); d_tensors[i][j].GenerateTensorValue(GeneratorTensor_Sequential<DDataType, 0>{});
} }
} }
} }
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment