Commit 51761b3a (unverified), authored by Yifan Xiong, committed by GitHub
Browse files

Release - SuperBench v0.8.0 (#517)



**Description**

Cherry-pick bug fixes from v0.8.0 to main.

**Major Revisions**

* Monitor - Fix the cgroup version checking logic (#502)
* Benchmark - Fix matrix size overflow issue in cuBLASLt GEMM (#503)
* Fix wrong torch usage in communication wrapper for Distributed
Inference Benchmark (#505)
* Analyzer - Fix bug in Python 3.8 due to pandas API change (#504)
* Bug - Fix bug to get metric from cmd when error happens (#506)
* Monitor - Collect realtime GPU power when benchmarking (#507)
* Add num_workers argument in model benchmark (#511)
* Remove unreachable condition when write host list (#512)
* Update cuda11.8 image to cuda12.1 based on nvcr23.03 (#513)
* Doc - Fix wrong unit of cpu-memory-bw-latency in doc (#515)
* Docs - Upgrade version and release note (#508)
Co-authored-by: guoshzhao <guzhao@microsoft.com>
Co-authored-by: Ziyue Yang <ziyyang@microsoft.com>
Co-authored-by: Yuting Jiang <yutingjiang@microsoft.com>
parent 97c9a41f
...@@ -24,9 +24,9 @@ jobs: ...@@ -24,9 +24,9 @@ jobs:
strategy: strategy:
matrix: matrix:
include: include:
- name: cuda11.8 - name: cuda12.1
dockerfile: cuda11.8 dockerfile: cuda12.1
tags: superbench/main:cuda11.8 tags: superbench/main:cuda12.1
- name: cuda11.1.1 - name: cuda11.1.1
dockerfile: cuda11.1.1 dockerfile: cuda11.1.1
tags: superbench/main:cuda11.1.1,superbench/superbench:latest tags: superbench/main:cuda11.1.1,superbench/superbench:latest
......
FROM nvcr.io/nvidia/pytorch:22.12-py3 FROM nvcr.io/nvidia/pytorch:23.03-py3
# OS: # OS:
# - Ubuntu: 20.04 # - Ubuntu: 20.04
# - OpenMPI: 4.1.5a1 # - OpenMPI: 4.1.5a1
# - Docker Client: 20.10.8 # - Docker Client: 20.10.8
# NVIDIA: # NVIDIA:
# - CUDA: 11.8.0 # - CUDA: 12.1.0
# - cuDNN: 8.7.0.84 # - cuDNN: 8.8.1.3
# - NCCL: v2.15.5-1 # - NCCL: v2.17.1-1
# Mellanox: # Mellanox:
# - OFED: 5.2-2.2.3.0 # - OFED: 5.2-2.2.3.0 # TODO
# - HPC-X: v2.8.3 # - HPC-X: v2.14
# Intel: # Intel:
# - mlc: v3.9a # - mlc: v3.10
LABEL maintainer="SuperBench" LABEL maintainer="SuperBench"
...@@ -71,37 +71,27 @@ RUN mkdir -p /root/.ssh && \ ...@@ -71,37 +71,27 @@ RUN mkdir -p /root/.ssh && \
# Install OFED # Install OFED
ENV OFED_VERSION=5.2-2.2.3.0 ENV OFED_VERSION=5.2-2.2.3.0
RUN cd /tmp && \ RUN cd /tmp && \
wget -q http://content.mellanox.com/ofed/MLNX_OFED-${OFED_VERSION}/MLNX_OFED_LINUX-${OFED_VERSION}-ubuntu20.04-x86_64.tgz && \ wget -q https://content.mellanox.com/ofed/MLNX_OFED-${OFED_VERSION}/MLNX_OFED_LINUX-${OFED_VERSION}-ubuntu20.04-x86_64.tgz && \
tar xzf MLNX_OFED_LINUX-${OFED_VERSION}-ubuntu20.04-x86_64.tgz && \ tar xzf MLNX_OFED_LINUX-${OFED_VERSION}-ubuntu20.04-x86_64.tgz && \
MLNX_OFED_LINUX-${OFED_VERSION}-ubuntu20.04-x86_64/mlnxofedinstall --user-space-only --without-fw-update --force --all && \ MLNX_OFED_LINUX-${OFED_VERSION}-ubuntu20.04-x86_64/mlnxofedinstall --user-space-only --without-fw-update --force --all && \
rm -rf /tmp/MLNX_OFED_LINUX-${OFED_VERSION}* rm -rf /tmp/MLNX_OFED_LINUX-${OFED_VERSION}*
# Install HPC-X # Install HPC-X
ENV HPCX_VERSION=v2.14
RUN cd /opt && \ RUN cd /opt && \
rm -rf hpcx && \ rm -rf hpcx && \
wget -q https://azhpcstor.blob.core.windows.net/azhpc-images-store/hpcx-v2.8.3-gcc-MLNX_OFED_LINUX-${OFED_VERSION}-ubuntu20.04-x86_64.tbz && \ wget -q https://content.mellanox.com/hpc/hpc-x/${HPCX_VERSION}/hpcx-${HPCX_VERSION}-gcc-MLNX_OFED_LINUX-5-ubuntu20.04-cuda12-gdrcopy2-nccl2.17-x86_64.tbz -O hpcx.tbz && \
tar xf hpcx-v2.8.3-gcc-MLNX_OFED_LINUX-${OFED_VERSION}-ubuntu20.04-x86_64.tbz && \ tar xf hpcx.tbz && \
ln -s hpcx-v2.8.3-gcc-MLNX_OFED_LINUX-${OFED_VERSION}-ubuntu20.04-x86_64 hpcx && \ mv hpcx-${HPCX_VERSION}-gcc-MLNX_OFED_LINUX-5-ubuntu20.04-cuda12-gdrcopy2-nccl2.17-x86_64 hpcx && \
rm hpcx-v2.8.3-gcc-MLNX_OFED_LINUX-${OFED_VERSION}-ubuntu20.04-x86_64.tbz rm hpcx.tbz
# Install Intel MLC # Install Intel MLC
RUN cd /tmp && \ RUN cd /tmp && \
wget -q https://downloadmirror.intel.com/736634/mlc_v3.9a.tgz -O mlc.tgz && \ wget -q https://downloadmirror.intel.com/763324/mlc_v3.10.tgz -O mlc.tgz && \
tar xzf mlc.tgz Linux/mlc && \ tar xzf mlc.tgz Linux/mlc && \
cp ./Linux/mlc /usr/local/bin/ && \ cp ./Linux/mlc /usr/local/bin/ && \
rm -rf ./Linux mlc.tgz rm -rf ./Linux mlc.tgz
ENV PATH="${PATH}" \
LD_LIBRARY_PATH="/usr/local/lib:${LD_LIBRARY_PATH}" \
SB_HOME=/opt/superbench \
SB_MICRO_PATH=/opt/superbench \
ANSIBLE_DEPRECATION_WARNINGS=FALSE \
ANSIBLE_COLLECTIONS_PATH=/usr/share/ansible/collections
RUN echo PATH="$PATH" > /etc/environment && \
echo LD_LIBRARY_PATH="$LD_LIBRARY_PATH" >> /etc/environment && \
echo SB_MICRO_PATH="$SB_MICRO_PATH" >> /etc/environment
# Install AOCC compiler # Install AOCC compiler
RUN cd /tmp && \ RUN cd /tmp && \
wget https://download.amd.com/developer/eula/aocc-compiler/aocc-compiler-4.0.0_1_amd64.deb && \ wget https://download.amd.com/developer/eula/aocc-compiler/aocc-compiler-4.0.0_1_amd64.deb && \
...@@ -115,6 +105,18 @@ RUN cd /tmp && \ ...@@ -115,6 +105,18 @@ RUN cd /tmp && \
mv amd-blis /opt/AMD && \ mv amd-blis /opt/AMD && \
rm -rf aocl-blis-linux-aocc-4.0.tar.gz rm -rf aocl-blis-linux-aocc-4.0.tar.gz
ENV PATH="${PATH}" \
LD_LIBRARY_PATH="/usr/local/lib:${LD_LIBRARY_PATH}" \
SB_HOME=/opt/superbench \
SB_MICRO_PATH=/opt/superbench \
ANSIBLE_DEPRECATION_WARNINGS=FALSE \
ANSIBLE_COLLECTIONS_PATH=/usr/share/ansible/collections
RUN echo PATH="$PATH" > /etc/environment && \
echo LD_LIBRARY_PATH="$LD_LIBRARY_PATH" >> /etc/environment && \
echo SB_MICRO_PATH="$SB_MICRO_PATH" >> /etc/environment
# Add config files # Add config files
ADD dockerfile/etc /opt/microsoft/ ADD dockerfile/etc /opt/microsoft/
......
...@@ -29,7 +29,7 @@ You need to [clone the code](./development.md#set-up) first before building the ...@@ -29,7 +29,7 @@ You need to [clone the code](./development.md#set-up) first before building the
export DOCKER_BUILDKIT=1 export DOCKER_BUILDKIT=1
docker buildx build \ docker buildx build \
--platform linux/amd64 --cache-to type=inline,mode=max \ --platform linux/amd64 --cache-to type=inline,mode=max \
--tag superbench-dev --file dockerfile/cuda11.1.1.dockerfile . --tag superbench-dev --file dockerfile/cuda12.1.dockerfile .
``` ```
</TabItem> </TabItem>
...@@ -39,7 +39,7 @@ docker buildx build \ ...@@ -39,7 +39,7 @@ docker buildx build \
export DOCKER_BUILDKIT=1 export DOCKER_BUILDKIT=1
docker buildx build \ docker buildx build \
--platform linux/amd64 --cache-to type=inline,mode=max \ --platform linux/amd64 --cache-to type=inline,mode=max \
--tag superbench-dev --file dockerfile/rocm4.2-pytorch1.7.0.dockerfile . --tag superbench-dev --file dockerfile/rocm5.1.x.dockerfile .
``` ```
</TabItem> </TabItem>
......
...@@ -45,7 +45,7 @@ but it is not strictly necessary. ...@@ -45,7 +45,7 @@ but it is not strictly necessary.
```bash ```bash
# create a new virtual environment # create a new virtual environment
python3 -m venv --system-site-packages ./venv python3 -m venv ./venv
# activate the virtual environment # activate the virtual environment
source ./venv/bin/activate source ./venv/bin/activate
...@@ -61,7 +61,7 @@ You can clone the source from GitHub and build it. ...@@ -61,7 +61,7 @@ You can clone the source from GitHub and build it.
:::note Note :::note Note
You should checkout corresponding tag to use release version, for example, You should checkout corresponding tag to use release version, for example,
`git clone -b v0.7.0 https://github.com/microsoft/superbenchmark` `git clone -b v0.8.0 https://github.com/microsoft/superbenchmark`
::: :::
```bash ```bash
......
...@@ -27,7 +27,7 @@ sb deploy -f remote.ini --host-password [password] ...@@ -27,7 +27,7 @@ sb deploy -f remote.ini --host-password [password]
:::note Note :::note Note
You should deploy corresponding Docker image to use release version, for example, You should deploy corresponding Docker image to use release version, for example,
`sb deploy -f local.ini -i superbench/superbench:v0.7.0-cuda11.1.1` `sb deploy -f local.ini -i superbench/superbench:v0.8.0-cuda12.1`
You should note that version of git repo only determines version of sb CLI, and not the sb container. You should define the container version even if you specified a release version for the git clone. You should note that version of git repo only determines version of sb CLI, and not the sb container. You should define the container version even if you specified a release version for the git clone.
......
...@@ -70,7 +70,7 @@ superbench: ...@@ -70,7 +70,7 @@ superbench:
<TabItem value='example'> <TabItem value='example'>
```yaml ```yaml
version: v0.7 version: v0.8
superbench: superbench:
enable: benchmark_1 enable: benchmark_1
monitor: monitor:
......
...@@ -181,7 +181,7 @@ Performed by [High-Performance Linpack Benchmark for Distributed-Memory Computer ...@@ -181,7 +181,7 @@ Performed by [High-Performance Linpack Benchmark for Distributed-Memory Computer
#### Metrics #### Metrics
| Name | Unit | Description | | Name | Unit | Description |
|---------------------|--------------------|----------------------------------------------------------------------------| |--------------------|--------------------|---------------------------------------------------------------------------|
| cpu-hpl/tests_pass | | HPL completed running and correctness test has passed (1: pass, 0: fail). | | cpu-hpl/tests_pass | | HPL completed running and correctness test has passed (1: pass, 0: fail). |
| cpu-hpl/throughput | bandwidth (GFlops) | Compute bandwidth. | | cpu-hpl/throughput | bandwidth (GFlops) | Compute bandwidth. |
| cpu-hpl/time | time (s) | Time elapsed during HPL run. | | cpu-hpl/time | time (s) | Time elapsed during HPL run. |
...@@ -216,13 +216,13 @@ performed by [Intel MLC Tool](https://www.intel.com/content/www/us/en/developer/ ...@@ -216,13 +216,13 @@ performed by [Intel MLC Tool](https://www.intel.com/content/www/us/en/developer/
| Name | Unit | Description | | Name | Unit | Description |
|-------------------------------------------------------------------------|------------------|---------------------------------------------------------------------| |-------------------------------------------------------------------------|------------------|---------------------------------------------------------------------|
| cpu-memory-bw-latency/mem\_bandwidth\_matrix\_numa\_[0-9]+\_[0-9]+\_bw | bandwidth (GB/s) | Former NUMA to latter NUMA memory bandwidth. | | cpu-memory-bw-latency/mem\_bandwidth\_matrix\_numa\_[0-9]+\_[0-9]+\_bw | bandwidth (MB/s) | Former NUMA to latter NUMA memory bandwidth. |
| cpu-memory-bw-latency/mem\_bandwidth\_matrix\_numa\_[0-9]+\_[0-9]+\_lat | time (us) | Former NUMA to latter NUMA memory latency. | | cpu-memory-bw-latency/mem\_bandwidth\_matrix\_numa\_[0-9]+\_[0-9]+\_lat | time (ns) | Former NUMA to latter NUMA memory latency. |
| cpu-memory-bw-latency/mem\_max\_bandwidth\_all\_reads\_bw | bandwidth (GB/s) | Whole-CPU maximum memory bandwidth, full read. | | cpu-memory-bw-latency/mem\_max\_bandwidth\_all\_reads\_bw | bandwidth (MB/s) | Whole-CPU maximum memory bandwidth, full read. |
| cpu-memory-bw-latency/mem\_max\_bandwidth\_3_1\_reads-writes\_bw | bandwidth (GB/s) | Whole-CPU maximum memory bandwidth, read : write = 3 : 1. | | cpu-memory-bw-latency/mem\_max\_bandwidth\_3_1\_reads-writes\_bw | bandwidth (MB/s) | Whole-CPU maximum memory bandwidth, read : write = 3 : 1. |
| cpu-memory-bw-latency/mem\_max\_bandwidth\_2_1\_reads-writes\_bw | bandwidth (GB/s) | Whole-CPU maximum memory bandwidth, read : write = 2 : 1. | | cpu-memory-bw-latency/mem\_max\_bandwidth\_2_1\_reads-writes\_bw | bandwidth (MB/s) | Whole-CPU maximum memory bandwidth, read : write = 2 : 1. |
| cpu-memory-bw-latency/mem\_max\_bandwidth\_1_1\_reads-writes\_bw | bandwidth (GB/s) | Whole-CPU maximum memory bandwidth, read : write = 1 : 1. | | cpu-memory-bw-latency/mem\_max\_bandwidth\_1_1\_reads-writes\_bw | bandwidth (MB/s) | Whole-CPU maximum memory bandwidth, read : write = 1 : 1. |
| cpu-memory-bw-latency/mem\_max\_bandwidth\_stream-triad\_like\_bw | bandwidth (GB/s) | Whole-CPU maximum memory bandwidth, with stream-triad like pattern. | | cpu-memory-bw-latency/mem\_max\_bandwidth\_stream-triad\_like\_bw | bandwidth (MB/s) | Whole-CPU maximum memory bandwidth, with stream-triad like pattern. |
### `mem-bw` ### `mem-bw`
......
...@@ -29,6 +29,8 @@ available tags are listed below for all stable versions. ...@@ -29,6 +29,8 @@ available tags are listed below for all stable versions.
| Tag | Description | | Tag | Description |
|-------------------|------------------------------------| |-------------------|------------------------------------|
| v0.8.0-cuda12.1 | SuperBench v0.8.0 with CUDA 12.1 |
| v0.8.0-cuda11.1.1 | SuperBench v0.8.0 with CUDA 11.1.1 |
| v0.7.0-cuda11.8 | SuperBench v0.7.0 with CUDA 11.8 | | v0.7.0-cuda11.8 | SuperBench v0.7.0 with CUDA 11.8 |
| v0.7.0-cuda11.1.1 | SuperBench v0.7.0 with CUDA 11.1.1 | | v0.7.0-cuda11.1.1 | SuperBench v0.7.0 with CUDA 11.1.1 |
| v0.6.0-cuda11.1.1 | SuperBench v0.6.0 with CUDA 11.1.1 | | v0.6.0-cuda11.1.1 | SuperBench v0.6.0 with CUDA 11.1.1 |
...@@ -43,6 +45,10 @@ available tags are listed below for all stable versions. ...@@ -43,6 +45,10 @@ available tags are listed below for all stable versions.
| Tag | Description | | Tag | Description |
|-------------------------------|--------------------------------------------------| |-------------------------------|--------------------------------------------------|
| v0.8.0-rocm5.1.3 | SuperBench v0.8.0 with ROCm 5.1.3 |
| v0.8.0-rocm5.1.1 | SuperBench v0.8.0 with ROCm 5.1.1 |
| v0.8.0-rocm5.0.1 | SuperBench v0.8.0 with ROCm 5.0.1 |
| v0.8.0-rocm5.0 | SuperBench v0.8.0 with ROCm 5.0 |
| v0.7.0-rocm5.1.3 | SuperBench v0.7.0 with ROCm 5.1.3 | | v0.7.0-rocm5.1.3 | SuperBench v0.7.0 with ROCm 5.1.3 |
| v0.7.0-rocm5.1.1 | SuperBench v0.7.0 with ROCm 5.1.1 | | v0.7.0-rocm5.1.1 | SuperBench v0.7.0 with ROCm 5.1.1 |
| v0.7.0-rocm5.0.1 | SuperBench v0.7.0 with ROCm 5.0.1 | | v0.7.0-rocm5.0.1 | SuperBench v0.7.0 with ROCm 5.0.1 |
......
...@@ -65,7 +65,7 @@ superbench: ...@@ -65,7 +65,7 @@ superbench:
example: example:
```yaml ```yaml
# SuperBench rules # SuperBench rules
version: v0.7 version: v0.8
superbench: superbench:
rules: rules:
failure-rule: failure-rule:
......
...@@ -58,7 +58,7 @@ superbench: ...@@ -58,7 +58,7 @@ superbench:
```yaml title="Example" ```yaml title="Example"
# SuperBench rules # SuperBench rules
version: v0.7 version: v0.8
superbench: superbench:
rules: rules:
kernel_launch: kernel_launch:
......
...@@ -6,5 +6,5 @@ ...@@ -6,5 +6,5 @@
Provide hardware and software benchmarks for AI systems. Provide hardware and software benchmarks for AI systems.
""" """
__version__ = '0.7.0' __version__ = '0.8.0'
__author__ = 'Microsoft' __author__ = 'Microsoft'
...@@ -31,11 +31,13 @@ def statistic(raw_data_df): ...@@ -31,11 +31,13 @@ def statistic(raw_data_df):
logger.warning('DataAnalyzer: empty data.') logger.warning('DataAnalyzer: empty data.')
return data_statistics_df return data_statistics_df
try: try:
raw_data_df = raw_data_df.apply(pd.to_numeric, errors='coerce')
raw_data_df = raw_data_df.dropna(axis=1, how='all')
data_statistics_df = raw_data_df.describe() data_statistics_df = raw_data_df.describe()
data_statistics_df.loc['1%'] = raw_data_df.quantile(0.01) data_statistics_df.loc['1%'] = raw_data_df.quantile(0.01, numeric_only=True)
data_statistics_df.loc['5%'] = raw_data_df.quantile(0.05) data_statistics_df.loc['5%'] = raw_data_df.quantile(0.05, numeric_only=True)
data_statistics_df.loc['95%'] = raw_data_df.quantile(0.95) data_statistics_df.loc['95%'] = raw_data_df.quantile(0.95, numeric_only=True)
data_statistics_df.loc['99%'] = raw_data_df.quantile(0.99) data_statistics_df.loc['99%'] = raw_data_df.quantile(0.99, numeric_only=True)
statistics_error = [] statistics_error = []
for column in list(raw_data_df.columns): for column in list(raw_data_df.columns):
if column not in list(data_statistics_df.columns) and not raw_data_df[column].isnull().all(): if column not in list(data_statistics_df.columns) and not raw_data_df[column].isnull().all():
...@@ -122,6 +124,8 @@ def correlation(raw_data_df): ...@@ -122,6 +124,8 @@ def correlation(raw_data_df):
logger.warning('DataAnalyzer: empty data.') logger.warning('DataAnalyzer: empty data.')
return data_corr_df return data_corr_df
try: try:
raw_data_df = raw_data_df.apply(pd.to_numeric, errors='coerce')
raw_data_df = raw_data_df.dropna(axis=1, how='all')
data_corr_df = raw_data_df.corr() data_corr_df = raw_data_df.corr()
statistics_error = [] statistics_error = []
for column in list(raw_data_df.columns): for column in list(raw_data_df.columns):
...@@ -181,6 +185,8 @@ def generate_baseline(raw_data_df, output_dir): ...@@ -181,6 +185,8 @@ def generate_baseline(raw_data_df, output_dir):
output_dir (str): the directory of output file output_dir (str): the directory of output file
""" """
try: try:
raw_data_df = raw_data_df.apply(pd.to_numeric, errors='coerce')
raw_data_df = raw_data_df.dropna(axis=1, how='all')
if not isinstance(raw_data_df, pd.DataFrame): if not isinstance(raw_data_df, pd.DataFrame):
logger.error('DataAnalyzer: the type of raw data is not pd.DataFrame') logger.error('DataAnalyzer: the type of raw data is not pd.DataFrame')
return return
......
...@@ -285,7 +285,7 @@ def output_diagnosis_in_excel(self, raw_data_df, data_not_accept_df, output_path ...@@ -285,7 +285,7 @@ def output_diagnosis_in_excel(self, raw_data_df, data_not_accept_df, output_path
logger.log_and_raise(exception=IOError, msg='DataDiagnosis: excel_data_output - invalid file path.') logger.log_and_raise(exception=IOError, msg='DataDiagnosis: excel_data_output - invalid file path.')
file_handler.output_excel_raw_data(writer, raw_data_df, 'Raw Data') file_handler.output_excel_raw_data(writer, raw_data_df, 'Raw Data')
file_handler.output_excel_data_not_accept(writer, data_not_accept_df, rules) file_handler.output_excel_data_not_accept(writer, data_not_accept_df, rules)
writer.save() writer.close()
except Exception as e: except Exception as e:
logger.log_and_raise(exception=Exception, msg='DataDiagnosis: excel_data_output - {}'.format(str(e))) logger.log_and_raise(exception=Exception, msg='DataDiagnosis: excel_data_output - {}'.format(str(e)))
......
...@@ -117,7 +117,7 @@ def _merge_summary(self, summary): ...@@ -117,7 +117,7 @@ def _merge_summary(self, summary):
summary_df = pd.DataFrame() summary_df = pd.DataFrame()
for category in summary: for category in summary:
for i in range(len(summary[category])): for i in range(len(summary[category])):
summary_df = summary_df.append([summary[category][i]], ignore_index=True) summary_df = pd.concat([summary_df, pd.DataFrame([summary[category][i]])], ignore_index=True)
return summary_df return summary_df
def _generate_summary(self, round): def _generate_summary(self, round):
...@@ -217,7 +217,7 @@ def output_summary_in_excel(self, raw_data_df, summary, output_path): ...@@ -217,7 +217,7 @@ def output_summary_in_excel(self, raw_data_df, summary, output_path):
file_handler.merge_column_in_excel(worksheet, row, 1) file_handler.merge_column_in_excel(worksheet, row, 1)
else: else:
logger.error('ResultSummary: excel_data_output - summary is empty.') logger.error('ResultSummary: excel_data_output - summary is empty.')
writer.save() writer.close()
except Exception as e: except Exception as e:
logger.error('ResultSummary: excel_data_output - {}'.format(str(e))) logger.error('ResultSummary: excel_data_output - {}'.format(str(e)))
......
...@@ -88,20 +88,21 @@ template <typename T> cudaDataType_t get_datatype() { ...@@ -88,20 +88,21 @@ template <typename T> cudaDataType_t get_datatype() {
} }
template <typename Ta, typename Tb, typename Tout> template <typename Ta, typename Tb, typename Tout>
float timing_matmul_tn(int m, int n, int k, int batch, int warmup, int iter) { float timing_matmul_tn(size_t m, size_t n, size_t k, size_t batch, int warmup, int iter) {
// init matrix // init matrix
Ta *matrix_a = nullptr; Ta *matrix_a = nullptr;
Tb *matrix_b = nullptr; Tb *matrix_b = nullptr;
Tout *matrix_out = nullptr; Tout *matrix_out = nullptr;
cudaMalloc(&matrix_a, m * k * std::max(batch, 1) * sizeof(Ta)); batch = std::max<size_t>(batch, 1);
cudaMalloc(&matrix_b, k * n * std::max(batch, 1) * sizeof(Tb)); cudaMalloc(&matrix_a, m * k * batch * sizeof(Ta));
cudaMalloc(&matrix_out, m * n * std::max(batch, 1) * sizeof(Tout)); cudaMalloc(&matrix_b, k * n * batch * sizeof(Tb));
cudaMalloc(&matrix_out, m * n * batch * sizeof(Tout));
init_matrix<Ta><<<216, 1024>>>(matrix_a, 1.f, m * k * std::max(batch, 1)); init_matrix<Ta><<<216, 1024>>>(matrix_a, 1.f, m * k * batch);
init_matrix<Tb><<<216, 1024>>>(matrix_b, 2.f, k * n * std::max(batch, 1)); init_matrix<Tb><<<216, 1024>>>(matrix_b, 2.f, k * n * batch);
// init gemm // init gemm
int lda = k, ldb = k, ldd = m; size_t lda = k, ldb = k, ldd = m;
std::unique_ptr<cublasLtGemm> gemm = std::make_unique<cublasLtGemm>(); std::unique_ptr<cublasLtGemm> gemm = std::make_unique<cublasLtGemm>();
gemm->Init(); gemm->Init();
gemm->Setup(m, n, k, batch, lda, ldb, ldd, get_datatype<Ta>(), get_datatype<Tb>(), get_datatype<Tout>(), gemm->Setup(m, n, k, batch, lda, ldb, ldd, get_datatype<Ta>(), get_datatype<Tb>(), get_datatype<Tout>(),
......
...@@ -5,12 +5,12 @@ ...@@ -5,12 +5,12 @@
void cublasLtGemm::Init() { void cublasLtGemm::Init() {
cublasLtHandle_t handle; cublasLtHandle_t handle;
checkCublasStatus(cublasLtCreate(&handle)); CUBLAS_CHECK(cublasLtCreate(&handle));
handle_.reset(handle); handle_.reset(handle);
/* preference can be initialized without arguments */ /* preference can be initialized without arguments */
cublasLtMatmulPreference_t preference; cublasLtMatmulPreference_t preference;
checkCublasStatus(cublasLtMatmulPreferenceCreate(&preference)); CUBLAS_CHECK(cublasLtMatmulPreferenceCreate(&preference));
preference_.reset(preference); preference_.reset(preference);
} }
...@@ -24,32 +24,32 @@ void cublasLtGemm::Setup(int m, int n, int k, int batch, int lda, int ldb, int l ...@@ -24,32 +24,32 @@ void cublasLtGemm::Setup(int m, int n, int k, int batch, int lda, int ldb, int l
// force c_type // force c_type
cudaDataType_t c_type = d_type; cudaDataType_t c_type = d_type;
// Create matrix descriptors. // Create matrix descriptors.
checkCublasStatus( CUBLAS_CHECK(
cublasLtMatrixLayoutCreate(&a_desc, a_type, transa == CUBLAS_OP_N ? m : k, transa == CUBLAS_OP_N ? k : m, lda)); cublasLtMatrixLayoutCreate(&a_desc, a_type, transa == CUBLAS_OP_N ? m : k, transa == CUBLAS_OP_N ? k : m, lda));
checkCublasStatus( CUBLAS_CHECK(
cublasLtMatrixLayoutCreate(&b_desc, b_type, transb == CUBLAS_OP_N ? k : n, transb == CUBLAS_OP_N ? n : k, ldb)); cublasLtMatrixLayoutCreate(&b_desc, b_type, transb == CUBLAS_OP_N ? k : n, transb == CUBLAS_OP_N ? n : k, ldb));
checkCublasStatus(cublasLtMatrixLayoutCreate(&c_desc, c_type, m, n, ldd)); CUBLAS_CHECK(cublasLtMatrixLayoutCreate(&c_desc, c_type, m, n, ldd));
checkCublasStatus(cublasLtMatrixLayoutCreate(&d_desc, d_type, m, n, ldd)); CUBLAS_CHECK(cublasLtMatrixLayoutCreate(&d_desc, d_type, m, n, ldd));
// strided batch gemm // strided batch gemm
if (batch > 0) { if (batch > 0) {
int64_t stridea = m * k, strideb = k * n, stridec = m * n, strided = m * n; int64_t stridea = m * k, strideb = k * n, stridec = m * n, strided = m * n;
checkCublasStatus( CUBLAS_CHECK(
cublasLtMatrixLayoutSetAttribute(a_desc, CUBLASLT_MATRIX_LAYOUT_BATCH_COUNT, &batch, sizeof(batch))); cublasLtMatrixLayoutSetAttribute(a_desc, CUBLASLT_MATRIX_LAYOUT_BATCH_COUNT, &batch, sizeof(batch)));
checkCublasStatus(cublasLtMatrixLayoutSetAttribute(a_desc, CUBLASLT_MATRIX_LAYOUT_STRIDED_BATCH_OFFSET, CUBLAS_CHECK(cublasLtMatrixLayoutSetAttribute(a_desc, CUBLASLT_MATRIX_LAYOUT_STRIDED_BATCH_OFFSET, &stridea,
&stridea, sizeof(stridea))); sizeof(stridea)));
checkCublasStatus( CUBLAS_CHECK(
cublasLtMatrixLayoutSetAttribute(b_desc, CUBLASLT_MATRIX_LAYOUT_BATCH_COUNT, &batch, sizeof(batch))); cublasLtMatrixLayoutSetAttribute(b_desc, CUBLASLT_MATRIX_LAYOUT_BATCH_COUNT, &batch, sizeof(batch)));
checkCublasStatus(cublasLtMatrixLayoutSetAttribute(b_desc, CUBLASLT_MATRIX_LAYOUT_STRIDED_BATCH_OFFSET, CUBLAS_CHECK(cublasLtMatrixLayoutSetAttribute(b_desc, CUBLASLT_MATRIX_LAYOUT_STRIDED_BATCH_OFFSET, &strideb,
&strideb, sizeof(strideb))); sizeof(strideb)));
checkCublasStatus( CUBLAS_CHECK(
cublasLtMatrixLayoutSetAttribute(c_desc, CUBLASLT_MATRIX_LAYOUT_BATCH_COUNT, &batch, sizeof(batch))); cublasLtMatrixLayoutSetAttribute(c_desc, CUBLASLT_MATRIX_LAYOUT_BATCH_COUNT, &batch, sizeof(batch)));
checkCublasStatus(cublasLtMatrixLayoutSetAttribute(c_desc, CUBLASLT_MATRIX_LAYOUT_STRIDED_BATCH_OFFSET, CUBLAS_CHECK(cublasLtMatrixLayoutSetAttribute(c_desc, CUBLASLT_MATRIX_LAYOUT_STRIDED_BATCH_OFFSET, &stridec,
&stridec, sizeof(stridec))); sizeof(stridec)));
checkCublasStatus( CUBLAS_CHECK(
cublasLtMatrixLayoutSetAttribute(d_desc, CUBLASLT_MATRIX_LAYOUT_BATCH_COUNT, &batch, sizeof(batch))); cublasLtMatrixLayoutSetAttribute(d_desc, CUBLASLT_MATRIX_LAYOUT_BATCH_COUNT, &batch, sizeof(batch)));
checkCublasStatus(cublasLtMatrixLayoutSetAttribute(d_desc, CUBLASLT_MATRIX_LAYOUT_STRIDED_BATCH_OFFSET, CUBLAS_CHECK(cublasLtMatrixLayoutSetAttribute(d_desc, CUBLASLT_MATRIX_LAYOUT_STRIDED_BATCH_OFFSET, &strided,
&strided, sizeof(strided))); sizeof(strided)));
} }
a_desc_.reset(a_desc); a_desc_.reset(a_desc);
b_desc_.reset(b_desc); b_desc_.reset(b_desc);
...@@ -64,7 +64,7 @@ void cublasLtGemm::Setup(int m, int n, int k, int batch, int lda, int ldb, int l ...@@ -64,7 +64,7 @@ void cublasLtGemm::Setup(int m, int n, int k, int batch, int lda, int ldb, int l
gemm_compute_type = CUBLAS_COMPUTE_64F; gemm_compute_type = CUBLAS_COMPUTE_64F;
cublasLtMatmulDesc_t op_desc = nullptr; cublasLtMatmulDesc_t op_desc = nullptr;
checkCublasStatus(cublasLtMatmulDescCreate(&op_desc, gemm_compute_type, CUDA_R_32F)); CUBLAS_CHECK(cublasLtMatmulDescCreate(&op_desc, gemm_compute_type, CUDA_R_32F));
op_desc_.reset(op_desc); op_desc_.reset(op_desc);
if (a_type == CUDA_R_8F_E5M2 || b_type == CUDA_R_8F_E5M2 || a_type == CUDA_R_8F_E4M3 || b_type == CUDA_R_8F_E4M3) { if (a_type == CUDA_R_8F_E5M2 || b_type == CUDA_R_8F_E5M2 || a_type == CUDA_R_8F_E4M3 || b_type == CUDA_R_8F_E4M3) {
...@@ -73,33 +73,31 @@ void cublasLtGemm::Setup(int m, int n, int k, int batch, int lda, int ldb, int l ...@@ -73,33 +73,31 @@ void cublasLtGemm::Setup(int m, int n, int k, int batch, int lda, int ldb, int l
cublasLtMatmulDescSetAttribute(op_desc, CUBLASLT_MATMUL_DESC_FAST_ACCUM, &fastAccuMode, sizeof(fastAccuMode)); cublasLtMatmulDescSetAttribute(op_desc, CUBLASLT_MATMUL_DESC_FAST_ACCUM, &fastAccuMode, sizeof(fastAccuMode));
} }
checkCublasStatus( CUBLAS_CHECK(cublasLtMatmulDescSetAttribute(op_desc_.get(), CUBLASLT_MATMUL_DESC_TRANSA, &transa, sizeof(transa)));
cublasLtMatmulDescSetAttribute(op_desc_.get(), CUBLASLT_MATMUL_DESC_TRANSA, &transa, sizeof(transa))); CUBLAS_CHECK(cublasLtMatmulDescSetAttribute(op_desc_.get(), CUBLASLT_MATMUL_DESC_TRANSB, &transb, sizeof(transb)));
checkCublasStatus(
cublasLtMatmulDescSetAttribute(op_desc_.get(), CUBLASLT_MATMUL_DESC_TRANSB, &transb, sizeof(transb)));
if (a_scale_inverse != nullptr) { if (a_scale_inverse != nullptr) {
checkCublasStatus(cublasLtMatmulDescSetAttribute(op_desc_.get(), CUBLASLT_MATMUL_DESC_A_SCALE_POINTER, CUBLAS_CHECK(cublasLtMatmulDescSetAttribute(op_desc_.get(), CUBLASLT_MATMUL_DESC_A_SCALE_POINTER,
&a_scale_inverse, sizeof(a_scale_inverse))); &a_scale_inverse, sizeof(a_scale_inverse)));
} }
if (b_scale_inverse != nullptr) { if (b_scale_inverse != nullptr) {
checkCublasStatus(cublasLtMatmulDescSetAttribute(op_desc_.get(), CUBLASLT_MATMUL_DESC_B_SCALE_POINTER, CUBLAS_CHECK(cublasLtMatmulDescSetAttribute(op_desc_.get(), CUBLASLT_MATMUL_DESC_B_SCALE_POINTER,
&b_scale_inverse, sizeof(b_scale_inverse))); &b_scale_inverse, sizeof(b_scale_inverse)));
} }
checkCublasStatus( CUBLAS_CHECK(
cublasLtMatmulDescSetAttribute(op_desc_.get(), CUBLASLT_MATMUL_DESC_EPILOGUE, &epilogue, sizeof(epilogue))); cublasLtMatmulDescSetAttribute(op_desc_.get(), CUBLASLT_MATMUL_DESC_EPILOGUE, &epilogue, sizeof(epilogue)));
} }
size_t cublasLtGemm::GetAlgorithm(int max_algorithm_count, size_t max_workspace_size) { size_t cublasLtGemm::GetAlgorithm(int max_algorithm_count, size_t max_workspace_size) {
checkCublasStatus(cublasLtMatmulPreferenceSetAttribute(preference_.get(), CUBLASLT_MATMUL_PREF_MAX_WORKSPACE_BYTES, CUBLAS_CHECK(cublasLtMatmulPreferenceSetAttribute(preference_.get(), CUBLASLT_MATMUL_PREF_MAX_WORKSPACE_BYTES,
&max_workspace_size, sizeof(max_workspace_size))); &max_workspace_size, sizeof(max_workspace_size)));
int found_algorithm_count = 0; int found_algorithm_count = 0;
std::vector<cublasLtMatmulHeuristicResult_t> results(max_algorithm_count); std::vector<cublasLtMatmulHeuristicResult_t> results(max_algorithm_count);
// Though we query all of possible algorithm, we will use the first later // Though we query all of possible algorithm, we will use the first later
checkCublasStatus(cublasLtMatmulAlgoGetHeuristic(handle_.get(), op_desc_.get(), a_desc_.get(), b_desc_.get(), CUBLAS_CHECK(cublasLtMatmulAlgoGetHeuristic(handle_.get(), op_desc_.get(), a_desc_.get(), b_desc_.get(),
c_desc_.get(), d_desc_.get(), preference_.get(), c_desc_.get(), d_desc_.get(), preference_.get(), max_algorithm_count,
max_algorithm_count, results.data(), &found_algorithm_count)); results.data(), &found_algorithm_count));
if (found_algorithm_count == 0) { if (found_algorithm_count == 0) {
throw std::runtime_error("Unable to find any suitable algorithms"); throw std::runtime_error("Unable to find any suitable algorithms");
} }
...@@ -111,7 +109,7 @@ size_t cublasLtGemm::GetAlgorithm(int max_algorithm_count, size_t max_workspace_ ...@@ -111,7 +109,7 @@ size_t cublasLtGemm::GetAlgorithm(int max_algorithm_count, size_t max_workspace_
void cublasLtGemm::Execute(void *matrix_a, void *matrix_b, void *matrix_c, void *matrix_d, float alpha, float beta, void cublasLtGemm::Execute(void *matrix_a, void *matrix_b, void *matrix_c, void *matrix_d, float alpha, float beta,
void *workspace, size_t workspace_size, cudaStream_t stream) { void *workspace, size_t workspace_size, cudaStream_t stream) {
checkCublasStatus(cublasLtMatmul(handle_.get(), op_desc_.get(), static_cast<const void *>(&alpha), /* alpha */ CUBLAS_CHECK(cublasLtMatmul(handle_.get(), op_desc_.get(), static_cast<const void *>(&alpha), /* alpha */
matrix_a, /* A */ matrix_a, /* A */
a_desc_.get(), matrix_b, /* B */ a_desc_.get(), matrix_b, /* B */
b_desc_.get(), static_cast<const void *>(&beta), /* beta */ b_desc_.get(), static_cast<const void *>(&beta), /* beta */
......
...@@ -10,12 +10,14 @@ ...@@ -10,12 +10,14 @@
#include <cublasLt.h> #include <cublasLt.h>
inline void checkCublasStatus(cublasStatus_t status) { #define CUBLAS_CHECK(func) \
if (status != CUBLAS_STATUS_SUCCESS) { do { \
printf("cuBLAS API failed with status %s\n", cublasGetStatusString(status)); cublasStatus_t status = func; \
throw std::logic_error("cuBLAS API failed"); if (status != CUBLAS_STATUS_SUCCESS) { \
} printf("cuBLAS call %s failed at %s:%d '%s'\n", #func, __FILE__, __LINE__, cublasGetStatusString(status)); \
} exit(EXIT_FAILURE); \
} \
} while (0)
class cublasLtGemm { class cublasLtGemm {
public: public:
......
...@@ -408,23 +408,21 @@ def _process_raw_result(self, cmd_idx, raw_output): ...@@ -408,23 +408,21 @@ def _process_raw_result(self, cmd_idx, raw_output):
True if the raw output string is valid and result can be extracted. True if the raw output string is valid and result can be extracted.
""" """
self._result.add_raw_data('raw_output_' + str(cmd_idx), raw_output, self._args.log_raw_data) self._result.add_raw_data('raw_output_' + str(cmd_idx), raw_output, self._args.log_raw_data)
metric = ''
try: try:
lines = raw_output.splitlines() lines = raw_output.splitlines()
metric = ''
cmd_config = json.loads(self._commands[cmd_idx].split('--config_json')[-1].replace(' ', '')[1:-1])
for key in sorted(cmd_config.keys()):
if 'name' in key:
metric = key + '_' + str(cmd_config[key]) + metric
else:
metric = metric + '_' + key + '_' + str(cmd_config[key])
metric = metric.replace(' ', '').replace(',', '_')
error = False error = False
raw_data = [] raw_data = []
for line in lines: for line in lines:
if '[function config]' in line:
metric = ''
metric_json_str = line[line.index('[function config]: ') +
len('[function config]: '):].replace(' ', '').replace(':', '_')[1:-1]
metric_list = metric_json_str.split(',')
for key in metric_list:
if 'name' in key:
metric = key + metric
else:
metric = metric + '_' + key
if '[raw_data]' in line: if '[raw_data]' in line:
raw_data = line[line.index('[raw_data]: ') + len('[raw_data]: '):] raw_data = line[line.index('[raw_data]: ') + len('[raw_data]: '):]
raw_data = raw_data.split(',') raw_data = raw_data.split(',')
......
...@@ -121,7 +121,7 @@ def __all_gather_wrapper(self, x): ...@@ -121,7 +121,7 @@ def __all_gather_wrapper(self, x):
Return: Return:
Tensor after all-gather. Tensor after all-gather.
""" """
output = torch.empty_like([x.shape[0] * self.num_ranks] + list(x.shape[1:])) output = torch.empty([x.shape[0] * self.num_ranks] + list(x.shape[1:]), dtype=x.dtype, device=x.device)
dist.all_gather_into_tensor(output, x) dist.all_gather_into_tensor(output, x)
return output return output
......
...@@ -78,6 +78,13 @@ def add_parser_arguments(self): ...@@ -78,6 +78,13 @@ def add_parser_arguments(self):
required=False, required=False,
help='The number of batch size.', help='The number of batch size.',
) )
self._parser.add_argument(
'--num_workers',
type=int,
default=8,
required=False,
help='Number of subprocesses to use for data loading.',
)
self._parser.add_argument( self._parser.add_argument(
'--precision', '--precision',
type=Precision, type=Precision,
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment