Commit a09f6fda authored by wangkaixiong's avatar wangkaixiong 🚴🏼
Browse files

init

parents
# Minimum CMake version and project name for the roctracer Python bindings.
cmake_minimum_required(VERSION 3.12)
project(roctracer_python)
# C++ standard: require C++17 and disable compiler-specific extensions.
set(CMAKE_CXX_STANDARD 17)
set(CMAKE_CXX_STANDARD_REQUIRED ON)
set(CMAKE_CXX_EXTENSIONS OFF)
# Locate Python 3 (interpreter plus development components); configuration fails if it is missing.
find_package(Python3 REQUIRED COMPONENTS Interpreter Development)
# Dependency locations.
# The pybind11 / HIP / roctracer entries are cache variables: the defaults are
# the paths this project was developed against, and any of them can be
# overridden at configure time with -D<NAME>=<path> instead of editing this file.
set(PYTHON_INCLUDE_DIR ${Python3_INCLUDE_DIRS})
set(PYBIND11_INCLUDE_DIR "/usr/local/lib/python3.10/dist-packages/torch/include"
    CACHE PATH "Directory that contains pybind11/pybind11.h")
# Adjust to match your HIP installation.
set(HIP_INCLUDE_DIR "/opt/rocm/include"
    CACHE PATH "HIP include directory")
set(ROCTRACER_INCLUDE_DIR "/opt/dtk/include"
    CACHE PATH "Directory that contains roctracer/roctx.h")
set(ROCTRACER_LIBRARY_PATH "/opt/dtk/lib/libroctx64.so"
    CACHE FILEPATH "Full path to the roctx shared library")
# Fail early, with a message naming the offending path, when a required header
# or library is missing. The paths are quoted so that an empty value or a path
# containing spaces still forms a single, well-defined if() argument.
if(NOT EXISTS "${PYBIND11_INCLUDE_DIR}/pybind11/pybind11.h")
    message(FATAL_ERROR "pybind11 not found at: ${PYBIND11_INCLUDE_DIR}")
endif()
if(NOT EXISTS "${ROCTRACER_INCLUDE_DIR}/roctracer/roctx.h")
    message(FATAL_ERROR "ROCTracer headers not found at: ${ROCTRACER_INCLUDE_DIR}")
endif()
if(NOT EXISTS "${ROCTRACER_LIBRARY_PATH}")
    message(FATAL_ERROR "ROCTracer library not found at: ${ROCTRACER_LIBRARY_PATH}")
endif()
# Include directories.
# Directory-scoped: these paths are applied to the targets of this
# CMakeLists file. The order below is the header search order.
include_directories(
${CMAKE_CURRENT_SOURCE_DIR}/include
${PYTHON_INCLUDE_DIR}
${PYBIND11_INCLUDE_DIR}
${HIP_INCLUDE_DIR}
${ROCTRACER_INCLUDE_DIR}
)
# C++ wrapper library around the roctx calls.
add_library(roctracer_wrapper SHARED src/roctracer_wrapper.cpp)

# Compile definitions used only while building the wrapper itself.
target_compile_definitions(roctracer_wrapper
    PRIVATE
        __HIP_PLATFORM_AMD__
        ROCTRACER_WRAPPER_EXPORTS
)

# Link roctx. PUBLIC keeps the transitive behaviour of the keyword-less
# signature while making the visibility explicit.
target_link_libraries(roctracer_wrapper
    PUBLIC
        "${ROCTRACER_LIBRARY_PATH}"
)

# Locate the HIP runtime library. Newer ROCm releases ship it as libamdhip64;
# hip_hcc is the legacy name and is kept as a fallback. /opt/dtk/lib is searched
# as well because the roctx library defaults to that prefix.
find_library(HIP_LIBRARY
    NAMES amdhip64 hip_hcc
    PATHS /opt/rocm/lib /opt/dtk/lib
)
if(HIP_LIBRARY)
    target_link_libraries(roctracer_wrapper PUBLIC "${HIP_LIBRARY}")
else()
    # The build continues (as before), but make the missing dependency visible.
    message(WARNING
        "HIP runtime library not found (tried amdhip64, hip_hcc); "
        "roctracer_wrapper will be built without linking it. "
        "Set HIP_LIBRARY to the full library path to override.")
endif()
# Python extension module, built as a loadable MODULE library.
add_library(roctracer_py MODULE python/roctracer_pybind.cpp)

# Output file naming: base name "roctracer_py", no "lib" prefix, ".so" suffix.
set_property(TARGET roctracer_py PROPERTY OUTPUT_NAME "roctracer_py")
set_property(TARGET roctracer_py PROPERTY PREFIX "")
set_property(TARGET roctracer_py PROPERTY SUFFIX ".so")

# Compile definitions for the module sources only.
target_compile_definitions(roctracer_py PRIVATE __HIP_PLATFORM_AMD__ PYBIND11_MODULE_EXPORTS)

# Link against the wrapper, roctx and the Python libraries.
target_link_libraries(roctracer_py roctracer_wrapper ${ROCTRACER_LIBRARY_PATH} ${Python3_LIBRARIES})

# The HIP library is optional: link it only when find_library located it.
if(HIP_LIBRARY)
    target_link_libraries(roctracer_py ${HIP_LIBRARY})
endif()
# Ask the Python interpreter for its extension-module suffix
# (EXT_SUFFIX, then the older SO variable, then plain ".so").
# The snippet is a single line so it does not depend on indentation surviving
# inside this file, and the exit status is checked so a failed query cannot
# leave the module with an empty suffix.
execute_process(
    COMMAND "${Python3_EXECUTABLE}" -c
        "import sysconfig; print(sysconfig.get_config_var('EXT_SUFFIX') or sysconfig.get_config_var('SO') or '.so')"
    RESULT_VARIABLE PYTHON_MODULE_EXTENSION_RESULT
    OUTPUT_VARIABLE PYTHON_MODULE_EXTENSION
    OUTPUT_STRIP_TRAILING_WHITESPACE
)
if(NOT PYTHON_MODULE_EXTENSION_RESULT EQUAL 0 OR "${PYTHON_MODULE_EXTENSION}" STREQUAL "")
    message(WARNING
        "Could not query the Python extension suffix "
        "(result: ${PYTHON_MODULE_EXTENSION_RESULT}); falling back to '.so'")
    set(PYTHON_MODULE_EXTENSION ".so")
endif()
message(STATUS "Python module extension: ${PYTHON_MODULE_EXTENSION}")
set_target_properties(roctracer_py PROPERTIES SUFFIX "${PYTHON_MODULE_EXTENSION}")
# Install the extension module together with the wrapper library it links to.
# Both go into the same site-packages directory, and the module gets an
# $ORIGIN install RPATH so the dynamic loader finds libroctracer_wrapper.so
# next to it (CMake drops the build-tree RPATH on install).
install(TARGETS roctracer_py roctracer_wrapper
    LIBRARY DESTINATION lib/python${Python3_VERSION_MAJOR}.${Python3_VERSION_MINOR}/site-packages
)
set_target_properties(roctracer_py PROPERTIES INSTALL_RPATH "$ORIGIN")
\ No newline at end of file
# 工程介绍
roctracer_python 基于 DTK 下的 hiptx(即 roctracer)进行了 Python 封装,支持在 Python 代码中对指定代码段进行性能标记,可以极大限度地提升性能分析的效率。
tests 目录下存放了测试代码,可以作为参考。
使用效果:
![show](./images/profiling.png)
roctracer_python/tests/test_roctracer.py 中进行了2处mark,在图中第一行清晰地展示出两个mark开始与结束。
- Memory Copy
- Model infer
编译使用方法:
```bash
mkdir build && cd build
cmake ..
make -j8
make install
cd ..
python setup.py install
```
测试方法:
编译安装完成后,进行测试;
```bash
cd tests
hipprof --hip-trace --hiptx-hiptrace python3 test_roctracer.py
```
\ No newline at end of file
#ifndef ROCTRACER_WRAPPER_H
#define ROCTRACER_WRAPPER_H
#include <string>
#include <memory>
// Wrapper class exposing marker and range operations taking std::string labels.
// The implementation is hidden behind a forward-declared Impl held by
// std::unique_ptr, so this header pulls in only <string> and <memory>.
class RoctracerWrapper {
public:
RoctracerWrapper();
// Declared here, defined out of line: Impl is an incomplete type in this header.
~RoctracerWrapper();
// Marker: record a single marker labelled `message`.
void mark(const std::string& message);
// Range operations.
// Start a range labelled `message`; the returned id is the value rangeStop() takes.
// NOTE(review): the id is an int here; roctx's own range id type is presumably
// wider (uint64_t) — confirm that narrowing is acceptable.
int rangeStart(const std::string& message);
// Push a range labelled `message`; presumably closed by the next rangePop().
void rangePush(const std::string& message);
// Pop a range; takes no label or id.
void rangePop();
// Stop the range identified by `id`.
void rangeStop(int id);
private:
// Implementation type; only declared in this header.
class Impl;
std::unique_ptr<Impl> impl;
};
#endif // ROCTRACER_WRAPPER_H
\ No newline at end of file
#include <pybind11/pybind11.h>
#include <pybind11/stl.h>
#include <pybind11/numpy.h>
#include "roctracer_wrapper.h"
namespace py = pybind11;
PYBIND11_MODULE(roctracer_py, m) {
    // Module-level docstring.
    m.doc() = "Python bindings for ROCtracer with HIP support";

    // Expose RoctracerWrapper with a default constructor; each C++ camelCase
    // method is registered under a snake_case Python name.
    py::class_<RoctracerWrapper> wrapperClass(m, "RoctracerWrapper");
    wrapperClass.def(py::init<>());
    wrapperClass.def("mark", &RoctracerWrapper::mark, "Add a marker to the trace");
    wrapperClass.def("range_start", &RoctracerWrapper::rangeStart, "Start a range");
    wrapperClass.def("range_push", &RoctracerWrapper::rangePush, "Push a range");
    wrapperClass.def("range_pop", &RoctracerWrapper::rangePop, "Pop a range");
    wrapperClass.def("range_stop", &RoctracerWrapper::rangeStop, "Stop a range");
}
\ No newline at end of file
from setuptools import setup, find_packages
import os
import sys
import shutil
import sysconfig  # only this block needs it: derives the extension-module file suffix

# Package directory that setup() ships.
package_dir = 'roctracer_py'
os.makedirs(package_dir, exist_ok=True)

# Suffix of extension modules for the running interpreter
# (e.g. ".cpython-310-x86_64-linux-gnu.so"), instead of a hard-coded
# CPython 3.10 / x86_64 file name.
ext_suffix = sysconfig.get_config_var('EXT_SUFFIX') or '.so'

# Generate __init__.py once; an existing file is deliberately left untouched.
# The template contains non-ASCII text, so the encoding is explicit rather
# than depending on the locale.
init_py = os.path.join(package_dir, '__init__.py')
if not os.path.exists(init_py):
    with open(init_py, 'w', encoding='utf-8') as f:
        f.write('''"""
ROCtracer Python Bindings
"""
import os
import ctypes

_pkg_dir = os.path.dirname(__file__)

# Load the wrapper library first so the extension module can resolve its symbols.
lib_path = os.path.join(_pkg_dir, 'libroctracer_wrapper.so')
if os.path.exists(lib_path):
    ctypes.CDLL(lib_path, mode=ctypes.RTLD_GLOBAL)

# File name under which setup.py places the extension module in this package.
module_path = os.path.join(_pkg_dir, 'roctracer_py.so')

try:
    from .roctracer_py import *
    __all__ = ['roctracer_py']
except ImportError as e:
    print(f"警告: 无法导入 roctracer_py: {e}")
    print(f"确保 .so 文件在: {module_path}")

__version__ = "0.1.0"
''')

# Locate the extension module produced by the CMake build: first under the
# interpreter-specific name, then under a plain ".so" name.
module_candidates = [
    os.path.join('build', 'roctracer_py' + ext_suffix),
    os.path.join('build', 'roctracer_py.so'),
]
built_module = next(
    (path for path in module_candidates if os.path.exists(path)),
    module_candidates[0],
)

# Copy the built shared objects into the package directory.
so_files_to_copy = [
    (built_module,
     os.path.join(package_dir, 'roctracer_py.so')),
    (os.path.join('build', 'libroctracer_wrapper.so'),
     os.path.join(package_dir, 'libroctracer_wrapper.so')),
]
for src, dst in so_files_to_copy:
    if os.path.exists(src):
        # Always refresh, so a rebuilt library replaces a stale packaged copy.
        print(f"复制 {src} -> {dst}")
        shutil.copy2(src, dst)
    elif not os.path.exists(dst):
        # Nothing to package for this entry: say so instead of silently
        # producing a package without its binary.
        print(f"warning: {src} not found; run the CMake build first", file=sys.stderr)

setup(
    name="roctracer_py",
    version="0.1.0",
    packages=[package_dir],
    include_package_data=True,
    package_data={
        package_dir: ['*.so', '*.py'],
    },
    install_requires=[],
)
\ No newline at end of file
#include "roctracer_wrapper.h"
#include "hip/hip_runtime.h"
#include "roctracer/roctx.h"
#include <iostream>
#include <sstream>
// Private implementation of RoctracerWrapper: each method forwards to the
// roctx call of the same name, passing the label as a C string.
class RoctracerWrapper::Impl {
public:
    // Initializes HIP. A failure is reported on stderr and otherwise ignored,
    // so construction always completes.
    Impl() {
        const hipError_t status = hipInit(0);
        if (status == hipSuccess) {
            return;
        }
        std::cerr << "Failed to initialize HIP: " << hipGetErrorString(status) << std::endl;
    }

    // Nothing to release.
    ~Impl() {}

    // Record a marker labelled `message`.
    void mark(const std::string& message) {
        const char* label = message.c_str();
        roctxMark(label);
    }

    // Start a range labelled `message` and return its id converted to int.
    int rangeStart(const std::string& message) {
        const char* label = message.c_str();
        const int rangeId = roctxRangeStart(label);
        return rangeId;
    }

    // Push a range labelled `message`.
    void rangePush(const std::string& message) {
        const char* label = message.c_str();
        roctxRangePush(label);
    }

    // Pop a range.
    void rangePop() {
        roctxRangePop();
    }

    // Stop the range identified by `id`.
    void rangeStop(int id) {
        roctxRangeStop(id);
    }
};
RoctracerWrapper::RoctracerWrapper() : impl(std::make_unique<Impl>()) {}
RoctracerWrapper::~RoctracerWrapper() = default;
void RoctracerWrapper::mark(const std::string& message) { impl->mark(message); }
int RoctracerWrapper::rangeStart(const std::string& message) { return impl->rangeStart(message); }
void RoctracerWrapper::rangePush(const std::string& message) { impl->rangePush(message); }
void RoctracerWrapper::rangePop() { impl->rangePop(); }
void RoctracerWrapper::rangeStop(int id) { impl->rangeStop(id); }
\ No newline at end of file
# Remove generated *.json, *.db and *.csv files from the current directory.
# -f: a pattern that matches nothing is not an error.
# ./ prefix: a file name starting with '-' cannot be mistaken for an option.
rm -f ./*.json ./*.db ./*.csv
#include <iostream>
#include <vector>
#include "hip/hip_runtime.h"
#include "roctracer/roctx.h"
#include <chrono>
#include <thread>
// 定义矩阵大小和线程块配置
#define WIDTH 1024
#define NUM (WIDTH * WIDTH)
#define THREADS_PER_BLOCK_X 32
#define THREADS_PER_BLOCK_Y 32
// Matrix transpose kernel for a square, row-major width x width matrix:
//   transposedMatrix[r * width + c] = matrix[c * width + r]
//
// Each thread block stages one tile in shared memory and writes it back
// transposed. The block must be square (blockDim.x == blockDim.y) and no
// larger than THREADS_PER_BLOCK_X x THREADS_PER_BLOCK_Y.
__global__ __launch_bounds__(1024) void matrixTranspose(float* transposedMatrix, const float* matrix, int width) {
    // +1 column of padding avoids shared-memory bank conflicts.
    __shared__ float tile[THREADS_PER_BLOCK_Y][THREADS_PER_BLOCK_X + 1];

    // Load: global memory -> shared memory, in source order.
    const int srcCol = blockIdx.x * blockDim.x + threadIdx.x;
    const int srcRow = blockIdx.y * blockDim.y + threadIdx.y;
    if (srcCol < width && srcRow < width) {
        tile[threadIdx.y][threadIdx.x] = matrix[srcRow * width + srcCol];
    }

    // The barrier is outside any bounds check so that every thread of the
    // block reaches it, including threads that fall outside the matrix.
    __syncthreads();

    // Store: the tile's block coordinates are swapped in the output, and each
    // thread writes the element loaded by its mirror thread (threadIdx.x and
    // threadIdx.y exchanged).
    const int dstCol = blockIdx.y * blockDim.y + threadIdx.x;
    const int dstRow = blockIdx.x * blockDim.x + threadIdx.y;
    if (dstCol < width && dstRow < width) {
        transposedMatrix[dstRow * width + dstCol] = tile[threadIdx.x][threadIdx.y];
    }
}
int main() {
// 主机端矩阵
std::vector<float> Matrix(NUM);
std::vector<float> TransposeMatrix(NUM);
// 初始化矩阵
for (int i = 0; i < WIDTH; ++i) {
for (int j = 0; j < WIDTH; ++j) {
Matrix[i * WIDTH + j] = i * WIDTH + j;
}
}
// 设备端矩阵
float *gpuMatrix = nullptr;
float *gpuTransposeMatrix = nullptr;
hipMalloc(&gpuMatrix, NUM * sizeof(float));
// 分配设备内存
hipMalloc(&gpuTransposeMatrix, NUM * sizeof(float));
// 初始化ROCTx
roctxMark("Program start");
int rangeId = roctxRangeStart("hipLaunchKernel range");
// 内存从主机传输到设备
hipMemcpy(gpuMatrix, Matrix.data(), NUM * sizeof(float), hipMemcpyHostToDevice);
roctxRangePush("hipMemcpy HostToDevice");
// 设置内核启动配置
dim3 gridSize(WIDTH / THREADS_PER_BLOCK_X, WIDTH / THREADS_PER_BLOCK_Y);
dim3 blockSize(THREADS_PER_BLOCK_X, THREADS_PER_BLOCK_Y);
for (int i = 0; i < 100; ++i){
std::this_thread::sleep_for(std::chrono::milliseconds(5));
// 启动转置内核
matrixTranspose<<<gridSize, blockSize>>>(gpuTransposeMatrix, gpuMatrix, WIDTH);
hipDeviceSynchronize();
}
// 检查内核启动错误
hipError_t kernelLaunchError = hipGetLastError();
if (kernelLaunchError != hipSuccess) {
std::cerr << "Kernel launch failed: " << hipGetErrorString(kernelLaunchError) << std::endl;
return -1;
}
roctxMark("after hipLaunchKernel");
// 等待内核执行完成
hipDeviceSynchronize();
// 检查内核执行错误
hipError_t kernelExecutionError = hipGetLastError();
if (kernelExecutionError != hipSuccess) {
std::cerr << "Kernel execution failed: " << hipGetErrorString(kernelExecutionError) << std::endl;
return -1;
}
// 内存从设备传输到主机
roctxRangePush("hipMemcpy DeviceToHost");
hipMemcpy(TransposeMatrix.data(), gpuTransposeMatrix, NUM * sizeof(float), hipMemcpyDeviceToHost);
roctxRangePop(); // for "hipMemcpy"
roctxRangePop(); // for "hipLaunchKernel"
roctxRangeStop(rangeId);
bool correct = true;
for (int i = 0; i < WIDTH && correct; ++i) {
for (int j = 0; j < WIDTH; ++j) {
if (TransposeMatrix[i * WIDTH + j] != Matrix[j * WIDTH + i]) {
std::cerr << "Mismatch at (" << i << ", " << j << "): "
<< TransposeMatrix[i * WIDTH + j] << " != " << Matrix[j * WIDTH + i] << std::endl;
correct = false;
break;
}
}
}
if (correct) {
std::cout << "Matrix transpose completed successfully!" << std::endl;
}
// 释放设备内存
hipFree(gpuMatrix);
hipFree(gpuTransposeMatrix);
return 0;
}
import roctracer_py
import numpy as np
import torch
from torchvision import models, transforms
# Create the tracer wrapper.
tracer = roctracer_py.RoctracerWrapper()

# Single marker at program start.
tracer.mark("Program start")

# Outer range around the whole computation; try/finally guarantees that it is
# stopped even if model creation or inference raises.
range_id = tracer.range_start("Computation")
try:
    device = torch.device("cuda:0")  # GPU 0

    # "Memory Copy": build the model and move its weights to the GPU.
    tracer.range_push("Memory Copy")
    try:
        model = models.resnet50(pretrained=False)
        model.to(device)
        # GPU work may be asynchronous; wait so the range covers it.
        torch.cuda.synchronize(device)
    finally:
        tracer.range_pop()

    # Dummy input batch: one 3x224x224 image of zeros.
    input_tensor = torch.zeros((1, 3, 224, 224), dtype=torch.float).to(device)

    # "Model infer": a single forward pass without gradient tracking.
    tracer.range_push("Model infer")
    try:
        with torch.no_grad():
            outputs = model(input_tensor)
        # Wait for the kernels to finish before closing the range.
        torch.cuda.synchronize(device)
    finally:
        tracer.range_pop()
finally:
    # Stop the outer range.
    tracer.range_stop(range_id)

print("Done!")
\ No newline at end of file
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment