Commit 8f7de847 authored by yuguo960516yuguo's avatar yuguo960516yuguo
Browse files

dtk

parent f262efc9
Pipeline #248 failed with stages
in 0 seconds
# OneFlow

OneFlow is a deep learning framework designed to be **user-friendly, scalable and efficient**. With OneFlow, it is easy to:

- program a model with **PyTorch-like API**
- scale a model to n-dimensional-parallel/distributed execution with the **Global View API**
- accelerate/deploy a model with the **Static Graph Compiler**.
[![Simple CI](https://github.com/Oneflow-Inc/oneflow/actions/workflows/simple.yml/badge.svg)](https://github.com/Oneflow-Inc/oneflow/actions/workflows/simple.yml) [![Simple CI](https://github.com/Oneflow-Inc/oneflow/actions/workflows/simple.yml/badge.svg)](https://github.com/Oneflow-Inc/oneflow/actions/workflows/simple.yml)
[![Nightly Docker Image](https://github.com/Oneflow-Inc/docker-images/actions/workflows/oneflow-nightly.yml/badge.svg)](https://github.com/Oneflow-Inc/docker-images/actions/workflows/oneflow-nightly.yml) [![Nightly Docker Image](https://github.com/Oneflow-Inc/docker-images/actions/workflows/oneflow-nightly.yml/badge.svg)](https://github.com/Oneflow-Inc/docker-images/actions/workflows/oneflow-nightly.yml)
...@@ -12,8 +9,10 @@ OneFlow is a deep learning framework designed to be **user-friendly, scalable an ...@@ -12,8 +9,10 @@ OneFlow is a deep learning framework designed to be **user-friendly, scalable an
## Latest News

- Version 0.8.0 is out!
  - [Full changelog](https://github.com/Oneflow-Inc/oneflow/releases/tag/v0.8.0)
## Publication ## Publication
...@@ -36,7 +35,7 @@ OneFlow is a deep learning framework designed to be **user-friendly, scalable an ...@@ -36,7 +35,7 @@ OneFlow is a deep learning framework designed to be **user-friendly, scalable an
### System Requirements ### System Requirements
- Linux. As for now, there is no pre-built release for macOS, Windows. - Linux. As for now, there is no pre-built release for macOS, Windows.
- Python 3.7, 3.8, 3.9, 3.10 - Python 3.6, 3.7, 3.8, 3.9, 3.10
- (**Highly recommended**) Upgrade pip - (**Highly recommended**) Upgrade pip
``` ```
...@@ -54,7 +53,7 @@ OneFlow is a deep learning framework designed to be **user-friendly, scalable an ...@@ -54,7 +53,7 @@ OneFlow is a deep learning framework designed to be **user-friendly, scalable an
- To install latest stable release of OneFlow with CUDA support: - To install latest stable release of OneFlow with CUDA support:
```bash ```bash
python3 -m pip install oneflow python3 -m pip install -f https://release.oneflow.info oneflow==0.7.0+cu102
``` ```
- To install nightly release of OneFlow with CUDA support: - To install nightly release of OneFlow with CUDA support:
...@@ -67,7 +66,7 @@ OneFlow is a deep learning framework designed to be **user-friendly, scalable an ...@@ -67,7 +66,7 @@ OneFlow is a deep learning framework designed to be **user-friendly, scalable an
- Stable - Stable
```bash ```bash
python3 -m pip install --find-links https://release.oneflow.info oneflow==0.8.0+[PLATFORM] python3 -m pip install --find-links https://release.oneflow.info oneflow==0.7.0+[PLATFORM]
``` ```
- Nightly - Nightly
``` ```
......
# Monkey patch to not ship libjvm.so in pypi wheels
import sys
from auditwheel.main import main
from auditwheel.policy import _POLICIES as POLICIES
# libjvm is loaded dynamically; do not include it
for p in POLICIES:
p['lib_whitelist'].append('librccl.so.1')
p['lib_whitelist'].append('libhipblas.so.0')
p['lib_whitelist'].append('libhiprand.so.1')
p['lib_whitelist'].append('librocrand.so.1')
p['lib_whitelist'].append('libMIOpen.so.1')
p['lib_whitelist'].append('libgalaxyhip.so.4')
p['lib_whitelist'].append('librocm_smi64.so.2')
p['lib_whitelist'].append('librocsolver.so.0 ')
p['lib_whitelist'].append('librocblas.so.0')
if __name__ == "__main__":
sys.exit(main())
# Monkey patch to not ship libjvm.so in pypi wheels
import sys
from auditwheel.main import main
from auditwheel.policy import _POLICIES as POLICIES
# libjvm is loaded dynamically; do not include it
for p in POLICIES:
p['lib_whitelist'].append('librccl.so.1')
p['lib_whitelist'].append('libhipblas.so.0')
p['lib_whitelist'].append('libhiprand.so.1')
p['lib_whitelist'].append('librocrand.so.1')
p['lib_whitelist'].append('libMIOpen.so.1')
p['lib_whitelist'].append('libgalaxyhip.so.5')
p['lib_whitelist'].append('librocm_smi64.so.2')
p['lib_whitelist'].append('librocsolver.so.0 ')
p['lib_whitelist'].append('librocblas.so.0')
if __name__ == "__main__":
sys.exit(main())
...@@ -328,6 +328,17 @@ if(BUILD_PYTHON OR BUILD_CPP_API) ...@@ -328,6 +328,17 @@ if(BUILD_PYTHON OR BUILD_CPP_API)
endif() endif()
endif() endif()
if (BUILD_ROCM)
# AMD compiler fails to compile these three files with '-O1/2/3'.
# The value of `COMPILE_OPTIONS` target property is added after CMAKE_<LANG>_FLAGS_<CONFIG>,
# so '-O0' will override '-O1/2/3'.
set_source_files_properties(${PROJECT_SOURCE_DIR}/oneflow/user/kernels/median_with_indices_kernel.hip.cpp
${PROJECT_SOURCE_DIR}/oneflow/user/kernels/radix_sort_top_k_kernel.hip.cpp
${PROJECT_SOURCE_DIR}/oneflow/user/kernels/arg_sort_kernel.hip.cpp
# ${PROJECT_SOURCE_DIR}/oneflow/core/ep/cuda/primitive/broadcast_elementwise_binary_math.hip.cpp
PROPERTIES COMPILE_OPTIONS "-O0")
endif()
if(BUILD_PYTHON) if(BUILD_PYTHON)
# py ext lib # py ext lib
......
This diff is collapsed.
/* /*
Copyright 2020 The OneFlow Authors. All rights reserved. Copyright 2020 The OneFlow Authors. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License"); Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License. you may not use this file except in compliance with the License.
You may obtain a copy of the License at You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0 http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS, distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. limitations under the License.
*/ */
#ifndef ONEFLOW_CORE_EMBEDDING_HASH_FUNCTION_HIP_H_ #ifndef ONEFLOW_CORE_EMBEDDING_HASH_FUNCTION_HIP_H_
#define ONEFLOW_CORE_EMBEDDING_HASH_FUNCTION_HIP_H_ #define ONEFLOW_CORE_EMBEDDING_HASH_FUNCTION_HIP_H_
#include <stdint.h> #include <stdint.h>
#include "oneflow/core/common/data_type.h" #include "oneflow/core/common/data_type.h"
namespace oneflow { namespace oneflow {
namespace embedding { namespace embedding {
namespace { namespace {
// From https://github.com/Cyan4973/xxHash/blob/dev/xxhash.h // From https://github.com/Cyan4973/xxHash/blob/dev/xxhash.h
static const uint64_t PRIME64_1 = static const uint64_t PRIME64_1 =
0x9E3779B185EBCA87ULL; // 0b1001111000110111011110011011000110000101111010111100101010000111 0x9E3779B185EBCA87ULL; // 0b1001111000110111011110011011000110000101111010111100101010000111
static const uint64_t PRIME64_2 = static const uint64_t PRIME64_2 =
0xC2B2AE3D27D4EB4FULL; // 0b1100001010110010101011100011110100100111110101001110101101001111 0xC2B2AE3D27D4EB4FULL; // 0b1100001010110010101011100011110100100111110101001110101101001111
static const uint64_t PRIME64_3 = static const uint64_t PRIME64_3 =
0x165667B19E3779F9ULL; // 0b0001011001010110011001111011000110011110001101110111100111111001 0x165667B19E3779F9ULL; // 0b0001011001010110011001111011000110011110001101110111100111111001
static const uint64_t PRIME64_4 = static const uint64_t PRIME64_4 =
0x85EBCA77C2B2AE63ULL; // 0b1000010111101011110010100111011111000010101100101010111001100011 0x85EBCA77C2B2AE63ULL; // 0b1000010111101011110010100111011111000010101100101010111001100011
static const uint64_t PRIME64_5 = static const uint64_t PRIME64_5 =
0x27D4EB2F165667C5ULL; // 0b0010011111010100111010110010111100010110010101100110011111000101 0x27D4EB2F165667C5ULL; // 0b0010011111010100111010110010111100010110010101100110011111000101
#define XXH_rotl64(x, r) (((x) << (r)) | ((x) >> (64 - (r)))) #define XXH_rotl64(x, r) (((x) << (r)) | ((x) >> (64 - (r))))
OF_DEVICE_FUNC uint64_t XXH64_round(uint64_t acc, uint64_t input) { OF_DEVICE_FUNC uint64_t XXH64_round(uint64_t acc, uint64_t input) {
acc += input * PRIME64_2; acc += input * PRIME64_2;
acc = XXH_rotl64(acc, 31); acc = XXH_rotl64(acc, 31);
acc *= PRIME64_1; acc *= PRIME64_1;
return acc; return acc;
} }
OF_DEVICE_FUNC uint64_t xxh64_uint64(uint64_t v, uint64_t seed) { OF_DEVICE_FUNC uint64_t xxh64_uint64(uint64_t v, uint64_t seed) {
uint64_t acc = seed + PRIME64_5; uint64_t acc = seed + PRIME64_5;
acc += sizeof(uint64_t); acc += sizeof(uint64_t);
acc = acc ^ XXH64_round(0, v); acc = acc ^ XXH64_round(0, v);
acc = XXH_rotl64(acc, 27) * PRIME64_1; acc = XXH_rotl64(acc, 27) * PRIME64_1;
acc = acc + PRIME64_4; acc = acc + PRIME64_4;
acc ^= (acc >> 33); acc ^= (acc >> 33);
acc = acc * PRIME64_2; acc = acc * PRIME64_2;
acc = acc ^ (acc >> 29); acc = acc ^ (acc >> 29);
acc = acc * PRIME64_3; acc = acc * PRIME64_3;
acc = acc ^ (acc >> 32); acc = acc ^ (acc >> 32);
return acc; return acc;
} }
static const size_t kShardingHashSeed = 1; static const size_t kShardingHashSeed = 1;
static const size_t kLocalUniqueHashSeed = 2; static const size_t kLocalUniqueHashSeed = 2;
static const size_t kGlobalUniqueHashSeed = 3; static const size_t kGlobalUniqueHashSeed = 3;
static const size_t kFullCacheHashSeed = 4; static const size_t kFullCacheHashSeed = 4;
static const size_t kLruCacheHashSeed = 5; static const size_t kLruCacheHashSeed = 5;
} // namespace } // namespace
struct ShardingHash { struct ShardingHash {
OF_DEVICE_FUNC size_t operator()(uint64_t v) { return xxh64_uint64(v, kShardingHashSeed); } OF_DEVICE_FUNC size_t operator()(uint64_t v) { return xxh64_uint64(v, kShardingHashSeed); }
OF_DEVICE_FUNC size_t operator()(uint32_t v) { return xxh64_uint64(v, kShardingHashSeed); } OF_DEVICE_FUNC size_t operator()(uint32_t v) { return xxh64_uint64(v, kShardingHashSeed); }
OF_DEVICE_FUNC size_t operator()(int32_t v) { OF_DEVICE_FUNC size_t operator()(int32_t v) {
return xxh64_uint64(static_cast<uint32_t>(v), kShardingHashSeed); return xxh64_uint64(static_cast<uint32_t>(v), kShardingHashSeed);
} }
OF_DEVICE_FUNC size_t operator()(int64_t v) { OF_DEVICE_FUNC size_t operator()(int64_t v) {
return xxh64_uint64(static_cast<uint64_t>(v), kShardingHashSeed); return xxh64_uint64(static_cast<uint64_t>(v), kShardingHashSeed);
} }
}; };
struct LocalUniqueHash { struct LocalUniqueHash {
OF_DEVICE_FUNC size_t operator()(uint64_t v) { return xxh64_uint64(v, kLocalUniqueHashSeed); } OF_DEVICE_FUNC size_t operator()(uint64_t v) { return xxh64_uint64(v, kLocalUniqueHashSeed); }
}; };
struct GlobalUniqueHash { struct GlobalUniqueHash {
OF_DEVICE_FUNC size_t operator()(uint64_t v) { return xxh64_uint64(v, kGlobalUniqueHashSeed); } OF_DEVICE_FUNC size_t operator()(uint64_t v) { return xxh64_uint64(v, kGlobalUniqueHashSeed); }
}; };
struct FullCacheHash { struct FullCacheHash {
OF_DEVICE_FUNC size_t operator()(uint64_t v) { return xxh64_uint64(v, kFullCacheHashSeed); } OF_DEVICE_FUNC size_t operator()(uint64_t v) { return xxh64_uint64(v, kFullCacheHashSeed); }
}; };
struct LruCacheHash { struct LruCacheHash {
OF_DEVICE_FUNC size_t operator()(uint64_t v) { return xxh64_uint64(v, kLruCacheHashSeed); } OF_DEVICE_FUNC size_t operator()(uint64_t v) { return xxh64_uint64(v, kLruCacheHashSeed); }
}; };
} // namespace embedding } // namespace embedding
} // namespace oneflow } // namespace oneflow
#endif // ONEFLOW_CORE_EMBEDDING_HASH_FUNCTION_HIP_H_ #endif // ONEFLOW_CORE_EMBEDDING_HASH_FUNCTION_HIP_H_
\ No newline at end of file
This diff is collapsed.
/* /*
Copyright 2020 The OneFlow Authors. All rights reserved. Copyright 2020 The OneFlow Authors. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License"); Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License. you may not use this file except in compliance with the License.
You may obtain a copy of the License at You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0 http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS, distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. limitations under the License.
*/ */
#include "oneflow/core/ep/rocm/cuda_device.h" #include "oneflow/core/ep/rocm/cuda_device.h"
#include "oneflow/core/ep/rocm/cuda_event.h" #include "oneflow/core/ep/rocm/cuda_event.h"
#include "oneflow/core/ep/rocm/cuda_stream.h" #include "oneflow/core/ep/rocm/cuda_stream.h"
#ifdef WITH_ROCM #ifdef WITH_ROCM
#include <hip/hip_runtime.h> #include <hip/hip_runtime.h>
#include <hip/hip_fp16.h> #include <hip/hip_fp16.h>
// #if CUDA_VERSION >= 11000 // #if CUDA_VERSION >= 11000
// #include <cuda_bf16.h> // #include <cuda_bf16.h>
// #endif // #endif
namespace oneflow { namespace oneflow {
namespace ep { namespace ep {
namespace { namespace {
constexpr size_t kDefaultConstBufElementCount = 1024 * 1024; constexpr size_t kDefaultConstBufElementCount = 1024 * 1024;
template<typename T> template<typename T>
void CreateConstBuffer(void** buf, T value, size_t n) { void CreateConstBuffer(void** buf, T value, size_t n) {
OF_CUDA_CHECK(hipMalloc(buf, n * sizeof(T))); OF_CUDA_CHECK(hipMalloc(buf, n * sizeof(T)));
std::vector<T> host(n, value); std::vector<T> host(n, value);
OF_CUDA_CHECK(hipMemcpy(*buf, host.data(), n * sizeof(T), hipMemcpyDefault)); OF_CUDA_CHECK(hipMemcpy(*buf, host.data(), n * sizeof(T), hipMemcpyDefault));
} }
} // namespace } // namespace
CudaDevice::CudaDevice(int device_index, DeviceManager* device_manager) CudaDevice::CudaDevice(int device_index, DeviceManager* device_manager)
: device_index_(device_index), : device_index_(device_index),
event_flags_{}, event_flags_{},
properties_{}, properties_{},
device_manager_(device_manager), device_manager_(device_manager),
const_buf_elem_cnt_(0), const_buf_elem_cnt_(0),
const_zeros_buffer_(nullptr), const_zeros_buffer_(nullptr),
const_ones_buffer_fp32_(nullptr), const_ones_buffer_fp32_(nullptr),
const_ones_buffer_fp16_(nullptr), const_ones_buffer_fp16_(nullptr),
const_ones_buffer_bf16_(nullptr) { const_ones_buffer_bf16_(nullptr) {
CudaCurrentDeviceGuard guard(device_index_); CudaCurrentDeviceGuard guard(device_index_);
OF_CUDA_CHECK(hipGetDeviceProperties(&properties_, device_index_)); OF_CUDA_CHECK(hipGetDeviceProperties(&properties_, device_index_));
event_flags_ = hipEventDisableTiming; event_flags_ = hipEventDisableTiming;
if (ParseBooleanFromEnv("ONEFLOW_STREAM_CUDA_EVENT_FLAG_BLOCKING_SYNC", false)) { if (ParseBooleanFromEnv("ONEFLOW_STREAM_CUDA_EVENT_FLAG_BLOCKING_SYNC", false)) {
event_flags_ |= hipEventBlockingSync; event_flags_ |= hipEventBlockingSync;
} }
const_buf_elem_cnt_ = ParseIntegerFromEnv("ONEFLOW_EP_CUDA_CONST_BUFFER_ELEMENT_COUNT", const_buf_elem_cnt_ = ParseIntegerFromEnv("ONEFLOW_EP_CUDA_CONST_BUFFER_ELEMENT_COUNT",
kDefaultConstBufElementCount); kDefaultConstBufElementCount);
if (const_buf_elem_cnt_ > 0) { if (const_buf_elem_cnt_ > 0) {
CreateConstBuffer<float>(&const_zeros_buffer_, static_cast<float>(0), const_buf_elem_cnt_); CreateConstBuffer<float>(&const_zeros_buffer_, static_cast<float>(0), const_buf_elem_cnt_);
CreateConstBuffer<float>(&const_ones_buffer_fp32_, static_cast<float>(1.0), CreateConstBuffer<float>(&const_ones_buffer_fp32_, static_cast<float>(1.0),
const_buf_elem_cnt_); const_buf_elem_cnt_);
CreateConstBuffer<half>(&const_ones_buffer_fp16_, static_cast<half>(1.0), const_buf_elem_cnt_); CreateConstBuffer<half>(&const_ones_buffer_fp16_, static_cast<half>(1.0), const_buf_elem_cnt_);
// #if CUDA_VERSION >= 11000 // #if CUDA_VERSION >= 11000
// CreateConstBuffer<nv_bfloat16>(&const_ones_buffer_bf16_, static_cast<nv_bfloat16>(1.0), // CreateConstBuffer<nv_bfloat16>(&const_ones_buffer_bf16_, static_cast<nv_bfloat16>(1.0),
// const_buf_elem_cnt_); // const_buf_elem_cnt_);
// #endif // #endif
} }
} }
CudaDevice::~CudaDevice() { CudaDevice::~CudaDevice() {
CudaCurrentDeviceGuard guard(device_index_); CudaCurrentDeviceGuard guard(device_index_);
for (auto* event : events_) { delete event; } for (auto* event : events_) { delete event; }
OF_CUDA_CHECK(hipFree(const_zeros_buffer_)); OF_CUDA_CHECK(hipFree(const_zeros_buffer_));
OF_CUDA_CHECK(hipFree(const_ones_buffer_fp32_)); OF_CUDA_CHECK(hipFree(const_ones_buffer_fp32_));
OF_CUDA_CHECK(hipFree(const_ones_buffer_fp16_)); OF_CUDA_CHECK(hipFree(const_ones_buffer_fp16_));
OF_CUDA_CHECK(hipFree(const_ones_buffer_bf16_)); OF_CUDA_CHECK(hipFree(const_ones_buffer_bf16_));
} }
void CudaDevice::SetAsActiveDevice() { OF_CUDA_CHECK(hipSetDevice(device_index_)); } void CudaDevice::SetAsActiveDevice() { OF_CUDA_CHECK(hipSetDevice(device_index_)); }
Stream* CudaDevice::CreateStream() { Stream* CudaDevice::CreateStream() {
CudaCurrentDeviceGuard guard(device_index_); CudaCurrentDeviceGuard guard(device_index_);
return new CudaStream(this); return new CudaStream(this);
} }
void CudaDevice::DestroyStream(Stream* stream) { void CudaDevice::DestroyStream(Stream* stream) {
CudaCurrentDeviceGuard guard(device_index_); CudaCurrentDeviceGuard guard(device_index_);
delete stream; delete stream;
} }
void CudaDevice::CreateEvents(Event** events, size_t count) { void CudaDevice::CreateEvents(Event** events, size_t count) {
size_t copied = 0; size_t copied = 0;
{ {
std::lock_guard<std::mutex> lock(events_mutex_); std::lock_guard<std::mutex> lock(events_mutex_);
copied = std::min(count, events_.size()); copied = std::min(count, events_.size());
size_t offset = events_.size() - copied; size_t offset = events_.size() - copied;
std::copy(events_.begin() + offset, events_.end(), events); std::copy(events_.begin() + offset, events_.end(), events);
events_.resize(offset); events_.resize(offset);
} }
if (copied != count) { if (copied != count) {
CudaCurrentDeviceGuard guard(device_index_); CudaCurrentDeviceGuard guard(device_index_);
for (size_t i = copied; i < count; ++i) { events[i] = new CudaEvent(event_flags_); } for (size_t i = copied; i < count; ++i) { events[i] = new CudaEvent(event_flags_); }
} }
} }
void CudaDevice::DestroyEvents(Event** events, size_t count) { void CudaDevice::DestroyEvents(Event** events, size_t count) {
std::lock_guard<std::mutex> lock(events_mutex_); std::lock_guard<std::mutex> lock(events_mutex_);
events_.insert(events_.end(), events, events + count); events_.insert(events_.end(), events, events + count);
} }
Maybe<void> CudaDevice::Alloc(const AllocationOptions& options, void** ptr, size_t size) { Maybe<void> CudaDevice::Alloc(const AllocationOptions& options, void** ptr, size_t size) {
CudaCurrentDeviceGuard guard(device_index_); CudaCurrentDeviceGuard guard(device_index_);
CHECK(!options.HasPinnedDevice()); CHECK(!options.HasPinnedDevice());
hipError_t err = hipMalloc(ptr, size); hipError_t err = hipMalloc(ptr, size);
if (err != hipSuccess) { if (err != hipSuccess) {
return Error::RuntimeError() << hipGetErrorString(err); return Error::RuntimeError() << hipGetErrorString(err);
} else { } else {
return Maybe<void>::Ok(); return Maybe<void>::Ok();
} }
} }
void CudaDevice::Free(const AllocationOptions& attr, void* ptr) { void CudaDevice::Free(const AllocationOptions& attr, void* ptr) {
CudaCurrentDeviceGuard guard(device_index_); CudaCurrentDeviceGuard guard(device_index_);
OF_CUDA_CHECK(hipFree(ptr)); OF_CUDA_CHECK(hipFree(ptr));
} }
Maybe<void> CudaDevice::AllocPinned(const AllocationOptions& options, void** ptr, size_t size) { Maybe<void> CudaDevice::AllocPinned(const AllocationOptions& options, void** ptr, size_t size) {
CudaCurrentDeviceGuard guard(device_index_); CudaCurrentDeviceGuard guard(device_index_);
hipError_t err = NumaAwareCudaMallocHost(device_index_, ptr, size); hipError_t err = NumaAwareCudaMallocHost(device_index_, ptr, size);
if (err != hipSuccess) { if (err != hipSuccess) {
return Error::RuntimeError() << hipGetErrorString(err); return Error::RuntimeError() << hipGetErrorString(err);
} else { } else {
return Maybe<void>::Ok(); return Maybe<void>::Ok();
} }
} }
void CudaDevice::FreePinned(const AllocationOptions& options, void* ptr) { void CudaDevice::FreePinned(const AllocationOptions& options, void* ptr) {
CudaCurrentDeviceGuard guard(device_index_); CudaCurrentDeviceGuard guard(device_index_);
OF_CUDA_CHECK(hipHostFree(ptr)); OF_CUDA_CHECK(hipHostFree(ptr));
} }
const hipDeviceProp_t& CudaDevice::properties() const { return properties_; } const hipDeviceProp_t& CudaDevice::properties() const { return properties_; }
const void* CudaDevice::GetConstZeros(DataType data_type, size_t n) const { const void* CudaDevice::GetConstZeros(DataType data_type, size_t n) const {
if (GetSizeOfDataType(data_type) * n if (GetSizeOfDataType(data_type) * n
<= GetSizeOfDataType(DataType::kFloat) * const_buf_elem_cnt_) { <= GetSizeOfDataType(DataType::kFloat) * const_buf_elem_cnt_) {
return const_zeros_buffer_; return const_zeros_buffer_;
} else { } else {
return nullptr; return nullptr;
} }
} }
const void* CudaDevice::GetConstOnes(DataType data_type, size_t n) const { const void* CudaDevice::GetConstOnes(DataType data_type, size_t n) const {
if (n <= const_buf_elem_cnt_) { if (n <= const_buf_elem_cnt_) {
if (data_type == DataType::kFloat) { if (data_type == DataType::kFloat) {
return const_ones_buffer_fp32_; return const_ones_buffer_fp32_;
} else if (data_type == DataType::kFloat16) { } else if (data_type == DataType::kFloat16) {
return const_ones_buffer_fp16_; return const_ones_buffer_fp16_;
} else if (data_type == DataType::kBFloat16) { } else if (data_type == DataType::kBFloat16) {
return const_ones_buffer_bf16_; return const_ones_buffer_bf16_;
} else { } else {
return nullptr; return nullptr;
} }
} else { } else {
return nullptr; return nullptr;
} }
} }
} // namespace ep } // namespace ep
} // namespace oneflow } // namespace oneflow
#endif // WITH_ROCM #endif // WITH_ROCM
/* /*
Copyright 2020 The OneFlow Authors. All rights reserved. Copyright 2020 The OneFlow Authors. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License"); Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License. you may not use this file except in compliance with the License.
You may obtain a copy of the License at You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0 http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS, distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. limitations under the License.
*/ */
#ifndef ONEFLOW_CORE_EP_ROCM_CUDA_DEVICE_H_ #ifndef ONEFLOW_CORE_EP_ROCM_CUDA_DEVICE_H_
#define ONEFLOW_CORE_EP_ROCM_CUDA_DEVICE_H_ #define ONEFLOW_CORE_EP_ROCM_CUDA_DEVICE_H_
#include "oneflow/core/ep/include/device.h" #include "oneflow/core/ep/include/device.h"
#include "oneflow/core/common/data_type.h" #include "oneflow/core/common/data_type.h"
#ifdef WITH_ROCM #ifdef WITH_ROCM
#include <hip/hip_runtime.h> #include <hip/hip_runtime.h>
namespace oneflow { namespace oneflow {
namespace ep { namespace ep {
class CudaDevice : public Device { class CudaDevice : public Device {
public: public:
OF_DISALLOW_COPY_AND_MOVE(CudaDevice); OF_DISALLOW_COPY_AND_MOVE(CudaDevice);
explicit CudaDevice(int device_index, DeviceManager* device_manager); explicit CudaDevice(int device_index, DeviceManager* device_manager);
~CudaDevice() override; ~CudaDevice() override;
void SetAsActiveDevice() override; void SetAsActiveDevice() override;
DeviceType device_type() const override { return DeviceType::kCUDA; } DeviceType device_type() const override { return DeviceType::kCUDA; }
size_t device_index() const override { return device_index_; } size_t device_index() const override { return device_index_; }
DeviceManager* device_manager() const override { return device_manager_; } DeviceManager* device_manager() const override { return device_manager_; }
Stream* CreateStream() override; Stream* CreateStream() override;
void DestroyStream(Stream* stream) override; void DestroyStream(Stream* stream) override;
void CreateEvents(Event** events, size_t count) override; void CreateEvents(Event** events, size_t count) override;
void DestroyEvents(Event** events, size_t count) override; void DestroyEvents(Event** events, size_t count) override;
Maybe<void> Alloc(const AllocationOptions& options, void** ptr, size_t size) override; Maybe<void> Alloc(const AllocationOptions& options, void** ptr, size_t size) override;
void Free(const AllocationOptions& options, void* ptr) override; void Free(const AllocationOptions& options, void* ptr) override;
Maybe<void> AllocPinned(const AllocationOptions& options, void** ptr, size_t size) override; Maybe<void> AllocPinned(const AllocationOptions& options, void** ptr, size_t size) override;
void FreePinned(const AllocationOptions& options, void* ptr) override; void FreePinned(const AllocationOptions& options, void* ptr) override;
const hipDeviceProp_t& properties() const; const hipDeviceProp_t& properties() const;
const void* GetConstZeros(DataType data_type, size_t n) const; const void* GetConstZeros(DataType data_type, size_t n) const;
const void* GetConstOnes(DataType data_type, size_t n) const; const void* GetConstOnes(DataType data_type, size_t n) const;
private: private:
int device_index_; int device_index_;
std::mutex events_mutex_; std::mutex events_mutex_;
std::vector<Event*> events_; std::vector<Event*> events_;
unsigned int event_flags_; unsigned int event_flags_;
hipDeviceProp_t properties_; hipDeviceProp_t properties_;
DeviceManager* device_manager_; DeviceManager* device_manager_;
int64_t const_buf_elem_cnt_; int64_t const_buf_elem_cnt_;
void* const_zeros_buffer_; void* const_zeros_buffer_;
void* const_ones_buffer_fp32_; void* const_ones_buffer_fp32_;
void* const_ones_buffer_fp16_; void* const_ones_buffer_fp16_;
void* const_ones_buffer_bf16_; void* const_ones_buffer_bf16_;
}; };
} // namespace ep } // namespace ep
} // namespace oneflow } // namespace oneflow
#endif // WITH_ROCM #endif // WITH_ROCM
#endif // ONEFLOW_CORE_EP_ROCM_CUDA_DEVICE_H_ #endif // ONEFLOW_CORE_EP_ROCM_CUDA_DEVICE_H_
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment