torch.cpp 3.06 KB
Newer Older
/**
 *  Copyright (c) 2020-2022 by Contributors
 * @file torch/torch.cpp
 * @brief Implementation of PyTorch adapter library.
 */

7
#include <c10/core/CPUAllocator.h>
8
#include <tensoradapter_exports.h>
9
#ifdef DGL_USE_CUDA
10
#include <ATen/cuda/CUDAContext.h>
11
#include <ATen/cuda/CachingHostAllocator.h>
12
#include <c10/cuda/CUDACachingAllocator.h>
13
#include <c10/cuda/CUDAStream.h>
14
#include <cuda_runtime.h>
15
#endif  // DGL_USE_CUDA
16

17
18
19
20
namespace tensoradapter {

extern "C" {

21
22
23
24
25
26
/**
 * @brief Allocate raw CPU memory through the c10 CPU allocator.
 *
 * @param nbytes Number of bytes requested.
 * @return The pointer returned by the allocator's raw_allocate().
 */
TA_EXPORTS void* CPURawAlloc(size_t nbytes) {
  auto* const cpu_allocator = c10::GetCPUAllocator();
  void* const buffer = cpu_allocator->raw_allocate(nbytes);
  return buffer;
}

/**
 * @brief Return memory to the c10 CPU allocator.
 *
 * @param ptr Pointer handed to the allocator's raw_deallocate(); expected to
 *        come from CPURawAlloc().
 */
TA_EXPORTS void CPURawDelete(void* ptr) {
  c10::GetCPUAllocator()->raw_deallocate(ptr);
}

29
#ifdef DGL_USE_CUDA
30
/**
 * @brief Allocate raw device memory from the CUDA caching allocator.
 *
 * @param nbytes Number of bytes requested.
 * @param stream CUDA stream passed to raw_alloc_with_stream().
 * @return The pointer returned by the caching allocator.
 */
TA_EXPORTS void* CUDARawAlloc(size_t nbytes, cudaStream_t stream) {
  // Make sure PyTorch's CUDA state is initialized before the allocator is
  // used.
  at::globalContext().lazyInitCUDA();
  return c10::cuda::CUDACachingAllocator::raw_alloc_with_stream(nbytes, stream);
}

35
/**
 * @brief Return device memory to the CUDA caching allocator.
 *
 * @param ptr Pointer handed to the caching allocator's raw_delete(); expected
 *        to come from CUDARawAlloc().
 */
TA_EXPORTS void CUDARawDelete(void* ptr) {
  c10::cuda::CUDACachingAllocator::raw_delete(ptr);
}
38
39
40
41

/**
 * @brief Fetch the stream reported by at::cuda::getCurrentCUDAStream().
 *
 * @return That stream converted to a raw cudaStream_t handle.
 */
TA_EXPORTS cudaStream_t CUDACurrentStream() {
  const auto current = at::cuda::getCurrentCUDAStream();
  // The stream wrapper converts implicitly to the raw handle.
  return current;
}
42
43
44

/**
 * @brief Tell the CUDA caching allocator that a block is used on a stream.
 *
 * Wraps @p ptr in a temporary c10::DataPtr and forwards it, together with a
 * c10 stream object built from @p stream, to
 * c10::cuda::CUDACachingAllocator::recordStream().
 *
 * @param ptr Device pointer; expected to come from the CUDA caching allocator.
 * @param stream Raw CUDA stream on which the block is used.
 * @param device_id Index of the CUDA device used to build the c10::Device.
 */
TA_EXPORTS void RecordStream(void* ptr, cudaStream_t stream, int device_id) {
  const c10::Device device(c10::DeviceType::CUDA, device_id);
  c10::DataPtr data_ptr{
      ptr, ptr, c10::cuda::CUDACachingAllocator::get()->raw_deleter(), device};
  c10::cuda::CUDACachingAllocator::recordStream(
      data_ptr,
      // getStreamFromExternal doesn't exist before PyTorch 1.10, just copy it
      // here
      c10::cuda::CUDAStream(
          c10::cuda::CUDAStream::UNCHECKED,
          c10::Stream(
              c10::Stream::UNSAFE, device,
              reinterpret_cast<int64_t>(stream))));
  // Detach the context so that destroying the temporary DataPtr should not
  // free memory that the caller still owns.
  // NOTE(review): if recordStream() throws, this line is skipped and the
  // DataPtr destructor would run the deleter on `ptr` — confirm acceptable.
  data_ptr.release_context();
}
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102

class CUDAHostDeleter {
 public:
  explicit CUDAHostDeleter(std::unique_ptr<void, c10::DeleterFnPtr> ptr)
      : ptr_(std::move(ptr)) {}

 private:
  std::unique_ptr<void, c10::DeleterFnPtr> ptr_;
};

/**
 * @brief Allocate host memory from the CUDA caching host allocator.
 *
 * @param nbytes Number of bytes requested.
 * @param[out] ctx Receives the allocation's raw context pointer, which
 *        CUDARecordHostAlloc() takes for recording events.
 * @param[out] raw_deleter Receives a heap-allocated CUDAHostDeleter that owns
 *        the allocation context; release it with CUDARawHostDelete().
 * @return The data pointer of the allocation.
 */
TA_EXPORTS void* CUDARawHostAlloc(
    size_t nbytes, void** ctx, void** raw_deleter) {
  auto host_block = at::cuda::getCachingHostAllocator()->allocate(nbytes);
  void* const host_ptr = host_block.get();

  // Expose the context before its ownership is moved out of the DataPtr.
  *ctx = host_block.get_context();

  // Ownership of the context now lives in the heap-allocated deleter object.
  CUDAHostDeleter* const owner =
      new CUDAHostDeleter(host_block.move_context());
  *raw_deleter = static_cast<void*>(owner);

  return host_ptr;
}

/**
 * @brief Destroy the CUDAHostDeleter produced by CUDARawHostAlloc().
 *
 * @param[in,out] raw_deleter Slot holding the CUDAHostDeleter pointer; it is
 *        reset to nullptr after the object has been deleted.
 */
TA_EXPORTS void CUDARawHostDelete(void** raw_deleter) {
  auto* const owner = static_cast<CUDAHostDeleter*>(*raw_deleter);
  delete owner;
  *raw_deleter = nullptr;
}

/**
 * @brief Record an event for a caching-host-allocator block on a stream.
 *
 * @param ptr Data pointer of the host allocation.
 * @param ctx Context pointer of the allocation, as written by
 *        CUDARawHostAlloc().
 * @param stream Raw CUDA stream to record on.
 * @param device_id Index of the CUDA device used to build the c10::Device.
 */
TA_EXPORTS void CUDARecordHostAlloc(
    void* ptr, void* ctx, cudaStream_t stream, int device_id) {
  // Wrap the raw handle in a c10 stream object without validation.
  const c10::Device device(c10::DeviceType::CUDA, device_id);
  const c10::Stream generic_stream(
      c10::Stream::UNSAFE, device, reinterpret_cast<int64_t>(stream));
  const c10::cuda::CUDAStream cuda_stream(
      c10::cuda::CUDAStream::UNCHECKED, generic_stream);

  at::cuda::CachingHostAllocator_recordEvent(ptr, ctx, cuda_stream);
}

/**
 * @brief Forward to at::cuda::CachingHostAllocator_emptyCache().
 *
 * NOTE(review): presumably this releases host blocks cached by the allocator
 * used in CUDARawHostAlloc() — confirm against the PyTorch version in use.
 */
TA_EXPORTS void CUDAHostAllocatorEmptyCache() {
  at::cuda::CachingHostAllocator_emptyCache();
}
103
#endif  // DGL_USE_CUDA
104
105
106
};

};  // namespace tensoradapter