initial cuda build

2f2cf35b · Chao Liu · d51b8158 · 2f2cf35b · 2f2cf35b · 2f2cf35b
Commit 2f2cf35b authored Oct 22, 2018 by Chao Liu
6 changed files
--- a/driver/CMakeLists.txt
+++ b/driver/CMakeLists.txt
-add_executable(conv EXCLUDE_FROM_ALL conv.cpp)
+add_executable(conv EXCLUDE_FROM_ALL conv.cu)
 target_link_libraries(conv convolution)
--- a/driver/conv.cpp
+++ b/driver/conv.cpp
 #include <iostream>
+#include "nvToolsExt.h"
 #include "tensor.hpp"
+#include "device_tensor.cuh"
+#include "direct_convolution.cuh"

 template <typename T>
-void direct_convolution(const Tensor<T>& in,
-                        const Tensor<T>& wei,
-                        Tensor<T>& out,
-                        std::size_t num_thread)
+void host_convolution(const Tensor<T>& in,
+                      const Tensor<T>& wei,
+                      Tensor<T>& out,
+                      std::size_t num_thread)
 {
    auto f = [&](auto n, auto k, auto ho, auto wo) {
        double v = 0;
@@ -33,6 +36,35 @@ void direct_convolution(const Tensor<T>& in,
    f_par(num_thread);
 }

+template <class T>
+void device_convolution(const Tensor<T>& in, const Tensor<T>& wei, Tensor<T>& out)
+
+{
+    DeviceTensorDescriptor in_desc_device(in.mDesc);
+    DeviceTensorDescriptor wei_desc_device(wei.mDesc);
+    DeviceTensorDescriptor out_desc_device(out.mDesc);
+
+    std::size_t data_sz = sizeof(T);
+    DeviceMem in_device_buf(data_sz * in.mDesc.GetElementSpace());
+    DeviceMem wei_device_buf(data_sz * wei.mDesc.GetElementSpace());
+    DeviceMem out_device_buf(data_sz * out.mDesc.GetElementSpace());
+
+    in_device_buf.ToDevice(in.mData.data());
+    wei_device_buf.ToDevice(wei.mData.data());
+
+    dim3 block_dim(256, 1, 1);
+    dim3 grid_dim(1, 1, 1);
+    direct_convolution<T, 256>
+        <<<grid_dim, block_dim>>>(in_desc_device,
+                                  static_cast<T*>(in_device_buf.GetDeviceBuffer()),
+                                  wei_desc_device,
+                                  static_cast<T*>(wei_device_buf.GetDeviceBuffer()),
+                                  out_desc_device,
+                                  static_cast<T*>(out_device_buf.GetDeviceBuffer()));
+
+    out_device_buf.FromDevice(out.mData.data());
+}
+
 template <class T>
 struct Generator
 {
@@ -46,9 +78,16 @@ struct Generator

 int main()
 {
+#if 0
    Tensor<float> in({3, 16, 128, 128});
    Tensor<float> wei({4, 16, 3, 3});
-    Tensor<float> out({3, 4, 126, 126});
+    Tensor<float> out_host({3, 4, 126, 126});
+#else
+    Tensor<float> in({1, 1, 4, 4});
+    Tensor<float> wei({1, 1, 3, 3});
+    Tensor<float> out_host({1, 1, 2, 2});
+#endif
+    Tensor<float> out_device = out_host;

    int num_thread = std::thread::hardware_concurrency();

@@ -57,11 +96,13 @@ int main()
    in.GenerateTensorValue(Generator<float>{}, num_thread);
    wei.GenerateTensorValue(Generator<float>{}, num_thread);

-    direct_convolution(in, wei, out, num_thread);
+    host_convolution(in, wei, out_host, num_thread);
+    device_convolution(in, wei, out_device);

    std::cout << __func__ << ": done" << std::endl;

    LogRange(std::cout, in.mData, ",") << std::endl;
    LogRange(std::cout, wei.mData, ",") << std::endl;
-    LogRange(std::cout, out.mData, ",") << std::endl;
+    LogRange(std::cout, out_host.mData, ",") << std::endl;
+    LogRange(std::cout, out_device.mData, ",") << std::endl;
 }
--- a/src/include/device_tensor.cuh
+++ b/src/include/device_tensor.cuh
+#pragma once
+#include "helper_cuda.h"
+#include "tensor.hpp"
+
+struct DeviceTensorDescriptor
+{
+    DeviceTensorDescriptor() = delete;
+
+    __host__ DeviceTensorDescriptor(const TensorDescriptor& host_desc)
+        : mDataType(host_desc.GetDataType()), mDim(host_desc.GetDimension())
+    {
+        std::size_t data_sz = host_desc.GetDataType() == DataType_t::Float ? 4 : 2;
+
+        checkCudaErrors(cudaMalloc(&mpLengths, data_sz * mDim));
+        checkCudaErrors(cudaMalloc(&mpStrides, data_sz * mDim));
+
+        checkCudaErrors(
+            cudaMemcpy(const_cast<void*>(static_cast<const void*>(host_desc.GetLengths().data())),
+                       mpLengths,
+                       data_sz * mDim,
+                       cudaMemcpyHostToDevice));
+        checkCudaErrors(
+            cudaMemcpy(const_cast<void*>(static_cast<const void*>(host_desc.GetStrides().data())),
+                       mpStrides,
+                       data_sz * mDim,
+                       cudaMemcpyHostToDevice));
+    }
+
+    __host__ ~DeviceTensorDescriptor()
+    {
+        checkCudaErrors(cudaFree(mpLengths));
+        checkCudaErrors(cudaFree(mpStrides));
+    }
+
+    DataType_t mDataType;
+    unsigned long mDim;
+    unsigned long* mpLengths;
+    unsigned long* mpStrides;
+};
--- a/src/include/direct_convolution.cuh
+++ b/src/include/direct_convolution.cuh
+#pragma once
+#include "device_tensor.cuh"
+
+template <class TFloat, int NBlockDim>
+__global__ void direct_convolution(DeviceTensorDescriptor in_desc,
+                                   TFloat* const in,
+                                   DeviceTensorDescriptor wei_desc,
+                                   TFloat* const wei,
+                                   DeviceTensorDescriptor out_desc,
+                                   TFloat* out)
+{
+}
--- a/src/include/tensor.hpp
+++ b/src/include/tensor.hpp
+#pragma once
 #include <thread>
 #include <vector>
 #include <numeric>
@@ -89,6 +90,7 @@ struct TensorDescriptor
    {
    }

+    DataType_t GetDataType() const;
    std::size_t GetDimension() const;
    std::size_t GetElementSize() const;
    std::size_t GetElementSpace() const;
@@ -105,35 +107,36 @@ struct TensorDescriptor
    }

    private:
+    DataType_t mDataType;
    std::vector<std::size_t> mLens;
    std::vector<std::size_t> mStrides;
-
-    DataType_t mDataType;
 };

-struct GpuMem
+struct DeviceMem
 {
-    GpuMem() = delete;
-    GpuMem(std::size_t size, std::size_t data_size) : mSize(size), mDataSize(data_size)
+    DeviceMem() = delete;
+    DeviceMem(std::size_t mem_size) : mMemSize(mem_size)
    {
-        cudaMalloc(static_cast<void**>(&mGpuBuf), mDataSize * mSize);
+        cudaMalloc(static_cast<void**>(&mpDeviceBuf), mMemSize);
    }

-    int ToGpu(void* p)
+    void* GetDeviceBuffer() { return mpDeviceBuf; }
+
+    int ToDevice(const void* p)
    {
-        return static_cast<int>(cudaMemcpy(mGpuBuf, p, mDataSize * mSize, cudaMemcpyHostToDevice));
+        return static_cast<int>(
+            cudaMemcpy(mpDeviceBuf, const_cast<void*>(p), mMemSize, cudaMemcpyHostToDevice));
    }

-    int FromGpu(void* p)
+    int FromDevice(void* p)
    {
-        return static_cast<int>(cudaMemcpy(p, mGpuBuf, mDataSize * mSize, cudaMemcpyDeviceToHost));
+        return static_cast<int>(cudaMemcpy(p, mpDeviceBuf, mMemSize, cudaMemcpyDeviceToHost));
    }

-    ~GpuMem() { cudaFree(mGpuBuf); }
+    ~DeviceMem() { cudaFree(mpDeviceBuf); }

-    void* mGpuBuf;
-    std::size_t mSize;
-    std::size_t mDataSize;
+    void* mpDeviceBuf;
+    std::size_t mMemSize;
 };

 struct joinable_thread : std::thread

--- a/src/tensor.cpp
+++ b/src/tensor.cpp
@@ -28,6 +28,8 @@ void TensorDescriptor::CalculateStrides()
        mLens.rbegin(), mLens.rend() - 1, mStrides.rbegin() + 1, std::multiplies<std::size_t>());
 }

+DataType_t TensorDescriptor::GetDataType() const { return mDataType; }
+
 std::size_t TensorDescriptor::GetDimension() const { return mLens.size(); }

 std::size_t TensorDescriptor::GetElementSize() const