Commit b09ddc3a authored Aug 27, 2016 by Davis King
Fixed a bug in memcpy() for tensors where you weren't allowed to copy
alias tensors. Now any kind of tensor is supported.
Parent: fe42f662

Showing 5 changed files with 216 additions and 12 deletions (+216 / -12):
dlib/dnn/gpu_data.cpp           +49  -10
dlib/dnn/gpu_data.h             +41  -1
dlib/dnn/gpu_data_abstract.h    +25  -0
dlib/dnn/tensor.h               +4   -1
dlib/test/dnn.cpp               +97  -0
dlib/dnn/gpu_data.cpp

@@ -24,20 +24,59 @@ namespace dlib
     )
     {
         DLIB_CASSERT(dest.size() == src.size(), "");
-        if (src.size() == 0)
+        if (src.size() == 0 || &dest == &src)
             return;
+        memcpy(dest,0, src,0, src.size());
+    }
+
+    void memcpy (
+        gpu_data& dest,
+        size_t dest_offset,
+        const gpu_data& src,
+        size_t src_offset,
+        size_t num
+    )
+    {
+        DLIB_CASSERT(dest_offset + num <= dest.size(), "");
+        DLIB_CASSERT(src_offset + num <= src.size(), "");
+        if (num == 0)
+            return;
+
+        // if there is aliasing
+        if (&dest == &src && std::max(dest_offset,src_offset) < std::min(dest_offset,src_offset)+num)
+        {
+            // if they perfectly alias each other then there is nothing to do
+            if (dest_offset == src_offset)
+                return;
+            else
+                std::memmove(dest.host()+dest_offset, src.host()+src_offset, sizeof(float)*num);
+        }
+        else
+        {
+            // if we write to the entire thing then we can use device_write_only()
+            if (dest_offset == 0 && num == dest.size())
+            {
+                // copy the memory efficiently based on which copy is current in each object.
+                if (src.device_ready())
+                    CHECK_CUDA(cudaMemcpy(dest.device_write_only(), src.device()+src_offset, num*sizeof(float), cudaMemcpyDeviceToDevice));
+                else
+                    CHECK_CUDA(cudaMemcpy(dest.device_write_only(), src.host()+src_offset,   num*sizeof(float), cudaMemcpyHostToDevice));
+            }
+            else
+            {
+                // copy the memory efficiently based on which copy is current in each object.
+                if (dest.device_ready() && src.device_ready())
-                    CHECK_CUDA(cudaMemcpy(dest.device(), src.device(),          src.size()*sizeof(float), cudaMemcpyDeviceToDevice));
+                    CHECK_CUDA(cudaMemcpy(dest.device()+dest_offset, src.device()+src_offset, num*sizeof(float), cudaMemcpyDeviceToDevice));
+                else if (!dest.device_ready() && src.device_ready())
-                    CHECK_CUDA(cudaMemcpy(dest.host_write_only(), src.device(), src.size()*sizeof(float), cudaMemcpyDeviceToHost));
+                    CHECK_CUDA(cudaMemcpy(dest.host()+dest_offset, src.device()+src_offset,   num*sizeof(float), cudaMemcpyDeviceToHost));
+                else if (dest.device_ready() && !src.device_ready())
-                    CHECK_CUDA(cudaMemcpy(dest.device(), src.host(),            src.size()*sizeof(float), cudaMemcpyHostToDevice));
+                    CHECK_CUDA(cudaMemcpy(dest.device()+dest_offset, src.host()+src_offset,   num*sizeof(float), cudaMemcpyHostToDevice));
+                else
-                    CHECK_CUDA(cudaMemcpy(dest.host_write_only(), src.host(),   src.size()*sizeof(float), cudaMemcpyHostToHost));
+                    CHECK_CUDA(cudaMemcpy(dest.host()+dest_offset, src.host()+src_offset,     num*sizeof(float), cudaMemcpyHostToHost));
+            }
+        }
     }

// ----------------------------------------------------------------------------------------

    void gpu_data::
...
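The heart of the change above is the aliasing branch: when dest and src are the same gpu_data object and the two offset ranges overlap, the copy is routed through std::memmove (or skipped entirely when the offsets are equal) instead of cudaMemcpy/std::memcpy. The following is a minimal, self-contained sketch, using plain std::vector<float> buffers rather than dlib's gpu_data, of why the test std::max(dest_offset,src_offset) < std::min(dest_offset,src_offset)+num detects overlap; the ranges_overlap helper is this example's own invention, not part of the commit.

#include <algorithm>
#include <cassert>
#include <cstddef>
#include <cstring>
#include <vector>

// Hypothetical helper: true when [dest_offset, dest_offset+num) and
// [src_offset, src_offset+num) overlap within the same buffer.  This is the
// same condition the patched memcpy() checks before using std::memmove.
bool ranges_overlap(std::size_t dest_offset, std::size_t src_offset, std::size_t num)
{
    return std::max(dest_offset, src_offset) < std::min(dest_offset, src_offset) + num;
}

int main()
{
    std::vector<float> buf = {4,4,4,4,4, 1,1,1,1,1, 4};

    // Copying 5 elements from offset 6 to offset 1: the ranges [1,6) and
    // [6,11) do not overlap, so a plain memcpy would also be safe.
    assert(!ranges_overlap(1, 6, 5));

    // Copying 5 elements from offset 3 to offset 1: the ranges [1,6) and
    // [3,8) overlap, so std::memmove (which tolerates overlap) is required.
    assert(ranges_overlap(1, 3, 5));
    std::memmove(buf.data() + 1, buf.data() + 3, sizeof(float) * 5);

    return 0;
}

When the two offsets are equal the ranges alias perfectly and the patched function simply returns, since the copy would be a no-op.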
dlib/dnn/gpu_data.h

@@ -208,14 +208,54 @@ namespace dlib
#ifdef DLIB_USE_CUDA
    void memcpy (gpu_data& dest, const gpu_data& src);
+    void memcpy (
+        gpu_data& dest,
+        size_t dest_offset,
+        const gpu_data& src,
+        size_t src_offset,
+        size_t num
+    );
#else
    inline void memcpy (gpu_data& dest, const gpu_data& src)
    {
        DLIB_CASSERT(dest.size() == src.size(), "");
-        if (src.size() == 0)
+        if (src.size() == 0 || &dest == &src)
            return;
        std::memcpy(dest.host_write_only(), src.host(), sizeof(float)*src.size());
    }
+
+    inline void memcpy (
+        gpu_data& dest,
+        size_t dest_offset,
+        const gpu_data& src,
+        size_t src_offset,
+        size_t num
+    )
+    {
+        DLIB_CASSERT(dest_offset + num <= dest.size(), "");
+        DLIB_CASSERT(src_offset + num <= src.size(), "");
+        if (num == 0)
+            return;
+        if (&dest == &src && std::max(dest_offset,src_offset) < std::min(dest_offset,src_offset)+num)
+        {
+            // if they perfectly alias each other then there is nothing to do
+            if (dest_offset == src_offset)
+                return;
+            else
+                std::memmove(dest.host()+dest_offset, src.host()+src_offset, sizeof(float)*num);
+        }
+        else
+        {
+            // if we write to the entire thing then we can use host_write_only()
+            if (dest_offset == 0 && num == dest.size())
+                std::memcpy(dest.host_write_only(), src.host()+src_offset, sizeof(float)*num);
+            else
+                std::memcpy(dest.host()+dest_offset, src.host()+src_offset, sizeof(float)*num);
+        }
+    }
#endif

// ----------------------------------------------------------------------------------------
...
dlib/dnn/gpu_data_abstract.h

@@ -233,6 +233,31 @@ namespace dlib
            - This function blocks until the copy has completed.
    !*/

+    void memcpy (
+        gpu_data& dest,
+        size_t dest_offset,
+        const gpu_data& src,
+        size_t src_offset,
+        size_t num
+    );
+    /*!
+        requires
+            - dest_offset + num <= dest.size()
+            - src_offset + num <= src.size()
+        ensures
+            - Copies the data in src to dest, but only copies data in the range
+              [src.host()+src_offset, src.host()+src_offset+num) to
+              [dest.host()+dest_offset, dest.host()+dest_offset+num).  Therefore, it is
+              just like the above memcpy() except that you can specify some subset of data
+              in a gpu_data object to be copied.
+            - Like the above version of memcpy(), the copy will happen in the most
+              efficient way, automatically using the appropriate type of host/device
+              transfers based on where data is currently resident.
+            - It doesn't matter what GPU device is selected by cudaSetDevice().  You can
+              always copy gpu_data objects to and from each other regardless.
+            - This function blocks until the copy has completed.
+    !*/
+
// ----------------------------------------------------------------------------------------

}
...
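As a usage illustration of the interface documented above, here is a minimal sketch that copies a sub-range of one gpu_data object into another while satisfying the requires clauses. The sizes, offsets, and the <dlib/dnn.h> include are this example's assumptions, not part of the commit.

#include <dlib/dnn.h>

using namespace dlib;

int main()
{
    gpu_data src, dest;
    src.set_size(10);
    dest.set_size(6);

    // Fill the source on the host side; gpu_data tracks which copy is current.
    for (size_t i = 0; i < src.size(); ++i)
        src.host()[i] = static_cast<float>(i);

    // Copy elements [2,7) of src into elements [1,6) of dest.
    // The requires clauses hold: 1+5 <= dest.size() and 2+5 <= src.size().
    memcpy(dest, 1, src, 2, 5);

    return 0;
}

Per the ensures clauses, the same call works regardless of whether src or dest currently has its data resident on the host or the device.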
dlib/dnn/tensor.h

@@ -169,7 +169,10 @@ namespace dlib
        const tensor& src
    )
    {
-        memcpy(dest.data(), src.data());
+        DLIB_CASSERT(dest.size() == src.size(), "");
+        memcpy(dest.data(), dest.get_alias_offset(),
+               src.data(),  src.get_alias_offset(),
+               src.size());
    }
...
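The tensor-level overload now forwards each tensor's alias offset into the gpu_data copy, which is what makes copying alias_tensor views work; the previous one-liner copied the owning gpu_data objects wholesale and ignored get_alias_offset(). Below is a minimal sketch of what this enables, mirroring the new unit test; the element values and sizes are illustrative only.

#include <dlib/dnn.h>
#include <iostream>

using namespace dlib;

int main()
{
    resizable_tensor A, B;
    A.set_size(11);
    B.copy_size(A);
    A = 4;
    B = 1;

    alias_tensor at(5);        // a 5-element window

    // Copy between views of two different tensors.
    auto aA = at(A, 5);        // elements 5..9 of A
    auto aB = at(B, 5);        // elements 5..9 of B
    memcpy(aA, aB);            // A becomes {4,4,4,4,4, 1,1,1,1,1, 4}

    // Copy between two views of the *same* tensor -- the case this commit
    // fixes.  Both views share A's underlying gpu_data object.
    auto a1 = at(A, 1);        // elements 1..5
    auto a2 = at(A, 6);        // elements 6..10
    memcpy(a1, a2);            // A becomes {4,1,1,1,1, 4,1,1,1,1, 4}

    std::cout << mat(A) << std::endl;
    return 0;
}

If the two views of the same tensor also overlapped, the gpu_data-level copy shown earlier would fall back to std::memmove.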
dlib/test/dnn.cpp

@@ -497,6 +497,103 @@ namespace
#endif
        }

+        {
+            resizable_tensor A, B;
+            A.set_size(11);
+            B.copy_size(A);
+            A = 4;
+            B = 1;
+
+            matrix<float> truth;
+
+            alias_tensor at(5);
+            A = 4;
+            A.host();
+            B.host();
+            {
+                // non-aliasing test
+                auto aA = at(A,5);
+                auto aB = at(B,5);
+                memcpy(aA, aB);
+                truth = {4,4,4,4,4, 1,1,1,1,1, 4};
+                DLIB_TEST(max(abs(mat(A)-truth)) < 1e-5);
+            }
+            {
+                // aliasing test
+                auto aA = at(A,1);
+                auto aB = at(A,6);
+                memcpy(aA, aB);
+                truth = {4,1,1,1,1, 4,1,1,1,1, 4};
+                DLIB_TEST(max(abs(mat(A)-truth)) < 1e-5);
+            }
+
+#ifdef DLIB_USE_CUDA
+            A = 4;
+            A.device();
+            B.host();
+            {
+                // non-aliasing test
+                auto aA = at(A,5);
+                auto aB = at(B,5);
+                memcpy(aA, aB);
+                truth = {4,4,4,4,4, 1,1,1,1,1, 4};
+                DLIB_TEST(max(abs(mat(A)-truth)) < 1e-5);
+            }
+            {
+                // aliasing test
+                auto aA = at(A,1);
+                auto aB = at(A,6);
+                memcpy(aA, aB);
+                truth = {4,1,1,1,1, 4,1,1,1,1, 4};
+                DLIB_TEST(max(abs(mat(A)-truth)) < 1e-5);
+            }
+
+            A = 4;
+            A.device();
+            B.device();
+            {
+                // non-aliasing test
+                auto aA = at(A,5);
+                auto aB = at(B,5);
+                memcpy(aA, aB);
+                truth = {4,4,4,4,4, 1,1,1,1,1, 4};
+                DLIB_TEST(max(abs(mat(A)-truth)) < 1e-5);
+            }
+            {
+                // aliasing test
+                auto aA = at(A,1);
+                auto aB = at(A,6);
+                memcpy(aA, aB);
+                truth = {4,1,1,1,1, 4,1,1,1,1, 4};
+                DLIB_TEST(max(abs(mat(A)-truth)) < 1e-5);
+            }
+
+            A = 4;
+            A.host();
+            B.device();
+            {
+                // non-aliasing test
+                auto aA = at(A,5);
+                auto aB = at(B,5);
+                memcpy(aA, aB);
+                truth = {4,4,4,4,4, 1,1,1,1,1, 4};
+                DLIB_TEST(max(abs(mat(A)-truth)) < 1e-5);
+            }
+            {
+                // aliasing test
+                auto aA = at(A,1);
+                auto aB = at(A,6);
+                memcpy(aA, aB);
+                truth = {4,1,1,1,1, 4,1,1,1,1, 4};
+                DLIB_TEST(max(abs(mat(A)-truth)) < 1e-5);
+            }
+#endif
+        }
+
        {
            resizable_tensor A, B;
            A.set_size(2,3,4,5);
...