Reduce diff

575aa698 · Max Ryabinin · 4d1d5b56 · 575aa698 · 575aa698
Commit 575aa698 authored Jul 01, 2022 by Max Ryabinin
Hide whitespace changes
Inline Side-by-side

Showing with 21 additions and 26 deletions

csrc/ops.cu csrc/ops.cu +20 -25

csrc/pythonInterface.c csrc/pythonInterface.c +1 -1

No files found.
--- a/csrc/ops.cu
+++ b/csrc/ops.cu
@@ -15,35 +15,30 @@ using namespace BinSearch;
 using std::cout;
 using std::endl;
-void histogramScatterAdd2D(float *histogram, int *index1, int *index2, float *src, int maxidx1, int n) {
+void histogramScatterAdd2D(float* histogram, int *index1, int *index2, float *src, int maxidx1, int n)
-    int threads = 512;
+{
-    int blocks = n / threads;
+  int threads = 512;
-    blocks = n % threads == 0 ? blocks : blocks + 1;
+  int blocks = n/threads;
-    kHistogramScatterAdd2D<<<blocks, 512>>>(histogram, index1, index2, src, maxidx1, n);
+  blocks = n % threads == 0 ? blocks : blocks + 1;
-    CUDA_CHECK_RETURN(cudaPeekAtLastError());
+  kHistogramScatterAdd2D<<<blocks, 512>>>(histogram, index1, index2, src, maxidx1, n);
-}
+  CUDA_CHECK_RETURN(cudaPeekAtLastError());
-template<typename T>
-void estimateQuantiles(T *A, float *code, float offset, int n) {
-    int blocks = n / 4096;
-    blocks = n % 4096 == 0 ? blocks : blocks + 1;
-    CUDA_CHECK_RETURN(cudaMemset(code, 0, 256 * sizeof(float)));
-    kEstimateQuantiles < T ><<<blocks, 512>>>(A, code, offset, std::numeric_limits<T>::max(), n);
-    CUDA_CHECK_RETURN(cudaPeekAtLastError());
 }
-void quantize(float *code, float *A, unsigned char *out, int n) {
+template <typename T> void estimateQuantiles(T *A, float *code, float offset, int n)
-    int blocks = n / 1024;
+{
-    blocks = n % 1024 == 0 ? blocks : blocks + 1;
+  int blocks = n/4096;
-    kQuantize<<<blocks, 1024>>>(code, A, out, n);
+  blocks = n % 4096 == 0 ? blocks : blocks + 1;
-    CUDA_CHECK_RETURN(cudaPeekAtLastError());
+	CUDA_CHECK_RETURN(cudaMemset(code, 0, 256*sizeof(float)));
+  kEstimateQuantiles<T><<<blocks, 512>>>(A, code, offset, std::numeric_limits<T>::max(), n);
+  CUDA_CHECK_RETURN(cudaPeekAtLastError());
 }
-void dequantize(float *code, unsigned char *A, float *out, int n) {
+void quantize(float *code, float *A, unsigned char *out, int n)
-    int blocks = n / 1024;
+{
-    blocks = n % 1024 == 0 ? blocks : blocks + 1;
+  int blocks = n/1024;
-    kDequantize<<<blocks, 1024>>>(code, A, out, n);
+  blocks = n % 1024 == 0 ? blocks : blocks + 1;
-    CUDA_CHECK_RETURN(cudaPeekAtLastError());
+  kQuantize<<<blocks, 1024>>>(code, A, out, n);
+  CUDA_CHECK_RETURN(cudaPeekAtLastError());
 }
 template <typename T, int STOCHASTIC> void quantizeBlockwise(float * code, T *A, float *absmax, unsigned char *out, float *rand, int rand_offset, const int n)

--- a/csrc/pythonInterface.c
+++ b/csrc/pythonInterface.c
@@ -86,7 +86,7 @@ void dequantizeBlockwise_fp32(float *code, unsigned char *A, float *absmax, floa
 extern "C"
 {
-    if #BUILD_CUDA
+    #if BUILD_CUDA
 	void cestimate_quantiles_fp32(float *A, float *code, float offset, int n){ estimateQuantiles_fp32(A, code, offset, n); }
 	void cestimate_quantiles_fp16(half *A, float *code, float offset, int n){ estimateQuantiles_fp16(A, code, offset, n); }
 	void cquantize(float *code, float *A, unsigned char *out, int n){ quantize(code, A, out, n); }