Minimal fix for Windows compilation issues

Jamezo97 · wkpark · commit 8af17eb8ae9b · 2023-11-16T09:08:43.000+09:00
Manually cherry-picked from PR #788 and cleaned up. Signed-off-by: Won-Kyu Park <wkpark@gmail.com>
diff --git a/csrc/cpu_ops.cpp b/csrc/cpu_ops.cpp
@@ -1,5 +1,9 @@
 #include <BinSearch.h>
+#ifdef _WIN32
+#include <thread>
+#else
 #include <pthread.h>
+#endif
 #include <common.h>
 
 using namespace BinSearch;
@@ -31,7 +35,11 @@ void quantize_cpu(float *code, float *A, float *absmax, unsigned char *out, long
     for(long long offset = 0; offset < num_blocks; offset+=thread_wave_size)
     {
       long long valid_chunks = num_blocks - offset >= thread_wave_size ? thread_wave_size : num_blocks - offset;
+#ifdef _WIN32
+      std::thread *threads = (std::thread *) malloc(sizeof(std::thread) * valid_chunks);
+#else
       pthread_t *threads = (pthread_t *) malloc(sizeof(pthread_t) * valid_chunks);
+#endif
 
       struct quantize_block_args **args = (quantize_block_args **) malloc(valid_chunks * sizeof(quantize_block_args *));
 
@@ -55,14 +63,23 @@ void quantize_cpu(float *code, float *A, float *absmax, unsigned char *out, long
           arg->threadidx = block_idx / blocksize;
           arg->blocksize = blocksize;
 
+#ifdef _WIN32
+          new (&threads[chunks_processed]) std::thread(quantize_block, arg);
+#else
           pthread_create(&threads[chunks_processed], NULL, &quantize_block, (void *) arg);
+#endif
           chunks_processed += 1;
           if(chunks_processed == valid_chunks){ break; }
       }
 
       for (int i = 0; i < valid_chunks; i++)
+      {
+#ifdef _WIN32
+          threads[i].join();
+#else
           int err = pthread_join(threads[i], NULL);
-
+#endif
+      }
       free(threads);
       for (int i = 0; i < valid_chunks; i++)
           free(args[i]);
diff --git a/csrc/kernels.cu b/csrc/kernels.cu
@@ -3816,12 +3816,12 @@ template __global__ void kgemm_4bit_inference_naive<float, 128, 32>(int M, int N
 template __global__ void kExtractOutliers<COL_TURING>(char *A, int *idx, char *out, int idx_size, int rowsA, int colsA, int tiledRowsA, int tiledColsA);
 template __global__ void kExtractOutliers<COL_AMPERE>(char *A, int *idx, char *out, int idx_size, int rowsA, int colsA, int tiledRowsA, int tiledColsA);
 
-template __global__ void kspmm_coo_very_sparse_naive<half, 8, 16>(int *max_count, int *max_idx, int *offset_rowidx, int *rowidx, int *colidx, half *values, half *B, half *out, float *dequant_stats, int nnz, int rowsA, int rowsB, int colsB);
-template __global__ void kspmm_coo_very_sparse_naive<half, 16, 16>(int *max_count, int *max_idx, int *offset_rowidx, int *rowidx, int *colidx, half *values, half *B, half *out, float *dequant_stats, int nnz, int rowsA, int rowsB, int colsB);
-template __global__ void kspmm_coo_very_sparse_naive<half, 32, 16>(int *max_count, int *max_idx, int *offset_rowidx, int *rowidx, int *colidx, half *values, half *B, half *out, float *dequant_stats, int nnz, int rowsA, int rowsB, int colsB);
-template __global__ void kspmm_coo_very_sparse_naive<signed char, 8, 8>(int *max_count, int *max_idx, int *offset_rowidx, int *rowidx, int *colidx, half *values, signed char *B, half *out, float *dequant_stats, int nnz, int rowsA, int rowsB, int colsB);
-template __global__ void kspmm_coo_very_sparse_naive<signed char, 16, 8>(int *max_count, int *max_idx, int *offset_rowidx, int *rowidx, int *colidx, half *values, signed char *B, half *out, float *dequant_stats, int nnz, int rowsA, int rowsB, int colsB);
-template __global__ void kspmm_coo_very_sparse_naive<signed char, 32, 8>(int *max_count, int *max_idx, int *offset_rowidx, int *rowidx, int *colidx, half *values, signed char *B, half *out, float *dequant_stats, int nnz, int rowsA, int rowsB, int colsB);
+template __global__ void kspmm_coo_very_sparse_naive<half, 8, 16>(int *max_count, int *max_idx, int *offset_rowidx, int *rowidx, int *colidx, half *values, half *B, half *out, float * __restrict__ const dequant_stats, int nnz, int rowsA, int rowsB, int colsB);
+template __global__ void kspmm_coo_very_sparse_naive<half, 16, 16>(int *max_count, int *max_idx, int *offset_rowidx, int *rowidx, int *colidx, half *values, half *B, half *out, float * __restrict__ const dequant_stats, int nnz, int rowsA, int rowsB, int colsB);
+template __global__ void kspmm_coo_very_sparse_naive<half, 32, 16>(int *max_count, int *max_idx, int *offset_rowidx, int *rowidx, int *colidx, half *values, half *B, half *out, float * __restrict__ const dequant_stats, int nnz, int rowsA, int rowsB, int colsB);
+template __global__ void kspmm_coo_very_sparse_naive<signed char, 8, 8>(int *max_count, int *max_idx, int *offset_rowidx, int *rowidx, int *colidx, half *values, signed char *B, half *out, float * __restrict__ const dequant_stats, int nnz, int rowsA, int rowsB, int colsB);
+template __global__ void kspmm_coo_very_sparse_naive<signed char, 16, 8>(int *max_count, int *max_idx, int *offset_rowidx, int *rowidx, int *colidx, half *values, signed char *B, half *out, float * __restrict__ const dequant_stats, int nnz, int rowsA, int rowsB, int colsB);
+template __global__ void kspmm_coo_very_sparse_naive<signed char, 32, 8>(int *max_count, int *max_idx, int *offset_rowidx, int *rowidx, int *colidx, half *values, signed char *B, half *out, float * __restrict__ const dequant_stats, int nnz, int rowsA, int rowsB, int colsB);
 
 template __global__ void kTransformRowToFormat<256, 8, 32, 32*8, 0, COL32>(char *__restrict__ const A, char *out, int rows, int cols, int tiledCols, int outRows, int outCols);
 template __global__ void kTransformRowToFormat<256, 8, 32, 32*8, 1, COL32>(char *__restrict__ const A, char *out, int rows, int cols, int tiledCols, int outRows, int outCols);
diff --git a/csrc/ops.cuh b/csrc/ops.cuh
@@ -9,7 +9,6 @@
 
 #include <stdio.h>
 #include <iostream>
-#include <unistd.h>
 #include <assert.h>
 
 #include <cuda_runtime_api.h>
diff --git a/include/Type.h b/include/Type.h
@@ -201,13 +201,30 @@ struct CondData<T,false>
     FORCE_INLINE operator const T() const { return 0;}
 };
 
+#ifdef _WIN32
+// The `IsComplete` build-time check doesn't work on Windows.
+// Given how the BinAlgoBase class is used, `I != Scalar` should be equivalent to the non-Windows
+// check below, `Details::IsComplete<Details::AlgoVecBase<I, T, A>>::value`.
+template <InstrSet I, typename T, Algos A>
+struct WouldAlgoVecBaseBeComplete
+{
+    static constexpr bool value{I != Scalar};
+};
+#else
+template <InstrSet I, typename T, Algos A>
+struct WouldAlgoVecBaseBeComplete : public Details::IsComplete<Details::AlgoVecBase<I, T, A>>
+{
+
+};
+#endif
+
 template <InstrSet I, typename T, Algos A, bool L=false>
-struct BinAlgoBase : Details::conditional< Details::IsComplete<Details::AlgoVecBase<I, T, A>>::value
+struct BinAlgoBase : Details::conditional< WouldAlgoVecBaseBeComplete<I, T, A>::value
                                  , Details::AlgoVecBase<I, T, A>
                                  , Details::AlgoScalarToVec<T,A>
                                  >::type
 {
-    typedef typename Details::conditional< Details::IsComplete<Details::AlgoVecBase<I, T, A>>::value
+    typedef typename Details::conditional< WouldAlgoVecBaseBeComplete<I, T, A>::value
                                  , Details::AlgoVecBase<I, T, A>
                                  , Details::AlgoScalarToVec<T,A>
                                  >::type base_t;