GPU Performance Optimization Algorithms

 

algorithms

common algorithms

reduce

Purpose

  • Reduce computes an aggregate such as sum, min, max, or avg over an array.
    • It is also called a reduction: the input is recursively "reduced", so the output generally has lower dimensionality than the input.

baseline & versions 1, 2: main

  #include <cuda.h>
  #include <cuda_runtime.h>
  #include <time.h>

  #define N 32*1024*1024
  #define BLOCK_SIZE 256

  int main() {
    float *input_host = (float*)malloc(N*sizeof(float));
    float *input_device;
    cudaMalloc((void **)&input_device, N*sizeof(float));
    for (int i = 0; i < N; i++) input_host[i] = 2.0;
    cudaMemcpy(input_device, input_host, N*sizeof(float), cudaMemcpyHostToDevice);

    int32_t block_num = (N + BLOCK_SIZE - 1) / BLOCK_SIZE;
    float *output_host = (float*)malloc((N / BLOCK_SIZE) * sizeof(float));
    float *output_device;
    cudaMalloc((void **)&output_device, (N / BLOCK_SIZE) * sizeof(float));
    
    dim3 grid(block_num, 1);
    dim3 block(BLOCK_SIZE, 1);
    reduce_v0<<<grid, block>>>(input_device, output_device);
    cudaMemcpy(output_host, output_device, block_num * sizeof(float), cudaMemcpyDeviceToHost);
    return 0;
  }
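
  • Note that reduce_v0 only produces one partial sum per block; the block_num values copied back into output_host still need a final pass, either a second kernel launch or a host-side loop. A minimal host-side sketch (final_reduce_on_host is a helper name introduced here, not part of the original):

  // Sum the per-block partial results on the host to finish the reduction;
  // called as final_reduce_on_host(output_host, block_num) after the device-to-host copy.
  float final_reduce_on_host(const float* partial, int num) {
    float sum = 0.0f;
    for (int i = 0; i < num; i++) sum += partial[i];
    return sum;  // for the example above the expected value is 2.0f * N
  }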

baseline

  __global__ void reduce_v0(float *g_idata,float *g_odata){
    __shared__ float sdata[BLOCK_SIZE];

    // each thread loads one element from global to shared mem
    unsigned int tid = threadIdx.x;
    unsigned int i = blockIdx.x*blockDim.x + threadIdx.x;
    sdata[tid] = g_idata[i];
    __syncthreads();

    // do reduction in shared mem
    for(unsigned int s=1; s < blockDim.x; s *= 2) {
        if (tid % (2*s) == 0) {
            sdata[tid] += sdata[tid + s];
        }
        __syncthreads();
    }

    // write result for this block to global mem
    if (tid == 0) g_odata[blockIdx.x] = sdata[0];
  }

version 2: interleaved addressing (purpose)

  • highly divergent warps are very inefficient
    • operator % is very slow

    code

    __global__ void reduce_v1(float *g_idata,float *g_odata){
      __shared__ float sdata[BLOCK_SIZE];
    
      // each thread loads one element from global to shared mem
      unsigned int tid = threadIdx.x;
      unsigned int i = blockIdx.x*blockDim.x + threadIdx.x;
      sdata[tid] = g_idata[i];
      __syncthreads();
    
      // do reduction in shared mem
      for(unsigned int s=1; s < blockDim.x; s *= 2) {
          // if (tid % (2*s) == 0) {
          //     sdata[tid] += sdata[tid + s];
          // }
          int index = 2 * s * tid;
          if (index < blockDim.x) {
              sdata[index] += sdata[index + s];
          }
          __syncthreads();
      }
    
      // write result for this block to global mem
      if (tid == 0) g_odata[blockIdx.x] = sdata[0];
    }
    

version 3: bank conflict free (purpose)

  • avoid shared-memory bank conflicts: with sequential addressing (s starts at blockDim.x/2 and halves), active threads access consecutive shared-memory words and therefore different banks

    code

      
    __global__ void reduce_v2(float *g_idata,float *g_odata){
      __shared__ float sdata[BLOCK_SIZE];
    
      // each thread loads one element from global to shared mem
      unsigned int tid = threadIdx.x;
      unsigned int i = blockIdx.x*blockDim.x + threadIdx.x;
      sdata[tid] = g_idata[i];
      __syncthreads();
    
      // do reduction in shared mem
      for(unsigned int s=blockDim.x/2; s>0; s >>= 1) {
          if (tid < s){
              sdata[tid] += sdata[tid + s];
          }
          __syncthreads();
      }
    
      // write result for this block to global mem
      if (tid == 0) g_odata[blockIdx.x] = sdata[0];
    }
    
    

versions 4, 5: main

 
  int main() {
    float *input_host = (float*)malloc(N*sizeof(float));
    float *input_device;
    cudaMalloc((void **)&input_device, N*sizeof(float));
    for (int i = 0; i < N; i++) input_host[i] = 2.0;
    cudaMemcpy(input_device, input_host, N*sizeof(float), cudaMemcpyHostToDevice);

    // halve the number of blocks: each thread now loads and adds two elements
    int32_t block_num = (N + BLOCK_SIZE - 1) / BLOCK_SIZE / 2;
    float *output_host = (float*)malloc((block_num) * sizeof(float));
    float *output_device;
    cudaMalloc((void **)&output_device, (block_num) * sizeof(float));
    
    dim3 grid(block_num, 1);
    dim3 block(BLOCK_SIZE, 1);
    reduce_v3<<<grid, block>>>(input_device, output_device);
    cudaMemcpy(output_host, output_device, block_num * sizeof(float), cudaMemcpyDeviceToHost);
    return 0;
  }

version 4: idle thread free (purpose)

  • Half of the threads are idle on the first loop iteration
    • use the threads properly
      • perform the first add while loading from global memory: each thread loads two elements and stores their sum, so no threads are wasted

    code
      
    __global__ void reduce_v3(float *g_idata,float *g_odata){
      __shared__ float sdata[BLOCK_SIZE];
    
      // each thread loads one element from global to shared mem
      unsigned int tid = threadIdx.x;
      unsigned int i = blockIdx.x*(blockDim.x*2) + threadIdx.x;
      sdata[tid] = g_idata[i] + g_idata[i + blockDim.x];
      __syncthreads();
    
      // do reduction in shared mem
      for(unsigned int s=blockDim.x/2; s>0; s >>= 1) {
          if (tid < s){
              sdata[tid] += sdata[tid + s];
          }
          __syncthreads();
      }
    
      // write result for this block to global mem
      if (tid == 0) g_odata[blockIdx.x] = sdata[0];
    }
    

version 5: unroll last warp (purpose)

  • When s <= 32, we have only one warp left
    • We don't need __syncthreads()
      • We don't need the if (tid < s) check (a shuffle-based alternative is sketched after the code)

    code
      
    __device__ void warpReduce(volatile float* cache, unsigned int tid){
      cache[tid]+=cache[tid+32];
      cache[tid]+=cache[tid+16];
      cache[tid]+=cache[tid+8];
      cache[tid]+=cache[tid+4];
      cache[tid]+=cache[tid+2];
      cache[tid]+=cache[tid+1];
    }
    
    __global__ void reduce_v4(float *g_idata,float *g_odata){
      __shared__ float sdata[BLOCK_SIZE];
    
      // each thread loads one element from global to shared mem
      unsigned int tid = threadIdx.x;
      unsigned int i = blockIdx.x*(blockDim.x*2) + threadIdx.x;
      sdata[tid] = g_idata[i] + g_idata[i + blockDim.x];
      __syncthreads();
    
      // do reduction in shared mem
      for(unsigned int s=blockDim.x/2; s>32; s >>= 1) {
          if (tid < s){
              sdata[tid] += sdata[tid + s];
          }
          __syncthreads();
      }
    
      // write result for this block to global mem
      if (tid < 32) warpReduce(sdata, tid);
      if (tid == 0) g_odata[blockIdx.x] = sdata[0];
    }
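
    • note: warpReduce above relies on implicit warp-synchronous execution through volatile shared memory. On Volta (sm_70) and newer GPUs, threads in a warp are no longer guaranteed to run in lockstep, so the explicitly synchronized shuffle intrinsics are the safer choice. A drop-in sketch (warpReduceShfl is a name introduced here):

     // Reduce a value across the 32 lanes of a warp with down-shuffles;
     // __shfl_down_sync(mask, val, offset) reads val from the lane `offset` positions higher.
     __device__ float warpReduceShfl(float val) {
       for (int offset = 16; offset > 0; offset >>= 1) {
         val += __shfl_down_sync(0xffffffff, val, offset);
       }
       return val;
     }

     // Possible use inside reduce_v4, replacing the volatile version:
     //   if (tid < 32) {
     //     float v = sdata[tid] + sdata[tid + 32];
     //     v = warpReduceShfl(v);
     //     if (tid == 0) g_odata[blockIdx.x] = v;
     //   }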
    

version 6: complete unroll (purpose)

  • If we knew the number of iterations at compile time, we could completely unroll the reduction.
    • a template parameter makes the block size a compile-time constant

    code

    • main with the template parameter (branches are resolved at compile time; a runtime dispatch sketch follows the kernel below)
     int main() {
        float *input_host = (float*)malloc(N*sizeof(float));
        float *input_device;
        cudaMalloc((void **)&input_device, N*sizeof(float));
        for (int i = 0; i < N; i++) input_host[i] = 2.0;
        cudaMemcpy(input_device, input_host, N*sizeof(float), cudaMemcpyHostToDevice);
    
        int32_t block_num = (N + BLOCK_SIZE - 1) / BLOCK_SIZE / 2;
        float *output_host = (float*)malloc((block_num) * sizeof(float));
        float *output_device;
        cudaMalloc((void **)&output_device, (block_num) * sizeof(float));
          
        dim3 grid(block_num, 1);
        dim3 block(BLOCK_SIZE, 1);
        // with template
        reduce_v5<BLOCK_SIZE><<<grid, block>>>(input_device, output_device);
        cudaMemcpy(output_host, output_device, block_num * sizeof(float), cudaMemcpyDeviceToHost);
        return 0;
     }
    
    • blockSize is a compile-time constant, so the compiler removes the dead if branches
    
     template <unsigned int blockSize>
     __device__ void warpReduce(volatile float* cache,int tid){
        if(blockSize >= 64)cache[tid]+=cache[tid+32];
        if(blockSize >= 32)cache[tid]+=cache[tid+16];
        if(blockSize >= 16)cache[tid]+=cache[tid+8];
        if(blockSize >= 8)cache[tid]+=cache[tid+4];
        if(blockSize >= 4)cache[tid]+=cache[tid+2];
        if(blockSize >= 2)cache[tid]+=cache[tid+1];
     }
    
     template <unsigned int blockSize>
     __global__ void reduce_v5(float *g_idata,float *g_odata){
        __shared__ float sdata[BLOCK_SIZE];
    
        // each thread loads one element from global to shared mem
        unsigned int tid = threadIdx.x;
        unsigned int i = blockIdx.x*(blockDim.x*2) + threadIdx.x;
        sdata[tid] = g_idata[i] + g_idata[i + blockDim.x];
        __syncthreads();
    
        // do reduction in shared mem
        if(blockSize>=512){
            if(tid<256){
                sdata[tid]+=sdata[tid+256];
            }
            __syncthreads();
        }
        if(blockSize>=256){
            if(tid<128){
                sdata[tid]+=sdata[tid+128];
            }
            __syncthreads();
        }
        if(blockSize>=128){
            if(tid<64){
                sdata[tid]+=sdata[tid+64];
            }
            __syncthreads();
        }
          
        // write result for this block to global mem
        if(tid<32)warpReduce<blockSize>(sdata,tid);
        if (tid == 0) g_odata[blockIdx.x] = sdata[0];
     }
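
    • If the block size were a runtime value instead of the BLOCK_SIZE macro, the usual pattern is to dispatch to the matching template instantiation with a switch. A sketch only (block_size is a hypothetical runtime variable; grid, block, input_device, output_device are reused from the main above). Note that for sizes other than 256, reduce_v5 would also need to size its shared buffer with the template parameter, i.e. __shared__ float sdata[blockSize] instead of sdata[BLOCK_SIZE]:

     switch (block_size) {
       case 512: reduce_v5<512><<<grid, block>>>(input_device, output_device); break;
       case 256: reduce_v5<256><<<grid, block>>>(input_device, output_device); break;
       case 128: reduce_v5<128><<<grid, block>>>(input_device, output_device); break;
       case 64:  reduce_v5<64><<<grid, block>>>(input_device, output_device); break;
       default:  break;  // unsupported block size
     }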
    
    

version 7: multiple adds (purpose)

  • fix the number of blocks and let each thread accumulate multiple elements

    code

    • main

    
     int main() {
        float *input_host = (float*)malloc(N*sizeof(float));
        float *input_device;
        cudaMalloc((void **)&input_device, N*sizeof(float));
        for (int i = 0; i < N; i++) input_host[i] = 2.0;
        cudaMemcpy(input_device, input_host, N*sizeof(float), cudaMemcpyHostToDevice);
    
        // fix the grid to 1024 blocks
        const int block_num = 1024;
        const int NUM_PER_BLOCK = N / block_num;
    
        // number of elements each thread must process
        const int NUM_PER_THREAD = NUM_PER_BLOCK / BLOCK_SIZE;
        float *output_host = (float*)malloc((block_num) * sizeof(float));
        float *output_device;
        cudaMalloc((void **)&output_device, (block_num) * sizeof(float));
          
        dim3 grid(block_num, 1);
        dim3 block(BLOCK_SIZE, 1);
        reduce_v6<BLOCK_SIZE ,NUM_PER_THREAD><<<grid, block>>>(input_device, output_device);
        cudaMemcpy(output_host, output_device, block_num * sizeof(float), cudaMemcpyDeviceToHost);
        return 0;
     }
    
        
     template <unsigned int blockSize>
     __device__ void warpReduce(volatile float* cache,int tid){
        if(blockSize >= 64)cache[tid]+=cache[tid+32];
        if(blockSize >= 32)cache[tid]+=cache[tid+16];
        if(blockSize >= 16)cache[tid]+=cache[tid+8];
        if(blockSize >= 8)cache[tid]+=cache[tid+4];
        if(blockSize >= 4)cache[tid]+=cache[tid+2];
        if(blockSize >= 2)cache[tid]+=cache[tid+1];
     }
    
     template <unsigned int blockSize, int NUM_PER_THREAD>
     __global__ void reduce_v6(float *g_idata,float *g_odata){
        __shared__ float sdata[BLOCK_SIZE];
    
        // each thread loads one element from global to shared mem
    
        unsigned int tid = threadIdx.x;
        unsigned int i = blockIdx.x*(blockDim.x * NUM_PER_THREAD) + threadIdx.x;
        sdata[tid] = 0;
        // each thread loads and accumulates NUM_PER_THREAD elements
        #pragma unroll
        for(int iter=0; iter<NUM_PER_THREAD; iter++){
            sdata[tid] += g_idata[i+iter*blockSize];
        }
        __syncthreads();
    
        // do reduction in shared mem
        if(blockSize>=512){
            if(tid<256){
                sdata[tid]+=sdata[tid+256];
            }
            __syncthreads();
        }
        if(blockSize>=256){
            if(tid<128){
                sdata[tid]+=sdata[tid+128];
            }
            __syncthreads();
        }
        if(blockSize>=128){
            if(tid<64){
                sdata[tid]+=sdata[tid+64];
            }
            __syncthreads();
        }
          
        // write result for this block to global mem
        if(tid<32)warpReduce<blockSize>(sdata,tid);
        if (tid == 0) g_odata[blockIdx.x] = sdata[0];
     }
    

elementwise

Original (naive) version


  int main(){
    half *x_host = (half*)malloc(N*sizeof(half));
    half *x_device;
    cudaMalloc((void **)&x_device, N*sizeof(half));
    for (int i = 0; i < N; i++) x_host[i] = 2.0;
    cudaMemcpy(x_device, x_host, N*sizeof(half), cudaMemcpyHostToDevice);

    half *y_host = (half*)malloc(N*sizeof(half));
    half *y_device;
    cudaMalloc((void **)&y_device, N*sizeof(half));
    for (int i = 0; i < N; i++) y_host[i] = 2.0;
    cudaMemcpy(y_device, y_host, N*sizeof(half), cudaMemcpyHostToDevice);

    half *output_host = (half*)malloc(N * sizeof(half));
    half *output_device;
    cudaMalloc((void **)&output_device, N * sizeof(half));

    // naive elementwise
    int32_t block_num = (N + kBlockSize - 1) / kBlockSize;
    dim3 grid(block_num, 1);
    dim3 block(kBlockSize, 1);
    mul<half><<<grid, block>>>(x_device, y_device, output_device);
    cudaMemcpy(output_host, output_device, N * sizeof(half), cudaMemcpyDeviceToHost);

    // elementwise template
    Binary(MultiplyFunctor<half>(), N, output_device, x_device, y_device);
    cudaMemcpy(output_host, output_device, N * sizeof(half), cudaMemcpyDeviceToHost);
    free(x_host);
    free(y_host);
    free(output_host);
    cudaFree(x_device);
    cudaFree(y_device);
    cudaFree(output_device);
    return 0;
  }
  
  template<typename T>
  __global__ void mul(T *x, T *y, T* z){
    // assumes the element count is an exact multiple of kBlockSize; otherwise pass n and guard with if (idx < n)
    int idx = threadIdx.x + blockIdx.x * blockDim.x;
    z[idx] = x[idx] * y[idx];
  }

  // explicit specialization for half (needs cuda_fp16.h for the half operator*)
  template<>
  __global__ void mul(half *x, half *y, half* z){
    int idx = threadIdx.x + blockIdx.x * blockDim.x;
    z[idx] = x[idx] * y[idx];
  }

Optimized version (packed/vectorized elementwise template)


  int main(){
    half *x_host = (half*)malloc(N*sizeof(half));
    half *x_device;
    cudaMalloc((void **)&x_device, N*sizeof(half));
    for (int i = 0; i < N; i++) x_host[i] = 2.0;
    cudaMemcpy(x_device, x_host, N*sizeof(half), cudaMemcpyHostToDevice);

    half *y_host = (half*)malloc(N*sizeof(half));
    half *y_device;
    cudaMalloc((void **)&y_device, N*sizeof(half));
    for (int i = 0; i < N; i++) y_host[i] = 2.0;
    cudaMemcpy(y_device, y_host, N*sizeof(half), cudaMemcpyHostToDevice);

    half *output_host = (half*)malloc(N * sizeof(half));
    half *output_device;
    cudaMalloc((void **)&output_device, N * sizeof(half));


    int32_t block_num = (N + kBlockSize - 1) / kBlockSize;
    dim3 grid(block_num, 1);
    dim3 block(kBlockSize, 1);

    // elementwise template
    Binary(MultiplyFunctor<half>(), N, output_device, x_device, y_device);
    cudaMemcpy(output_host, output_device, N * sizeof(half), cudaMemcpyDeviceToHost);
    free(x_host);
    free(y_host);
    free(output_host);
    cudaFree(x_device);
    cudaFree(y_device);
    cudaFree(output_device);
    return 0;
  }
 
  template<typename T>
  constexpr int PackSize() {
    return Min(kMaxPackBytes / sizeof(T), kMaxPackSize);
  }

  template<typename T, typename U, typename... Args>
  constexpr int PackSize() {
    return Min(PackSize<T>(), PackSize<U, Args...>());
  }

  template<typename T>
  class HasApply2 {
    typedef char one;
    struct two {
      char x[2];
    };

    template<typename C>
    static one test(decltype(&C::Apply2));
    template<typename C>
    static two test(...);

  public:
    enum { value = sizeof(test<T>(0)) == sizeof(char) };
  };

  // HasApply2 checks whether the Functor supports processing elements two at a time,
  // e.g. as half2. If the Functor defines Apply2, test matches the `one` overload,
  // sizeof returns sizeof(char), and value is true. half can be packed 8 at a time,
  // but there are half2-specific intrinsics such as __half22float2, so it pays to
  // process half elements in pairs. For a half2 element-wise op the corresponding
  // Functor therefore needs an Apply2 function; see the MultiplyFunctor<half>
  // specialization at the end of this listing for an example.
  template<int pack_size, typename FunctorT, typename R, typename... IN>
  __device__ typename std::enable_if<HasApply2<FunctorT>::value == true && pack_size % 2 == 0,
                                    Packed<R, pack_size>>::type
  ApplyPack(const FunctorT& functor, const Packed<IN, pack_size>... in) {
    Packed<R, pack_size> ret;
  #pragma unroll
    for (int j = 0; j < pack_size; j += 2) { functor.Apply2(ret.elem + j, (in.elem + j)...); }
    return ret;
  }

  template<int pack_size, typename FunctorT, typename R, typename... IN>
  __device__ typename std::enable_if<HasApply2<FunctorT>::value == false || pack_size % 2 != 0,
                                    Packed<R, pack_size>>::type
  ApplyPack(const FunctorT& functor, const Packed<IN, pack_size>... in) {
    Packed<R, pack_size> ret;
  #pragma unroll
    for (int j = 0; j < pack_size; ++j) { ret.elem[j] = functor((in.elem[j])...); }
    return ret;
  }

  template<int pack_size, typename FactoryT, typename R, typename... IN>
  __global__ void __launch_bounds__(kBlockSize)
      ApplyGeneric(FactoryT factory, int64_t n_pack, Packed<R, pack_size>* pack_r,
                  const Packed<IN, pack_size>*... pack_in, int64_t n_tail, R* tail_r,
                  const IN*... tail_in) {
    auto functor = factory();
    const int global_tid = blockIdx.x * kBlockSize + threadIdx.x;
    for (int64_t i = global_tid; i < n_pack; i += blockDim.x * gridDim.x) {
      pack_r[i] = ApplyPack<pack_size, decltype(functor), R, IN...>(functor, (pack_in[i])...);
    }
    if (global_tid < n_tail) { tail_r[global_tid] = functor((tail_in[global_tid])...); }
  }

  template<typename FunctorT>
  struct SimpleFactory {
    explicit SimpleFactory(FunctorT functor) : tpl(functor) {}
    __device__ FunctorT operator()() const { return tpl; }

  private:
    FunctorT tpl;
  };

  template<size_t pack_size>
  bool IsAlignedForPack() {
    return true;
  }

  template<size_t pack_size, typename T, typename... Args>
  bool IsAlignedForPack(const T* ptr, const Args*... others) {
    return reinterpret_cast<uintptr_t>(ptr) % sizeof(Pack<T, pack_size>) == 0
          && IsAlignedForPack<pack_size, Args...>(others...);
  }

  template<size_t pack_size, typename FactoryT, typename R, typename... IN>
  cudaError_t LaunchKernel(FactoryT factory, int64_t n, R* r, const IN*... in) {
    const int64_t n_pack = n / pack_size;
    const int64_t tail_offset = n_pack * pack_size;
    const int64_t n_tail = n - tail_offset;
    int num_blocks;
    {
      cudaError_t err = GetNumBlocks(n_pack, &num_blocks);
      if (err != cudaSuccess) { return err; }
    }
    ApplyGeneric<pack_size, FactoryT, R, IN...><<<num_blocks, kBlockSize, 0>>>(
        factory, n_pack, reinterpret_cast<Packed<R, pack_size>*>(r),
        (reinterpret_cast<const Packed<IN, pack_size>*>(in))..., n_tail, r + tail_offset,
        (in + tail_offset)...);
    return cudaPeekAtLastError();
  }

  template<typename FactoryT, typename R, typename... IN>
  struct GenericLauncher {
    static cudaError_t Launch(FactoryT factory, int64_t n, R* r, const IN*... in) {
      constexpr int max_pack_size = PackSize<R, IN...>();
      if (IsAlignedForPack<max_pack_size, R, IN...>(r, in...)) {
        return LaunchKernel<max_pack_size, FactoryT, R, IN...>(factory, n, r, in...);
      } else {
        return LaunchKernel<1, FactoryT, R, IN...>(factory, n, r, in...);
      }
    }
  };

  template<typename FactoryT, typename R, typename A>
  inline cudaError_t UnaryWithFactory(FactoryT factory, int64_t n, R* r, const A* a) {
    return GenericLauncher<FactoryT, R, A>::Launch(factory, n, r, a);
  }

  template<typename FunctorT, typename R, typename A>
  inline cudaError_t Unary(FunctorT functor, int64_t n, R* r, const A* a) {
    return UnaryWithFactory(SimpleFactory<FunctorT>(functor), n, r, a);
  }

  template<typename FactoryT, typename R, typename A, typename B>
  inline cudaError_t BinaryWithFactory(FactoryT factory, int64_t n, R* r, const A* a, const B* b) {
    return GenericLauncher<FactoryT, R, A, B>::Launch(factory, n, r, a, b);
  }

  template<typename FunctorT, typename R, typename A, typename B>
  inline cudaError_t Binary(FunctorT functor, int64_t n, R* r, const A* a, const B* b) {
    return BinaryWithFactory(SimpleFactory<FunctorT>(functor), n, r, a, b);
  }

  template<typename FactoryT, typename R, typename A, typename B, typename C>
  inline cudaError_t TernaryWithFactory(FactoryT factory, int64_t n, R* r, const A* a, const B* b,
                                        const C* c) {
    return GenericLauncher<FactoryT, R, A, B, C>::Launch(factory, n, r, a, b, c);
  }

  template<typename FunctorT, typename R, typename A, typename B, typename C>
  inline cudaError_t Ternary(FunctorT functor, int64_t n, R* r, const A* a, const B* b, const C* c) {
    return TernaryWithFactory(SimpleFactory<FunctorT>(functor), n, r, a, b, c);
  }

  template<typename T>
  struct MultiplyFunctor {
    __device__ T operator()(T x, T y) const {
      return x*y;
    }
  };

  template<>
  struct MultiplyFunctor<half> {
    __device__ half operator()(half x, half y) const {
      return x*y;
    }
  #if (__CUDA_ARCH__ >= 750 && CUDA_VERSION >= 11000)
    __device__ void Apply2(half* z, const half* x, const half* y) const {
      const half2 x2 = *(reinterpret_cast<const half2*>(x));
      const half2 y2 = *(reinterpret_cast<const half2*>(y));
      *reinterpret_cast<half2*>(z) = __hmul2(x2, y2);
    }
  #endif  // (__CUDA_ARCH__ >= 750 && CUDA_VERSION >= 11000)
  };
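
  • The listing above leaves out a few helpers (Packed, Pack, Min, the pack-size constants, GetNumBlocks). A minimal sketch of what they could look like; the exact constants and the grid-size heuristic are assumptions, not taken from the original:

  #include <algorithm>
  #include <cstdint>
  #include <cuda_runtime.h>

  constexpr int kBlockSize = 256;
  constexpr int kNumWaves = 32;           // target number of "waves" of blocks per SM (assumed)
  constexpr int kMaxPackBytes = 128 / 8;  // widest vectorized access: 128 bits
  constexpr int kMaxPackSize = 8;         // never pack more than 8 elements

  constexpr int Min(int a, int b) { return a < b ? a : b; }

  // pack_size elements of T, aligned so loads/stores can be vectorized
  template<typename T, int pack_size>
  struct alignas(sizeof(T) * pack_size) Packed {
    T elem[pack_size];
  };

  // IsAlignedForPack above checks alignment against Pack<T, pack_size>;
  // for this sketch it can simply share Packed's layout.
  template<typename T, size_t pack_size>
  using Pack = Packed<T, static_cast<int>(pack_size)>;

  // Pick a grid size large enough to saturate the device without oversubscribing it.
  inline cudaError_t GetNumBlocks(int64_t n, int* num_blocks) {
    int dev = 0;
    cudaError_t err = cudaGetDevice(&dev);
    if (err != cudaSuccess) { return err; }
    int sm_count = 0;
    err = cudaDeviceGetAttribute(&sm_count, cudaDevAttrMultiProcessorCount, dev);
    if (err != cudaSuccess) { return err; }
    int tpm = 0;
    err = cudaDeviceGetAttribute(&tpm, cudaDevAttrMaxThreadsPerMultiProcessor, dev);
    if (err != cudaSuccess) { return err; }
    *num_blocks = std::max<int64_t>(
        1, std::min<int64_t>((n + kBlockSize - 1) / kBlockSize,
                             int64_t(sm_count) * tpm / kBlockSize * kNumWaves));
    return cudaSuccess;
  }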

fast atomic add

atomic add

  • main

     int main(){
        half *x_host = (half*)malloc(N*sizeof(half));
        half *x_device;
        cudaMalloc((void **)&x_device, N*sizeof(half));
        for (int i = 0; i < N; i++) x_host[i] = 0.1;
        cudaMemcpy(x_device, x_host, N*sizeof(half), cudaMemcpyHostToDevice);
    
        half *y_host = (half*)malloc(N*sizeof(half));
        half *y_device;
        cudaMalloc((void **)&y_device, N*sizeof(half));
        for (int i = 0; i < N; i++) y_host[i] = 0.1;
        cudaMemcpy(y_device, y_host, N*sizeof(half), cudaMemcpyHostToDevice);
    
        half *output_host = (half*)malloc(sizeof(half));
        half *output_device;
        cudaMalloc((void **)&output_device, sizeof(half));
        cudaMemset(output_device, 0, sizeof(half));
    
        int32_t block_num = (N + kBlockSize - 1) / kBlockSize;
        dim3 grid(block_num, 1);
        dim3 block(kBlockSize, 1);
        dot<<<grid, block>>>(x_device, y_device, output_device, N);
        cudaMemcpy(output_host, output_device, sizeof(half), cudaMemcpyDeviceToHost);
        printf("%.6f\n", static_cast<double>(output_host[0]));
        free(x_host);
        free(y_host);
        free(output_host);
        cudaFree(x_device);
        cudaFree(y_device);
        cudaFree(output_device);
     }
    
    • cuda
        
     __global__ void dot(half* a, half* b, half* c, int n){
        const int nStep = gridDim.x * blockDim.x;
        half temp = 0.0;
        int gid = blockIdx.x * blockDim.x + threadIdx.x;
        while (gid < n) {
            temp = temp + a[gid] * b[gid];
            gid += nStep;
        }
        // atomicAdd on half requires compute capability 7.0 (sm_70) or higher
        atomicAdd(c, temp);
     }
    

half pack2

  • convert the half atomic add into a half2 atomic add
    • main
     int main(){
        half *x_host = (half*)malloc(N*sizeof(half));
        half *x_device;
        cudaMalloc((void **)&x_device, N*sizeof(half));
        for (int i = 0; i < N; i++) x_host[i] = 1.0;
        cudaMemcpy(x_device, x_host, N*sizeof(half), cudaMemcpyHostToDevice);
        Pack<half, 2>* x_pack = reinterpret_cast<Pack<half, 2>*>(x_device);
    
        half *y_host = (half*)malloc(N*sizeof(half));
        half *y_device;
        cudaMalloc((void **)&y_device, N*sizeof(half));
        for (int i = 0; i < N; i++) y_host[i] = 1.0;
        cudaMemcpy(y_device, y_host, N*sizeof(half), cudaMemcpyHostToDevice);
        Pack<half, 2>* y_pack = reinterpret_cast<Pack<half, 2>*>(y_device);
    
        half *output_host = (half*)malloc(2 * sizeof(half));
        half *output_device;
        cudaMalloc((void **)&output_device, 2 * sizeof(half));
        cudaMemset(output_device, 0, sizeof(half) * 2);
        Pack<half, 2>* output_pack = reinterpret_cast<Pack<half, 2>*>(output_device);
    
        int32_t block_num = (N + kBlockSize - 1) / kBlockSize;
        dim3 grid(block_num, 1);
        dim3 block(kBlockSize, 1);
        dot<half, 2><<<grid, block>>>(x_pack, y_pack, output_pack, N);
        cudaMemcpy(output_host, output_device, 2 * sizeof(half), cudaMemcpyDeviceToHost);
        printf("%.6f\n", static_cast<double>(output_host[0]));
        free(x_host);
        free(y_host);
        free(output_host);
        cudaFree(x_device);
        cudaFree(y_device);
        cudaFree(output_device);
     }
    
    • pack2
     template<typename T, size_t pack_size>
     struct alignas(sizeof(T) * pack_size) Pack {
       T elem[pack_size];
     };
    
    • dot
        
     template<typename T, int32_t pack_size>
     __global__ void dot(Pack<T, pack_size>* a, Pack<T, pack_size>* b, Pack<T, pack_size>* c, int n){
        const int nStep = gridDim.x * blockDim.x;
        T temp = 0.0;
        int gid = blockIdx.x * blockDim.x + threadIdx.x;
        while (gid < n / pack_size) {
            for (int i = 0; i < pack_size; i++) {
                temp = temp + a[gid].elem[i] * b[gid].elem[i];
            }
            gid += nStep;
        }
        AtomicAdd<T, pack_size>(c, temp);
     }
    
    • AtomicAdd primary template and its <half, 2> specialization (the primary template must be declared before the explicit specialization; note both lanes receive the same value, so the final sum appears in both output elements)

     // generic version: pack_size independent element-wise atomic adds
     template<typename T, int32_t pack_size>
     __device__ __inline__ void AtomicAdd(Pack<T, pack_size>* address,
                                        T val) {
     #pragma unroll
      for (int i = 0; i < pack_size; ++i) {
        atomicAdd(reinterpret_cast<T*>(address) + i, static_cast<T>(val));
      }
     }

     // specialization for <half, 2>: a single half2 atomic add covers both lanes
     template<>
     __device__ __inline__ void AtomicAdd<half, 2>(Pack<half, 2>* address, half val) {
      half2 h2_val;
      h2_val.x = static_cast<half>(val);
      h2_val.y = static_cast<half>(val);
      atomicAdd(reinterpret_cast<half2*>(address), h2_val);
     }
    

fast atomic add

  • main

    
     int main(){
        half *x_host = (half*)malloc(N*sizeof(half));
        half *x_device;
        cudaMalloc((void **)&x_device, N*sizeof(half));
        for (int i = 0; i < N; i++) x_host[i] = 0.1;
        cudaMemcpy(x_device, x_host, N*sizeof(half), cudaMemcpyHostToDevice);
    
        half *y_host = (half*)malloc(N*sizeof(half));
        half *y_device;
        cudaMalloc((void **)&y_device, N*sizeof(half));
        for (int i = 0; i < N; i++) y_host[i] = 0.1;
        cudaMemcpy(y_device, y_host, N*sizeof(half), cudaMemcpyHostToDevice);
    
        // allocate 2 halves so the half2-based FastAdd never writes past the buffer
        half *output_host = (half*)malloc(2 * sizeof(half));
        half *output_device;
        cudaMalloc((void **)&output_device, 2 * sizeof(half));
        cudaMemset(output_device, 0, 2 * sizeof(half));
    
        int32_t block_num = (N + kBlockSize - 1) / kBlockSize;
        dim3 grid(block_num, 1);
        dim3 block(kBlockSize, 1);
        dot<<<grid, block>>>(x_device, y_device, output_device, N);
        cudaMemcpy(output_host, output_device, sizeof(half), cudaMemcpyDeviceToHost);
        printf("%.6f\n", static_cast<double>(output_host[0]));
        free(x_host);
        free(y_host);
        free(output_host);
        cudaFree(x_device);
        cudaFree(y_device);
        cudaFree(output_device);
     }
    
    
    • dot
        
     __global__ void dot(half* a, half* b, half* c, int n){
        const int nStep = gridDim.x * blockDim.x;
        half temp = 0.0;
        int gid = blockIdx.x * blockDim.x + threadIdx.x;
        while (gid < n) {
            temp = temp + a[gid] * b[gid];
            gid += nStep;
        }
        // atomicAdd(c, temp);
        // c holds 2 halves (see main), so length 2 keeps the half2 fast path in bounds
        FastAdd(c, 0, 2, temp);
     }
    
    
    • implementation: pack the half value into a __half2 (zero in the other lane) so the add uses the natively supported half2 atomicAdd
       
     // FastAdd is referenced from
     // https://github.com/pytorch/pytorch/blob/396c3b1d88d7624938a2bb0b287f2a19f1e89bb4/aten/src/ATen/native/cuda/KernelUtils.cuh#L29
     // when T is half
     template<typename T, typename std::enable_if<std::is_same<half, T>::value>::type* = nullptr>
     __device__ __forceinline__ void FastSpecializedAtomicAdd(T* base, size_t offset,
                                                            const size_t length, T value) {
     #if ((defined(CUDA_VERSION) && (CUDA_VERSION < 10000)) \
        || (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ < 700)))
      atomicAdd(reinterpret_cast<half*>(base) + offset, static_cast<half>(value));
     #else
      // Accounts for the chance base falls on an odd 16 bit alignment (ie, not 32 bit aligned)
      __half* target_addr = reinterpret_cast<__half*>(base + offset);
      bool low_byte = (reinterpret_cast<std::uintptr_t>(target_addr) % sizeof(__half2) == 0);
      if (low_byte && offset < (length - 1)) {
        __half2 value2;
        value2.x = value;
        value2.y = __float2half_rz(0);
        atomicAdd(reinterpret_cast<__half2*>(target_addr), value2);
    
      } else if (!low_byte && offset > 0) {
        __half2 value2;
        value2.x = __float2half_rz(0);
        value2.y = value;
        atomicAdd(reinterpret_cast<__half2*>(target_addr - 1), value2);
    
      } else {
        atomicAdd(reinterpret_cast<__half*>(base) + offset, static_cast<__half>(value));
      }
     #endif
     }
     // when T is not half: fall back to the plain atomicAdd
     template<typename T, typename std::enable_if<!std::is_same<half, T>::value>::type* = nullptr>
     __device__ __forceinline__ void FastSpecializedAtomicAdd(T* base, size_t offset,
                                                            const size_t length, T value) {
      atomicAdd(base + offset, value);
     }
    
     template<class T>
     __device__ __forceinline__ void FastAdd(T* base, size_t offset, const size_t length, T value) {
      FastSpecializedAtomicAdd(base, offset, length, value);
     }
    
    

upsample nearest 2D

Implementation

  • 2x nearest-neighbor upsampling maps each input pixel to a 2x2 output block: the forward kernel writes that block as two Pack2X<T> stores (each store covers two horizontally adjacent output pixels), and the backward kernel gathers the four corresponding dy values per dx element

    
  int main(){
      float *input_host = (float*)malloc(N*sizeof(float));
      float *input_device;
      cudaMalloc((void **)&input_device, N*sizeof(float));
      for (int i = 0; i < N; i++) input_host[i] = 1.0;
      cudaMemcpy(input_device, input_host, N*sizeof(float), cudaMemcpyHostToDevice);

      float *output_host = (float*)malloc(N * 4 * sizeof(float));
      float *output_device;
      cudaMalloc((void **)&output_device, N * 4 * sizeof(float));
      
      dim3 grid(N / kBlockSize, 1);
      dim3 block(kBlockSize, 1);
      UpsampleNearest2D2XForward<<<grid, block>>>(N, input_device, 1024, 1024, output_device);
      cudaMemcpy(output_host, output_device, N * 4 * sizeof(float), cudaMemcpyDeviceToHost);
      for(int i = 0; i < 50; i++) {
          printf("%.5f\n", output_host[i]);
      }

      return 0;
   }
  #define CUDA_1D_KERNEL_LOOP(i, n)                                                                 \
    for (int32_t i = blockIdx.x * blockDim.x + threadIdx.x, step = blockDim.x * gridDim.x; i < (n); \
       i += step)
  template<typename T>
  struct alignas(2 * sizeof(T)) Pack2X {
    T x;
    T y;
  };
  template<typename T>
  __global__ void UpsampleNearest2D2XForward(const int32_t in_elem_cnt, const T* in_dptr,
                                            const int32_t in_height, const int32_t in_width,
                                            T* out_dptr) {
    const int32_t in_hw_size = in_width * in_height;
    CUDA_1D_KERNEL_LOOP(index, in_elem_cnt) {
      const T in_value = in_dptr[index];
      const int32_t nc_idx = index / in_hw_size;
      const int32_t hw_off = index - nc_idx * in_hw_size;
      const int32_t h = hw_off / in_width;
      const int32_t w = hw_off - h * in_width;
      Pack2X<T> out_value{in_value, in_value};
      Pack2X<T>* out_pack_dptr = reinterpret_cast<Pack2X<T>*>(out_dptr);
      out_pack_dptr[nc_idx * in_hw_size * 2 + h * 2 * in_width + w] = out_value;
      out_pack_dptr[nc_idx * in_hw_size * 2 + (h * 2 + 1) * in_width + w] = out_value;
    }
  }

  template<typename T>
  __global__ void UpsampleNearest2D2XBackward(const int32_t in_elem_cnt, const T* dy_dptr,
                                              const int32_t dx_height, const int32_t dx_width,
                                              T* dx_dptr) {
    const int32_t dx_hw_size = dx_height * dx_width;
    CUDA_1D_KERNEL_LOOP(index, in_elem_cnt) {
      T dx_value = 0.0;
      const int32_t nc_idx = index / dx_hw_size;
      const int32_t dx_hw_off = index - nc_idx * dx_hw_size;
      const int32_t dx_h = dx_hw_off / dx_width;
      const int32_t dx_w = dx_hw_off - dx_h * dx_width;
      const Pack2X<T>* dy_pack_dptr = reinterpret_cast<const Pack2X<T>*>(dy_dptr);
      const Pack2X<T> dy_pack_value1 =
          dy_pack_dptr[nc_idx * dx_hw_size * 2 + dx_h * 2 * dx_width + dx_w];
      const Pack2X<T> dy_pack_value2 =
          dy_pack_dptr[nc_idx * dx_hw_size * 2 + (dx_h * 2 + 1) * dx_width + dx_w];
      dx_value += dy_pack_value1.x;
      dx_value += dy_pack_value1.y;
      dx_value += dy_pack_value2.x;
      dx_value += dy_pack_value2.y;
      dx_dptr[index] = dx_value;
    }
  }

indexing

Algorithm

  template<typename T, typename IndexT>
  __global__ void index_add_cuda_kernel(const int64_t n, const T* input, const IndexT* index,
                                        const T* source, T* output, const int64_t stride,
                                        const int64_t source_dim, const int64_t delta,
                                        const float alpha) {
    // For x = flow.ones(5, 3)
    // source = flow.tensor([[1, 2, 3], [4, 5, 6], [7, 8, 9]], dtype=flow.float)
    // index = flow.tensor([0, 4, 2])
    // dim = 0
    // We have:
    // stride = 3
    // source_dim = 3
    // stride * source_dim = 9
    // alpha = 1.0
    // delta = 5 - 3 = 2

    // For i = 8
    // pre_index = i / stride_source_dim = 8 / 9 = 0
    // dim_index = i % stride_source_dim / stride = 8 % 9 / 3 = 2
    // source_dim_idx = index[dim_index] = index[2] = 2
    // output_index = i + (delta * pre_index + source_dim_idx - dim_index) * stride
    //              = 8 + (2 * 0 + 2 - 2) * 3 = 8
    // cuda::atomic::Add(output + output_index, static_cast<T>(alpha) * source[i]) =>
    // output[8] += 1.0 * 9, so output[8] becomes 10.0
    const int64_t stride_source_dim = stride * source_dim;
    CUDA_1D_KERNEL_LOOP(i, n) {
      int64_t pre_index = i / stride_source_dim;
      int64_t dim_index = (i - pre_index * stride_source_dim) / stride; 
      IndexT source_dim_idx = index[dim_index]; 
      int64_t output_index = i + (delta * pre_index + source_dim_idx - dim_index) * stride; 
      cuda::atomic::Add(output + output_index, static_cast<T>(alpha) * source[i]); 
    }
  }

Kernel registration macro


   #define REGISTER_INDEX_ADD_CUDA_KERNEL(dtype)                          \
    REGISTER_USER_KERNEL("index_add")                                    \
        .SetCreateFn<IndexAddGpuKernel<dtype>>()                         \
        .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA) \
                        && (user_op::HobDataType("output", 0) == GetDataType<dtype>::value));

   REGISTER_INDEX_ADD_CUDA_KERNEL(float)
   REGISTER_INDEX_ADD_CUDA_KERNEL(half)
   REGISTER_INDEX_ADD_CUDA_KERNEL(double)

Interface definition

  template<typename T>
  class IndexAddGpuKernel final : public user_op::OpKernel {
  public:
    IndexAddGpuKernel() = default;
    ~IndexAddGpuKernel() = default;

  private:
    using user_op::OpKernel::Compute;
    void Compute(user_op::KernelComputeContext* ctx) const override {
      const user_op::Tensor* input = ctx->Tensor4ArgNameAndIndex("input", 0);
      const user_op::Tensor* index = ctx->Tensor4ArgNameAndIndex("index", 0);
      const user_op::Tensor* source = ctx->Tensor4ArgNameAndIndex("source", 0);
      user_op::Tensor* output = ctx->Tensor4ArgNameAndIndex("output", 0);
      const int64_t dim = ctx->Attr<int64_t>("dim");
      const float alpha = ctx->Attr<float>("alpha");
      const ShapeView& input_shape = input->shape_view();
      const ShapeView& source_shape = source->shape_view();
      std::vector<int64_t> input_stride(input->stride().begin(), input->stride().end());
      const int64_t stride = input_stride[dim];
      const int64_t source_dim = source_shape.At(dim);
      const int64_t delta = input_shape.At(dim) - source_dim;
      DataType index_dtype = index->data_type();
      const int32_t n = source->shape_view().elem_cnt();
      Memcpy<DeviceType::kCUDA>(
          ctx->stream(), output->mut_dptr<void>(), input->dptr<void>(),
          input->shape_view().elem_cnt() * GetSizeOfDataType(input->data_type()));
      if (GetSizeOfDataType(index_dtype) == 4) {
        RUN_CUDA_KERNEL((index_add_cuda_kernel<T, int32_t>), ctx->stream(), n, n, input->dptr<T>(),
                        index->dptr<int32_t>(), source->dptr<T>(), output->mut_dptr<T>(), stride,
                        source_dim, delta, alpha);
      } else {
        RUN_CUDA_KERNEL((index_add_cuda_kernel<T, int64_t>), ctx->stream(), n, n, input->dptr<T>(),
                        index->dptr<int64_t>(), source->dptr<T>(), output->mut_dptr<T>(), stride,
                        source_dim, delta, alpha);
      }
    }
    bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; }
  };

convolve

  • main

    int main(int argc, char const *argv[])
    {
        thrust::host_vector<float> h_a(N_A);
        thrust::device_vector<float> d_a(N_A);
        thrust::host_vector<float> h_b(N_B);
        thrust::device_vector<float> d_b(N_B);
    
        size_t N = h_a.size() + h_b.size() - 1;                  // full convolution output length
        size_t L = pow( 2, static_cast<int>(log2(N - 1)) + 1 );  // next power of two >= N (used for the launch size)
    
        thrust::host_vector<float> h_result(N);
        thrust::device_vector<float> d_result(N);
    
        thrust::sequence(h_a.begin(), h_a.end());
        // thrust::sequence(h_b.begin(), h_b.end());
        thrust::sequence(h_b.rbegin(), h_b.rend());
          
        // get raw pointer for kernel
        float *raw_point_a = thrust::raw_pointer_cast( &d_a[0] );
        float *raw_point_b = thrust::raw_pointer_cast( &d_b[0] );
        float *raw_point_result = thrust::raw_pointer_cast( &d_result[0] );
    
        int numThreads = NUM_THREADS;
        int numBlocks = (L + numThreads - 1) / numThreads;
    
        cudaEvent_t start, stop;
        checkCudaErrors( cudaEventCreate(&start) );
        checkCudaErrors( cudaEventCreate(&stop) );
        cudaEventRecord(start);
    
        // copy a b to device
        thrust::copy(h_a.begin(), h_a.end(), d_a.begin());
        thrust::copy(h_b.begin(), h_b.end(), d_b.begin());
    
        // conv(raw_point_a, raw_point_b, raw_point_result, N_A, N_B, L, numBlocks, numThreads);
        conv2(raw_point_a, raw_point_b, raw_point_result, N_A, N_B, N, numBlocks, numThreads);
    
        thrust::copy(d_result.begin(), d_result.end(), h_result.begin());
    
        cudaEventRecord(stop);
        checkCudaErrors( cudaEventSynchronize(stop) );  // cudaThreadSynchronize is deprecated; wait on the stop event
        float time = 0;
        cudaEventElapsedTime(&time, start, stop);
    
        cout << "run times: " << time << " ms " << endl;
    
        // for (thrust::host_vector<float>::iterator i = h_result.begin(); i != h_result.end(); ++i)
        // {
        //     cout << *i << "\t";
        // }
        cout << endl;
    
        return 0;
    }
    
  • direct convolution implementation

    void conv(const float *ina, const float *inb, float *out, size_t len_a, size_t len_b, size_t len_out, size_t numBlocks, size_t numThreads)
    {
        conv_kernel<<<numBlocks, numThreads>>>(ina, inb, out, len_a, len_b, len_out);
    }
    
    // Direct calculation convolution
    __global__ void conv_kernel(const float *ina, const float *inb, float *out, size_t len_a, size_t len_b, size_t len_out)
    {
        const int tid = blockIdx.x * blockDim.x + threadIdx.x;
        if (tid >= len_out)
        {
            return;
        }
    
        float sum = 0.0f;
        for (int m = 0; m < len_b; ++m)
        {
            int k = tid - m;
            if (0 <= k && k < len_a)
            {
                sum += ina[k] * inb[m];
            }
        }
        out[tid] = sum;
    }
    
  • optimization: filter in constant memory, input tile in shared memory (a sketch of conv3_kernel follows the snippet below)

    
    void conv2(const float *ina, const float *inb, float *out, size_t len_a, size_t len_b, size_t len_out, size_t numBlocks, size_t numThreads)
    {   
        cudaMemcpyToSymbol(c_b, inb, len_b * sizeof(float));
        size_t sharedSize = numThreads * sizeof(float);
        conv3_kernel<<<numBlocks, numThreads, sharedSize>>>(ina, out, len_a, len_b, len_out);
    }
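
  • conv3_kernel is not included in the note; a hypothetical sketch of what conv2 might launch: the filter sits in constant memory (c_b and MAX_B are assumed names/sizes, with len_b <= MAX_B) and one block-sized tile of the input is staged in the dynamic shared memory allocated above:

    #define MAX_B 1024
    __constant__ float c_b[MAX_B];   // filter taps copied in by conv2 via cudaMemcpyToSymbol

    __global__ void conv3_kernel(const float *ina, float *out, size_t len_a, size_t len_b, size_t len_out)
    {
        extern __shared__ float s_a[];              // blockDim.x floats, see sharedSize in conv2
        const int tid = blockIdx.x * blockDim.x + threadIdx.x;

        // stage this block's slice of the input signal
        s_a[threadIdx.x] = (tid < len_a) ? ina[tid] : 0.0f;
        __syncthreads();

        if (tid >= len_out)
        {
            return;
        }

        const int block_start = blockIdx.x * blockDim.x;
        float sum = 0.0f;
        for (int m = 0; m < len_b; ++m)
        {
            int k = tid - m;
            if (k < 0 || k >= (int)len_a)
            {
                continue;
            }
            // reuse the shared tile when the sample falls inside it, otherwise read global memory
            int local = k - block_start;
            float a_val = (local >= 0 && local < blockDim.x) ? s_a[local] : ina[k];
            sum += a_val * c_b[m];
        }
        out[tid] = sum;
    }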
    

scan