algorithms
common algorithms
reduce
Purpose
- Reduce computes a single value, such as the sum, min, max, or average, over an array.
- Reduce is also called reduction: the input is folded step by step, so the output usually has a lower dimensionality than the input. (All kernels below only reduce within a block; see the host-side sketch of the final pass right after this list.)
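Every kernel in this section leaves one partial sum per block in g_odata, so a final pass over those partials is still required. A minimal host-side sketch of that last step (my addition, assuming the partials have been copied back into output_host as in the main functions below):
// Sum the per-block partial results on the CPU.
// With every input element set to 2.0f, the expected total is N * 2.0f.
float final_reduce_on_host(const float *output_host, int block_num) {
    double sum = 0.0;  // accumulate in double to limit rounding error
    for (int i = 0; i < block_num; i++) sum += output_host[i];
    return (float)sum;
}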
baseline & versions 1, 2: main (used for reduce_v0, reduce_v1, reduce_v2)
#include <cuda.h>
#include <cuda_runtime.h>
#include <time.h>
#define N 32*1024*1024
#define BLOCK_SIZE 256
int main() {
float *input_host = (float*)malloc(N*sizeof(float));
float *input_device;
cudaMalloc((void **)&input_device, N*sizeof(float));
for (int i = 0; i < N; i++) input_host[i] = 2.0;
cudaMemcpy(input_device, input_host, N*sizeof(float), cudaMemcpyHostToDevice);
int32_t block_num = (N + BLOCK_SIZE - 1) / BLOCK_SIZE;
float *output_host = (float*)malloc(block_num * sizeof(float));
float *output_device;
cudaMalloc((void **)&output_device, block_num * sizeof(float));
dim3 grid(block_num, 1);
dim3 block(BLOCK_SIZE, 1);
reduce_v0<<<grid, block>>>(input_device, output_device);
cudaMemcpy(output_host, output_device, block_num * sizeof(float), cudaMemcpyDeviceToHost);
return 0;
}
baseline
__global__ void reduce_v0(float *g_idata,float *g_odata){
__shared__ float sdata[BLOCK_SIZE];
// each thread loads one element from global to shared mem
unsigned int tid = threadIdx.x;
unsigned int i = blockIdx.x*blockDim.x + threadIdx.x;
sdata[tid] = g_idata[i];
__syncthreads();
// do reduction in shared mem
for(unsigned int s=1; s < blockDim.x; s *= 2) {
if (tid % (2*s) == 0) {
sdata[tid] += sdata[tid + s];
}
__syncthreads();
}
// write result for this block to global mem
if (tid == 0) g_odata[blockIdx.x] = sdata[0];
}
version 2: interleaved addressing (purpose)
- highly divergent warps are very inefficient
- operator % is very slow
code
__global__ void reduce_v1(float *g_idata, float *g_odata){
    __shared__ float sdata[BLOCK_SIZE];
    // each thread loads one element from global to shared mem
    unsigned int tid = threadIdx.x;
    unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
    sdata[tid] = g_idata[i];
    __syncthreads();
    // do reduction in shared mem
    for (unsigned int s = 1; s < blockDim.x; s *= 2) {
        // if (tid % (2*s) == 0) {
        //     sdata[tid] += sdata[tid + s];
        // }
        // replace the divergent modulo test with a strided index
        int index = 2 * s * tid;
        if (index < blockDim.x) {
            sdata[index] += sdata[index + s];
        }
        __syncthreads();
    }
    // write result for this block to global mem
    if (tid == 0) g_odata[blockIdx.x] = sdata[0];
}
version 3: bank conflict free (purpose)
- Avoid shared-memory bank conflicts: with the strided index of version 2, threads of a warp access words that map to the same bank and the accesses serialize; with sequential addressing (s starts at blockDim.x / 2 and is halved each step) a warp reads 32 consecutive words, one per bank. code
__global__ void reduce_v2(float *g_idata, float *g_odata){
    __shared__ float sdata[BLOCK_SIZE];
    // each thread loads one element from global to shared mem
    unsigned int tid = threadIdx.x;
    unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
    sdata[tid] = g_idata[i];
    __syncthreads();
    // do reduction in shared mem with sequential addressing
    for (unsigned int s = blockDim.x / 2; s > 0; s >>= 1) {
        if (tid < s) {
            sdata[tid] += sdata[tid + s];
        }
        __syncthreads();
    }
    // write result for this block to global mem
    if (tid == 0) g_odata[blockIdx.x] = sdata[0];
}
versions 4, 5 (reduce_v3, reduce_v4): main
int main() {
float *input_host = (float*)malloc(N*sizeof(float));
float *input_device;
cudaMalloc((void **)&input_device, N*sizeof(float));
for (int i = 0; i < N; i++) input_host[i] = 2.0;
cudaMemcpy(input_device, input_host, N*sizeof(float), cudaMemcpyHostToDevice);
// each block now reduces 2 * BLOCK_SIZE elements, so launch half as many blocks
int32_t block_num = (N + BLOCK_SIZE - 1) / BLOCK_SIZE / 2;
float *output_host = (float*)malloc((block_num) * sizeof(float));
float *output_device;
cudaMalloc((void **)&output_device, (block_num) * sizeof(float));
dim3 grid(block_num, 1);
dim3 block(BLOCK_SIZE, 1);
reduce_v3<<<grid, block>>>(input_device, output_device);
cudaMemcpy(output_host, output_device, block_num * sizeof(float), cudaMemcpyDeviceToHost);
return 0;
}
version 4: idle thread free (purpose)
- Half of the threads are idle on first loop iteration
- use the threads properly
- Fuse the two global loads into one addition during the load, so no threads sit idle. code
__global__ void reduce_v3(float *g_idata, float *g_odata){
    __shared__ float sdata[BLOCK_SIZE];
    // each thread loads two elements from global mem and adds them during the load
    unsigned int tid = threadIdx.x;
    unsigned int i = blockIdx.x * (blockDim.x * 2) + threadIdx.x;
    sdata[tid] = g_idata[i] + g_idata[i + blockDim.x];
    __syncthreads();
    // do reduction in shared mem
    for (unsigned int s = blockDim.x / 2; s > 0; s >>= 1) {
        if (tid < s) {
            sdata[tid] += sdata[tid + s];
        }
        __syncthreads();
    }
    // write result for this block to global mem
    if (tid == 0) g_odata[blockIdx.x] = sdata[0];
}
version 5: unroll last warp (purpose)
- When s <= 32, we have only one warp left
- We don’t need to __syncthreads()
- We don’t need if (tid < s) statement code
__device__ void warpReduce(volatile float* cache, unsigned int tid){
    cache[tid] += cache[tid + 32];
    cache[tid] += cache[tid + 16];
    cache[tid] += cache[tid + 8];
    cache[tid] += cache[tid + 4];
    cache[tid] += cache[tid + 2];
    cache[tid] += cache[tid + 1];
}

__global__ void reduce_v4(float *g_idata, float *g_odata){
    __shared__ float sdata[BLOCK_SIZE];
    // each thread loads two elements from global mem and adds them during the load
    unsigned int tid = threadIdx.x;
    unsigned int i = blockIdx.x * (blockDim.x * 2) + threadIdx.x;
    sdata[tid] = g_idata[i] + g_idata[i + blockDim.x];
    __syncthreads();
    // do reduction in shared mem until only one warp is left
    for (unsigned int s = blockDim.x / 2; s > 32; s >>= 1) {
        if (tid < s) {
            sdata[tid] += sdata[tid + s];
        }
        __syncthreads();
    }
    // last warp: no __syncthreads() needed (warp-synchronous, volatile)
    if (tid < 32) warpReduce(sdata, tid);
    // write result for this block to global mem
    if (tid == 0) g_odata[blockIdx.x] = sdata[0];
}
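A caveat for this trick: it relies on implicit warp-synchronous execution plus volatile, which is no longer guaranteed on Volta and newer GPUs with independent thread scheduling. A warp-shuffle version of the last-warp reduction avoids shared memory and volatile entirely; this is my sketch, not part of the original notes:
// each lane starts with its own partial sum in val; after the loop, lane 0 of
// the warp holds the sum of all 32 lanes
__device__ float warpReduceShfl(float val) {
    for (int offset = 16; offset > 0; offset >>= 1) {
        val += __shfl_down_sync(0xffffffff, val, offset);
    }
    return val;
}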
version 6: complete unroll (purpose)
- If we knew the number of iterations at compile time, we could completely unroll the reduction.
- use a block-size template parameter so the iteration count is known at compile time code
- main with the block size passed as a template argument (resolved at compile time)
int main() {
    float *input_host = (float*)malloc(N*sizeof(float));
    float *input_device;
    cudaMalloc((void **)&input_device, N*sizeof(float));
    for (int i = 0; i < N; i++) input_host[i] = 2.0;
    cudaMemcpy(input_device, input_host, N*sizeof(float), cudaMemcpyHostToDevice);
    int32_t block_num = (N + BLOCK_SIZE - 1) / BLOCK_SIZE / 2;
    float *output_host = (float*)malloc(block_num * sizeof(float));
    float *output_device;
    cudaMalloc((void **)&output_device, block_num * sizeof(float));
    dim3 grid(block_num, 1);
    dim3 block(BLOCK_SIZE, 1);
    // launch with the block size as a template argument
    reduce_v5<BLOCK_SIZE><<<grid, block>>>(input_device, output_device);
    cudaMemcpy(output_host, output_device, block_num * sizeof(float), cudaMemcpyDeviceToHost);
    return 0;
}
- after compile-time evaluation the dead if branches are removed
template <unsigned int blockSize>
__device__ void warpReduce(volatile float* cache, int tid){
    if (blockSize >= 64) cache[tid] += cache[tid + 32];
    if (blockSize >= 32) cache[tid] += cache[tid + 16];
    if (blockSize >= 16) cache[tid] += cache[tid + 8];
    if (blockSize >= 8) cache[tid] += cache[tid + 4];
    if (blockSize >= 4) cache[tid] += cache[tid + 2];
    if (blockSize >= 2) cache[tid] += cache[tid + 1];
}

template <unsigned int blockSize>
__global__ void reduce_v5(float *g_idata, float *g_odata){
    __shared__ float sdata[BLOCK_SIZE];
    // each thread loads two elements from global mem and adds them during the load
    unsigned int tid = threadIdx.x;
    unsigned int i = blockIdx.x * (blockDim.x * 2) + threadIdx.x;
    sdata[tid] = g_idata[i] + g_idata[i + blockDim.x];
    __syncthreads();
    // do reduction in shared mem, fully unrolled on blockSize
    if (blockSize >= 512) {
        if (tid < 256) { sdata[tid] += sdata[tid + 256]; }
        __syncthreads();
    }
    if (blockSize >= 256) {
        if (tid < 128) { sdata[tid] += sdata[tid + 128]; }
        __syncthreads();
    }
    if (blockSize >= 128) {
        if (tid < 64) { sdata[tid] += sdata[tid + 64]; }
        __syncthreads();
    }
    if (tid < 32) warpReduce<blockSize>(sdata, tid);
    // write result for this block to global mem
    if (tid == 0) g_odata[blockIdx.x] = sdata[0];
}
version 7: multiple adds (purpose)
- Fix the grid to a constant number of blocks and have each thread accumulate several input elements before the shared-memory reduction. code
main
int main() {
    float *input_host = (float*)malloc(N*sizeof(float));
    float *input_device;
    cudaMalloc((void **)&input_device, N*sizeof(float));
    for (int i = 0; i < N; i++) input_host[i] = 2.0;
    cudaMemcpy(input_device, input_host, N*sizeof(float), cudaMemcpyHostToDevice);
    // fix the grid to 1024 blocks
    const int block_num = 1024;
    const int NUM_PER_BLOCK = N / block_num;
    // number of elements each thread has to process
    const int NUM_PER_THREAD = NUM_PER_BLOCK / BLOCK_SIZE;
    float *output_host = (float*)malloc(block_num * sizeof(float));
    float *output_device;
    cudaMalloc((void **)&output_device, block_num * sizeof(float));
    dim3 grid(block_num, 1);
    dim3 block(BLOCK_SIZE, 1);
    reduce_v6<BLOCK_SIZE, NUM_PER_THREAD><<<grid, block>>>(input_device, output_device);
    cudaMemcpy(output_host, output_device, block_num * sizeof(float), cudaMemcpyDeviceToHost);
    return 0;
}
template <unsigned int blockSize>
__device__ void warpReduce(volatile float* cache, int tid){
    if (blockSize >= 64) cache[tid] += cache[tid + 32];
    if (blockSize >= 32) cache[tid] += cache[tid + 16];
    if (blockSize >= 16) cache[tid] += cache[tid + 8];
    if (blockSize >= 8) cache[tid] += cache[tid + 4];
    if (blockSize >= 4) cache[tid] += cache[tid + 2];
    if (blockSize >= 2) cache[tid] += cache[tid + 1];
}

template <unsigned int blockSize, int NUM_PER_THREAD>
__global__ void reduce_v6(float *g_idata, float *g_odata){
    __shared__ float sdata[BLOCK_SIZE];
    unsigned int tid = threadIdx.x;
    unsigned int i = blockIdx.x * (blockDim.x * NUM_PER_THREAD) + threadIdx.x;
    sdata[tid] = 0;
    // each thread loads and accumulates NUM_PER_THREAD elements
#pragma unroll
    for (int iter = 0; iter < NUM_PER_THREAD; iter++) {
        sdata[tid] += g_idata[i + iter * blockSize];
    }
    __syncthreads();
    // do reduction in shared mem, fully unrolled on blockSize
    if (blockSize >= 512) {
        if (tid < 256) { sdata[tid] += sdata[tid + 256]; }
        __syncthreads();
    }
    if (blockSize >= 256) {
        if (tid < 128) { sdata[tid] += sdata[tid + 128]; }
        __syncthreads();
    }
    if (blockSize >= 128) {
        if (tid < 64) { sdata[tid] += sdata[tid + 64]; }
        __syncthreads();
    }
    if (tid < 32) warpReduce<blockSize>(sdata, tid);
    // write result for this block to global mem
    if (tid == 0) g_odata[blockIdx.x] = sdata[0];
}
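After this kernel, g_odata still holds block_num (here 1024) partial sums. One way to finish is a second pass that reuses the same kernel; final_device is a hypothetical one-float buffer, and the numbers assume BLOCK_SIZE = 256:
// second pass: 1024 partials, one block of 256 threads, 4 elements per thread
reduce_v6<BLOCK_SIZE, 1024 / BLOCK_SIZE><<<1, BLOCK_SIZE>>>(output_device, final_device);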
elementwise
Original (naive) version
int main(){
half *x_host = (half*)malloc(N*sizeof(half));
half *x_device;
cudaMalloc((void **)&x_device, N*sizeof(half));
for (int i = 0; i < N; i++) x_host[i] = 2.0;
cudaMemcpy(x_device, x_host, N*sizeof(half), cudaMemcpyHostToDevice);
half *y_host = (half*)malloc(N*sizeof(half));
half *y_device;
cudaMalloc((void **)&y_device, N*sizeof(half));
for (int i = 0; i < N; i++) y_host[i] = 2.0;
cudaMemcpy(y_device, y_host, N*sizeof(half), cudaMemcpyHostToDevice);
half *output_host = (half*)malloc(N * sizeof(half));
half *output_device;
cudaMalloc((void **)&output_device, N * sizeof(half));
// naive elementwise
int32_t block_num = (N + kBlockSize - 1) / kBlockSize;
dim3 grid(block_num, 1);
dim3 block(kBlockSize, 1);
mul<half><<<grid, block>>>(x_device, y_device, output_device);
cudaMemcpy(output_host, output_device, N * sizeof(half), cudaMemcpyDeviceToHost);
// elementwise template
Binary(MultiplyFunctor<half>(), N, output_device, x_device, y_device);
cudaMemcpy(output_host, output_device, N * sizeof(half), cudaMemcpyDeviceToHost);
free(x_host);
free(y_host);
free(output_host);
cudaFree(x_device);
cudaFree(y_device);
cudaFree(output_device);
return 0;
}
template<typename T>
__global__ void mul(T *x, T *y, T* z){
int idx = threadIdx.x + blockIdx.x * blockDim.x;
z[idx] = x[idx] * y[idx];
}
// specialization for half: use the __hmul intrinsic
template<>
__global__ void mul(half *x, half *y, half* z){
int idx = threadIdx.x + blockIdx.x * blockDim.x;
z[idx] = __hmul(x[idx], y[idx]);
}
Optimized version (packed elementwise template)
int main(){
half *x_host = (half*)malloc(N*sizeof(half));
half *x_device;
cudaMalloc((void **)&x_device, N*sizeof(half));
for (int i = 0; i < N; i++) x_host[i] = 2.0;
cudaMemcpy(x_device, x_host, N*sizeof(half), cudaMemcpyHostToDevice);
half *y_host = (half*)malloc(N*sizeof(half));
half *y_device;
cudaMalloc((void **)&y_device, N*sizeof(half));
for (int i = 0; i < N; i++) y_host[i] = 2.0;
cudaMemcpy(y_device, y_host, N*sizeof(half), cudaMemcpyHostToDevice);
half *output_host = (half*)malloc(N * sizeof(half));
half *output_device;
cudaMalloc((void **)&output_device, N * sizeof(half));
int32_t block_num = (N + kBlockSize - 1) / kBlockSize;
dim3 grid(block_num, 1);
dim3 block(kBlockSize, 1);
// elementwise template
Binary(MultiplyFunctor<half>(), N, output_device, x_device, y_device);
cudaMemcpy(output_host, output_device, N * sizeof(half), cudaMemcpyDeviceToHost);
free(x_host);
free(y_host);
free(output_host);
cudaFree(x_device);
cudaFree(y_device);
cudaFree(output_device);
return 0;
}
template<typename T>
constexpr int PackSize() {
return Min(kMaxPackBytes / sizeof(T), kMaxPackSize);
}
template<typename T, typename U, typename... Args>
constexpr int PackSize() {
return Min(PackSize<T>(), PackSize<U, Args...>());
}
template<typename T>
class HasApply2 {
typedef char one;
struct two {
char x[2];
};
template<typename C>
static one test(decltype(&C::Apply2));
template<typename C>
static two test(...);
public:
enum { value = sizeof(test<T>(0)) == sizeof(char) };
};
// HasApply2 detects whether a functor can process elements two at a time, e.g. as half2. If the functor defines Apply2, test<C>(0) matches the `one` overload, sizeof the result equals sizeof(char), and value is true. half can be packed 8 at a time, but there are half2-specific operations such as __half22float2, so the work can also be done two by two. For half2 element-wise operations the corresponding functor therefore needs an Apply2 member; for the Cast operation, for example, the functor is defined as follows:
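// The Cast functor itself is not reproduced in these notes; the following is a
// sketch from memory, so treat the exact names and details as approximate.
template<typename To, typename From>
struct CastFunctor {
  __device__ To operator()(From from) const { return static_cast<To>(from); }
};

template<typename To>
struct CastFunctor<To, half> {
  __device__ To operator()(half from) const { return static_cast<To>(static_cast<float>(from)); }
  // Apply2: convert two halves at once through __half22float2
  __device__ void Apply2(To* to, const half* from) const {
    const float2 f2 = __half22float2(*reinterpret_cast<const half2*>(from));
    to[0] = static_cast<To>(f2.x);
    to[1] = static_cast<To>(f2.y);
  }
};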
template<int pack_size, typename FunctorT, typename R, typename... IN>
__device__ typename std::enable_if<HasApply2<FunctorT>::value == true && pack_size % 2 == 0,
Packed<R, pack_size>>::type
ApplyPack(const FunctorT& functor, const Packed<IN, pack_size>... in) {
Packed<R, pack_size> ret;
#pragma unroll
for (int j = 0; j < pack_size; j += 2) { functor.Apply2(ret.elem + j, (in.elem + j)...); }
return ret;
}
template<int pack_size, typename FunctorT, typename R, typename... IN>
__device__ typename std::enable_if<HasApply2<FunctorT>::value == false || pack_size % 2 != 0,
Packed<R, pack_size>>::type
ApplyPack(const FunctorT& functor, const Packed<IN, pack_size>... in) {
Packed<R, pack_size> ret;
#pragma unroll
for (int j = 0; j < pack_size; ++j) { ret.elem[j] = functor((in.elem[j])...); }
return ret;
}
template<int pack_size, typename FactoryT, typename R, typename... IN>
__global__ void __launch_bounds__(kBlockSize)
ApplyGeneric(FactoryT factory, int64_t n_pack, Packed<R, pack_size>* pack_r,
const Packed<IN, pack_size>*... pack_in, int64_t n_tail, R* tail_r,
const IN*... tail_in) {
auto functor = factory();
const int global_tid = blockIdx.x * kBlockSize + threadIdx.x;
for (int64_t i = global_tid; i < n_pack; i += blockDim.x * gridDim.x) {
pack_r[i] = ApplyPack<pack_size, decltype(functor), R, IN...>(functor, (pack_in[i])...);
}
if (global_tid < n_tail) { tail_r[global_tid] = functor((tail_in[global_tid])...); }
}
template<typename FunctorT>
struct SimpleFactory {
explicit SimpleFactory(FunctorT functor) : tpl(functor) {}
__device__ FunctorT operator()() const { return tpl; }
private:
FunctorT tpl;
};
template<size_t pack_size>
bool IsAlignedForPack() {
return true;
}
template<size_t pack_size, typename T, typename... Args>
bool IsAlignedForPack(const T* ptr, const Args*... others) {
return reinterpret_cast<uintptr_t>(ptr) % sizeof(Pack<T, pack_size>) == 0
&& IsAlignedForPack<pack_size, Args...>(others...);
}
template<size_t pack_size, typename FactoryT, typename R, typename... IN>
cudaError_t LaunchKernel(FactoryT factory, int64_t n, R* r, const IN*... in) {
const int64_t n_pack = n / pack_size;
const int64_t tail_offset = n_pack * pack_size;
const int64_t n_tail = n - tail_offset;
int num_blocks;
{
cudaError_t err = GetNumBlocks(n_pack, &num_blocks);
if (err != cudaSuccess) { return err; }
}
ApplyGeneric<pack_size, FactoryT, R, IN...><<<num_blocks, kBlockSize, 0>>>(
factory, n_pack, reinterpret_cast<Packed<R, pack_size>*>(r),
(reinterpret_cast<const Packed<IN, pack_size>*>(in))..., n_tail, r + tail_offset,
(in + tail_offset)...);
return cudaPeekAtLastError();
}
template<typename FactoryT, typename R, typename... IN>
struct GenericLauncher {
static cudaError_t Launch(FactoryT factory, int64_t n, R* r, const IN*... in) {
constexpr int max_pack_size = PackSize<R, IN...>();
if (IsAlignedForPack<max_pack_size, R, IN...>(r, in...)) {
return LaunchKernel<max_pack_size, FactoryT, R, IN...>(factory, n, r, in...);
} else {
return LaunchKernel<1, FactoryT, R, IN...>(factory, n, r, in...);
}
}
};
template<typename FactoryT, typename R, typename A>
inline cudaError_t UnaryWithFactory(FactoryT factory, int64_t n, R* r, const A* a) {
return GenericLauncher<FactoryT, R, A>::Launch(factory, n, r, a);
}
template<typename FunctorT, typename R, typename A>
inline cudaError_t Unary(FunctorT functor, int64_t n, R* r, const A* a) {
return UnaryWithFactory(SimpleFactory<FunctorT>(functor), n, r, a);
}
template<typename FactoryT, typename R, typename A, typename B>
inline cudaError_t BinaryWithFactory(FactoryT factory, int64_t n, R* r, const A* a, const B* b) {
return GenericLauncher<FactoryT, R, A, B>::Launch(factory, n, r, a, b);
}
template<typename FunctorT, typename R, typename A, typename B>
inline cudaError_t Binary(FunctorT functor, int64_t n, R* r, const A* a, const B* b) {
return BinaryWithFactory(SimpleFactory<FunctorT>(functor), n, r, a, b);
}
template<typename FactoryT, typename R, typename A, typename B, typename C>
inline cudaError_t TernaryWithFactory(FactoryT factory, int64_t n, R* r, const A* a, const B* b,
const C* c) {
return GenericLauncher<FactoryT, R, A, B, C>::Launch(factory, n, r, a, b, c);
}
template<typename FunctorT, typename R, typename A, typename B, typename C>
inline cudaError_t Ternary(FunctorT functor, int64_t n, R* r, const A* a, const B* b, const C* c) {
return TernaryWithFactory(SimpleFactory<FunctorT>(functor), n, r, a, b, c);
}
template<typename T>
struct MultiplyFunctor {
__device__ T operator()(T x, T y) const {
return x*y;
}
};
template<>
struct MultiplyFunctor<half> {
__device__ half operator()(half x, half y) const {
return x*y;
}
#if (__CUDA_ARCH__ >= 750 && CUDA_VERSION >= 11000)
__device__ void Apply2(half* z, const half* x, const half* y) const {
const half2 x2 = *(reinterpret_cast<const half2*>(x));
const half2 y2 = *(reinterpret_cast<const half2*>(y));
*reinterpret_cast<half2*>(z) = __hmul2(x2, y2);
}
#endif // (__CUDA_ARCH__ >= 750 && CUDA_VERSION >= 11000)
};
fast atomic add
atomic add
- Baseline: each thread accumulates a partial dot product in a grid-stride loop, then issues one half atomicAdd into the result.
main
int main(){
    half *x_host = (half*)malloc(N*sizeof(half));
    half *x_device;
    cudaMalloc((void **)&x_device, N*sizeof(half));
    for (int i = 0; i < N; i++) x_host[i] = 0.1;
    cudaMemcpy(x_device, x_host, N*sizeof(half), cudaMemcpyHostToDevice);
    half *y_host = (half*)malloc(N*sizeof(half));
    half *y_device;
    cudaMalloc((void **)&y_device, N*sizeof(half));
    for (int i = 0; i < N; i++) y_host[i] = 0.1;
    cudaMemcpy(y_device, y_host, N*sizeof(half), cudaMemcpyHostToDevice);
    half *output_host = (half*)malloc(sizeof(half));
    half *output_device;
    cudaMalloc((void **)&output_device, sizeof(half));
    cudaMemset(output_device, 0, sizeof(half));   // pass the device pointer itself, not its address
    int32_t block_num = (N + kBlockSize - 1) / kBlockSize;
    dim3 grid(block_num, 1);
    dim3 block(kBlockSize, 1);
    dot<<<grid, block>>>(x_device, y_device, output_device, N);
    cudaMemcpy(output_host, output_device, sizeof(half), cudaMemcpyDeviceToHost);
    printf("%.6f\n", static_cast<double>(output_host[0]));
    free(x_host);
    free(y_host);
    free(output_host);
    cudaFree(x_device);
    cudaFree(y_device);
    cudaFree(output_device);
}
- dot kernel
__global__ void dot(half* a, half* b, half* c, int n){
    const int nStep = gridDim.x * blockDim.x;
    half temp = 0.0;
    int gid = blockIdx.x * blockDim.x + threadIdx.x;
    // grid-stride loop: each thread accumulates a partial dot product
    while (gid < n) {
        temp = temp + a[gid] * b[gid];
        gid += nStep;
    }
    // one atomic add per thread; atomicAdd on half requires compute capability 7.0+
    atomicAdd(c, temp);
}
half pack2
- Convert the half atomic add into a half2 atomic add: each partial is added to both lanes, so elem[0] of the packed result already holds the full dot product.
- main
int main(){
    half *x_host = (half*)malloc(N*sizeof(half));
    half *x_device;
    cudaMalloc((void **)&x_device, N*sizeof(half));
    for (int i = 0; i < N; i++) x_host[i] = 1.0;
    cudaMemcpy(x_device, x_host, N*sizeof(half), cudaMemcpyHostToDevice);
    Pack<half, 2>* x_pack = reinterpret_cast<Pack<half, 2>*>(x_device);
    half *y_host = (half*)malloc(N*sizeof(half));
    half *y_device;
    cudaMalloc((void **)&y_device, N*sizeof(half));
    for (int i = 0; i < N; i++) y_host[i] = 1.0;
    cudaMemcpy(y_device, y_host, N*sizeof(half), cudaMemcpyHostToDevice);
    Pack<half, 2>* y_pack = reinterpret_cast<Pack<half, 2>*>(y_device);
    half *output_host = (half*)malloc(2 * sizeof(half));
    half *output_device;
    cudaMalloc((void **)&output_device, 2 * sizeof(half));
    cudaMemset(output_device, 0, sizeof(half) * 2);   // pass the device pointer itself, not its address
    Pack<half, 2>* output_pack = reinterpret_cast<Pack<half, 2>*>(output_device);
    int32_t block_num = (N + kBlockSize - 1) / kBlockSize;
    dim3 grid(block_num, 1);
    dim3 block(kBlockSize, 1);
    dot<half, 2><<<grid, block>>>(x_pack, y_pack, output_pack, N);
    cudaMemcpy(output_host, output_device, 2 * sizeof(half), cudaMemcpyDeviceToHost);
    printf("%.6f\n", static_cast<double>(output_host[0]));
    free(x_host);
    free(y_host);
    free(output_host);
    cudaFree(x_device);
    cudaFree(y_device);
    cudaFree(output_device);
}
- pack2
template<typename T, size_t pack_size>
struct alignas(sizeof(T) * pack_size) Pack {
    T elem[pack_size];
};
- dot
template<typename T, int32_t pack_size>
__global__ void dot(Pack<T, pack_size>* a, Pack<T, pack_size>* b, Pack<T, pack_size>* c, int n){
    const int nStep = gridDim.x * blockDim.x;
    T temp = 0.0;
    int gid = blockIdx.x * blockDim.x + threadIdx.x;
    while (gid < n / pack_size) {
        for (int i = 0; i < pack_size; i++) {
            temp = temp + a[gid].elem[i] * b[gid].elem[i];
        }
        gid += nStep;
    }
    AtomicAdd<T, pack_size>(c, temp);
}
- AtomicAdd primary template and its half2 specialization
// primary template: fall back to pack_size scalar atomic adds
template<typename T, int32_t pack_size>
__device__ __inline__ void AtomicAdd(Pack<T, pack_size>* address, T val) {
#pragma unroll
    for (int i = 0; i < pack_size; ++i) {
        atomicAdd(reinterpret_cast<T*>(address) + i, static_cast<T>(val));
    }
}

// specialization: a single half2 atomic add covers both lanes
template<>
__device__ __inline__ void AtomicAdd<half, 2>(Pack<half, 2>* address, half val) {
    half2 h2_val;
    h2_val.x = static_cast<half>(val);
    h2_val.y = static_cast<half>(val);
    atomicAdd(reinterpret_cast<half2*>(address), h2_val);
}
fast atomic add
- FastAdd (borrowed from PyTorch): replace the half atomic add with an aligned half2 atomic add whose unused lane is zero. The main is the same as in the plain atomic add section above; only the dot kernel changes, and the output buffer should hold at least two halves since a half2 may be written.
- dot
__global__ void dot(half* a, half* b, half* c, int n){
    const int nStep = gridDim.x * blockDim.x;
    half temp = 0.0;
    int gid = blockIdx.x * blockDim.x + threadIdx.x;
    while (gid < n) {
        temp = temp + a[gid] * b[gid];
        gid += nStep;
    }
    // atomicAdd(c, temp);
    // FastAdd may issue a half2 atomic, so the buffer behind c should hold at least 2 halves
    FastAdd(c, 0, N, temp);
}
- Implementation
// FastAdd is referenced from
// https://github.com/pytorch/pytorch/blob/396c3b1d88d7624938a2bb0b287f2a19f1e89bb4/aten/src/ATen/native/cuda/KernelUtils.cuh#L29
// overload selected when T is half
template<typename T, typename std::enable_if<std::is_same<half, T>::value>::type* = nullptr>
__device__ __forceinline__ void FastSpecializedAtomicAdd(T* base, size_t offset,
                                                         const size_t length, T value) {
#if ((defined(CUDA_VERSION) && (CUDA_VERSION < 10000)) \
     || (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ < 700)))
    atomicAdd(reinterpret_cast<half*>(base) + offset, static_cast<half>(value));
#else
    // Accounts for the chance base falls on an odd 16 bit alignment (ie, not 32 bit aligned)
    __half* target_addr = reinterpret_cast<__half*>(base + offset);
    bool low_byte = (reinterpret_cast<std::uintptr_t>(target_addr) % sizeof(__half2) == 0);
    if (low_byte && offset < (length - 1)) {
        __half2 value2;
        value2.x = value;
        value2.y = __float2half_rz(0);
        atomicAdd(reinterpret_cast<__half2*>(target_addr), value2);
    } else if (!low_byte && offset > 0) {
        __half2 value2;
        value2.x = __float2half_rz(0);
        value2.y = value;
        atomicAdd(reinterpret_cast<__half2*>(target_addr - 1), value2);
    } else {
        atomicAdd(reinterpret_cast<__half*>(base) + offset, static_cast<__half>(value));
    }
#endif
}

// overload selected when T is not half
template<typename T, typename std::enable_if<!std::is_same<half, T>::value>::type* = nullptr>
__device__ __forceinline__ void FastSpecializedAtomicAdd(T* base, size_t offset,
                                                         const size_t length, T value) {
    atomicAdd(base + offset, value);
}

template<class T>
__device__ __forceinline__ void FastAdd(T* base, size_t offset, const size_t length, T value) {
    FastSpecializedAtomicAdd(base, offset, length, value);
}
upsample nearest 2D
Implementation
int main(){
float *input_host = (float*)malloc(N*sizeof(float));
float *input_device;
cudaMalloc((void **)&input_device, N*sizeof(float));
for (int i = 0; i < N; i++) input_host[i] = 1.0;
cudaMemcpy(input_device, input_host, N*sizeof(float), cudaMemcpyHostToDevice);
float *output_host = (float*)malloc(N * 4 * sizeof(float));
float *output_device;
cudaMalloc((void **)&output_device, N * 4 * sizeof(float));
dim3 grid(N / kBlockSize, 1);
dim3 block(kBlockSize, 1);
UpsampleNearest2D2XForward<<<grid, block>>>(N, input_device, 1024, 1024, output_device);
cudaMemcpy(output_host, output_device, N * 4 * sizeof(float), cudaMemcpyDeviceToHost);
for(int i = 0; i < 50; i++) {
printf("%.5f\n", output_host[i]);
}
return 0;
}
#define CUDA_1D_KERNEL_LOOP(i, n) \
for (int32_t i = blockIdx.x * blockDim.x + threadIdx.x, step = blockDim.x * gridDim.x; i < (n); \
i += step)
template<typename T>
struct alignas(2 * sizeof(T)) Pack2X {
T x;
T y;
};
template<typename T>
__global__ void UpsampleNearest2D2XForward(const int32_t in_elem_cnt, const T* in_dptr,
const int32_t in_height, const int32_t in_width,
T* out_dptr) {
const int32_t in_hw_size = in_width * in_height;
CUDA_1D_KERNEL_LOOP(index, in_elem_cnt) {
const T in_value = in_dptr[index];
const int32_t nc_idx = index / in_hw_size;
const int32_t hw_off = index - nc_idx * in_hw_size;
const int32_t h = hw_off / in_width;
const int32_t w = hw_off - h * in_width;
Pack2X<T> out_value{in_value, in_value};
Pack2X<T>* out_pack_dptr = reinterpret_cast<Pack2X<T>*>(out_dptr);
out_pack_dptr[nc_idx * in_hw_size * 2 + h * 2 * in_width + w] = out_value;
out_pack_dptr[nc_idx * in_hw_size * 2 + (h * 2 + 1) * in_width + w] = out_value;
}
}
template<typename T>
__global__ void UpsampleNearest2D2XBackward(const int32_t in_elem_cnt, const T* dy_dptr,
const int32_t dx_height, const int32_t dx_width,
T* dx_dptr) {
const int32_t dx_hw_size = dx_height * dx_width;
CUDA_1D_KERNEL_LOOP(index, in_elem_cnt) {
T dx_value = 0.0;
const int32_t nc_idx = index / dx_hw_size;
const int32_t dx_hw_off = index - nc_idx * dx_hw_size;
const int32_t dx_h = dx_hw_off / dx_width;
const int32_t dx_w = dx_hw_off - dx_h * dx_width;
const Pack2X<T>* dy_pack_dptr = reinterpret_cast<const Pack2X<T>*>(dy_dptr);
const Pack2X<T> dy_pack_value1 =
dy_pack_dptr[nc_idx * dx_hw_size * 2 + dx_h * 2 * dx_width + dx_w];
const Pack2X<T> dy_pack_value2 =
dy_pack_dptr[nc_idx * dx_hw_size * 2 + (dx_h * 2 + 1) * dx_width + dx_w];
dx_value += dy_pack_value1.x;
dx_value += dy_pack_value1.y;
dx_value += dy_pack_value2.x;
dx_value += dy_pack_value2.y;
dx_dptr[index] = dx_value;
}
}
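The backward kernel has no launch example in these notes; a minimal sketch, assuming the same 32 x 1024 x 1024 input as the forward main and hypothetical buffers dy_device (4 * N floats) and dx_device (N floats):
// one thread per dx element; dx is 1024 x 1024 per channel, dy is 2048 x 2048
UpsampleNearest2D2XBackward<<<N / kBlockSize, kBlockSize>>>(N, dy_device, 1024, 1024, dx_device);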
indexing
Algorithm
template<typename T, typename IndexT>
__global__ void index_add_cuda_kernel(const int64_t n, const T* input, const IndexT* index,
const T* source, T* output, const int64_t stride,
const int64_t source_dim, const int64_t delta,
const float alpha) {
// For x = flow.ones(5, 3)
// source = flow.tensor([[1, 2, 3], [4, 5, 6], [7, 8, 9]], dtype=flow.float)
// index = flow.tensor([0, 4, 2])
// dim = 0
// We have:
// stride = 3
// source_dim = 3
// stride * source_dim = 9
// alpha = 1.0
// delta = 5 - 3 = 2
// For i = 8
// pre_index = i / stride_source_dim = 8 / 9 = 0
// dim_index = i % stride_source_dim / stride = 8 % 9 / 3 = 2
// source_dim_idx = index[dim_index] = index[2] = 2
// output_index = i + (delta * pre_index + source_dim_idx - dim_index) * stride
//              = 8 + (2 * 0 + 2 - 2) * 3 = 8
// cuda::atomic::Add(output + output_index, static_cast<T>(alpha) * source[i]) =>
// output[8] += 1.0 * source[8], i.e. 1.0 + 9.0 = 10.0
const int64_t stride_source_dim = stride * source_dim;
CUDA_1D_KERNEL_LOOP(i, n) {
int64_t pre_index = i / stride_source_dim;
int64_t dim_index = (i - pre_index * stride_source_dim) / stride;
IndexT source_dim_idx = index[dim_index];
int64_t output_index = i + (delta * pre_index + source_dim_idx - dim_index) * stride;
cuda::atomic::Add(output + output_index, static_cast<T>(alpha) * source[i]);
}
}
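To make the index arithmetic concrete, here is a plain CPU reference for the dim = 0 case from the comment above; this is my sketch, not part of the original kernel:
// output starts as a copy of input (rows x cols, row-major); row j of source is
// added into row index[j] of output, scaled by alpha
void index_add_dim0_cpu(float* output, const float* source, const int* index,
                        int source_rows, int cols, float alpha) {
    for (int j = 0; j < source_rows; ++j) {
        for (int c = 0; c < cols; ++c) {
            output[index[j] * cols + c] += alpha * source[j * cols + c];
        }
    }
}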
Registration macro
#define REGISTER_INDEX_ADD_CUDA_KERNEL(dtype) \
REGISTER_USER_KERNEL("index_add") \
.SetCreateFn<IndexAddGpuKernel<dtype>>() \
.SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA) \
&& (user_op::HobDataType("output", 0) == GetDataType<dtype>::value));
REGISTER_INDEX_ADD_CUDA_KERNEL(float)
REGISTER_INDEX_ADD_CUDA_KERNEL(half)
REGISTER_INDEX_ADD_CUDA_KERNEL(double)
Kernel definition
template<typename T>
class IndexAddGpuKernel final : public user_op::OpKernel {
public:
IndexAddGpuKernel() = default;
~IndexAddGpuKernel() = default;
private:
using user_op::OpKernel::Compute;
void Compute(user_op::KernelComputeContext* ctx) const override {
const user_op::Tensor* input = ctx->Tensor4ArgNameAndIndex("input", 0);
const user_op::Tensor* index = ctx->Tensor4ArgNameAndIndex("index", 0);
const user_op::Tensor* source = ctx->Tensor4ArgNameAndIndex("source", 0);
user_op::Tensor* output = ctx->Tensor4ArgNameAndIndex("output", 0);
const int64_t dim = ctx->Attr<int64_t>("dim");
const float alpha = ctx->Attr<float>("alpha");
const ShapeView& input_shape = input->shape_view();
const ShapeView& source_shape = source->shape_view();
std::vector<int64_t> input_stride(input->stride().begin(), input->stride().end());
const int64_t stride = input_stride[dim];
const int64_t source_dim = source_shape.At(dim);
const int64_t delta = input_shape.At(dim) - source_dim;
DataType index_dtype = index->data_type();
const int32_t n = source->shape_view().elem_cnt();
Memcpy<DeviceType::kCUDA>(
ctx->stream(), output->mut_dptr<void>(), input->dptr<void>(),
input->shape_view().elem_cnt() * GetSizeOfDataType(input->data_type()));
if (GetSizeOfDataType(index_dtype) == 4) {
RUN_CUDA_KERNEL((index_add_cuda_kernel<T, int32_t>), ctx->stream(), n, n, input->dptr<T>(),
index->dptr<int32_t>(), source->dptr<T>(), output->mut_dptr<T>(), stride,
source_dim, delta, alpha);
} else {
RUN_CUDA_KERNEL((index_add_cuda_kernel<T, int64_t>), ctx->stream(), n, n, input->dptr<T>(),
index->dptr<int64_t>(), source->dptr<T>(), output->mut_dptr<T>(), stride,
source_dim, delta, alpha);
}
}
bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; }
};
convolve
- 1D convolution of two sequences: a direct kernel and a constant/shared memory variant.
main
int main(int argc, char const *argv[]) {
    thrust::host_vector<float> h_a(N_A);
    thrust::device_vector<float> d_a(N_A);
    thrust::host_vector<float> h_b(N_B);
    thrust::device_vector<float> d_b(N_B);
    size_t N = h_a.size() + h_b.size() - 1;
    size_t L = pow(2, static_cast<int>(log2(N - 1)) + 1);
    thrust::host_vector<float> h_result(N);
    thrust::device_vector<float> d_result(N);
    thrust::sequence(h_a.begin(), h_a.end());
    // thrust::sequence(h_b.begin(), h_b.end());
    thrust::sequence(h_b.rbegin(), h_b.rend());
    // get raw pointers for the kernels
    float *raw_point_a = thrust::raw_pointer_cast(&d_a[0]);
    float *raw_point_b = thrust::raw_pointer_cast(&d_b[0]);
    float *raw_point_result = thrust::raw_pointer_cast(&d_result[0]);
    int numThreads = NUM_THREADS;
    int numBlocks = (L + numThreads - 1) / numThreads;
    cudaEvent_t start, stop;
    checkCudaErrors(cudaEventCreate(&start));
    checkCudaErrors(cudaEventCreate(&stop));
    cudaEventRecord(start);
    // copy a and b to the device
    thrust::copy(h_a.begin(), h_a.end(), d_a.begin());
    thrust::copy(h_b.begin(), h_b.end(), d_b.begin());
    // conv(raw_point_a, raw_point_b, raw_point_result, N_A, N_B, L, numBlocks, numThreads);
    conv2(raw_point_a, raw_point_b, raw_point_result, N_A, N_B, N, numBlocks, numThreads);
    thrust::copy(d_result.begin(), d_result.end(), h_result.begin());
    cudaEventRecord(stop);
    checkCudaErrors(cudaDeviceSynchronize());
    float time = 0;
    cudaEventElapsedTime(&time, start, stop);
    cout << "run times: " << time << " ms " << endl;
    // for (thrust::host_vector<float>::iterator i = h_result.begin(); i != h_result.end(); ++i) {
    //     cout << *i << "\t";
    // }
    cout << endl;
    return 0;
}
- Direct convolution implementation
void conv(const float *ina, const float *inb, float *out, size_t len_a, size_t len_b,
          size_t len_out, size_t numBlocks, size_t numThreads) {
    conv_kernel<<<numBlocks, numThreads>>>(ina, inb, out, len_a, len_b, len_out);
}

// Direct calculation convolution
__global__ void conv_kernel(const float *ina, const float *inb, float *out,
                            size_t len_a, size_t len_b, size_t len_out) {
    const int tid = blockIdx.x * blockDim.x + threadIdx.x;
    if (tid >= len_out) { return; }
    float sum = 0.0f;
    for (int m = 0; m < len_b; ++m) {
        int k = tid - m;
        if (0 <= k && k < len_a) {
            sum += ina[k] * inb[m];
        }
    }
    out[tid] = sum;
}
- Constant/shared memory optimization: the filter is copied into __constant__ memory and each block gets a dynamic shared-memory tile (see the sketch of conv3_kernel after the launcher below).
void conv2(const float *ina, const float *inb, float *out, size_t len_a, size_t len_b,
           size_t len_out, size_t numBlocks, size_t numThreads) {
    // copy the filter to constant memory and give each block one tile of shared memory
    cudaMemcpyToSymbol(c_b, inb, len_b * sizeof(float));
    size_t sharedSize = numThreads * sizeof(float);
    conv3_kernel<<<numBlocks, numThreads, sharedSize>>>(ina, out, len_a, len_b, len_out);
}
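conv3_kernel and the constant buffer c_b are not included in these notes. The following is a sketch of what they plausibly look like given the launch above: the filter sits in __constant__ memory (the capacity is my assumption) and each block stages numThreads input samples in dynamic shared memory, falling back to global memory for the halo:
#define MAX_FILTER_LEN 1024               // assumed upper bound for the filter length
__constant__ float c_b[MAX_FILTER_LEN];

__global__ void conv3_kernel(const float *ina, float *out,
                             size_t len_a, size_t len_b, size_t len_out) {
    extern __shared__ float s_a[];        // one input sample per thread of this block
    const int tid = blockIdx.x * blockDim.x + threadIdx.x;
    const int block_start = blockIdx.x * blockDim.x;
    s_a[threadIdx.x] = (tid < len_a) ? ina[tid] : 0.0f;
    __syncthreads();
    if (tid >= len_out) { return; }
    float sum = 0.0f;
    for (int m = 0; m < len_b; ++m) {
        const int k = tid - m;
        if (k < 0 || k >= (int)len_a) { continue; }
        // read from the shared tile when the sample belongs to this block,
        // otherwise fall back to global memory (the halo region)
        const float a = (k >= block_start && k < block_start + (int)blockDim.x)
                            ? s_a[k - block_start] : ina[k];
        sum += a * c_b[m];
    }
    out[tid] = sum;
}
In a real source file the __constant__ declaration has to appear before conv2 so that cudaMemcpyToSymbol can see the symbol.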