algorithms
common algorithms
reduce
Purpose
- Reduce computes a single value, such as the sum, min, max, or average, over an array.
- Reduce is also called reduction: the input is folded step by step, so the output usually has a lower dimensionality than the input. (All kernels below only reduce within a block; see the host-side sketch of the final pass right after this list.)
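Every kernel in this section leaves one partial sum per block in g_odata, so a final pass over those partials is still required. A minimal host-side sketch of that last step (my addition, assuming the partials have been copied back into output_host as in the main functions below):
// Sum the per-block partial results on the CPU.
// With every input element set to 2.0f, the expected total is N * 2.0f.
float final_reduce_on_host(const float *output_host, int block_num) {
    double sum = 0.0;  // accumulate in double to limit rounding error
    for (int i = 0; i < block_num; i++) sum += output_host[i];
    return (float)sum;
}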
baseline & versions 1, 2: main (used for reduce_v0, reduce_v1, reduce_v2)
#include <cuda.h>
#include <cuda_runtime.h>
#include <time.h>
#define N 32*1024*1024
#define BLOCK_SIZE 256
int main() {
float *input_host = (float*)malloc(N*sizeof(float));
float *input_device;
cudaMalloc((void **)&input_device, N*sizeof(float));
for (int i = 0; i < N; i++) input_host[i] = 2.0;
cudaMemcpy(input_device, input_host, N*sizeof(float), cudaMemcpyHostToDevice);
int32_t block_num = (N + BLOCK_SIZE - 1) / BLOCK_SIZE;
float *output_host = (float*)malloc(block_num * sizeof(float));
float *output_device;
cudaMalloc((void **)&output_device, block_num * sizeof(float));
dim3 grid(block_num, 1);
dim3 block(BLOCK_SIZE, 1);
reduce_v0<<<grid, block>>>(input_device, output_device);
cudaMemcpy(output_host, output_device, block_num * sizeof(float), cudaMemcpyDeviceToHost);
return 0;
}
baseline
__global__ void reduce_v0(float *g_idata,float *g_odata){
__shared__ float sdata[BLOCK_SIZE];
// each thread loads one element from global to shared mem
unsigned int tid = threadIdx.x;
unsigned int i = blockIdx.x*blockDim.x + threadIdx.x;
sdata[tid] = g_idata[i];
__syncthreads();
// do reduction in shared mem
for(unsigned int s=1; s < blockDim.x; s *= 2) {
if (tid % (2*s) == 0) {
sdata[tid] += sdata[tid + s];
}
__syncthreads();
}
// write result for this block to global mem
if (tid == 0) g_odata[blockIdx.x] = sdata[0];
}
version 2: interleaved addressing (purpose)
- highly divergent warps are very inefficient
- operator % is very slow
code
__global__ void reduce_v1(float *g_idata, float *g_odata){
    __shared__ float sdata[BLOCK_SIZE];
    // each thread loads one element from global to shared mem
    unsigned int tid = threadIdx.x;
    unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
    sdata[tid] = g_idata[i];
    __syncthreads();
    // do reduction in shared mem
    for (unsigned int s = 1; s < blockDim.x; s *= 2) {
        // if (tid % (2*s) == 0) {
        //     sdata[tid] += sdata[tid + s];
        // }
        // replace the divergent modulo test with a strided index
        int index = 2 * s * tid;
        if (index < blockDim.x) {
            sdata[index] += sdata[index + s];
        }
        __syncthreads();
    }
    // write result for this block to global mem
    if (tid == 0) g_odata[blockIdx.x] = sdata[0];
}
version 3: bank conflict free (purpose)
- Avoid shared-memory bank conflicts: with the strided index of version 2, threads of a warp access words that map to the same bank and the accesses serialize; with sequential addressing (s starts at blockDim.x / 2 and is halved each step) a warp reads 32 consecutive words, one per bank. code
__global__ void reduce_v2(float *g_idata, float *g_odata){
    __shared__ float sdata[BLOCK_SIZE];
    // each thread loads one element from global to shared mem
    unsigned int tid = threadIdx.x;
    unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
    sdata[tid] = g_idata[i];
    __syncthreads();
    // do reduction in shared mem with sequential addressing
    for (unsigned int s = blockDim.x / 2; s > 0; s >>= 1) {
        if (tid < s) {
            sdata[tid] += sdata[tid + s];
        }
        __syncthreads();
    }
    // write result for this block to global mem
    if (tid == 0) g_odata[blockIdx.x] = sdata[0];
}
versions 4, 5 (reduce_v3, reduce_v4): main
int main() {
float *input_host = (float*)malloc(N*sizeof(float));
float *input_device;
cudaMalloc((void **)&input_device, N*sizeof(float));
for (int i = 0; i < N; i++) input_host[i] = 2.0;
cudaMemcpy(input_device, input_host, N*sizeof(float), cudaMemcpyHostToDevice);
// each block now reduces 2 * BLOCK_SIZE elements, so launch half as many blocks
int32_t block_num = (N + BLOCK_SIZE - 1) / BLOCK_SIZE / 2;
float *output_host = (float*)malloc((block_num) * sizeof(float));
float *output_device;
cudaMalloc((void **)&output_device, (block_num) * sizeof(float));
dim3 grid(block_num, 1);
dim3 block(BLOCK_SIZE, 1);
reduce_v3<<<grid, block>>>(input_device, output_device);
cudaMemcpy(output_host, output_device, block_num * sizeof(float), cudaMemcpyDeviceToHost);
return 0;
}
version 4: idle thread free (purpose)
- Half of the threads are idle on first loop iteration
- use the threads properly
- Fuse the two global loads into one addition during the load, so no threads sit idle. code
__global__ void reduce_v3(float *g_idata, float *g_odata){
    __shared__ float sdata[BLOCK_SIZE];
    // each thread loads two elements from global mem and adds them during the load
    unsigned int tid = threadIdx.x;
    unsigned int i = blockIdx.x * (blockDim.x * 2) + threadIdx.x;
    sdata[tid] = g_idata[i] + g_idata[i + blockDim.x];
    __syncthreads();
    // do reduction in shared mem
    for (unsigned int s = blockDim.x / 2; s > 0; s >>= 1) {
        if (tid < s) {
            sdata[tid] += sdata[tid + s];
        }
        __syncthreads();
    }
    // write result for this block to global mem
    if (tid == 0) g_odata[blockIdx.x] = sdata[0];
}
version 5: unroll last warp (purpose)
- When s <= 32, we have only one warp left
- We don’t need to __syncthreads()
- We don’t need if (tid < s) statement code
__device__ void warpReduce(volatile float* cache, unsigned int tid){
    cache[tid] += cache[tid + 32];
    cache[tid] += cache[tid + 16];
    cache[tid] += cache[tid + 8];
    cache[tid] += cache[tid + 4];
    cache[tid] += cache[tid + 2];
    cache[tid] += cache[tid + 1];
}

__global__ void reduce_v4(float *g_idata, float *g_odata){
    __shared__ float sdata[BLOCK_SIZE];
    // each thread loads two elements from global mem and adds them during the load
    unsigned int tid = threadIdx.x;
    unsigned int i = blockIdx.x * (blockDim.x * 2) + threadIdx.x;
    sdata[tid] = g_idata[i] + g_idata[i + blockDim.x];
    __syncthreads();
    // do reduction in shared mem until only one warp is left
    for (unsigned int s = blockDim.x / 2; s > 32; s >>= 1) {
        if (tid < s) {
            sdata[tid] += sdata[tid + s];
        }
        __syncthreads();
    }
    // last warp: no __syncthreads() needed (warp-synchronous, volatile)
    if (tid < 32) warpReduce(sdata, tid);
    // write result for this block to global mem
    if (tid == 0) g_odata[blockIdx.x] = sdata[0];
}
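A caveat for this trick: it relies on implicit warp-synchronous execution plus volatile, which is no longer guaranteed on Volta and newer GPUs with independent thread scheduling. A warp-shuffle version of the last-warp reduction avoids shared memory and volatile entirely; this is my sketch, not part of the original notes:
// each lane starts with its own partial sum in val; after the loop, lane 0 of
// the warp holds the sum of all 32 lanes
__device__ float warpReduceShfl(float val) {
    for (int offset = 16; offset > 0; offset >>= 1) {
        val += __shfl_down_sync(0xffffffff, val, offset);
    }
    return val;
}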
version 6: complete unroll (purpose)
- If we knew the number of iterations at compile time, we could completely unroll the reduction.
- use a block-size template parameter so the iteration count is known at compile time code
- main with the block size passed as a template argument (resolved at compile time)
int main() {
    float *input_host = (float*)malloc(N*sizeof(float));
    float *input_device;
    cudaMalloc((void **)&input_device, N*sizeof(float));
    for (int i = 0; i < N; i++) input_host[i] = 2.0;
    cudaMemcpy(input_device, input_host, N*sizeof(float), cudaMemcpyHostToDevice);
    int32_t block_num = (N + BLOCK_SIZE - 1) / BLOCK_SIZE / 2;
    float *output_host = (float*)malloc(block_num * sizeof(float));
    float *output_device;
    cudaMalloc((void **)&output_device, block_num * sizeof(float));
    dim3 grid(block_num, 1);
    dim3 block(BLOCK_SIZE, 1);
    // launch with the block size as a template argument
    reduce_v5<BLOCK_SIZE><<<grid, block>>>(input_device, output_device);
    cudaMemcpy(output_host, output_device, block_num * sizeof(float), cudaMemcpyDeviceToHost);
    return 0;
}
- after compile-time evaluation the dead if branches are removed
template <unsigned int blockSize>
__device__ void warpReduce(volatile float* cache, int tid){
    if (blockSize >= 64) cache[tid] += cache[tid + 32];
    if (blockSize >= 32) cache[tid] += cache[tid + 16];
    if (blockSize >= 16) cache[tid] += cache[tid + 8];
    if (blockSize >= 8) cache[tid] += cache[tid + 4];
    if (blockSize >= 4) cache[tid] += cache[tid + 2];
    if (blockSize >= 2) cache[tid] += cache[tid + 1];
}

template <unsigned int blockSize>
__global__ void reduce_v5(float *g_idata, float *g_odata){
    __shared__ float sdata[BLOCK_SIZE];
    // each thread loads two elements from global mem and adds them during the load
    unsigned int tid = threadIdx.x;
    unsigned int i = blockIdx.x * (blockDim.x * 2) + threadIdx.x;
    sdata[tid] = g_idata[i] + g_idata[i + blockDim.x];
    __syncthreads();
    // do reduction in shared mem, fully unrolled on blockSize
    if (blockSize >= 512) {
        if (tid < 256) { sdata[tid] += sdata[tid + 256]; }
        __syncthreads();
    }
    if (blockSize >= 256) {
        if (tid < 128) { sdata[tid] += sdata[tid + 128]; }
        __syncthreads();
    }
    if (blockSize >= 128) {
        if (tid < 64) { sdata[tid] += sdata[tid + 64]; }
        __syncthreads();
    }
    if (tid < 32) warpReduce<blockSize>(sdata, tid);
    // write result for this block to global mem
    if (tid == 0) g_odata[blockIdx.x] = sdata[0];
}
version 7: multiple adds (purpose)
- Fix the grid to a constant number of blocks and have each thread accumulate several input elements before the shared-memory reduction. code
main
int main() {
    float *input_host = (float*)malloc(N*sizeof(float));
    float *input_device;
    cudaMalloc((void **)&input_device, N*sizeof(float));
    for (int i = 0; i < N; i++) input_host[i] = 2.0;
    cudaMemcpy(input_device, input_host, N*sizeof(float), cudaMemcpyHostToDevice);
    // fix the grid to 1024 blocks
    const int block_num = 1024;
    const int NUM_PER_BLOCK = N / block_num;
    // number of elements each thread has to process
    const int NUM_PER_THREAD = NUM_PER_BLOCK / BLOCK_SIZE;
    float *output_host = (float*)malloc(block_num * sizeof(float));
    float *output_device;
    cudaMalloc((void **)&output_device, block_num * sizeof(float));
    dim3 grid(block_num, 1);
    dim3 block(BLOCK_SIZE, 1);
    reduce_v6<BLOCK_SIZE, NUM_PER_THREAD><<<grid, block>>>(input_device, output_device);
    cudaMemcpy(output_host, output_device, block_num * sizeof(float), cudaMemcpyDeviceToHost);
    return 0;
}
template <unsigned int blockSize>
__device__ void warpReduce(volatile float* cache, int tid){
    if (blockSize >= 64) cache[tid] += cache[tid + 32];
    if (blockSize >= 32) cache[tid] += cache[tid + 16];
    if (blockSize >= 16) cache[tid] += cache[tid + 8];
    if (blockSize >= 8) cache[tid] += cache[tid + 4];
    if (blockSize >= 4) cache[tid] += cache[tid + 2];
    if (blockSize >= 2) cache[tid] += cache[tid + 1];
}

template <unsigned int blockSize, int NUM_PER_THREAD>
__global__ void reduce_v6(float *g_idata, float *g_odata){
    __shared__ float sdata[BLOCK_SIZE];
    unsigned int tid = threadIdx.x;
    unsigned int i = blockIdx.x * (blockDim.x * NUM_PER_THREAD) + threadIdx.x;
    sdata[tid] = 0;
    // each thread loads and accumulates NUM_PER_THREAD elements
#pragma unroll
    for (int iter = 0; iter < NUM_PER_THREAD; iter++) {
        sdata[tid] += g_idata[i + iter * blockSize];
    }
    __syncthreads();
    // do reduction in shared mem, fully unrolled on blockSize
    if (blockSize >= 512) {
        if (tid < 256) { sdata[tid] += sdata[tid + 256]; }
        __syncthreads();
    }
    if (blockSize >= 256) {
        if (tid < 128) { sdata[tid] += sdata[tid + 128]; }
        __syncthreads();
    }
    if (blockSize >= 128) {
        if (tid < 64) { sdata[tid] += sdata[tid + 64]; }
        __syncthreads();
    }
    if (tid < 32) warpReduce<blockSize>(sdata, tid);
    // write result for this block to global mem
    if (tid == 0) g_odata[blockIdx.x] = sdata[0];
}
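After this kernel, g_odata still holds block_num (here 1024) partial sums. One way to finish is a second pass that reuses the same kernel; final_device is a hypothetical one-float buffer, and the numbers assume BLOCK_SIZE = 256:
// second pass: 1024 partials, one block of 256 threads, 4 elements per thread
reduce_v6<BLOCK_SIZE, 1024 / BLOCK_SIZE><<<1, BLOCK_SIZE>>>(output_device, final_device);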
elementwise
Original (naive) version
int main(){
half *x_host = (half*)malloc(N*sizeof(half));
half *x_device;
cudaMalloc((void **)&x_device, N*sizeof(half));
for (int i = 0; i < N; i++) x_host[i] = 2.0;
cudaMemcpy(x_device, x_host, N*sizeof(half), cudaMemcpyHostToDevice);
half *y_host = (half*)malloc(N*sizeof(half));
half *y_device;
cudaMalloc((void **)&y_device, N*sizeof(half));
for (int i = 0; i < N; i++) y_host[i] = 2.0;
cudaMemcpy(y_device, y_host, N*sizeof(half), cudaMemcpyHostToDevice);
half *output_host = (half*)malloc(N * sizeof(half));
half *output_device;
cudaMalloc((void **)&output_device, N * sizeof(half));
// naive elementwise
int32_t block_num = (N + kBlockSize - 1) / kBlockSize;
dim3 grid(block_num, 1);
dim3 block(kBlockSize, 1);
mul<half><<<grid, block>>>(x_device, y_device, output_device);
cudaMemcpy(output_host, output_device, N * sizeof(half), cudaMemcpyDeviceToHost);
// elementwise template
Binary(MultiplyFunctor<half>(), N, output_device, x_device, y_device);
cudaMemcpy(output_host, output_device, N * sizeof(half), cudaMemcpyDeviceToHost);
free(x_host);
free(y_host);
free(output_host);
cudaFree(x_device);
cudaFree(y_device);
cudaFree(output_device);
return 0;
}
template<typename T>
__global__ void mul(T *x, T *y, T* z){
int idx = threadIdx.x + blockIdx.x * blockDim.x;
z[idx] = x[idx] * y[idx];
}
// specialization for half: use the __hmul intrinsic
template<>
__global__ void mul(half *x, half *y, half* z){
int idx = threadIdx.x + blockIdx.x * blockDim.x;
z[idx] = __hmul(x[idx], y[idx]);
}
Optimized version (packed elementwise template)
int main(){
half *x_host = (half*)malloc(N*sizeof(half));
half *x_device;
cudaMalloc((void **)&x_device, N*sizeof(half));
for (int i = 0; i < N; i++) x_host[i] = 2.0;
cudaMemcpy(x_device, x_host, N*sizeof(half), cudaMemcpyHostToDevice);
half *y_host = (half*)malloc(N*sizeof(half));
half *y_device;
cudaMalloc((void **)&y_device, N*sizeof(half));
for (int i = 0; i < N; i++) y_host[i] = 2.0;
cudaMemcpy(y_device, y_host, N*sizeof(half), cudaMemcpyHostToDevice);
half *output_host = (half*)malloc(N * sizeof(half));
half *output_device;
cudaMalloc((void **)&output_device, N * sizeof(half));
int32_t block_num = (N + kBlockSize - 1) / kBlockSize;
dim3 grid(block_num, 1);
dim3 block(kBlockSize, 1);
// elementwise template
Binary(MultiplyFunctor<half>(), N, output_device, x_device, y_device);
cudaMemcpy(output_host, output_device, N * sizeof(half), cudaMemcpyDeviceToHost);
free(x_host);
free(y_host);
free(output_host);
cudaFree(x_device);
cudaFree(y_device);
cudaFree(output_device);
return 0;
}
template<typename T>
constexpr int PackSize() {
return Min(kMaxPackBytes / sizeof(T), kMaxPackSize);
}
template<typename T, typename U, typename... Args>
constexpr int PackSize() {
return Min(PackSize<T>(), PackSize<U, Args...>());
}
template<typename T>
class HasApply2 {
typedef char one;
struct two {
char x[2];
};
template<typename C>
static one test(decltype(&C::Apply2));
template<typename C>
static two test(...);
public:
enum { value = sizeof(test<T>(0)) == sizeof(char) };
};
// HasApply2 detects whether a functor can process elements two at a time, e.g. as half2. If the functor defines Apply2, test<C>(0) matches the `one` overload, sizeof the result equals sizeof(char), and value is true. half can be packed 8 at a time, but there are half2-specific operations such as __half22float2, so the work can also be done two by two. For half2 element-wise operations the corresponding functor therefore needs an Apply2 member; for the Cast operation, for example, the functor is defined as follows:
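// The Cast functor itself is not reproduced in these notes; the following is a
// sketch from memory, so treat the exact names and details as approximate.
template<typename To, typename From>
struct CastFunctor {
  __device__ To operator()(From from) const { return static_cast<To>(from); }
};

template<typename To>
struct CastFunctor<To, half> {
  __device__ To operator()(half from) const { return static_cast<To>(static_cast<float>(from)); }
  // Apply2: convert two halves at once through __half22float2
  __device__ void Apply2(To* to, const half* from) const {
    const float2 f2 = __half22float2(*reinterpret_cast<const half2*>(from));
    to[0] = static_cast<To>(f2.x);
    to[1] = static_cast<To>(f2.y);
  }
};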
template<int pack_size, typename FunctorT, typename R, typename... IN>
__device__ typename std::enable_if<HasApply2<FunctorT>::value == true && pack_size % 2 == 0,
Packed<R, pack_size>>::type
ApplyPack(const FunctorT& functor, const Packed<IN, pack_size>... in) {
Packed<R, pack_size> ret;
#pragma unroll
for (int j = 0; j < pack_size; j += 2) { functor.Apply2(ret.elem + j, (in.elem + j)...); }
return ret;
}
template<int pack_size, typename FunctorT, typename R, typename... IN>
__device__ typename std::enable_if<HasApply2<FunctorT>::value == false || pack_size % 2 != 0,
Packed<R, pack_size>>::type
ApplyPack(const FunctorT& functor, const Packed<IN, pack_size>... in) {
Packed<R, pack_size> ret;
#pragma unroll
for (int j = 0; j < pack_size; ++j) { ret.elem[j] = functor((in.elem[j])...); }
return ret;
}
template<int pack_size, typename FactoryT, typename R, typename... IN>
__global__ void __launch_bounds__(kBlockSize)
ApplyGeneric(FactoryT factory, int64_t n_pack, Packed<R, pack_size>* pack_r,
const Packed<IN, pack_size>*... pack_in, int64_t n_tail, R* tail_r,
const IN*... tail_in) {
auto functor = factory();
const int global_tid = blockIdx.x * kBlockSize + threadIdx.x;
for (int64_t i = global_tid; i < n_pack; i += blockDim.x * gridDim.x) {
pack_r[i] = ApplyPack<pack_size, decltype(functor), R, IN...>(functor, (pack_in[i])...);
}
if (global_tid < n_tail) { tail_r[global_tid] = functor((tail_in[global_tid])...); }
}
template<typename FunctorT>
struct SimpleFactory {
explicit SimpleFactory(FunctorT functor) : tpl(functor) {}
__device__ FunctorT operator()() const { return tpl; }
private:
FunctorT tpl;
};
template<size_t pack_size>
bool IsAlignedForPack() {
return true;
}
template<size_t pack_size, typename T, typename... Args>
bool IsAlignedForPack(const T* ptr, const Args*... others) {
return reinterpret_cast<uintptr_t>(ptr) % sizeof(Pack<T, pack_size>) == 0
&& IsAlignedForPack<pack_size, Args...>(others...);
}
template<size_t pack_size, typename FactoryT, typename R, typename... IN>
cudaError_t LaunchKernel(FactoryT factory, int64_t n, R* r, const IN*... in) {
const int64_t n_pack = n / pack_size;
const int64_t tail_offset = n_pack * pack_size;
const int64_t n_tail = n - tail_offset;
int num_blocks;
{
cudaError_t err = GetNumBlocks(n_pack, &num_blocks);
if (err != cudaSuccess) { return err; }
}
ApplyGeneric<pack_size, FactoryT, R, IN...><<<num_blocks, kBlockSize, 0>>>(
factory, n_pack, reinterpret_cast<Packed<R, pack_size>*>(r),
(reinterpret_cast<const Packed<IN, pack_size>*>(in))..., n_tail, r + tail_offset,
(in + tail_offset)...);
return cudaPeekAtLastError();
}
template<typename FactoryT, typename R, typename... IN>
struct GenericLauncher {
static cudaError_t Launch(FactoryT factory, int64_t n, R* r, const IN*... in) {
constexpr int max_pack_size = PackSize<R, IN...>();
if (IsAlignedForPack<max_pack_size, R, IN...>(r, in...)) {
return LaunchKernel<max_pack_size, FactoryT, R, IN...>(factory, n, r, in...);
} else {
return LaunchKernel<1, FactoryT, R, IN...>(factory, n, r, in...);
}
}
};
template<typename FactoryT, typename R, typename A>
inline cudaError_t UnaryWithFactory(FactoryT factory, int64_t n, R* r, const A* a) {
return GenericLauncher<FactoryT, R, A>::Launch(factory, n, r, a);
}
template<typename FunctorT, typename R, typename A>
inline cudaError_t Unary(FunctorT functor, int64_t n, R* r, const A* a) {
return UnaryWithFactory(SimpleFactory<FunctorT>(functor), n, r, a);
}
template<typename FactoryT, typename R, typename A, typename B>
inline cudaError_t BinaryWithFactory(FactoryT factory, int64_t n, R* r, const A* a, const B* b) {
return GenericLauncher<FactoryT, R, A, B>::Launch(factory, n, r, a, b);
}
template<typename FunctorT, typename R, typename A, typename B>
inline cudaError_t Binary(FunctorT functor, int64_t n, R* r, const A* a, const B* b) {
return BinaryWithFactory(SimpleFactory<FunctorT>(functor), n, r, a, b);
}
template<typename FactoryT, typename R, typename A, typename B, typename C>
inline cudaError_t TernaryWithFactory(FactoryT factory, int64_t n, R* r, const A* a, const B* b,
const C* c) {
return GenericLauncher<FactoryT, R, A, B, C>::Launch(factory, n, r, a, b, c);
}
template<typename FunctorT, typename R, typename A, typename B, typename C>
inline cudaError_t Ternary(FunctorT functor, int64_t n, R* r, const A* a, const B* b, const C* c) {
return TernaryWithFactory(SimpleFactory<FunctorT>(functor), n, r, a, b, c);
}
template<typename T>
struct MultiplyFunctor {
__device__ T operator()(T x, T y) const {
return x*y;
}
};
template<>
struct MultiplyFunctor<half> {
__device__ half operator()(half x, half y) const {
return x*y;
}
#if (__CUDA_ARCH__ >= 750 && CUDA_VERSION >= 11000)
__device__ void Apply2(half* z, const half* x, const half* y) const {
const half2 x2 = *(reinterpret_cast<const half2*>(x));
const half2 y2 = *(reinterpret_cast<const half2*>(y));
*reinterpret_cast<half2*>(z) = __hmul2(x2, y2);
}
#endif // (__CUDA_ARCH__ >= 750 && CUDA_VERSION >= 11000)
};
fast atomic add
atomic add
- Baseline: each thread accumulates a partial dot product in a grid-stride loop, then issues one half atomicAdd into the result.
main
int main(){
    half *x_host = (half*)malloc(N*sizeof(half));
    half *x_device;
    cudaMalloc((void **)&x_device, N*sizeof(half));
    for (int i = 0; i < N; i++) x_host[i] = 0.1;
    cudaMemcpy(x_device, x_host, N*sizeof(half), cudaMemcpyHostToDevice);
    half *y_host = (half*)malloc(N*sizeof(half));
    half *y_device;
    cudaMalloc((void **)&y_device, N*sizeof(half));
    for (int i = 0; i < N; i++) y_host[i] = 0.1;
    cudaMemcpy(y_device, y_host, N*sizeof(half), cudaMemcpyHostToDevice);
    half *output_host = (half*)malloc(sizeof(half));
    half *output_device;
    cudaMalloc((void **)&output_device, sizeof(half));
    cudaMemset(output_device, 0, sizeof(half));   // pass the device pointer itself, not its address
    int32_t block_num = (N + kBlockSize - 1) / kBlockSize;
    dim3 grid(block_num, 1);
    dim3 block(kBlockSize, 1);
    dot<<<grid, block>>>(x_device, y_device, output_device, N);
    cudaMemcpy(output_host, output_device, sizeof(half), cudaMemcpyDeviceToHost);
    printf("%.6f\n", static_cast<double>(output_host[0]));
    free(x_host);
    free(y_host);
    free(output_host);
    cudaFree(x_device);
    cudaFree(y_device);
    cudaFree(output_device);
}
- dot kernel
__global__ void dot(half* a, half* b, half* c, int n){
    const int nStep = gridDim.x * blockDim.x;
    half temp = 0.0;
    int gid = blockIdx.x * blockDim.x + threadIdx.x;
    // grid-stride loop: each thread accumulates a partial dot product
    while (gid < n) {
        temp = temp + a[gid] * b[gid];
        gid += nStep;
    }
    // one atomic add per thread; atomicAdd on half requires compute capability 7.0+
    atomicAdd(c, temp);
}
half pack2
- Convert the half atomic add into a half2 atomic add: each partial is added to both lanes, so elem[0] of the packed result already holds the full dot product.
- main
int main(){
    half *x_host = (half*)malloc(N*sizeof(half));
    half *x_device;
    cudaMalloc((void **)&x_device, N*sizeof(half));
    for (int i = 0; i < N; i++) x_host[i] = 1.0;
    cudaMemcpy(x_device, x_host, N*sizeof(half), cudaMemcpyHostToDevice);
    Pack<half, 2>* x_pack = reinterpret_cast<Pack<half, 2>*>(x_device);
    half *y_host = (half*)malloc(N*sizeof(half));
    half *y_device;
    cudaMalloc((void **)&y_device, N*sizeof(half));
    for (int i = 0; i < N; i++) y_host[i] = 1.0;
    cudaMemcpy(y_device, y_host, N*sizeof(half), cudaMemcpyHostToDevice);
    Pack<half, 2>* y_pack = reinterpret_cast<Pack<half, 2>*>(y_device);
    half *output_host = (half*)malloc(2 * sizeof(half));
    half *output_device;
    cudaMalloc((void **)&output_device, 2 * sizeof(half));
    cudaMemset(output_device, 0, sizeof(half) * 2);   // pass the device pointer itself, not its address
    Pack<half, 2>* output_pack = reinterpret_cast<Pack<half, 2>*>(output_device);
    int32_t block_num = (N + kBlockSize - 1) / kBlockSize;
    dim3 grid(block_num, 1);
    dim3 block(kBlockSize, 1);
    dot<half, 2><<<grid, block>>>(x_pack, y_pack, output_pack, N);
    cudaMemcpy(output_host, output_device, 2 * sizeof(half), cudaMemcpyDeviceToHost);
    printf("%.6f\n", static_cast<double>(output_host[0]));
    free(x_host);
    free(y_host);
    free(output_host);
    cudaFree(x_device);
    cudaFree(y_device);
    cudaFree(output_device);
}
- pack2
template<typename T, size_t pack_size>
struct alignas(sizeof(T) * pack_size) Pack {
    T elem[pack_size];
};
- dot
template<typename T, int32_t pack_size>
__global__ void dot(Pack<T, pack_size>* a, Pack<T, pack_size>* b, Pack<T, pack_size>* c, int n){
    const int nStep = gridDim.x * blockDim.x;
    T temp = 0.0;
    int gid = blockIdx.x * blockDim.x + threadIdx.x;
    while (gid < n / pack_size) {
        for (int i = 0; i < pack_size; i++) {
            temp = temp + a[gid].elem[i] * b[gid].elem[i];
        }
        gid += nStep;
    }
    AtomicAdd<T, pack_size>(c, temp);
}
- AtomicAdd primary template and its half2 specialization
// primary template: fall back to pack_size scalar atomic adds
template<typename T, int32_t pack_size>
__device__ __inline__ void AtomicAdd(Pack<T, pack_size>* address, T val) {
#pragma unroll
    for (int i = 0; i < pack_size; ++i) {
        atomicAdd(reinterpret_cast<T*>(address) + i, static_cast<T>(val));
    }
}

// specialization: a single half2 atomic add covers both lanes
template<>
__device__ __inline__ void AtomicAdd<half, 2>(Pack<half, 2>* address, half val) {
    half2 h2_val;
    h2_val.x = static_cast<half>(val);
    h2_val.y = static_cast<half>(val);
    atomicAdd(reinterpret_cast<half2*>(address), h2_val);
}
fast atomic add
- FastAdd (borrowed from PyTorch): replace the half atomic add with an aligned half2 atomic add whose unused lane is zero. The main is the same as in the plain atomic add section above; only the dot kernel changes, and the output buffer should hold at least two halves since a half2 may be written.
- dot
__global__ void dot(half* a, half* b, half* c, int n){
    const int nStep = gridDim.x * blockDim.x;
    half temp = 0.0;
    int gid = blockIdx.x * blockDim.x + threadIdx.x;
    while (gid < n) {
        temp = temp + a[gid] * b[gid];
        gid += nStep;
    }
    // atomicAdd(c, temp);
    // FastAdd may issue a half2 atomic, so the buffer behind c should hold at least 2 halves
    FastAdd(c, 0, N, temp);
}
- Implementation
// FastAdd is referenced from
// https://github.com/pytorch/pytorch/blob/396c3b1d88d7624938a2bb0b287f2a19f1e89bb4/aten/src/ATen/native/cuda/KernelUtils.cuh#L29
// overload selected when T is half
template<typename T, typename std::enable_if<std::is_same<half, T>::value>::type* = nullptr>
__device__ __forceinline__ void FastSpecializedAtomicAdd(T* base, size_t offset,
                                                         const size_t length, T value) {
#if ((defined(CUDA_VERSION) && (CUDA_VERSION < 10000)) \
     || (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ < 700)))
    atomicAdd(reinterpret_cast<half*>(base) + offset, static_cast<half>(value));
#else
    // Accounts for the chance base falls on an odd 16 bit alignment (ie, not 32 bit aligned)
    __half* target_addr = reinterpret_cast<__half*>(base + offset);
    bool low_byte = (reinterpret_cast<std::uintptr_t>(target_addr) % sizeof(__half2) == 0);
    if (low_byte && offset < (length - 1)) {
        __half2 value2;
        value2.x = value;
        value2.y = __float2half_rz(0);
        atomicAdd(reinterpret_cast<__half2*>(target_addr), value2);
    } else if (!low_byte && offset > 0) {
        __half2 value2;
        value2.x = __float2half_rz(0);
        value2.y = value;
        atomicAdd(reinterpret_cast<__half2*>(target_addr - 1), value2);
    } else {
        atomicAdd(reinterpret_cast<__half*>(base) + offset, static_cast<__half>(value));
    }
#endif
}

// overload selected when T is not half
template<typename T, typename std::enable_if<!std::is_same<half, T>::value>::type* = nullptr>
__device__ __forceinline__ void FastSpecializedAtomicAdd(T* base, size_t offset,
                                                         const size_t length, T value) {
    atomicAdd(base + offset, value);
}

template<class T>
__device__ __forceinline__ void FastAdd(T* base, size_t offset, const size_t length, T value) {
    FastSpecializedAtomicAdd(base, offset, length, value);
}
upsample nearest 2D
Implementation
int main(){
float *input_host = (float*)malloc(N*sizeof(float));
float *input_device;
cudaMalloc((void **)&input_device, N*sizeof(float));
for (int i = 0; i < N; i++) input_host[i] = 1.0;
cudaMemcpy(input_device, input_host, N*sizeof(float), cudaMemcpyHostToDevice);
float *output_host = (float*)malloc(N * 4 * sizeof(float));
float *output_device;
cudaMalloc((void **)&output_device, N * 4 * sizeof(float));
dim3 grid(N / kBlockSize, 1);
dim3 block(kBlockSize, 1);
UpsampleNearest2D2XForward<<<grid, block>>>(N, input_device, 1024, 1024, output_device);
cudaMemcpy(output_host, output_device, N * 4 * sizeof(float), cudaMemcpyDeviceToHost);
for(int i = 0; i < 50; i++) {
printf("%.5f\n", output_host[i]);
}
return 0;
}
#define CUDA_1D_KERNEL_LOOP(i, n) \
for (int32_t i = blockIdx.x * blockDim.x + threadIdx.x, step = blockDim.x * gridDim.x; i < (n); \
i += step)
template<typename T>
struct alignas(2 * sizeof(T)) Pack2X {
T x;
T y;
};
template<typename T>
__global__ void UpsampleNearest2D2XForward(const int32_t in_elem_cnt, const T* in_dptr,
const int32_t in_height, const int32_t in_width,
T* out_dptr) {
const int32_t in_hw_size = in_width * in_height;
CUDA_1D_KERNEL_LOOP(index, in_elem_cnt) {
const T in_value = in_dptr[index];
const int32_t nc_idx = index / in_hw_size;
const int32_t hw_off = index - nc_idx * in_hw_size;
const int32_t h = hw_off / in_width;
const int32_t w = hw_off - h * in_width;
Pack2X<T> out_value{in_value, in_value};
Pack2X<T>* out_pack_dptr = reinterpret_cast<Pack2X<T>*>(out_dptr);
out_pack_dptr[nc_idx * in_hw_size * 2 + h * 2 * in_width + w] = out_value;
out_pack_dptr[nc_idx * in_hw_size * 2 + (h * 2 + 1) * in_width + w] = out_value;
}
}
template<typename T>
__global__ void UpsampleNearest2D2XBackward(const int32_t in_elem_cnt, const T* dy_dptr,
const int32_t dx_height, const int32_t dx_width,
T* dx_dptr) {
const int32_t dx_hw_size = dx_height * dx_width;
CUDA_1D_KERNEL_LOOP(index, in_elem_cnt) {
T dx_value = 0.0;
const int32_t nc_idx = index / dx_hw_size;
const int32_t dx_hw_off = index - nc_idx * dx_hw_size;
const int32_t dx_h = dx_hw_off / dx_width;
const int32_t dx_w = dx_hw_off - dx_h * dx_width;
const Pack2X<T>* dy_pack_dptr = reinterpret_cast<const Pack2X<T>*>(dy_dptr);
const Pack2X<T> dy_pack_value1 =
dy_pack_dptr[nc_idx * dx_hw_size * 2 + dx_h * 2 * dx_width + dx_w];
const Pack2X<T> dy_pack_value2 =
dy_pack_dptr[nc_idx * dx_hw_size * 2 + (dx_h * 2 + 1) * dx_width + dx_w];
dx_value += dy_pack_value1.x;
dx_value += dy_pack_value1.y;
dx_value += dy_pack_value2.x;
dx_value += dy_pack_value2.y;
dx_dptr[index] = dx_value;
}
}
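The backward kernel has no launch example in these notes; a minimal sketch, assuming the same 32 x 1024 x 1024 input as the forward main and hypothetical buffers dy_device (4 * N floats) and dx_device (N floats):
// one thread per dx element; dx is 1024 x 1024 per channel, dy is 2048 x 2048
UpsampleNearest2D2XBackward<<<N / kBlockSize, kBlockSize>>>(N, dy_device, 1024, 1024, dx_device);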
indexing
Algorithm
template<typename T, typename IndexT>
__global__ void index_add_cuda_kernel(const int64_t n, const T* input, const IndexT* index,
const T* source, T* output, const int64_t stride,
const int64_t source_dim, const int64_t delta,
const float alpha) {
// For x = flow.ones(5, 3)
// source = flow.tensor([[1, 2, 3], [4, 5, 6], [7, 8, 9]], dtype=flow.float)
// index = flow.tensor([0, 4, 2])
// dim = 0
// We have:
// stride = 3
// source_dim = 3
// stride * source_dim = 9
// alpha = 1.0
// delta = 5 - 3 = 2
// For i = 8
// pre_index = i / stride_source_dim = 8 / 9 = 0
// dim_index = i % stride_source_dim / stride = 8 % 9 / 3 = 2
// source_dim_idx = index[dim_index] = index[2] = 2
// output_index = i + (delta * pre_index + source_dim_idx - dim_index) * stride
//              = 8 + (2 * 0 + 2 - 2) * 3 = 8
// cuda::atomic::Add(output + output_index, static_cast<T>(alpha) * source[i]) =>
// output[8] += 1.0 * source[8], i.e. 1.0 + 9.0 = 10.0
const int64_t stride_source_dim = stride * source_dim;
CUDA_1D_KERNEL_LOOP(i, n) {
int64_t pre_index = i / stride_source_dim;
int64_t dim_index = (i - pre_index * stride_source_dim) / stride;
IndexT source_dim_idx = index[dim_index];
int64_t output_index = i + (delta * pre_index + source_dim_idx - dim_index) * stride;
cuda::atomic::Add(output + output_index, static_cast<T>(alpha) * source[i]);
}
}
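To make the index arithmetic concrete, here is a plain CPU reference for the dim = 0 case from the comment above; this is my sketch, not part of the original kernel:
// output starts as a copy of input (rows x cols, row-major); row j of source is
// added into row index[j] of output, scaled by alpha
void index_add_dim0_cpu(float* output, const float* source, const int* index,
                        int source_rows, int cols, float alpha) {
    for (int j = 0; j < source_rows; ++j) {
        for (int c = 0; c < cols; ++c) {
            output[index[j] * cols + c] += alpha * source[j * cols + c];
        }
    }
}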
Registration macro
#define REGISTER_INDEX_ADD_CUDA_KERNEL(dtype) \
REGISTER_USER_KERNEL("index_add") \
.SetCreateFn<IndexAddGpuKernel<dtype>>() \
.SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA) \
&& (user_op::HobDataType("output", 0) == GetDataType<dtype>::value));
REGISTER_INDEX_ADD_CUDA_KERNEL(float)
REGISTER_INDEX_ADD_CUDA_KERNEL(half)
REGISTER_INDEX_ADD_CUDA_KERNEL(double)
Kernel definition
template<typename T>
class IndexAddGpuKernel final : public user_op::OpKernel {
public:
IndexAddGpuKernel() = default;
~IndexAddGpuKernel() = default;
private:
using user_op::OpKernel::Compute;
void Compute(user_op::KernelComputeContext* ctx) const override {
const user_op::Tensor* input = ctx->Tensor4ArgNameAndIndex("input", 0);
const user_op::Tensor* index = ctx->Tensor4ArgNameAndIndex("index", 0);
const user_op::Tensor* source = ctx->Tensor4ArgNameAndIndex("source", 0);
user_op::Tensor* output = ctx->Tensor4ArgNameAndIndex("output", 0);
const int64_t dim = ctx->Attr<int64_t>("dim");
const float alpha = ctx->Attr<float>("alpha");
const ShapeView& input_shape = input->shape_view();
const ShapeView& source_shape = source->shape_view();
std::vector<int64_t> input_stride(input->stride().begin(), input->stride().end());
const int64_t stride = input_stride[dim];
const int64_t source_dim = source_shape.At(dim);
const int64_t delta = input_shape.At(dim) - source_dim;
DataType index_dtype = index->data_type();
const int32_t n = source->shape_view().elem_cnt();
Memcpy<DeviceType::kCUDA>(
ctx->stream(), output->mut_dptr<void>(), input->dptr<void>(),
input->shape_view().elem_cnt() * GetSizeOfDataType(input->data_type()));
if (GetSizeOfDataType(index_dtype) == 4) {
RUN_CUDA_KERNEL((index_add_cuda_kernel<T, int32_t>), ctx->stream(), n, n, input->dptr<T>(),
index->dptr<int32_t>(), source->dptr<T>(), output->mut_dptr<T>(), stride,
source_dim, delta, alpha);
} else {
RUN_CUDA_KERNEL((index_add_cuda_kernel<T, int64_t>), ctx->stream(), n, n, input->dptr<T>(),
index->dptr<int64_t>(), source->dptr<T>(), output->mut_dptr<T>(), stride,
source_dim, delta, alpha);
}
}
bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; }
};
convolve
- 1D convolution of two sequences: a direct kernel and a constant/shared memory variant.
main
int main(int argc, char const *argv[]) {
    thrust::host_vector<float> h_a(N_A);
    thrust::device_vector<float> d_a(N_A);
    thrust::host_vector<float> h_b(N_B);
    thrust::device_vector<float> d_b(N_B);
    size_t N = h_a.size() + h_b.size() - 1;
    size_t L = pow(2, static_cast<int>(log2(N - 1)) + 1);
    thrust::host_vector<float> h_result(N);
    thrust::device_vector<float> d_result(N);
    thrust::sequence(h_a.begin(), h_a.end());
    // thrust::sequence(h_b.begin(), h_b.end());
    thrust::sequence(h_b.rbegin(), h_b.rend());
    // get raw pointers for the kernels
    float *raw_point_a = thrust::raw_pointer_cast(&d_a[0]);
    float *raw_point_b = thrust::raw_pointer_cast(&d_b[0]);
    float *raw_point_result = thrust::raw_pointer_cast(&d_result[0]);
    int numThreads = NUM_THREADS;
    int numBlocks = (L + numThreads - 1) / numThreads;
    cudaEvent_t start, stop;
    checkCudaErrors(cudaEventCreate(&start));
    checkCudaErrors(cudaEventCreate(&stop));
    cudaEventRecord(start);
    // copy a and b to the device
    thrust::copy(h_a.begin(), h_a.end(), d_a.begin());
    thrust::copy(h_b.begin(), h_b.end(), d_b.begin());
    // conv(raw_point_a, raw_point_b, raw_point_result, N_A, N_B, L, numBlocks, numThreads);
    conv2(raw_point_a, raw_point_b, raw_point_result, N_A, N_B, N, numBlocks, numThreads);
    thrust::copy(d_result.begin(), d_result.end(), h_result.begin());
    cudaEventRecord(stop);
    checkCudaErrors(cudaDeviceSynchronize());
    float time = 0;
    cudaEventElapsedTime(&time, start, stop);
    cout << "run times: " << time << " ms " << endl;
    // for (thrust::host_vector<float>::iterator i = h_result.begin(); i != h_result.end(); ++i) {
    //     cout << *i << "\t";
    // }
    cout << endl;
    return 0;
}
- Direct convolution implementation
void conv(const float *ina, const float *inb, float *out, size_t len_a, size_t len_b,
          size_t len_out, size_t numBlocks, size_t numThreads) {
    conv_kernel<<<numBlocks, numThreads>>>(ina, inb, out, len_a, len_b, len_out);
}

// Direct calculation convolution
__global__ void conv_kernel(const float *ina, const float *inb, float *out,
                            size_t len_a, size_t len_b, size_t len_out) {
    const int tid = blockIdx.x * blockDim.x + threadIdx.x;
    if (tid >= len_out) { return; }
    float sum = 0.0f;
    for (int m = 0; m < len_b; ++m) {
        int k = tid - m;
        if (0 <= k && k < len_a) {
            sum += ina[k] * inb[m];
        }
    }
    out[tid] = sum;
}
- Constant/shared memory optimization: the filter is copied into __constant__ memory and each block gets a dynamic shared-memory tile (see the sketch of conv3_kernel after the launcher below).
void conv2(const float *ina, const float *inb, float *out, size_t len_a, size_t len_b,
           size_t len_out, size_t numBlocks, size_t numThreads) {
    // copy the filter to constant memory and give each block one tile of shared memory
    cudaMemcpyToSymbol(c_b, inb, len_b * sizeof(float));
    size_t sharedSize = numThreads * sizeof(float);
    conv3_kernel<<<numBlocks, numThreads, sharedSize>>>(ina, out, len_a, len_b, len_out);
}
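conv3_kernel and the constant buffer c_b are not included in these notes. The following is a sketch of what they plausibly look like given the launch above: the filter sits in __constant__ memory (the capacity is my assumption) and each block stages numThreads input samples in dynamic shared memory, falling back to global memory for the halo:
#define MAX_FILTER_LEN 1024               // assumed upper bound for the filter length
__constant__ float c_b[MAX_FILTER_LEN];

__global__ void conv3_kernel(const float *ina, float *out,
                             size_t len_a, size_t len_b, size_t len_out) {
    extern __shared__ float s_a[];        // one input sample per thread of this block
    const int tid = blockIdx.x * blockDim.x + threadIdx.x;
    const int block_start = blockIdx.x * blockDim.x;
    s_a[threadIdx.x] = (tid < len_a) ? ina[tid] : 0.0f;
    __syncthreads();
    if (tid >= len_out) { return; }
    float sum = 0.0f;
    for (int m = 0; m < len_b; ++m) {
        const int k = tid - m;
        if (k < 0 || k >= (int)len_a) { continue; }
        // read from the shared tile when the sample belongs to this block,
        // otherwise fall back to global memory (the halo region)
        const float a = (k >= block_start && k < block_start + (int)blockDim.x)
                            ? s_a[k - block_start] : ina[k];
        sum += a * c_b[m];
    }
    out[tid] = sum;
}
In a real source file the __constant__ declaration has to appear before conv2 so that cudaMemcpyToSymbol can see the symbol.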