GEMM Optimization

Various GEMM implementations

    #define CEIL_DIV(m,n) ( ((m) + (n) - 1) / (n) )
    // Host-side launcher
    void test_mysgemm_v1(INT M, INT N, INT K, FLOAT alpha, FLOAT *A, FLOAT *B, FLOAT beta, FLOAT *C){
        cudaDeviceSynchronize();
        dim3 blockDim(32,32);
        // The grid size follows from the block size of 32: (M+32-1)/32, discarding the remainder
        dim3 gridDim(CEIL_DIV(M,32),CEIL_DIV(N,32));
        // A (M*K), B (K*N) and C (M*N) are all stored in column-major order
        mysgemm_v1<<<gridDim, blockDim>>>(M,N,K,alpha,A,B,beta,C);
        cudaDeviceSynchronize();
    }
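
For context, a minimal host-side driver sketch (my own addition, assuming INT/FLOAT are typedefs for int/float and that hA/hB/hC already hold column-major data; the name run_sgemm_v1 is illustrative only):

    // Hypothetical usage sketch, not part of the original code.
    void run_sgemm_v1(int M, int N, int K, const float *hA, const float *hB, float *hC){
        float *dA, *dB, *dC;
        cudaMalloc(&dA, sizeof(float) * M * K);
        cudaMalloc(&dB, sizeof(float) * K * N);
        cudaMalloc(&dC, sizeof(float) * M * N);
        cudaMemcpy(dA, hA, sizeof(float) * M * K, cudaMemcpyHostToDevice);
        cudaMemcpy(dB, hB, sizeof(float) * K * N, cudaMemcpyHostToDevice);
        cudaMemcpy(dC, hC, sizeof(float) * M * N, cudaMemcpyHostToDevice);
        test_mysgemm_v1(M, N, K, 1.0f, dA, dB, 0.0f, dC);   // C = 1.0*A*B + 0.0*C
        cudaMemcpy(hC, dC, sizeof(float) * M * N, cudaMemcpyDeviceToHost);
        cudaFree(dA); cudaFree(dB); cudaFree(dC);
    }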

Version 1: unoptimized (naive)

  • The GPU is used only for plain parallelism: every operand is read straight from global memory through the L2 cache, without using on-chip shared memory
  • During the reads from global memory, the accesses to B are laid out poorly for the warp's access pattern and end up costing multiple memory transactions

  #include<stdio.h>
  #include<stdlib.h>
  // A, B and C are column-major matrices; #define A(i,j) A[(i)+(j)*lda] maps the A(i,j) access onto the column-major layout
  #define A(i,j) A[(i) + (j)*lda]
  #define B(i,j) B[(i) + (j)*ldb]
  #define C(i,j) C[(i) + (j)*ldc]
  // naive version
  // Limit the number of threads per block to 1024
  __global__  __launch_bounds__(1024)
  // __launch_bounds__(1024) tells the compiler this kernel is launched with at most 1024 threads per block
  void mysgemm_v1(int M, int N, int K, float alpha, float* A, float* B, float beta, float* C){
      // work done inside one block
      int lda = M, ldb = K, ldc = M;
      int tx = threadIdx.x, ty = threadIdx.y;
      int bx = blockIdx.x, by = blockIdx.y;
      // Locate this block's tile of A, B and C
      A = &A((bx<<5),0);
      B = &B(0,(by<<5));
      C = &C((bx<<5),(by<<5));
      float tmp=0.;
      for (int k_count = 0; k_count<K; k_count++){
          // With column-major storage, reading A is fast: as tx varies the accesses to A are coalesced, while the accesses to B trigger extra transactions
          tmp += A(tx, k_count) * B(k_count, ty);
      }
      C(tx,ty) = alpha * tmp + beta*C(tx,ty);
  }
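
Written out as a formula (my own summary of the kernel above), thread (tx, ty) of block (bx, by) computes exactly one entry of C:

    C_{32 bx + tx,\, 32 by + ty} \;=\; \alpha \sum_{k=0}^{K-1} A_{32 bx + tx,\, k}\, B_{k,\, 32 by + ty} \;+\; \beta\, C_{32 bx + tx,\, 32 by + ty}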

Version 2: add shared memory (shared-memory bank conflicts not yet removed)

  • When accumulating tmp, the layout of the sa/sb tiles makes the shared-memory reads slow (bank conflicts)
  #include<stdio.h>
  #include<stdlib.h>
  #define A(i,j) A[(i) + (j)*lda]
  #define B(i,j) B[(i) + (j)*ldb]
  #define C(i,j) C[(i) + (j)*ldc]
  #define sa(i,j) sa[((i)<<5) + (j)]
  #define sb(i,j) sb[((i)<<5) + (j)]
  #define MS 32
  #define NS 32
  #define KS 32
  // cache blocking version, without register-level data re-use
  __global__  __launch_bounds__(1024)
  void mysgemm_v2(int M, int N, int K, float alpha, float* A, float* B, float beta, float* C){
      int lda = M, ldb = K, ldc = M;
      int tx = threadIdx.x, ty = threadIdx.y;
      int bx = blockIdx.x, by = blockIdx.y;
      A = &A((bx<<5),0);
      B = &B(0,(by<<5));
      C = &C((bx<<5),(by<<5));
      // Shared-memory tiles, each 32*32
      __shared__ float sa[MS*KS];
      __shared__ float sb[KS*NS];
      float tmp=0.;
      for (int k_count = 0; k_count<K; k_count+=KS){
          // sb is stored transposed (with the aim of avoiding bank conflicts on its reads)
          sa(tx,ty)=A(tx,ty);
          sb(ty,tx)=B(tx,ty);
          A+=(lda<<5);B+=32;
          __syncthreads();
          for (int inner_k_count=0;inner_k_count<KS;inner_k_count++){
              // Check for shared-memory bank conflicts: within a warp tx varies, and sa(tx, inner_k_count) = sa[tx*32 + inner_k_count] strides through shared memory by 32 floats, so all 32 lanes map to the same bank and the accesses serialize. The fix is to store the tiles in column-major order, see kernel 4
              tmp += sa(tx,inner_k_count) * sb(ty,inner_k_count);
          }
          __syncthreads();
      }
      C(tx,ty) = alpha * tmp + beta*C(tx,ty);
  }
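
To see the conflict concretely, a small host-side check (my own sketch, not part of the original code; shared memory has 32 four-byte banks, so a float at word index i lives in bank i % 32):

  // Hypothetical check: compare the bank hit by each lane of one warp under the two tile layouts.
  #include <stdio.h>
  int main(){
      for (int tx = 0; tx < 32; tx++){
          int idx_row_major = (tx << 5) + 0;   // sa(tx,k) with k = 0, the layout of kernels 2 and 3
          int idx_col_major = (0 << 5) + tx;   // sa4(row,k) with k = 0, the layout of kernel 4
          printf("lane %2d: bank %2d (row-major sa)   bank %2d (column-major sa4)\n",
                 tx, idx_row_major & 31, idx_col_major & 31);
      }
      return 0;   // row-major: all 32 lanes hit bank 0 (32-way conflict); column-major: 32 distinct banks
  }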

Version 3: keep the thread index in registers (1-D thread block)

  • Move the indexing values from local storage into registers; register accesses are the fastest storage on the GPU and can be issued several times per clock cycle
  void test_mysgemm_v3(INT M, INT N, INT K, FLOAT alpha, FLOAT *A, FLOAT *B, FLOAT beta, FLOAT *C){
      cudaDeviceSynchronize();
      dim3 blockDim(1024);
      dim3 gridDim(CEIL_DIV(M,32),CEIL_DIV(N,32));
      mysgemm_v3<<<gridDim, blockDim>>>(M,N,K,alpha,A,B,beta,C);
      cudaDeviceSynchronize();
  }
  #include<stdio.h>
  #include<stdlib.h>
  #define A(i,j) A[(i) + (j)*lda]
  #define B(i,j) B[(i) + (j)*ldb]
  #define C(i,j) C[(i) + (j)*ldc]
  #define sa(i,j) sa[((i)<<5) + (j)]
  #define sb(i,j) sb[((i)<<5) + (j)]
  #define MS 32
  #define NS 32
  #define KS 32
  // cache blocking version, without register-level data re-use
  // saves one live register (ty).
  __global__  __launch_bounds__(1024)
  void mysgemm_v3(int M, int N, int K, float alpha, float* A, float* B, float beta, float* C){

    // plain local variables
      int lda = M, ldb = K, ldc = M;
      int tx = threadIdx.x;
      int bx = blockIdx.x, by = blockIdx.y;
      // Register-based indexing: the block is launched as a 1-D block of 1024 threads, and row/col locate this thread inside the 32x32 tile: tx & 31 gives the row, tx >> 5 (i.e. tx/32) gives the column

      // kept in registers
      int row = tx&31, col = tx>>5;
      // Locate this block's tile of A, B and C
      A = &A((bx<<5),0);
      B = &B(0,(by<<5));
      C = &C((bx<<5),(by<<5));
      __shared__ float sa[MS*KS];
      __shared__ float sb[KS*NS];
      // the accumulator, a plain local variable
      float tmp=0.;
      for (int k_count = 0; k_count<K; k_count+=KS){
          sa(row,col)=A(row,col);
          sb(col,row)=B(row,col);
          A+=(lda<<5);B+=32;
          __syncthreads();
          for (int inner_k_count=0;inner_k_count<KS;inner_k_count++){

              tmp += sa(row,inner_k_count) * sb(col,inner_k_count);
          }
          __syncthreads();
      }
      C(row,col) = alpha * tmp + beta*C(row,col);
  }

Version 4: remove the shared-memory bank conflicts

  • Change the storage layout of the shared-memory tiles to remove the bank conflicts
  #include<stdio.h>
  #include<stdlib.h>
  #define A(i,j) A[(i) + (j)*lda]
  #define B(i,j) B[(i) + (j)*ldb]
  #define C(i,j) C[(i) + (j)*ldc]
  #define sa4(i,j) sa4[((j)<<5) + (i)]
  #define sb4(i,j) sb4[((j)<<5) + (i)]
  #define MS 32
  #define NS 32
  #define KS 32
  // cache blocking version, without register-level data re-use
  // with memory coalescing on shared memory
  __global__  __launch_bounds__(1024)
  void mysgemm_v4(int M, int N, int K, float alpha, float* A, float* B, float beta, float* C){
      int lda = M, ldb = K, ldc = M;
      int tx = threadIdx.x;
      int bx = blockIdx.x, by = blockIdx.y;
      int row = tx&31, col = tx>>5;
      A = &A((bx<<5),0);
      B = &B(0,(by<<5));
      C = &C((bx<<5),(by<<5));
      __shared__ float sa4[MS*KS];
      __shared__ float sb4[KS*NS];
      float tmp=0.;
      for (int k_count = 0; k_count<K; k_count+=KS){
          sa4(row,col)=A(row,col);
          sb4(col,row)=B(row,col);
          A+=(lda<<5);B+=32;
          __syncthreads();
          // The shared-memory tiles are now stored column-major, which removes the bank conflicts: as row varies within a warp, consecutive lanes touch consecutive shared-memory words
          for (int inner_k_count=0;inner_k_count<KS;inner_k_count++){
              tmp += sa4(row,inner_k_count) * sb4(col,inner_k_count);
          }
          __syncthreads();
      }
      C(row,col) = alpha * tmp + beta*C(row,col);
  }

Version 5: 4*1 micro-kernel

  • To address the problem that heavy per-thread resource usage leaves fewer warps able to run concurrently, each thread is given more work: it now computes 4 elements of C instead of 1
  void test_mysgemm_v5(INT M, INT N, INT K, FLOAT alpha, FLOAT *A, FLOAT *B, FLOAT beta, FLOAT *C){
      cudaDeviceSynchronize();
      dim3 blockDim(256);
      dim3 gridDim(CEIL_DIV(M,32),CEIL_DIV(N,32));
      mysgemm_v5<<<gridDim, blockDim>>>(M,N,K,alpha,A,B,beta,C);
      cudaDeviceSynchronize();
  }
  #include<stdio.h>
  #include<stdlib.h>
  // A、B、C为列主序矩阵
  #define A(i,j) A[(i) + (j)*lda]
  #define B(i,j) B[(i) + (j)*ldb]
  #define C(i,j) C[(i) + (j)*ldc]
  // the shared-memory tiles are also column-major
  #define sa5(i,j) sa5[((j)<<5) + (i)]
  #define sb5(i,j) sb5[((j)<<5) + (i)]
  #define MS 32
  #define NS 32
  #define KS 32
  // cache blocking version, without register-level data re-use
  // with memory coalescing on shared memory
  // more workloads per thread. 4x1 micro kernel.
  // Earlier kernels used 1024 threads per block; since the SM limits how many threads it can hold, having each thread handle 4 elements cuts the threads needed per block to 256 and lets more blocks be resident per SM
  __global__  __launch_bounds__(256)
  void mysgemm_v5(int M, int N, int K, float alpha, float* A, float* B, float beta, float* C){
      int lda = M, ldb = K, ldc = M;
      int tx = threadIdx.x;
      int bx = blockIdx.x, by = blockIdx.y;
      // Work out this thread's rows and column. The shared tile is 32*32 and each thread computes a 4x1 strip,
      // so one 32-row column is covered by 32/4 = 8 threads:
      //   row1 = (tx & 7) << 2 gives the first of the 4 rows (0, 4, ..., 28), with row2..row4 = row1+1..row1+3
      //   col  = tx >> 3 gives the column, so every group of 8 consecutive threads shares one column
      // e.g. tx = 0 -> rows 0..3 of column 0; tx = 1 -> rows 4..7 of column 0; tx = 8 -> rows 0..3 of column 1
      int row1 = (tx&7)<<2, row2 = row1+1, row3 = row1+2, row4 = row1+3, col = tx>>3;
      // Locate this block's starting corner (C(i,j) accumulates the dot product of row i of A with column j of B)
      A = &A((bx<<5),0);
      B = &B(0,(by<<5));
      C = &C((bx<<5),(by<<5));
      // Shared-memory tiles holding this block's data
      __shared__ float sa5[MS*KS];
      __shared__ float sb5[KS*NS];
      // Each thread accumulates 4 output elements
      float Cres[4] = {0., 0., 0., 0.};
      float b00;
      // Load each k tile of A and B from global memory into shared memory
      for (int k_count = 0; k_count<K; k_count+=KS){
          sa5(row1,col)=A(row1,col);
          sa5(row2,col)=A(row2,col);
          sa5(row3,col)=A(row3,col);
          sa5(row4,col)=A(row4,col);
          sb5(col,row1)=B(row1,col);
          sb5(col,row2)=B(row2,col);
          sb5(col,row3)=B(row3,col);
          sb5(col,row4)=B(row4,col);
          A+=(lda<<5);B+=32;
          __syncthreads();
          #pragma unroll
          // Multiply-accumulate over this k tile
          for (int inner_k_count=0;inner_k_count<KS;inner_k_count++){
              b00 = sb5(col,inner_k_count);
              Cres[0] += sa5(row1,inner_k_count) * b00;
              Cres[1] += sa5(row2,inner_k_count) * b00;
              Cres[2] += sa5(row3,inner_k_count) * b00;
              Cres[3] += sa5(row4,inner_k_count) * b00;
          }
          __syncthreads();
      }
      C(row1,col) = alpha * Cres[0] + beta*C(row1,col);
      C(row2,col) = alpha * Cres[1] + beta*C(row2,col);
      C(row3,col) = alpha * Cres[2] + beta*C(row3,col);
      C(row4,col) = alpha * Cres[3] + beta*C(row4,col);
  }
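
A quick host-side check of the 4*1 thread-to-output mapping used above (my own sketch, not part of the original code):

  // Hypothetical check: which 4x1 strip of the 32x32 tile does each thread of mysgemm_v5 own?
  #include <stdio.h>
  int main(){
      for (int tx = 0; tx < 256; tx += 51){    // sample a few thread ids
          int row1 = (tx & 7) << 2;
          int col  = tx >> 3;
          printf("tx=%3d -> rows %2d..%2d, col %2d\n", tx, row1, row1 + 3, col);
      }
      return 0;
  }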

Version 6: use float4 vectorized loads/stores for data movement

#include<stdio.h>
#include<stdlib.h>
#define A(i,j) A[(i) + (j)*lda]
#define B(i,j) B[(i) + (j)*ldb]
#define C(i,j) C[(i) + (j)*ldc]
#define sa6(i,j) sa6[((j)<<5) + (i)]
#define sb6(i,j) sb6[((j)<<5) + (i)]
#define MS 32
#define NS 32
#define KS 32
// cache blocking version, without register-level data re-use
// with memory coalescing on shared memory
// more workloads per thread. 4x1 micro kernel.
// adopt vectorized load/store
__global__  __launch_bounds__(256)
void mysgemm_v6(int M, int N, int K, float alpha, float* A, float* B, float beta, float* C){
    int lda = M, ldb = K, ldc = M;
    int tx = threadIdx.x;
    int bx = blockIdx.x, by = blockIdx.y;
    int row1 = (tx&7)<<2, row2 = row1+1, row3 = row1+2, row4 = row1+3, col = tx>>3;
    A = &A((bx<<5),0);
    B = &B(0,(by<<5));
    C = &C((bx<<5),(by<<5));
    __shared__ float sa6[MS*KS];
    __shared__ float sb6[KS*NS];
    // Use float4 so each load/store moves 128 bits (4 floats) in a single vectorized access
    float4 Av, Bv, Cv, Cres;
    Cres.x = 0., Cres.y = 0., Cres.z = 0., Cres.w = 0.;
    float b00;
    for (int k_count = 0; k_count<K; k_count+=KS){
        // Data moves from global memory to shared memory in float4 units
        Av = *((float4 *)(&A(row1,col)));
        Bv = *((float4 *)(&B(row1,col)));
        ((float4 *)sa6)[tx] = Av;
        sb6(col,row1)=Bv.x;
        sb6(col,row2)=Bv.y;
        sb6(col,row3)=Bv.z;
        sb6(col,row4)=Bv.w;
        A+=(lda<<5);B+=32;
        __syncthreads();
        #pragma unroll
        for (int inner_k_count=0;inner_k_count<KS;inner_k_count++){
            b00 = sb6(col, inner_k_count);
            Cres.x += sa6(row1,inner_k_count) * b00;
            Cres.y += sa6(row2,inner_k_count) * b00;
            Cres.z += sa6(row3,inner_k_count) * b00;
            Cres.w += sa6(row4,inner_k_count) * b00;
        }
        __syncthreads();
    }
    // The read of C also uses a 128-bit (float4) transaction, and the updated values are packed into a float4 for the write back
    Cv = *((float4 *)(&C(row1,col)));
    Cres.x = alpha * Cres.x + beta * Cv.x;
    Cres.y = alpha * Cres.y + beta * Cv.y;
    Cres.z = alpha * Cres.z + beta * Cv.z;
    Cres.w = alpha * Cres.w + beta * Cv.w;
    *(float4 *)(&(C(row1,col))) = Cres;
}
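
One caveat worth making explicit (my own addition): casting a float* to float4* is only valid when the address is 16-byte aligned, so this kernel implicitly relies on the leading dimensions being multiples of 4 and on the buffers coming from cudaMalloc (which returns pointers aligned to at least 256 bytes). A hedged sketch of such a guard, with an assumed helper name:

    // Hypothetical precondition check, not part of the original code.
    #include <assert.h>
    #include <stdint.h>
    void check_sgemm_v6_preconditions(int M, int N, int K, const float *A, const float *B, const float *C){
        // The tiled kernels in this write-up have no edge handling, so the tile sizes must divide the matrix sizes.
        assert(M % 32 == 0 && N % 32 == 0 && K % 32 == 0);
        // float4 loads/stores require 16-byte alignment; with lda = M and ldb = K being multiples of 4,
        // every vectorized access stays aligned as long as the base pointers are.
        assert(((uintptr_t)A & 15) == 0 && ((uintptr_t)B & 15) == 0 && ((uintptr_t)C & 15) == 0);
    }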

Version 7: 4*4 micro-kernel

    void test_mysgemm_v7(INT M, INT N, INT K, FLOAT alpha, FLOAT *A, FLOAT *B, FLOAT beta, FLOAT *C){
      cudaDeviceSynchronize();
      dim3 blockDim(256);
      // Each block computes a 64*64 tile of C; with a 4*4 micro-kernel per thread this requires 256 threads
      dim3 gridDim(CEIL_DIV(M,64),CEIL_DIV(N,64));
      mysgemm_v7<<<gridDim, blockDim>>>(M,N,K,alpha,A,B,beta,C);
      cudaDeviceSynchronize();
  }
  #include<stdio.h>
  #include<stdlib.h>
  #define A(i,j) A[(i) + (j)*lda]
  #define B(i,j) B[(i) + (j)*ldb]
  #define C(i,j) C[(i) + (j)*ldc]
  #define sa7(i,j) sa7[((j)<<6) + (i)]
  #define sb7(i,j) sb7[((j)<<6) + (i)]
  #define MS_7 64
  #define NS_7 64
  #define KS_7 16
  //v1 += v2 * s3, vector scaling
  #define vscal(v1, v2, s3)\
    v1.x+=v2.x*s3;\
    v1.y+=v2.y*s3;\
    v1.z+=v2.z*s3;\
    v1.w+=v2.w*s3;
  //v1 = alpha * v2 + beta * v3, simd fma
  #define simd_axpby(v1, alpha, v2, beta, v3)\
    v1.x=alpha*v2.x+beta*v3.x;\
    v1.y=alpha*v2.y+beta*v3.y;\
    v1.z=alpha*v2.z+beta*v3.z;\
    v1.w=alpha*v2.w+beta*v3.w;
  #define vload(v1,addr)\
    v1 = *((float4 *)(addr));
  #define vstore(addr,v1)\
    *((float4 *)(addr)) = v1;
  // cache blocking version, without register-level data re-use
  // with memory coalescing on shared memory
  // more workloads per thread. 4x4 micro kernel.
  // adopt vectorized load/store
  // Each block computes a 64*64 tile of C; with a 4*4 micro-kernel per thread this requires 256 threads.
  __global__  __launch_bounds__(256)
  void mysgemm_v7(int M, int N, int K, float alpha, float* A, float* B, float beta, float* C){
    int lda = M, ldb = K, ldc = M;
    int tx = threadIdx.x;
    int bx = blockIdx.x, by = blockIdx.y;
    // How the indices are derived: the C tile is 64*64 and each thread computes a 4*4 block, so 256 threads are needed.
    // The A tile is 64*16 and the B tile is 16*64. For loading A the threads are arranged 16*16 (each loading a float4
    // strip of 4 rows), so row_a = (tx & 15) << 2 and col_a = tx >> 4; for loading B they are arranged 4*64,
    // so row_b = (tx & 3) << 2 and col_b = tx >> 2. The << 2 reflects the 4 consecutive rows loaded per thread.
    int row_a = (tx&15)<<2, col_a = tx>>4;
    // Columns of C are handled in groups of 4: each thread owns one 4-column group of the 64*64 tile
    int row_b = (tx&3)<<2, col_b = tx>>2;
    // First column of C owned by this thread
    int col_c = col_a<<2;
    int lda16 = lda<<4;
    A = &A((bx<<6),0);                         
    B = &B(0,(by<<6));
    C = &C((bx<<6),(by<<6));//the TB size is 64.
    __shared__ float sa7[1024];
    __shared__ float sb7[1024];
    float4 Av, Bv, Cv[4], Cres[4];
    memset(Cres, 0, sizeof(Cres));
    // Load the current A and B tiles into shared memory
    for (int k_count = 0; k_count<K; k_count+=KS_7){
        vload(Av, &A(row_a,col_a))
        vload(Bv, &B(row_b,col_b))
        ((float4 *)sa7)[tx] = Av;
        sb7(col_b,row_b)=Bv.x;
        sb7(col_b,row_b+1)=Bv.y;
        sb7(col_b,row_b+2)=Bv.z;
        sb7(col_b,row_b+3)=Bv.w;
        A+=lda16;B+=16;
        __syncthreads();
        #pragma unroll
        // Multiply-accumulate on the 4*4 micro-tile
        for (int inner_k_count=0;inner_k_count<KS_7;inner_k_count++){
            vload(Av, &sa7(row_a,inner_k_count))
            vload(Bv, &sb7(col_c,inner_k_count))
            vscal(Cres[0], Av, Bv.x)
            vscal(Cres[1], Av, Bv.y)
            vscal(Cres[2], Av, Bv.z)
            vscal(Cres[3], Av, Bv.w)
        }
        __syncthreads();
    }
    vload(Cv[0], &C(row_a,col_c))
    vload(Cv[1], &C(row_a,col_c+1))
    vload(Cv[2], &C(row_a,col_c+2))
    vload(Cv[3], &C(row_a,col_c+3))
    simd_axpby(Cres[0],alpha,Cres[0],beta,Cv[0])
    simd_axpby(Cres[1],alpha,Cres[1],beta,Cv[1])
    simd_axpby(Cres[2],alpha,Cres[2],beta,Cv[2])
    simd_axpby(Cres[3],alpha,Cres[3],beta,Cv[3])

    // Write the results back to C
    vstore(&C(row_a,col_c), Cres[0])
    vstore(&C(row_a,col_c+1), Cres[1])
    vstore(&C(row_a,col_c+2), Cres[2])
    vstore(&C(row_a,col_c+3), Cres[3])
  }

Version 8: 8*8 micro-kernel

  void test_mysgemm_v8(INT M, INT N, INT K, FLOAT alpha, FLOAT *A, FLOAT *B, FLOAT beta, FLOAT *C){
    cudaDeviceSynchronize();
    dim3 blockDim(256);
    // Each block computes a 128*128 tile of C; with an 8*8 micro-kernel per thread this requires 256 threads
    dim3 gridDim(CEIL_DIV(M,128),CEIL_DIV(N,128));
    mysgemm_v8<<<gridDim, blockDim>>>(M,N,K,alpha,A,B,beta,C);
    cudaDeviceSynchronize();
  }
#include<stdio.h>
#include<stdlib.h>
#define A(i,j) A[(i) + (j)*lda]
#define B(i,j) B[(i) + (j)*ldb]
#define C(i,j) C[(i) + (j)*ldc]
#define sa8(i,j) sa8[((j)<<7) + (i)]
#define sb8(i,j) sb8[((j)<<7) + (i)]
#define MS_8 128
#define NS_8 128
#define KS_8 8
//v1 += v2 * s3, vector scaling
#define vscal(v1, v2, s3)\
    v1.x+=v2.x*s3;\
    v1.y+=v2.y*s3;\
    v1.z+=v2.z*s3;\
    v1.w+=v2.w*s3;
//v1 = alpha * v2 + beta * v3, simd fma
#define simd_axpby(v1, alpha, v2, beta, v3)\
    v1.x=alpha*v2.x+beta*v3.x;\
    v1.y=alpha*v2.y+beta*v3.y;\
    v1.z=alpha*v2.z+beta*v3.z;\
    v1.w=alpha*v2.w+beta*v3.w;
#define vload(v1,addr)\
    v1 = *((float4 *)(addr));
#define vstore(addr,v1)\
    *((float4 *)(addr)) = v1;
// cache blocking version, without register-level data re-use
// with memory coalescing on shared memory
// more workloads per thread. 8x8 micro kernel.
// adopt vectorized load/store
__global__  __launch_bounds__(256)
void mysgemm_v8(int M, int N, int K, float alpha, float* A, float* B, float beta, float* C){
    int lda = M, ldb = K, ldc = M;
    int tx = threadIdx.x;
    int bx = blockIdx.x, by = blockIdx.y;
    int row_a = (tx&31)<<2, col_a = tx>>5;
    int row_b = (tx&1)<<2, col_b = tx>>1;
    int lda8 = lda<<3;
    int row_c = (tx&15)<<3, col_c = (tx>>4)<<3;
    A = &A((bx<<7),0);
    B = &B(0,(by<<7));
    C = &C((bx<<7),(by<<7));//the TB size is 128.
    __shared__ float sa8[1024];
    __shared__ float sb8[1024];
    float4 Av1, Av2, Bv1, Bv2, Cv[16], Cres[16];
    memset(Cres, 0, sizeof(Cres));//clear registers
    for (int k_count = 0; k_count<K; k_count+=KS_8){
        vload(Av1, &A(row_a,col_a))
        vload(Bv1, &B(row_b,col_b))
        ((float4 *)sa8)[tx] = Av1;
        sb8(col_b,row_b)=Bv1.x;
        sb8(col_b,row_b+1)=Bv1.y;
        sb8(col_b,row_b+2)=Bv1.z;
        sb8(col_b,row_b+3)=Bv1.w;
        A+=lda8;B+=8;
        __syncthreads();
        #pragma unroll
        for (int inner_k_count=0;inner_k_count<KS_8;inner_k_count++){
            vload(Av1, &sa8(row_c,inner_k_count))
            vload(Av2, &sa8(row_c+4,inner_k_count))
            vload(Bv1, &sb8(col_c,inner_k_count))
            vload(Bv2, &sb8(col_c+4,inner_k_count))
            vscal(Cres[0], Av1, Bv1.x)
            vscal(Cres[1], Av2, Bv1.x)
            vscal(Cres[2], Av1, Bv1.y)
            vscal(Cres[3], Av2, Bv1.y)
            vscal(Cres[4], Av1, Bv1.z)
            vscal(Cres[5], Av2, Bv1.z)
            vscal(Cres[6], Av1, Bv1.w)
            vscal(Cres[7], Av2, Bv1.w)
            vscal(Cres[8], Av1, Bv2.x)
            vscal(Cres[9], Av2, Bv2.x)
            vscal(Cres[10], Av1, Bv2.y)
            vscal(Cres[11], Av2, Bv2.y)
            vscal(Cres[12], Av1, Bv2.z)
            vscal(Cres[13], Av2, Bv2.z)
            vscal(Cres[14], Av1, Bv2.w)
            vscal(Cres[15], Av2, Bv2.w)
        }
        __syncthreads();
    }
    vload(Cv[0], &C(row_c,col_c))
    vload(Cv[1], &C(row_c+4,col_c))
    vload(Cv[2], &C(row_c,col_c+1))
    vload(Cv[3], &C(row_c+4,col_c+1))
    vload(Cv[4], &C(row_c,col_c+2))
    vload(Cv[5], &C(row_c+4,col_c+2))
    vload(Cv[6], &C(row_c,col_c+3))
    vload(Cv[7], &C(row_c+4,col_c+3))
    vload(Cv[8], &C(row_c,col_c+4))
    vload(Cv[9], &C(row_c+4,col_c+4))
    vload(Cv[10], &C(row_c,col_c+5))
    vload(Cv[11], &C(row_c+4,col_c+5))
    vload(Cv[12], &C(row_c,col_c+6))
    vload(Cv[13], &C(row_c+4,col_c+6))
    vload(Cv[14], &C(row_c,col_c+7))
    vload(Cv[15], &C(row_c+4,col_c+7))
    
    simd_axpby(Cres[0],alpha,Cres[0],beta,Cv[0])
    simd_axpby(Cres[1],alpha,Cres[1],beta,Cv[1])
    simd_axpby(Cres[2],alpha,Cres[2],beta,Cv[2])
    simd_axpby(Cres[3],alpha,Cres[3],beta,Cv[3])

    simd_axpby(Cres[4],alpha,Cres[4],beta,Cv[4])
    simd_axpby(Cres[5],alpha,Cres[5],beta,Cv[5])
    simd_axpby(Cres[6],alpha,Cres[6],beta,Cv[6])
    simd_axpby(Cres[7],alpha,Cres[7],beta,Cv[7])

    simd_axpby(Cres[8],alpha,Cres[8],beta,Cv[8])
    simd_axpby(Cres[9],alpha,Cres[9],beta,Cv[9])
    simd_axpby(Cres[10],alpha,Cres[10],beta,Cv[10])
    simd_axpby(Cres[11],alpha,Cres[11],beta,Cv[11])

    simd_axpby(Cres[12],alpha,Cres[12],beta,Cv[12])
    simd_axpby(Cres[13],alpha,Cres[13],beta,Cv[13])
    simd_axpby(Cres[14],alpha,Cres[14],beta,Cv[14])
    simd_axpby(Cres[15],alpha,Cres[15],beta,Cv[15])

    vstore(&C(row_c,col_c), Cres[0])
    vstore(&C(row_c+4,col_c), Cres[1])
    vstore(&C(row_c,col_c+1), Cres[2])
    vstore(&C(row_c+4,col_c+1), Cres[3])
    vstore(&C(row_c,col_c+2), Cres[4])
    vstore(&C(row_c+4,col_c+2), Cres[5])
    vstore(&C(row_c,col_c+3), Cres[6])
    vstore(&C(row_c+4,col_c+3), Cres[7])
    vstore(&C(row_c,col_c+4), Cres[8])
    vstore(&C(row_c+4,col_c+4), Cres[9])
    vstore(&C(row_c,col_c+5), Cres[10])
    vstore(&C(row_c+4,col_c+5), Cres[11])
    vstore(&C(row_c,col_c+6), Cres[12])
    vstore(&C(row_c+4,col_c+6), Cres[13])
    vstore(&C(row_c,col_c+7), Cres[14])
    vstore(&C(row_c+4,col_c+7), Cres[15])
}
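
A rough way to quantify the register-level reuse gained by the larger micro-kernels (my own estimate from the inner loops above, not a figure from the original text): per inner iteration, the 4*1 kernel reads 5 floats from shared memory for 4 FMAs, the 4*4 kernel reads 8 floats for 16 FMAs, and the 8*8 kernel reads 16 floats for 64 FMAs, i.e.

    \text{v5 } (4\times 1):\ \tfrac{4+1}{4} = 1.25, \qquad \text{v7 } (4\times 4):\ \tfrac{4+4}{16} = 0.5, \qquad \text{v8 } (8\times 8):\ \tfrac{8+8}{64} = 0.25 \quad \text{shared-memory loads per FMA.}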

Version 9: 8*8 micro-kernel with warp tiling

  • The 256 threads are grouped into 8 warps (warp_id = tx>>5, lane_id = tx&31); each warp owns a 32*64 sub-tile of the 128*128 block tile (warp_row, warp_col) and each lane computes an 8*8 micro-tile inside it (row_w, col_w)

#include<stdio.h>
#include<stdlib.h>
#define A(i,j) A[(i) + (j)*lda]
#define B(i,j) B[(i) + (j)*ldb]
#define C(i,j) C[(i) + (j)*ldc]
#define sa9(i,j) sa9[((j)<<7) + (i)]
#define sb9(i,j) sb9[((j)<<7) + (i)]
#define MS_9 128
#define NS_9 128
#define KS_9 8
//v1 += v2 * s3, vector scaling
#define vscal(v1, v2, s3)\
    v1.x+=v2.x*s3;\
    v1.y+=v2.y*s3;\
    v1.z+=v2.z*s3;\
    v1.w+=v2.w*s3;
//v1 = alpha * v2 + beta * v3, simd fma
#define simd_axpby(v1, alpha, v2, beta, v3)\
    v1.x=alpha*v2.x+beta*v3.x;\
    v1.y=alpha*v2.y+beta*v3.y;\
    v1.z=alpha*v2.z+beta*v3.z;\
    v1.w=alpha*v2.w+beta*v3.w;
#define vload(v1,addr)\
    v1 = *((float4 *)(addr));
#define vstore(addr,v1)\
    *((float4 *)(addr)) = v1;
// cache blocking version, without register-level data re-use
// with memory coalescing on shared memory
// more workloads per thread. 8x8 micro kernel.
// adopt vectorized load/store
__global__  __launch_bounds__(256)
void mysgemm_v9(int M, int N, int K, float alpha, float* A, float* B, float beta, float* C){
    int lda = M, ldb = K, ldc = M;
    int tx = threadIdx.x;
    int bx = blockIdx.x, by = blockIdx.y;
    int warp_id = tx>>5;                      // which of the 8 warps this thread belongs to
    int lane_id = tx&31;                      // position inside the warp
    int warp_row = warp_id & 3, warp_col = warp_id >> 2;   // 4*2 grid of warps over the 128*128 tile
    int row_w = lane_id&3, col_w = lane_id>>2;              // 4*8 grid of lanes inside a warp's 32*64 sub-tile
    int row_b = (tx&1)<<2, col_b = tx>>1;     // loading layout for the 8*128 B tile
    int lda8 = lda<<3;
    int row_c = (warp_row<<5) + (row_w<<3), col_c = (warp_col<<6) + (col_w<<3);  // this thread's 8*8 output corner
    int row_a = (tx&31)<<2, col_a = tx>>5;    // loading layout for the 128*8 A tile
    A = &A((bx<<7),0);
    B = &B(0,(by<<7));
    C = &C((bx<<7),(by<<7));//the TB size is 128.
    __shared__ float sa9[1024];
    __shared__ float sb9[1024];
    float4 Av1, Av2, Bv1, Bv2, Cv[16], Cres[16];
    memset(Cres, 0, sizeof(Cres));//clear registers
    for (int k_count = 0; k_count<K; k_count+=KS_9){
        /*packing A and B into shared memory*/
        vload(Av1, &A(row_a,col_a))
        vload(Bv1, &B(row_b,col_b))
        ((float4 *)sa9)[tx] = Av1;
        sb9(col_b,row_b)=Bv1.x;
        sb9(col_b,row_b+1)=Bv1.y;
        sb9(col_b,row_b+2)=Bv1.z;
        sb9(col_b,row_b+3)=Bv1.w;
        A+=lda8;B+=8;
        __syncthreads();
        #pragma unroll
        for (int inner_k_count=0;inner_k_count<KS_9;inner_k_count++){
            vload(Av1, &sa9(row_c,inner_k_count))
            vload(Av2, &sa9(row_c+4,inner_k_count))
            vload(Bv1, &sb9(col_c,inner_k_count))
            vload(Bv2, &sb9(col_c+4,inner_k_count))
            vscal(Cres[0], Av1, Bv1.x)
            vscal(Cres[1], Av2, Bv1.x)
            vscal(Cres[2], Av1, Bv1.y)
            vscal(Cres[3], Av2, Bv1.y)
            vscal(Cres[4], Av1, Bv1.z)
            vscal(Cres[5], Av2, Bv1.z)
            vscal(Cres[6], Av1, Bv1.w)
            vscal(Cres[7], Av2, Bv1.w)
            vscal(Cres[8], Av1, Bv2.x)
            vscal(Cres[9], Av2, Bv2.x)
            vscal(Cres[10], Av1, Bv2.y)
            vscal(Cres[11], Av2, Bv2.y)
            vscal(Cres[12], Av1, Bv2.z)
            vscal(Cres[13], Av2, Bv2.z)
            vscal(Cres[14], Av1, Bv2.w)
            vscal(Cres[15], Av2, Bv2.w)
        }
        __syncthreads();
    }
    vload(Cv[0], &C(row_c,col_c))
    vload(Cv[1], &C(row_c+4,col_c))
    vload(Cv[2], &C(row_c,col_c+1))
    vload(Cv[3], &C(row_c+4,col_c+1))
    vload(Cv[4], &C(row_c,col_c+2))
    vload(Cv[5], &C(row_c+4,col_c+2))
    vload(Cv[6], &C(row_c,col_c+3))
    vload(Cv[7], &C(row_c+4,col_c+3))
    vload(Cv[8], &C(row_c,col_c+4))
    vload(Cv[9], &C(row_c+4,col_c+4))
    vload(Cv[10], &C(row_c,col_c+5))
    vload(Cv[11], &C(row_c+4,col_c+5))
    vload(Cv[12], &C(row_c,col_c+6))
    vload(Cv[13], &C(row_c+4,col_c+6))
    vload(Cv[14], &C(row_c,col_c+7))
    vload(Cv[15], &C(row_c+4,col_c+7))
    
    simd_axpby(Cres[0],alpha,Cres[0],beta,Cv[0])
    simd_axpby(Cres[1],alpha,Cres[1],beta,Cv[1])
    simd_axpby(Cres[2],alpha,Cres[2],beta,Cv[2])
    simd_axpby(Cres[3],alpha,Cres[3],beta,Cv[3])

    simd_axpby(Cres[4],alpha,Cres[4],beta,Cv[4])
    simd_axpby(Cres[5],alpha,Cres[5],beta,Cv[5])
    simd_axpby(Cres[6],alpha,Cres[6],beta,Cv[6])
    simd_axpby(Cres[7],alpha,Cres[7],beta,Cv[7])

    simd_axpby(Cres[8],alpha,Cres[8],beta,Cv[8])
    simd_axpby(Cres[9],alpha,Cres[9],beta,Cv[9])
    simd_axpby(Cres[10],alpha,Cres[10],beta,Cv[10])
    simd_axpby(Cres[11],alpha,Cres[11],beta,Cv[11])

    simd_axpby(Cres[12],alpha,Cres[12],beta,Cv[12])
    simd_axpby(Cres[13],alpha,Cres[13],beta,Cv[13])
    simd_axpby(Cres[14],alpha,Cres[14],beta,Cv[14])
    simd_axpby(Cres[15],alpha,Cres[15],beta,Cv[15])

    vstore(&C(row_c,col_c), Cres[0])
    vstore(&C(row_c+4,col_c), Cres[1])
    vstore(&C(row_c,col_c+1), Cres[2])
    vstore(&C(row_c+4,col_c+1), Cres[3])
    vstore(&C(row_c,col_c+2), Cres[4])
    vstore(&C(row_c+4,col_c+2), Cres[5])
    vstore(&C(row_c,col_c+3), Cres[6])
    vstore(&C(row_c+4,col_c+3), Cres[7])
    vstore(&C(row_c,col_c+4), Cres[8])
    vstore(&C(row_c+4,col_c+4), Cres[9])
    vstore(&C(row_c,col_c+5), Cres[10])
    vstore(&C(row_c+4,col_c+5), Cres[11])
    vstore(&C(row_c,col_c+6), Cres[12])
    vstore(&C(row_c+4,col_c+6), Cres[13])
    vstore(&C(row_c,col_c+7), Cres[14])
    vstore(&C(row_c+4,col_c+7), Cres[15])
}
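
A quick host-side check of the warp-tiled mapping used above (my own sketch, not part of the original code):

  // Hypothetical check: reproduce the mysgemm_v9 thread -> C-tile mapping on the host.
  #include <stdio.h>
  int main(){
      for (int tx = 0; tx < 256; tx += 37){    // sample a few thread ids
          int warp_id = tx >> 5, lane_id = tx & 31;
          int warp_row = warp_id & 3, warp_col = warp_id >> 2;
          int row_w = lane_id & 3, col_w = lane_id >> 2;
          int row_c = (warp_row << 5) + (row_w << 3);
          int col_c = (warp_col << 6) + (col_w << 3);
          printf("tx=%3d -> warp %d computes C[%3d..%3d][%3d..%3d]\n",
                 tx, warp_id, row_c, row_c + 7, col_c, col_c + 7);
      }
      return 0;
  }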


Version 10: software prefetching (double-buffered registers)

  • While the current k tile is being multiplied, the next tile of A/B is already being fetched from global memory into registers (pref_Av/pref_Bv), and the next shared-memory fragment into a second set of register buffers (Av1[2], Av2[2], Bv1[2], Bv2[2]), overlapping memory latency with the FMAs

#include<stdio.h>
#include<stdlib.h>
#define A(i,j) A[(i) + (j)*lda]
#define B(i,j) B[(i) + (j)*ldb]
#define ptr_A(i,j) ptr_A[(i) + (j)*lda]
#define ptr_B(i,j) ptr_B[(i) + (j)*ldb]
#define C(i,j) C[(i) + (j)*ldc]
#define sa10(i,j) sa10[((j)<<7) + (i)]
#define sb10(i,j) sb10[((j)<<7) + (i)]
#define MS_10 128
#define NS_10 128
#define KS_10 8
//v1 += v2 * s3, vector scaling
#define vscal(v1, v2, s3)\
    v1.x+=v2.x*s3;\
    v1.y+=v2.y*s3;\
    v1.z+=v2.z*s3;\
    v1.w+=v2.w*s3;
//v1 = alpha * v2 + beta * v3, simd fma
#define simd_axpby(v1, alpha, v2, beta, v3)\
    v1.x=alpha*v2.x+beta*v3.x;\
    v1.y=alpha*v2.y+beta*v3.y;\
    v1.z=alpha*v2.z+beta*v3.z;\
    v1.w=alpha*v2.w+beta*v3.w;
#define vload(v1,addr)\
    v1 = *((float4 *)(addr));
#define vstore(addr,v1)\
    *((float4 *)(addr)) = v1;
// cache blocking version, without register-level data re-use
// with memory coalescing on shared memory
// more workloads per thread. 8x8 micro kernel.
// adopt vectorized load/store
__global__  __launch_bounds__(256)
void mysgemm_v10(int M, int N, int K, float alpha, float* A, float* B, float beta, float* C){
    int lda = M, ldb = K, ldc = M;
    int tx = threadIdx.x;
    int bx = blockIdx.x, by = blockIdx.y;
    int warp_id = tx>>5;
    int lane_id = tx&31;
    int warp_row = warp_id & 3, warp_col = warp_id >> 2;
    int row_w = lane_id&3, col_w = lane_id>>2;
    int row_b = (tx&1)<<2, col_b = tx>>1;
    int lda8 = lda<<3;
    int row_c = (warp_row<<5) + (row_w<<3), col_c = (warp_col<<6) + (col_w<<3);
    int row_a = (tx&31)<<2, col_a = tx>>5;
    int K_upper = K>>3;
    A = &A((bx<<7),0);
    B = &B(0,(by<<7));
    C = &C((bx<<7),(by<<7));//the TB size is 128.
    __shared__ float sa10[1024];
    __shared__ float sb10[1024];
    float4 Av1[2], Av2[2], Bv1[2], Bv2[2], Cv[16], Cres[16];
    float4 pref_Av, pref_Bv;
    float* ptr_A, *ptr_B;
    memset(Cres, 0, sizeof(Cres));//clear registers
    vload(pref_Av, &A(row_a,col_a))
    vload(pref_Bv, &B(row_b,col_b))
    ((float4 *)sa10)[tx] = pref_Av;
    sb10(col_b,row_b)=pref_Bv.x;
    sb10(col_b,row_b+1)=pref_Bv.y;
    sb10(col_b,row_b+2)=pref_Bv.z;
    sb10(col_b,row_b+3)=pref_Bv.w;
    __syncthreads();
    vload(Av1[0], &sa10(row_c,0))
    vload(Av2[0], &sa10(row_c+4,0))
    vload(Bv1[0], &sb10(col_c,0))
    vload(Bv2[0], &sb10(col_c+4,0))
    for (int k_count = 0; k_count<K_upper; k_count++){
        /*packing A and B into shared memory*/
        int inc = (k_count+1)%K_upper;
        ptr_A = A + inc * lda8;
        ptr_B = B + inc * 8;
        vload(pref_Av, &ptr_A(row_a,col_a))
        vload(pref_Bv, &ptr_B(row_b,col_b))
        #pragma unroll
        for (int inner_k_count=0;inner_k_count<KS_10;inner_k_count++){
            int next_inner_k_count = (inner_k_count+1)&7;
            vload(Av1[(inner_k_count+1)&1], &sa10(row_c,next_inner_k_count))
            vload(Av2[(inner_k_count+1)&1], &sa10(row_c+4,next_inner_k_count))
            vload(Bv1[(inner_k_count+1)&1], &sb10(col_c,next_inner_k_count))
            vload(Bv2[(inner_k_count+1)&1], &sb10(col_c+4,next_inner_k_count))
            vscal(Cres[0], Av1[(inner_k_count)&1], Bv1[(inner_k_count)&1].x)
            vscal(Cres[1], Av2[(inner_k_count)&1], Bv1[(inner_k_count)&1].x)
            vscal(Cres[2], Av1[(inner_k_count)&1], Bv1[(inner_k_count)&1].y)
            vscal(Cres[3], Av2[(inner_k_count)&1], Bv1[(inner_k_count)&1].y)
            vscal(Cres[4], Av1[(inner_k_count)&1], Bv1[(inner_k_count)&1].z)
            vscal(Cres[5], Av2[(inner_k_count)&1], Bv1[(inner_k_count)&1].z)
            vscal(Cres[6], Av1[(inner_k_count)&1], Bv1[(inner_k_count)&1].w)
            vscal(Cres[7], Av2[(inner_k_count)&1], Bv1[(inner_k_count)&1].w)
            vscal(Cres[8], Av1[(inner_k_count)&1], Bv2[(inner_k_count)&1].x)
            vscal(Cres[9], Av2[(inner_k_count)&1], Bv2[(inner_k_count)&1].x)
            vscal(Cres[10], Av1[(inner_k_count)&1], Bv2[(inner_k_count)&1].y)
            vscal(Cres[11], Av2[(inner_k_count)&1], Bv2[(inner_k_count)&1].y)
            vscal(Cres[12], Av1[(inner_k_count)&1], Bv2[(inner_k_count)&1].z)
            vscal(Cres[13], Av2[(inner_k_count)&1], Bv2[(inner_k_count)&1].z)
            vscal(Cres[14], Av1[(inner_k_count)&1], Bv2[(inner_k_count)&1].w)
            vscal(Cres[15], Av2[(inner_k_count)&1], Bv2[(inner_k_count)&1].w)
        }
        __syncthreads();
        ((float4 *)sa10)[tx] = pref_Av;
        sb10(col_b,row_b)=pref_Bv.x;
        sb10(col_b,row_b+1)=pref_Bv.y;
        sb10(col_b,row_b+2)=pref_Bv.z;
        sb10(col_b,row_b+3)=pref_Bv.w;
        __syncthreads();
        vload(Av1[0], &sa10(row_c,0))
        vload(Av2[0], &sa10(row_c+4,0))
        vload(Bv1[0], &sb10(col_c,0))
        vload(Bv2[0], &sb10(col_c+4,0))
    }
    vload(Cv[0], &C(row_c,col_c))
    vload(Cv[1], &C(row_c+4,col_c))
    vload(Cv[2], &C(row_c,col_c+1))
    vload(Cv[3], &C(row_c+4,col_c+1))
    vload(Cv[4], &C(row_c,col_c+2))
    vload(Cv[5], &C(row_c+4,col_c+2))
    vload(Cv[6], &C(row_c,col_c+3))
    vload(Cv[7], &C(row_c+4,col_c+3))
    vload(Cv[8], &C(row_c,col_c+4))
    vload(Cv[9], &C(row_c+4,col_c+4))
    vload(Cv[10], &C(row_c,col_c+5))
    vload(Cv[11], &C(row_c+4,col_c+5))
    vload(Cv[12], &C(row_c,col_c+6))
    vload(Cv[13], &C(row_c+4,col_c+6))
    vload(Cv[14], &C(row_c,col_c+7))
    vload(Cv[15], &C(row_c+4,col_c+7))
    
    simd_axpby(Cres[0],alpha,Cres[0],beta,Cv[0])
    simd_axpby(Cres[1],alpha,Cres[1],beta,Cv[1])
    simd_axpby(Cres[2],alpha,Cres[2],beta,Cv[2])
    simd_axpby(Cres[3],alpha,Cres[3],beta,Cv[3])

    simd_axpby(Cres[4],alpha,Cres[4],beta,Cv[4])
    simd_axpby(Cres[5],alpha,Cres[5],beta,Cv[5])
    simd_axpby(Cres[6],alpha,Cres[6],beta,Cv[6])
    simd_axpby(Cres[7],alpha,Cres[7],beta,Cv[7])

    simd_axpby(Cres[8],alpha,Cres[8],beta,Cv[8])
    simd_axpby(Cres[9],alpha,Cres[9],beta,Cv[9])
    simd_axpby(Cres[10],alpha,Cres[10],beta,Cv[10])
    simd_axpby(Cres[11],alpha,Cres[11],beta,Cv[11])

    simd_axpby(Cres[12],alpha,Cres[12],beta,Cv[12])
    simd_axpby(Cres[13],alpha,Cres[13],beta,Cv[13])
    simd_axpby(Cres[14],alpha,Cres[14],beta,Cv[14])
    simd_axpby(Cres[15],alpha,Cres[15],beta,Cv[15])

    vstore(&C(row_c,col_c), Cres[0])
    vstore(&C(row_c+4,col_c), Cres[1])
    vstore(&C(row_c,col_c+1), Cres[2])
    vstore(&C(row_c+4,col_c+1), Cres[3])
    vstore(&C(row_c,col_c+2), Cres[4])
    vstore(&C(row_c+4,col_c+2), Cres[5])
    vstore(&C(row_c,col_c+3), Cres[6])
    vstore(&C(row_c+4,col_c+3), Cres[7])
    vstore(&C(row_c,col_c+4), Cres[8])
    vstore(&C(row_c+4,col_c+4), Cres[9])
    vstore(&C(row_c,col_c+5), Cres[10])
    vstore(&C(row_c+4,col_c+5), Cres[11])
    vstore(&C(row_c,col_c+6), Cres[12])
    vstore(&C(row_c+4,col_c+6), Cres[13])
    vstore(&C(row_c,col_c+7), Cres[14])
    vstore(&C(row_c+4,col_c+7), Cres[15])
}

Version 11: double-buffered shared memory

  • Two shared-memory buffers (sa11[2][1024], sb11[2][1024]) are alternated between iterations: the prefetched tile is written into the buffer that is not currently being read, which removes one of the two __syncthreads() per k tile that version 10 needed

#include<stdio.h>
#include<stdlib.h>
#define A(i,j) A[(i) + (j)*lda]
#define B(i,j) B[(i) + (j)*ldb]
#define ptr_A(i,j) ptr_A[(i) + (j)*lda]
#define ptr_B(i,j) ptr_B[(i) + (j)*ldb]
#define C(i,j) C[(i) + (j)*ldc]
#define sa11(i,j) sa11[((j)<<7) + (i)]
#define sb11(i,j) sb11[((j)<<7) + (i)]
#define ptr_sa11(i,j) ptr_sa11[((j)<<7) + (i)]
#define ptr_sb11(i,j) ptr_sb11[((j)<<7) + (i)]
#define MS_11 128
#define NS_11 128
#define KS_11 8
//v1 += v2 * s3, vector scaling
#define vscal(v1, v2, s3)\
    v1.x+=v2.x*s3;\
    v1.y+=v2.y*s3;\
    v1.z+=v2.z*s3;\
    v1.w+=v2.w*s3;
//v1 = alpha * v2 + beta * v3, simd fma
#define simd_axpby(v1, alpha, v2, beta, v3)\
    v1.x=alpha*v2.x+beta*v3.x;\
    v1.y=alpha*v2.y+beta*v3.y;\
    v1.z=alpha*v2.z+beta*v3.z;\
    v1.w=alpha*v2.w+beta*v3.w;
#define vload(v1,addr)\
    v1 = *((float4 *)(addr));
#define vstore(addr,v1)\
    *((float4 *)(addr)) = v1;
// cache blocking version, without register-level data re-use
// with memory coalescing on shared memory
// more workloads per thread. 8x8 micro kernel.
// adopt vectorized load/store
__global__  __launch_bounds__(256)
void mysgemm_v11(int M, int N, int K, float alpha, float* A, float* B, float beta, float* C){
    int lda = M, ldb = K, ldc = M;
    int tx = threadIdx.x;
    int bx = blockIdx.x, by = blockIdx.y;
    int warp_id = tx>>5;
    int lane_id = tx&31;
    int warp_row = warp_id & 3, warp_col = warp_id >> 2;
    int row_w = lane_id&3, col_w = lane_id>>2;
    int row_b = (tx&1)<<2, col_b = tx>>1;
    int lda8 = lda<<3;
    int row_c = (warp_row<<5) + (row_w<<3), col_c = (warp_col<<6) + (col_w<<3);
    int row_a = (tx&31)<<2, col_a = tx>>5;
    int K_upper = K>>3;
    A = &A((bx<<7),0);
    B = &B(0,(by<<7));
    C = &C((bx<<7),(by<<7));//the TB size is 128.
    __shared__ float sa11[2][1024];
    __shared__ float sb11[2][1024];
    float *ptr_sa11, *ptr_sb11;
    ptr_sa11 = (float*)sa11;
    ptr_sb11 = (float*)sb11;
    float4 Av1[2], Av2[2], Bv1[2], Bv2[2], Cv[16], Cres[16];
    float4 pref_Av, pref_Bv;
    float* ptr_A, *ptr_B;
    memset(Cres, 0, sizeof(Cres));//clear registers
    vload(pref_Av, &A(row_a,col_a))
    vload(pref_Bv, &B(row_b,col_b))
    ((float4 *)ptr_sa11)[tx] = pref_Av;
    ptr_sb11(col_b,row_b)=pref_Bv.x;
    ptr_sb11(col_b,row_b+1)=pref_Bv.y;
    ptr_sb11(col_b,row_b+2)=pref_Bv.z;
    ptr_sb11(col_b,row_b+3)=pref_Bv.w;
    __syncthreads();
    vload(Av1[0], &ptr_sa11(row_c,0))
    vload(Av2[0], &ptr_sa11(row_c+4,0))
    vload(Bv1[0], &ptr_sb11(col_c,0))
    vload(Bv2[0], &ptr_sb11(col_c+4,0))
    for (int k_count = 0; k_count<K_upper; k_count++){
        /*packing A and B into shared memory*/
        int inc = (k_count+1)%K_upper;
        int offset = ((k_count+1)&1)<<10;
        ptr_A = A + inc * lda8;
        ptr_B = B + inc * 8;
        vload(pref_Av, &ptr_A(row_a,col_a))
        vload(pref_Bv, &ptr_B(row_b,col_b))
        #pragma unroll
        for (int inner_k_count=0;inner_k_count<KS_11;inner_k_count++){
            int next_inner_k_count = (inner_k_count+1)&7;
            vload(Av1[(inner_k_count+1)&1], &ptr_sa11(row_c,next_inner_k_count))
            vload(Av2[(inner_k_count+1)&1], &ptr_sa11(row_c+4,next_inner_k_count))
            vload(Bv1[(inner_k_count+1)&1], &ptr_sb11(col_c,next_inner_k_count))
            vload(Bv2[(inner_k_count+1)&1], &ptr_sb11(col_c+4,next_inner_k_count))
            vscal(Cres[0], Av1[(inner_k_count)&1], Bv1[(inner_k_count)&1].x)
            vscal(Cres[1], Av2[(inner_k_count)&1], Bv1[(inner_k_count)&1].x)
            vscal(Cres[2], Av1[(inner_k_count)&1], Bv1[(inner_k_count)&1].y)
            vscal(Cres[3], Av2[(inner_k_count)&1], Bv1[(inner_k_count)&1].y)
            vscal(Cres[4], Av1[(inner_k_count)&1], Bv1[(inner_k_count)&1].z)
            vscal(Cres[5], Av2[(inner_k_count)&1], Bv1[(inner_k_count)&1].z)
            vscal(Cres[6], Av1[(inner_k_count)&1], Bv1[(inner_k_count)&1].w)
            vscal(Cres[7], Av2[(inner_k_count)&1], Bv1[(inner_k_count)&1].w)
            vscal(Cres[8], Av1[(inner_k_count)&1], Bv2[(inner_k_count)&1].x)
            vscal(Cres[9], Av2[(inner_k_count)&1], Bv2[(inner_k_count)&1].x)
            vscal(Cres[10], Av1[(inner_k_count)&1], Bv2[(inner_k_count)&1].y)
            vscal(Cres[11], Av2[(inner_k_count)&1], Bv2[(inner_k_count)&1].y)
            vscal(Cres[12], Av1[(inner_k_count)&1], Bv2[(inner_k_count)&1].z)
            vscal(Cres[13], Av2[(inner_k_count)&1], Bv2[(inner_k_count)&1].z)
            vscal(Cres[14], Av1[(inner_k_count)&1], Bv2[(inner_k_count)&1].w)
            vscal(Cres[15], Av2[(inner_k_count)&1], Bv2[(inner_k_count)&1].w)
        }
        ptr_sa11 = (float*)sa11 + offset;
        ptr_sb11 = (float*)sb11 + offset;
        ((float4 *)ptr_sa11)[tx] = pref_Av;
        ptr_sb11(col_b,row_b)=pref_Bv.x;
        ptr_sb11(col_b,row_b+1)=pref_Bv.y;
        ptr_sb11(col_b,row_b+2)=pref_Bv.z;
        ptr_sb11(col_b,row_b+3)=pref_Bv.w;
        __syncthreads();
        vload(Av1[0], &ptr_sa11(row_c,0))
        vload(Av2[0], &ptr_sa11(row_c+4,0))
        vload(Bv1[0], &ptr_sb11(col_c,0))
        vload(Bv2[0], &ptr_sb11(col_c+4,0))
    }
    vload(Cv[0], &C(row_c,col_c))
    vload(Cv[1], &C(row_c+4,col_c))
    vload(Cv[2], &C(row_c,col_c+1))
    vload(Cv[3], &C(row_c+4,col_c+1))
    vload(Cv[4], &C(row_c,col_c+2))
    vload(Cv[5], &C(row_c+4,col_c+2))
    vload(Cv[6], &C(row_c,col_c+3))
    vload(Cv[7], &C(row_c+4,col_c+3))
    vload(Cv[8], &C(row_c,col_c+4))
    vload(Cv[9], &C(row_c+4,col_c+4))
    vload(Cv[10], &C(row_c,col_c+5))
    vload(Cv[11], &C(row_c+4,col_c+5))
    vload(Cv[12], &C(row_c,col_c+6))
    vload(Cv[13], &C(row_c+4,col_c+6))
    vload(Cv[14], &C(row_c,col_c+7))
    vload(Cv[15], &C(row_c+4,col_c+7))
    
    simd_axpby(Cres[0],alpha,Cres[0],beta,Cv[0])
    simd_axpby(Cres[1],alpha,Cres[1],beta,Cv[1])
    simd_axpby(Cres[2],alpha,Cres[2],beta,Cv[2])
    simd_axpby(Cres[3],alpha,Cres[3],beta,Cv[3])

    simd_axpby(Cres[4],alpha,Cres[4],beta,Cv[4])
    simd_axpby(Cres[5],alpha,Cres[5],beta,Cv[5])
    simd_axpby(Cres[6],alpha,Cres[6],beta,Cv[6])
    simd_axpby(Cres[7],alpha,Cres[7],beta,Cv[7])

    simd_axpby(Cres[8],alpha,Cres[8],beta,Cv[8])
    simd_axpby(Cres[9],alpha,Cres[9],beta,Cv[9])
    simd_axpby(Cres[10],alpha,Cres[10],beta,Cv[10])
    simd_axpby(Cres[11],alpha,Cres[11],beta,Cv[11])

    simd_axpby(Cres[12],alpha,Cres[12],beta,Cv[12])
    simd_axpby(Cres[13],alpha,Cres[13],beta,Cv[13])
    simd_axpby(Cres[14],alpha,Cres[14],beta,Cv[14])
    simd_axpby(Cres[15],alpha,Cres[15],beta,Cv[15])

    vstore(&C(row_c,col_c), Cres[0])
    vstore(&C(row_c+4,col_c), Cres[1])
    vstore(&C(row_c,col_c+1), Cres[2])
    vstore(&C(row_c+4,col_c+1), Cres[3])
    vstore(&C(row_c,col_c+2), Cres[4])
    vstore(&C(row_c+4,col_c+2), Cres[5])
    vstore(&C(row_c,col_c+3), Cres[6])
    vstore(&C(row_c+4,col_c+3), Cres[7])
    vstore(&C(row_c,col_c+4), Cres[8])
    vstore(&C(row_c+4,col_c+4), Cres[9])
    vstore(&C(row_c,col_c+5), Cres[10])
    vstore(&C(row_c+4,col_c+5), Cres[11])
    vstore(&C(row_c,col_c+6), Cres[12])
    vstore(&C(row_c+4,col_c+6), Cres[13])
    vstore(&C(row_c,col_c+7), Cres[14])
    vstore(&C(row_c+4,col_c+7), Cres[15])
}