Stargazer's blog

渴望驻足的时光转瞬即逝
希求解脱的日夜寸阴若岁

0%

leetgpu

记录一下刷leetgpu的学习,cuda入门

Vector Addition

没什么好说的,注意$tid<n$ 就行

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
#include <cuda_runtime.h>

// Element-wise vector sum: C[i] = A[i] + B[i] for i in [0, N).
// 1-D launch, one element per thread; the guard covers the grid tail.
__global__ void vector_add(const float* A, const float* B, float* C, int N) {
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx >= N) return;
    C[idx] = A[idx] + B[idx];
}
// A, B, C are device pointers (i.e. pointers to memory on the GPU)
extern "C" void solve(const float* A, const float* B, float* C, int N) {
    const int block = 256;
    const int grid = (N + block - 1) / block;  // ceil-div so the tail is covered
    vector_add<<<grid, block>>>(A, B, C, N);
    cudaDeviceSynchronize();  // kernel launch is async; block until it finishes
}

Matrix Multiplication

$MNK$,最基础的矩乘很好写,对于二维block的每个thread算对应的每个点值就行,枚举 $n$

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
#include <cuda_runtime.h>
// Naive matmul: C[M,K] = A[M,N] * B[N,K], one thread per output element.
// Thread x covers the K (column) dimension, thread y the M (row) dimension,
// matching the grid built by solve() below.
__global__ void matrix_multiplication_kernel(const float* A, const float* B, float* C, int M, int N, int K) {
    int cCol = blockDim.x * blockIdx.x + threadIdx.x;  // column of C, in [0, K)
    int cRow = blockDim.y * blockIdx.y + threadIdx.y;  // row of C, in [0, M)
    if (cRow >= M || cCol >= K) return;                // bounds guard

    float acc = 0.0f;
    for (int i = 0; i < N; ++i) {
        acc += A[cRow * N + i] * B[i * K + cCol];
    }
    C[cRow * K + cCol] = acc;
}
// A, B, C are device pointers (i.e. pointers to memory on the GPU)
extern "C" void solve(const float* A, const float* B, float* C, int M, int N, int K) {
    // 16x16 threads; grid.x tiles the K columns, grid.y tiles the M rows.
    const dim3 block(16, 16);
    const dim3 grid((K + block.x - 1) / block.x,
                    (M + block.y - 1) / block.y);
    matrix_multiplication_kernel<<<grid, block>>>(A, B, C, M, N, K);
    cudaDeviceSynchronize();
}

更进一步的tile分块,每次算C的一个 $BL \times BL$ 的tile

注意坐标计算,以及row和col,A[M,N],B[N,K],C[M,K]

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
// Tile side length. The launch configuration must use BL x BL thread blocks:
// the shared tiles below are sized by BL while the loads index by blockDim.
const int BL = 16;

// Tiled matmul: C[M,K] = A[M,N] * B[N,K]. Each block computes one BL x BL
// tile of C, walking the shared N dimension tile by tile through shared memory.
__global__ void matrix_multiplication_kernel(const float* A, const float* B, float* C, int M, int N,
int K) {
int col = blockDim.x * blockIdx.x + threadIdx.x; // note: block x is the column, block y is the row
int row = blockDim.y * blockIdx.y + threadIdx.y;
int pc = threadIdx.x; // column within the tile
int pr = threadIdx.y; // row within the tile
int num_tiles = (N + blockDim.x - 1) / blockDim.x; // ceil-div: tiles along N
__shared__ float As[BL][BL];
__shared__ float Bs[BL][BL];

float sum = 0.0;
for(int i=0;i<num_tiles;i++){
int colA = i*blockDim.x+threadIdx.x; // column of A this thread loads
int rowB = i*blockDim.y+threadIdx.y; // row of B this thread loads
if(row<M&&colA<N){
As[pr][pc] = A[row*N+colA];
}
else As[pr][pc] = 0; // zero-pad out-of-range loads so they don't affect the dot product
if(rowB<N&&col<K){
Bs[pr][pc] = B[rowB*K+col];
}
else Bs[pr][pc] = 0;
__syncthreads(); // both tiles fully written before anyone reads them
for(int j=0;j<BL;j++){
sum += As[pr][j] * Bs[j][pc];
}
__syncthreads(); // all reads done before the next iteration overwrites the tiles

}
if(row<M&&col<K){// guard: only in-range elements of C are written
C[row*K+col] = sum;
}
}

Reduction

__shfl_down_sync(mask,val,offset) 可以对于一个warp里的线程同步计算,对于mask这些线程,返回+offset编号的线程的value,没有这个线程则忽略,up是前面的,xor是异或offset的

一个经典写法求warp的value之和就是

1
2
3
for(int i = warpsize>>1;i>0;i>>=1){
value += __shfl_down_sync(0xffffffff,value,i);
}

这样$laneid=0$的线程就有线程之和了,如果用$xor$的话就是每个线程都有所有值之和,不过需要注意一下mask,也可以先down然后广播

1
val = __shfl_sync(0xffffffff, val, 0);

然后这里就有多种写法了,可以开一个block,然后每个线程先求 $laneid+k*stripe$ 的和,然后再对线程求和(可以看后面的softmax)。

另外就是正常开,然后对于一个block里面线程先求和,然后block可以利用 __syncthreads求和,然后每个block用atomicAdd来做(因为block间没法同步)(warp同步可以__syncwarp(mask))。

然后一个block里面的求和就是先每个warp分别求和,然后每个warp同步到shared的warpid,然后再用warpnum个线程再做,这个次数就取决于blocksize了,直到只需要一个warp为止。

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
#include <cuda_runtime.h>
#define warpsize 32
#define blocksize 1024
// Grid-wide sum of A[0..N) accumulated into *B via one atomicAdd per block.
// Two-stage reduction: shuffle within each warp, then warp 0 combines the
// per-warp partials from shared memory.
// NOTE(review): assumes *B was zeroed before launch (atomicAdd accumulates)
// and blockDim.x == blocksize — confirm against the harness.
__global__ void cuda_reduction(const float* A, float* B,const int N){
int tid = blockDim.x*blockIdx.x+threadIdx.x;
float value = 0; // out-of-range threads contribute the additive identity
if(tid < N) value = A[tid];
int warpid = threadIdx.x / warpsize;
int laneid = threadIdx.x % warpsize;
const int warpnum = blocksize / warpsize;
__shared__ float sum[warpnum]; // one partial sum per warp

// Warp tree reduction: after the loop, lane 0 holds this warp's sum.
for(int i = warpsize>>1;i>0;i>>=1){
value += __shfl_down_sync(0xffffffff,value,i);
}
if(laneid == 0){
sum[warpid] = value;
}
__syncthreads(); // all warp partials visible before warp 0 combines them
// The first warp reduces the per-warp partials (warpnum <= warpsize holds
// for blocksize 1024 / warpsize 32).
if(threadIdx.x < warpsize){
if(threadIdx.x < warpnum){
value = sum[threadIdx.x];
}
else value = 0;
for(int i = warpsize>>1;i>0;i>>=1){
value += __shfl_down_sync(0xffffffff,value,i);
}
if(threadIdx.x == 0){
atomicAdd(B,value); // blocks cannot synchronize with each other, so combine atomically
}
}
}

// input, output are device pointers
extern "C" void solve(const float* input, float* output, int N) {
    // NOTE(review): assumes *output is already zeroed — the kernel accumulates
    // into it with atomicAdd; confirm against the harness.
    int threadPerBlock = blocksize;
    int blockPerGrid = (N + blocksize - 1) / blocksize;  // ceil-div
    cuda_reduction<<<blockPerGrid, threadPerBlock>>>(input, output, N);
    // FIX: block until the kernel finishes, matching the other solve()
    // wrappers in this file — a kernel launch is asynchronous on the host.
    cudaDeviceSynchronize();
}

Softmax

$\sigma(x)_i = \frac{e^{x_i - max}}{\sum_{j=1}^{n} e^{x_j - max}}$

$-max$ 是为了避免指数爆炸,所以需要三步,求max,求指数之和,每个位置求值。

这里是个不太优美的代码,做两次warp的max和求和,求和之后还要广播给所有的线程,所以只能一个block来写。block之间的话好像没有atomicMax,只能CAS,太离谱了就。

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
#include <cuda_runtime.h>
#include <float.h>

#define BLOCKSIZE 1024
#define WARPSIZE 32

// Butterfly max across the warp: after the loop EVERY lane holds the
// warp-wide maximum (xor shuffle, so no broadcast is needed afterwards).
__device__ float warp_max(float val){
    for (int offset = WARPSIZE / 2; offset > 0; offset /= 2) {
        float other = __shfl_xor_sync(0xffffffff, val, offset);
        val = fmaxf(other, val);
    }
    return val;
}
// Butterfly sum across the warp: every lane ends up with the warp total.
__device__ float warp_sum(float val){
    for (int offset = WARPSIZE / 2; offset > 0; offset /= 2) {
        val += __shfl_xor_sync(0xffffffff, val, offset);
    }
    return val;
}
// Single-block softmax over input[0..N): output[i] = exp(x_i - max) / sum.
// Three passes (max, sum, normalize); requires blockDim.x == BLOCKSIZE so the
// per-warp shared slots match the warp count.
__global__ void softmax_kernel(const float* input, float* output, int N) {
    int tid = threadIdx.x;
    int laneId = threadIdx.x % 32;
    int warpId = threadIdx.x / 32;
    const int warpNum = BLOCKSIZE / WARPSIZE;
    __shared__ float tmp[warpNum];   // per-warp maxima; tmp[0] becomes the block max
    __shared__ float tsum[warpNum];  // per-warp sums; tsum[0] becomes 1/total

    // ---- Pass 1: block-wide max (subtracted for numerical stability) ----
    // FIX: seed with -FLT_MAX (float identity for max) instead of the double
    // literal -1e18.
    float mx = -FLT_MAX;
    for (int x = tid; x < N; x += blockDim.x) {
        mx = fmaxf(mx, input[x]);
    }
    mx = warp_max(mx);
    if (laneId == 0)
        tmp[warpId] = mx;

    __syncthreads();

    // Reduce the per-warp maxima (only warp 0's result is kept below).
    // FIX: pad lanes beyond warpNum with -FLT_MAX, not 0.0f — padding a max
    // reduction with 0 corrupts the result for all-negative inputs whenever
    // warpNum < 32.
    if (tid < warpNum) {
        mx = tmp[tid];
    } else {
        mx = -FLT_MAX;
    }
    mx = warp_max(mx);

    if (laneId == 0 && warpId == 0) {
        tmp[0] = mx;  // broadcast slot read by the whole block
    }
    // (Alternative broadcast-free pattern: have the first warpNum lanes of
    // EVERY warp reload the partials and re-reduce, as block_max-style helpers
    // do; then each warp recomputes the same answer and no tmp[0] is needed.)
    __syncthreads();

    mx = tmp[0];

    // ---- Pass 2: sum of exp(x - max) ----
    float sum = 0;
    for (int x = tid; x < N; x += blockDim.x) {
        sum += __expf(input[x] - mx);
    }
    sum = warp_sum(sum);
    if (laneId == 0)
        tsum[warpId] = sum;

    __syncthreads();

    if (tid < warpNum) {
        sum = tsum[tid];
    } else {
        sum = 0.0f;  // additive identity — correct padding for a sum
    }
    // FIX: 1.0f keeps the divide in single precision (1.0 forced a double op).
    float invs = 1.0f / warp_sum(sum);
    if (laneId == 0 && warpId == 0) {
        tsum[0] = invs;
    }
    __syncthreads();
    invs = tsum[0];

    // ---- Pass 3: write the normalized probabilities ----
    for (int x = tid; x < N; x += blockDim.x) {
        output[x] = __expf(input[x] - mx) * invs;
    }
}

// input, output are device pointers (i.e. pointers to memory on the GPU)
extern "C" void solve(const float* input, float* output, int N) {
    // One block only: the max and the sum span the whole array, and blocks
    // cannot synchronize with each other mid-kernel.
    softmax_kernel<<<1, BLOCKSIZE>>>(input, output, N);
    cudaDeviceSynchronize();
}

Softmax Attention

三个步骤,Matmul,softmax和matmul

一个先需要注意的是合法性的判断,然后这里的代码block求和用的和之前一次广播不一样的另外的写法。补充一下理解就是一行必须要一起求和,所以只能一个block来做,1024/512最大的问题应该是一个SM执行的block数量。最后记得给QK free掉,记得回去学一下 cudaMalloc/cudaFree 这些,又有点忘了。

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
#include <cuda_runtime.h>
#include <float.h>
#define warpSize 32
#define softmax_blockSize 1024

// Scores: Ans[M,N] = Q * K^T / sqrt(d), one thread per output element.
// Q is M x d and K is N x d, so element (row, col) is the dot product of
// row `row` of Q with row `col` of K.
__global__ void kernel_matrixMultiplicationT(const float* Q, const float* K, float *Ans,int M,int N,int d){
    // FIX: 1.0/sqrtf(d) promoted the divide to double precision; keep the
    // whole scale factor in float.
    float inv_sqrtd = 1.0f / sqrtf((float)d);
    int col = blockDim.x*blockIdx.x + threadIdx.x;
    int row = blockDim.y*blockIdx.y + threadIdx.y;
    if (row < M && col < N) {
        float tmp = 0.0f;
        for (int i = 0; i < d; i++) {
            tmp += Q[row*d + i] * K[col*d + i];
        }
        Ans[row*N + col] = tmp * inv_sqrtd;
    }
}

// Butterfly max: every lane of the warp ends up with the warp-wide maximum.
__device__ float warp_max(float val){
    for (int i = warpSize >> 1; i > 0; i >>= 1) {
        // FIX: use fmaxf (float) instead of fmax — consistent with the rest of
        // the file and avoids the double-precision overload.
        val = fmaxf(val, __shfl_xor_sync(0xffffffff, val, i));
    }
    return val;
}
// Butterfly sum: every lane of the warp ends up with the warp total.
__device__ float warp_sum(float val){
    for (int offset = warpSize / 2; offset > 0; offset /= 2) {
        val += __shfl_xor_sync(0xffffffff, val, offset);
    }
    return val;
}
// Block-wide max. Every warp re-reduces the shared per-warp partials, so ALL
// threads return the block maximum — no extra broadcast step is needed.
// Assumes blockDim.x == softmax_blockSize (warpNum slots, warpNum <= 32).
__device__ float block_max(float max_val){
const int warpNum = softmax_blockSize /warpSize;
__shared__ float warpmax[warpNum]; // one partial max per warp
int tid = threadIdx.x;
int laneid = tid % 32;
int warpid = tid / 32;
max_val = warp_max(max_val);
if(laneid == 0)
warpmax[warpid] = max_val;
__syncthreads(); // all partials written before any warp reads them
if(laneid < warpNum)
max_val = warpmax[laneid];
else max_val = -FLT_MAX; // identity element for max
max_val = warp_max(max_val);
return max_val;
}

// Block-wide sum, same shape as block_max: every warp re-reduces the shared
// per-warp partials, so all threads return the block total.
// Assumes blockDim.x == softmax_blockSize.
__device__ float block_sum(float sum){
const int warpNum = softmax_blockSize /warpSize;
__shared__ float warpsum[warpNum]; // one partial sum per warp
int tid = threadIdx.x;
int laneid = tid % 32;
int warpid = tid / 32;
sum = warp_sum(sum);
if(laneid == 0)
warpsum[warpid] = sum;
__syncthreads(); // all partials written before any warp reads them
if(laneid < warpNum)
sum = warpsum[laneid];
else sum = 0.0f; // additive identity
sum = warp_sum(sum);
return sum;
}

// In-place row softmax over V (M x N): one block per row.
// Requires blockDim.x == softmax_blockSize (block_max/block_sum assume it).
__global__ void kernel_softmax(float *V,int M,int N){
    int row = blockIdx.x;
    int tid = threadIdx.x;

    // Pass 1: row maximum, for numerical stability.
    float max_val = -FLT_MAX;
    for (int i = tid; i < N; i += blockDim.x) {
        max_val = fmaxf(max_val, V[row*N + i]);
    }
    max_val = block_max(max_val);

    // Pass 2: sum of shifted exponentials.
    float sum = 0.0f;
    for (int i = tid; i < N; i += blockDim.x) {
        sum += expf(V[row*N + i] - max_val);
    }
    // FIX: 1.0f keeps the divide in single precision (1.0 forced a double op).
    float inv = 1.0f / block_sum(sum);

    // Pass 3: normalize the row in place.
    for (int i = tid; i < N; i += blockDim.x) {
        V[row*N + i] = expf(V[row*N + i] - max_val) * inv;
    }
}

// Plain matmul: Ans[M,d] = Q[M,N] * K[N,d], one thread per output element.
__global__ void kernel_matrixMultiplication(const float* Q, const float* K, float *Ans,int M,int N,int d){
    int c = blockDim.x * blockIdx.x + threadIdx.x;  // output column, in [0, d)
    int r = blockDim.y * blockIdx.y + threadIdx.y;  // output row, in [0, M)
    if (r >= M || c >= d) return;

    float acc = 0.0f;
    for (int i = 0; i < N; ++i) {
        acc += Q[r*N + i] * K[i*d + c];
    }
    Ans[r*d + c] = acc;
}

// Q, K, V, output are device pointers
// Softmax attention: output = softmax(Q K^T / sqrt(d)) V.
// Q is M x d, K and V are N x d, output is M x d. All pointers are on device.
extern "C" void solve(const float* Q, const float* K, const float* V, float* output, int M, int N,
int d) {
    // Scratch buffer for the M x N score matrix.
    float *QK = nullptr;
    cudaMalloc(&QK, sizeof(float) * M * N);

    // Step 1: scores QK = Q K^T / sqrt(d).
    {
        dim3 threadsPerBlock(16, 16);
        dim3 blocksPerGrid((N + threadsPerBlock.x - 1) / threadsPerBlock.x,
                           (M + threadsPerBlock.y - 1) / threadsPerBlock.y);
        kernel_matrixMultiplicationT<<<blocksPerGrid, threadsPerBlock>>>(Q, K, QK, M, N, d);
    }
    // Step 2: row-wise softmax over the M x N scores, one block per row.
    // FIX: launch with softmax_blockSize instead of a literal 1024 — the
    // kernel's block_max/block_sum helpers require exactly that block size.
    {
        kernel_softmax<<<M, softmax_blockSize>>>(QK, M, N);
    }
    // Step 3: output = softmax(QK) * V (an M x N x d matmul).
    {
        dim3 threadsPerBlock(16, 16);
        dim3 blocksPerGrid((d + threadsPerBlock.x - 1) / threadsPerBlock.x,
                           (M + threadsPerBlock.y - 1) / threadsPerBlock.y);
        kernel_matrixMultiplication<<<blocksPerGrid, threadsPerBlock>>>(QK, V, output, M, N, d);
    }
    // FIX: synchronize explicitly, matching the other solve() wrappers,
    // rather than relying on cudaFree's implicit blocking behaviour.
    cudaDeviceSynchronize();
    cudaFree(QK);
}

2D Convolution

这倒是没什么说的,暴力写法很简单,下一步优化思路一个是每个block先开个shared存input然后算

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
#include <cuda_runtime.h>

// "Valid" 2-D cross-correlation: the output is
// (input_rows - kernel_rows + 1) x (input_cols - kernel_cols + 1),
// with one thread per output pixel.
__global__ void kernel_convolution(const float* input, const float* kernel, float* output, int input_rows,
int input_cols, int kernel_rows, int kernel_cols){
    int out_rows = input_rows - kernel_rows + 1;
    int out_cols = input_cols - kernel_cols + 1;
    int x = blockDim.x * blockIdx.x + threadIdx.x;  // output column
    int y = blockDim.y * blockIdx.y + threadIdx.y;  // output row
    if (y >= out_rows || x >= out_cols) return;

    float acc = 0.0f;
    for (int i = 0; i < kernel_rows; ++i) {
        for (int j = 0; j < kernel_cols; ++j) {
            acc += input[(y + i) * input_cols + (x + j)] * kernel[i * kernel_cols + j];
        }
    }
    output[y * out_cols + x] = acc;
}

// input, kernel, output are device pointers
extern "C" void solve(const float* input, const float* kernel, float* output, int input_rows,
int input_cols, int kernel_rows, int kernel_cols) {
    const int T = 16;  // tile side
    dim3 block(T, T);
    // Ceil-divide the valid-output extent (in - k + 1) by the tile side:
    // (in - k + 1 + T - 1) / T == (in - k + T) / T.
    dim3 grid((input_cols - kernel_cols + T) / T,
              (input_rows - kernel_rows + T) / T);
    kernel_convolution<<<grid, block>>>(input, kernel, output, input_rows, input_cols, kernel_rows, kernel_cols);
}

Top K Selection

一个做法是每个线程暴力维护自己的最大,然后再去合并每个线程的答案

1
2
3
4
5
6
7
8
9
10
11
12
for (int offset = blockDim.x / 2; offset > 0; offset >>= 1) {
if (tid < offset) {
for (int j = 0; j < k; j++) {
insert_value(local, shared[(tid + offset) * k + j], k);
}

for (int j = 0; j < k; j++) {
shared[tid * k + j] = local[j];
}
}
__syncthreads();
}

这种写法

另外一个就是 Bitonic Sort。

具体来讲枚举size不断*2,把连续size个元素交替的正逆序排好序。

每次size个的合并是先把i和i+size/2比较,较大的放一侧,较小放一侧,然后递归更小的继续做。

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
#include <cuda_runtime.h>
#include <float.h>
// One compare-exchange step of a bitonic sorting network over d[0..M)
// (M must be a power of two). Called once per (size, stride) pair by the
// host; after the final pass (size == M) d is sorted in DESCENDING order.
__global__ void bitonic_sort(float *d,int stride,int size,int M){
int idx=blockDim.x*blockIdx.x+threadIdx.x;
int nxt=idx^stride; // partner index for this step
if(nxt<idx||nxt>=M||idx>=M)return; // only the lower index of each pair performs the exchange
int is_desc=((idx&size) == 0); // blocks of `size` alternate direction by the size bit
if((is_desc&&d[idx]<d[nxt])||(!is_desc&&d[idx]>d[nxt])){
float tmp = d[idx];
d[idx]=d[nxt];
d[nxt]=tmp;
}
}

// Fill the padded tail input[N..M) with -FLT_MAX so the padding can never
// displace a real value from the top of the descending sort.
__global__ void init_padding(float *input,int N,int M){
    int i = N + blockDim.x * blockIdx.x + threadIdx.x;
    if (i < M) {
        input[i] = -FLT_MAX;
    }
}

// input, output are device pointers
extern "C" void solve(const float* input, float* output, int N, int k) {
    // Bitonic sort needs a power-of-two length: round N up to M.
    int M = 1;
    while (M < N) M <<= 1;

    float *d = nullptr;
    cudaMalloc(&d, sizeof(float) * M);
    cudaMemcpy(d, input, sizeof(float) * N, cudaMemcpyDeviceToDevice);

    const int block = 256;
    if (M > N) {
        // Pad the tail with -FLT_MAX sentinels.
        int padGrid = (M - N + block - 1) / block;
        init_padding<<<padGrid, block>>>(d, N, M);
    }

    // Run the full network; the result is descending, so the k largest
    // values end up at the front.
    const int grid = (M + block - 1) / block;
    for (int size = 2; size <= M; size <<= 1) {
        for (int stride = size >> 1; stride > 0; stride >>= 1) {
            bitonic_sort<<<grid, block>>>(d, stride, size, M);
        }
    }

    cudaMemcpy(output, d, sizeof(float) * k, cudaMemcpyDeviceToDevice);
    cudaFree(d);
}