// Element-wise vector addition: C[i] = A[i] + B[i].
// Launch layout: 1-D grid of 1-D blocks, one thread per element.
__global__ void vector_add(const float* A, const float* B, float* C, int N) {
    int t = blockDim.x * blockIdx.x + threadIdx.x;
    if (t < N)  // guard: the grid may overshoot N at the tail
        C[t] = A[t] + B[t];
}

// A, B, C are device pointers (i.e. pointers to memory on the GPU).
// Host wrapper: launches vector_add over N elements and blocks until done.
// (Fix: the original snippet was truncated after computing blocksPerGrid —
// the launch, synchronization, and closing brace are restored here,
// consistent with the other solve() wrappers in this file.)
extern "C" void solve(const float* A, const float* B, float* C, int N) {
    int threadsPerBlock = 256;
    // ceil-div so a partial final block still covers the tail elements
    int blocksPerGrid = (N + threadsPerBlock - 1) / threadsPerBlock;
    vector_add<<<blocksPerGrid, threadsPerBlock>>>(A, B, C, N);
    cudaDeviceSynchronize();  // surface async kernel errors / ensure completion
}
#include <cuda_runtime.h>

// Naive GEMM: C = A * B with A (M x N), B (N x K), C (M x K), all row-major
// device pointers. One thread computes one element of C.
// Launch layout: 2-D blocks where threadIdx.x spans the K (column) dimension
// of C and threadIdx.y spans the M (row) dimension.
__global__ void matrix_multiplication_kernel(const float* A, const float* B,
                                             float* C, int M, int N, int K) {
    int c = blockDim.x * blockIdx.x + threadIdx.x;  // column of C, in [0, K)
    int r = blockDim.y * blockIdx.y + threadIdx.y;  // row of C, in [0, M)
    if (r < M && c < K) {  // bounds guard: grid may overshoot the matrix
        float acc = 0.0f;
        for (int i = 0; i < N; ++i)
            acc += A[r * N + i] * B[i * K + c];
        C[r * K + c] = acc;
    }
}

// A, B, C are device pointers (i.e. pointers to memory on the GPU).
// Host wrapper: one 16x16 block tile per 16x16 patch of C, then synchronize.
extern "C" void solve(const float* A, const float* B, float* C,
                      int M, int N, int K) {
    dim3 threadsPerBlock(16, 16);
    dim3 blocksPerGrid((K + threadsPerBlock.x - 1) / threadsPerBlock.x,
                       (M + threadsPerBlock.y - 1) / threadsPerBlock.y);
    matrix_multiplication_kernel<<<blocksPerGrid, threadsPerBlock>>>(A, B, C,
                                                                     M, N, K);
    cudaDeviceSynchronize();
}
// Tiled GEMM: C = A * B with A (M x N), B (N x K), C (M x K), row-major
// device pointers. Stages BL x BL tiles of A and B through shared memory.
// Preconditions: BL (defined elsewhere in this file) == blockDim.x == blockDim.y.
// NOTE: thread x maps to the column of C, thread y to the row.
__global__ void matrix_multiplication_kernel(const float* A, const float* B,
                                             float* C, int M, int N, int K) {
    int col = blockDim.x * blockIdx.x + threadIdx.x;  // column of C (x is col, y is row)
    int row = blockDim.y * blockIdx.y + threadIdx.y;  // row of C
    int pc = threadIdx.x;  // column within the tile
    int pr = threadIdx.y;  // row within the tile
    // number of tiles needed to sweep the shared N dimension (ceil-div)
    int num_tiles = (N + blockDim.x - 1) / blockDim.x;

    __shared__ float As[BL][BL];
    __shared__ float Bs[BL][BL];

    float sum = 0.0f;
    for (int i = 0; i < num_tiles; i++) {
        int colA = i * blockDim.x + threadIdx.x;  // column of A read this step
        int rowB = i * blockDim.y + threadIdx.y;  // row of B read this step

        // Stage one tile of A and one of B; zero-pad out-of-range elements so
        // the inner product below stays correct on edge tiles.
        As[pr][pc] = (row < M && colA < N) ? A[row * N + colA] : 0.0f;
        Bs[pr][pc] = (rowB < N && col < K) ? B[rowB * K + col] : 0.0f;
        __syncthreads();  // whole tile staged before any thread reads it

        for (int j = 0; j < BL; j++)
            sum += As[pr][j] * Bs[j][pc];
        __syncthreads();  // all reads done before the next iteration overwrites
    }

    // Write-back. (Fix: the original snippet was truncated before this
    // epilogue — the loop close, the bounds-checked store, and the kernel's
    // closing brace are restored here.)
    if (row < M && col < K)
        C[row * K + col] = sum;
}
// input, output are device pointers (i.e. pointers to memory on the GPU)
// Host wrapper: runs softmax_kernel over the N-element input on a single
// block of BLOCKSIZE threads (softmax_kernel and BLOCKSIZE are defined
// elsewhere in this file / outside this view), then blocks until it finishes.
// NOTE(review): a one-block launch caps parallelism at BLOCKSIZE threads —
// presumably the kernel strides over N internally; confirm against its body.
extern "C" void solve(const float* input, float* output, int N) {
    softmax_kernel<<<1, BLOCKSIZE>>>(input, output, N);
    cudaDeviceSynchronize();  // surface async kernel errors / ensure completion
}
// 2-D "valid" cross-correlation of input (input_rows x input_cols) with
// kernel (kernel_rows x kernel_cols), both row-major device pointers.
// Output extent: (input_rows - kernel_rows + 1) x (input_cols - kernel_cols + 1).
// Launch layout: 2-D grid, one thread per output element; x spans output
// columns, y spans output rows.
__global__ void kernel_convolution(const float* input, const float* kernel,
                                   float* output,
                                   int input_rows, int input_cols,
                                   int kernel_rows, int kernel_cols) {
    const int out_rows = input_rows - kernel_rows + 1;
    const int out_cols = input_cols - kernel_cols + 1;
    const int x = blockDim.x * blockIdx.x + threadIdx.x;  // output column
    const int y = blockDim.y * blockIdx.y + threadIdx.y;  // output row
    if (x >= out_cols || y >= out_rows)
        return;  // guard: grid may overshoot the output extent

    float acc = 0.0f;
    for (int i = 0; i < kernel_rows; i++) {
        const int in_row_base = (y + i) * input_cols;  // hoisted row offset
        for (int j = 0; j < kernel_cols; j++)
            acc += input[in_row_base + x + j] * kernel[i * kernel_cols + j];
    }
    output[y * out_cols + x] = acc;
}
// input, kernel, output are device pointers.
// Host wrapper: launches kernel_convolution with one thread per element of
// the (input_rows-kernel_rows+1) x (input_cols-kernel_cols+1) output, then
// synchronizes.
extern "C" void solve(const float* input, const float* kernel, float* output,
                      int input_rows, int input_cols,
                      int kernel_rows, int kernel_cols) {
    int out_rows = input_rows - kernel_rows + 1;
    int out_cols = input_cols - kernel_cols + 1;
    dim3 block(16, 16);
    // ceil-div (equivalent to the original (dim-kdim+16)/16) so partial
    // tiles at the right/bottom edges are still covered
    dim3 grid((out_cols + block.x - 1) / block.x,
              (out_rows + block.y - 1) / block.y);
    kernel_convolution<<<grid, block>>>(input, kernel, output,
                                        input_rows, input_cols,
                                        kernel_rows, kernel_cols);
    // Fix: every other solve() wrapper in this file synchronizes after the
    // launch, but this one returned immediately — the host could then read
    // `output` before the kernel finished.
    cudaDeviceSynchronize();
}
Top K Selection
一个做法是每个线程暴力维护自己的前 k 大候选(每个线程一个 k 项的局部 top-k 列表),然后再合并各线程的结果
1 2 3 4 5 6 7 8 9 10 11 12
// Tree-reduction merge of per-thread top-k candidate lists (fragment: the
// enclosing kernel, insert_value, local, shared, and tid are defined
// outside this view, and the loop bodies are truncated here).
// Each halving step has the lower half of threads fold the partner thread's
// k candidates (shared[(tid + offset) * k ..]) into their own `local` list,
// so after log2(blockDim.x) steps thread 0 holds the block-wide top k.
// NOTE(review): each step presumably writes `local` back to `shared` and
// calls __syncthreads() before the next offset — that part is truncated;
// confirm in the full source.
for (int offset = blockDim.x / 2; offset > 0; offset >>= 1) { if (tid < offset) { for (int j = 0; j < k; j++) { insert_value(local, shared[(tid + offset) * k + j], k); }