cuda编程基础:矩阵加减、点乘
目录:矩阵加减(Device 单个block例子、Device 多个block例子)、Matrix Multiply 矩阵相乘(Host、Device 单个block例子、Device 多个block例子)
cuda编程基础:矩阵加减、点乘
矩阵加减
Device 单个block例子
每个线程块block中,线程数量有上限(eg.512/1024)
// Kernel: element-wise matrix addition C = A + B using a single 2D block.
// Launch with one block of N x N threads; thread (x, y) computes one element.
__global__ void MatAdd(float A[N][N], float B[N][N], float C[N][N])
{
    int i = threadIdx.x;
    int j = threadIdx.y;
    C[i][j] = A[i][j] + B[i][j]; // fixed: original was missing the ';' (compile error)
}
int main()
{
    ...
    // kernel invocation with one block of N * N * 1 threads
    // NOTE(review): N*N must not exceed the per-block thread limit
    // (e.g. 512 on G80/GT200, 1024 on newer GPUs) or the launch fails
    int numBlocks = 1;
    dim3 threadPerBlock(N, N); // 2D block
    MatAdd<<<numBlocks, threadPerBlock>>>(A, B, C);
}
Device 多个block例子
块索引:blockIdx、维度:blockDim
// Kernel: element-wise matrix addition C = A + B over a 2D grid of 2D blocks.
// Global element indices combine blockIdx * blockDim with threadIdx.
__global__ void MatAdd(float A[N][N], float B[N][N], float C[N][N])
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    int j = blockIdx.y * blockDim.y + threadIdx.y;
    // bounds guard: the grid may overshoot when N % blockDim != 0
    if (i < N && j < N)
        C[i][j] = A[i][j] + B[i][j]; // fixed: original was missing the ';' (compile error)
}
int main()
{
    ...
    // kernel invocation
    dim3 threadPerBlock(16, 16); // 2D block of 256 threads
    // fixed: ceil-div so the grid covers all N x N elements even when N is
    // not a multiple of the block size (original N / 16 truncated, silently
    // skipping the tail rows/columns; the kernel's bounds check absorbs the overshoot)
    dim3 numBlocks((N + threadPerBlock.x - 1) / threadPerBlock.x,
                   (N + threadPerBlock.y - 1) / threadPerBlock.y);
    MatAdd<<<numBlocks, threadPerBlock>>>(A, B, C);
    return 0;
}
cudaMalloc()/cudaFree()
// Device-memory lifecycle example: allocate, (use), free.
float *Md;                                // device pointer — never dereference on the host
int size = Width * Width * sizeof(float); // byte count for a Width x Width float matrix
cudaMalloc((void **)&Md, size);           // NOTE(review): return status unchecked — wrap in an error check in real code
cudaFree(Md);
Matrix Multiply矩阵相乘
Host
// Reference CPU implementation of square matrix multiply: P = M * N.
// All three matrices are stored row-major as flat width*width float arrays.
void MatrixMultiOnHost(float * M, float * N, float * P, int width)
{
    for (int row = 0; row < width; row++)
    {
        for (int col = 0; col < width; col++)
        {
            // dot product of row `row` of M with column `col` of N
            float acc = 0;
            for (int k = 0; k < width; k++)
                acc += M[row * width + k] * N[k * width + col];
            P[row * width + col] = acc;
        }
    }
}
Device 单个block例子
单个block的例子,可以简单的理解为Pd矩阵行列索引 (i, j) 被线程 (ty, tx)代替了,也就是在CPU上的i,j两个循环被删除了
// Kernel: Pd = Md * Nd for a single block; thread (tx, ty) computes Pd[ty][tx].
// Only valid when Width fits inside one block (Width*Width <= max threads/block).
__global__ void MatrixMultiKernel(float* Md, float* Nd, float* Pd, int Width)
{
    // 2D block: the 2D thread id maps directly to the output element
    int tx = threadIdx.x;
    int ty = threadIdx.y;
    // Pvalue accumulates the dot product for Pd[ty][tx]
    float Pvalue = 0;
    for (int k = 0; k < Width; k++)
    {
        // fixed: Md/Nd are raw float* and have no `.width` member
        // (original `Md.width` / `Nd.width` was a compile error) — use Width
        float Mdelement = Md[ty * Width + k];
        float Ndelement = Nd[k * Width + tx];
        Pvalue += Mdelement * Ndelement;
    }
    Pd[ty * Width + tx] = Pvalue;
}
上面的例子简单粗暴,但是也带来一些问题,如下:
矩阵的长度限制,不可能无限大,因为每个block的线程数量有限
eg. G80、GT200最多512个线程/block
并且存在过多的global memory的读写访问,会大大拉低运行效率
Device 多个block例子
去除长度的限制,将Pd矩阵拆成很多个小块,即tile块;把一个tile布置到一个block中;通过threadIdx和blockIdx索引
// Kernel: Pd = Md * Nd tiled across a 2D grid of 2D blocks;
// each thread computes one output element Pd[Row][Col].
__global__ void MatrixMultiKernel(float* Md, float* Nd, float* Pd, int Width)
{
    int Col = blockIdx.x * blockDim.x + threadIdx.x;
    int Row = blockIdx.y * blockDim.y + threadIdx.y;
    // bounds guard added: the grid may overshoot when Width % blockDim != 0
    if (Row < Width && Col < Width)
    {
        float Pvalue = 0;
        for (int k = 0; k < Width; k++)
        {
            // fixed: Md/Nd are raw float* with no `.width` member — use Width
            float Mdelement = Md[Row * Width + k];
            float Ndelement = Nd[k * Width + Col];
            Pvalue += Mdelement * Ndelement;
        }
        Pd[Row * Width + Col] = Pvalue;
    }
}
int main()
{
    // TILE_WIDTH is the side length of each tile / thread block
    ...
    // fixed: ceil-div covers Width not divisible by TILE_WIDTH; matrices are
    // square throughout this article, so both grid dimensions use Width
    // (original used an undefined `Height`)
    dim3 dimGrid((Width + TILE_WIDTH - 1) / TILE_WIDTH,
                 (Width + TILE_WIDTH - 1) / TILE_WIDTH);
    dim3 dimBlock(TILE_WIDTH, TILE_WIDTH);
    // fixed: the kernel's last parameter is the matrix width, not the tile
    // width — passing TILE_WIDTH computed the wrong product
    MatrixMultiKernel<<<dimGrid, dimBlock>>>(Md, Nd, Pd, Width);
    return 0;
}
上面的例子解决了矩阵长度的限制问题,但是核函数的效率依然受限于global memory的带宽
在每个块内部使用shared memory来减少global memory带宽的需求;
即将kernel拆分为多个阶段:每个阶段使用Md和Nd的子集累加Pd,这样每个阶段具有很好的数据局部性;
// Tiled matrix multiply using shared memory: each block computes one
// TILE_WIDTH x TILE_WIDTH tile of Pd, staging matching tiles of Md and Nd
// in shared memory to cut global-memory bandwidth demand.
// Assumes Width is a multiple of TILE_WIDTH and blockDim == (TILE_WIDTH, TILE_WIDTH).
__global__ void MatrixMultiKernel(float* Md, float* Nd, float* Pd, int Width)
{
    __shared__ float Mds[TILE_WIDTH][TILE_WIDTH];
    __shared__ float Nds[TILE_WIDTH][TILE_WIDTH];
    int bx = blockIdx.x;
    int by = blockIdx.y;
    int tx = threadIdx.x;
    int ty = threadIdx.y;
    int Col = bx * TILE_WIDTH + tx;
    int Row = by * TILE_WIDTH + ty;
    float Pvalue = 0;
    // march tile-by-tile along the shared (inner) dimension
    for (int m = 0; m < Width / TILE_WIDTH; m++)
    {
        Mds[ty][tx] = Md[Row * Width + (m * TILE_WIDTH + tx)];
        // fixed: the row index must be scaled by Width — the original
        // `Nd[(m * TILE_WIDTH + ty) + Col]` dropped the `* Width` and
        // loaded the wrong elements
        Nds[ty][tx] = Nd[(m * TILE_WIDTH + ty) * Width + Col];
        __syncthreads(); // tile fully loaded before any thread reads it
        for (int k = 0; k < TILE_WIDTH; k++)
            Pvalue += Mds[ty][k] * Nds[k][tx];
        __syncthreads(); // all reads done before the tile is overwritten
    }
    Pd[Row * Width + Col] = Pvalue;
}

欢迎来到由智源人工智能研究院发起的Triton中文社区,这里是一个汇聚了AI开发者、数据科学家、机器学习爱好者以及业界专家的活力平台。我们致力于成为业内领先的Triton技术交流与应用分享的殿堂,为推动人工智能技术的普及与深化应用贡献力量。
更多推荐
所有评论(0)