矩阵加减

Device 单个block例子

每个线程块block中,线程数量有上限(eg.512/1024)

// kernel definition
__global__ void MatAdd(float A[N][N], float B[N][N], float C[N][N])
{
	int i = threadIdx.x;
	int j = threadIdx.y;
	C[i][j] = A[i][j] + B[i][j] // 线程并行计算
}

int main()
{
	...
	// Kernel invocation with one block of N * N * 1 threads.
	// Note: N * N must not exceed the per-block thread limit (e.g. 1024).
	int numBlocks = 1;
	dim3 threadsPerBlock(N, N); // 2-D block
	MatAdd<<<numBlocks, threadsPerBlock>>>(A, B, C);
	return 0; // added for consistency with the multi-block example below
}

Device 多个block例子

块索引:blockIdx、维度:blockDim

// kernel definition
__global__ void MatAdd(float A[N][N], float B[N][N], float C[N][N])
{
	int i = blockIdx.x * blockDim.x + threadIdx.x;
	int j = blockIdx.y * blockDim.y + threadIdx.y;
	if(i < N && j < N)
		C[i][j] = A[i][j] + B[i][j] // 线程并行计算
}

int main()
{
	...
	// Kernel invocation. Fixed: use ceil-division so the grid still covers
	// every element when N is not a multiple of the block size (the kernel
	// bounds-checks, so the extra threads are harmless).
	dim3 threadsPerBlock(16, 16); // 2-D block
	dim3 numBlocks((N + threadsPerBlock.x - 1) / threadsPerBlock.x,
	               (N + threadsPerBlock.y - 1) / threadsPerBlock.y);
	MatAdd<<<numBlocks, threadsPerBlock>>>(A, B, C);
	return 0;
}

cudaMalloc()/cudaFree()

float *Md;
// Fixed: use size_t for byte counts — `int` overflows for large Width.
size_t size = Width * Width * sizeof(float);

cudaMalloc((void **)&Md, size); // allocate device (global) memory; return value should be checked
cudaFree(Md);                   // release the allocation when done

Matrix Multiply矩阵相乘

Host

// Host reference implementation: P = M * N for square width x width
// matrices stored row-major in flat arrays.
void MatrixMultiOnHost(float * M, float * N, float * P, int width)
{
	for (int row = 0; row < width; row++)
	{
		for (int col = 0; col < width; col++)
		{
			// Dot product of row `row` of M with column `col` of N.
			float acc = 0;
			for (int k = 0; k < width; k++)
				acc += M[row * width + k] * N[k * width + col];
			P[row * width + col] = acc;
		}
	}
}

Device 单个block例子

单个block的例子,可以简单的理解为Pd矩阵行列索引 (i, j) 被线程 (ty, tx)代替了,也就是在CPU上的i,j两个循环被删除了

// Kernel: Pd = Md * Nd using a single 2-D block, one thread per output
// element. Width x Width must fit within the per-block thread limit.
__global__ void MatrixMultiKernel(float* Md, float* Nd, float* Pd, int Width)
{
	// 2-D block: the 2-D thread id selects one element of Pd.
	int tx = threadIdx.x;
	int ty = threadIdx.y;
	// Pvalue accumulates the dot product for Pd[ty][tx].
	float Pvalue = 0;
	for(int k = 0; k < Width; k++)
	{
		// Fixed: Md/Nd are raw float pointers with no `.width` member;
		// the row pitch is the Width parameter (row-major layout).
		float Mdelement = Md[ty * Width + k];
		float Ndelement = Nd[k * Width + tx];
		Pvalue += Mdelement * Ndelement;
	}
	Pd[ty * Width + tx] = Pvalue;
}

上面的例子简单粗暴,但是也带来一些问题,如下:

矩阵的规模受到限制,不可能无限大,因为每个block的线程数量有限
eg. G80、GT200最多512个线程/block

并且存在过多的global memory的读写访问,会大大拉低运行效率

Device 多个block例子

去除长度的限制,将Pd矩阵拆成很多个小块,即tile块;把一个tile布置到一个block中;通过threadIdx和blockIdx索引

// Kernel: Pd = Md * Nd with a 2-D grid of 2-D blocks (one tile per block),
// one thread per output element. Removes the single-block size limit.
__global__ void MatrixMultiKernel(float* Md, float* Nd, float* Pd, int Width)
{
	int Col = blockIdx.x * blockDim.x + threadIdx.x;
	int Row = blockIdx.y * blockDim.y + threadIdx.y;
	
	// Bounds check, consistent with the multi-block MatAdd example:
	// the grid may launch more threads than Width x Width elements.
	if (Row < Width && Col < Width)
	{
		float Pvalue = 0;
		for(int k = 0; k < Width; k++)
		{
			// Fixed: Md/Nd are raw float pointers with no `.width` member;
			// the row pitch is the Width parameter (row-major layout).
			float Mdelement = Md[Row * Width + k];
			float Ndelement = Nd[k * Width + Col];
			Pvalue += Mdelement * Ndelement;
		}
		Pd[Row * Width + Col] = Pvalue;
	}
}

int main()
{
    // TILE_WIDTH is the edge length of each tile (and of each block).
	...
	// Ceil-div so the grid covers the whole matrix even when the sizes
	// are not multiples of TILE_WIDTH (the kernel bounds-checks).
	dim3 dimGrid((Width + TILE_WIDTH - 1) / TILE_WIDTH,
	             (Height + TILE_WIDTH - 1) / TILE_WIDTH); // 2-D grid of tiles
	dim3 dimBlock(TILE_WIDTH, TILE_WIDTH);
	// Fixed: the kernel's last argument is the matrix width, not TILE_WIDTH.
	MatrixMultiKernel<<<dimGrid, dimBlock>>>(Md, Nd, Pd, Width);
	return 0;
}

上面的例子解决了单个block线程数量的限制问题,但是核函数的效率依然受限于global memory的带宽

在每个块内部使用shared memory来减少global memory带宽的需求;
即将kernel拆分为多个阶段:每个阶段使用Md和Nd的子集累加Pd,这样每个阶段都具有很好的数据局部性;

// Tiled matrix multiply: each block computes one TILE_WIDTH x TILE_WIDTH
// tile of Pd, staging tiles of Md and Nd through shared memory to cut
// global-memory traffic. Assumes Width is a multiple of TILE_WIDTH and
// blockDim == (TILE_WIDTH, TILE_WIDTH).
__global__ void MatrixMultiKernel(float* Md, float* Nd, float* Pd, int Width)
{
	__shared__ float Mds[TILE_WIDTH][TILE_WIDTH];
	__shared__ float Nds[TILE_WIDTH][TILE_WIDTH];

	int bx = blockIdx.x;
	int by = blockIdx.y;
	int tx = threadIdx.x;
	int ty = threadIdx.y;
	
	int Col = bx * TILE_WIDTH + tx;
	int Row = by * TILE_WIDTH + ty;
	
	float Pvalue = 0;
	// Phase m loads the m-th tile pair and accumulates its partial product.
	for(int m = 0; m < Width / TILE_WIDTH; m++)
	{
		// Each thread loads one element of the Md tile and one of the Nd tile.
		Mds[ty][tx] = Md[Row * Width + (m * TILE_WIDTH + tx)];
		// Fixed: the Nd row index must be scaled by Width (row-major layout).
		Nds[ty][tx] = Nd[(m * TILE_WIDTH + ty) * Width + Col];
		__syncthreads(); // wait until the whole tile is loaded before reading it
		
		for(int k = 0; k < TILE_WIDTH; k++)
			Pvalue += Mds[ty][k] * Nds[k][tx];
		// Fixed: the barrier is its own statement after the inner loop, so no
		// thread overwrites the tiles while others are still reading them.
		__syncthreads();
	} 
	Pd[Row * Width + Col] = Pvalue;
}

Logo

欢迎来到由智源人工智能研究院发起的Triton中文社区,这里是一个汇聚了AI开发者、数据科学家、机器学习爱好者以及业界专家的活力平台。我们致力于成为业内领先的Triton技术交流与应用分享的殿堂,为推动人工智能技术的普及与深化应用贡献力量。

更多推荐