record------CUDA GPU并行编程理解并实现centernet batch后处理
CUDA GPU并行编程理解并实现centernet batch后处理CUDA GPU并行编程前言一维CUDA核函数下标索引获取方法二维CUDA核函数下标索引获取方法CUDA GPU并行编程前言理解基本单元:线程(Thread)、线程块(Block)、网格(Grid),线程块数(BlockDim)、网格数(GridDim)其中网格(Grid)是由多个线程块(Block)组成,线程块(Block)是
CUDA GPU并行编程理解并实现centernet batch后处理
CUDA GPU并行编程
前言
理解基本单元:线程(Thread)、线程块(Block)、网格(Grid),线程块数(BlockDim)、网格数(GridDim)
其中网格(Grid)是由多个线程块(Block)组成,线程块(Block)是由多个线程(Thread)组成。
BlockDim:一个Block块中含有的最大线程(Thread)数,和硬件有关系,是一个dim3类型
GridDim:一个Grid块中含有的最大Block块数目,和硬件有关系,是一个dim3类型
ThreadIdx:线程的索引号,是相对当前所在的一个block中的位置,是一个uint3类型
BlockIdx:线程块的索引号,是相对当前所在的一个grid中的位置,是一个uint3类型
GridIdx:没有这个东西,会报错,就是只能设置最大一个网格 的核,在1080ti上
Dim3 数据类型是CUDA自定义的一种数据结构,一般为三维的无符号整型,其定义及引用方式如下:
Dim3 BlockSize(16, 16, 1)
// BlockSize.x = 16; BlockSize.y = 16; BlockSize.z = 1;
// 也可以写为 Dim3 BlockSize(16, 16),最后一维默认为1;
以下介绍如何在CUDA中使用一维、二维核函数,并索引其下标。
一维CUDA核函数下标索引获取方法
threadIdx.x : 代表在当前block内 x方向上的线程序号;
blockIdx.x : 代表在当前grid内 x方向上的线程块序号;
blockDim.x : 代表一个block中x方向上最大的线程数;
gridDim.x : 代表一个grid中x方向最大的block数;
所以要索引第三个Block(序号2)中第4个线程(序号3)的下标index,其过程如图所示:
二维CUDA核函数下标索引获取方法
threadIdx.y : 代表在当前block内 y方向上的线程序号;
blockIdx.y : 代表在当前grid内 y方向上的线程块序号;
blockDim.y : 代表一个block中 y方向上最大的线程数;
gridDim.y : 代表一个grid中 y方向最大的block数;
图示理解:
设置线程维数
在.cu文件中调用自定义的内核函数时,需要使用三角括号语法<<<paramA,paramB,*args >>>指定CUDA内核启动,其中参数paramA, paramB 必须要给定,参数详解:
paramA: 设置gridDim,即一个grid中每个维度block的数量,使用 Dim3 定义
paramA: 设置blockDim,即一个block中每个维度thread的数量,使用 Dim3 定义
centernet cuda后处理
膜拜大佬代码(码云:https://gitee.com/zjkclpch/TensorRT-CenterNet?_from=gitee_search) (github:暂时略,两会期间被禁),不才将大佬的单张后处理 修改为 batch处理。
ctdetNet.cpp文件
修改函数: void ctdetNet::InitEngine() 为:
void ctdetNet::InitEngine() {
const int maxBatchSize = 1; //需要修改为自己使用的batch size
mContext = mEngine->createExecutionContext();
assert(mContext != nullptr);
mContext->setProfiler(&mProfiler);
int nbBindings = mEngine->getNbBindings();//nbBindings=4
if (nbBindings > 4) forwardFace= true;
mCudaBuffers.resize(nbBindings); //0:输入 1-3:输出
mBindBufferSizes.resize(nbBindings);
// cudaOutputBuffer.resize(maxBatchSize);
int64_t totalSize = 0;
for (int i = 0; i < nbBindings; ++i)
{
nvinfer1::Dims dims = mEngine->getBindingDimensions(i); //pth->onnx时设置n>1,此处的 dims.d[0]=n
nvinfer1::DataType dtype = mEngine->getBindingDataType(i);
dims_type.push_back({dims,dtype});
totalSize = volume(dims) * maxBatchSize * getElementSize(dtype);
mBindBufferSizes[i] = totalSize/maxBatchSize; //必须进行出掉batch
mCudaBuffers[i] = safeCudaMalloc(totalSize/maxBatchSize);
}
// outputBufferSize = mBindBufferSizes[1] * 6/maxBatchSize ; //heatmap上每个热点对应6个值:x,y,w,h,reg_x,reg_y
outputBufferSize = mBindBufferSizes[1] * 6; //heatmap上每个热点对应6个值:x,y,w,h,reg_x,reg_y
// for(int i=0;i<maxBatchSize;i++)
// {
// cudaOutputBuffer[i]=safeCudaMalloc(outputBufferSize);//每张图片的输出地址
// }
cudaOutputBuffer=safeCudaMalloc(outputBufferSize);//每张图片的输出地址
CUDA_CHECK(cudaStreamCreate(&mCudaStream));
}
增加 void ctdetNet::doInference_batch_one_step(const void *inputData,float * outputData_b):
void ctdetNet::doInference_batch_one_step(const void *inputData,float * outputData_b)
{
int inputIndex = 0 ;
int batch_size=1;
struct timeval start0, end0;
gettimeofday(&start0,NULL);
CUDA_CHECK(cudaMemcpyAsync(mCudaBuffers[inputIndex], inputData, mBindBufferSizes[inputIndex],
cudaMemcpyHostToDevice, mCudaStream));
cudaDeviceSynchronize();
gettimeofday(&end0,NULL);
int timeuse0 = static_cast<int>(1000000 * (end0.tv_sec - start0.tv_sec ) + end0.tv_usec - start0.tv_usec);
float time0 = float(timeuse0)/1000;
std::cout << "batch cpDevice is: " << time0 << std::endl;
struct timeval start, end;
gettimeofday(&start,NULL);
mContext->execute(batch_size, &mCudaBuffers[inputIndex]); //出现问题,不支持批处理
cudaDeviceSynchronize();
gettimeofday(&end,NULL);
int timeuse = static_cast<int>(1000000 * (end.tv_sec - start.tv_sec ) + end.tv_usec - start.tv_usec);
float time = float(timeuse)/1000;
std::cout << "batch forward is: " << time << std::endl;
// *time_consum = *time_consum + time;
CUDA_CHECK(cudaMemset(cudaOutputBuffer, 0, volume(dims_type[1].first)*6*sizeof(float)));
struct timeval start1, end1;
gettimeofday(&start1,NULL);
// PrintResult(mCudaStream[0])
int step=int(volume(dims_type[1].first)/batch_size*6); //float型的地址偏移
if (forwardFace){
CTfaceforward_gpu(static_cast<const float *>(mCudaBuffers[1]),static_cast<const float *>(mCudaBuffers[2]),
static_cast<const float *>(mCudaBuffers[3]),static_cast<const float *>(mCudaBuffers[4]),static_cast<float *>(cudaOutputBuffer),
input_w/4,input_h/4,classNum,kernelSize,visThresh);
} else{
CTdetforward_gpu(static_cast<const float *>(mCudaBuffers[1]),static_cast<const float *>(mCudaBuffers[2]),
static_cast<const float *>(mCudaBuffers[3]),static_cast<float *>(cudaOutputBuffer),
input_w/4,input_h/4,classNum,kernelSize,visThresh,step);
// CTdetforward_gpu(static_cast<const float *>(mCudaBuffers[1]),static_cast<const float *>(mCudaBuffers[3]),
// static_cast<const float *>(mCudaBuffers[2]),static_cast<float *>(cudaOutputBuffer),
// input_w/4,input_h/4,classNum,kernelSize,visThresh,step);
}
cudaDeviceSynchronize();
gettimeofday(&end1,NULL);
int timeuse1 = static_cast<int>(1000000 * (end1.tv_sec - start1.tv_sec ) + end1.tv_usec - start1.tv_usec);
float time1 = float(timeuse1)/1000;
std::cout << " batch post is: " << time1 << std::endl;
struct timeval start2, end2;
gettimeofday(&start2,NULL);
CUDA_CHECK(cudaMemcpy((void*)(outputData_b), cudaOutputBuffer,outputBufferSize, cudaMemcpyDeviceToHost));
cudaDeviceSynchronize();
gettimeofday(&end2,NULL);
int timeuse2 = static_cast<int>(1000000 * (end2.tv_sec - start2.tv_sec ) + end2.tv_usec - start2.tv_usec);
float time2 = float(timeuse2)/1000;
std::cout << "batch cpHost is: " << time2 << std::endl;
}
ctdetLayer.cu文件
修改内容如下:
#include "ctdetLayer.h"
#include "utils.h"
#include "stdio.h"
dim3 cudaGridSize(uint n)
{
uint k = (n - 1) /BLOCK + 1;
uint x = k ;
uint y = 1 ;
if (x > 65535 )
{
x = ceil(sqrt(x));
y = (n - 1 )/(x*BLOCK) + 1;
}
dim3 d = {x,y,1} ;
return d;
}
__device__ float Logist(float data){ return 1./(1. + exp(-data));}
__global__ void PrintResultGPU(float *buffer, size_t unit_length){
printf("buffer values: %f, %f\n",*buffer, *(buffer+unit_length));
printf("buffer values: %f, %f\n",*(buffer+1000000), *(buffer+unit_length+1000000));
printf("buffer values: %f, %f\n",*(buffer+1000000-1), *(buffer+unit_length+1000000-1));
printf("buffer values: %f, %f\n",*(buffer+1000000), *(buffer+unit_length+1000000));
printf("buffer values: %f, %f\n",*(buffer+1000000-1), *(buffer+unit_length+1000000-1));
}
__global__ void CTdetforward_kernel(const float *hm, const float *reg,const float *wh , float* output,
const int w,const int h,const int classes,const int kernel_size,const float visthresh,int step)
{
int idx = (blockIdx.x + blockIdx.y * gridDim.x) * blockDim.x + threadIdx.x; //gridIdx.x
int imageID= idx/(w*h*classes);//(w*h*c);
//printf("begin to run cuda kernel, currently at %d\n ", imageID);
//std::cout<<imageID<<std::endl;
// if(idx == 0){
// printf("input values: %f, %f, %f\n", hm[0], reg[0], wh[0]);
// }
if (idx >= 1*w * h * classes) return;
// if(idx == 0)
//
int padding = (kernel_size - 1) / 2;
int offset = -padding;
int stride = w * h;
int grid_x = (idx%(w*h*classes)) % w;
int grid_y = ((idx%(w*h*classes)) / w) % h;
int cls = (idx/w/h)%classes;
int l, m;
// int reg_index = idx - cls*stride-imageID*classes*stride; //修改,原有int reg_index = idx - cls*stride
int reg_index = idx - cls*stride-imageID*classes*stride+imageID*stride; //修改,原有int reg_index = idx - cls*stride
float c_x, c_y;
float objProb = Logist(hm[idx]);
if (objProb > visthresh) {
float max = -1;
int max_index = 0;
for (l = 0; l < kernel_size; ++l)
for (m = 0; m < kernel_size; ++m) {
int cur_x = offset + l + grid_x;
int cur_y = offset + m + grid_y;
int cur_index = cur_y * w + cur_x + stride * cls + imageID*classes*stride;
int valid = (cur_x >= 0 && cur_x < w && cur_y >= 0 && cur_y < h);
float val = (valid != 0) ? Logist(hm[cur_index]) : -1;
max_index = (val > max) ? cur_index : max_index;
max = (val > max) ? val : max;
}
if(idx == max_index){
//printf("begin to run cuda kernel, currently at %d\n ", imageID);
int resCount = (int) atomicAdd(output+imageID*step, 1);
// printf("cur rescount %d %f\n ", imageID, *(output+imageID*step));
// printf("resCount is %d",resCount);
char *data = (char *) (output+imageID*step) + sizeof(float) + resCount * sizeof(Detection);//+imageID*step*sizeof(float);
// int resCount = (int) atomicAdd(output, 1);
// printf("resCount is %d",resCount);
// char *data = (char *) output + sizeof(float) + resCount * sizeof(Detection);
Detection *det = (Detection *) (data);
c_x = grid_x + reg[reg_index];
c_y = grid_y + reg[reg_index + stride];
det->bbox.x1 = (c_x - wh[reg_index] / 2) * 4;
det->bbox.y1 = (c_y - wh[reg_index + stride] / 2) * 4;
det->bbox.x2 = (c_x + wh[reg_index] / 2) * 4;
det->bbox.y2 = (c_y + wh[reg_index + stride] / 2) * 4;
det->classId = cls;
det->prob = objProb;
}
}
// if(idx == w * h * classes -1){
// char *test = (char *)output + sizeof(float);
// printf("first float: %f\n", output[0]);
// Detection *det_t = (Detection *)test;
// printf("det_t prob: %f, %d, %f, %f, %f, %f\n", det_t->prob, det_t->classId, det_t->bbox.x1, det_t->bbox.y1, det_t->bbox.x2, det_t->bbox.y2);
// }
}
__global__ void CTfaceforward_kernel(const float *hm, const float *wh,const float *reg,const float* landmarks,
float *output,const int w,const int h,const int classes,const int kernel_size,const float visthresh ) {
int idx = (blockIdx.x + blockIdx.y * gridDim.x) * blockDim.x + threadIdx.x;
if (idx >= w*h*classes) return;
int padding = (kernel_size-1)/2;
int offset = - padding;
int stride = w * h;
int grid_x = idx % w;
int grid_y = (idx / w) % h;
int cls = idx/w/h ;
int reg_index = idx - cls*stride;
int l,m,mark_id;
float c_x,c_y,scale_w,scale_h;
float objProb = hm[idx];
float max=-1;
int max_index =0;
if(objProb > visthresh){
for(l=0 ;l < kernel_size ; ++l)
for(m=0 ; m < kernel_size ; ++m){
int cur_x = offset + l + grid_x;
int cur_y = offset + m + grid_y;
int cur_index = cur_y * w + cur_x + stride*cls;
int valid = (cur_x>=0 && cur_x < w && cur_y >=0 && cur_y <h );
float val = (valid !=0 ) ? hm[cur_index]: -1;
max_index = (val > max) ? cur_index : max_index;
max = (val > max ) ? val: max ;
}
if(idx == max_index){
int resCount = (int)atomicAdd(output,1);
//printf("%d",resCount);
char* data = (char * )output + sizeof(float) + resCount*sizeof(Detection);
Detection* det = (Detection*)(data);
c_x = (grid_x + reg[reg_index+stride] + 0.5)*4 ; c_y = (grid_y + reg[reg_index] + 0.5) * 4;
scale_w = expf(wh[reg_index+stride]) * 4 ; scale_h = expf(wh[reg_index]) * 4;
det->bbox.x1 = c_x - scale_w/2;
det->bbox.y1 = c_y - scale_h/2 ;
det->bbox.x2 = c_x + scale_w/2;
det->bbox.y2 = c_y + scale_h/2;
det->prob = objProb;
det->classId = cls;
for(mark_id=0 ; mark_id < 5 ; ++ mark_id){
det->marks[mark_id].x = det->bbox.x1 + landmarks[reg_index + (2*mark_id+1)*stride]*scale_w;
det->marks[mark_id].y = det->bbox.y1 + landmarks[reg_index + (2*mark_id)*stride]*scale_h;
}
}
}
}
void CTdetforward_gpu(const float *hm, const float *reg,const float *wh ,float * output,
const int w,const int h,const int classes,const int kernerl_size, const float visthresh,const int step){
uint num = 1*w * h * classes;
CTdetforward_kernel<<<cudaGridSize(num),BLOCK>>>(hm,reg,wh,output,w,h,classes,kernerl_size,visthresh,step);
if(cudaGetLastError())printf("cuda kernel error!\n");
// CTdetforward_kernel<<<cudaGridSize(num),BLOCK>>>(hm,reg,wh,output,w,h,classes,kernerl_size,visthresh);
}
void PrintResult(float *buffer, size_t unit_length){
PrintResultGPU<<<1, 1>>>(buffer, unit_length);
cudaDeviceSynchronize();
}
//void CTdetforward_gpu(const float *hm, const float *reg,const float *wh ,float *output,
// const int w,const int h,const int classes,const int kernerl_size, const float visthresh){
// uint num = w * h * classes;
// CTdetforward_kernel<<<cudaGridSize(num),BLOCK>>>(hm,reg,wh,output,w,h,classes,kernerl_size,visthresh);
//}
void CTfaceforward_gpu(const float *hm, const float *wh,const float *reg,const float* landmarks,float *output,
const int w,const int h,const int classes,const int kernerl_size, const float visthresh ){
uint num = w * h * classes;
CTfaceforward_kernel<<<cudaGridSize(num),BLOCK>>>(hm,wh,reg,landmarks,output,w,h,classes,kernerl_size,visthresh);
}
ctdetLayer.h文件
修改内容如下:
//
// Created by cao on 19-10-25.
//
#ifndef CTDET_TRT_CTDETLAYER_H
#define CTDET_TRT_CTDETLAYER_H
//void CTdetforward_gpu(const float *hm, const float *reg,const float *wh ,float *output,
// const int w,const int h,const int classes,const int kernerl_size,const float visthresh,cudaStream_t stream);
void PrintResult(float *buffer, size_t unit_length);
void CTdetforward_gpu(const float *hm, const float *reg,const float *wh , float * output,
const int w,const int h,const int classes,const int kernerl_size,
const float visthresh,const int step);
void CTfaceforward_gpu(const float *hm, const float *wh,const float *reg,const float* landmarks,float *output,
const int w,const int h,const int classes,const int kernerl_size, const float visthresh );
#endif //CTDET_TRT_CTDETLAYER_H

欢迎来到由智源人工智能研究院发起的Triton中文社区,这里是一个汇聚了AI开发者、数据科学家、机器学习爱好者以及业界专家的活力平台。我们致力于成为业内领先的Triton技术交流与应用分享的殿堂,为推动人工智能技术的普及与深化应用贡献力量。
更多推荐
所有评论(0)