cuda实战-resize(图像缩放)最近邻算法实现
最近邻算法原理及对应的cuda实现
·
项目背景:
旨在通过一系列图像算法的cuda实现,弄清楚算法原理和对应的cuda程序编程代码。
最近邻算法原理:
核心思想是根据目标图像的像素坐标映射回原始图像,选择离映射坐标最近的一个像素作为结果。
假设源图像的高、宽分别为H、W,通道数为C;目标图像的高、宽分别为h、w,通道数为c(本文代码中 C = c = 3)。则目标图像在像素坐标(x,y)处的像素值P(x,y)的计算方式:
- 1)横坐标缩放比例scale_X = W / w,纵坐标缩放比例scale_Y = H / h
- 2)目标图像像素对应的原图像横坐标 x1 = round(x * scale_X),对应的原图像纵坐标 y1 = round(y * scale_Y)(注:下文代码实际使用 static_cast<int> 截断取整,而不是 round)
- 3)因此 P(x,y) = Ori(x1,y1);
对应的代码实现:
1、头文件
#include <cuda_runtime.h>
#include <iostream>
// 读取和写入图像
#define STB_IMAGE_IMPLEMENTATION
#include "stb_image.h"
#define STB_IMAGE_WRITE_IMPLEMENTATION
#include "stb_image_write.h"
using namespace std;
typedef unsigned char uchar;
2、CPU读取图像
/**
 * Load an image file from disk into a host buffer using stb_image.
 *
 * The decoder is asked for exactly 3 interleaved channels because every
 * consumer in this file indexes pixels with a stride of 3. The "Channels"
 * value printed below is the channel count stored in the file, which may
 * differ from the 3 channels actually returned.
 *
 * @param filename  Path of the image to decode.
 * @return Pointer to the decoded pixels (release with stbi_image_free), or
 *         nullptr when the path is null or the file cannot be decoded.
 */
unsigned char* read_image(const char* filename ){
    if (filename == nullptr) {
        std::cerr << "Error: Could not load image (null filename)" << std::endl;
        return nullptr;
    }

    int width = 0, height = 0, channels = 0;
    // Force 3 output channels so RGBA or grayscale files still match the
    // 3-byte pixel stride assumed by the resize code.
    unsigned char* imageData = stbi_load(filename, &width, &height, &channels, 3);
    if (imageData == nullptr) {
        std::cerr << "Error: Could not load image " << filename << std::endl;
        // Do not fall through: width/height/channels are meaningless here.
        return nullptr;
    }

    std::cout << "Image loaded: " << filename << std::endl;
    std::cout << "Width: " << width << " Height: " << height << " Channels: " << channels << std::endl;
    return imageData;
}
3、最近邻算法cpu实现
/**
 * CPU reference implementation of nearest-neighbour resize for interleaved
 * 3-channel, 8-bit images.
 *
 * Destination pixel (x, y) copies source pixel
 * (floor(x * input_w / out_w), floor(y * input_h / out_h)). The mapping is
 * done in integer arithmetic, so it is exact for any scale factor (including
 * upscaling and non-integer ratios) and the source coordinate is always
 * inside the source image.
 *
 * @param ori_img  Source pixels, input_w * input_h * 3 bytes.
 * @param dst_img  Destination buffer, out_w * out_h * 3 bytes; overwritten.
 * @param input_w  Source width in pixels.
 * @param input_h  Source height in pixels.
 * @param out_w    Destination width in pixels.
 * @param out_h    Destination height in pixels.
 *
 * The function does nothing when a pointer is null or any dimension is not
 * positive.
 */
void resize_cpu(unsigned char* ori_img,
                unsigned char* dst_img,
                int input_w,int input_h,
                int out_w,int out_h){
    const int channel = 3;

    if (ori_img == nullptr || dst_img == nullptr ||
        input_w <= 0 || input_h <= 0 || out_w <= 0 || out_h <= 0) {
        return;
    }

    for (int y = 0; y < out_h; y++) {
        // Vertical mapping uses the height ratio (not the width ratio).
        // 64-bit product avoids overflow; y < out_h guarantees srcY < input_h.
        const int srcY = static_cast<int>(static_cast<long long>(y) * input_h / out_h);
        const unsigned char* srcRow = ori_img + static_cast<size_t>(srcY) * input_w * channel;
        unsigned char* dstRow = dst_img + static_cast<size_t>(y) * out_w * channel;

        for (int x = 0; x < out_w; x++) {
            // x < out_w guarantees srcX < input_w, so no clamp is required.
            const int srcX = static_cast<int>(static_cast<long long>(x) * input_w / out_w);
            const unsigned char* srcPixel = srcRow + static_cast<size_t>(srcX) * channel;
            unsigned char* dstPixel = dstRow + static_cast<size_t>(x) * channel;

            for (int c = 0; c < channel; c++) {
                dstPixel[c] = srcPixel[c];
            }
        }
    }
}
4、cuda 核函数
/**
 * Nearest-neighbour resize kernel for interleaved 3-channel, 8-bit images.
 *
 * Launch layout: 2D grid of 2D blocks; each thread produces one output pixel,
 * with x taken from the .x dimensions and y from the .y dimensions. The grid
 * must cover outputWidth x outputHeight; surplus threads exit immediately.
 * No shared memory is used.
 *
 * Output pixel (x, y) copies input pixel
 * (floor(x * inputWidth / outputWidth), floor(y * inputHeight / outputHeight)).
 *
 * @param d_inputImage   Device pointer, inputWidth * inputHeight * 3 bytes.
 * @param d_outputImage  Device pointer, outputWidth * outputHeight * 3 bytes.
 * @param inputWidth     Source width in pixels.
 * @param inputHeight    Source height in pixels.
 * @param outputWidth    Destination width in pixels.
 * @param outputHeight   Destination height in pixels.
 */
__global__ void nearestNeighborKernel(unsigned char* d_inputImage,
                                      unsigned char* d_outputImage,
                                      int inputWidth, int inputHeight,
                                      int outputWidth, int outputHeight){
    const int x = blockIdx.x * blockDim.x + threadIdx.x;
    const int y = blockIdx.y * blockDim.y + threadIdx.y;

    // Threads past the image edge, or a degenerate source image, have no work.
    if (x >= outputWidth || y >= outputHeight || inputWidth <= 0 || inputHeight <= 0) {
        return;
    }

    // Exact integer mapping: no float rounding, and because x < outputWidth
    // and y < outputHeight the result is always a valid source coordinate.
    const int srcX = static_cast<int>(static_cast<long long>(x) * inputWidth / outputWidth);
    const int srcY = static_cast<int>(static_cast<long long>(y) * inputHeight / outputHeight);

    // size_t offsets so large images cannot overflow 32-bit index arithmetic.
    const size_t srcIndex = (static_cast<size_t>(srcY) * inputWidth + srcX) * 3;
    const size_t dstIndex = (static_cast<size_t>(y) * outputWidth + x) * 3;

    // The three channel bytes of a pixel are adjacent in memory.
    const unsigned char* srcPixel = d_inputImage + srcIndex;
    unsigned char* dstPixel = d_outputImage + dstIndex;
    dstPixel[0] = srcPixel[0];
    dstPixel[1] = srcPixel[1];
    dstPixel[2] = srcPixel[2];
}
5、cuda launch核函数
/**
 * Host wrapper: copies a 3-channel, 8-bit image to the device, runs
 * nearestNeighborKernel, and copies the resized image back to the host.
 *
 * Every CUDA call is checked. On any failure an error is written to
 * std::cerr, device memory is released, and h_outputImage is left untouched.
 *
 * @param h_inputImage   Host source pixels, inputWidth * inputHeight * 3 bytes.
 * @param h_outputImage  Host destination, outputWidth * outputHeight * 3 bytes.
 * @param inputWidth     Source width in pixels.
 * @param inputHeight    Source height in pixels.
 * @param outputWidth    Destination width in pixels.
 * @param outputHeight   Destination height in pixels.
 */
void nearestNeighborInterpolation_launch(unsigned char* h_inputImage,
                                         unsigned char* h_outputImage,
                                         int inputWidth, int inputHeight,
                                         int outputWidth, int outputHeight){
    if (h_inputImage == nullptr || h_outputImage == nullptr ||
        inputWidth <= 0 || inputHeight <= 0 || outputWidth <= 0 || outputHeight <= 0) {
        std::cerr << "nearestNeighborInterpolation_launch: invalid arguments" << std::endl;
        return;
    }

    unsigned char* d_inputImage = nullptr;
    unsigned char* d_outputImage = nullptr;
    // Widen to size_t before multiplying so large images do not overflow int.
    const size_t inputImageSize = static_cast<size_t>(inputWidth) * inputHeight * 3 * sizeof(unsigned char);
    const size_t outputImageSize = static_cast<size_t>(outputWidth) * outputHeight * 3 * sizeof(unsigned char);
    std::cout << "sizeof(unsigned char) = " << sizeof(unsigned char) << std::endl;

    // Reports a failed CUDA call and tells the caller whether to continue.
    auto succeeded = [](cudaError_t err, const char* what) -> bool {
        if (err == cudaSuccess) {
            return true;
        }
        std::cerr << "CUDA error in " << what << ": " << cudaGetErrorString(err) << std::endl;
        return false;
    };

    // cuda malloc && memset (the input buffer is fully overwritten by the
    // host-to-device copy, so only the output buffer is cleared)
    bool ok = succeeded(cudaMalloc(&d_inputImage, inputImageSize), "cudaMalloc(input)")
           && succeeded(cudaMalloc(&d_outputImage, outputImageSize), "cudaMalloc(output)")
           && succeeded(cudaMemset(d_outputImage, 0, outputImageSize), "cudaMemset(output)");

    // h2d
    if (ok) {
        const cudaError_t status = cudaMemcpy(d_inputImage, h_inputImage, inputImageSize, cudaMemcpyHostToDevice);
        std::cout << "h2d status = " << status << std::endl;
        ok = succeeded(status, "cudaMemcpy(host to device)");
    }

    if (ok) {
        std::cout << "outputWidth = " << outputWidth << ",outputHeight = " << outputHeight << std::endl;

        // One thread per output pixel; ceil-divide so the grid covers the image.
        const dim3 blockSize(16, 16, 1);
        const dim3 gridSize((outputWidth + blockSize.x - 1) / blockSize.x,
                            (outputHeight + blockSize.y - 1) / blockSize.y, 1);
        std::cout << "blockSize: x =" << blockSize.x << ",y = " << blockSize.y << ",z =" << blockSize.z << std::endl;
        std::cout << "gridSize: x = " << gridSize.x << ",y=" << gridSize.y << ",z = " << gridSize.z << std::endl;

        nearestNeighborKernel<<<gridSize, blockSize>>>(d_inputImage, d_outputImage,
                                                       inputWidth, inputHeight,
                                                       outputWidth, outputHeight);

        // cudaGetLastError catches launch-configuration errors; the
        // synchronize surfaces faults raised while the kernel was running.
        ok = succeeded(cudaGetLastError(), "nearestNeighborKernel launch")
          && succeeded(cudaDeviceSynchronize(), "nearestNeighborKernel execution")
          && succeeded(cudaMemcpy(h_outputImage, d_outputImage, outputImageSize, cudaMemcpyDeviceToHost),
                       "cudaMemcpy(device to host)");
    }

    if (!ok) {
        std::cerr << "nearestNeighborInterpolation_launch: resize failed, output image not written" << std::endl;
    }

    // cudaFree accepts a null pointer, so this is safe on every path.
    cudaFree(d_inputImage);
    cudaFree(d_outputImage);
}
6、主函数
/**
 * Demo entry point: loads a 640x427 image, resizes it to 320x213 with the GPU
 * nearest-neighbour implementation, and writes the result as a PNG.
 *
 * @return 0 on success, 1 if the input cannot be read, has unexpected
 *         dimensions, or the output cannot be written.
 */
int main(){
    int inputWidth = 640;
    int inputHeight = 427;
    int outputWidth = 320;
    int outputHeight = 213;
    const char* image_path = "../det_427_640.png";

    // The resize code trusts inputWidth/inputHeight, so confirm that the file
    // really has those dimensions before any pixel is indexed.
    int fileWidth = 0, fileHeight = 0, fileChannels = 0;
    if (!stbi_info(image_path, &fileWidth, &fileHeight, &fileChannels)) {
        std::cerr << "Error: Could not read image header " << image_path << std::endl;
        return 1;
    }
    if (fileWidth != inputWidth || fileHeight != inputHeight) {
        std::cerr << "Error: expected a " << inputWidth << "x" << inputHeight
                  << " image but " << image_path << " is "
                  << fileWidth << "x" << fileHeight << std::endl;
        return 1;
    }

    // malloc host
    unsigned char* h_inputImage = read_image(image_path );
    if (h_inputImage == nullptr) {
        return 1;
    }
    // Value-initialized so a failed resize yields a black image, not garbage.
    unsigned char* h_outputImage = new unsigned char[static_cast<size_t>(outputWidth) * outputHeight * 3]();

    // gpu impl (nearest neighbour)
    nearestNeighborInterpolation_launch(h_inputImage, h_outputImage, inputWidth, inputHeight, outputWidth, outputHeight);
    // cpu impl
    // resize_cpu( h_inputImage, h_outputImage, inputWidth, inputHeight, outputWidth, outputHeight );

    // save img
    const char* output_filename = "../det_427_640_gpu_out.png";
    const int written = stbi_write_png( output_filename, outputWidth, outputHeight, 3, h_outputImage, outputWidth * 3);
    if (written == 0) {
        std::cerr << "Error: Could not write image " << output_filename << std::endl;
    }

    // free cpu: the input buffer was allocated by stb_image, so it must be
    // released with stbi_image_free rather than delete[].
    stbi_image_free(h_inputImage);
    delete[] h_outputImage;
    return written != 0 ? 0 : 1;
}
最终结果
源图像:尺寸=427*640
目标图像:尺寸=213 * 320
欢迎来到由智源人工智能研究院发起的Triton中文社区,这里是一个汇聚了AI开发者、数据科学家、机器学习爱好者以及业界专家的活力平台。我们致力于成为业内领先的Triton技术交流与应用分享的殿堂,为推动人工智能技术的普及与深化应用贡献力量。
更多推荐
已为社区贡献2条内容
所有评论(0)