项目背景:
旨在通过一系列图像算法的 CUDA 实现,弄清楚各算法的原理及其对应的 CUDA 编程实现。
最近邻算法原理:

核心思想是根据目标图像的像素坐标映射回原始图像,选择离映射坐标最近的一个像素作为结果。
假设源图像的高、宽为 H、W,通道数为 C;目标图像的高、宽为 h、w,通道数为 c。则目标图像在像素坐标 (x, y) 处的像素值 P(x, y) 的计算方式如下:

  • 1)横坐标缩放比例scale_X = W / w,纵坐标缩放比例scale_Y = H / h
  • 2)目标图像像素对应的原图像横坐标 x1 = round(x * scale_X),纵坐标 y1 = round(y * scale_Y);x1、y1 需分别限制在 [0, W-1] 和 [0, H-1] 范围内以防止越界(下文代码实现中使用的是截断取整,而非四舍五入)
  • 3)因此 P(x,y) = Ori(x1,y1);
对应的代码实现:
1、头文件
#include <cuda_runtime.h>
#include <iostream>

// 读取和写入图像
#define STB_IMAGE_IMPLEMENTATION
#include "stb_image.h"
#define STB_IMAGE_WRITE_IMPLEMENTATION
#include "stb_image_write.h"

using namespace std;
typedef unsigned char uchar;
2、CPU读取图像
// Decodes an image file into a host buffer of interleaved 8-bit RGB pixels.
//
// filename: path of the image file to decode.
// Returns:  the buffer produced by stbi_load (release it with stbi_image_free,
//           not delete[]), or nullptr when the path is null or decoding fails.
unsigned char* read_image(const char* filename ){
    if (filename == nullptr) {
        std::cerr << "Error: Could not load image (null filename)" << std::endl;
        return nullptr;
    }

    int width = 0, height = 0, channels = 0;
    // Request exactly 3 channels: the resize routines in this file index the
    // buffer as 3 interleaved bytes per pixel, so RGBA / grayscale files have to
    // be converted while loading. `channels` still reports what the file stores.
    unsigned char* imageData = stbi_load(filename, &width, &height, &channels, 3);
    if (imageData == nullptr) {
        std::cerr << "Error: Could not load image " << filename << std::endl;
        // Bail out here so we neither claim success nor print unset dimensions.
        return nullptr;
    }
    std::cout << "Image loaded: " << filename << std::endl;
    std::cout << "Width: " << width << " Height: " << height << " Channels: " << channels << std::endl;
    return imageData;
}
3、最近邻算法cpu实现
// CPU reference implementation of nearest-neighbour resizing for interleaved
// 3-channel, 8-bit images stored row by row.
//
// ori_img:          source pixels, input_w * input_h * 3 bytes.
// dst_img:          destination pixels, out_w * out_h * 3 bytes (written in full).
// input_w, input_h: source size in pixels.
// out_w, out_h:     destination size in pixels.
//
// Each destination pixel (x, y) copies the source pixel at
// (trunc(x * input_w / out_w), trunc(y * input_h / out_h)), clamped to the
// source bounds. Null pointers or non-positive sizes make the call a no-op.
void resize_cpu(unsigned char* ori_img,
                          unsigned char* dst_img,
                          int input_w,int input_h,
                          int out_w,int out_h){
    const int channel = 3;

    if (ori_img == nullptr || dst_img == nullptr ||
        input_w <= 0 || input_h <= 0 || out_w <= 0 || out_h <= 0) {
        return;
    }

    // Convert to float BEFORE dividing; integer division would truncate the
    // ratio (e.g. 427 / 213 -> 2, and any upscale -> 0).
    const float scale_w = static_cast<float>(input_w) / static_cast<float>(out_w);
    const float scale_h = static_cast<float>(input_h) / static_cast<float>(out_h);

    for (int y = 0; y < out_h; y++) {
        // The vertical coordinate must use the vertical scale factor.
        int srcY = static_cast<int>(y * scale_h);
        if (srcY > input_h - 1) {
            srcY = input_h - 1;  // guard against float rounding past the last row
        }

        // size_t keeps the flat offsets from overflowing int on large images.
        const size_t srcRow = static_cast<size_t>(srcY) * static_cast<size_t>(input_w);
        const size_t dstRow = static_cast<size_t>(y) * static_cast<size_t>(out_w);

        for (int x = 0; x < out_w; x++) {
            int srcX = static_cast<int>(x * scale_w);
            if (srcX > input_w - 1) {
                srcX = input_w - 1;  // guard against float rounding past the last column
            }

            const size_t srcIndex = (srcRow + static_cast<size_t>(srcX)) * channel;
            const size_t dstIndex = (dstRow + static_cast<size_t>(x)) * channel;

            for (int c = 0; c < channel; c++) {
                dst_img[dstIndex + c] = ori_img[srcIndex + c];
            }
        }
    }
}
4、cuda 核函数
// Nearest-neighbour resize kernel for interleaved 3-channel, 8-bit images.
//
// Launch layout: 2D grid of 2D blocks, one thread per output pixel; the grid
// must cover at least outputWidth x outputHeight threads. Threads that fall
// outside the output image do nothing. No shared memory is used.
//
// d_inputImage:  device buffer, inputWidth * inputHeight * 3 bytes (read only).
// d_outputImage: device buffer, outputWidth * outputHeight * 3 bytes.
__global__ void nearestNeighborKernel(const unsigned char* d_inputImage, 
                                  unsigned char* d_outputImage, 
                                  int inputWidth, int inputHeight, 
                                  int outputWidth, int outputHeight){
    const int x = blockIdx.x * blockDim.x + threadIdx.x;
    const int y = blockIdx.y * blockDim.y + threadIdx.y;

    // The grid is rounded up to whole blocks, so tail threads must exit early.
    if (x >= outputWidth || y >= outputHeight) {
        return;
    }

    // Map the output pixel back into the input image (truncating conversion).
    int srcX = static_cast<int>(x / (float)outputWidth  * inputWidth);
    int srcY = static_cast<int>(y / (float)outputHeight * inputHeight);

    // Clamp so float rounding can never index past the last row / column.
    srcX = min(srcX, inputWidth - 1);
    srcY = min(srcY, inputHeight - 1);

    // size_t offsets avoid int overflow on very large images.
    const size_t srcIndex = (static_cast<size_t>(srcY) * inputWidth + srcX) * 3;
    const size_t dstIndex = (static_cast<size_t>(y) * outputWidth + x) * 3;

    // The three channel bytes of a pixel are adjacent in memory.
    d_outputImage[dstIndex]     = d_inputImage[srcIndex];
    d_outputImage[dstIndex + 1] = d_inputImage[srcIndex + 1];
    d_outputImage[dstIndex + 2] = d_inputImage[srcIndex + 2];
}
5、cuda launch核函数
// Host wrapper: resizes a 3-channel, 8-bit interleaved image on the GPU with
// nearestNeighborKernel.
//
// h_inputImage:  host buffer, inputWidth * inputHeight * 3 bytes.
// h_outputImage: host buffer, outputWidth * outputHeight * 3 bytes; written
//                only when every CUDA step succeeds.
//
// Allocates device buffers, copies the input up, launches one thread per output
// pixel in 16x16 blocks, waits for completion, copies the result back and frees
// the device buffers. Any CUDA failure is reported on stderr and the remaining
// steps are skipped; device memory is released on every path.
void nearestNeighborInterpolation_launch(unsigned char* h_inputImage, 
                                  unsigned char* h_outputImage, 
                                  int inputWidth, int inputHeight, 
                                  int outputWidth, int outputHeight){
    if (h_inputImage == nullptr || h_outputImage == nullptr ||
        inputWidth <= 0 || inputHeight <= 0 ||
        outputWidth <= 0 || outputHeight <= 0) {
        std::cerr << "nearestNeighborInterpolation_launch: invalid arguments" << std::endl;
        return;
    }

    // Reports a failed CUDA call and tells the caller whether to continue.
    auto succeeded = [](cudaError_t err, const char* what) -> bool {
        if (err != cudaSuccess) {
            std::cerr << "CUDA error in " << what << ": "
                      << cudaGetErrorString(err) << std::endl;
            return false;
        }
        return true;
    };

    unsigned char * d_inputImage = nullptr;
    unsigned char * d_outputImage = nullptr;

    // Widen to size_t before multiplying so the byte counts cannot overflow int.
    const size_t inputImageSize =
        static_cast<size_t>(inputWidth) * inputHeight * 3 * sizeof(unsigned char);
    const size_t outputImageSize =
        static_cast<size_t>(outputWidth) * outputHeight * 3 * sizeof(unsigned char);
    std::cout << "sizeof(unsigned char) = " << sizeof(unsigned char) << std::endl;

    // No cudaMemset: the input buffer is overwritten by the upload below and the
    // kernel writes every byte of the output buffer.
    bool ok = succeeded(cudaMalloc(&d_inputImage, inputImageSize), "cudaMalloc(input)")
           && succeeded(cudaMalloc(&d_outputImage, outputImageSize), "cudaMalloc(output)");

    if (ok) {
        // h2d
        auto status = cudaMemcpy( d_inputImage, h_inputImage, inputImageSize, cudaMemcpyHostToDevice );
        std::cout << "h2d status = " << status << std::endl;
        ok = succeeded(status, "cudaMemcpy(host -> device)");
    }

    if (ok) {
        std::cout << "outputWidth = " << outputWidth << ",outputHeight = " << outputHeight << std::endl;

        // Ceil-divide so the grid covers the whole output image.
        dim3 blockSize(16,16,1);
        dim3 gridSize( (outputWidth + blockSize.x -1) /blockSize.x,
                       (outputHeight + blockSize.y -1) /blockSize.y, 1 );
        std::cout << "blockSize: x =" << blockSize.x <<",y = " << blockSize.y <<",z ="<< blockSize.z << std::endl;
        std::cout << "gridSize: x = " << gridSize.x <<",y="<< gridSize.y <<",z = "<< gridSize.z<< std::endl;

        nearestNeighborKernel<<<gridSize,blockSize >>>(d_inputImage,d_outputImage,inputWidth, inputHeight,outputWidth, outputHeight );

        // Launch-configuration errors show up in cudaGetLastError(); faults
        // raised while the kernel runs show up at the synchronisation point.
        ok = succeeded(cudaGetLastError(), "kernel launch")
          && succeeded(cudaDeviceSynchronize(), "kernel execution")
          && succeeded(cudaMemcpy(h_outputImage, d_outputImage, outputImageSize, cudaMemcpyDeviceToHost),
                       "cudaMemcpy(device -> host)");
    }

    // cudaFree accepts a null pointer, so this is safe after a failed cudaMalloc.
    cudaFree(d_inputImage);
    cudaFree(d_outputImage);

}
6、主函数
// Demo entry point: loads a fixed test image, shrinks it on the GPU with
// nearest-neighbour interpolation and writes the result as a PNG.
// Returns 0 on success and 1 when the image cannot be read, does not have the
// expected size, or the output file cannot be written.
int main(){

    const int inputWidth   = 640;
    const int inputHeight  = 427;
    const int outputWidth  = 320;
    const int outputHeight = 213;

    const char* image_path = "../det_427_640.png";

    // The sizes above are hard-coded, so confirm the file really matches them
    // before the pixel buffer is indexed with those dimensions.
    int fileWidth = 0, fileHeight = 0, fileChannels = 0;
    if (!stbi_info(image_path, &fileWidth, &fileHeight, &fileChannels)) {
        std::cerr << "Error: Could not read image header " << image_path << std::endl;
        return 1;
    }
    if (fileWidth != inputWidth || fileHeight != inputHeight) {
        std::cerr << "Error: expected a " << inputWidth << "x" << inputHeight
                  << " image but " << image_path << " is "
                  << fileWidth << "x" << fileHeight << std::endl;
        return 1;
    }

    // malloc host
    unsigned char* h_inputImage = read_image(image_path );
    if (h_inputImage == nullptr) {
        // read_image has already reported the failure on stderr.
        return 1;
    }
    // Zero-initialised so a failed GPU run yields a black image, not garbage.
    unsigned char* h_outputImage =
        new unsigned char[static_cast<size_t>(outputWidth) * outputHeight * 3]();

    // gpu impl (nearest neighbour)
    nearestNeighborInterpolation_launch(h_inputImage, h_outputImage, inputWidth, inputHeight, outputWidth, outputHeight);


    // cpu impl
    // resize_cpu( h_inputImage, h_outputImage, inputWidth, inputHeight, outputWidth, outputHeight );

    // save img
    const char* output_filename = "../det_427_640_gpu_out.png";
    int exit_code = 0;
    if (!stbi_write_png( output_filename, outputWidth, outputHeight, 3, h_outputImage, outputWidth * 3)) {
        std::cerr << "Error: Could not write image " << output_filename << std::endl;
        exit_code = 1;
    }

    // free cpu
    // The input buffer comes from stbi_load, so it must go back through
    // stbi_image_free; delete[] on it is an allocator mismatch.
    stbi_image_free(h_inputImage);
    delete[] h_outputImage;
    return exit_code;
}
最终结果

源图像:尺寸=427*640
请添加图片描述
目标图像:尺寸=213 * 320
请添加图片描述

Logo

欢迎来到由智源人工智能研究院发起的Triton中文社区,这里是一个汇聚了AI开发者、数据科学家、机器学习爱好者以及业界专家的活力平台。我们致力于成为业内领先的Triton技术交流与应用分享的殿堂,为推动人工智能技术的普及与深化应用贡献力量。

更多推荐