cuda实战-resize（图像缩放）双线性插值算法实现

线性插值是一种简单且常用的插值方法，用于在已知数据点之间估算未知值。在图像处理领域，线性插值被广泛用于图像缩放、旋转、变形等操作，以在现有像素之间生成新的像素值，从而实现平滑过渡。在图像处理中的应用通常是二维的，因此需要扩展到双线性插值：目标像素P(x,y)的像素值由与之最相邻的四个像素的像素值线性加权而成。优点：简单易实现，线性插值算法非常简单，计算速度快；计算高效，相比更复杂的插值方法（如双三次插值），线性插值的计算量较少，更适合实时应用。

qq_43448134

1159人浏览 · 2024-09-09 20:47:35

qq_43448134 · 2024-09-09 20:47:35 发布

一、背景

图像缩放是指调整图像的尺寸，使其变大或变小的过程。在图像处理和计算机视觉领域，图像缩放是一个基础且常见的操作，涉及根据一定的算法对图像像素进行插值或重采样，以生成新的图像尺寸，同时尽量保持图像的质量和细节。

图像缩放的主要目的是在保持图像原始信息和视觉效果的前提下，适应不同的应用需求，比如显示设备的分辨率、数据传输速度、存储空间、以及后续的图像处理或分析任务。

该应用场景非常广泛，包括但不限于图像显示、机器学习中的数据增强、图像压缩、地图应用以及医学图像分析等领域。

二、线性插值理论的介绍

线性插值是一种简单且常用的插值方法，用于在已知数据点之间估算未知值。在图像处理领域，线性插值被广泛用于图像缩放、旋转、变形等操作，以在现有像素之间生成新的像素值，从而实现平滑过渡。

参考：https://zh.wikipedia.org/wiki/%E5%8F%8C%E7%BA%BF%E6%80%A7%E6%8F%92%E5%80%BC

在图像处理中的应用通常是二维的，因此需要扩展到双线性插值。

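结合上图对双线性插值进行理解：目标像素 P(x,y) 的像素值（注意是像素值而不是坐标值）由与之最相邻的四个像素的像素值线性加权而成。设四个相邻像素为 $Q_{11}=(x_1,y_1)$、$Q_{21}=(x_2,y_1)$、$Q_{12}=(x_1,y_2)$、$Q_{22}=(x_2,y_2)$，相邻像素间距为 1，并令 $w_x = x - x_1$、$w_y = y - y_1$，则

$$f(P) \approx (1-w_x)(1-w_y)\,f(Q_{11}) + w_x(1-w_y)\,f(Q_{21}) + (1-w_x)\,w_y\,f(Q_{12}) + w_x w_y\,f(Q_{22})$$

即离目标位置越近的像素权重越大，四个权重之和为 1。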

优点：
- 简单易实现：线性插值算法非常简单，计算速度快。
- 计算高效：相比更复杂的插值方法（如双三次插值），线性插值的计算量较少，更适合实时应用。
- 效果相对平滑：可以在一定程度上避免最近邻插值可能产生的“块状”或“锯齿”效果。
缺点：
- 精度有限：线性插值假设数据点之间变化是线性的，对于实际图像中的复杂细节或高频信息，可能不够精确。
- 可能产生模糊：在放大图像时，线性插值会导致一些细节的丢失，图像变得模糊。

三、对应的cuda代码实现

3.1 头文件

#include <cuda_runtime.h>
#include <iostream>

#define STB_IMAGE_IMPLEMENTATION
#include "stb_image.h"

#define STB_IMAGE_WRITE_IMPLEMENTATION
#include "stb_image_write.h"

using namespace std;
typedef unsigned char uchar;

3.2 读取图像文件

// Loads an image file from disk into host memory using stb_image.
//
// Parameters:
//   filename - path of the image file to decode; must not be null.
// Returns:
//   Pointer to the decoded 8-bit pixel data, or nullptr when the file could
//   not be loaded. The buffer is allocated by stb_image, so it must be
//   released with stbi_image_free() rather than delete[].
// Notes:
//   The image is decoded with its native channel count (the last argument of
//   stbi_load is 0). Width, height and channel count are only logged; they are
//   not returned to the caller.
unsigned char* read_image(const char* filename ){
    if (filename == nullptr) {
        std::cerr << "Error: Could not load image (null filename)" << std::endl;
        return nullptr;
    }

    // Zero-initialised so nothing indeterminate is ever logged.
    int width = 0, height = 0, channels = 0;

    // Decode the file; stb_image fills in the geometry on success.
    unsigned char* imageData = stbi_load(filename, &width, &height, &channels, 0);
    if (imageData == nullptr) {
        // Stop here: the geometry values are meaningless after a failed load,
        // and reporting "Image loaded" would be misleading.
        std::cerr << "Error: Could not load image " << filename << std::endl;
        return nullptr;
    }

    std::cout << "Image loaded: " << filename << std::endl;
    std::cout << "Width: " << width << " Height: " << height << " Channels: " << channels << std::endl;

    return imageData;
}

3.3 cuda launch函数

// Forward declaration so the launcher compiles even when the kernel is
// defined further down in the file.
__global__ void bilinearInterpolationKernel(uchar3* d_inputImage,
                                            uchar3* d_outputImage,
                                            int inputWidth, int inputHeight,
                                            int outputWidth, int outputHeight,
                                            float scaleX, float scaleY);

// Resizes a 3-channel 8-bit image on the GPU with bilinear interpolation.
//
// Parameters:
//   h_inputImageUChar3  - host pointer to inputWidth * inputHeight pixels.
//   h_outputImageUChar3 - host pointer to a buffer with room for
//                         outputWidth * outputHeight pixels; filled on success.
//   inputWidth/inputHeight   - input image size in pixels (must be > 0).
//   outputWidth/outputHeight - output image size in pixels (must be > 0).
//
// The function allocates temporary device buffers, copies the input to the
// device, launches bilinearInterpolationKernel with 16x16 thread blocks,
// copies the result back and frees the device buffers. Any failure is
// reported on std::cerr and the remaining steps are skipped; the device
// buffers are released on every path. On failure the output buffer is left
// untouched.
void bilinearInterpolation_launch(uchar3*  h_inputImageUChar3, 
                                  uchar3*  h_outputImageUChar3, 
                                  int inputWidth, int inputHeight, 
                                  int outputWidth, int outputHeight){
    if (h_inputImageUChar3 == nullptr || h_outputImageUChar3 == nullptr) {
        std::cerr << "bilinearInterpolation_launch: null host image pointer" << std::endl;
        return;
    }
    if (inputWidth <= 0 || inputHeight <= 0 || outputWidth <= 0 || outputHeight <= 0) {
        std::cerr << "bilinearInterpolation_launch: image dimensions must be positive" << std::endl;
        return;
    }

    uchar3* d_inputImage  = nullptr;
    uchar3* d_outputImage = nullptr;

    // Widen to size_t before multiplying so large images cannot overflow int.
    const size_t inputImageSize  = static_cast<size_t>(inputWidth)  * inputHeight  * sizeof(uchar3);
    const size_t outputImageSize = static_cast<size_t>(outputWidth) * outputHeight * sizeof(uchar3);
    cout << "sizeof(uchar3) = " << sizeof(uchar3) << endl;

    // Reports a failed CUDA call and tells the caller whether to continue.
    auto cudaOk = [](cudaError_t err, const char* what) -> bool {
        if (err == cudaSuccess) {
            return true;
        }
        std::cerr << "CUDA error in " << what << ": " << cudaGetErrorString(err) << std::endl;
        return false;
    };

    // Device allocations. No memset is needed: the input buffer is fully
    // overwritten by the upload and every output pixel is written by the kernel.
    bool ok = cudaOk(cudaMalloc(&d_inputImage, inputImageSize), "cudaMalloc(input)")
           && cudaOk(cudaMalloc(&d_outputImage, outputImageSize), "cudaMalloc(output)");

    // Host -> device upload of the source image.
    if (ok) {
        auto status = cudaMemcpy( d_inputImage, h_inputImageUChar3, inputImageSize, cudaMemcpyHostToDevice );
        cout << "h2d status = " << status << endl;
        ok = cudaOk(status, "cudaMemcpy(host to device)");
    }

    if (ok) {
        // Output-to-input coordinate scale. Using (size - 1) in the numerator
        // keeps the largest sampled coordinate inside the input image.
        const float scaleX = (float)(inputWidth -  1) / outputWidth;
        const float scaleY = (float)(inputHeight - 1) / outputHeight;

        // One thread per output pixel; the grid is rounded up to cover the
        // whole output image.
        dim3 blockSize(16,16,1);
        dim3 gridSize( (outputWidth + blockSize.x -1) /blockSize.x,
                       (outputHeight + blockSize.y -1) /blockSize.y, 1 );
        cout << "blockSize: x =" << blockSize.x <<",y = " << blockSize.y <<",z ="<< blockSize.z << endl;
        cout << "gridSize: x = " << gridSize.x <<",y="<< gridSize.y <<",z = "<< gridSize.z<< endl;

        // Bilinear interpolation kernel.
        bilinearInterpolationKernel<<<gridSize,blockSize >>>(d_inputImage,d_outputImage,inputWidth, inputHeight,outputWidth, outputHeight,scaleX,scaleY );

        // A launch does not return a status: configuration errors surface via
        // cudaGetLastError(), execution errors at the next synchronisation.
        ok = cudaOk(cudaGetLastError(), "kernel launch")
          && cudaOk(cudaDeviceSynchronize(), "kernel execution")
          // Device -> host download of the resized image.
          && cudaOk(cudaMemcpy(h_outputImageUChar3, d_outputImage, outputImageSize, cudaMemcpyDeviceToHost),
                    "cudaMemcpy(device to host)");
    }

    // Release device memory on every path; cudaFree(nullptr) is a no-op.
    cudaFree(d_inputImage);
    cudaFree(d_outputImage);

}

3.4 cuda 核函数

（包含一个 __global__ 核函数和一个 __device__ 函数，其中 __device__ 函数带有返回值）

// Computes the bilinearly interpolated RGB value at a fractional position of
// the input image.
//
// Parameters:
//   srcX, srcY    - sampling position in input-image pixel coordinates.
//   d_inputImage  - input image, indexed row-major as y * inputWidth + x.
//   inputWidth    - input image width in pixels.
//   inputHeight   - input image height in pixels.
//   res           - unused; kept only so existing call sites keep compiling.
// Returns:
//   The weighted average of the four pixels surrounding (srcX, srcY), with
//   each channel rounded to the nearest integer.
__device__ uchar3  bilinearInterpolation_test(float srcX, float srcY, 
                                        uchar3* d_inputImage, int inputWidth, int inputHeight,
                                        uchar3& res){
    (void)res;

    // Clamp the sampling position into the image so that positions outside it
    // replicate the border pixel instead of reading out of bounds.
    srcX = fminf(fmaxf(srcX, 0.0f), (float)(inputWidth  - 1));
    srcY = fminf(fmaxf(srcY, 0.0f), (float)(inputHeight - 1));

    // Locate the four surrounding pixels; the right/bottom neighbours are
    // clamped to the last column/row.
    const int x1 = (int)floorf(srcX);
    const int y1 = (int)floorf(srcY);
    const int x2 = min(x1 + 1, inputWidth - 1);
    const int y2 = min(y1 + 1, inputHeight - 1);

    // Fractional offsets inside the pixel cell.
    const float wx = srcX - x1;
    const float wy = srcY - y1;

    // Per-corner weights; computed once and shared by all three channels.
    const float w1 = (1.0f - wx) * (1.0f - wy);  // top-left
    const float w2 = wx * (1.0f - wy);           // top-right
    const float w3 = (1.0f - wx) * wy;           // bottom-left
    const float w4 = wx * wy;                    // bottom-right

    // Fetch the four neighbouring pixels.
    const uchar3 p1 = d_inputImage[y1 * inputWidth + x1];
    const uchar3 p2 = d_inputImage[y1 * inputWidth + x2];
    const uchar3 p3 = d_inputImage[y2 * inputWidth + x1];
    const uchar3 p4 = d_inputImage[y2 * inputWidth + x2];

    // Blend each channel. Adding 0.5f before the truncating cast rounds to
    // nearest, so float rounding error cannot turn e.g. 254.99998 into 254.
    uchar3 interpolated;
    interpolated.x = (uchar)(w1 * p1.x + w2 * p2.x + w3 * p3.x + w4 * p4.x + 0.5f);
    interpolated.y = (uchar)(w1 * p1.y + w2 * p2.y + w3 * p3.y + w4 * p4.y + 0.5f);
    interpolated.z = (uchar)(w1 * p1.z + w2 * p2.z + w3 * p3.z + w4 * p4.z + 0.5f);
    return interpolated;
}

// Resize kernel: each thread produces one output pixel.
//
// Launch layout: a 2D grid of 2D blocks whose x/y extents cover at least
// outputWidth x outputHeight threads; surplus threads exit immediately.
//
// Parameters:
//   d_inputImage  - input image, passed through to the sampling helper.
//   d_outputImage - output image, written at row * outputWidth + col.
//   inputWidth/inputHeight   - input image size in pixels.
//   outputWidth/outputHeight - output image size in pixels.
//   scaleX/scaleY - factors that map an output coordinate to an input
//                   coordinate (source = destination * scale).
__global__ void bilinearInterpolationKernel(
                                  uchar3* d_inputImage, 
                                  uchar3* d_outputImage, 
                                  int inputWidth, int inputHeight, 
                                  int outputWidth, int outputHeight,
                                  float scaleX, float scaleY
){
    const int col = blockIdx.x * blockDim.x + threadIdx.x;
    const int row = blockIdx.y * blockDim.y + threadIdx.y;

    // Threads that fall outside the output image have nothing to do.
    if (col >= outputWidth || row >= outputHeight) {
        return;
    }

    // Position of this output pixel in the source image.
    const float sampleX = col * scaleX;
    const float sampleY = row * scaleY;

    // The helper takes a reference argument in addition to returning the
    // result; only the returned value is stored.
    uchar3 scratch;
    const uchar3 pixel = bilinearInterpolation_test(sampleX, sampleY,
                                                    d_inputImage, inputWidth, inputHeight,
                                                    scratch);
    d_outputImage[row * outputWidth + col] = pixel;
}

3.5 主函数

// Entry point: loads a fixed 640x427 RGB image, downsizes it to 320x213 on
// the GPU and writes the result as a PNG file.
//
// Returns 0 on success and 1 when the input cannot be read, does not have the
// expected geometry, or the output file cannot be written.
int main(){

    // Geometry expected by this example; the input file must match it because
    // the buffer sizes handed to the GPU launcher are derived from these values.
    const int inputWidth   = 640;
    const int inputHeight  = 427;
    const int outputWidth  = 320;
    const int outputHeight = 213;

    const char* image_path = "../det_427_640.png";

    // Check the file header first: a different size or channel count would
    // make the launcher read past the end of the decoded buffer.
    int fileWidth = 0, fileHeight = 0, fileChannels = 0;
    if (!stbi_info(image_path, &fileWidth, &fileHeight, &fileChannels)) {
        std::cerr << "Error: Could not read image header " << image_path << std::endl;
        return 1;
    }
    if (fileWidth != inputWidth || fileHeight != inputHeight || fileChannels != 3) {
        std::cerr << "Error: expected a " << inputWidth << "x" << inputHeight
                  << " 3-channel image, got " << fileWidth << "x" << fileHeight
                  << " with " << fileChannels << " channel(s)" << std::endl;
        return 1;
    }

    // Read the image.
    unsigned char* h_inputImage = read_image(image_path );
    if (h_inputImage == nullptr) {
        return 1;
    }

    // Host output buffer, zero-initialised so a failed GPU run yields a black
    // image instead of indeterminate bytes.
    unsigned char* h_outputImage = new unsigned char[outputWidth * outputHeight * 3]();
    
    // Call the CUDA launcher. The interleaved RGB bytes are viewed as uchar3
    // pixels (3 bytes each), which is the type the launcher expects.
    bilinearInterpolation_launch(reinterpret_cast<uchar3*>(h_inputImage),
                                 reinterpret_cast<uchar3*>(h_outputImage),
                                 inputWidth, inputHeight, outputWidth, outputHeight);

    // Save the resized image; stbi_write_png returns 0 on failure.
    const char* output_filename = "../det_427_640_gpu_out.png";
    const int written = stbi_write_png( output_filename, outputWidth, outputHeight, 3, h_outputImage, outputWidth * 3);
    if (!written) {
        std::cerr << "Error: Could not write image " << output_filename << std::endl;
    }
    
    // Free host memory. The input buffer was allocated by stb_image and must
    // be released through its own deallocator, not delete[].
    stbi_image_free(h_inputImage);
    delete[] h_outputImage;

    return written ? 0 : 1;
}

3.6 CMakeLists.txt

# Build script for the image_resize_bil executable, built from
# bilinearInterpolation_resize.cu.
cmake_minimum_required(VERSION 3.0 FATAL_ERROR)
project(image_resize_cuda_proj)

# GPU architecture to compile for. "ivcore11" is not an NVIDIA SM number;
# presumably a vendor-specific architecture name for the toolchain below - confirm.
# NOTE(review): CMAKE_CUDA_ARCHITECTURES was introduced in CMake 3.18, which is
# newer than the minimum version requested above - confirm the intended minimum.
set(CMAKE_CUDA_ARCHITECTURES ivcore11)

# Show the full compiler/linker command lines during the build.
set(CMAKE_VERBOSE_MAKEFILE ON)

# Use the clang++ shipped under /usr/local/corex as the C++ compiler.
# NOTE(review): this is set after project(); CMake normally selects the
# compiler during project(), so confirm that this setting takes effect.
set(CMAKE_CXX_COMPILER /usr/local/corex/bin/clang++)

# NOTE(review): nothing in the accompanying source includes a Torch header;
# the package may be used only for its CUDA build setup - confirm it is needed.
find_package(Torch REQUIRED)

# The executable target and the libraries it links against.
add_executable(image_resize_bil bilinearInterpolation_resize.cu)
target_link_libraries(image_resize_bil "${TORCH_LIBRARIES}")