cuda实现中值滤波

lopiyi

363人浏览 · 2023-08-06 11:06:35

lopiyi · 2023-08-06 11:06:35 发布

窗口尺寸必须是大于1的奇数，窗口宽高可以不等；窗口内的像素总数（宽×高）不能超过核函数中排序数组的容量 30×30=900。

#include <iostream>
#include<cuda.h>
#include <cuda_runtime_api.h>
#include <opencv2/opencv.hpp>
#include<device_launch_parameters.h>
using namespace std;

// CUDA error-checking helpers.
//
// CUDA_CHECK(expr) evaluates a CUDA runtime call once and forwards its
// cudaError_t, together with the call site's file and line, to gpuAssert.
// The do { } while (0) wrapper makes the macro expand to exactly one
// statement, so `if (c) CUDA_CHECK(x); else ...` parses as intended.
#define CUDA_CHECK(ans) do { gpuAssert((ans), __FILE__, __LINE__); } while (0)

// Reports a failed CUDA call on std::cerr and, by default, terminates.
//
//   code  - status returned by the CUDA runtime call
//   file  - source file of the call site (normally __FILE__)
//   line  - source line of the call site (normally __LINE__)
//   abort - when true (default) the process exits with `code` as its exit
//           status; when false the error is only logged and control returns
//
// Does nothing when code == cudaSuccess.
inline void gpuAssert(cudaError_t code, const char* file, int line, bool abort = true)
{
	if (code != cudaSuccess)
	{
		std::cerr << "CUDA Error: " << cudaGetErrorString(code) << " " << file << " " << line << std::endl;
		if (abort) exit(code);
	}
}
// Edge length of the square thread block: kernels are launched with
// BLOCK_SIZE x BLOCK_SIZE threads per block.
#define BLOCK_SIZE 16


// CUDA kernel: per-channel median filter over an interleaved 8-bit image.
//
// Launch layout: 2D grid of 2D blocks, one thread per output pixel
//   (col from the x dimension, row from the y dimension). Threads that land
//   outside the image return immediately, so the grid may overhang the image.
// Shared memory: none; each thread uses a private scratch array.
//
// Parameters:
//   input    - source pixels, row-major, `channels` interleaved bytes per pixel
//   output   - destination buffer with the same layout and size as `input`;
//              must not alias `input`
//   width, height, channels - image dimensions
//   kernelW, kernelH - window width/height. The neighbourhood actually visited
//              is (2*(kernelW/2)+1) x (2*(kernelH/2)+1), so odd sizes are
//              expected; an even size behaves like the next odd size.
//
// Image borders are handled by clamping neighbour coordinates to the image
// (edge replication).
//
// The window must hold at most 30*30 samples. If it does not, or if either
// kernel dimension is < 1, the pixel is copied through unfiltered instead of
// overrunning the scratch array.
__global__ void medianFilter(const unsigned char* __restrict__ input, unsigned char* __restrict__ output, const int width, const int height, const int channels,
							 const int kernelW, const int kernelH)
{
	// Capacity of the per-thread scratch array, in samples.
	const int kMaxWindow = 30 * 30;

	// Pixel handled by this thread.
	const int col = blockIdx.x * blockDim.x + threadIdx.x;
	const int row = blockIdx.y * blockDim.y + threadIdx.y;

	// The grid is rounded up to whole blocks; ignore threads past the image edge.
	if (col >= width || row >= height)
		return;

	// Half-extent of the window on each side of the centre pixel.
	const int paddingW = kernelW / 2;
	const int paddingH = kernelH / 2;

	// Offset of this pixel's first channel; size_t so large images cannot overflow int.
	const size_t index = (static_cast<size_t>(row) * width + col) * channels;

	// Number of samples the loops below will gather (64-bit to survive huge kernel sizes).
	const long long windowSize = (2LL * paddingW + 1) * (2LL * paddingH + 1);
	if (kernelW < 1 || kernelH < 1 || windowSize > kMaxWindow)
	{
		// Unsupported window: leave the pixel unchanged rather than write out of bounds.
		for (int k = 0; k < channels; k++)
			output[index + k] = input[index + k];
		return;
	}

	unsigned char sortedWindow[kMaxWindow];
	const int count = static_cast<int>(windowSize);
	const int mid = count / 2;

	for (int k = 0; k < channels; k++) {
		// Gather channel k of every pixel in the window.
		int n = 0;
		for (int i = -paddingH; i <= paddingH; i++) {
			for (int j = -paddingW; j <= paddingW; j++) {
				// Clamp to the image so border pixels reuse the nearest edge value.
				const int curRow = min(max(row + i, 0), height - 1);
				const int curCol = min(max(col + j, 0), width - 1);
				sortedWindow[n++] = input[(static_cast<size_t>(curRow) * width + curCol) * channels + k];
			}
		}

		// Partial selection sort: after pass i, slot i holds the i-th smallest
		// sample, so sorting up to `mid` is enough to know the median.
		for (int i = 0; i <= mid; i++) {
			for (int j = i + 1; j < count; j++) {
				if (sortedWindow[i] > sortedWindow[j]) {
					const unsigned char temp = sortedWindow[i];
					sortedWindow[i] = sortedWindow[j];
					sortedWindow[j] = temp;
				}
			}
		}

		output[index + k] = sortedWindow[mid];
	}
}
// Demo driver: loads one colour image from disk, runs medianFilter on the GPU
// with a 7x7 window, and shows the input and the filtered result in OpenCV
// windows until a key is pressed.
//
// Any failing CUDA call is reported through CUDA_CHECK, which terminates the
// process. If the image cannot be read, a message is printed and the function
// returns without touching the GPU.
void test3(void)
{
	// Enlarge the device-side printf buffer so that printf calls added to a
	// kernel for debugging are not truncated. Purely a debugging aid, so a
	// failure here is logged but not fatal.
	const size_t printfFifoBytes = static_cast<size_t>(1048576) * 100;
	gpuAssert(cudaDeviceSetLimit(cudaLimitPrintfFifoSize, printfFifoBytes), __FILE__, __LINE__, false);

	// Load the input image.
	cv::Mat image = cv::imread("F:\\pic_data\\CBSD68\\3096.png", cv::IMREAD_COLOR);
	if (image.empty()) {
		std::cout << "Unable to read image" << std::endl;
		return ;
	}

	// The pixel data is copied to the device as one flat buffer, which is only
	// valid when the rows are stored back to back.
	if (!image.isContinuous())
		image = image.clone();

	const int width = image.cols;
	const int height = image.rows;
	const int channels = image.channels();

	// Widen before multiplying so large images cannot overflow int.
	const size_t imageSize = static_cast<size_t>(width) * height * channels;

	// Device buffers for the source and the filtered image.
	unsigned char* deviceInput = nullptr;
	unsigned char* deviceOutput = nullptr;
	CUDA_CHECK(cudaMalloc((void**)&deviceInput, imageSize));
	CUDA_CHECK(cudaMalloc((void**)&deviceOutput, imageSize));
	CUDA_CHECK(cudaMemcpy(deviceInput, image.data, imageSize, cudaMemcpyHostToDevice));

	// One thread per pixel; the grid is rounded up to cover the whole image.
	dim3 blockSize(BLOCK_SIZE, BLOCK_SIZE);
	dim3 gridSize((width + blockSize.x - 1) / blockSize.x, (height + blockSize.y - 1) / blockSize.y);
	cout << gridSize.x << "\t" << gridSize.y << endl;

	// Filter window size.
	const int kernelW = 7;
	const int kernelH = 7;

	medianFilter<<<gridSize, blockSize>>>(deviceInput, deviceOutput, width, height, channels, kernelW, kernelH);
	// A kernel launch returns no status; launch-configuration errors are
	// picked up here, execution errors by the blocking copy below.
	CUDA_CHECK(cudaGetLastError());

	// Copy the result straight into the output image's pixel buffer.
	cv::Mat output(height, width, image.type());
	CUDA_CHECK(cudaMemcpy(output.data, deviceOutput, imageSize, cudaMemcpyDeviceToHost));

	// The device buffers are no longer needed once the result is on the host.
	CUDA_CHECK(cudaFree(deviceInput));
	CUDA_CHECK(cudaFree(deviceOutput));

	// Show the input and output images.
	cv::imshow("Input Image", image);
	cv::imshow("Output Image", output);
	cv::waitKey(0);
	cv::destroyAllWindows();
}

Triton中文社区

欢迎来到由智源人工智能研究院发起的Triton中文社区，这里是一个汇聚了AI开发者、数据科学家、机器学习爱好者以及业界专家的活力平台。我们致力于成为业内领先的Triton技术交流与应用分享的殿堂，为推动人工智能技术的普及与深化应用贡献力量。

更多推荐

torch.device('cuda:0' if cuda else 'cpu')

# 这个地方的 cuda:0 实际上并不一定是 0 号 GPU，它取决于 CUDA_VISIBLE_DEVICES
# 逻辑 GPU 和物理 GPU 之间有一个对应关系
# 如果 CUDA_VISIBLE_DEVICES 为 2,1,3
# 那么 cuda:0 就是 2 号 GPU，cuda:1 就是 1 号 GPU，cuda:2 就是 3 号 GPU
return torch.device('cuda:0' if cuda else 'cpu')..