cuda上使用remap函数
如果自己编译的opencv带cuda,最好还是使用cv::cuda::remap函数,耗时较少。
·
在使用opencv中的remap函数时,发现运行时间太长,对视频流进行重映射时根本无法做到实时,因此需要对其进行加速。
1.使用opencv里的cv::cuda::remap函数
cv::cuda::remap函数的头文件是#include <opencv2/cudawarping.hpp>,并且编译opencv时需要启用cuda支持。
//1.重映射矩阵转成cuda处理的数据格式
//map_x,map_y是重映射表,数据类型是CV_32FC1
cv::cuda::GpuMat m_mapx = ::cv::cuda::GpuMat(map_x);
cv::cuda::GpuMat m_mapy = ::cv::cuda::GpuMat(map_y);
//2.原图像转成cuda处理的数据格式
cv::cuda::GpuMat src(img);
//3.计算结果
cv::cuda::GpuMat gpuMat2;
cv::cuda::remap(src, gpuMat2, m_mapx, m_mapy, cv::INTER_LINEAR);
//4.结果转成Mat
cv::Mat dstimage;
gpuMat2.download(dstimage);
示例
#include <iostream>
#include <opencv2/opencv.hpp>
#include <opencv2/cudawarping.hpp>
using namespace cv;
// Benchmark: run the same bilinear remap once through cv::cuda::remap and once
// through cv::remap, printing the elapsed wall-clock time of each in milliseconds.
int main(int argc, char** argv)
{
    Mat img = imread("image.jpg", IMREAD_COLOR);
    if (img.empty())
    {
        std::cout << "Could not open the input image" << std::endl;
        exit(1);
    }

    const int in_width = img.cols;
    const int in_height = img.rows;

    // Build the lookup tables: every output pixel samples the source 20 columns
    // to its right, on the same row.
    Mat map_x(in_height, in_width, CV_32FC1);
    Mat map_y(in_height, in_width, CV_32FC1);
    for (int row = 0; row < in_height; ++row)
    {
        for (int col = 0; col < in_width; ++col)
        {
            map_x.at<float>(row, col) = (col + 20) / (float)in_width * in_width;
            map_y.at<float>(row, col) = row / (float)in_height * in_height;
        }
    }

    // Host -> device uploads happen before the timer starts, so they are not
    // part of the measured GPU time.
    cv::cuda::GpuMat m_mapx(map_x);
    cv::cuda::GpuMat m_mapy(map_y);
    cv::cuda::GpuMat gpuMat1(img);

    // --- GPU path: remap on the device plus the download of the result ---
    const double gpu_start = static_cast<double>(cv::getTickCount());
    cv::cuda::GpuMat gpuMat2;
    cv::cuda::remap(gpuMat1, gpuMat2, m_mapx, m_mapy, cv::INTER_LINEAR);
    cv::Mat GPUimage;
    gpuMat2.download(GPUimage);
    const double gpu_seconds = ((double)cv::getTickCount() - gpu_start) / cv::getTickFrequency();
    std::cout << "GPU运行remap函数的时间为:" << gpu_seconds * 1000 << "ms" << std::endl;

    // --- CPU path: the reference cv::remap on the same inputs ---
    const double cpu_start = static_cast<double>(cv::getTickCount());
    cv::Mat CPUimage;
    cv::remap(img, CPUimage, map_x, map_y, cv::INTER_LINEAR);
    const double cpu_seconds = ((double)cv::getTickCount() - cpu_start) / cv::getTickFrequency();
    std::cout << "CPU运行remap函数的时间为:" << cpu_seconds * 1000 << "ms" << std::endl;

    return 0;
}
经过实际运行,在我电脑上速度快了15倍左右
2.在cuda上重写remap函数
这是在CSDN上的一篇文章中看到的代码。在我的实际应用中,图像的输入尺寸和输出尺寸并不相同,这段代码的变换结果是错误的;但在输入、输出尺寸相同时,结果是正确的。由于我最终使用了cv::cuda::remap,所以没有再去修改这个程序。
新建一个.cu文件,可以将其编译成静态库后使用,也可以不生成库、直接加入工程一起编译。
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <cuda_runtime_api.h>
#include <stdio.h>
#include <math.h>
/**
 * Bilinear remap of a packed 3-channel, 8-bit image.
 *
 * Launch layout: 2D grid of 2D blocks. Thread (x, y) produces destination pixel
 * (x, y); threads that fall outside dst_width x dst_height return immediately.
 * No shared memory is used.
 *
 * Memory layout expected by the indexing below:
 *   - src:   src_width * src_height * 3 bytes, rows tightly packed (no padding).
 *   - dst:   dst_width * dst_height * 3 bytes, rows tightly packed.
 *   - map_x, map_y: dst_width * dst_height floats, row-major; entry (x, y) is the
 *     source coordinate sampled for destination pixel (x, y).
 *
 * Border handling: when the source coordinate is outside
 * [0, src_width - 1] x [0, src_height - 1] (or is NaN) the destination pixel is
 * set to 0 in all three channels, so every in-range destination pixel is always
 * written. Coordinates lying exactly on the last row/column are sampled (the
 * neighbour index is clamped) rather than skipped.
 */
__global__ void remap_kernel(const unsigned char* src, int src_width, int src_height,
                             unsigned char* dst, int dst_width, int dst_height,
                             const float* map_x, const float* map_y)
{
    const int x = threadIdx.x + blockIdx.x * blockDim.x;
    const int y = threadIdx.y + blockIdx.y * blockDim.y;
    if (x >= dst_width || y >= dst_height)
        return;

    // size_t keeps the flat offsets from overflowing int on very large images.
    const size_t pixel = (size_t)y * (size_t)dst_width + (size_t)x;
    unsigned char* out = dst + pixel * 3;

    const float src_x = map_x[pixel];
    const float src_y = map_y[pixel];

    // Written as a negated "inside" test so that NaN coordinates also take the
    // border path (every comparison with NaN is false).
    const bool inside = src_x >= 0.0f && src_x <= (float)(src_width - 1) &&
                        src_y >= 0.0f && src_y <= (float)(src_height - 1);
    if (!inside) {
        out[0] = 0;
        out[1] = 0;
        out[2] = 0;
        return;
    }

    const int x0 = (int)floorf(src_x);
    const int y0 = (int)floorf(src_y);
    // Clamp the right/bottom neighbour: on the last row/column the fractional
    // weight is 0, so the clamped sample contributes nothing but stays in bounds.
    const int x1 = min(x0 + 1, src_width - 1);
    const int y1 = min(y0 + 1, src_height - 1);

    const float tx = src_x - (float)x0;
    const float ty = src_y - (float)y0;

    const unsigned char* p00 = src + ((size_t)y0 * (size_t)src_width + (size_t)x0) * 3;
    const unsigned char* p10 = src + ((size_t)y0 * (size_t)src_width + (size_t)x1) * 3;
    const unsigned char* p01 = src + ((size_t)y1 * (size_t)src_width + (size_t)x0) * 3;
    const unsigned char* p11 = src + ((size_t)y1 * (size_t)src_width + (size_t)x1) * 3;

    for (int c = 0; c < 3; c++) {
        const float top    = p00[c] * (1.0f - tx) + p10[c] * tx;
        const float bottom = p01[c] * (1.0f - tx) + p11[c] * tx;
        const float value  = top * (1.0f - ty) + bottom * ty;
        // value is a convex combination of bytes, i.e. within [0, 255];
        // adding 0.5 rounds to nearest instead of truncating toward zero.
        out[c] = static_cast<unsigned char>(value + 0.5f);
    }
}
/**
 * Host wrapper: remaps a packed 3-channel 8-bit image on the GPU.
 *
 * Copies `in`, `map_x` and `map_y` to the device, launches remap_kernel and
 * copies the result back into `out`. All device buffers are allocated and
 * released inside this call. Work is issued on the default stream and the
 * final cudaMemcpy is blocking, so `out` is complete when the function returns.
 *
 * @param in         host pointer, in_width * in_height * 3 bytes
 * @param in_width   source width in pixels
 * @param in_height  source height in pixels
 * @param out        host pointer, receives out_width * out_height * 3 bytes
 * @param out_width  destination width in pixels
 * @param out_height destination height in pixels
 * @param map_x      host pointer, out_width * out_height floats (source x per output pixel)
 * @param map_y      host pointer, out_width * out_height floats (source y per output pixel)
 *
 * The function returns void, so failures (invalid arguments or any CUDA error)
 * are reported on stderr; in that case `out` is left untouched or only
 * partially meaningful and must not be relied upon.
 */
extern "C" void remap_gpu(const unsigned char* in, int in_width, int in_height,
                          unsigned char* out, int out_width, int out_height,
                          const float* map_x, const float* map_y) {
    if (in == NULL || out == NULL || map_x == NULL || map_y == NULL ||
        in_width <= 0 || in_height <= 0 || out_width <= 0 || out_height <= 0) {
        fprintf(stderr, "remap_gpu: invalid argument (null pointer or non-positive size)\n");
        return;
    }

    // Byte counts are computed in size_t so large images cannot overflow int.
    const size_t in_bytes   = (size_t)in_width * (size_t)in_height * 3;
    const size_t out_pixels = (size_t)out_width * (size_t)out_height;
    const size_t out_bytes  = out_pixels * 3;
    const size_t map_bytes  = out_pixels * sizeof(float);

    unsigned char* d_in = NULL;
    unsigned char* d_out = NULL;
    float* d_map_x = NULL;
    float* d_map_y = NULL;

    // Single-pass do/while: the first failing call breaks out to the shared
    // cleanup below, so nothing is leaked on an error path.
    cudaError_t err = cudaSuccess;
    do {
        if ((err = cudaMalloc((void**)&d_in, in_bytes)) != cudaSuccess) break;
        if ((err = cudaMalloc((void**)&d_out, out_bytes)) != cudaSuccess) break;
        if ((err = cudaMalloc((void**)&d_map_x, map_bytes)) != cudaSuccess) break;
        if ((err = cudaMalloc((void**)&d_map_y, map_bytes)) != cudaSuccess) break;

        // cudaMalloc does not clear memory; zero the output so any pixel the
        // kernel does not write comes back as black rather than garbage.
        if ((err = cudaMemset(d_out, 0, out_bytes)) != cudaSuccess) break;

        if ((err = cudaMemcpy(d_in, in, in_bytes, cudaMemcpyHostToDevice)) != cudaSuccess) break;
        if ((err = cudaMemcpy(d_map_x, map_x, map_bytes, cudaMemcpyHostToDevice)) != cudaSuccess) break;
        if ((err = cudaMemcpy(d_map_y, map_y, map_bytes, cudaMemcpyHostToDevice)) != cudaSuccess) break;

        // 32 x 8 = 256 threads per block: a full warp spans one row segment.
        const dim3 block(32, 8, 1);
        const dim3 grid((out_width + block.x - 1) / block.x,
                        (out_height + block.y - 1) / block.y, 1);
        remap_kernel<<<grid, block>>>(d_in, in_width, in_height,
                                      d_out, out_width, out_height,
                                      d_map_x, d_map_y);
        // A kernel launch returns nothing; launch-configuration errors are
        // only visible through cudaGetLastError().
        if ((err = cudaGetLastError()) != cudaSuccess) break;

        // Blocking copy: also surfaces any fault raised while the kernel ran.
        err = cudaMemcpy(out, d_out, out_bytes, cudaMemcpyDeviceToHost);
    } while (0);

    if (err != cudaSuccess) {
        fprintf(stderr, "remap_gpu: CUDA error: %s\n", cudaGetErrorString(err));
    }

    // cudaFree accepts null pointers, so this is safe after a partial failure.
    cudaFree(d_in);
    cudaFree(d_out);
    cudaFree(d_map_x);
    cudaFree(d_map_y);
}
再新建一个.cpp文件
#include <iostream>
#include <opencv2/opencv.hpp>
using namespace cv;
// Declaration of the GPU remap entry point; declared with C linkage so the name
// is not mangled. `in`/`out` are passed below as packed 3-channel 8-bit image
// buffers, and map_x/map_y as float lookup tables sized like the output image.
extern "C" void remap_gpu(const unsigned char* in, int in_width, int in_height,
unsigned char* out, int out_width, int out_height,
const float* map_x, const float* map_y);
// Benchmark: run the same bilinear remap once with cv::remap on the CPU and once
// with the hand-written remap_gpu(), printing each elapsed time in milliseconds.
int main(int argc, char** argv)
{
    cv::Mat img = imread("image.jpg", IMREAD_COLOR);
    if (img.empty())
    {
        std::cout << "Could not open the input image" << std::endl;
        exit(1);
    }
    int in_width = img.cols;
    int in_height = img.rows;
    cv::Mat map_x(in_height, in_width, CV_32FC1);
    cv::Mat map_y(in_height, in_width, CV_32FC1);
    // Build the lookup tables: every output pixel samples the source 20 columns
    // to its right, on the same row.
    for (int y = 0; y < in_height; y++)
    {
        for (int x = 0; x < in_width; x++)
        {
            map_x.at<float>(y, x) = (x + 20) / (float)in_width * in_width;
            map_y.at<float>(y, x) = y / (float)in_height * in_height;
        }
    }

    // --- CPU reference ---
    double time0 = static_cast<double>(cv::getTickCount()); // start timestamp
    cv::Mat CPUimage;
    remap(img, CPUimage, map_x, map_y, cv::INTER_LINEAR, cv::BORDER_CONSTANT, cv::Scalar(0, 0, 0));
    time0 = ((double)cv::getTickCount() - time0) / cv::getTickFrequency();
    std::cout << "CPU 运行remap函数时间为:" << time0 * 1000 << "ms" << std::endl;

    // --- GPU path ---
    int out_width = in_width;
    int out_height = in_height;
    // Size computed in size_t: width * height * 3 can overflow int on large images.
    const size_t out_bytes = (size_t)out_width * (size_t)out_height * 3;
    unsigned char* out = (unsigned char*)malloc(out_bytes);
    if (out == NULL)
    {
        std::cout << "Could not allocate the output buffer" << std::endl;
        exit(1);
    }

    double time1 = static_cast<double>(cv::getTickCount()); // start timestamp
    // Raw data pointers are handed to remap_gpu, which indexes them as tightly
    // packed rows. NOTE(review): assumes img, map_x and map_y are continuous
    // (no row padding) — confirm with isContinuous() if the inputs ever become ROIs.
    unsigned char* in = (unsigned char*)img.data;
    remap_gpu(in, in_width, in_height, out, out_width, out_height, (float*)map_x.data, (float*)map_y.data);
    // Header-only wrapper around `out`; it does not own or copy the pixels, so
    // it must not be used after free(out) below.
    cv::Mat GPUimage(out_height, out_width, CV_8UC3, out);
    time1 = ((double)cv::getTickCount() - time1) / cv::getTickFrequency();
    std::cout << "GPU 运行remap函数时间为:" << time1 * 1000 << "ms" << std::endl;

    free(out);
    return 0;
}
只运行一帧时cpu上运行的remap较快,运行多帧时,GPU上运行的remap函数要比CPU上运行快5倍左右
总结
如果自己编译的opencv带cuda,最好还是使用cv::cuda::remap函数,耗时较少
欢迎来到由智源人工智能研究院发起的Triton中文社区,这里是一个汇聚了AI开发者、数据科学家、机器学习爱好者以及业界专家的活力平台。我们致力于成为业内领先的Triton技术交流与应用分享的殿堂,为推动人工智能技术的普及与深化应用贡献力量。
更多推荐
已为社区贡献1条内容
所有评论(0)