OpenCV CUDA 加速官方教程 2：Using a cv::cuda::GpuMat with thrust

原文地址 Global：Thrust 是一个功能非常强大的库，提供各种 CUDA 加速算法。然而，Thrust 是为向量（vector）设计的，而不是为带行间距（pitched）的矩阵设计的。下面的教程将讨论如何将 cv::cuda::GpuMat 包装成可用于 Thrust 算法的 Thrust 迭代器。本教程将向您展示如何：将 GpuMat 包装到 Thrust 迭代器中；用随机数填充 GpuMat；对 GpuMat 的一列进行排序；将大于 0 的值复制到新的 GPU 矩阵；将流（stream）与 Thrust 配合使用。Wrapping a GpuMat

日暮乡关何处是

651人浏览 · 2021-06-05 12:39:58

日暮乡关何处是 · 2021-06-05 12:39:58 发布

原文地址

Global

Thrust 是一个功能非常强大的库，提供各种 CUDA 加速算法。然而，Thrust 是为向量（vector）设计的，而不是为带行间距（pitched）的矩阵设计的。下面的教程将讨论如何将 cv::cuda::GpuMat 包装成可用于 Thrust 算法的 Thrust 迭代器。
本教程将向您展示如何：

将 GpuMat 包装到 Thrust 迭代器中
用随机数填充 GpuMat
对 GpuMat 的一列进行就地排序
将大于 0 的值复制到新的 GPU 矩阵
将流（stream）与 Thrust 配合使用

Wrapping a GpuMat into a thrust iterator

下面的代码将为GpuMat生成一个迭代器

/*
    @Brief GpuMatBeginItr returns a thrust compatible iterator to the beginning of a GPU mat's memory.
           The iterator combines a device pointer to the first element of the selected channel with a
           step_functor index transform, so incrementing it visits consecutive pixels of that channel
           using the row stride taken from mat.step.
    @Param mat is the input matrix.  Its depth must match T, otherwise the assertion fails.
    @Param channel is the channel of the matrix that the iterator is accessing.  If set to -1, the matrix
           is reshaped to a single channel and the iterator will access every element in sequential order.
           Any other negative value, or a value >= mat.channels(), fails the assertion.
    @Return a permutation iterator positioned at logical element 0 of the requested channel.
*/
template<typename T>
thrust::permutation_iterator<thrust::device_ptr<T>, thrust::transform_iterator<step_functor<T>, thrust::counting_iterator<int>>>  GpuMatBeginItr(cv::cuda::GpuMat mat, int channel = 0)
{
    if (channel == -1)
    {
        // mat is a by-value parameter, so reassigning it here does not modify the caller's object.
        mat = mat.reshape(1);
        channel = 0;
    }
    CV_Assert(mat.depth() == cv::DataType<T>::depth);
    // -1 is the only valid negative value and has been normalised above; any other negative
    // channel would move the base pointer in front of the matrix data.
    CV_Assert(channel >= 0 && channel < mat.channels());
    // mat.step is expressed in bytes, while the functor works in units of T.
    const int stepInElements = static_cast<int>(mat.step / sizeof(T));
    return thrust::make_permutation_iterator(
        thrust::device_pointer_cast(mat.ptr<T>(0) + channel),
        thrust::make_transform_iterator(thrust::make_counting_iterator(0),
                                        step_functor<T>(mat.cols, stepInElements, mat.channels())));
}

/*
    @Brief GpuMatEndItr returns a thrust compatible iterator to the end of a GPU mat's memory.
           It is built exactly like the begin iterator (same base pointer and step_functor), except that
           the underlying counting iterator starts at mat.rows * mat.cols, i.e. one past the last pixel.
    @Param mat is the input matrix.  Its depth must match T, otherwise the assertion fails.
    @Param channel is the channel of the matrix that the iterator is accessing.  If set to -1, the matrix
           is reshaped to a single channel and the iterator will access every element in sequential order.
           Any other negative value, or a value >= mat.channels(), fails the assertion.
    @Return a permutation iterator positioned at logical element rows * cols of the requested channel.
*/
template<typename T>
thrust::permutation_iterator<thrust::device_ptr<T>, thrust::transform_iterator<step_functor<T>, thrust::counting_iterator<int>>>  GpuMatEndItr(cv::cuda::GpuMat mat, int channel = 0)
{
    if (channel == -1)
    {
        // mat is a by-value parameter, so reassigning it here does not modify the caller's object.
        mat = mat.reshape(1);
        channel = 0;
    }
    CV_Assert(mat.depth() == cv::DataType<T>::depth);
    // -1 is the only valid negative value and has been normalised above; any other negative
    // channel would move the base pointer in front of the matrix data.
    CV_Assert(channel >= 0 && channel < mat.channels());
    // mat.step is expressed in bytes, while the functor works in units of T.
    const int stepInElements = static_cast<int>(mat.step / sizeof(T));
    return thrust::make_permutation_iterator(
        thrust::device_pointer_cast(mat.ptr<T>(0) + channel),
        thrust::make_transform_iterator(thrust::make_counting_iterator(mat.rows * mat.cols),
                                        step_functor<T>(mat.cols, stepInElements, mat.channels())));
}

我们的目标是得到这样一个迭代器：它从矩阵的起始位置开始，并能正确地递增以依次访问连续的矩阵元素。对于一段连续的行来说这很简单，但对于带行间距（pitched）矩阵的一列呢？为此，迭代器需要知道矩阵的维度和步长（step）。这些信息被封装在 step_functor 中。

/*
    @Brief step_functor maps a linear pixel index onto an offset, measured in elements of T, from the
           start of a matrix whose rows begin `step` elements apart and whose pixels each hold
           `channels` interleaved values.
*/
template<typename T> struct step_functor : public thrust::unary_function<int, int>
{
    int columns;   // pixels per row
    int step;      // distance between the starts of consecutive rows, in elements of T
    int channels;  // interleaved values per pixel

    // Build the functor from explicit dimensions.
    __host__ __device__ step_functor(int columns_, int step_, int channels_ = 1)
        : columns(columns_), step(step_), channels(channels_)
    {
    }

    // Build the functor from a matrix; asserts that the matrix depth matches T.
    __host__ step_functor(cv::cuda::GpuMat& mat)
    {
        CV_Assert(mat.depth() == cv::DataType<T>::depth);
        channels = mat.channels();
        step = mat.step / sizeof(T);   // byte stride converted to an element stride
        columns = mat.cols;
    }

    // Translate linear index x into the element offset of pixel (x / columns, x % columns).
    __host__ __device__
    int operator()(int x) const
    {
        const int rowIndex = x / columns;
        const int colIndex = x % columns;
        return (rowIndex * step) + colIndex * channels;
    }
};

step_functor 接受一个索引值，并返回相对于矩阵起始位置的相应偏移量。计数迭代器（counting_iterator）只是在像素元素的范围内逐一递增。把二者组合成 transform_iterator 后，我们就得到了一个从 0 计数到 M*N、并能按 GpuMat 的带行间距内存布局正确递增的迭代器。不过，它还不包含任何内存位置信息，为此我们需要一个 thrust::device_ptr。将设备指针与 transform_iterator 结合起来，就可以让 Thrust 指向矩阵的第一个元素，并按相应的步长前进。

Fill a GpuMat with random numbers

现在我们有了一些用来为 Thrust 生成迭代器的好用函数，让我们用它们来做一些 OpenCV 做不到的事情。遗憾的是，在撰写本文时，OpenCV 还没有任何 GPU 随机数生成功能。幸运的是，Thrust 提供了这一功能，而且现在两者之间的互操作已经非常简单。示例取自 http://stackoverflow.com/questions/12614164/generating-a-random-number-vector-between-0-and-1-0-using-thrust
首先，我们需要编写一个函子来生成我们的随机值。

// Functor that turns an integer n into a float drawn from a uniform distribution parameterised by a and b.
// Every call default-constructs the engine and discards n draws first, so the result is
// determined by n, a and b alone.
struct prg
{
  float a, b;

  __host__ __device__
  prg(float lo = 0.f, float hi = 1.f) : a(lo), b(hi) {}

  __host__ __device__
  float operator()(const unsigned int n) const
  {
    thrust::uniform_real_distribution<float> uniform(a, b);
    thrust::default_random_engine engine;
    // Skip ahead so that different n values read different points of the engine's sequence.
    engine.discard(n);
    return uniform(engine);
  }
};

这个函子接受一个整数值，并输出一个介于 a 和 b 之间的值。现在我们将通过 thrust::transform 用 -1 到 1 之间的值填充我们的矩阵（对应下面代码中的 prg(-1, 1)）。

  {
    // One row of 100 single-channel floats
    cv::cuda::GpuMat d_value(1, 100, CV_32F);
    auto valueBegin = GpuMatBeginItr<float>(d_value);
    auto valueEnd = GpuMatEndItr<float>(d_value);
    // Feed the indices [0, cols) through prg and write each result through the matrix iterator
    auto firstIndex = thrust::make_counting_iterator(0);
    auto lastIndex = thrust::make_counting_iterator(d_value.cols);
    thrust::transform(firstIndex, lastIndex, valueBegin, prg(-1, 1));
    // Construct a host-side cv::Mat from the device matrix
    cv::Mat h_value(d_value);
  }

Sort a column of a GpuMat in place

让我们用随机值和索引填充矩阵元素。之后我们将对随机数和索引进行排序。

  {
    // One row of 100 two-channel int pixels: channel 0 holds indices, channel 1 holds sort keys
    cv::cuda::GpuMat d_data(1, 100, CV_32SC2);
    // Iterators over channel 0 (the index channel)
    auto idxBegin = GpuMatBeginItr<int>(d_data, 0);
    auto idxEnd = GpuMatEndItr<int>(d_data, 0);
    // Iterators over channel 1 (the key channel)
    auto keyBegin = GpuMatBeginItr<int>(d_data, 1);
    auto keyEnd = GpuMatEndItr<int>(d_data, 1);
    // Write consecutive integers starting at 0 into the index channel
    thrust::sequence(idxBegin, idxEnd);
    // Fill the key channel from prg(0, 10).  The counting iterators supply one integer per
    // location as the argument of prg::operator(); its float result is stored into an int channel.
    auto firstIndex = thrust::make_counting_iterator(0);
    auto lastIndex = thrust::make_counting_iterator(d_data.cols);
    thrust::transform(firstIndex, lastIndex, keyBegin, prg(0, 10));
    // Sort by key, permuting the index channel alongside so each key keeps its index
    thrust::sort_by_key(keyBegin, keyEnd, idxBegin);
    // Construct a host-side cv::Mat from the device matrix
    cv::Mat h_idx(d_data);
  }

Copy values greater than 0 to a new gpu matrix while using streams

在这个例子中，我们将看到如何把 cv::cuda::Stream 与 Thrust 一起使用。遗憾的是，这个示例用到的函数必须把结果返回给 CPU，因此它并不是流的最佳用法。

  {
    cv::cuda::GpuMat d_value(1, 100, CV_32F);
    auto valueBegin = GpuMatBeginItr<float>(d_value);
    auto valueEnd = GpuMatEndItr<float>(d_value);
    cv::cuda::Stream stream;
    // Execution policy bound to the CUDA stream wrapped by `stream`; shared by every thrust call below
    auto onStream = thrust::system::cuda::par.on(cv::cuda::StreamAccessor::getStream(stream));
    // Predicate selecting values strictly greater than zero
    const pred_greater<float> isPositive(0.0);
    //! [random_gen_stream]
    // Random fill as in the earlier example, but issued with the stream execution policy
    thrust::transform(onStream,
                      thrust::make_counting_iterator(0),
                      thrust::make_counting_iterator(d_value.cols),
                      valueBegin,
                      prg(-1, 1));
    //! [random_gen_stream]
    // Number of elements that will be copied
    int count = thrust::count_if(onStream, valueBegin, valueEnd, isPositive);
    // Destination sized to hold exactly the matching elements
    cv::cuda::GpuMat d_valueGreater(1, count, CV_32F);
    // Copy only the elements accepted by the predicate
    thrust::copy_if(onStream, valueBegin, valueEnd, GpuMatBeginItr<float>(d_valueGreater), isPositive);
    // Construct a host-side cv::Mat from the device result
    cv::Mat h_greater(d_valueGreater);
  }

首先，我们将在一个流上用 -1 到 1 之间随机生成的数据填充 GPU 矩阵。

// Random fill as in the earlier example, but issued with an execution policy bound to the stream
    thrust::transform(
        thrust::system::cuda::par.on(cv::cuda::StreamAccessor::getStream(stream)),
        thrust::make_counting_iterator(0),
        thrust::make_counting_iterator(d_value.cols),
        valueBegin,
        prg(-1, 1));

注意这里使用了 thrust::system::cuda::par.on(...)，它会创建一个执行策略，用于在指定的流上执行 Thrust 代码。随 CUDA Toolkit 一起分发的 Thrust 版本中存在一个错误，截至 7.5 版本该错误仍未修复。此错误会导致代码不在流上执行。不过，可以通过使用 Git 仓库中最新版本的 Thrust 来解决该问题（http://github.com/thrust/thrust.git）。接下来，我们将借助下面的谓词，使用 thrust::count_if 来统计有多少个值大于 0：

// Unary predicate: returns true when its argument compares greater than the stored `value`.
template<typename T> struct pred_greater
{
  T value;

  __host__ __device__ pred_greater(T threshold) : value(threshold) {}

  __host__ __device__ bool operator()(const T& val) const
  {
    const bool exceeds = val > value;
    return exceeds;
  }
};

我们将使用这些结果创建一个输出缓冲区来存储复制的值，然后使用具有相同谓词的copy_if来填充输出缓冲区。最后，我们将把这些值下载到CPU mat中以供查看。

Triton中文社区

欢迎来到由智源人工智能研究院发起的Triton中文社区，这里是一个汇聚了AI开发者、数据科学家、机器学习爱好者以及业界专家的活力平台。我们致力于成为业内领先的Triton技术交流与应用分享的殿堂，为推动人工智能技术的普及与深化应用贡献力量。

更多推荐

torch.device(‘cuda:0‘ if cuda else ‘cpu‘)

# 这个地方的cuda:0 实际上并不是0号GPU，他取决于CUDA_VISIBLE_DEVICES# 然后逻辑GPU和物理GPU有一个对应关系# 如果CUDA_VISIBLE_DEVICES为2,1,3# 那么CUDA:0就是2号GPU， CUDA:1 就是1号GPU CUDA:3 就是3号GPUreturn torch.device('cuda:0' if cuda else 'cpu')..