下面展示一些 内联代码片


// Different ways to allocate the host buffer. Exactly one allocation call is
// active; the alternatives are kept commented out for comparison.
// NOTE(review): M and N are not defined in this snippet — presumably the
// buffer dimensions declared by the surrounding code; confirm before reuse.

   cufftComplex* DBF_Result = NULL; // pointer to a cufftComplex buffer, set by the allocation call below
   // cudaMallocHost((void**)&DBF_Result, M * N * sizeof(cufftComplex)); // alternative: request page-locked host memory
    //cudaHostAlloc((void**)&DBF_Result, M * N * sizeof(cufftComplex), cudaHostAllocMapped);   // alternative: page-locked memory with the Mapped flag
   // cudaHostAlloc((void**)&DBF_Result, M * N * sizeof(cufftComplex), cudaHostAllocDefault);
    // Active variant: M * N elements requested with the write-combined flag.
    // The return code is not checked here.
    // NOTE(review): write-combined host memory is typically slow to read from
    // the CPU — if DBF_Result is read back on the host, confirm this flag is
    // really the intended choice.
    cudaHostAlloc((void**)&DBF_Result, M * N * sizeof(cufftComplex), cudaHostAllocWriteCombined);
   // cudaHostAlloc((void**)&DBF_Result, M * N * sizeof(cufftComplex), cudaHostAllocPortable);

// 1. Allocation method under test: cudaMallocManaged (unified / managed memory)

// 开辟内存的不同方法cudaMallocManaged

#include <stdio.h>
#include <windows.h>
#include"iostream"
#include"cuda_runtime_api.h"
#include"device_launch_parameters.h"
#include"cufft.h"
#include <stdio.h>
#include <windows.h>

int main()
{

    const int Nt = 1024*1024;
    const int BATCH = 1;
    cufftComplex* host_in, * host_out, * device_in, * device_out;

    LARGE_INTEGER frequency, start_time, end_time;
    double elapsed_time;

    QueryPerformanceFrequency(&frequency);

    QueryPerformanceCounter(&start_time);


    // create two events
    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);

    cudaEventRecord(start);

    cudaMallocManaged((void**)&host_in, Nt * sizeof(cufftComplex));
    cudaMallocManaged((void**)&host_out, Nt * sizeof(cufftComplex));
    cudaDeviceSynchronize();

     cudaEventRecord(stop);
     cudaEventSynchronize(stop);         // 等到stop event完成

     float time;
     cudaEventElapsedTime(&time, start, stop);
     cudaEventDestroy(start);// clean up the two events
     cudaEventDestroy(stop);
     printf("GPU Time = %g ms.\n", time);

    for (int i = 0; i < Nt; i++)
    {
        host_in[i].x = i + 1;
        host_in[i].y = 0;

    }
    /*host_in[3].x = 0;
    host_in[3].y = 0;*/
    cufftHandle cufftForwrdHandle;
   
    int m;

    cudaDeviceSynchronize();
   // for (j = 2 * 1024; j <= 1024 * 1024; j *= 2) {

        //cufftPlan1d(&cufftForwrdHandle, 2 * 1024, CUFFT_C2C, BATCH);
        cufftPlan1d(&cufftForwrdHandle, Nt, CUFFT_C2C, BATCH);

        // start = what_time_is_it_now();
    //    double start = GetTickCount64(); //计时器

        int loop = 10;
        for (m = 0; m < loop; m++) {
            //执行fft正变换
            cufftExecC2C(cufftForwrdHandle, host_in, host_out, CUFFT_FORWARD);
        }
        cudaDeviceSynchronize();
        /*QueryPerformanceCounter(&end_time);
        elapsed_time = (double)(end_time.QuadPart - start_time.QuadPart) / frequency.QuadPart;
        printf("Elapsed time: %f milliseconds\n", (elapsed_time * 1000) / loop);*/
        for (int i = 0; i < 4; i++) {
      //     printf("%f+%f\n", host_out[i].x, host_out[i].y);
        }
      return 0;
}

// 2. Allocation method under test: cudaMallocHost (page-locked host memory) with explicit device buffers

// 开辟内存的不同方法cudaMallocHost

 int main()
{
    const int Nt = 1024*1024;
    const int BATCH = 1;

    cufftComplex* host_in, * host_out, * device_in, * device_out;
    cudaMallocHost((void**)&host_in, Nt * sizeof(cufftComplex));
    cudaMallocHost((void**)&host_out, ( Nt+1) * sizeof(cufftComplex));

    for (int i = 0; i < Nt; i++)
    {
        host_in[i].x = i + 1;
        host_in[i].y = 0;

    }

    // create two events
    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);

    cudaEventRecord(start);

    //设备内存申请
    cudaMalloc((void**)&device_in, (Nt)  * sizeof(cufftComplex));
    cudaMalloc((void**)&device_out, (Nt) * sizeof(cufftComplex));
    //数据传输--H2D
    cudaMemcpy(device_in, host_in, Nt * sizeof(cufftComplex), cudaMemcpyHostToDevice);

     cudaEventRecord(stop);
     cudaEventSynchronize(stop);         // 等到stop event完成

     float time;
     cudaEventElapsedTime(&time, start, stop);
     cudaEventDestroy(start);// clean up the two events
     cudaEventDestroy(stop);
     printf("GPU Time = %g ms.\n", time);

    cufftHandle cufftForwrdHandle;
    {

        cufftPlan1d(&cufftForwrdHandle, Nt, CUFFT_C2C, BATCH);

        cufftExecC2C(cufftForwrdHandle, device_in, device_out, CUFFT_FORWARD);

        cudaMemcpy(host_out, device_out, Nt * sizeof(cufftComplex), cudaMemcpyDeviceToHost);


        for (int i = 0; i < 10; i++) {
            printf("%f+%f\n", host_out[i].x, host_out[i].y);
        }
    }


    return 0;
Logo

欢迎来到由智源人工智能研究院发起的Triton中文社区,这里是一个汇聚了AI开发者、数据科学家、机器学习爱好者以及业界专家的活力平台。我们致力于成为业内领先的Triton技术交流与应用分享的殿堂,为推动人工智能技术的普及与深化应用贡献力量。

更多推荐