cuda cudaMallocManaged 测试
1. 开辟内存的不同方法：cudaMallocManaged 测试。 2. 开辟内存的不同方法：cudaMallocHost。
·
下面展示一些内联代码片。
// Different ways to allocate the host-side buffer. Exactly one allocation call
// is active; the others are kept as commented-out alternatives for comparison.
// M and N are not defined in this snippet; the buffer holds M * N cufftComplex elements.
cufftComplex* DBF_Result = NULL; // pointer to cufftComplex elements, filled in by the allocation below
// cudaMallocHost((void**)&DBF_Result, M * N * sizeof(cufftComplex)); // allocate the memory
//cudaHostAlloc((void**)&DBF_Result, M * N * sizeof(cufftComplex), cudaHostAllocMapped); // page-locked (pinned) memory
// cudaHostAlloc((void**)&DBF_Result, M * N * sizeof(cufftComplex), cudaHostAllocDefault);
// NOTE(review): write-combined memory is usually slow for the host to read back;
// confirm that DBF_Result is only written by the host before keeping this flag.
cudaHostAlloc((void**)&DBF_Result, M * N * sizeof(cufftComplex), cudaHostAllocWriteCombined);
// cudaHostAlloc((void**)&DBF_Result, M * N * sizeof(cufftComplex), cudaHostAllocPortable);
//1. 开辟内存的不同方法cudaMallocManaged 测试
// 开辟内存的不同方法cudaMallocManaged
#include <stdio.h>
#include <windows.h>
#include"iostream"
#include"cuda_runtime_api.h"
#include"device_launch_parameters.h"
#include"cufft.h"
#include <stdio.h>
#include <windows.h>
int main()
{
const int Nt = 1024*1024;
const int BATCH = 1;
cufftComplex* host_in, * host_out, * device_in, * device_out;
LARGE_INTEGER frequency, start_time, end_time;
double elapsed_time;
QueryPerformanceFrequency(&frequency);
QueryPerformanceCounter(&start_time);
// create two events
cudaEvent_t start, stop;
cudaEventCreate(&start);
cudaEventCreate(&stop);
cudaEventRecord(start);
cudaMallocManaged((void**)&host_in, Nt * sizeof(cufftComplex));
cudaMallocManaged((void**)&host_out, Nt * sizeof(cufftComplex));
cudaDeviceSynchronize();
cudaEventRecord(stop);
cudaEventSynchronize(stop); // 等到stop event完成
float time;
cudaEventElapsedTime(&time, start, stop);
cudaEventDestroy(start);// clean up the two events
cudaEventDestroy(stop);
printf("GPU Time = %g ms.\n", time);
for (int i = 0; i < Nt; i++)
{
host_in[i].x = i + 1;
host_in[i].y = 0;
}
/*host_in[3].x = 0;
host_in[3].y = 0;*/
cufftHandle cufftForwrdHandle;
int m;
cudaDeviceSynchronize();
// for (j = 2 * 1024; j <= 1024 * 1024; j *= 2) {
//cufftPlan1d(&cufftForwrdHandle, 2 * 1024, CUFFT_C2C, BATCH);
cufftPlan1d(&cufftForwrdHandle, Nt, CUFFT_C2C, BATCH);
// start = what_time_is_it_now();
// double start = GetTickCount64(); //计时器
int loop = 10;
for (m = 0; m < loop; m++) {
//执行fft正变换
cufftExecC2C(cufftForwrdHandle, host_in, host_out, CUFFT_FORWARD);
}
cudaDeviceSynchronize();
/*QueryPerformanceCounter(&end_time);
elapsed_time = (double)(end_time.QuadPart - start_time.QuadPart) / frequency.QuadPart;
printf("Elapsed time: %f milliseconds\n", (elapsed_time * 1000) / loop);*/
for (int i = 0; i < 4; i++) {
// printf("%f+%f\n", host_out[i].x, host_out[i].y);
}
return 0;
}
// 2.开辟内存的不同方法cudaMallocHost
// 开辟内存的不同方法cudaMallocHost
int main()
{
const int Nt = 1024*1024;
const int BATCH = 1;
cufftComplex* host_in, * host_out, * device_in, * device_out;
cudaMallocHost((void**)&host_in, Nt * sizeof(cufftComplex));
cudaMallocHost((void**)&host_out, ( Nt+1) * sizeof(cufftComplex));
for (int i = 0; i < Nt; i++)
{
host_in[i].x = i + 1;
host_in[i].y = 0;
}
// create two events
cudaEvent_t start, stop;
cudaEventCreate(&start);
cudaEventCreate(&stop);
cudaEventRecord(start);
//设备内存申请
cudaMalloc((void**)&device_in, (Nt) * sizeof(cufftComplex));
cudaMalloc((void**)&device_out, (Nt) * sizeof(cufftComplex));
//数据传输--H2D
cudaMemcpy(device_in, host_in, Nt * sizeof(cufftComplex), cudaMemcpyHostToDevice);
cudaEventRecord(stop);
cudaEventSynchronize(stop); // 等到stop event完成
float time;
cudaEventElapsedTime(&time, start, stop);
cudaEventDestroy(start);// clean up the two events
cudaEventDestroy(stop);
printf("GPU Time = %g ms.\n", time);
cufftHandle cufftForwrdHandle;
{
cufftPlan1d(&cufftForwrdHandle, Nt, CUFFT_C2C, BATCH);
cufftExecC2C(cufftForwrdHandle, device_in, device_out, CUFFT_FORWARD);
cudaMemcpy(host_out, device_out, Nt * sizeof(cufftComplex), cudaMemcpyDeviceToHost);
for (int i = 0; i < 10; i++) {
printf("%f+%f\n", host_out[i].x, host_out[i].y);
}
}
return 0;
欢迎来到由智源人工智能研究院发起的Triton中文社区,这里是一个汇聚了AI开发者、数据科学家、机器学习爱好者以及业界专家的活力平台。我们致力于成为业内领先的Triton技术交流与应用分享的殿堂,为推动人工智能技术的普及与深化应用贡献力量。
更多推荐
已为社区贡献1条内容
所有评论(0)