OpenACC parallel
2024-10-13 06:27:55
▶ 使用 kernels 导语并行化 for 循环
● 同一段代码,使用 kernels,parallel 和 parallel + loop 进行对比
#include <stdio.h>
#include <time.h>
#ifdef _OPENACC
#include <openacc.h> /* OpenACC runtime header; only pulled in when an OpenACC compiler is used */
#endif

/*
 * Length of the three vectors.
 * NOTE(review): the original numeric literal was lost when this listing was
 * extracted; 1024 is a placeholder -- confirm the intended size.
 */
const int row = 1024;

/*
 * Adds two integer vectors element-wise (c = a + b) and prints how long the
 * addition loop took, in milliseconds.
 *
 * When built with an OpenACC compiler (_OPENACC defined) the loop is placed
 * under the directive selected below; otherwise the same loop runs on the
 * host without any directive. The program then waits for a character on
 * stdin so a console window stays open.
 *
 * Returns 0.
 */
int main(void)
{
    int i, a[row], b[row], c[row];
    clock_t ticks;
    long elapsed_ms;

    /* Fill both inputs with their own index. */
    for (i = 0; i < row; i++) {
        a[i] = b[i] = i;
    }

#ifdef _OPENACC
    ticks = clock();
    /*
     * Directive under test. The article compares three variants of this
     * line: "kernels" (active here), "parallel" alone, and "parallel"
     * followed by "#pragma acc loop".
     */
#pragma acc kernels
    for (i = 0; i < row; i++) {
        c[i] = a[i] + b[i];
    }
    ticks = clock() - ticks;
    /* clock() counts in CLOCKS_PER_SEC units, which is not 1000 everywhere. */
    elapsed_ms = (long)((double)ticks * 1000.0 / CLOCKS_PER_SEC);
    printf("\nTime with acc:%ld ms\n", elapsed_ms);
#else
    ticks = clock();
    for (i = 0; i < row; i++) {
        c[i] = a[i] + b[i];
    }
    ticks = clock() - ticks;
    /* clock() counts in CLOCKS_PER_SEC units, which is not 1000 everywhere. */
    elapsed_ms = (long)((double)ticks * 1000.0 / CLOCKS_PER_SEC);
    printf("\nTime without acc:%ld ms\n", elapsed_ms);
#endif
    getchar();
    return 0;
}
● 输出结果
D:\Code\OpenACC\OpenACCProject\OpenACCProject>pgcc -acc -Minfo main.c -o main_acc_kernels.exe // kernels
main:
, Generating implicit copyin(b[:row])
Generating implicit copyout(c[:row])
Generating implicit copyin(a[:row])
, Loop is parallelizable
Accelerator kernel generated
Generating Tesla code
, #pragma acc loop gang, vector(128) /* blockIdx.x threadIdx.x */

D:\Code\OpenACC\OpenACCProject\OpenACCProject>pgcc -acc -Minfo main.c -o main_acc_parallel.exe // parallel
main:
, Accelerator kernel generated
Generating Tesla code
, #pragma acc loop vector(128) /* threadIdx.x */
, Generating implicit copyout(c[:row])
Generating implicit copyin(b[:row],a[:row])
, Loop is parallelizable

D:\Code\OpenACC\OpenACCProject\OpenACCProject>pgcc -acc -Minfo main.c -o main_acc_parallel_loop.exe // parallel + loop
main:
, Accelerator kernel generated
Generating Tesla code
, #pragma acc loop gang, vector(128) /* blockIdx.x threadIdx.x */
, Generating implicit copyout(c[:row])
Generating implicit copyin(b[:row],a[:row])

D:\Code\OpenACC\OpenACCProject\OpenACCProject>main_acc_kernels.exe
launch CUDA kernel file=C:/Program Files (x86)/Windows Kits//Include/10.0.16299.0/ucrt\time.h function=main
line= device= threadid= num_gangs= num_workers= vector_length= grid= block= // 多个 gang,自动配置,线程网格全都是一维的

Time with acc: ms

D:\Code\OpenACC\OpenACCProject\OpenACCProject>main_acc_parallel.exe
launch CUDA kernel file=C:/Program Files (x86)/Windows Kits//Include/10.0.16299.0/ucrt\time.h function=main
line= device= threadid= num_gangs= num_workers= vector_length= grid= block= // 一个 gang,gang 冗余模式

Time with acc: ms

D:\Code\OpenACC\OpenACCProject\OpenACCProject>main_acc_parallel_loop.exe
launch CUDA kernel file=C:/Program Files (x86)/Windows Kits//Include/10.0.16299.0/ucrt\time.h function=main
line= device= threadid= num_gangs= num_workers= vector_length= grid= block= // 多个 gang,gang 分裂模式

Time with acc: ms
● 二重循环,考虑是否在内层循环中使用 loop 导语
#include <stdio.h>
#include <time.h>
#ifdef _OPENACC
#include <openacc.h> /* OpenACC runtime header; only pulled in when an OpenACC compiler is used */
#endif

/*
 * Matrix dimensions.
 * NOTE(review): the original numeric literals were lost when this listing
 * was extracted; 128 x 128 are placeholders chosen to keep the three
 * automatic arrays small -- confirm the intended sizes.
 */
const int row = 128, col = 128;

/*
 * Adds two integer matrices element-wise (c = a + b) and prints how long
 * the doubly nested addition loop took, in milliseconds.
 *
 * When built with an OpenACC compiler (_OPENACC defined) the loop nest sits
 * in a "parallel" region with a "loop" directive on the outer loop;
 * otherwise the same nest runs on the host without any directive. The
 * program then waits for a character on stdin.
 *
 * Returns 0.
 */
int main(void)
{
    int i, j, a[row][col], b[row][col], c[row][col];
    clock_t ticks;
    long elapsed_ms;

    /* Fill both inputs with the sum of their indices. */
    for (i = 0; i < row; i++) {
        for (j = 0; j < col; j++) {
            a[i][j] = b[i][j] = i + j;
        }
    }

#ifdef _OPENACC
    ticks = clock();
#pragma acc parallel
#pragma acc loop
    for (i = 0; i < row; i++) {
        /*
         * Variant compared in the article: also put "#pragma acc loop" on
         * this inner loop.
         */
        for (j = 0; j < col; j++) {
            c[i][j] = a[i][j] + b[i][j];
        }
    }
    ticks = clock() - ticks;
    /* clock() counts in CLOCKS_PER_SEC units, which is not 1000 everywhere. */
    elapsed_ms = (long)((double)ticks * 1000.0 / CLOCKS_PER_SEC);
    printf("\nTime with acc:%ld ms\n", elapsed_ms);
#else
    ticks = clock();
    for (i = 0; i < row; i++) {
        for (j = 0; j < col; j++) {
            c[i][j] = a[i][j] + b[i][j];
        }
    }
    ticks = clock() - ticks;
    /* clock() counts in CLOCKS_PER_SEC units, which is not 1000 everywhere. */
    elapsed_ms = (long)((double)ticks * 1000.0 / CLOCKS_PER_SEC);
    printf("\nTime without acc:%ld ms\n", elapsed_ms);
#endif
    getchar();
    return 0;
}
● 输出结果
D:\Code\OpenACC\OpenACCProject\OpenACCProject>pgcc -acc -Minfo main.c -o main_acc_loop1.exe // 仅使用外层 loop
main:
, Accelerator kernel generated
Generating Tesla code
, #pragma acc loop gang /* blockIdx.x */
, #pragma acc loop vector(128) /* threadIdx.x */
, Generating implicit copyin(a[:row][:col])
Generating implicit copyout(c[:row][:col])
Generating implicit copyin(b[:row][:col])
, Loop is parallelizable

D:\Code\OpenACC\OpenACCProject\OpenACCProject>pgcc -acc -Minfo main.c -o main_acc_loop2.exe // 内外都使用 loop,优化结果完全相同
main:
, Accelerator kernel generated
Generating Tesla code
, #pragma acc loop gang /* blockIdx.x */
, #pragma acc loop vector(128) /* threadIdx.x */
, Generating implicit copyin(a[:row][:col])
Generating implicit copyout(c[:row][:col])
Generating implicit copyin(b[:row][:col])
, Loop is parallelizable

D:\Code\OpenACC\OpenACCProject\OpenACCProject>main_acc_loop1.exe
launch CUDA kernel file=C:/Program Files (x86)/Windows Kits//Include/10.0.16299.0/ucrt\time.h function=main
line= device= threadid= num_gangs= num_workers= vector_length= grid= block=

Time with acc: ms

D:\Code\OpenACC\OpenACCProject\OpenACCProject>main_acc_loop2.exe
launch CUDA kernel file=C:/Program Files (x86)/Windows Kits//Include/10.0.16299.0/ucrt\time.h function=main
line= device= threadid= num_gangs= num_workers= vector_length= grid= block= // 优化结果完全相同

Time with acc: ms
● 三重循环,无论仅使用外循环 loop、外中循环 loop,还是外中内循环 loop,获得的编译和运行结果都是相同的,只放上来一个进行讨论
#include <stdio.h>
#include <time.h>
#ifdef _OPENACC
#include <openacc.h> /* OpenACC runtime header; only pulled in when an OpenACC compiler is used */
#endif

/*
 * Dimensions of the 3-D arrays.
 * NOTE(review): the original numeric literals were lost when this listing
 * was extracted; 16 x 16 x 128 are placeholders chosen to keep the three
 * automatic arrays small -- confirm the intended sizes.
 */
const int row = 16, col = 16, page = 128;

/*
 * Adds two 3-D integer arrays element-wise (c = a + b) and prints how long
 * the triply nested addition loop took, in milliseconds.
 *
 * When built with an OpenACC compiler (_OPENACC defined) the loop nest sits
 * in a "parallel" region with a "loop" directive on the outermost loop;
 * otherwise the same nest runs on the host without any directive. The
 * program then waits for a character on stdin.
 *
 * Returns 0.
 */
int main(void)
{
    int i, j, k, a[row][col][page], b[row][col][page], c[row][col][page];
    clock_t ticks;
    long elapsed_ms;

    /* Fill both inputs with the sum of their indices. */
    for (i = 0; i < row; i++) {
        for (j = 0; j < col; j++) {
            for (k = 0; k < page; k++) {
                a[i][j][k] = b[i][j][k] = i + j + k;
            }
        }
    }

#ifdef _OPENACC
    ticks = clock();
#pragma acc parallel
#pragma acc loop
    for (i = 0; i < row; i++) {
        /*
         * Variants compared in the article: also put "#pragma acc loop" on
         * the middle loop, or on both the middle and the innermost loop.
         */
        for (j = 0; j < col; j++) {
            for (k = 0; k < page; k++) {
                c[i][j][k] = a[i][j][k] + b[i][j][k];
            }
        }
    }
    ticks = clock() - ticks;
    /* clock() counts in CLOCKS_PER_SEC units, which is not 1000 everywhere. */
    elapsed_ms = (long)((double)ticks * 1000.0 / CLOCKS_PER_SEC);
    printf("\nTime with acc:%ld ms\n", elapsed_ms);
#else
    ticks = clock();
    for (i = 0; i < row; i++) {
        for (j = 0; j < col; j++) {
            for (k = 0; k < page; k++) {
                c[i][j][k] = a[i][j][k] + b[i][j][k];
            }
        }
    }
    ticks = clock() - ticks;
    /* clock() counts in CLOCKS_PER_SEC units, which is not 1000 everywhere. */
    elapsed_ms = (long)((double)ticks * 1000.0 / CLOCKS_PER_SEC);
    printf("\nTime without acc:%ld ms\n", elapsed_ms);
#endif
    getchar();
    return 0;
}
● 输出结果
D:\Code\OpenACC\OpenACCProject\OpenACCProject>pgcc -acc -Minfo main.c -o main_acc_loop.exe
main:
, Accelerator kernel generated
Generating Tesla code
, #pragma acc loop gang /* blockIdx.x */ // 外层循环和内层循环被并行化,但中间层循环是串行执行的
, #pragma acc loop seq
, #pragma acc loop vector(128) /* threadIdx.x */
, Generating implicit copyout(c[:row][:col][:page])
Generating implicit copyin(b[:row][:col][:page],a[:row][:col][:page])
, Loop is parallelizable
, Loop is parallelizable

D:\Code\OpenACC\OpenACCProject\OpenACCProject>main_acc_loop1.exe
launch CUDA kernel file=C:/Program Files (x86)/Windows Kits//Include/10.0.16299.0/ucrt\time.h function=main
line= device= threadid= num_gangs= num_workers= vector_length= grid= block=

Time with acc: ms
最新文章
- js数组方法
- 【Java心得总结七】Java容器下——Map
- mysql 数据库故障通过备份恢复模拟
- SSM三大框架(转发)
- php 读取文件readfile
- JavaScript 中2个等号与3个等号的区别
- DIV重叠 如何优先显示(div浮在重叠的div上面)
- iOS--UISearchBar和UISearchDisplayController
- Knockout.Js官网学习(数组observable)
- eclipse,tomcat部署web项目,以及本地文件访问
- global关键字修改全局变量
- Python后台开发Django(会话控制)
- PHP curl Post请求和Get请求~
- Python的Django
- iOS事件拦截及应用
- Xgboost调参总结
- Spring Boot 揭秘与实战(七) 实用技术篇 - StateMachine 状态机机制
- 【CLR Via C#】15 枚举类型与位类型
- va_start(),va_end()函数应用【转】
- slideout