// 调用CUDA kernel 是非阻塞的,调用kernel语句后面的语句不等待kernel执行完,立即执行。所以在 call_kernel(see kernel.cu) 中执行 m5op.dump 是错误的!!!

// REF: https://www.cs.virginia.edu/~csadmin/wiki/index.php/CUDA_Support/Measuring_kernel_runtime

// cudaThreadSynchronize() 暂停调用者的执行,直到前面的 stream operation 执行完毕。

// REF: https://stackoverflow.com/questions/13485018/cudastreamsynchronize-vs-cudadevicesynchronize-vs-cudathreadsynchronize

// C++ thread join 问题,在 kernel.cpp 中也有 join,那么是在 kernel.cpp 中 dump 还是在main.cpp中join后面dump?

// REF: http://en.cppreference.com/w/cpp/algorithm/for_each

// 若 GPU 先执行完毕,在 main.cpp 中join后 dump 似乎合理; 若 CPU 先执行完毕,岂不是要阻塞在 cudaThreadSynchronize 处?

// 暂且在 kernel.cpp 中 dump!

kernel.cpp

// CPU threads--------------------------------------------------------------------------------------
// Spawns n_threads host threads that process their share of the padding tasks
// (partitioned dynamically via `worklist` under CUDA_8_0, statically otherwise),
// then joins them all and dumps gem5 stats.
// NOTE(review): the scraped original had all numeric literals stripped; the 0/1
// constants below are reconstructed from the loop/flag semantics — confirm
// against the upstream Chai benchmark source.
void run_cpu_threads(T *matrix_out, T *matrix, std::atomic_int *flags, int n, int m, int pad, int n_threads, int ldim, int n_tasks, float alpha
#ifdef CUDA_8_0
    , std::atomic_int *worklist
#endif
    ) {
    std::cout << "run_cpu_threads start." << std::endl;
    const int REGS_CPU = REGS * ldim;
    std::vector<std::thread> cpu_threads;
    for(int i = 0; i < n_threads; i++) {
        cpu_threads.push_back(std::thread([=]() {
#ifdef CUDA_8_0
            Partitioner p = partitioner_create(n_tasks, alpha, i, n_threads, worklist);
#else
            Partitioner p = partitioner_create(n_tasks, alpha, i, n_threads);
#endif
            const int matrix_size = m * (n + pad);
            // Round matrix_size up to a multiple of (ldim * REGS) (ceiling division).
            const int matrix_size_align = (matrix_size + ldim * REGS - 1) / (ldim * REGS) * (ldim * REGS);
            for(int my_s = cpu_first(&p); cpu_more(&p); my_s = cpu_next(&p)) {
                // Declare on-chip memory (per-thread register-like staging buffer).
                T reg[REGS_CPU];
                int pos      = matrix_size_align - 1 - (my_s * REGS_CPU);
                int my_s_row = pos / (n + pad);
                int my_x     = pos % (n + pad);
                int pos2     = my_s_row * n + my_x;
                // Load in on-chip memory
                #pragma unroll
                for(int j = 0; j < REGS_CPU; j++) {
                    if(pos2 >= 0 && my_x < n && pos2 < matrix_size)
                        reg[j] = matrix[pos2];
                    else
                        reg[j] = 0;
                    pos--;
                    my_s_row = pos / (n + pad);
                    my_x     = pos % (n + pad);
                    pos2     = my_s_row * n + my_x;
                }
                // Set global synch: spin until the previous task's flag is published,
                // then publish our own so the next task may proceed.
                while((&flags[my_s])->load() == 0) {
                }
                (&flags[my_s + 1])->fetch_add(1);
                // Store to global memory
                pos = matrix_size_align - 1 - (my_s * REGS_CPU);
                #pragma unroll
                for(int j = 0; j < REGS_CPU; j++) {
                    if(pos >= 0 && pos < matrix_size)
                        matrix_out[pos] = reg[j];
                    pos--;
                }
            }
        }));
    }
    // Wait for every worker before dumping stats, so the dump covers all CPU work.
    std::for_each(cpu_threads.begin(), cpu_threads.end(), [](std::thread &t) { t.join(); });
    std::cout << "dump.. after run_cpu_threads end." << std::endl;
    m5_dump_stats(0, 0);
}

kernel.cu

// Launches Padding_kernel with a 1-D grid of `blocks` blocks of `threads` threads
// (plus l_mem_size bytes of dynamic shared memory under CUDA_8_0) and returns the
// launch status from cudaGetLastError().
// NOTE(review): the kernel launch is asynchronous, so m5_dump_stats() here runs
// BEFORE the kernel has finished — exactly the problem the surrounding article
// describes. A cudaDeviceSynchronize() before the dump would be needed to capture
// the kernel's full execution in the stats window.
cudaError_t call_Padding_kernel(int blocks, int threads, int n, int m, int pad, int n_tasks, float alpha,
    T *matrix_out, T *matrix, int *flags
#ifdef CUDA_8_0
    , int l_mem_size, int *worklist
#endif
    ) {
    std::cout << "call_pad start." << std::endl;
    dim3 dimGrid(blocks);
    dim3 dimBlock(threads);
    Padding_kernel<<<dimGrid, dimBlock
#ifdef CUDA_8_0
        , l_mem_size
#endif
        >>>(n, m, pad, n_tasks, alpha,
        matrix_out, matrix, flags
#ifdef CUDA_8_0
        , worklist
#endif
        );
    // Catches launch-configuration errors only; asynchronous execution errors
    // surface at the next synchronizing call.
    cudaError_t err = cudaGetLastError();
    std::cout << "dump.. after call_pad end." << std::endl;
    m5_dump_stats(0, 0);
    return err;
}

main.cpp

// Main benchmark loop: n_warmup + n_reps iterations of (reset histogram, launch
// GPU kernel, launch CPU worker, synchronize both, join).
// NOTE(review): stripped numeric literals reconstructed as 0 — confirm against
// the upstream Chai benchmark source. cudaThreadSynchronize() is deprecated;
// modern code should use cudaDeviceSynchronize().
for(int rep = 0; rep < p.n_warmup + p.n_reps; rep++) {

    // Reset
#ifdef CUDA_8_0
    for(int i = 0; i < p.n_bins; i++) {
        h_histo[i].store(0);
    }
#else
    memset(h_histo, 0, p.n_bins * sizeof(unsigned int));
    cudaStatus = cudaMemcpy(d_histo, h_histo, p.n_bins * sizeof(unsigned int), cudaMemcpyHostToDevice);
    cudaThreadSynchronize();
    CUDA_ERR();
#endif
    std::cout << "m5 work begin." << std::endl;
    // Launch GPU threads
    // Kernel launch (asynchronous — returns immediately)
    if(p.n_gpu_blocks > 0) {
        std::cout << "launch gpu." << std::endl;
        cudaStatus = call_Histogram_kernel(p.n_gpu_blocks, p.n_gpu_threads, p.in_size, p.n_bins, n_cpu_bins,
            d_in, (unsigned int*)d_histo, p.n_bins * sizeof(unsigned int));
        CUDA_ERR();
    }
    // Launch CPU threads (runs concurrently with the GPU kernel)
    std::cout << "launch cpu." << std::endl;
    std::thread main_thread(run_cpu_threads, (unsigned int *)h_histo, h_in, p.in_size, p.n_bins, p.n_threads,
        p.n_gpu_threads, n_cpu_bins);
    // Block until the GPU kernel completes, then wait for the CPU side.
    std::cout << "cuda sync." << std::endl;
    cudaThreadSynchronize();
    std::cout << "cpu join after cuda sync." << std::endl;
    main_thread.join(); //m5_work_end(0, 0);
    std::cout << "m5 work end." << std::endl;
}

最新文章

  1. Jquery实现静态切换tab
  2. linux命令分享(四):iostat
  3. requests模块--python发送http请求
  4. java中static作用详解
  5. BNU 2418 Ultra-QuickSort (线段树求逆序对)
  6. 【COGS & USACO Training】710. 命名那个数字(hash+水题+dfs)
  7. 关于EOF和循环体的搭配使用。
  8. 如何使用yum来下载RPM包而不进行安装
  9. python image模块
  10. android 双向滑动 seekbar
  11. HOG detectMultiScale 参数分析
  12. Linux Apache绑定多域名
  13. SQL Server :理解IAM 页
  14. Ubuntu14.04安装配置Chrome浏览器
  15. 五、Java多人博客系统-2.0版本-数据库设计
  16. Application Security Per-Engagement
  17. rabbit初学之连接测试2
  18. job.yml
  19. git在不同平台windows、linux、mac 上换行符的问题
  20. 负载均衡下 tomcat session 共享

热门文章

  1. [多校联考]SLON!!!
  2. Wepy框架和mpVue框架的比较及使用mpVue框架需要注意的点
  3. alerm和pause
  4. linux 查看Apache Tomcat日志访问IP前10
  5. docker的概念
  6. SC.Lab3对于Factory的构建过程(from HIT)
  7. window.onload 方法脚本
  8. 《C++ Primer(中文版)(第5版)》斯坦利·李普曼 (Stanley B. Lippman) (作者), 约瑟·拉乔伊 (Josee Lajoie) (作者), 芭芭拉·默 (Barbara E. Moo) (作者) azw3
  9. 前端学习笔记系列一:3 Vue中的nextTick
  10. 1-8SpringBoot之切面AOP