Use the Compressed Sparse Row (CSR) format to represent a sparse matrix

 #include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include "gputimer.h"
#include<stdio.h>
#include<stdlib.h>
#include<string.h>
#define WARP_SIZE 32

/* Abort with a readable message when a CUDA runtime call fails. */
#define CUDA_CHECK(call)                                                   \
    do {                                                                   \
        cudaError_t err_ = (call);                                         \
        if (err_ != cudaSuccess) {                                         \
            fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__,  \
                    cudaGetErrorString(err_));                             \
            exit(EXIT_FAILURE);                                            \
        }                                                                  \
    } while (0)

/*
 * CSR sparse matrix-vector product, one warp per row: y[row] += A[row,:] * x.
 *
 * num_rows : number of matrix rows
 * ptr      : row offsets, num_rows + 1 entries; row r owns [ptr[r], ptr[r+1])
 * indices  : column index of each stored value
 * data     : stored values
 * x        : input vector, indexed by the entries of `indices`
 * y        : output vector; the product is ACCUMULATED into it, so the caller
 *            must initialise it
 *
 * Launch layout: 1D grid and 1D blocks. blockDim.x must be a multiple of
 * WARP_SIZE (lane and warp ids are derived from the global thread index, and
 * the shuffle below uses a full-warp mask), and gridDim.x * blockDim.x must be
 * at least num_rows * WARP_SIZE.
 * Requires CUDA 9+ for __shfl_down_sync; uses no shared memory.
 */
__global__ void
spmv_csr_vector_kernel(const int num_rows,
                       const int *ptr,
                       const int *indices,
                       const double *data,
                       const double *x,
                       double *y)
{
    int thread_id = blockDim.x * blockIdx.x + threadIdx.x; // global thread index
    int warp_id = thread_id / WARP_SIZE;                   // global warp index
    int lane = thread_id & (WARP_SIZE - 1);                // thread index within the warp

    // one warp per row
    int row = warp_id;
    if (row < num_rows) {
        int row_start = ptr[row];
        int row_end = ptr[row + 1];

        // Each lane accumulates a strided subset of the row's nonzeros.
        double sum = 0.0;
        for (int jj = row_start + lane; jj < row_end; jj += WARP_SIZE)
            sum += data[jj] * x[indices[jj]];

        // Warp tree reduction (16, 8, 4, 2, 1). `row` is uniform across the
        // warp, so all 32 lanes reach this point together and the full mask
        // is valid. This replaces an unsynchronised shared-memory reduction.
        for (int offset = WARP_SIZE / 2; offset > 0; offset >>= 1)
            sum += __shfl_down_sync(0xffffffffu, sum, offset);

        // first lane writes the result
        if (lane == 0)
            y[row] += sum;
    }
}

/*
 * CSR sparse matrix-vector product, one thread per row: y[row] += A[row,:] * x.
 *
 * Parameters are the same as for spmv_csr_vector_kernel. The launch must
 * provide at least num_rows threads in a 1D grid of 1D blocks; surplus
 * threads do nothing.
 */
__global__ void
spmv_csr_scalar_kernel(const int num_rows,
                       const int *ptr,
                       const int *indices,
                       const double *data,
                       const double *x,
                       double *y)
{
    int row = blockDim.x * blockIdx.x + threadIdx.x;
    if (row < num_rows) {
        double dot = 0.0;
        int row_start = ptr[row];
        int row_end = ptr[row + 1];
        for (int jj = row_start; jj < row_end; jj++)
            dot += data[jj] * x[indices[jj]];
        y[row] += dot;
    }
}

/*
 * Demo driver: uploads a small CSR matrix and a vector, runs the vector
 * kernel once, prints the GpuTimer duration and the resulting y.
 */
int main(int argc, char **argv)
{
    (void)argc;
    (void)argv;

    /*
     * NOTE(review): the original numeric values were lost from this listing;
     * only the array lengths (9, 9, 5, 5, 4) survived. The values below are an
     * illustrative 4x5 matrix with 9 nonzeros -- replace with the intended data.
     *
     *     | 1 0 2 0 0 |
     * A = | 0 3 0 4 0 |     x = (1 2 3 4 5)^T     expected y = (7 22 58 61)^T
     *     | 5 0 6 0 7 |
     *     | 0 8 0 0 9 |
     */
    double h_data[] = {1, 2, 3, 4, 5, 6, 7, 8, 9};
    int h_col[] = {0, 2, 1, 3, 0, 2, 4, 1, 4};
    int h_ptr[] = {0, 2, 4, 7, 9};
    double h_x[] = {1, 2, 3, 4, 5};
    double h_y[] = {0, 0, 0, 0}; // kernels accumulate into y, so start from zero
    int num_rows = 4;

    double *d_data;
    int *d_col;
    int *d_ptr;
    double *d_x;
    double *d_y;

    // Sizes come from the host arrays so they cannot drift out of sync.
    CUDA_CHECK(cudaMalloc((void **)&d_data, sizeof(h_data)));
    CUDA_CHECK(cudaMalloc((void **)&d_col, sizeof(h_col)));
    CUDA_CHECK(cudaMalloc((void **)&d_ptr, sizeof(h_ptr)));
    CUDA_CHECK(cudaMalloc((void **)&d_x, sizeof(h_x)));
    CUDA_CHECK(cudaMalloc((void **)&d_y, sizeof(h_y)));

    CUDA_CHECK(cudaMemcpy((void *)d_data, (void *)h_data, sizeof(h_data), cudaMemcpyHostToDevice));
    CUDA_CHECK(cudaMemcpy((void *)d_col, (void *)h_col, sizeof(h_col), cudaMemcpyHostToDevice));
    CUDA_CHECK(cudaMemcpy((void *)d_ptr, (void *)h_ptr, sizeof(h_ptr), cudaMemcpyHostToDevice));
    CUDA_CHECK(cudaMemcpy((void *)d_x, (void *)h_x, sizeof(h_x), cudaMemcpyHostToDevice));
    CUDA_CHECK(cudaMemcpy((void *)d_y, (void *)h_y, sizeof(h_y), cudaMemcpyHostToDevice));

    // GpuTimer is declared in gputimer.h (not visible here); the output label
    // below indicates Elapsed() is reported in milliseconds.
    GpuTimer timer;
    timer.Start();
    // One block of exactly one warp per row.
    spmv_csr_vector_kernel<<<num_rows, WARP_SIZE>>>(num_rows, d_ptr, d_col, d_data, d_x, d_y);
    //spmv_csr_scalar_kernel<<<1,32>>>(num_rows,d_ptr,d_col,d_data,d_x,d_y);
    CUDA_CHECK(cudaGetLastError()); // surfaces launch-configuration errors
    timer.Stop();
    printf("Duration: %g ms\n", timer.Elapsed());

    // Blocking copy on the default stream: also waits for the kernel to finish
    // and reports any fault raised during its execution.
    CUDA_CHECK(cudaMemcpy((void *)h_y, (void *)d_y, sizeof(h_y), cudaMemcpyDeviceToHost));

    for (int i = 0; i < num_rows; i++)
        printf("%.5f ", h_y[i]);
    printf("\n");

    CUDA_CHECK(cudaFree(d_data));
    CUDA_CHECK(cudaFree(d_col));
    CUDA_CHECK(cudaFree(d_ptr));
    CUDA_CHECK(cudaFree(d_x));
    CUDA_CHECK(cudaFree(d_y));

    return 0;
}

ref:

http://www.nvidia.com/docs/IO/66889/nvr-2008-004.pdf  

ch4.3

最新文章

  1. NOIP2003pj栈[卡特兰数]
  2. H2数据库攻略
  3. HDU1269 迷宫城堡
  4. Apache Shiro 使用手册(四)Realm 实现
  5. SharedPreference注册OnSharedPreferenceChangeListener一直无法回调问题
  6. OpenSSH后门获取root密码及防范
  7. [转载]char * 和char []的区别---之第一篇
  8. C#中的Collection 1
  9. [转]LoadRunner脚本录制常见问题整理
  10. 仿腾讯课堂固定滚动列表ReactNative组件
  11. fab 菜单实现—圆形、半圆、扇形、直线、射线
  12. [Swift]LeetCode109. 有序链表转换二叉搜索树 | Convert Sorted List to Binary Search Tree
  13. Linux---设备文件名和挂载点
  14. BootstrapValidator 解决多属性被同时校验问题
  15. php中wampserver多站点配置
  16. android.support不统一的问题
  17. Linux系统 vi/vim文本编辑器
  18. ORA-03001,GATHER_TABLE_STATS数据库自动收集统计信息报错
  19. (转)Groupon前传:从10个月的失败作品修改,1个月找到成功 并不挶泥在这个点子上面,它反而往后站一步,看看他们已经做好的这个网站,可以再怎么包装成另一个完完全全不同的网站?所有的人所做的每件失败的事情中, 一定有碰到或含有成功的答案」在里面,只是他们不知道而已。 人不怕失败」,只怕宣布失败」
  20. Atitit jdbc 处理返回多个结果集

热门文章

  1. Node安装及自定义config
  2. linux拷贝文件夹cp
  3. 团队第一次 # scrum meeting
  4. 字符串格式化:f-strings
  5. Solr——配置IK分词器
  6. cxf+spring+soap简单接口开发
  7. Vue proxy
  8. PHP json_encode 文本形式数字下标数组导致下标丢失
  9. netbeans 正则替换
  10. jQuery自定义alert,confirm方法及样式