CUDA[4] sample program: matrix-vector multiplication

The matrix is stored in Compressed Sparse Row (CSR) format.
 #include "cuda_runtime.h"

 #include "device_launch_parameters.h"

 #include "gputimer.h"

 #include<stdio.h>

 #include<stdlib.h>

 #include<string.h>

 #define WARP_SIZE 32

 /**
  * CSR sparse matrix-vector product, one warp per matrix row: y += A * x.
  *
  * Every lane of a warp accumulates a partial dot product over the row's
  * stored non-zeros (lane, lane + WARP_SIZE, ...). The per-lane partials are
  * then combined with a warp-shuffle tree reduction and lane 0 adds the
  * row total into y[row].
  *
  * Launch requirements (1D grid, 1D blocks):
  *   - blockDim.x must be a multiple of WARP_SIZE, so that the warp/lane
  *     indices computed below coincide with hardware warps and the
  *     row-bounds test is uniform across each warp.
  *   - gridDim.x * blockDim.x / WARP_SIZE must be >= num_rows.
  *   - No shared memory is needed.
  *   - Uses __shfl_down_sync on double (CUDA 9 or newer).
  *
  * @param num_rows number of matrix rows
  * @param ptr      CSR row pointers; ptr[row]..ptr[row + 1] delimit a row
  * @param indices  column index of each stored non-zero
  * @param data     value of each stored non-zero
  * @param x        input vector, indexed by column
  * @param y        output vector, accumulated in place (y[row] += result)
  */
 __global__ void
 spmv_csr_vector_kernel ( const int num_rows ,
                          const int * ptr ,
                          const int * indices ,
                          const double * data ,
                          const double * x ,
                          double * y)
 {
     const int thread_id = blockDim.x * blockIdx.x + threadIdx.x ; // global thread index
     const int warp_id   = thread_id / WARP_SIZE ;                 // global warp index
     const int lane      = thread_id & (WARP_SIZE - 1) ;           // thread index within the warp

     // one warp per row
     const int row = warp_id ;

     // All lanes of a warp share the same row, so the whole warp leaves
     // together and the full-warp shuffle mask below stays valid.
     if ( row >= num_rows )
         return ;

     const int row_start = ptr [ row ];
     const int row_end   = ptr [ row + 1 ];

     // compute running sum per thread (kept in a register)
     double sum = 0.0 ;
     for ( int jj = row_start + lane ; jj < row_end ; jj += WARP_SIZE )
         sum += data [ jj ] * x [ indices [ jj ]];

     // Parallel reduction across the warp. The *_sync shuffle both exchanges
     // the values and synchronizes the lanes, so no implicit lock-step
     // execution is assumed (it is not guaranteed on Volta and newer GPUs).
     for ( int offset = WARP_SIZE / 2 ; offset > 0 ; offset >>= 1 )
         sum += __shfl_down_sync ( 0xffffffffu , sum , offset );

     // first thread writes the result
     if ( lane == 0 )
         y [ row ] += sum ;
 }

 /**
  * CSR sparse matrix-vector product, one thread per matrix row: y += A * x.
  *
  * Each thread walks the stored non-zeros of its own row sequentially,
  * accumulates the dot product with x in a register and adds it to y[row].
  *
  * Launch requirements (1D grid, 1D blocks):
  *   - gridDim.x * blockDim.x must be >= num_rows; surplus threads do nothing.
  *   - No shared memory is used.
  *
  * @param num_rows number of matrix rows
  * @param ptr      CSR row pointers; ptr[row]..ptr[row + 1] delimit a row
  * @param indices  column index of each stored non-zero
  * @param data     value of each stored non-zero
  * @param x        input vector, indexed by column
  * @param y        output vector, accumulated in place (y[row] += result)
  */
 __global__ void
 spmv_csr_scalar_kernel ( const int num_rows ,
                          const int * ptr ,
                          const int * indices ,
                          const double * data ,
                          const double * x ,
                          double * y)
 {
     const int row = blockDim.x * blockIdx.x + threadIdx.x ; // one thread per row

     if ( row < num_rows )
     {
         double dot = 0.0 ;

         const int row_start = ptr [ row ];
         const int row_end   = ptr [ row + 1 ]; // one past the row's last non-zero

         for ( int jj = row_start ; jj < row_end ; jj ++ )
             dot += data [ jj ] * x [ indices [ jj ]];

         y [ row ] += dot ;
     }
 }

 // Abort with a readable message when a CUDA runtime call fails.
 #define CUDA_CHECK(call)                                                   \
     do {                                                                   \
         cudaError_t err_ = (call);                                         \
         if (err_ != cudaSuccess) {                                         \
             fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__,  \
                     cudaGetErrorString(err_));                             \
             exit(EXIT_FAILURE);                                            \
         }                                                                  \
     } while (0)

 /**
  * Demo driver: builds a small CSR matrix on the host, copies it to the
  * device, runs spmv_csr_vector_kernel once (timed with GpuTimer) and prints
  * the resulting vector y.
  *
  * NOTE(review): the numeric literals of this listing were lost; the data
  * below is an example chosen to match the visible element counts (9
  * non-zeros, 5 row pointers, 5 x entries, 4 y entries) - confirm against
  * the original values if they matter:
  *
  *     | 1 4 0 0 0 |
  *     | 0 2 3 0 0 |      x = (1 2 3 4 5)^T   ->   y = (9 13 73 57)^T
  *     | 5 0 0 7 8 |
  *     | 0 0 9 0 6 |
  */
 int main(int argc,char **argv)
 {
     (void)argc;
     (void)argv;

     double h_data[]={1,4,2,3,5,7,8,9,6};   // non-zero values, row by row
     int h_col[]={0,1,1,2,0,3,4,2,4};       // column of each non-zero
     int h_ptr[]={0,2,4,7,9};               // row start offsets, num_rows + 1 entries
     double h_x[]={1,2,3,4,5};
     double h_y[]={0,0,0,0};                // kernels accumulate into y, so start from zero
     int num_rows=4;

     double *d_data;
     int *d_col;
     int *d_ptr;
     double *d_x;
     double *d_y;

     // Size every device buffer from its host array so the two cannot drift apart.
     CUDA_CHECK(cudaMalloc((void**) &d_data,sizeof(h_data)));
     CUDA_CHECK(cudaMalloc((void**) &d_col,sizeof(h_col)));
     CUDA_CHECK(cudaMalloc((void**) &d_ptr,sizeof(h_ptr)));
     CUDA_CHECK(cudaMalloc((void**) &d_x,sizeof(h_x)));
     CUDA_CHECK(cudaMalloc((void**) &d_y,sizeof(h_y)));

     CUDA_CHECK(cudaMemcpy((void*)d_data, (void*)h_data, sizeof(h_data), cudaMemcpyHostToDevice));
     CUDA_CHECK(cudaMemcpy((void*)d_col, (void*)h_col, sizeof(h_col), cudaMemcpyHostToDevice));
     CUDA_CHECK(cudaMemcpy((void*)d_ptr, (void*)h_ptr, sizeof(h_ptr), cudaMemcpyHostToDevice));
     CUDA_CHECK(cudaMemcpy((void*)d_x, (void*)h_x, sizeof(h_x), cudaMemcpyHostToDevice));
     CUDA_CHECK(cudaMemcpy((void*)d_y, (void*)h_y, sizeof(h_y), cudaMemcpyHostToDevice));

     GpuTimer timer;
     timer.Start();

     // One warp per row: num_rows blocks of exactly one warp each.
     spmv_csr_vector_kernel<<<num_rows,WARP_SIZE>>>(num_rows,d_ptr,d_col,d_data,d_x,d_y);
     //spmv_csr_scalar_kernel<<<1,32>>>(num_rows,d_ptr,d_col,d_data,d_x,d_y);

     timer.Stop();

     // A kernel launch returns no status; a bad launch configuration only
     // shows up through cudaGetLastError().
     CUDA_CHECK(cudaGetLastError());

     printf("Duration: %g ms\n",timer.Elapsed());

     // Blocking copy: also surfaces any fault raised while the kernel ran.
     CUDA_CHECK(cudaMemcpy((void*)h_y, (void*)d_y, sizeof(h_y), cudaMemcpyDeviceToHost));

     for(int i=0;i<num_rows;i++)
         printf("%.5f ",h_y[i]);
     printf("\n");

     CUDA_CHECK(cudaFree(d_data));
     CUDA_CHECK(cudaFree(d_col));
     CUDA_CHECK(cudaFree(d_ptr));
     CUDA_CHECK(cudaFree(d_x));
     CUDA_CHECK(cudaFree(d_y));

     return 0;
 }
Reference: http://www.nvidia.com/docs/IO/66889/nvr-2008-004.pdf (Section 4.3)
巴特西

CUDA[4] sample program: matrix-vector multiplication

最新文章

热门文章