0_Simple__UnifiedMemoryStreams

使用 OpenMP 和 pthreads 两种多线程环境，利用统一内存编址（Unified Memory）和 CUDA 流，计算基本的矩阵-向量乘法 result = α * A * x + β * result 。

▶ 源代码

 #include <algorithm>
 #include <cstdio>
 #include <cstdlib>
 #include <ctime>
 #include <vector>

 #include <cuda_runtime.h>
 #include "device_launch_parameters.h"
 #include <cublas_v2.h>

 //#define USE_PTHREADS // 使用 pthread 时补充定义 USE_PTHREADS

 #ifdef USE_PTHREADS

     #include <pthread.h>

     #pragma comment(lib, "pthreadVC2.lib")

 #else

     #include <omp.h>

 #endif

 // Windows has no srand48()/drand48(); provide stand-ins built on srand()/rand().
 #if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
 // Seeds the rand()-based generator used by the drand48() stand-in below.
 void srand48(long seed) { srand((unsigned int)seed); }
 // Returns a pseudo-random double in [0, 1). Dividing by RAND_MAX + 1.0 (rather
 // than RAND_MAX) keeps 1.0 itself out of the range, matching the half-open
 // interval that drand48() produces on POSIX systems.
 double drand48() { return double(rand()) / (double(RAND_MAX) + 1.0); }
 #endif

 template <typename T> struct Task// struct 也可使用类的构造和析构函数

 {

     unsigned int size, id;

     T *data;

     T *result;

     T *vector;

     Task() : size(), id(), data(NULL), result(NULL), vector(NULL) {};

     Task(unsigned int s) : size(s), id(), data(NULL), result(NULL)

     {

         cudaMallocManaged(&data, sizeof(T)*size*size);

         cudaMallocManaged(&result, sizeof(T)*size);

         cudaMallocManaged(&vector, sizeof(T)*size);

         cudaDeviceSynchronize();

     }

     ~Task()

     {

         cudaDeviceSynchronize();

         cudaFree(data);

         cudaFree(result);

         cudaFree(vector);

     }

     void allocate(const unsigned int s, const unsigned int unique_id)// 申请内存，初始化各成员数组

     {

         id = unique_id;

         size = s;

         cudaMallocManaged(&data, sizeof(T)*size*size);

         cudaMallocManaged(&result, sizeof(T)*size);

         cudaMallocManaged(&vector, sizeof(T)*size);

         cudaDeviceSynchronize();

         for (int i = ; i < size*size; i++)

             data[i] = drand48();

         for (int i = ; i < size; i++)

         {

             result[i] = .;

             vector[i] = drand48();

         }

     }

 };

 #ifdef USE_PTHREADS
 // Argument record handed to each worker thread in the pthreads build.
 typedef struct threadData_t
 {
     int tid;                    // index of the worker thread
     Task<double> *TaskListPtr;  // first task of the slice this thread processes
     cudaStream_t *streams;      // array of CUDA streams
     cublasHandle_t *handles;    // array of cuBLAS handles
     int taskSize;               // number of tasks in the slice
 } threadData;
 #endif

 // Host-side GEMV: result = alpha * A * x + beta * result.
 //   m, n   - number of rows / columns of A
 //   alpha  - pointer to the scalar applied to A * x
 //   A      - m * n matrix stored row by row (element (i, j) is A[i*n + j])
 //   x      - n input elements
 //   beta   - pointer to the scalar applied to the previous contents of result
 //   result - m elements, updated in place
 // When *beta is zero the old contents of result are discarded rather than
 // multiplied by zero, so NaN/Inf sitting in an unset buffer cannot leak into
 // the output.
 template <typename T> void gemv(int m, int n, T *alpha, T *A, T *x, T *beta, T *result)
 {
     // Author's note: the upstream sample looped to n here and left out alpha.
     for (int i = 0; i < m; i++)
     {
         const T *row = A + (size_t)i * n;
         result[i] = (*beta == T(0)) ? T(0) : result[i] * *beta;
         for (int j = 0; j < n; j++)
             result[i] += *alpha * row[j] * x[j];
     }
 }

 // execute a single task on either host or device depending on size

 #ifdef USE_PTHREADS

 void * execute(void* inpArgs)

 {

     threadData *dataPtr    = (threadData *) inpArgs;

     cudaStream_t *stream   = dataPtr->streams;

     cublasHandle_t *handle = dataPtr->handles;

     int tid                = dataPtr->tid;

     for (int i = ; i < dataPtr->taskSize; i++)

     {

         Task<double>  &t = dataPtr->TaskListPtr[i];

         double alpha = 1.0;

         double beta = 0.0;

         if (t.size < )// 数据规模较小在主机上运行，否则在设备上运行

         {

             printf("\nTask [%2d], thread [%2d], size [%4d], on host",t.id,tid,t.size);

             cudaStreamAttachMemAsync(stream[], t.data, , cudaMemAttachHost);

             cudaStreamAttachMemAsync(stream[], t.vector, , cudaMemAttachHost);

             cudaStreamAttachMemAsync(stream[], t.result, , cudaMemAttachHost);

             cudaStreamSynchronize(stream[]);

             gemv(t.size, t.size, &alpha, t.data, t.vector, &beta, t.result);

         }

         else

         {

             printf("\nTask [%2d], thread [%2d], size [%4d], on device",t.id,tid,t.size);

             cublasSetStream(handle[tid+], stream[tid+]);

             cudaStreamAttachMemAsync(stream[tid+], t.data, , cudaMemAttachSingle);

             cudaStreamAttachMemAsync(stream[tid+], t.vector, , cudaMemAttachSingle);

             cudaStreamAttachMemAsync(stream[tid+], t.result, , cudaMemAttachSingle);

             cublasDgemv(handle[tid+], CUBLAS_OP_N, t.size, t.size, &alpha, t.data, t.size, t.vector, , &beta, t.result, );

         }

     }

     return NULL;

 }

 #else

 // Runs one task (OpenMP build) on the host or the device depending on size.
 //   t      - task to process
 //   handle - array of cuBLAS handles; entry tid + 1 is used for device work
 //   stream - array of CUDA streams; entry 0 serves host work, entry tid + 1
 //            device work
 //   tid    - index of the calling thread
 // Tasks smaller than 100 run on the host through gemv(); larger ones run on
 // the device through cublasDgemv(). 100 is the cut-over used by the NVIDIA
 // UnifiedMemoryStreams sample this program is based on. Failures are
 // reported on stderr and the task is skipped.
 // Note: the scalars are double and the device path calls cublasDgemv, so the
 // template only compiles for T = double.
 template <typename T> void execute(Task<T> &t, cublasHandle_t *handle, cudaStream_t *stream, int tid)
 {
     double alpha = 1.0;
     double beta = 0.0;

     if (t.size < 100)
     {
         printf("\nTask [%2d], thread [%2d], size [%4d], on host",t.id,tid,t.size);

         // Attach the buffers to stream[0] with cudaMemAttachHost and wait for
         // the attachment to complete before the CPU touches them.
         cudaError_t err = cudaStreamAttachMemAsync(stream[0], t.data, 0, cudaMemAttachHost);
         if (err == cudaSuccess)
             err = cudaStreamAttachMemAsync(stream[0], t.vector, 0, cudaMemAttachHost);
         if (err == cudaSuccess)
             err = cudaStreamAttachMemAsync(stream[0], t.result, 0, cudaMemAttachHost);
         if (err == cudaSuccess)
             err = cudaStreamSynchronize(stream[0]);
         if (err != cudaSuccess)
         {
             fprintf(stderr, "\nTask [%u]: host attach failed: %s\n", t.id, cudaGetErrorString(err));
             return;
         }
         gemv(t.size, t.size, &alpha, t.data, t.vector, &beta, t.result);
     }
     else
     {
         printf("\nTask [%2d], thread [%2d], size[%4d], on device",t.id,tid,t.size);

         cublasStatus_t status = cublasSetStream(handle[tid + 1], stream[tid + 1]);
         cudaError_t err = cudaStreamAttachMemAsync(stream[tid + 1], t.data, 0, cudaMemAttachSingle);
         if (err == cudaSuccess)
             err = cudaStreamAttachMemAsync(stream[tid + 1], t.vector, 0, cudaMemAttachSingle);
         if (err == cudaSuccess)
             err = cudaStreamAttachMemAsync(stream[tid + 1], t.result, 0, cudaMemAttachSingle);
         if (status != CUBLAS_STATUS_SUCCESS || err != cudaSuccess)
         {
             fprintf(stderr, "\nTask [%u]: device setup failed (cublas status %d, cuda: %s)\n",
                     t.id, (int)status, cudaGetErrorString(err));
             return;
         }

         // cuBLAS reads matrices column-major, while gemv() indexes t.data
         // row-major (A[i*n + j]). CUBLAS_OP_T therefore makes the device
         // compute the same A * x as the host path.
         status = cublasDgemv(handle[tid + 1], CUBLAS_OP_T, t.size, t.size, &alpha, t.data, t.size, t.vector, 1, &beta, t.result, 1);
         if (status != CUBLAS_STATUS_SUCCESS)
             fprintf(stderr, "\nTask [%u]: cublasDgemv failed (status %d)\n", t.id, (int)status);
     }
 }

 #endif

 // Gives every task in TaskList a random dimension and lets it allocate and
 // fill its buffers. The dimension is (int)(drand48() * 1000.0), raised to at
 // least 64 (the minimum used by the NVIDIA UnifiedMemoryStreams sample); the
 // position in the list is passed as the task id.
 template <typename T> void initialise_tasks(std::vector< Task<T> > &TaskList)
 {
     for (unsigned int i = 0; i < TaskList.size(); i++)
     {
         const int size = std::max((int)(drand48() * 1000.0), 64);
         TaskList[i].allocate(size, i);
     }
 }

 int main()

 {

     printf("\n\tStart.\n");

     cudaDeviceProp device_prop;

     cudaGetDeviceProperties(&device_prop, );

     if (!device_prop.managedMemory)

     {

         printf("\n\tUnified Memory not supported\n");

         getchar();

         return ;

     }

     if (device_prop.computeMode == cudaComputeModeProhibited)// Device 为线程禁用模式

     {

         printf("\n\tComputeMode is cudaComputeModeProhibited\n");

         getchar();

         return ;

     }

     srand48(time(NULL));

     const int nthreads = ;

     cudaStream_t *streams = new cudaStream_t[nthreads+];

     cublasHandle_t *handles = new cublasHandle_t[nthreads+];

     for (int i=; i<nthreads+; i++)

     {

         cudaStreamCreate(&streams[i]);

         cublasCreate(&handles[i]);

     }

     unsigned int N = ;

     std::vector<Task<double> > TaskList(N);

     initialise_tasks(TaskList);

     cudaSetDevice();

 #ifdef USE_PTHREADS

     pthread_t threads[nthreads];

     threadData *InputToThreads = new threadData[nthreads];

     int temp = TaskList.size() / nthreads;

     for (int i=; i < nthreads; i++)

     {

         InputToThreads[i].tid = i;

         InputToThreads[i].streams = streams;

         InputToThreads[i].handles = handles;

         if (temp == )  // 任务数量比线程数少

         {

             InputToThreads[i].taskSize = ;

             InputToThreads[i].TaskListPtr = &TaskList[];

         }

         else            // 任务数量不少于线程数。任务尽量均分，多出的零头全部塞给最后一个线程

         {

             if (i == nthreads - )

             {

                 InputToThreads[i].taskSize = temp + (TaskList.size() % nthreads);

                 InputToThreads[i].TaskListPtr = &TaskList[i*temp + (TaskList.size() % nthreads)];

             }

             else

             {

                 InputToThreads[i].taskSize = temp;

                 InputToThreads[i].TaskListPtr = &TaskList[i*temp];

             }

         }

         pthread_create(&threads[i], NULL, &execute, &InputToThreads[i]);

     }

     for (int i=; i < nthreads; i++)

         pthread_join(threads[i], NULL);

 #else

     omp_set_num_threads(nthreads);

     #pragma omp parallel for schedule(dynamic)

     for (int i=; i<TaskList.size(); i++)

     {

         int tid = omp_get_thread_num();

         execute(TaskList[i], handles, streams, tid);

     }

 #endif

     cudaDeviceSynchronize();

     // 清理工作

     for (int i=; i<nthreads+; i++)

     {

         cudaStreamDestroy(streams[i]);

         cublasDestroy(handles[i]);

     }

     std::vector< Task<double> >().swap(TaskList);

     printf("\n\tFinish.\n");

     getchar();

     return ;

 }

▶ 输出结果：OpenMP

    Start.

Task [ ], thread [ ], size[ ], on device

Task [ ], thread [ ], size[ ], on device

Task [ ], thread [ ], size[ ], on device

Task [ ], thread [ ], size[ ], on device

Task [ ], thread [ ], size[ ], on device

Task [ ], thread [ ], size[ ], on device

Task [ ], thread [ ], size[ ], on device

Task [ ], thread [ ], size[ ], on device

Task [ ], thread [ ], size[ ], on device

Task [ ], thread [ ], size[ ], on device

Task [], thread [ ], size[ ], on device

Task [], thread [ ], size[ ], on device

Task [], thread [ ], size[ ], on device

Task [], thread [ ], size[ ], on device

Task [], thread [ ], size[ ], on device

Task [], thread [ ], size[ ], on device

Task [], thread [ ], size[ ], on device

Task [], thread [ ], size[ ], on device

Task [], thread [ ], size[ ], on device

Task [], thread [ ], size[ ], on device

Task [], thread [ ], size[ ], on device

Task [], thread [ ], size[ ], on device

Task [], thread [ ], size[ ], on device

Task [], thread [ ], size[ ], on device

Task [], thread [ ], size[ ], on device

Task [], thread [ ], size[ ], on device

Task [], thread [ ], size[ ], on device

Task [], thread [ ], size[ ], on device

Task [], thread [ ], size[ ], on device

Task [], thread [ ], size[ ], on device

Task [], thread [ ], size[ ], on device

Task [], thread [ ], size[ ], on device

Task [], thread [ ], size[ ], on device

Task [], thread [ ], size[ ], on device

Task [], thread [ ], size[ ], on device

Task [], thread [ ], size[ ], on device

Task [], thread [ ], size[ ], on device

Task [], thread [ ], size[ ], on device

Task [], thread [ ], size[ ], on device

Task [], thread [ ], size[ ], on device

    Finish.

▶ 输出结果：pthreads

    Start.

Task [ ], thread [ ], size [ ], on device

Task [], thread [ ], size [ ], on device

Task [], thread [ ], size [ ], on device

Task [], thread [ ], size [ ], on device

Task [], thread [ ], size [ ], on device

Task [], thread [ ], size [ ], on device

Task [ ], thread [ ], size [  ], on host

Task [], thread [ ], size [ ], on device

Task [], thread [ ], size [ ], on device

Task [], thread [ ], size [ ], on device

Task [], thread [ ], size [ ], on device

Task [], thread [ ], size [ ], on device

Task [ ], thread [ ], size [ ], on device

Task [], thread [ ], size [ ], on device

Task [], thread [ ], size [ ], on device

Task [ ], thread [ ], size [ ], on device

Task [], thread [ ], size [ ], on device

Task [], thread [ ], size [ ], on device

Task [], thread [ ], size [ ], on device

Task [ ], thread [ ], size [ ], on device

Task [], thread [ ], size [ ], on device

Task [ ], thread [ ], size [ ], on device

Task [], thread [ ], size [ ], on device

Task [], thread [ ], size [ ], on device

Task [], thread [ ], size [ ], on device

Task [], thread [ ], size [ ], on device

Task [], thread [ ], size [  ], on host

Task [ ], thread [ ], size [  ], on host

Task [], thread [ ], size [ ], on device

Task [ ], thread [ ], size [ ], on device

Task [], thread [ ], size [ ], on device

Task [ ], thread [ ], size [ ], on device

Task [], thread [ ], size [ ], on device

Task [], thread [ ], size [ ], on device

Task [], thread [ ], size [ ], on device

Task [ ], thread [ ], size [  ], on host

Task [], thread [ ], size [ ], on device

Task [], thread [ ], size [ ], on device

Task [], thread [ ], size [ ], on device

Task [], thread [ ], size [ ], on device

    Finish.

▶ 涨姿势：

● 使用 C++ 结构体完成了类似类的方法。即在结构体中定义构造函数、析构函数及其他方法。

● 使用了 cuBLAS 库，注意句柄的使用和库函数的调用。

● 用到的把托管内存关联到指定流的函数 cudaStreamAttachMemAsync 及其标志

 // driver_types.h
 #define cudaMemAttachGlobal 0x01  // memory can be accessed by any stream on any device
 #define cudaMemAttachHost   0x02  // memory cannot be accessed by any stream on any device
 #define cudaMemAttachSingle 0x04  // memory can only be accessed by a single stream on the associated device

 // cuda_runtime.h
 // Typed convenience overload: forwards to the void* runtime API declared below.
 template<class T> static __inline__ __host__ cudaError_t cudaStreamAttachMemAsync(cudaStream_t stream, T *devPtr, size_t length = 0, unsigned int flags = cudaMemAttachSingle)
 {
     return ::cudaStreamAttachMemAsync(stream, (void*)devPtr, length, flags);
 }

 // cuda_runtime_api.h
 extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaStreamAttachMemAsync(cudaStream_t stream, void *devPtr, size_t length __dv(0), unsigned int flags __dv(cudaMemAttachSingle));

巴特西

0_Simple__UnifiedMemoryStreams

最新文章

热门文章