使用CUDA的线性代数库cuBLAS来计算矩阵乘法。这里主要记录调用规则,关于乘法函数中详细的参数说明和调用规则见另一篇随笔。

▶ 源代码:

 #include <assert.h>
#include <helper_string.h>
#include <cuda_runtime.h>
#include <cublas_v2.h>
#include <helper_functions.h>
#include <helper_cuda.h> #ifndef min
#define min(a,b) ((a < b) ? a : b)
#endif
#ifndef max
#define max(a,b) ((a > b) ? a : b)
#endif // 存放各矩阵维数的结构体
typedef struct _matrixSize
{
unsigned int uiWA, uiHA, uiWB, uiHB, uiWC, uiHC;
} sMatrixSize; // CPU 计算矩阵乘法。三个参数分别用于行定位、行程定位、列定位,没有查错机制。
void matrixMulCPU(float *C, const float *A, const float *B, unsigned int hA, unsigned int wA, unsigned int wB)
{
for (unsigned int i = ; i < hA; ++i) // 从上往下数 i 行
{
for (unsigned int j = ; j < wB; ++j) // 从左往右数 j 列
{
double sum = ;
for (unsigned int k = ; k < wA; ++k) // 行程长
{
double a = A[i * wA + k]; // 中间过程用 double,结果输出 float
double b = B[k * wB + j];
sum += a * b;
}
C[i * wB + j] = (float)sum;
}
}
} // 初始化数组
void randomInit(float *data, int size)
{
for (int i = ; i < size; ++i)
data[i] = rand() / (float)RAND_MAX;
} //输出两个矩阵的不相等的值及其位置,允许容差为 fListTol ,最多输出 iListLength 个
void printDiff(float *data1, float *data2, int width, int height, int iListLength, float fListTol)
{
printf("Listing first %d Differences > %.6f...\n", iListLength, fListTol);
int i, j, k;
int error_count = ; for (j = ; j < height; j++)
{
if (error_count < iListLength)
printf("\n Row %d:\n", j); for (i = ; i < width; i++)
{
k = j * width + i;
float fDiff = fabs(data1[k] - data2[k]);
if (fDiff > fListTol)
{
if (error_count < iListLength)
printf(" Loc(%d,%d)\tCPU=%.5f\tGPU=%.5f\tDiff=%.6f\n", i, j, data1[k], data2[k], fDiff);
error_count++;
}
}
}
printf(" \n Total Errors = %d\n", error_count);
} // 初始化设备。包括选择设备,设定维数
void initializeCUDA(int argc, char **argv, int &devID, int &iSizeMultiple, sMatrixSize &matrix_size)
{
cudaError_t error; // 选择设备,略去了检查错误部分
devID = ;
if (checkCmdLineFlag(argc, (const char **)argv, "device"))
{
devID = getCmdLineArgumentInt(argc, (const char **)argv, "device");
error = cudaSetDevice(devID);
}
error = cudaGetDevice(&devID);
if (checkCmdLineFlag(argc, (const char **)argv, "sizemult"))
iSizeMultiple = getCmdLineArgumentInt(argc, (const char **)argv, "sizemult");
iSizeMultiple = max(min(iSizeMultiple, ), );
cudaDeviceProp deviceProp;
error = cudaGetDeviceProperties(&deviceProp, devID);
printf("GPU Device %d: \"%s\" with compute capability %d.%d\n\n", devID, deviceProp.name, deviceProp.major, deviceProp.minor); // Fermi 以上构架的计算机使用更大的线程块(blockDim),这里用了32
int block_size = (deviceProp.major < ) ? : ; matrix_size.uiWA = * block_size * iSizeMultiple;
matrix_size.uiHA = * block_size * iSizeMultiple;
matrix_size.uiWB = * block_size * iSizeMultiple;
matrix_size.uiHB = * block_size * iSizeMultiple;
matrix_size.uiWC = * block_size * iSizeMultiple;
matrix_size.uiHC = * block_size * iSizeMultiple; printf("MatrixA(%u,%u), MatrixB(%u,%u), MatrixC(%u,%u)\n",
matrix_size.uiHA, matrix_size.uiWA, matrix_size.uiHB, matrix_size.uiWB, matrix_size.uiHC, matrix_size.uiWC);
if (matrix_size.uiWA != matrix_size.uiHB || matrix_size.uiHA != matrix_size.uiHC || matrix_size.uiWB != matrix_size.uiWC)
{
printf("ERROR: Matrix sizes do not match!\n");
exit(-);
}
} // 计算矩阵乘法部分
int matrixMultiply(int argc, char **argv, int devID, sMatrixSize &matrix_size)
{
cudaDeviceProp deviceProp;
cudaGetDeviceProperties(&deviceProp, devID);
int block_size = (deviceProp.major < ) ? : ; unsigned int size_A = matrix_size.uiWA * matrix_size.uiHA;
unsigned int mem_size_A = sizeof(float) * size_A;
float *h_A = (float *)malloc(mem_size_A);
unsigned int size_B = matrix_size.uiWB * matrix_size.uiHB;
unsigned int mem_size_B = sizeof(float) * size_B;
float *h_B = (float *)malloc(mem_size_B); srand();
randomInit(h_A, size_A);
randomInit(h_B, size_B); float *d_A, *d_B, *d_C;
unsigned int size_C = matrix_size.uiWC * matrix_size.uiHC;
unsigned int mem_size_C = sizeof(float) * size_C;
//float *h_C = (float *) malloc(mem_size_C); // TM 没有用!
float *h_CUBLAS = (float *) malloc(mem_size_C); // 保存 d_C 回传的结果
cudaMalloc((void **) &d_A, mem_size_A);
cudaMalloc((void **) &d_B, mem_size_B);
cudaMemcpy(d_A, h_A, mem_size_A, cudaMemcpyHostToDevice);
cudaMemcpy(d_B, h_B, mem_size_B, cudaMemcpyHostToDevice);
cudaMalloc((void **) &d_C, mem_size_C); dim3 threads(block_size, block_size);
dim3 grid(matrix_size.uiWC / threads.x, matrix_size.uiHC / threads.y); printf("Computing result using CUBLAS...");
int nIter = ; //cuBLAS代码块
{
const float alpha = 1.0f;
const float beta = 0.0f;
cublasHandle_t handle;
cudaEvent_t start, stop; cublasCreate(&handle); //热身,注意转置的问题,不采用 <<< >>> 调用核函数
cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, matrix_size.uiWB, matrix_size.uiHA, matrix_size.uiWA,
&alpha, d_B, matrix_size.uiWB, d_A, matrix_size.uiWA, &beta, d_C, matrix_size.uiWB); cudaEventCreate(&start);
cudaEventCreate(&stop); cudaEventRecord(start, NULL); for (int j = ; j < nIter; j++)
{
cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, matrix_size.uiWB, matrix_size.uiHA, matrix_size.uiWA,
&alpha, d_B, matrix_size.uiWB, d_A, matrix_size.uiWA, &beta, d_C, matrix_size.uiWB);
}
printf("done.\n");
cudaEventRecord(stop, NULL);
cudaEventSynchronize(stop);
float msecTotal = 0.0f;
cudaEventElapsedTime(&msecTotal, start, stop); // 计算了耗时、操作数以及操作速度
float msecPerMatrixMul = msecTotal / nIter;
double flopsPerMatrixMul = 2.0 * (double)matrix_size.uiHC * (double)matrix_size.uiWC * (double)matrix_size.uiHB;
double gigaFlops = (flopsPerMatrixMul * 1.0e-9f) / (msecPerMatrixMul / 1000.0f);
printf("Performance= %.2f GFlop/s, Time= %.3f msec, Size= %.0f Ops\n", gigaFlops, msecPerMatrixMul, flopsPerMatrixMul); cudaMemcpy(h_CUBLAS, d_C, mem_size_C, cudaMemcpyDeviceToHost);
cublasDestroy(handle);
} printf("Computing result using host CPU...");
float *reference = (float *)malloc(mem_size_C);
matrixMulCPU(reference, h_A, h_B, matrix_size.uiHA, matrix_size.uiWA, matrix_size.uiWB);
printf("done.\n"); bool resCUBLAS = sdkCompareL2fe(reference, h_CUBLAS, size_C, 1.0e-6f); if (resCUBLAS != true)
printDiff(reference, h_CUBLAS, matrix_size.uiWC, matrix_size.uiHC, , 1.0e-5f); printf("Comparing CUBLAS Matrix Multiply with CPU results: %s\n", (true == resCUBLAS) ? "PASS" : "FAIL");
printf("\nNOTE: The CUDA Samples are not meant for performance measurements. Results may vary when GPU Boost is enabled.\n"); free(h_A);
free(h_B);
free(h_CUBLAS);
free(reference);
cudaFree(d_A);
cudaFree(d_B);
cudaFree(d_C); if (resCUBLAS == true)
return EXIT_SUCCESS;
else
return EXIT_FAILURE;
} int main(int argc, char **argv)
{
printf("[Matrix Multiply CUBLAS] - Starting...\n"); int devID = , sizeMult = ;
sMatrixSize matrix_size; initializeCUDA(argc, argv, devID, sizeMult, matrix_size); int matrix_result = matrixMultiply(argc, argv, devID, matrix_size); getchar();
return matrix_result;
}

▶ 输出结果:

[Matrix Multiply CUBLAS] - Starting...
GPU Device : "GeForce GTX 1070" with compute capability 6.1 MatrixA(,), MatrixB(,), MatrixC(,)
Computing result using CUBLAS...done.
Performance= 2887.08 GFlop/s, Time= 0.068 msec, Size= Ops
Computing result using host CPU...done.
Comparing CUBLAS Matrix Multiply with CPU results: PASS NOTE: The CUDA Samples are not meant for performance measurements. Results may vary when GPU Boost is enabled.

▶ 涨姿势:

● 代码依然很烂。
多用了一个 h_C 根本没有用上,其作用被 h_CUBLAS 取代了,而且源代码中有free(h_C)却没有free(h_CUBLAS)。

 float *h_C = (float *)malloc(mem_size_C);
...
free(h_C);

● 句柄的创造与销毁,定义于cublas_api.h 中

 /*cublas_api.h*/
typedef struct cublasContext *cublasHandle_t; /*cublas_v2.h*/
#define cublasCreate cublasCreate_v2
CUBLASAPI cublasStatus_t CUBLASWINAPI cublasCreate_v2(cublasHandle_t *handle); #define cublasDestroy cublasDestroy_v2
CUBLASAPI cublasStatus_t CUBLASWINAPI cublasDestroy_v2(cublasHandle_t handle);

● 矩阵乘法计算核心函数。实际上该函数不是用来专门计算矩阵乘法的,而且对应不同的数据类型(实数、复数)和数据精度(单精度、双精度)一共有四个函数。

 #define cublasSgemm cublasSgemm_v2
CUBLASAPI cublasStatus_t CUBLASWINAPI cublasSgemm_v2
(
cublasHandle_t handle,
cublasOperation_t transa, cublasOperation_t transb,
int m, int n, int k,
const float *alpha,
const float *A, int lda,
const float *B, int ldb,
const float *beta,
float *C, int ldc
);

● 定义在 helper_image.h 中的一个函数,用于比较两个长为 len 数组是否相等,允许容差为epsilon

 inline bool sdkCompareL2fe(const float *reference, const float *data, const unsigned int len, const float epsilon)

● 调用cublas计算矩阵乘法的过程摘要(详细的参数说明和调用规则见另一篇随笔)

 ...// 准备d_A,d_B,d_C,其中 d_A 和 d_B 中存放了需要相乘的两个矩阵,d_C初始化自动为零矩阵

 // 规定使用的线程块和线程尺寸
dim3 threads(, );
dim3 grid(, ); // 常数因子,计算 d_C = d_A * d_B 时设定为 α = 1.0, β = 0.0
const float alpha = 1.0f;
const float beta = 0.0f; // 创建句柄,需要在计算完成后销毁
cublasHandle_t handle;
cublasCreate(&handle); // 调用计算函数,注意参数顺序
cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, WB, HA, WA, &alpha, d_B, WB, d_A, WA, &beta, d_C, WB); cublasDestroy(handle); ...// 回收计算结果,顺序可以和销毁句柄交换

最新文章

  1. 玩转Windows服务系列——Debug、Release版本的注册和卸载,及其原理
  2. form表单 ----在路上(15)
  3. IOS开发之Bug--iOS7View被导航栏遮挡问题的解决
  4. 现在web前端这么火,钱景怎么样啊?
  5. 直播CDN架构随想
  6. git Please move or remove them before you can merge. 错误解决方案
  7. intall vs code in elementary os
  8. How Tomcat Works(十五)
  9. Android在一个APP中通过包名或类名启动另一个APP
  10. 【转】JDBC学习笔记(8)——数据库连接池(dbcp&amp;C3P0)
  11. 第二次项目冲刺(Beta阶段)第一天
  12. uml类图关系
  13. CI框架在模型中切换读写库和读写库
  14. Java编程基础篇第三章
  15. TensorFlow --playground游乐场
  16. linux操作小技巧锦集
  17. Winform异步解决窗体耗时操作(Action专门用于无返回值,Func专门用于有返回值)
  18. MyEclipse下自定义(支持html5的)JSP模板--JSP
  19. 并发编程之 AQS 源码剖析
  20. Linux驱动程序:统计单词个数

热门文章

  1. css左右布局的几种实现方式和优缺点
  2. 第一个asp.net MVC5+ExtJS6入门案例项目
  3. vue学习之父组件与子组件之间的交互
  4. fitnesse - 一个简单的例子(slim)
  5. HDU3336 Count the string
  6. Java高新技术 Myeclipse 介绍
  7. MVC中Controller控制器相关技术
  8. c#使用GDI+简单绘图(二)
  9. python学习记录-socket模块
  10. 完美实现身份证校验 js正则