编译选项: nvcc 4.cpp -o test_gemm -lcudart -lcuda -lcublas -std=c++11

#include <math.h>
#include <stdint.h>
#include <stdlib.h>
#include <sys/time.h>

#include <type_traits>

#include <cublas_v2.h>
#include <cuda.h>
#include <cuda_fp16.h>
#include <cuda_profiler_api.h>
#include <cuda_runtime.h>
#include <stdio.h> int8_t float2int8(float f, float scale) {
int8_t i = int8_t(f * scale);
if (i < -127) i = -127;
if (i > 127) i = 127;
return i;
} template <typename T, typename S>
void allocate_memory(int m, int n, int k, T **A, T **B, S **C) {
cudaMallocManaged(A, m * k * sizeof(T));
cudaMallocManaged(B, k * n * sizeof(T));
cudaMallocManaged(C, m * n * sizeof(S));
} template <typename T, typename S>
void free_memory(T *A, T *B, S *C) {
cudaFree(A);
cudaFree(B);
cudaFree(C);
} template <typename T, typename S>
int cublas_gemm_ex(cublasHandle_t handle, cublasOperation_t transA, cublasOperation_t transB,
int m, int n, int k, T *A, T *B, S *C, int lda, int ldb, int ldc,
S *alpha, S *beta, int algo) {
cudaDataType_t AType, BType, CType, ComputeType;
if (std::is_same<T, float>::value) {
AType = BType = CType = ComputeType = CUDA_R_32F;
} else if (std::is_same<T, __half>::value) {
AType = BType = CType = ComputeType = CUDA_R_16F;
} else if (std::is_same<T, int8_t>::value) {
AType = BType = CUDA_R_8I;
CType = ComputeType = CUDA_R_32I;
} else {
printf("Not supported data type.");
return -1;
}
cublasStatus_t status;
status = cublasGemmEx(handle,
transA,
transB,
m,
n,
k,
alpha,
A,
AType,
lda,
B,
BType,
ldb,
beta,
C,
CType,
ldc,
ComputeType,
static_cast<cublasGemmAlgo_t>(algo)); if (status == CUBLAS_STATUS_SUCCESS)
return 1;
else
return -1;
} template <typename T, typename S>
void test_gemm(cublasHandle_t handle, int m, int n, int k, T *A, T *B, S *C,
S *alpha, S *beta, int algo, int iteration) {
float total_time = 0;
for (int i = 0; i < iteration; ++i) {
struct timeval start, end;
cudaDeviceSynchronize();
cudaProfilerStart();
gettimeofday(&start, NULL);
int success = cublas_gemm_ex(handle,
CUBLAS_OP_N,
CUBLAS_OP_N,
n,
m,
k,
B,
A,
C,
n,
k,
n,
alpha,
beta,
static_cast<cublasGemmAlgo_t>(algo));
cudaDeviceSynchronize();
gettimeofday(&end, NULL);
cudaProfilerStop();
if (success > 0 && i > 0)
total_time += (end.tv_sec - start.tv_sec) * 1000 + (end.tv_usec - start.tv_usec) * 0.001;
}
if (total_time > 0)
printf("algo %d: %.3f ms\n", algo, total_time / (iteration - 1));
} int main() {
int m = 4096, n = 8192, k = 1024;
printf("shape: (%d, %d) x (%d, %d)\n", m, k, k, n);
int start_algo = CUBLAS_GEMM_DEFAULT;
int end_algo = CUBLAS_GEMM_ALGO23;
int start_algo_t_op = CUBLAS_GEMM_DEFAULT_TENSOR_OP;
int end_algo_t_op = CUBLAS_GEMM_ALGO15_TENSOR_OP;
int iteration = 10; float *fA, *fB, *fC;
__half *hA, *hB, *hC;
int8_t *iA, *iB; int32_t *iC;
float f_alpha = 1, f_beta = 0;
__half h_alpha = __float2half_rn(1.0), h_beta = __float2half_rn(0.0);
int32_t i_alpha = 1, i_beta = 0;
allocate_memory(m, n, k, &fA, &fB, &fC);
allocate_memory(m, n, k, &hA, &hB, &hC);
allocate_memory(m, n, k, &iA, &iB, &iC);
for (int i = 0; i < m * k; ++i) {
fA[i] = float(i % 255 - 127) / 127;
hA[i] = __float2half_rn(fA[i]);
iA[i] = float2int8(fA[i], 127);
}
for (int i = 0; i < k * n; ++i) {
fB[i] = float(i % 255 - 127) / 127;
hB[i] = __float2half_rn(fB[i]);
iB[i] = float2int8(fB[i], 127);
}
cublasHandle_t handle;
cublasCreate(&handle); printf(">>>>>>>>>>>>>>>>> test fp32 >>>>>>>>>>>>>>>>>\n");
for (int algo = start_algo; algo <= end_algo; ++algo)
test_gemm(handle, m, n, k, fA, fB, fC, &f_alpha, &f_beta, algo, iteration);
for (int algo = start_algo_t_op; algo <= end_algo_t_op; ++algo)
test_gemm(handle, m, n, k, fA, fB, fC, &f_alpha, &f_beta, algo, iteration); printf(">>>>>>>>>>>>>>>>> test fp16 >>>>>>>>>>>>>>>>>\n");
for (int algo = start_algo; algo <= end_algo; ++algo)
test_gemm(handle, m, n, k, hA, hB, hC, &h_alpha, &h_beta, algo, iteration);
for (int algo = start_algo_t_op; algo <= end_algo_t_op; ++algo)
test_gemm(handle, m, n, k, hA, hB, hC, &h_alpha, &h_beta, algo, iteration); printf(">>>>>>>>>>>>>>>>> test int8 >>>>>>>>>>>>>>>>>\n");
for (int algo = start_algo; algo <= end_algo; ++algo)
test_gemm(handle, m, n, k, iA, iB, iC, &i_alpha, &i_beta, algo, iteration);
for (int algo = start_algo_t_op; algo <= end_algo_t_op; ++algo)
test_gemm(handle, m, n, k, iA, iB, iC, &i_alpha, &i_beta, algo, iteration); printf(">>>>>>>>>>>>>>>>> compare result >>>>>>>>>>>>>>>>>\n");
printf("fp32: ");
for (int i = 0; i < 10; ++i)
printf("%.5f%c", fC[i], " \n"[i==9]);
printf("fp16: ");
for (int i = 0; i < 10; ++i)
printf("%.5f%c", float(hC[i]), " \n"[i==9]);
printf("int8: ");
for (int i = 0; i < 10; ++i)
printf("%.5f%c", float(iC[i])/127/127, " \n"[i==9]); free_memory(iA, iB, iC);
free_memory(fA, fB, fC);
free_memory(hA, hB, hC);
return 0;
}

最新文章

  1. 启动OracleDBConsoleorcl失败,提示错误代码2
  2. 用avalon实现一个完整的todomvc(带router)
  3. 7 天玩转 ASP.NET MVC — 第 4 天
  4. 1742. Team building(dfs)
  5. 时间处理得到UTC时间
  6. 布局神器:Flexbox
  7. NX-bridge,可以实现无线XBee控制的Arduino板
  8. nyoj 破门锁(水题)
  9. [数分提高]2014-2015-2第6教学周第1次课讲义 3.3 Taylor 公式
  10. MacOS下命令行安装神器brew
  11. jenkins 插件介绍
  12. zprofiler三板斧解决cpu占用率过高问题(转载)
  13. [转] Android开发之如何保证Service不被杀掉(broadcast+system/app)
  14. 【学习笔记】cache/buffer
  15. python自动化运维之路~DAY8
  16. Python logging 模块学习
  17. Spring4 MVC HelloWord实例
  18. 配置 struts2 时掉进 web.xml 的坑
  19. 转换CLOB字段类型为VARCHAR2, lob类型不支持的sql语句
  20. BZOJ 3160 FFT+马拉车

热门文章

  1. ERROR 2003 (HY000): Can't connect to MySQL server on 'localhost:3306' (10061)
  2. [数据库-Mongo总结]-mysql使用总结和pymysql交互
  3. Python中的__new__()方法
  4. JAVA丑数
  5. git -----已经被跟踪文件如何在本地提交时忽略
  6. html 1.0
  7. 转发-》c++ stl multimap基本操作使用技巧详细介绍
  8. Python_基础_Print_转义字符和原字符
  9. drools规则的入门使用
  10. SignalR+Redis,SignalR+Sqlserver集群部署应对海量链接