Just dumping the code here...

It implements prediction of y = x1 + x2. Gradient descent converges really slowly... haha. The GPU execution itself is fast, though!!
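A quick sketch of what the code below does: the network has one hidden layer of NEURESIZE (50) sigmoid neurons and a single sigmoid output. For a sample (x1, x2), each hidden neuron computes yh[i] = sigmoid(w1[i]*x1 + w2[i]*x2), the output is y2 = sigmoid(sum over i of v[i]*yh[i]), and the weights are nudged by the output delta y2*(1-y2)*(y-y2) scaled by the learning rate RW = 0.1. The hidden-layer forward pass and the weight update each run as a CUDA kernel with one thread per neuron; the output layer is computed on the host.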

//
// main.cpp
// bp
//
// Created by jzc on 2018/4/18.
// Copyright © 2018 jzc. All rights reserved.
//
#include <stdio.h>
#include <iostream>
#include <time.h>
#include <stdlib.h>
#include <math.h>
#include <fstream>
#include <cuda_runtime.h>
using namespace std;
#define DATASIZE 10000
#define TESTSIZE 100
#define NEURESIZE 50
#define RW 0.1
#define EPOCH 1000
#define E 2.71828
// Print device properties
void printDeviceProp(const cudaDeviceProp &prop)
{
    printf("Device Name : %s.\n", prop.name);
    printf("totalGlobalMem : %ld.\n", prop.totalGlobalMem);
    printf("sharedMemPerBlock : %ld.\n", prop.sharedMemPerBlock);
    printf("regsPerBlock : %d.\n", prop.regsPerBlock);
    printf("warpSize : %d.\n", prop.warpSize);
    printf("memPitch : %ld.\n", prop.memPitch);
    printf("maxThreadsPerBlock : %d.\n", prop.maxThreadsPerBlock);
    printf("maxThreadsDim[0 - 2] : %d %d %d.\n", prop.maxThreadsDim[0], prop.maxThreadsDim[1], prop.maxThreadsDim[2]);
    printf("maxGridSize[0 - 2] : %d %d %d.\n", prop.maxGridSize[0], prop.maxGridSize[1], prop.maxGridSize[2]);
    printf("totalConstMem : %ld.\n", prop.totalConstMem);
    printf("major.minor : %d.%d.\n", prop.major, prop.minor);
    printf("clockRate : %d.\n", prop.clockRate);
    printf("textureAlignment : %ld.\n", prop.textureAlignment);
    printf("deviceOverlap : %d.\n", prop.deviceOverlap);
    printf("multiProcessorCount : %d.\n", prop.multiProcessorCount);
}

// Initialize CUDA
bool InitCUDA()
{
    int count; // Get the number of CUDA-capable devices
    cudaGetDeviceCount(&count);
    if (count == 0) {
        fprintf(stderr, "There is no device.\n");
        return false;
    }
    int i;
    for (i = 0; i < count; i++) {
        cudaDeviceProp prop;
        if (cudaGetDeviceProperties(&prop, i) == cudaSuccess) {
            // Print device properties
            printDeviceProp(prop);
            if (prop.major >= 1) {
                break;
            }
        }
    }
    if (i == count) {
        fprintf(stderr, "There is no device supporting CUDA 1.x.\n");
        return false;
    }
    cudaSetDevice(i);
    return true;
}

// Fill a[] with num random values in [-offset, range - offset)
void init(int num,int range,double a[],double offset){
    for(int i=0;i<num;i++){
        a[i] = (double)(rand()%(range*1000)/1000.0) - offset;
    }
}

// Find the min (m[0]) and max (m[1]) of a[]
void getM(int num,double a[],double m[]){
    m[0] = m[1] = 0.0;
    for(int i=0;i<num;i++){
        if(a[i]<m[0]){
            m[0] = a[i];
        }else if(a[i]>m[1]){
            m[1] = a[i];
        }
    }
}

// Map a[] into (0, 1) using the min/max stored in m[]
void normalize(int num,double a[],double m[]){
    for(int i=0;i<num;i++){
        a[i] = (a[i]-m[0]+1)/(m[1]-m[0]+2);
    }
}

// Inverse of normalize(): map back to the original range
void renorm(int num,double a[],double m[]){
    for(int i=0;i<num;i++){
        a[i] = a[i]*(m[1]-m[0]+2) + m[0] - 1;
    }
}

// Print num values, 10 per line
void printArray(int num,double a[]){
    for(int i=0;i<num;i++){
        printf("%6.4lf ",a[i]);
        if((i+1)%10==0){
            cout<<endl;
        }
    }
}

// Hidden-layer forward pass: one thread per hidden neuron
__global__ static void hidenLayer(double x1,double x2,double w1[],double w2[],double yh[]){
    /*for(int i=0;i<NEURESIZE;i++){
        yh[i] = w1[i]*x1 + w2[i]*x2;
        yh[i] = 1/(1+pow(E,0-yh[i]));
    }*/
    const int tid = threadIdx.x;
    int i = tid;
    yh[i] = w1[i]*x1 + w2[i]*x2;
    yh[i] = 1/(1+pow(E,0-yh[i])); // sigmoid activation
}

// Output layer (host side): weighted sum of hidden outputs through a sigmoid
double outLayer(double yh[],double v[]){
    double y2 = 0.0; // must be initialized before accumulating
    for(int i=0;i<NEURESIZE;i++){
        y2 += yh[i] * v[i];
    }
    y2 = 1/(1+pow(E,0-y2));
    return y2;
}

// Weight update: one thread per neuron; the output delta arrives in *loss
__global__ static void update(double x1[],double x2[],double yh[],double v[],double w1[],double w2[],double *loss){
    const int tid = threadIdx.x;
    int i = tid;
    /*for(int i=0;i<NEURESIZE;i++){
        w1[i] += x1[i] * (1-x1[i]) * loss * RW;
        w2[i] += x2[i] * (1-x2[i]) * loss * RW;
        v[i] += yh[i] * loss * RW;
    }*/
    w1[i] += x1[i] * (1-x1[i]) * (*loss) * RW;
    w2[i] += x2[i] * (1-x2[i]) * (*loss) * RW;
    v[i] += yh[i] * (*loss) * RW;
}

/*double test(double w1[],double w2[],double v[],double m1[],double m2[],double my[]){
    double tx1[TESTSIZE],tx2[TESTSIZE],ty[TESTSIZE],tyh[NEURESIZE],ty2[TESTSIZE];
    double avLoss = 0.0;
    init(TESTSIZE,10,tx1,0.0);
    init(TESTSIZE,10,tx2,0.0);
    for(int i=0;i<TESTSIZE;i++){
        ty[i] = tx1[i] + tx2[i];
    }
    normalize(TESTSIZE,tx1,m1);
    normalize(TESTSIZE,tx2,m2);
    for(int q=0;q<TESTSIZE;q++){
        hidenLayer(tx1[q],tx2[q],w1,w2,tyh);
        ty2[q] = outLayer(tyh,v);
    }
    renorm(TESTSIZE,ty2,my);
    for(int i=0;i<TESTSIZE;i++){
        if(i<10){
            printf("%2d y=%2.4f y2=%2.4f\n",i,ty[i],ty2[i]);
        }
        avLoss += pow(ty[i]-ty2[i],2);
    }
    avLoss /= TESTSIZE;
    //cout<<avLoss<<endl;
    return avLoss;
}*/

int main(){
    ofstream outf;
    outf.open("trainloss.txt");
    srand( (unsigned)time(NULL) );
    long starttime = clock();
    double x1[DATASIZE],x2[DATASIZE],y[DATASIZE],y2[DATASIZE];
    double w1[NEURESIZE],w2[NEURESIZE],v[NEURESIZE],yh[NEURESIZE];
    double m1[2],m2[2],my[2]; // each holds {min, max}
    double cLoss,realLoss,minTrainLoss = 1.0,minTestLoss = 1.0;
    init(DATASIZE,10,x1,0.0); // inputs in [0, 10)
    init(DATASIZE,10,x2,0.0);
    init(NEURESIZE,2,w1,1.0); // weights in [-1, 1)
    init(NEURESIZE,2,w2,1.0);
    init(NEURESIZE,2,v,1.0);
    for(int i=0;i<DATASIZE;i++){
        y[i] = x1[i] + x2[i];
    }
    // Initialize CUDA
    if (!InitCUDA()) {
        return 0;
    }
    // cudaMalloc: allocate device (GPU) memory
    double *x1_g,*x2_g,*y_g,*y2_g;
    double *w1_g,*w2_g,*v_g,*yh_g;
    double *cLoss_g;
    cudaMalloc((void**)&x1_g, sizeof(double)* DATASIZE);
    cudaMalloc((void**)&x2_g, sizeof(double)* DATASIZE);
    cudaMalloc((void**)&y_g, sizeof(double)* DATASIZE);
    cudaMalloc((void**)&y2_g, sizeof(double)* DATASIZE);
    cudaMalloc((void**)&w1_g, sizeof(double)* NEURESIZE);
    cudaMalloc((void**)&w2_g, sizeof(double)* NEURESIZE);
    cudaMalloc((void**)&v_g, sizeof(double)* NEURESIZE);
    cudaMalloc((void**)&yh_g, sizeof(double)* NEURESIZE);
    cudaMalloc((void**)&cLoss_g, sizeof(double));
    // Normalize on the host first, so that the device copies hold the
    // normalized data the update kernel expects
    getM(DATASIZE,x1,m1);
    getM(DATASIZE,x2,m2);
    getM(DATASIZE,y,my);
    normalize(DATASIZE,x1,m1);
    normalize(DATASIZE,x2,m2);
    normalize(DATASIZE,y,my);
    // cudaMemcpy: copy the host arrays into device memory
    // cudaMemcpyHostToDevice - copy from host memory to device memory
    // cudaMemcpyDeviceToHost - copy from device memory to host memory
    cudaMemcpy(w1_g,w1, sizeof(double)*NEURESIZE, cudaMemcpyHostToDevice);
    cudaMemcpy(w2_g,w2, sizeof(double)*NEURESIZE, cudaMemcpyHostToDevice);
    cudaMemcpy(v_g,v, sizeof(double)*NEURESIZE, cudaMemcpyHostToDevice);
    cudaMemcpy(x1_g,x1, sizeof(double)*DATASIZE, cudaMemcpyHostToDevice);
    cudaMemcpy(x2_g,x2, sizeof(double)*DATASIZE, cudaMemcpyHostToDevice);
    cudaMemcpy(y_g,y, sizeof(double)*DATASIZE, cudaMemcpyHostToDevice);
    cudaMemcpy(yh_g,yh, sizeof(double)*NEURESIZE, cudaMemcpyHostToDevice);
    cudaMemcpy(cLoss_g,&cLoss, sizeof(double), cudaMemcpyHostToDevice);
    for(int j=0;j<EPOCH;j++){
        double tLoss = 0.0;
        for(int i=0;i<DATASIZE;i++){
            // Forward pass: the current (host-side, normalized) sample is passed by value
            hidenLayer<<<1, NEURESIZE, 0>>>(x1[i],x2[i],w1_g,w2_g,yh_g);
            cudaMemcpy(yh,yh_g, sizeof(double)*NEURESIZE, cudaMemcpyDeviceToHost);
            cudaMemcpy(v,v_g, sizeof(double)*NEURESIZE, cudaMemcpyDeviceToHost);
            y2[i] = outLayer(yh,v);
            // Output-layer delta for this sample
            cLoss = y2[i] * (1-y2[i]) * (y[i]-y2[i]);
            cudaMemcpy(cLoss_g,&cLoss, sizeof(double), cudaMemcpyHostToDevice);
            update<<<1, NEURESIZE, 0>>>(x1_g,x2_g,yh_g,v_g,w1_g,w2_g,cLoss_g);
            cudaMemcpy(&cLoss,cLoss_g, sizeof(double), cudaMemcpyDeviceToHost); // a single double, not NEURESIZE of them
            cLoss = pow(cLoss,2);
            cLoss = cLoss*(my[1]-my[0]+2); // scale back toward the original range
            tLoss += cLoss;
        }
        tLoss /= DATASIZE;
        if(tLoss<minTrainLoss){
            minTrainLoss = tLoss;
        }
        printf("EPOCH--%d, trainLoss--%0.4f\n",j,tLoss);
        outf<<j<<"\t"<<tLoss<<endl;
        /*cudaMemcpy(w1,w1_g, sizeof(double)*NEURESIZE, cudaMemcpyDeviceToHost);
        cudaMemcpy(w2,w2_g, sizeof(double)*NEURESIZE, cudaMemcpyDeviceToHost);
        cudaMemcpy(v,v_g, sizeof(double)*NEURESIZE, cudaMemcpyDeviceToHost);
        double avLoss = test(w1,w2,v,m1,m2,my);
        printf("EPOCH--%d, avLoss--%0.4f\n",j,avLoss);
        if(avLoss<minTestLoss){
            minTestLoss = avLoss;
        }*/
        cout<<"------------------"<<endl;
    }
printf("minTrainLoss--%0.4f\n",minTrainLoss);
//printf("minTestLoss--%0.4f\n",minTestLoss);
outf.close(); //Free
cudaFree(x1_g);
cudaFree(x2_g);
cudaFree(y_g);
cudaFree(w1_g);
cudaFree(w2_g);
cudaFree(v_g);
cudaFree(yh_g);
cudaFree(cLoss_g); long endtime = clock()-starttime;
float execution_time = (float)endtime / ( * );
cout << "total time cost: " << execution_time<<endl; return ;
}
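
A note for anyone trying this: none of the cudaMalloc/cudaMemcpy/kernel-launch calls above check their return status, so a bad allocation or launch fails silently. Below is a minimal error-checking sketch, assuming only the standard CUDA runtime API (the CHECK macro name is my own addition, not part of the original code):

#include <cstdio>
#include <cstdlib>
#include <cuda_runtime.h>

// Hypothetical helper, not in the original post: wrap a CUDA runtime call,
// print the error string, and abort on failure.
#define CHECK(call)                                                   \
    do {                                                              \
        cudaError_t err_ = (call);                                    \
        if (err_ != cudaSuccess) {                                    \
            fprintf(stderr, "CUDA error '%s' at %s:%d\n",             \
                    cudaGetErrorString(err_), __FILE__, __LINE__);    \
            exit(EXIT_FAILURE);                                       \
        }                                                             \
    } while (0)

Usage would look like CHECK(cudaMemcpy(yh, yh_g, sizeof(double)*NEURESIZE, cudaMemcpyDeviceToHost)); and CHECK(cudaGetLastError()); right after each kernel launch. Also worth noting for the "slow" part: the inner loop does two kernel launches and four cudaMemcpy calls per sample, times 10000 samples and 1000 epochs, so launch and transfer overhead almost certainly dominates the runtime; keeping the whole loop on the device would likely help far more than a faster GPU. To build, save the file as main.cu and compile with something like nvcc main.cu -o bp.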
