A survey of LeNet, AlexNet, VGG16, GoogLeNet, ResNet and MLP yields the set of commonly used operators (ReLU excluded). For each operator below, the CPU and GPU implementations (the snippets are from Caffe) are compared.

PReLU

CPU version:

for (int i = 0; i < count; ++i) {
  int c = (i / dim) % channels / div_factor;
  top_data[i] = std::max(bottom_data[i], Dtype(0))
      + slope_data[c] * std::min(bottom_data[i], Dtype(0));
}

GPU version (kernel):

for (int index = blockIdx.x * blockDim.x + threadIdx.x;
     index < (n); index += blockDim.x * gridDim.x) {
  int c = (index / dim) % channels / div_factor;
  out[index] = in[index] > 0 ? in[index] : in[index] * slope_data[c];
}
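The hand-expanded grid-stride loop above (and in the kernels below) is what Caffe's CUDA_KERNEL_LOOP macro expands to. For reference, a minimal sketch of the macro plus a toy kernel that uses it (scale_kernel is an illustrative example, not a Caffe kernel):

#include <cuda_runtime.h>

// Grid-stride loop: each thread starts at its global index and advances by
// the total number of launched threads until all n elements are covered.
#define CUDA_KERNEL_LOOP(i, n) \
  for (int i = blockIdx.x * blockDim.x + threadIdx.x; \
       i < (n); \
       i += blockDim.x * gridDim.x)

// Toy example: scale n floats in place.
__global__ void scale_kernel(const int n, const float alpha, float* data) {
  CUDA_KERNEL_LOOP(index, n) {
    data[index] *= alpha;
  }
}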

Im2col

CPU version:

const int channel_size = height * width;
for (int channel = channels; channel--; data_im += channel_size) {
  for (int kernel_row = 0; kernel_row < kernel_h; kernel_row++) {
    for (int kernel_col = 0; kernel_col < kernel_w; kernel_col++) {
      int input_row = -pad_h + kernel_row * dilation_h;
      for (int output_rows = output_h; output_rows; output_rows--) {
        if (!is_a_ge_zero_and_a_lt_b(input_row, height)) {
          // the whole output row reads from the padding area
          for (int output_cols = output_w; output_cols; output_cols--) {
            *(data_col++) = 0;
          }
        } else {
          int input_col = -pad_w + kernel_col * dilation_w;
          for (int output_col = output_w; output_col; output_col--) {
            if (is_a_ge_zero_and_a_lt_b(input_col, width)) {
              *(data_col++) = data_im[input_row * width + input_col];
            } else {
              *(data_col++) = 0;  // padding column
            }
            input_col += stride_w;
          }
        }
        input_row += stride_h;
      }
    }
  }
}
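Here output_h and output_w are the spatial dimensions of the convolution output. They are computed before this snippet; the standard formula Caffe uses (taking dilation into account) is:

const int output_h = (height + 2 * pad_h -
    (dilation_h * (kernel_h - 1) + 1)) / stride_h + 1;
const int output_w = (width + 2 * pad_w -
    (dilation_w * (kernel_w - 1) + 1)) / stride_w + 1;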

GPU version (kernel):

for (int index = blockIdx.x * blockDim.x + threadIdx.x;
     index < (n); index += blockDim.x * gridDim.x) {
  const int h_index = index / width_col;
  const int h_col = h_index % height_col;
  const int w_col = index % width_col;
  const int c_im = h_index / height_col;
  const int c_col = c_im * kernel_h * kernel_w;
  const int h_offset = h_col * stride_h - pad_h;
  const int w_offset = w_col * stride_w - pad_w;
  Dtype* data_col_ptr = data_col;
  data_col_ptr += (c_col * height_col + h_col) * width_col + w_col;
  const Dtype* data_im_ptr = data_im;
  data_im_ptr += (c_im * height + h_offset) * width + w_offset;
  for (int i = 0; i < kernel_h; ++i) {
    for (int j = 0; j < kernel_w; ++j) {
      int h_im = h_offset + i * dilation_h;
      int w_im = w_offset + j * dilation_w;
      *data_col_ptr =
          (h_im >= 0 && w_im >= 0 && h_im < height && w_im < width) ?
          data_im_ptr[i * dilation_h * width + j * dilation_w] : 0;
      data_col_ptr += height_col * width_col;
    }
  }
}

Host code:

int num_kernels = channels * height_col * width_col;

im2col_gpu_kernel<Dtype><<<(num_kernels+511)/512, 512>>>( … …);
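For context, this im2col buffer is what turns convolution into a single matrix multiply. A rough, hedged sketch of the forward path for one image (layouts as commented; this mirrors, but simplifies, what Caffe's convolution layer does after calling im2col):

// weights:  [conv_out_channels x (channels * kernel_h * kernel_w)]
// col_buff: [(channels * kernel_h * kernel_w) x (output_h * output_w)]
// output:   [conv_out_channels x (output_h * output_w)]
im2col_gpu(input, channels, height, width, kernel_h, kernel_w,
    pad_h, pad_w, stride_h, stride_w, dilation_h, dilation_w, col_buff);
caffe_gpu_gemm<Dtype>(CblasNoTrans, CblasNoTrans,
    conv_out_channels, output_h * output_w,
    channels * kernel_h * kernel_w,
    (Dtype)1., weights, col_buff, (Dtype)0., output);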

Pool operator

CPU version (AVE pooling shown; MAX is similar; the stochastic variant is not covered):

for (int n = 0; n < bottom[0]->num(); ++n) {
  for (int c = 0; c < channels_; ++c) {
    for (int ph = 0; ph < pooled_height_; ++ph) {
      for (int pw = 0; pw < pooled_width_; ++pw) {
        int hstart = ph * stride_h_ - pad_h_;
        int wstart = pw * stride_w_ - pad_w_;
        int hend = min(hstart + kernel_h_, height_ + pad_h_);
        int wend = min(wstart + kernel_w_, width_ + pad_w_);
        int pool_size = (hend - hstart) * (wend - wstart);
        hstart = max(hstart, 0);
        wstart = max(wstart, 0);
        hend = min(hend, height_);
        wend = min(wend, width_);
        const int pool_index = ph * pooled_width_ + pw;
        for (int h = hstart; h < hend; ++h) {
          for (int w = wstart; w < wend; ++w) {   // AVE: accumulate the window
            top_data[pool_index] += bottom_data[h * width_ + w];
          }
        }
        top_data[pool_index] /= pool_size;        // AVE: divide by window size
      }
    }
    // compute offset
    bottom_data += bottom[0]->offset(0, 1);
    top_data += top[0]->offset(0, 1);
  }
}

GPU version (kernel; the snippet shown is the body of the max-pooling backward kernel, executed inside CUDA_KERNEL_LOOP(index, nthreads)):

const int w = index % width;
const int h = (index / width) % height;
const int c = (index / width / height) % channels;
const int n = index / width / height / channels;
// invert the pooling: which output windows could have read element (h, w)?
const int phstart =
    (h + pad_h < kernel_h) ? 0 : (h + pad_h - kernel_h) / stride_h + 1;
const int phend = min((h + pad_h) / stride_h + 1, pooled_height);
const int pwstart =
    (w + pad_w < kernel_w) ? 0 : (w + pad_w - kernel_w) / stride_w + 1;
const int pwend = min((w + pad_w) / stride_w + 1, pooled_width);
Dtype gradient = 0;
const int offset = (n * channels + c) * pooled_height * pooled_width;
const Dtype* const top_diff_slice = top_diff + offset;
if (mask) {
  const int* const mask_slice = mask + offset;
  for (int ph = phstart; ph < phend; ++ph) {
    for (int pw = pwstart; pw < pwend; ++pw) {
      // only windows whose recorded argmax is exactly this element get gradient
      if (mask_slice[ph * pooled_width + pw] == h * width + w) {
        gradient += top_diff_slice[ph * pooled_width + pw];
      }
    }
  }
}
bottom_diff[index] = gradient;
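As a sanity check of the window-inversion arithmetic: with kernel_h = 3, stride_h = 2, pad_h = 0 and h = 4, phstart = (4 - 3) / 2 + 1 = 1 and phend = min(4 / 2 + 1, pooled_height) = 3, so only output rows 1 and 2 are scanned, and indeed those are exactly the pooling windows (input rows 2-4 and 4-6) that contain input row 4.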

FC operator (InnerProduct)

CPU version (the GPU version is the same):

caffe_cpu_gemm<Dtype>(CblasNoTrans, transpose_ ? CblasNoTrans : CblasTrans,
    M_, N_, K_, (Dtype)1.,
    bottom_data, weight, (Dtype)0., top_data);
if (bias_term_) {
  caffe_cpu_gemm<Dtype>(CblasNoTrans, CblasNoTrans, M_, N_, 1, (Dtype)1.,
      bias_multiplier_.cpu_data(),
      this->blobs_[1]->cpu_data(), (Dtype)1., top_data);
}
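To make the GEMM dimensions concrete: M_ is the batch size, K_ the number of input features, N_ the number of outputs, and bias_multiplier_ is a length-M_ vector of ones, so the second GEMM adds the bias row to every sample. A hedged naive-loop sketch of what the two calls compute in the default case (transpose_ == false, weight stored as N_ x K_; the helper name and the explicit bias pointer are illustrative, not Caffe code):

// Illustration only: top = bottom * weight^T + bias.
template <typename Dtype>
void inner_product_reference(int M, int N, int K,
    const Dtype* bottom, const Dtype* weight,  // weight is N x K
    const Dtype* bias, Dtype* top) {
  for (int m = 0; m < M; ++m) {
    for (int n = 0; n < N; ++n) {
      Dtype acc = 0;
      for (int k = 0; k < K; ++k) {
        acc += bottom[m * K + k] * weight[n * K + k];
      }
      top[m * N + n] = acc + (bias ? bias[n] : Dtype(0));
    }
  }
}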

Dropout operator

CPU version:

// Create random numbers (Bernoulli with success probability 1 - threshold_)
caffe_rng_bernoulli(count, 1. - threshold_, mask);
for (int i = 0; i < count; ++i) {
  // inverted dropout: scale_ = 1 / (1 - threshold_) keeps the expected activation
  top_data[i] = bottom_data[i] * mask[i] * scale_;
}
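GPU version: not included in the original notes. As a hedged sketch, the forward kernel follows the same mask-and-scale idea (dropout_forward_sketch and its signature are illustrative, not Caffe's exact kernel; the GPU path generates its random mask on the device rather than with caffe_rng_bernoulli):

// Illustrative dropout forward kernel (grid-stride loop as above).
__global__ void dropout_forward_sketch(const int n, const float* in,
    const unsigned int* mask, const float scale, float* out) {
  for (int index = blockIdx.x * blockDim.x + threadIdx.x; index < n;
       index += blockDim.x * gridDim.x) {
    out[index] = in[index] * mask[index] * scale;
  }
}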

Softmax operator

CPU version:

for (int i = 0; i < outer_num_; ++i) {
  // initialize scale_data to the first plane
  caffe_copy(inner_num_, bottom_data + i * dim, scale_data);
  for (int j = 0; j < channels; j++) {
    for (int k = 0; k < inner_num_; k++) {
      scale_data[k] = std::max(scale_data[k],
          bottom_data[i * dim + j * inner_num_ + k]);
    }
  }
  // subtraction
  caffe_cpu_gemm<Dtype>(CblasNoTrans, CblasNoTrans, channels, inner_num_,
      1, -1., sum_multiplier_.cpu_data(), scale_data, 1., top_data);
  // exponentiation
  caffe_exp<Dtype>(dim, top_data, top_data);
  // sum after exp
  caffe_cpu_gemv<Dtype>(CblasTrans, channels, inner_num_, 1.,
      top_data, sum_multiplier_.cpu_data(), 0., scale_data);
  // division
  for (int j = 0; j < channels; j++) {
    caffe_div(inner_num_, top_data, scale_data, top_data);
    top_data += inner_num_;
  }
}
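The max-subtraction before exponentiation is the standard numerical-stability trick: softmax(x)_j = exp(x_j - max_k x_k) / sum_k exp(x_k - max_k x_k) gives the same result as the unshifted formula but cannot overflow. A stand-alone scalar sketch of what the loop computes for one spatial position (illustrative helper, not Caffe code):

#include <algorithm>
#include <cmath>
#include <vector>

std::vector<double> softmax(const std::vector<double>& x) {
  const double m = *std::max_element(x.begin(), x.end());  // max over channels
  std::vector<double> y(x.size());
  double sum = 0;
  for (std::size_t j = 0; j < x.size(); ++j) {
    y[j] = std::exp(x[j] - m);   // shifted exponentials
    sum += y[j];
  }
  for (std::size_t j = 0; j < x.size(); ++j) {
    y[j] /= sum;                 // normalize
  }
  return y;
}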

GPU version (kernel; kernel_channel_sum shown as an example):

int n = index / spatial_dim;
int s = index % spatial_dim;
Dtype sum = 0;
for (int c = 0; c < channels; ++c) {
  sum += data[(n * channels + c) * spatial_dim + s];
}
channel_sum[index] = sum;

Host code:

kernel_channel_max<Dtype><<<CAFFE_GET_BLOCKS(outer_num_ * inner_num_),
    CAFFE_CUDA_NUM_THREADS>>>(outer_num_, channels, inner_num_, top_data,
    scale_data);
kernel_channel_subtract<Dtype><<<CAFFE_GET_BLOCKS(count),
    CAFFE_CUDA_NUM_THREADS>>>(count, outer_num_, channels, inner_num_,
    scale_data, top_data);
kernel_exp<Dtype><<<CAFFE_GET_BLOCKS(count), CAFFE_CUDA_NUM_THREADS>>>(
    count, top_data, top_data);
kernel_channel_sum<Dtype><<<CAFFE_GET_BLOCKS(outer_num_ * inner_num_),
    CAFFE_CUDA_NUM_THREADS>>>(outer_num_, channels, inner_num_, top_data,
    scale_data);
kernel_channel_div<Dtype><<<CAFFE_GET_BLOCKS(count),
    CAFFE_CUDA_NUM_THREADS>>>(count, outer_num_, channels, inner_num_,
    scale_data, top_data);

Sigmoid operator

CPU version:

for (int i = 0; i < count; ++i) {
  top_data[i] = sigmoid(bottom_data[i]);
}

Sigmoid:

inline Dtype sigmoid(Dtype x) {
  return 0.5 * tanh(0.5 * x) + 0.5;
}

GPU version (kernel):

out[index] = 0.5 * tanh(0.5 * in[index]) + 0.5;
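The tanh form is algebraically identical to the familiar logistic function: since tanh(x/2) = (1 - exp(-x)) / (1 + exp(-x)), we get 0.5 * tanh(0.5 * x) + 0.5 = 1 / (1 + exp(-x)); writing it with the bounded tanh avoids overflowing exp() for large negative inputs.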

LRN operator

CPU version (the GPU version is identical):

split_layer_->Forward(bottom, split_top_vec_);
square_layer_->Forward(square_bottom_vec_, square_top_vec_);  // PowerLayer
pool_layer_->Forward(square_top_vec_, pool_top_vec_);
power_layer_->Forward(pool_top_vec_, power_top_vec_);
product_layer_->Forward(product_bottom_vec_, top);  // EltwiseLayer
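This is the within-channel LRN path, built entirely from other layers: square the input (PowerLayer), average the squares over the local window (average pooling), raise the shifted, scaled average to the power -beta (PowerLayer again), and multiply elementwise with the original input (EltwiseLayer). In rough scalar form each output is y = x * pow(1 + alpha * mean(x^2 over the local window), -beta); the exact parameterization (how alpha relates to the window size, the shift value, etc.) should be checked against the layer setup code.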

Split operator

CPU version (the GPU version is identical):

for (int i = 0; i < top.size(); ++i) {
  top[i]->ShareData(*bottom[0]);
}

ShareData:

void Blob<Dtype>::ShareData(const Blob& other) {
  CHECK_EQ(count_, other.count());
  data_ = other.data();
}

Eltwise operator

CPU version (the MAX operation shown):

// Initialize
mask = max_idx_.mutable_cpu_data();
caffe_set(count, -1, mask);
caffe_set(count, Dtype(-FLT_MAX), top_data);
// bottom 0 & 1
bottom_data_a = bottom[0]->cpu_data();
bottom_data_b = bottom[1]->cpu_data();
for (int idx = 0; idx < count; ++idx) {
  if (bottom_data_a[idx] > bottom_data_b[idx]) {
    top_data[idx] = bottom_data_a[idx];  // maxval
    mask[idx] = 0;  // maxid
  } else {
    top_data[idx] = bottom_data_b[idx];  // maxval
    mask[idx] = 1;  // maxid
  }
}
// bottom 2++
for (int blob_idx = 2; blob_idx < bottom.size(); ++blob_idx) {
  bottom_data_b = bottom[blob_idx]->cpu_data();
  for (int idx = 0; idx < count; ++idx) {
    if (bottom_data_b[idx] > top_data[idx]) {
      top_data[idx] = bottom_data_b[idx];  // maxval
      mask[idx] = blob_idx;  // maxid
    }
  }
}

GPU version (kernel):

Dtype maxval = -FLT_MAX;
int maxidx = -1;
if (bottom_data_a[index] > bottom_data_b[index]) {
  // only update for very first bottom_data blob (blob_idx == 0)
  if (blob_idx == 0) {
    maxval = bottom_data_a[index];
    top_data[index] = maxval;
    maxidx = blob_idx;
    mask[index] = maxidx;
  }
} else {
  maxval = bottom_data_b[index];
  top_data[index] = maxval;
  maxidx = blob_idx + 1;
  mask[index] = maxidx;
}

Host code:

mask = max_idx_.mutable_gpu_data();
MaxForward<Dtype><<<CAFFE_GET_BLOCKS(count), CAFFE_CUDA_NUM_THREADS>>>(
    count, bottom[0]->gpu_data(), bottom[1]->gpu_data(), 0, top_data, mask);
for (int i = 2; i < bottom.size(); ++i) {
  MaxForward<Dtype><<<CAFFE_GET_BLOCKS(count), CAFFE_CUDA_NUM_THREADS>>>(
      count, top_data, bottom[i]->gpu_data(), i - 1, top_data, mask);
}
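Note how the host side folds an arbitrary number of inputs into pairwise launches: the first launch compares bottom[0] with bottom[1] using blob_idx = 0 (so the mask records 0 or 1), and each later launch compares the running top_data with bottom[i] using blob_idx = i - 1, so when the new input wins, the kernel's else branch writes maxidx = blob_idx + 1 = i.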

SigmoidCrossEntropyLoss operator

CPU version:

for (int i = 0; i < bottom[0]->count(); ++i) {
  const int target_value = static_cast<int>(target[i]);
  if (has_ignore_label_ && target_value == ignore_label_) {
    continue;
  }
  loss -= input_data[i] * (target[i] - (input_data[i] >= 0)) -
      log(1 + exp(input_data[i] - 2 * input_data[i] * (input_data[i] >= 0)));
  ++valid_count;
}
// get_normalizer() clamps the result, returning max(1.0, normalizer)
normalizer_ = get_normalizer(normalization_, valid_count);
top[0]->mutable_cpu_data()[0] = loss / normalizer_;
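The odd-looking expression is the numerically stable form of the per-element cross-entropy. With p = sigmoid(x) and target t, the loss -(t * log(p) + (1 - t) * log(1 - p)) simplifies to x - x * t + log(1 + exp(-x)); the code's x * (t - (x >= 0)) - log(1 + exp(x - 2 * x * (x >= 0))), negated by the loss -= ..., is that same quantity written so that the argument of exp() is never positive: for x >= 0 it evaluates to x - x * t + log(1 + exp(-x)), and for x < 0 to -x * t + log(1 + exp(x)), which are algebraically equal.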

GPU version (kernel):

const int target_value = static_cast<int>(target[i]);
if (has_ignore_label_ && target_value == ignore_label_) {
  loss[i] = 0;
  counts[i] = 0;
} else {
  loss[i] = input_data[i] * (target[i] - (input_data[i] >= 0)) -
      log(1 + exp(input_data[i] - 2 * input_data[i] *
      (input_data[i] >= 0)));
  counts[i] = 1;
}

Host code:

SigmoidCrossEntropyLossForwardGPU<Dtype><<<CAFFE_GET_BLOCKS(count),
    CAFFE_CUDA_NUM_THREADS>>>(count, input_data, target, loss_data,
    has_ignore_label_, ignore_label_, count_data);
// Only launch another CUDA kernel if we actually need the valid count.
if (normalization_ == LossParameter_NormalizationMode_VALID &&
    has_ignore_label_) {
  caffe_gpu_asum(count, count_data, &valid_count);
} else {
  valid_count = count;
}

Scale operator

CPU version:

for (int n = 0; n < outer_dim_; ++n) {
  for (int d = 0; d < scale_dim_; ++d) {
    const Dtype factor = scale_data[d];
    caffe_cpu_scale(inner_dim_, factor, bottom_data, top_data);
    bottom_data += inner_dim_;
    top_data += inner_dim_;
  }
}

caffe_cpu_scale copies X into Y and multiplies it by alpha (in Caffe it is implemented with cblas copy/scal); it is equivalent to:

for (int i = 0; i < N; ++i) {
  Y[i] = alpha * X[i];
}

GPU version (kernel):

const int scale_index = (index / inner_dim) % scale_dim;

out[index] = in[index] * scale[scale_index];
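For the common case of scaling an N x C x H x W blob along the channel axis (axis = 1 with a single scale axis), outer_dim_ = N, scale_dim_ = C and inner_dim_ = H * W, so every element of channel c is multiplied by scale_data[c]; the kernel's (index / inner_dim) % scale_dim recovers exactly that channel index.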

BatchNorm operator

CPU version: the code is too long to include here; analysis still to be done.

GPU version: the logic is identical to the CPU version, except that every math call becomes the corresponding caffe_gpu_* routine (caffe_gpu_gemm and so on).
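Even without walking the full source, the core computation is a per-channel normalization. A hedged scalar sketch (the helper name and variables mean, variance, eps, channels and inner_dim are illustrative, not Caffe's; note that Caffe's BatchNormLayer only normalizes, and the learned scale/shift is handled by the separate Scale layer):

#include <cmath>

// Illustrative per-channel normalization: y = (x - mean[c]) / sqrt(var[c] + eps)
template <typename Dtype>
void batch_norm_reference(int channels, int inner_dim,
    const Dtype* mean, const Dtype* variance, Dtype eps,
    const Dtype* bottom, Dtype* top) {
  for (int c = 0; c < channels; ++c) {
    const Dtype inv_std = Dtype(1) / std::sqrt(variance[c] + eps);
    for (int i = 0; i < inner_dim; ++i) {
      top[c * inner_dim + i] = (bottom[c * inner_dim + i] - mean[c]) * inv_std;
    }
  }
}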
