(1) softmax loss

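<1> softmax loss的函数形式为:

$$L = -\sum_{j=1}^{C} y_j \log a_j, \qquad a_j = \frac{e^{z_j}}{\sum_{k=1}^{C} e^{z_k}}$$

其中 $z_j$ 是 softmax 的第 $j$ 个输入,$a_j$ 是对应的输出概率,$y_j$ 是 one-hot 标签(真实类别处为 1,其余为 0),$C$ 是类别数。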

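<2> softmax loss对其输入zj求导:

$$\frac{\partial L}{\partial z_j} = a_j - y_j$$

其中 $a_j = e^{z_j} / \sum_k e^{z_k}$ 为 softmax 输出概率,$y_j$ 为 one-hot 标签。也就是说,梯度等于 softmax 的输出,只在真实类别的位置上再减去 1。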


(2) softmax_loss_layer.cpp中的Forward_cpu()函数:

 /**
  * @brief CPU forward pass: runs the internal softmax layer, then writes the
  *        normalized negative log-likelihood of the labels to top[0].
  *
  * @param bottom bottom[1] is read as the label blob. The predictions are
  *               consumed through softmax_bottom_vec_ (presumably bottom[0];
  *               that wiring is set up outside this function).
  * @param top    top[0] receives the scalar loss at index 0. If a second top
  *               blob is present, it is made to share prob_'s data.
  *
  * NOTE(review): this listing had lost its integer literals, the `continue`
  * for ignored labels, the `Dtype(FLT_MIN)` clamp, `++count`, the
  * `ShareData` call and its closing braces. They were restored to match
  * upstream Caffe's softmax_loss_layer.cpp; confirm against that file.
  * FLT_MIN requires <cfloat>.
  */
 template <typename Dtype>
 void SoftmaxWithLossLayer<Dtype>::Forward_cpu(
     const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top) {
   // The forward pass computes the softmax prob values.
   softmax_layer_->Forward(softmax_bottom_vec_, softmax_top_vec_);
   const Dtype* prob_data = prob_.cpu_data();
   // A loss layer normally has two input blobs: the network's prediction
   // blob (bottom[0]) and the label blob (bottom[1]).
   const Dtype* label = bottom[1]->cpu_data();
   // Stride between consecutive outer items in prob_.
   // Per the original note: dim = N*C*H*W / N = C*H*W.
   int dim = prob_.count() / outer_num_;
   // Number of labels that actually contributed to the loss.
   int count = 0;
   Dtype loss = 0;
   for (int i = 0; i < outer_num_; ++i) {
     for (int j = 0; j < inner_num_; j++) {
       const int label_value = static_cast<int>(label[i * inner_num_ + j]);
       if (has_ignore_label_ && label_value == ignore_label_) {
         // Ignored positions add nothing to the loss or to the count.
         continue;
       }
       DCHECK_GE(label_value, 0);
       DCHECK_LT(label_value, prob_.shape(softmax_axis_));
       // Clamp the probability away from zero so log() stays finite.
       loss -= log(std::max(prob_data[i * dim + label_value * inner_num_ + j],
                            Dtype(FLT_MIN)));
       ++count;
     }
   }
   top[0]->mutable_cpu_data()[0] = loss / get_normalizer(normalization_, count);
   if (top.size() == 2) {
     // Optional second output: expose the softmax probabilities.
     top[1]->ShareData(prob_);
   }
 }

(3) softmax_loss_layer.cpp中的Backward_cpu函数:

 /**
  * @brief CPU backward pass: writes the gradient of the softmax loss with
  *        respect to the predictions into bottom[0]'s diff.
  *
  * The gradient starts as a copy of the softmax output prob_. At each
  * non-ignored position, 1 is subtracted from the entry of the labelled
  * class. At ignored positions every class entry is zeroed. The whole diff
  * is then scaled by top[0]'s diff divided by the normalizer.
  *
  * @param top            top[0]->cpu_diff()[0] is read as the scale applied to
  *                       the gradient.
  * @param propagate_down propagate_down[1] (labels) must be false, otherwise
  *                       the layer aborts. propagate_down[0] enables the
  *                       computation.
  * @param bottom         bottom[0]'s diff is overwritten. bottom[1] is read as
  *                       the label blob.
  *
  * NOTE(review): this listing had lost its integer literals, `++count` and
  * its closing braces. They were restored to match upstream Caffe's
  * softmax_loss_layer.cpp; confirm against that file.
  */
 template <typename Dtype>
 void SoftmaxWithLossLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
     const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
   if (propagate_down[1]) {
     LOG(FATAL) << this->type()
                << " Layer cannot backpropagate to label inputs.";
   }
   if (propagate_down[0]) {
     Dtype* bottom_diff = bottom[0]->mutable_cpu_diff();
     const Dtype* prob_data = prob_.cpu_data();
     // Copy the softmax output prob_ into bottom[0]'s diff (gradient) blob.
     caffe_copy(prob_.count(), prob_data, bottom_diff);
     const Dtype* label = bottom[1]->cpu_data();
     int dim = prob_.count() / outer_num_;
     // Number of non-ignored labels, passed to the normalizer below.
     int count = 0;
     for (int i = 0; i < outer_num_; ++i) {
       for (int j = 0; j < inner_num_; ++j) {
         const int label_value = static_cast<int>(label[i * inner_num_ + j]);
         if (has_ignore_label_ && label_value == ignore_label_) {
           // Ignored position: no gradient flows to any class channel.
           for (int c = 0; c < bottom[0]->shape(softmax_axis_); ++c) {
             bottom_diff[i * dim + c * inner_num_ + j] = 0;
           }
         } else {
           // Gradient is prob - 1 at the labelled class.
           bottom_diff[i * dim + label_value * inner_num_ + j] -= 1;
           ++count;
         }
       }
     }
     // Scale gradient.
     // The original note states top[0]->cpu_diff()[0] = N, which would make
     // the scale N / count.
     // NOTE(review): upstream this value is the loss weight placed in top[0]'s
     // diff; confirm before relying on the "N" reading.
     Dtype loss_weight = top[0]->cpu_diff()[0] /
                         get_normalizer(normalization_, count);
     caffe_scal(prob_.count(), loss_weight, bottom_diff);
   }
 }


