This article is the author's original work; please do not reproduce it without permission. https://blog.csdn.net/fengbingchun/article/details/79235028

For an introduction to Principal Components Analysis (PCA), see: http://blog.csdn.net/fengbingchun/article/details/78977202

Below is a C++ implementation of PCA, modeled on the cv::PCA class in OpenCV 3.3.

The ORL Faces Database provides the test images. For an introduction to the ORL Faces Database, see: http://blog.csdn.net/fengbingchun/article/details/79008891
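For reference, the same round trip with OpenCV's own cv::PCA looks roughly like the sketch below (a minimal, hypothetical sketch that uses random data in place of the ORL images; it only illustrates the API that the class below mimics):

#include <opencv2/opencv.hpp>
#include <cstdio>

int main()
{
    // 15 samples, one flattened 92x112 ORL image per row; random values stand in for real images
    cv::Mat data(15, 92 * 112, CV_32FC1);
    cv::randu(data, cv::Scalar::all(0), cv::Scalar::all(255));

    cv::PCA pca(data, cv::Mat(), cv::PCA::DATA_AS_ROW, 0.95); // retain 95% of the variance
    cv::Mat point = pca.project(data.row(0));        // project an image into the eigenspace
    cv::Mat reconstruction = pca.backProject(point); // reconstruct the image from the "point"
    std::printf("projected dimensions: %d\n", point.cols);

    return 0;
}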

pca.hpp:

#ifndef FBC_NN_PCA_HPP_
#define FBC_NN_PCA_HPP_

#include <vector>
#include <string>

namespace ANN {

template<typename T = float>
class PCA {
public:
PCA() = default;
int load_data(const std::vector<std::vector<T>>& data, const std::vector<T>& labels);
int set_max_components(int max_components);
int set_retained_variance(double retained_variance);
int load_model(const std::string& model);
int train(const std::string& model);
// project into the eigenspace, thus the image becomes a "point"
int project(const std::vector<T>& vec, std::vector<T>& result) const;
// re-create the image from the "point"
int back_project(const std::vector<T>& vec, std::vector<T>& result) const;

private:
// model file layout: width,height,eigen_vectors; width,height,eigen_values; width,height,mean
int save_model(const std::string& model) const;
void calculate_covariance_matrix(std::vector<std::vector<T>>& covar, bool scale = false); // calculate covariance matrix
int eigen(const std::vector<std::vector<T>>& mat, bool sort_ = true); // calculate eigen vectors and eigen values
// generalized matrix multiplication: dst = alpha*op(src1)*op(src2) + beta*op(src3)
int gemm(const std::vector<std::vector<T>>& src1, const std::vector<std::vector<T>>& src2, double alpha,
const std::vector<std::vector<T>>& src3, double beta, std::vector<std::vector<T>>& dst, int flags = 0) const;
int gemm(const std::vector<T>& src1, const std::vector<std::vector<T>>& src2, double alpha,
const std::vector<T>& src3, double beta, std::vector<T>& dst, int flags = 0) const; // GEMM_2_T: flags = 1
int normalize(T* dst, int length);
int computeCumulativeEnergy() const;
int subtract(const std::vector<T>& vec1, const std::vector<T>& vec2, std::vector<T>& result) const;

typedef struct Size_ {
int width;
int height;
} Size_;

std::vector<std::vector<T>> data;
std::vector<T> labels;
int samples_num = 0;
int features_length = 0;
double retained_variance = -1.; // percentage of variance that PCA should retain
int max_components = -1; // maximum number of components that PCA should retain
std::vector<std::vector<T>> eigen_vectors; // eigenvectors of the covariance matrix, stored as rows
std::vector<T> eigen_values; // eigenvalues of the covariance matrix
std::vector<T> mean;
int covar_flags = 0; // when features_length > samples_num, covar_flags is 0, otherwise is 1
};

} // namespace ANN

#endif // FBC_NN_PCA_HPP_

pca.cpp:

#include "pca.hpp"
#include <iostream>
#include <vector>
#include <algorithm>
#include <cmath>
#include <memory>
#include <fstream>
#include "common.hpp"

namespace ANN {

template<typename T>
int PCA<T>::load_data(const std::vector<std::vector<T>>& data, const std::vector<T>& labels)
{
this->samples_num = data.size();
this->features_length = data[0].size();
if (samples_num > features_length) {
fprintf(stderr, "now only support samples_num <= features_length\n");
return -1;
}

this->data.resize(this->samples_num);
for (int i = 0; i < this->samples_num; ++i) {
this->data[i].resize(this->features_length);
memcpy(this->data[i].data(), data[i].data(), sizeof(T)* this->features_length);
}

this->labels.resize(this->samples_num);
memcpy(this->labels.data(), labels.data(), sizeof(T)*this->samples_num);

return 0;
}

template<typename T>
int PCA<T>::set_max_components(int max_components)
{
CHECK(data.size() > 0);
int count = std::min(features_length, samples_num);
if (max_components > 0) {
this->max_components = std::min(count, max_components);
}
this->retained_variance = -1.;

return 0;
}

template<typename T>
int PCA<T>::set_retained_variance(double retained_variance)
{
CHECK(retained_variance > 0 && retained_variance <= 1);
this->retained_variance = retained_variance;
this->max_components = -1;

return 0;
}

template<typename T>
void PCA<T>::calculate_covariance_matrix(std::vector<std::vector<T>>& covar, bool scale)
{
const int rows = samples_num;
const int cols = features_length;
const int nsamples = rows;
double scale_ = 1.;
if (scale) scale_ = 1. / (nsamples /*- 1*/);

mean.resize(cols, (T)0.);
for (int w = 0; w < cols; ++w) {
for (int h = 0; h < rows; ++h) {
mean[w] += data[h][w];
}
}

for (auto& value : mean) {
value = 1. / rows * value;
}

// int dsize = ata ? src.cols : src.rows; // ata = false;
int dsize = rows;
covar.resize(dsize);
for (int i = 0; i < dsize; ++i) {
covar[i].resize(dsize, (T)0.);
}

Size_ size{ (int)data[0].size(), (int)data.size() };

T* tdst = covar[0].data();
int delta_cols = mean.size();
T delta_buf[4];
int delta_shift = delta_cols == size.width ? 4 : 0;
std::unique_ptr<T[]> buf(new T[size.width]);
T* row_buf = buf.get();

for (int i = 0; i < size.height; ++i) {
const T* tsrc1 = data[i].data();
const T* tdelta1 = mean.data();

for (int k = 0; k < size.width; ++k) {
row_buf[k] = tsrc1[k] - tdelta1[k];
}

for (int j = i; j < size.height; ++j) {
double s = 0;
const T* tsrc2 = data[j].data();
const T* tdelta2 = mean.data();

for (int k = 0; k < size.width; ++k) {
s += (double)row_buf[k] * (tsrc2[k] - tdelta2[k]);
}

tdst[j] = (T)(s * scale_);
}

if (i < covar.size()-1) {
tdst = covar[i + 1].data();
}
}
}
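
// Note: because load_data() requires samples_num <= features_length, the matrix built here is the
// small samples_num x samples_num "scrambled" scatter matrix (X - mean) * (X - mean)^T * scale_
// rather than the full features_length x features_length covariance; only its upper triangle is
// filled, which is all the Jacobi solver in eigen() reads. train() later maps its eigenvectors
// back to feature space through gemm().
// Worked example with illustrative values: data = {{1,2,3},{3,2,1}}, mean = {2,2,2}, scale_ = 1/2
// => centered rows (-1,0,1) and (1,0,-1), so covar[0] = {1,-1} and covar[1][1] = 1.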

namespace {

template<typename _Tp>
static inline _Tp hypot_(_Tp a, _Tp b)
{
a = std::abs(a);
b = std::abs(b);
if (a > b) {
b /= a;
return a*std::sqrt(1 + b*b);
}
if (b > 0) {
a /= b;
return b*std::sqrt(1 + a*a);
}
return 0;
}

} // namespace

template<typename T>
int PCA<T>::eigen(const std::vector<std::vector<T>>& mat, bool sort_)
{
using _Tp = T; // typedef T _Tp;
auto n = mat.size();
for (const auto& m : mat) {
if (m.size() != n) {
fprintf(stderr, "mat must be square and it should be a real symmetric matrix\n");
return -1;
}
}

eigen_values.resize(n, (T)0.);
std::vector<T> V(n*n, (T)0.);
for (int i = 0; i < n; ++i) {
V[n * i + i] = (_Tp)1;
eigen_values[i] = mat[i][i];
}

const _Tp eps = std::numeric_limits<_Tp>::epsilon();
int maxIters{ (int)n * (int)n * 30 };
_Tp mv{ (_Tp)0 };
std::vector<int> indR(n, 0), indC(n, 0);
std::vector<_Tp> A;
for (int i = 0; i < n; ++i) {
A.insert(A.begin() + i * n, mat[i].begin(), mat[i].end());
}

for (int k = 0; k < n; ++k) {
int m, i;
if (k < n - 1) {
for (m = k + 1, mv = std::abs(A[n*k + m]), i = k + 2; i < n; i++) {
_Tp val = std::abs(A[n*k + i]);
if (mv < val)
mv = val, m = i;
}
indR[k] = m;
}
if (k > 0) {
for (m = 0, mv = std::abs(A[k]), i = 1; i < k; i++) {
_Tp val = std::abs(A[n*i + k]);
if (mv < val)
mv = val, m = i;
}
indC[k] = m;
}
}

if (n > 1) for (int iters = 0; iters < maxIters; iters++) {
int k, i, m;
// find index (k,l) of pivot p
for (k = 0, mv = std::abs(A[indR[0]]), i = 1; i < n - 1; i++) {
_Tp val = std::abs(A[n*i + indR[i]]);
if (mv < val)
mv = val, k = i;
}
int l = indR[k];
for (i = 1; i < n; i++) {
_Tp val = std::abs(A[n*indC[i] + i]);
if (mv < val)
mv = val, k = indC[i], l = i;
}

_Tp p = A[n*k + l];
if (std::abs(p) <= eps)
break;
_Tp y = (_Tp)((eigen_values[l] - eigen_values[k])*0.5);
_Tp t = std::abs(y) + hypot_(p, y);
_Tp s = hypot_(p, t);
_Tp c = t / s;
s = p / s; t = (p / t)*p;
if (y < 0)
s = -s, t = -t;
A[n*k + l] = 0;

eigen_values[k] -= t;
eigen_values[l] += t;

_Tp a0, b0;

#undef rotate
#define rotate(v0, v1) a0 = v0, b0 = v1, v0 = a0*c - b0*s, v1 = a0*s + b0*c

// rotate rows and columns k and l
for (i = 0; i < k; i++)
rotate(A[n*i + k], A[n*i + l]);
for (i = k + 1; i < l; i++)
rotate(A[n*k + i], A[n*i + l]);
for (i = l + 1; i < n; i++)
rotate(A[n*k + i], A[n*l + i]);

// rotate eigenvectors
for (i = 0; i < n; i++)
rotate(V[n*k + i], V[n*l + i]);

#undef rotate

for (int j = 0; j < 2; j++) {
int idx = j == 0 ? k : l;
if (idx < n - 1) {
for (m = idx + 1, mv = std::abs(A[n*idx + m]), i = idx + 2; i < n; i++) {
_Tp val = std::abs(A[n*idx + i]);
if (mv < val)
mv = val, m = i;
}
indR[idx] = m;
}
if (idx > 0) {
for (m = 0, mv = std::abs(A[idx]), i = 1; i < idx; i++) {
_Tp val = std::abs(A[n*i + idx]);
if (mv < val)
mv = val, m = i;
}
indC[idx] = m;
}
}
}

// sort eigenvalues & eigenvectors
if (sort_) {
for (int k = 0; k < n - 1; k++) {
int m = k;
for (int i = k + 1; i < n; i++) {
if (eigen_values[m] < eigen_values[i])
m = i;
}
if (k != m) {
std::swap(eigen_values[m], eigen_values[k]);
for (int i = 0; i < n; i++)
std::swap(V[n*m + i], V[n*k + i]);
}
}
}

eigen_vectors.resize(n);
for (int i = 0; i < n; ++i) {
eigen_vectors[i].resize(n);
eigen_vectors[i].assign(V.begin() + i * n, V.begin() + i * n + n);
}

return 0;
}
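
// Note: eigen() is a cyclic Jacobi eigenvalue solver for real symmetric matrices, following the
// implementation inside OpenCV; it reads only the upper triangle of mat, returns the eigenvalues
// in eigen_values and the corresponding eigenvectors as the rows of eigen_vectors, sorted in
// descending order of eigenvalue when sort_ is true.
// Worked example: mat = {{2,1},{1,2}} gives eigen_values = {3,1} and, up to sign,
// eigen_vectors = {{1/sqrt(2), 1/sqrt(2)}, {1/sqrt(2), -1/sqrt(2)}}.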

template<typename T>
int PCA<T>::gemm(const std::vector<std::vector<T>>& src1, const std::vector<std::vector<T>>& src2, double alpha,
const std::vector<std::vector<T>>& src3, double beta, std::vector<std::vector<T>>& dst, int flags) const
{
CHECK(flags == 0); // now only support flags = 0
CHECK(typeid(T) == typeid(float) || typeid(T) == typeid(double)); // T can only be float or double
CHECK(beta == 0. && src3.size() == 0);

Size_ a_size{ (int)src1[0].size(), (int)src1.size() }, d_size{ (int)src2[0].size(), a_size.height };
int len{ (int)src2.size() };
CHECK(a_size.width == len); // inner dimension: columns of src1 must equal rows of src2
CHECK(d_size.height == dst.size() && d_size.width == dst[0].size());

for (int y = 0; y < d_size.height; ++y) {
for (int x = 0; x < d_size.width; ++x) {
dst[y][x] = 0.;
for (int t = 0; t < len; ++t) {
dst[y][x] += src1[y][t] * src2[t][x];
}

dst[y][x] *= alpha;
}
}

return 0;
}
template<typename T>
int PCA<T>::gemm(const std::vector<T>& src1, const std::vector<std::vector<T>>& src2, double alpha,
const std::vector<T>& src3, double beta, std::vector<T>& dst, int flags) const
{
CHECK(flags == 0 || flags == 1); // when flags = 1, GEMM_2_T
CHECK(typeid(T) == typeid(float) || typeid(T) == typeid(double)); // T can only be float or double

Size_ a_size{ (int)src1.size(), 1 }, d_size;
int len = 0;

switch (flags) {
case 0: // no transpose: dst = alpha * src1 * src2 + beta * src3
d_size = Size_{ (int)src2[0].size(), a_size.height };
len = (int)src2.size();
CHECK(a_size.width == len);
break;
case 1: // GEMM_2_T: dst = alpha * src1 * src2.t() + beta * src3
d_size = Size_{ (int)src2.size(), a_size.height };
len = (int)src2[0].size();
CHECK(a_size.width == len);
break;
}

if (!src3.empty()) {
CHECK(src3.size() == d_size.width);
}

dst.resize(d_size.width);

const T* src3_ = nullptr;
std::vector<T> tmp(dst.size(), (T)0.);
if (src3.empty()) {
src3_ = tmp.data();
} else {
src3_ = src3.data();
}

if (flags == 0) { // no transpose: dst[i] = sum_j src1[j] * src2[j][i]
for (int i = 0; i < dst.size(); ++i) {
dst[i] = (T)0.;
for (int j = 0; j < src2.size(); ++j) {
dst[i] += src1[j] * src2[j][i];
}
dst[i] *= alpha;
dst[i] += beta * src3_[i];
}
} else { // GEMM_2_T: dst[i] = sum_j src1[j] * src2[i][j]
for (int i = 0; i < dst.size(); ++i) {
dst[i] = (T)0.;
for (int j = 0; j < src1.size(); ++j) {
dst[i] += src1[j] * src2[i][j];
}
dst[i] *= alpha;
dst[i] += beta * src3_[i];
}
}

return 0;
}

template<typename T>
int PCA<T>::normalize(T* dst, int length)
{
T s = (T)0., a = (T)1.;
for (int i = 0; i < length; ++i) {
s += dst[i] * dst[i];
}

s = std::sqrt(s);
s = s > DBL_EPSILON ? a / s : 0.;

for (int i = 0; i < length; ++i) {
dst[i] *= s;
}

return 0;
}

template<typename T>
int PCA<T>::computeCumulativeEnergy() const
{
std::vector<T> g(eigen_values.size(), (T)0.);
for (int ig = 0; ig < eigen_values.size(); ++ig) {
for (int im = 0; im <= ig; ++im) {
g[ig] += eigen_values[im];
}
}

int L{ 0 };
for (L = 0; L < eigen_values.size(); ++L) {
double energy = g[L] / g[eigen_values.size() - 1];
if (energy > retained_variance) break;
}

L = std::max(2, L);

return L;
}
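
// Worked example with illustrative values: eigen_values = {4,3,2,1}, retained_variance = 0.7
// => cumulative energies {0.4, 0.7, 0.9, 1.0}; the loop stops at the first index whose energy
// strictly exceeds 0.7, i.e. L = 2, so two components are kept (std::max(2, L) also enforces a
// minimum of two components).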

template<typename T>
int PCA<T>::train(const std::string& model)
{
CHECK(retained_variance > 0. || max_components > 0);
int count = std::min(features_length, samples_num), out_count = count;
if (max_components > 0) out_count = std::min(count, max_components);
covar_flags = 0;
if (features_length <= samples_num) covar_flags = 1;

std::vector<std::vector<T>> covar(count); // covariance matrix
calculate_covariance_matrix(covar, true);
eigen(covar, true);

std::vector<std::vector<T>> tmp_data(samples_num), evects1(count);
for (int i = 0; i < samples_num; ++i) {
tmp_data[i].resize(features_length);
evects1[i].resize(features_length);

for (int j = 0; j < features_length; ++j) {
tmp_data[i][j] = data[i][j] - mean[j];
}
}

gemm(eigen_vectors, tmp_data, 1., std::vector<std::vector<T>>(), 0., evects1, 0);

eigen_vectors.resize(evects1.size());
for (int i = 0; i < eigen_vectors.size(); ++i) {
eigen_vectors[i].resize(evects1[i].size());
memcpy(eigen_vectors[i].data(), evects1[i].data(), sizeof(T)* evects1[i].size());
}

// normalize all eigenvectors
if (retained_variance > 0) {
for (int i = 0; i < eigen_vectors.size(); ++i) {
normalize(eigen_vectors[i].data(), eigen_vectors[i].size());
}

// compute the cumulative energy content for each eigenvector
int L = computeCumulativeEnergy();
eigen_values.resize(L);
eigen_vectors.resize(L);
} else {
for (int i = 0; i < out_count; ++i) {
normalize(eigen_vectors[i].data(), eigen_vectors[i].size());
}

if (count > out_count) {
eigen_values.resize(out_count);
eigen_vectors.resize(out_count);
}
}

save_model(model);

return 0;
}

template<typename T>
int PCA<T>::subtract(const std::vector<T>& vec1, const std::vector<T>& vec2, std::vector<T>& result) const
{
CHECK(vec1.size() == vec2.size() && vec1.size() == result.size());

for (int i = 0; i < vec1.size(); ++i) {
result[i] = vec1[i] - vec2[i];
}

return 0;
}

template<typename T>
int PCA<T>::project(const std::vector<T>& vec, std::vector<T>& result) const
{
CHECK(!mean.empty() && !eigen_vectors.empty() && mean.size() == vec.size());

std::vector<T> tmp_data(mean.size());
subtract(vec, mean, tmp_data);

gemm(tmp_data, eigen_vectors, 1, std::vector<T>(), 0, result, 1);

return 0;
}

template<typename T>
int PCA<T>::back_project(const std::vector<T>& vec, std::vector<T>& result) const
{
CHECK(!mean.empty() && !eigen_vectors.empty() && eigen_vectors.size() == vec.size());
gemm(vec, eigen_vectors, 1, mean, 1, result, 0);

return 0;
}
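
// Note: with the (normalized) eigenvectors stored as rows e_0..e_{L-1} of eigen_vectors,
// project() computes y_i = e_i . (x - mean) via gemm(..., flags = 1 /* GEMM_2_T */), and
// back_project() reconstructs x_hat = sum_i y_i * e_i + mean via gemm(..., flags = 0),
// i.e. back_project(project(x)) is the L-component PCA approximation of x.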

template<typename T>
int PCA<T>::load_model(const std::string& model)
{
std::ifstream file(model.c_str(), std::ios::in | std::ios::binary);
if (!file.is_open()) {
fprintf(stderr, "open file fail: %s\n", model.c_str());
return -1;
}

int width = 0, height = 0;
file.read((char*)&width, sizeof(width) * 1);
file.read((char*)&height, sizeof(height) * 1);
std::unique_ptr<T[]> data(new T[width * height]);
file.read((char*)data.get(), sizeof(T)* width * height);
eigen_vectors.resize(height);
for (int i = 0; i < height; ++i) {
eigen_vectors[i].resize(width);
T* p = data.get() + i * width;
memcpy(eigen_vectors[i].data(), p, sizeof(T)* width);
}

file.read((char*)&width, sizeof(width));
file.read((char*)&height, sizeof(height));
CHECK(height == 1);
eigen_values.resize(width);
file.read((char*)eigen_values.data(), sizeof(T)* width * height);

file.read((char*)&width, sizeof(width));
file.read((char*)&height, sizeof(height));
CHECK(height == 1);
mean.resize(width);
file.read((char*)mean.data(), sizeof(T)* width * height);

file.close();

return 0;
}

template<typename T>
int PCA<T>::save_model(const std::string& model) const
{
std::ofstream file(model.c_str(), std::ios::out | std::ios::binary);
if (!file.is_open()) {
fprintf(stderr, "open file fail: %s\n", model.c_str());
return -1;
}

int width = eigen_vectors[0].size(), height = eigen_vectors.size();
std::unique_ptr<T[]> data(new T[width * height]);
for (int i = 0; i < height; ++i) {
T* p = data.get() + i * width;
memcpy(p, eigen_vectors[i].data(), sizeof(T) * width);
}
file.write((char*)&width, sizeof(width));
file.write((char*)&height, sizeof(height));
file.write((char*)data.get(), sizeof(T)* width * height);

width = eigen_values.size(), height = 1;
file.write((char*)&width, sizeof(width));
file.write((char*)&height, sizeof(height));
file.write((char*)eigen_values.data(), sizeof(T)* width * height);

width = mean.size(), height = 1;
file.write((char*)&width, sizeof(width));
file.write((char*)&height, sizeof(height));
file.write((char*)mean.data(), sizeof(T)* width * height);

file.close();
return 0;
}

template class PCA<float>;
template class PCA<double>;

} // namespace ANN

main.cpp:

#include "funset.hpp"
#include <iostream>
#include "perceptron.hpp"
#include "BP.hpp""
#include "CNN.hpp"
#include "linear_regression.hpp"
#include "naive_bayes_classifier.hpp"
#include "logistic_regression.hpp"
#include "common.hpp"
#include "knn.hpp"
#include "decision_tree.hpp"
#include "pca.hpp"
#include <opencv2/opencv.hpp>
// =============================== PCA(Principal Components Analysis) ===================
namespace {
void normalize(const std::vector<float>& src, std::vector<unsigned char>& dst)
{
dst.resize(src.size());
double dmin = 0, dmax = 255;
double smin = src[0], smax = smin;
for (int i = 1; i < src.size(); ++i) {
if (smin > src[i]) smin = src[i];
if (smax < src[i]) smax = src[i];
}
double scale = (dmax - dmin) * (smax - smin > DBL_EPSILON ? 1. / (smax - smin) : 0);
double shift = dmin - smin * scale;
for (int i = 0; i < src.size(); ++i) {
dst[i] = static_cast<unsigned char>(src[i] * scale + shift);
}
}
} // namespace
int test_pca()
{
const std::string image_path{ "E:/GitCode/NN_Test/data/database/ORL_Faces/" };
const std::string image_name{ "1.pgm" };
std::vector<cv::Mat> images;
for (int i = 1; i <= 15; ++i) {
std::string name = image_path + "s" + std::to_string(i) + "/" + image_name;
cv::Mat mat = cv::imread(name, 0);
if (!mat.data) {
fprintf(stderr, "read image fail: %s\n", name.c_str());
return -1;
}
images.emplace_back(mat);
}
save_images(images, "E:/GitCode/NN_Test/data/pca_src.jpg", 5);
cv::Mat data(images.size(), images[0].rows * images[0].cols, CV_32FC1);
for (int i = 0; i < images.size(); ++i) {
cv::Mat image_row = images[i].clone().reshape(1, 1);
cv::Mat row_i = data.row(i);
image_row.convertTo(row_i, CV_32F);
}
int features_length = images[0].rows * images[0].cols;
std::vector<std::vector<float>> data_(images.size());
std::vector<float> labels(images.size(), 0.f);
for (int i = 0; i < images.size(); ++i) {
data_[i].resize(features_length);
memcpy(data_[i].data(), data.row(i).data, sizeof(float)* features_length);
}

const std::string save_model_file{ "E:/GitCode/NN_Test/data/pca.model" };
ANN::PCA<float> pca;
pca.load_data(data_, labels);
double retained_variance{ 0.95 };
pca.set_retained_variance(retained_variance);
pca.train(save_model_file);
const std::string read_model_file{ save_model_file };
ANN::PCA<float> pca2;
pca2.load_model(read_model_file);
std::vector<cv::Mat> result(images.size());
for (int i = 0; i < images.size(); ++i) {
std::vector<float> point, reconstruction;
pca2.project(data_[i], point);
pca2.back_project(point, reconstruction);
std::vector<unsigned char> dst;
normalize(reconstruction, dst);
cv::Mat tmp(images[i].rows, images[i].cols, CV_8UC1, dst.data());
tmp.copyTo(result[i]);
}
save_images(result, "E:/GitCode/NN_Test/data/pca_result.jpg", 5);
return 0;
}
The execution results are shown below (saved as pca_src.jpg and pca_result.jpg): the top three rows are the original images and the bottom three rows are the images reconstructed by PCA. The output matches the result produced by OpenCV 3.3.

GitHub: https://github.com/fengbingchun/NN_Test