本文首发于个人博客https://kezunlin.me/post/61d55ab4/,欢迎阅读!

opencv mat for loop

Series

Guide

Mat

  • for a gray image, use type <uchar>
  • for an RGB color image, use type <Vec3b>

gray format storage

color format storage: BGR

We can use the isContinuous() method to check whether the memory buffer is continuous or not.

color space reduction

/**
 * Quantize one 8-bit channel value down to the nearest lower multiple of 10.
 *
 *   0-9     ===> 0
 *   10-19   ===> 10
 *   20-29   ===> 20
 *   ...
 *   240-249 ===> 240
 *   250-255 ===> 250
 *
 * Each channel keeps 26 distinct values, so a 3-channel color space shrinks
 * from 256*256*256 to 26*26*26 colors.
 *
 * @param pixel original channel value (0-255).
 * @return      the reduced channel value.
 */
uchar color_space_reduction(uchar pixel)
{
    const int step = 10;
    const int bucket = pixel / step;  // integer division drops the remainder
    return static_cast<uchar>(bucket * step);
}

color table

/**
 * Build the 256-entry lookup table used for the color space reduction.
 *
 * Entry v holds v rounded down to a multiple of 10, so the division is done
 * once per possible value instead of once per pixel. The table is a local
 * array and is discarded when the function returns.
 */
void get_color_table()
{
    const int step = 10;
    uchar lut[256];
    int value = 0;
    while (value < 256)
    {
        lut[value] = step * (value / step);
        ++value;
    }
}

C++

ptr []

/**
 * C ptr []: apply a 256-entry lookup table to every byte of I in place using
 * raw row pointers and operator[]. Fast, but there is no bounds checking.
 *
 * @param I     8-bit unsigned matrix; modified in place.
 * @param table lookup table indexed by the old byte value (256 entries).
 * @return      reference to I.
 */
Mat& ScanImageAndReduce_Cptr(Mat& I, const uchar* const table)
{
    // Accept only 8-bit unsigned matrices. The former check,
    // I.depth() != sizeof(uchar), compared a depth code with a byte size:
    // it let CV_16U, CV_32F, ... through and rejected only CV_8S.
    CV_Assert(I.depth() == CV_8U);

    int nRows = I.rows;
    int nCols = I.cols * I.channels();  // bytes per row

    if (I.isContinuous())
    {
        // Rows are stored back to back: treat the buffer as one long row.
        nCols *= nRows;
        nRows = 1;
    }

    for (int i = 0; i < nRows; ++i)
    {
        uchar* const p = I.ptr<uchar>(i);
        for (int j = 0; j < nCols; ++j)
        {
            p[j] = table[p[j]];
        }
    }
    return I;
}

ptr ++

/**
 * C ptr ++: apply a 256-entry lookup table to every byte of I in place by
 * advancing a raw pointer. Fast, but there is no bounds checking.
 *
 * @param I     8-bit unsigned matrix; modified in place.
 * @param table lookup table indexed by the old byte value (256 entries).
 * @return      reference to I.
 */
Mat& ScanImageAndReduce_Cptr2(Mat& I, const uchar* const table)
{
    // Accept only 8-bit unsigned matrices. The former check,
    // I.depth() != sizeof(uchar), compared a depth code with a byte size.
    CV_Assert(I.depth() == CV_8U);

    int nRows = I.rows;
    int nCols = I.cols * I.channels();  // bytes per row

    if (I.isContinuous())
    {
        // Rows are stored back to back: treat the buffer as one long row.
        nCols *= nRows;
        nRows = 1;
    }

    // Walk each row on its own. Rows of a non-continuous matrix (e.g. a ROI
    // view) are not adjacent in memory, so a single [start, start + nRows *
    // nCols) range would touch bytes that do not belong to the matrix and
    // miss part of the later rows.
    for (int i = 0; i < nRows; ++i)
    {
        uchar* p = I.ptr<uchar>(i);
        const uchar* const rowEnd = p + nCols;
        for (; p < rowEnd; ++p)
        {
            *p = table[*p];
        }
    }
    return I;
}

at(i,j)

/**
 * at<T>(i,j): apply a 256-entry lookup table to I in place using random
 * access to each element. Simple, but slow.
 *
 * Only 1-channel and 3-channel matrices are processed; any other channel
 * count is returned unchanged.
 *
 * @param I     8-bit unsigned matrix; modified in place.
 * @param table lookup table indexed by the old byte value (256 entries).
 * @return      reference to I.
 */
Mat& ScanImageAndReduce_atRandomAccess(Mat& I, const uchar* const table)
{
    // Accept only 8-bit unsigned matrices. The former check,
    // I.depth() != sizeof(uchar), compared a depth code with a byte size.
    CV_Assert(I.depth() == CV_8U);

    switch (I.channels())
    {
    case 1:
        for (int i = 0; i < I.rows; ++i)
        {
            for (int j = 0; j < I.cols; ++j)
            {
                uchar& px = I.at<uchar>(i, j);
                px = table[px];
            }
        }
        break;
    case 3:
        // Access the pixels directly instead of going through a Mat_<Vec3b>
        // alias named _I (a reserved identifier) and assigning it back.
        for (int i = 0; i < I.rows; ++i)
        {
            for (int j = 0; j < I.cols; ++j)
            {
                Vec3b& px = I.at<Vec3b>(i, j);
                px[0] = table[px[0]];
                px[1] = table[px[1]];
                px[2] = table[px[2]];
            }
        }
        break;
    default:
        // Other channel counts are left untouched, as before.
        break;
    }
    return I;
}

Iterator

/**
 * MatIterator_<T>: apply a 256-entry lookup table to I in place using
 * matrix iterators. Safe, but slow.
 *
 * Only 1-channel and 3-channel matrices are processed; any other channel
 * count is returned unchanged.
 *
 * @param I     8-bit unsigned matrix; modified in place.
 * @param table lookup table indexed by the old byte value (256 entries).
 * @return      reference to I.
 */
Mat& ScanImageAndReduce_Iterator(Mat& I, const uchar* const table)
{
    // Accept only 8-bit unsigned matrices. The former check,
    // I.depth() != sizeof(uchar), compared a depth code with a byte size.
    CV_Assert(I.depth() == CV_8U);

    switch (I.channels())
    {
    case 1:
    {
        MatIterator_<uchar> it = I.begin<uchar>();
        const MatIterator_<uchar> end = I.end<uchar>();
        for (; it != end; ++it)
        {
            *it = table[*it];
        }
        break;
    }
    case 3:
    {
        MatIterator_<Vec3b> it = I.begin<Vec3b>();
        const MatIterator_<Vec3b> end = I.end<Vec3b>();
        for (; it != end; ++it)
        {
            (*it)[0] = table[(*it)[0]];
            (*it)[1] = table[(*it)[1]];
            (*it)[2] = table[(*it)[2]];
        }
        break;  // was missing; harmless as the last case, but fragile
    }
    default:
        // Other channel counts are left untouched, as before.
        break;
    }
    return I;
}

opencv LUT

/**
 * LUT: apply a 256-entry lookup table to I in place through cv::LUT.
 *
 * @param I     matrix to transform; used as both source and destination.
 * @param table lookup table indexed by the old byte value (256 entries).
 * @return      reference to I.
 */
Mat& ScanImageAndReduce_LUT(Mat& I, const uchar* const table)
{
    // Copy the raw table into a 1x256 single-channel matrix for cv::LUT.
    Mat lookUpTable(1, 256, CV_8U);
    uchar* const dst = lookUpTable.data;
    int idx = 0;
    while (idx < 256)
    {
        dst[idx] = table[idx];
        ++idx;
    }

    cv::LUT(I, lookUpTable, I);
    return I;
}

forEach

The forEach method of the Mat class utilizes all the cores on your machine to apply a function to every pixel.

// Parallel execution with function object.
// The functor keeps its own copy of the 256-entry lookup table, so it does
// not depend on the caller's buffer after construction.
struct ForEachOperator
{
    uchar m_table[256];

    // Copy the first 256 entries of `table` into m_table.
    ForEachOperator(const uchar* const table)
    {
        uchar* dst = m_table;
        for (const uchar* src = table; src != table + 256; ++src, ++dst)
        {
            *dst = *src;
        }
    }

    // Replace one element with its table entry; `position` is not used.
    void operator ()(uchar& p, const int * position) const
    {
        // Perform a simple operation
        p = m_table[p];
    }
};
// forEach use multiple processors, very fast
/**
 * forEach: apply a 256-entry lookup table to I in place with Mat::forEach
 * and a ForEachOperator function object.
 *
 * @param I     8-bit unsigned matrix; modified in place.
 * @param table lookup table indexed by the old byte value (256 entries).
 * @return      reference to I.
 */
Mat& ScanImageAndReduce_forEach(Mat& I, const uchar* const table)
{
    // Same input contract as the other ScanImageAndReduce_* variants.
    CV_Assert(I.depth() == CV_8U);

    // forEach<uchar> treats every matrix element as a single uchar, which
    // only matches single-channel data. View the matrix as one channel
    // (the header shares I's pixel buffer, nothing is copied) so that
    // multi-channel images are fully processed as well.
    Mat flat = I.reshape(1);
    flat.forEach<uchar>(ForEachOperator(table));
    return I;
}

forEach with lambda

/**
 * forEach with lambda: apply a 256-entry lookup table to I in place with
 * Mat::forEach and a lambda. The article measures this variant as slower
 * than the ForEachOperator version.
 *
 * @param I     8-bit unsigned matrix; modified in place.
 * @param table lookup table indexed by the old byte value (256 entries);
 *              must stay valid for the duration of the call.
 * @return      reference to I.
 */
Mat& ScanImageAndReduce_forEach_with_lambda(Mat& I, const uchar* const table)
{
    // Same input contract as the other ScanImageAndReduce_* variants.
    CV_Assert(I.depth() == CV_8U);

    // forEach<uchar> treats every matrix element as a single uchar, which
    // only matches single-channel data. View the matrix as one channel
    // (the header shares I's pixel buffer, nothing is copied) so that
    // multi-channel images are fully processed as well.
    Mat flat = I.reshape(1);
    flat.forEach<uchar>
    (
        [table](uchar &p, const int * position) -> void
        {
            p = table[p];
        }
    );
    return I;
}

time cost

no foreach

[1 Cptr   ] times=5000, total_cost=988 ms, avg_cost=0.1976 ms
[1 Cptr2 ] times=5000, total_cost=1704 ms, avg_cost=0.3408 ms
[2 atRandom] times=5000, total_cost=9611 ms, avg_cost=1.9222 ms
[3 Iterator] times=5000, total_cost=20195 ms, avg_cost=4.039 ms
[4 LUT ] times=5000, total_cost=899 ms, avg_cost=0.1798 ms

[1 Cptr ] times=10000, total_cost=2425 ms, avg_cost=0.2425 ms
[1 Cptr2 ] times=10000, total_cost=3391 ms, avg_cost=0.3391 ms
[2 atRandom] times=10000, total_cost=20024 ms, avg_cost=2.0024 ms
[3 Iterator] times=10000, total_cost=39980 ms, avg_cost=3.998 ms
[4 LUT ] times=10000, total_cost=103 ms, avg_cost=0.0103 ms

foreach

[5 forEach     ] times=200000, total_cost=199 ms, avg_cost=0.000995 ms
[5 forEach lambda] times=200000, total_cost=521 ms, avg_cost=0.002605 ms

[5 forEach ] times=20000, total_cost=17 ms, avg_cost=0.00085 ms
[5 forEach lambda] times=20000, total_cost=23 ms, avg_cost=0.00115 ms

results

| Loop Type | Time Cost (us) |
| :----: | :----: |
| ptr [] | 242 |
| ptr ++ | 339 |
| at | 2002 |
| iterator | 3998 |
| LUT | 10 |
| forEach | 0.85 |
| forEach lambda | 1.15 |

forEach is 10x faster than LUT, 240~340x faster than ptr [] and ptr ++, and 2000~4000x faster than at and iterator.

code

code here

Python

pure python

# import the necessary packages
import matplotlib.pyplot as plt
import cv2
print(cv2.__version__)
%matplotlib inline
3.4.2
# load the original image, convert it to grayscale, and display
# it inline
image = cv2.imread("cat.jpg")
image = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
print(image.shape)
#plt.imshow(image, cmap="gray")
(360, 480)
%load_ext cython
The cython extension is already loaded. To reload it, use:
%reload_ext cython
%%cython -a

def threshold_python(T, image):
    """Threshold a 2-D image in place, one pixel at a time.

    Pixels whose value is >= T become 255; all other pixels become 0.

    Args:
        T: threshold compared against each pixel value.
        image: 2-D array-like exposing ``shape`` and ``image[y, x]``
            indexing; it is modified in place.

    Returns:
        The same ``image`` object, thresholded.
    """
    # grab the image dimensions
    h = image.shape[0]
    w = image.shape[1]

    # loop over the image, pixel by pixel
    for y in range(0, h):
        for x in range(0, w):
            # threshold the pixel
            image[y, x] = 255 if image[y, x] >= T else 0

    # return the thresholded image
    return image
%timeit threshold_python(5, image)
263 ms ± 20.2 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)

cython

%%cython -a

import cython

@cython.boundscheck(False)
cpdef unsigned char[:, :] threshold_cython(int T, unsigned char [:, :] image):
    """Threshold a 2-D unsigned-char memoryview in place.

    Pixels whose value is >= T become 255; all other pixels become 0.
    Bounds checking is turned off by the decorator, so indices are not
    validated inside the loops.

    Args:
        T: threshold compared against each pixel value.
        image: 2-D ``unsigned char`` memoryview; modified in place.

    Returns:
        The same memoryview, thresholded.
    """
    # set the variable extension types
    cdef int x, y, w, h

    # grab the image dimensions
    h = image.shape[0]
    w = image.shape[1]

    # loop over the image
    for y in range(0, h):
        for x in range(0, w):
            # threshold the pixel
            image[y, x] = 255 if image[y, x] >= T else 0

    # return the thresholded image
    return image

numba

%timeit threshold_cython(5, image)
150 µs ± 7.14 µs per loop (mean ± std. dev. of 7 runs, 10000 loops each)
from numba import njit

@njit
def threshold_njit(T, image):
    """Threshold a 2-D image in place; compiled by numba's ``njit``.

    Pixels whose value is >= T become 255; all other pixels become 0.

    Args:
        T: threshold compared against each pixel value.
        image: 2-D array exposing ``shape`` and ``image[y, x]`` indexing;
            it is modified in place.

    Returns:
        The same ``image`` object, thresholded.
    """
    # grab the image dimensions
    h = image.shape[0]
    w = image.shape[1]

    # loop over the image, pixel by pixel
    for y in range(0, h):
        for x in range(0, w):
            # threshold the pixel
            image[y, x] = 255 if image[y, x] >= T else 0

    # return the thresholded image
    return image
%timeit threshold_njit(5, image)
43.5 µs ± 142 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each)

numpy

def threshold_numpy(T, image):
    """Threshold an image in place using boolean-mask indexing.

    Pixels whose value is >= T become 255; all other pixels become 0, which
    matches the loop-based threshold_* implementations. (The earlier version
    compared with ``> T`` and never zeroed the pixels below the threshold.)

    Args:
        T: threshold compared against each pixel value.
        image: array supporting element-wise comparison and boolean-mask
            assignment (e.g. a numpy array); modified in place.

    Returns:
        The same ``image`` object, thresholded.
    """
    # Compute the mask once, before any pixel is overwritten.
    at_or_above = image >= T
    image[at_or_above] = 255
    image[~at_or_above] = 0
    return image
%timeit threshold_numpy(5, image)
111 µs ± 334 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each)

conclusions

image = cv2.imread("cat.jpg")
image = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
print(image.shape)
%timeit threshold_python(5, image)
%timeit threshold_cython(5, image)
%timeit threshold_njit(5, image)
%timeit threshold_numpy(5, image)
(360, 480)
251 ms ± 6.5 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
143 µs ± 1.19 µs per loop (mean ± std. dev. of 7 runs, 10000 loops each)
43.8 µs ± 284 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each)
113 µs ± 957 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each)
image = cv2.imread("big.jpg")
image = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
print(image.shape)
%timeit threshold_python(5, image)
%timeit threshold_cython(5, image)
%timeit threshold_njit(5, image)
%timeit threshold_numpy(5, image)
(2880, 5120)
21.8 s ± 460 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
12.3 ms ± 231 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
3.91 ms ± 66.1 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
10.3 ms ± 179 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)

360, 480

  • python: 251 ms
  • cython: 143 us
  • numba: 43 us
  • numpy: 113 us

2880, 5120

  • python: 21 s
  • cython: 12 ms
  • numba: 4 ms
  • numpy: 10 ms

Reference

History

  • 20180823: created.

Copyright

最新文章

  1. 5分钟上手写ECharts的第一个图表
  2. ios开发之UIView的frame、bounds跟center属性的区别(附图)
  3. 【Java】整理关于java的String类,equals函数和比较操作符的区别
  4. SVN使用Tips
  5. Jquery--仿制360右下角弹出窗口
  6. CCS Debug Assertion Failed
  7. java设计模式-----14、桥接模式
  8. 如何执行Python代码
  9. Web项目中得到访问者的真实ip
  10. OpenStack-Ocata版+CentOS7.6 云平台环境搭建 — 6.在计算节点上安装并配置计算服务Nova
  11. JQuery 实现 倒计时 按钮具体方法
  12. SQLServer SELECT @@IDENTITY 遇到的坑
  13. mac 下安装pip
  14. @JVM中的几种垃圾收集算法
  15. SUBSTRING_INDEX()
  16. 公共cdn的js和css库
  17. 第145天:jQuery.touchSlider触屏满屏左右滚动幻灯片
  18. visual studio xcopy /exclude测试
  19. access_ok | 检查用户空间内存块是否可用
  20. 用PHP编写登陆界面

热门文章

  1. 刷14道leetcode的总结
  2. HashMap - 类注释
  3. 基于TORCS和Torch7实现端到端连续动作自动驾驶深度强化学习模型(A3C)的训练
  4. 【IDEA】IDEA自动生成文档注释的设置方法
  5. Nexus 上传项目到私服
  6. NodeJs 实现 WebSocket 即时通讯(版本一)
  7. Vue入坑第一篇
  8. null与undefined的区别?
  9. Vs使用EF来操作MySql(经验 )
  10. [2018-01-08] Python强化周的第一天