c++ 中的高维数组(2)
数值计算中,常常需要操作高维数组,C++ 很容易写出来占用内存多,而且速度慢的程序。
#include <iostream>
using namespace std;
class Video {
public:
Video(size_t num_of_channels, size_t width, size_t height,
size_t num_of_colors)
: num_of_channels_(num_of_channels),
width_(width),
height_(height),
num_of_colors_(num_of_colors) {
buf_ = new char***[num_of_channels];
for (size_t c = 0; c < num_of_channels; ++c) {
buf_[c] = new char**[width];
for (size_t w = 0; w < width; ++w) {
buf_[c][w] = new char*[height];
for (size_t h = 0; h < height; ++h) {
buf_[c][w][h] = new char[num_of_colors];
for (size_t r = 0; r < num_of_colors; ++r) {
buf_[c][w][h][r] = 0.0;
}
}
}
}
}
~Video() {
for (size_t c = 0; c < num_of_channels_; ++c) {
for (size_t w = 0; w < width_; ++w) {
for (size_t h = 0; h < height_; ++h) {
delete[] buf_[c][w][h];
}
delete[] buf_[c][w];
}
delete[] buf_[c];
}
delete[] buf_;
}
private:
const size_t num_of_channels_;
const size_t width_;
const size_t height_;
const size_t num_of_colors_;
char **** buf_;
};
int main(int argc, char *argv[])
{
Video v(10,1920,1080,3);
return 0;
}
我碰到过一些真实的,代码量很大的项目,至少三次看到上面这种模式。这种方法占用内存多,速度慢。
可以用一维数组模拟高维数组。
#include <iostream>
using namespace std;
class Video {
public:
Video(size_t num_of_channels, size_t width, size_t height,
size_t num_of_colors)
: num_of_channels_(num_of_channels),
width_(width),
height_(height),
num_of_colors_(num_of_colors) {
buf_ = new char[num_of_channels * width * height * num_of_colors];
for (size_t c = 0; c < num_of_channels; ++c) {
for (size_t w = 0; w < width; ++w) {
for (size_t h = 0; h < height; ++h) {
for (size_t r = 0; r < num_of_colors; ++r) {
buf_[c*width*height*num_of_colors + w*height*num_of_colors + h*num_of_colors + r] = 0.0;
}
}
}
}
}
~Video() {
delete[] buf_;
}
private:
const size_t num_of_channels_;
const size_t width_;
const size_t height_;
const size_t num_of_colors_;
char * buf_;
};
int main(int argc, char *argv[])
{
Video v(10,1920,1080,3);
return 0;
}
我们看看速度
% c++ -O3 -std=c++11 high_dim_array_1.cpp
% ./a.out
real 0m2.873s
user 0m2.637s
sys 0m0.226s
+ c++ -O3 -std=c++11 high_dim_array_2.cpp
+ ./a.out
real 0m0.004s
user 0m0.001s
sys 0m0.001s
和上一篇文章里面不同,经过优化开关的代码,二者差距更大。
我们分析一下内存使用。对于数据来说,二者是一样的。但是第一种方案,额外存储了很多内存指针。
new char***[num_of_channels];
需要 10 个指针。- 同样道理,宽度层,需要 10 * 1920 个指针
- 高度层,需要 10*1920*1080 个指针
- 最后一层,没有额外的指针空间,因为这些就是原始数据了。
可以看到,每个指针 8 个字节(64位机器),我们额外需要大约 160M+ 的内存,存储间接指针。
buf_[c][w][h][r] = 0.0;
buf_[c*width*height*num_of_colors + w*height*num_of_colors + h*num_of_colors + r] = 0.0;
这两个相比,前者看起来更加干净。但是,我们很容易做做操作符重载
char& operator()(size_t c, size_t w, size_t h, size_t r){
return buf_[c*width*height*num_of_colors + w*height*num_of_colors + h*num_of_colors + r];
}
这样我们就可以使用下面的语法。
(*this)(c,w,h,r) = 0.0;
这里淡淡的吐槽一下 c++ ,(*this)[c,w,h,r] = 0.0;
也许更好,可惜 c++
的对方括号的操作符重载,只允许一个参数。
如果非要用第一种语法形式 (*this)[c][w][h][r]
的话,(*this)[c]
返回一个对象,依次产生三个临时对象。
我们比较一下访问速度
#include <time.h>
#include <iostream>
using namespace std;
class Video {
public:
Video(size_t num_of_channels, size_t width, size_t height,
size_t num_of_colors)
: num_of_channels_(num_of_channels),
width_(width),
height_(height),
num_of_colors_(num_of_colors) {
buf_ = new char***[num_of_channels];
for (size_t c = 0; c < num_of_channels; ++c) {
buf_[c] = new char**[width];
for (size_t w = 0; w < width; ++w) {
buf_[c][w] = new char*[height];
for (size_t h = 0; h < height; ++h) {
buf_[c][w][h] = new char[num_of_colors];
for (size_t r = 0; r < num_of_colors; ++r) {
buf_[c][w][h][r] = r;
}
}
}
}
}
~Video() {
for (size_t c = 0; c < num_of_channels_; ++c) {
for (size_t w = 0; w < width_; ++w) {
for (size_t h = 0; h < height_; ++h) {
delete[] buf_[c][w][h];
}
delete[] buf_[c][w];
}
delete[] buf_[c];
}
delete[] buf_;
}
float SumOfFirstColumn();
float SumOfFirstRows();
private:
const size_t num_of_channels_;
const size_t width_;
const size_t height_;
const size_t num_of_colors_;
char **** buf_;
};
float Video::SumOfFirstColumn() {
float ret = 0.0f;
for (size_t c = 0; c < num_of_channels_; ++c) {
for (size_t w = 0; w < 1; ++w) {
for (size_t h = 0; h < height_; ++h) {
for (size_t r = 0; r < num_of_colors_; ++r) {
ret += buf_[c][w][h][r];
}
}
}
}
return ret;
}
float Video::SumOfFirstRows() {
float ret = 0.0f;
for (size_t c = 0; c < num_of_channels_; ++c) {
for (size_t w = 0; w < width_; ++w) {
for (size_t h = 0; h < 1; ++h) {
for (size_t r = 0; r < num_of_colors_; ++r) {
ret += buf_[c][w][h][r];
}
}
}
}
return ret;
}
int time_diff(const struct timespec& ts_end, const struct timespec& ts_start) {
return (ts_start.tv_sec - ts_end.tv_sec) * 1000 -
(ts_start.tv_nsec - ts_end.tv_nsec) / 1000;
}
int main(int argc, char *argv[])
{
Video v(10,1920,1080,3);
struct timespec ts_start;
struct timespec ts_end;
clock_gettime(CLOCK_MONOTONIC, &ts_start);
float r1 = v.SumOfFirstColumn();
clock_gettime(CLOCK_MONOTONIC, &ts_end);
cout << __FILE__ << ":" << __LINE__ << ": [" << __FUNCTION__ << "] "
<< "r1 " << r1 << " "
<< " it takes " << time_diff(ts_end, ts_start) << " ms"
<< endl;
clock_gettime(CLOCK_MONOTONIC, &ts_start);
float r2 = v.SumOfFirstRows();
clock_gettime(CLOCK_MONOTONIC, &ts_end);
cout << __FILE__ << ":" << __LINE__ << ": [" << __FUNCTION__ << "] "
<< "r2 " << r2 << " "
<< " it takes " << time_diff(ts_end, ts_start) << " ms"
<< endl;
return 0;
}
#include <time.h>
#include <iostream>
using namespace std;
class Video {
public:
Video(size_t num_of_channels, size_t width, size_t height,
size_t num_of_colors)
: num_of_channels_(num_of_channels),
width_(width),
height_(height),
num_of_colors_(num_of_colors) {
buf_ = new char[num_of_channels * width * height * num_of_colors];
for (size_t c = 0; c < num_of_channels; ++c) {
for (size_t w = 0; w < width; ++w) {
for (size_t h = 0; h < height; ++h) {
for (size_t r = 0; r < num_of_colors; ++r) {
buf_[c*width*height*num_of_colors + w*height*num_of_colors + h*num_of_colors + r] = r;
}
}
}
}
}
~Video() {
delete[] buf_;
}
float SumOfFirstColumn();
float SumOfFirstRow();
private:
const size_t num_of_channels_;
const size_t width_;
const size_t height_;
const size_t num_of_colors_;
char * buf_;
};
float Video::SumOfFirstColumn() {
float ret = 0.0f;
for (size_t c = 0; c < num_of_channels_; ++c) {
for (size_t w = 0; w < 1; ++w) {
for (size_t h = 0; h < height_; ++h) {
for (size_t r = 0; r < num_of_colors_; ++r) {
ret += buf_[c * width_ * height_ * num_of_colors_ + w * height_ * num_of_colors_ + h * num_of_colors_ + r];
}
}
}
}
return ret;
}
float Video::SumOfFirstRow() {
float ret = 0.0f;
for (size_t c = 0; c < num_of_channels_; ++c) {
for (size_t w = 0; w < width_; ++w) {
for (size_t h = 0; h < 1; ++h) {
for (size_t r = 0; r < num_of_colors_; ++r) {
ret += buf_[c * width_ * height_ * num_of_colors_ + w * height_ * num_of_colors_ + h * num_of_colors_ + r];
}
}
}
}
return ret;
}
int time_diff(const struct timespec& ts_end, const struct timespec& ts_start) {
return (ts_start.tv_sec - ts_end.tv_sec) * 1000 -
(ts_start.tv_nsec - ts_end.tv_nsec) / 1000;
}
int main(int argc, char *argv[])
{
Video v(10,1920,1080,3);
struct timespec ts_start;
struct timespec ts_end;
clock_gettime(CLOCK_MONOTONIC, &ts_start);
float r1 = v.SumOfFirstColumn();
clock_gettime(CLOCK_MONOTONIC, &ts_end);
cout << __FILE__ << ":" << __LINE__ << ": [" << __FUNCTION__ << "] "
<< "r1 " << r1 << " "
<< " it takes " << time_diff(ts_end, ts_start) << " ms"
<< endl;
clock_gettime(CLOCK_MONOTONIC, &ts_start);
float r2 = v.SumOfFirstRow();
clock_gettime(CLOCK_MONOTONIC, &ts_end);
cout << __FILE__ << ":" << __LINE__ << ": [" << __FUNCTION__ << "] "
<< "r2 " << r2 << " "
<< " it takes " << time_diff(ts_end, ts_start) << " ms"
<< endl;
return 0;
}
% c++ -O3 -std=c++11 high_dim_array_1.cpp
% ./a.out
real 0m2.873s
user 0m2.637s
sys 0m0.226s
% c++ -O3 -std=c++11 high_dim_array_2.cpp
% ./a.out
real 0m0.004s
user 0m0.001s
sys 0m0.001s
% c++ -O3 -std=c++11 high_dim_array_3.cpp
% ./a.out
high_dim_array_3.cpp:91: [main] r1 32400 it takes 40 ms
high_dim_array_3.cpp:99: [main] r2 57600 it takes 868 ms
% c++ -O3 -std=c++11 high_dim_array_4.cpp
% ./a.out
high_dim_array_4.cpp:78: [main] r1 32400 it takes 34 ms
high_dim_array_4.cpp:86: [main] r2 57600 it takes 419 ms
SumOfFirstColumn
是连续访问内存,因为内存中,是一列一列像素连续存储的。
SumOfFirstRow
是非连续访问内存。
单独访问一个像素点的时候
- 方法1
buf_[c][w][h][r]
, 需要访问四次内存,才能把内存中的像素数据搬运到寄存器中运算。 - 方法2
buf_[c * width_ * height_ * num_of_colors_ + w * height_ * num_of_colors_ + h * num_of_colors_ + r];
,需要做 6 次乘法,3 次加法,一次内存访问,才能完成数据搬移。
从结果上看,数学运算的速度要快于内存访问。当连续内存访问的时候,内存 cache 的命中率比较高,数学运算的速度并不一定快于内存访问。当非连续内存访问的时候,内存访问的 cache 命中率较低,二者的差距就更加明显了。