c++ 中的高维数组(2)

数值计算中,常常需要操作高维数组,C++ 很容易写出来占用内存多,而且速度慢的程序。

#include <iostream>
using namespace std;

class Video {
  public:
   Video(size_t num_of_channels, size_t width, size_t height,
         size_t num_of_colors)
       : num_of_channels_(num_of_channels),
         width_(width),
         height_(height),
         num_of_colors_(num_of_colors) {
       buf_ = new char***[num_of_channels];
       for (size_t c = 0; c < num_of_channels; ++c) {
           buf_[c] = new char**[width];
           for (size_t w = 0; w < width; ++w) {
               buf_[c][w] = new char*[height];
               for (size_t h = 0; h < height; ++h) {
                   buf_[c][w][h] = new char[num_of_colors];
                   for (size_t r = 0; r < num_of_colors; ++r) {
                       buf_[c][w][h][r] = 0.0;
                   }
               }
           }
       }
    }
    ~Video() {
        for (size_t c = 0; c < num_of_channels_; ++c) {
            for (size_t w = 0; w < width_; ++w) {
                for (size_t h = 0; h < height_; ++h) {
                    delete[] buf_[c][w][h];
                }
                delete[] buf_[c][w];
            }
            delete[] buf_[c];
        }
        delete[] buf_;
    }

   private:
    const size_t num_of_channels_;
    const size_t width_;
    const size_t height_;
    const size_t num_of_colors_;
    char **** buf_;
};


int main(int argc, char *argv[])
{
    Video v(10,1920,1080,3);
    return 0;
}

我碰到过一些真实的,代码量很大的项目,至少三次看到上面这种模式。这种方法占用内存多,速度慢。

可以用一维数组模拟高维数组。

#include <iostream>
using namespace std;

class Video {
  public:
   Video(size_t num_of_channels, size_t width, size_t height,
         size_t num_of_colors)
       : num_of_channels_(num_of_channels),
         width_(width),
         height_(height),
         num_of_colors_(num_of_colors) {
       buf_ = new char[num_of_channels * width * height * num_of_colors];
       for (size_t c = 0; c < num_of_channels; ++c) {
           for (size_t w = 0; w < width; ++w) {
               for (size_t h = 0; h < height; ++h) {
                   for (size_t r = 0; r < num_of_colors; ++r) {
                       buf_[c*width*height*num_of_colors + w*height*num_of_colors + h*num_of_colors + r] = 0.0;
                   }
               }
           }
       }
    }
    ~Video() {
        delete[] buf_;
    }

   private:
    const size_t num_of_channels_;
    const size_t width_;
    const size_t height_;
    const size_t num_of_colors_;
    char * buf_;
};


int main(int argc, char *argv[])
{
    Video v(10,1920,1080,3);
    return 0;
}

我们看看速度

% c++ -O3 -std=c++11 high_dim_array_1.cpp
% ./a.out

real	0m2.873s
user	0m2.637s
sys	0m0.226s
+ c++ -O3 -std=c++11 high_dim_array_2.cpp
+ ./a.out

real	0m0.004s
user	0m0.001s
sys	0m0.001s

和上一篇文章里面不同,经过优化开关的代码,二者差距更大。

我们分析一下内存使用。对于数据来说,二者是一样的。但是第一种方案,额外存储了很多内存指针。

  1. new char***[num_of_channels]; 需要 10 个指针。
  2. 同样道理,宽度层,需要 10 * 1920 个指针
  3. 高度层,需要 10*1920*1080 个指针
  4. 最后一层,没有额外的指针空间,因为这些就是原始数据了。

可以看到,每个指针 8 个字节(64位机器),我们额外需要大约 160M+ 的内存,存储间接指针。

buf_[c][w][h][r] = 0.0;
buf_[c*width*height*num_of_colors + w*height*num_of_colors + h*num_of_colors + r] = 0.0;

这两个相比,前者看起来更加干净。但是,我们很容易做做操作符重载

char& operator()(size_t c, size_t w, size_t h, size_t r){
   return buf_[c*width*height*num_of_colors + w*height*num_of_colors + h*num_of_colors + r];
}

这样我们就可以使用下面的语法。

 (*this)(c,w,h,r) = 0.0;

这里淡淡的吐槽一下 c++ ,(*this)[c,w,h,r] = 0.0; 也许更好,可惜 c++ 的对方括号的操作符重载,只允许一个参数。

如果非要用第一种语法形式 (*this)[c][w][h][r] 的话,(*this)[c] 返回一个对象,依次产生三个临时对象。

我们比较一下访问速度

#include <time.h>

#include <iostream>
using namespace std;

class Video {
  public:
   Video(size_t num_of_channels, size_t width, size_t height,
         size_t num_of_colors)
       : num_of_channels_(num_of_channels),
         width_(width),
         height_(height),
         num_of_colors_(num_of_colors) {
       buf_ = new char***[num_of_channels];
       for (size_t c = 0; c < num_of_channels; ++c) {
           buf_[c] = new char**[width];
           for (size_t w = 0; w < width; ++w) {
               buf_[c][w] = new char*[height];
               for (size_t h = 0; h < height; ++h) {
                   buf_[c][w][h] = new char[num_of_colors];
                   for (size_t r = 0; r < num_of_colors; ++r) {
                       buf_[c][w][h][r] = r;
                   }
               }
           }
       }
    }
    ~Video() {
        for (size_t c = 0; c < num_of_channels_; ++c) {
            for (size_t w = 0; w < width_; ++w) {
                for (size_t h = 0; h < height_; ++h) {
                    delete[] buf_[c][w][h];
                }
                delete[] buf_[c][w];
            }
            delete[] buf_[c];
        }
        delete[] buf_;
    }

    float SumOfFirstColumn();
    float SumOfFirstRows();
   private:
    const size_t num_of_channels_;
    const size_t width_;
    const size_t height_;
    const size_t num_of_colors_;
    char **** buf_;
};

float Video::SumOfFirstColumn() {
    float ret = 0.0f;
    for (size_t c = 0; c < num_of_channels_; ++c) {
        for (size_t w = 0; w < 1; ++w) {
            for (size_t h = 0; h < height_; ++h) {
                for (size_t r = 0; r < num_of_colors_; ++r) {
                    ret += buf_[c][w][h][r];
                }
            }
        }
    }
    return ret;
}
float Video::SumOfFirstRows() {
    float ret = 0.0f;
    for (size_t c = 0; c < num_of_channels_; ++c) {
        for (size_t w = 0; w < width_; ++w) {
            for (size_t h = 0; h < 1; ++h) {
                for (size_t r = 0; r < num_of_colors_; ++r) {
                    ret += buf_[c][w][h][r];
                }
            }
        }
    }
    return ret;
}

int time_diff(const struct timespec& ts_end, const struct timespec& ts_start) {
    return (ts_start.tv_sec - ts_end.tv_sec) * 1000 -
           (ts_start.tv_nsec - ts_end.tv_nsec) / 1000;
}
int main(int argc, char *argv[])
{
    Video v(10,1920,1080,3);
    struct timespec ts_start;
    struct timespec ts_end;

    clock_gettime(CLOCK_MONOTONIC, &ts_start);
    float r1 = v.SumOfFirstColumn();
    clock_gettime(CLOCK_MONOTONIC, &ts_end);
    cout << __FILE__ << ":" << __LINE__ << ": [" << __FUNCTION__ << "] "
         << "r1 " << r1 << " "
         << " it takes " << time_diff(ts_end, ts_start) << " ms"
         << endl;

    clock_gettime(CLOCK_MONOTONIC, &ts_start);
    float r2 = v.SumOfFirstRows();
    clock_gettime(CLOCK_MONOTONIC, &ts_end);
    cout << __FILE__ << ":" << __LINE__ << ": [" << __FUNCTION__ << "] "
         << "r2 " << r2 << " "
         << " it takes " << time_diff(ts_end, ts_start) << " ms"
         << endl;

    return 0;
}
#include <time.h>

#include <iostream>
using namespace std;

class Video {
  public:
   Video(size_t num_of_channels, size_t width, size_t height,
         size_t num_of_colors)
       : num_of_channels_(num_of_channels),
         width_(width),
         height_(height),
         num_of_colors_(num_of_colors) {
       buf_ = new char[num_of_channels * width * height * num_of_colors];
       for (size_t c = 0; c < num_of_channels; ++c) {
           for (size_t w = 0; w < width; ++w) {
               for (size_t h = 0; h < height; ++h) {
                   for (size_t r = 0; r < num_of_colors; ++r) {
                       buf_[c*width*height*num_of_colors + w*height*num_of_colors + h*num_of_colors + r] = r;
                   }
               }
           }
       }
    }
    ~Video() {
        delete[] buf_;
    }
    float SumOfFirstColumn();
    float SumOfFirstRow();
   private:
    const size_t num_of_channels_;
    const size_t width_;
    const size_t height_;
    const size_t num_of_colors_;
    char * buf_;
};

float Video::SumOfFirstColumn() {
    float ret = 0.0f;
    for (size_t c = 0; c < num_of_channels_; ++c) {
        for (size_t w = 0; w < 1; ++w) {
            for (size_t h = 0; h < height_; ++h) {
                for (size_t r = 0; r < num_of_colors_; ++r) {
                    ret += buf_[c * width_ * height_ * num_of_colors_ + w * height_ * num_of_colors_ + h * num_of_colors_ + r];
                }
            }
        }
    }
    return ret;
}
float Video::SumOfFirstRow() {
    float ret = 0.0f;
    for (size_t c = 0; c < num_of_channels_; ++c) {
        for (size_t w = 0; w < width_; ++w) {
            for (size_t h = 0; h < 1; ++h) {
                for (size_t r = 0; r < num_of_colors_; ++r) {
                    ret += buf_[c * width_ * height_ * num_of_colors_ + w * height_ * num_of_colors_ + h * num_of_colors_ + r];
                }
            }
        }
    }
    return ret;
}

int time_diff(const struct timespec& ts_end, const struct timespec& ts_start) {
    return (ts_start.tv_sec - ts_end.tv_sec) * 1000 -
           (ts_start.tv_nsec - ts_end.tv_nsec) / 1000;
}
int main(int argc, char *argv[])
{
    Video v(10,1920,1080,3);
    struct timespec ts_start;
    struct timespec ts_end;

    clock_gettime(CLOCK_MONOTONIC, &ts_start);
    float r1 = v.SumOfFirstColumn();
    clock_gettime(CLOCK_MONOTONIC, &ts_end);
    cout << __FILE__ << ":" << __LINE__ << ": [" << __FUNCTION__ << "] "
         << "r1 " << r1 << " "
         << " it takes " << time_diff(ts_end, ts_start) << " ms"
         << endl;

    clock_gettime(CLOCK_MONOTONIC, &ts_start);
    float r2 = v.SumOfFirstRow();
    clock_gettime(CLOCK_MONOTONIC, &ts_end);
    cout << __FILE__ << ":" << __LINE__ << ": [" << __FUNCTION__ << "] "
         << "r2 " << r2 << " "
         << " it takes " << time_diff(ts_end, ts_start) << " ms"
         << endl;
return 0;
}
% c++ -O3 -std=c++11 high_dim_array_1.cpp
% ./a.out

real    0m2.873s
user    0m2.637s
sys 0m0.226s
% c++ -O3 -std=c++11 high_dim_array_2.cpp
% ./a.out

real    0m0.004s
user    0m0.001s
sys 0m0.001s
% c++ -O3 -std=c++11 high_dim_array_3.cpp
% ./a.out
high_dim_array_3.cpp:91: [main] r1 32400  it takes 40 ms
high_dim_array_3.cpp:99: [main] r2 57600  it takes 868 ms
% c++ -O3 -std=c++11 high_dim_array_4.cpp
% ./a.out
high_dim_array_4.cpp:78: [main] r1 32400  it takes 34 ms
high_dim_array_4.cpp:86: [main] r2 57600  it takes 419 ms

SumOfFirstColumn 是连续访问内存,因为内存中,是一列一列像素连续存储的。

SumOfFirstRow 是非连续访问内存。

单独访问一个像素点的时候

  • 方法1 buf_[c][w][h][r], 需要访问四次内存,才能把内存中的像素数据搬运到寄存器中运算。
  • 方法2 buf_[c * width_ * height_ * num_of_colors_ + w * height_ * num_of_colors_ + h * num_of_colors_ + r];,需要做 6 次乘法,3 次加法,一次内存访问,才能完成数据搬移。

从结果上看,数学运算的速度要快于内存访问。当连续内存访问的时候,内存 cache 的命中率比较高,数学运算的速度并不一定快于内存访问。当非连续内存访问的时候,内存访问的 cache 命中率较低,二者的差距就更加明显了。