GPU上の疑似乱数の評価

GPU乱数の紹介

むかしむかし、古の時代、GLSLにまだ noise が搭載されていなかった頃……。 GPU上、特にフラグメントシェーダ上で noise の代わりに使われていた関数がありました。

(参考: ランダムな値を返す関数 on GLSL )

float rand(vec2 co){
    return fract(sin(dot(co.xy ,vec2(12.9898,78.233))) * 43758.5453);
}

vec2 co はフラグメントシェーダで適当に座標を取った物だと思ってください(そういう入力が必要です) GPUの事情として座標は -1.0 <= x, y <= 1.0 となります。

dot は内積を取る関数で、dot(v1, v2) = v1.x * v2.x + v1.y * v2.y のことです。

つまりこの関数は、

操作を行います。

x が画面サイズに依存してちょっとずつ増加し、それが 1.0 を取るたびに y の値が増加しています。 なので裏には1つの自然数 n があって、そこから射影されて x, y が出てるようなものです。 ということは結局自然数から浮動小数点数 0.0 < r < 1.0 への関数と見做せます。つまり一応、疑似乱数チックな関数なのです。

是非、実際に挙動を見てみましょう。以下のページで裏に表示されているざらざらしたものがこの結果です。

GLSL Sandbox上に実装したGPU乱数

本当に適切な疑似乱数なの?

ところがこの乱数は 伝統的に使われてきた物 で、乱数としてどの程度妥当なのかわかっていないらしいのです。 (といっても分かってないのは私の仲間内の話なので、識者の方がいたら教えていただきたいです。)

ということで、これが疑似乱数としてどれくらい「まとも」なのか気になりました。

疑似乱数の妥当性をチェックする手法はいくつも提案されています。ここではメジャーな dieharder テストツールを使ってみることにしました。

floatからバイナリ列への変換(dieharderを使うために)

dieharder を使うためには、まず乱数列をバイナリの形で準備する必要があります。 しかしこの乱数エンジンはfloatで 0 < r < 1 になる値ですから、そのまま出力したら乱数として不適切です。 そこで float がIEEE 754の単精度で実装されていることを思い出しました。 仮数部の23ビットだけを取り出せば、これで適切なバイナリになるはずです

そこでCPU上で以下のコードを実装しました。

// GPUでよく使われてる謎の乱数生成エンジン
// 実際に乱数としてどんなものか試すのにビット列を出してみて
// dieherderに当ててみる

#include <stdio.h>
#include <math.h>
#include <stdlib.h>
#include <string.h>

const size_t WRITE_BITS = 23;
// WRITE_BITSの倍数にしておけばちょうど書き込んだところで書きだせる
const size_t BUFFER_SIZE = (4*1024) * WRITE_BITS;

class GPURandomGen {
    size_t offset, width, height;

public:

    GPURandomGen(size_t width, size_t height, size_t offset)
        : width(width)
        , height(height)
        , offset(offset)
    { }

    inline float x() const { return static_cast<float>(offset % width) / width; }
    inline float y() const { return static_cast<float>(offset / width) / height; }

    unsigned int next() {
        const float x = this->x();
        const float y = this->y();

        ++offset;

        union Converter {
            float f;
            unsigned int i;
        };
        Converter conv;

        conv.f = sinf(12.9898 * x + 78.233* y) * 43758.5453;
        return conv.i & 0x007FFFFF;
    }
};

class WriteBuffer {
    unsigned char buffer[BUFFER_SIZE];
    size_t pos;
    size_t bit_pos;

    FILE* fp;

    void write_out() {
        if( pos == 0 ) return;
        fwrite(buffer, pos, 1, fp);
    }

public:

    WriteBuffer() : pos(0), bit_pos(0) {
        memset(&buffer, 0, BUFFER_SIZE);

        if( (fp = fopen("rand.dat", "wb+")) == NULL ) {
            throw "Exception: file cannot open.";
        }
    }

    virtual ~WriteBuffer() {
        fclose(fp);
    }

    inline size_t size() const { return BUFFER_SIZE; }

    void clear() {
        memset(&buffer, 0, BUFFER_SIZE);
        pos = 0;
        bit_pos = 0;
    }

    void flush() {
        write_out();
        clear();
    }

    void write(unsigned int data) {
        // TODO: 1bitずつ操作しないでもっと効率よくする
        for( size_t j = 0; j < WRITE_BITS; ++j ) {
            const unsigned int shift_size = WRITE_BITS - j;
            buffer[pos] |= ( (data >> shift_size) & 0x1 ) << bit_pos;

            ++bit_pos;
            if( bit_pos == 8 ) {
                ++pos;
                bit_pos = 0;
            }

            if( pos == BUFFER_SIZE ) {
                flush();
            }
        }
    }
};

int main(int argc, char* argv[]) {
    if( argc != 5 ) {
        printf("usage: %s <bytes> <width> <height> <offset>\n", argv[0]);
        return 1;
    }

    const long long bytes = atoll(argv[1]);
    const long long count = (bytes * 8) / WRITE_BITS;
    const size_t width = atoi(argv[2]);
    const size_t height = atoi(argv[3]);
    const size_t offset = atoi(argv[4]);

    GPURandomGen gen(width, height, offset);

    WriteBuffer buffer;

    size_t pos = 0;
    unsigned int bit_pos = 0;

    for( long long i = 0; i < count; ++i ) {
        buffer.write(gen.next());
    }
    buffer.flush();

    return 0;
}

dieharderに当ててみた

動かしてみましょう。 以下のようなパラメタで動かしました。 dieharder は15GBほどの乱数列を要するらしいので、bytes = 16106127360 = 15GiBytes (width, height) = (1024, 768) はよくあるゲームの解像度です。 offset に適切なパラメタは特に思いつかなかったので 0 です。

$ g++ rand_test.cpp
$ ./a.out
usage: ./a.out <bytes> <width> <height> <offset>
$ ./a.out 16106127360 1024 768 0

しばらく待って rand.dat が出来たので dieharder に食べさせたのが以下の結果です。

実行したコマンド

$ ls -l
total 15728664
-rwxrwxr-x 1 gaura gaura       14520 Apr  2 21:57 a.out
-rw-rw-r-- 1 gaura gaura 16106127358 Apr  2 22:17 rand.dat
-rw-r--r-- 1 gaura gaura        2980 Apr  2 21:57 rand_test.cpp
$ dieharder -a -g 201 -f rand.dat

結果

#=============================================================================#
#            dieharder version 3.31.1 Copyright 2003 Robert G. Brown          #
#=============================================================================#
   rng_name    |           filename             |rands/second|
 file_input_raw|                        rand.dat|  3.41e+07  |
#=============================================================================#
        test_name   |ntup| tsamples |psamples|  p-value |Assessment
#=============================================================================#
   diehard_birthdays|   0|       100|     100|0.60763200|  PASSED  
      diehard_operm5|   0|   1000000|     100|0.00000000|  FAILED  
  diehard_rank_32x32|   0|     40000|     100|0.45642073|  PASSED  
    diehard_rank_6x8|   0|    100000|     100|0.00000000|  FAILED  
   diehard_bitstream|   0|   2097152|     100|0.00000000|  FAILED  
        diehard_opso|   0|   2097152|     100|0.00000000|  FAILED  
        diehard_oqso|   0|   2097152|     100|0.00000000|  FAILED  
         diehard_dna|   0|   2097152|     100|0.00000000|  FAILED  
diehard_count_1s_str|   0|    256000|     100|0.00000000|  FAILED  
diehard_count_1s_byt|   0|    256000|     100|0.00000000|  FAILED  
 diehard_parking_lot|   0|     12000|     100|0.00000000|  FAILED  
    diehard_2dsphere|   2|      8000|     100|0.00000000|  FAILED  
    diehard_3dsphere|   3|      4000|     100|0.00000000|  FAILED  
     diehard_squeeze|   0|    100000|     100|0.00000000|  FAILED  
        diehard_sums|   0|       100|     100|0.00000000|  FAILED  
        diehard_runs|   0|    100000|     100|0.00000000|  FAILED  
        diehard_runs|   0|    100000|     100|0.00000000|  FAILED  
       diehard_craps|   0|    200000|     100|0.00000000|  FAILED  
       diehard_craps|   0|    200000|     100|0.00000000|  FAILED  
 marsaglia_tsang_gcd|   0|  10000000|     100|0.00000000|  FAILED  
 marsaglia_tsang_gcd|   0|  10000000|     100|0.00000000|  FAILED  
         sts_monobit|   1|    100000|     100|0.00000000|  FAILED  
            sts_runs|   2|    100000|     100|0.00000000|  FAILED  
          sts_serial|   1|    100000|     100|0.00000000|  FAILED  
          sts_serial|   2|    100000|     100|0.00000000|  FAILED  
          sts_serial|   3|    100000|     100|0.00000000|  FAILED  
          sts_serial|   3|    100000|     100|0.00000000|  FAILED  
          sts_serial|   4|    100000|     100|0.00000000|  FAILED  
          sts_serial|   4|    100000|     100|0.00000000|  FAILED  
          sts_serial|   5|    100000|     100|0.00000000|  FAILED  
          sts_serial|   5|    100000|     100|0.00000000|  FAILED  
          sts_serial|   6|    100000|     100|0.00000000|  FAILED  
          sts_serial|   6|    100000|     100|0.00000000|  FAILED  
          sts_serial|   7|    100000|     100|0.00000000|  FAILED  
          sts_serial|   7|    100000|     100|0.00000000|  FAILED  
          sts_serial|   8|    100000|     100|0.00000000|  FAILED  
          sts_serial|   8|    100000|     100|0.00000000|  FAILED  
          sts_serial|   9|    100000|     100|0.00000000|  FAILED  
          sts_serial|   9|    100000|     100|0.00000000|  FAILED  
          sts_serial|  10|    100000|     100|0.00000000|  FAILED  
          sts_serial|  10|    100000|     100|0.00000000|  FAILED  
          sts_serial|  11|    100000|     100|0.00000000|  FAILED  
          sts_serial|  11|    100000|     100|0.00000000|  FAILED  
          sts_serial|  12|    100000|     100|0.00000000|  FAILED  
          sts_serial|  12|    100000|     100|0.00000000|  FAILED  
          sts_serial|  13|    100000|     100|0.00000000|  FAILED  
          sts_serial|  13|    100000|     100|0.00000000|  FAILED  
          sts_serial|  14|    100000|     100|0.00000000|  FAILED  
          sts_serial|  14|    100000|     100|0.00000000|  FAILED  
          sts_serial|  15|    100000|     100|0.00000000|  FAILED  
          sts_serial|  15|    100000|     100|0.00000000|  FAILED  
          sts_serial|  16|    100000|     100|0.00000000|  FAILED  
          sts_serial|  16|    100000|     100|0.00000000|  FAILED  
         rgb_bitdist|   1|    100000|     100|0.00000000|  FAILED  
         rgb_bitdist|   2|    100000|     100|0.00000000|  FAILED  
         rgb_bitdist|   3|    100000|     100|0.00000000|  FAILED  
         rgb_bitdist|   4|    100000|     100|0.00000000|  FAILED  
         rgb_bitdist|   5|    100000|     100|0.00000000|  FAILED  
         rgb_bitdist|   6|    100000|     100|0.00000000|  FAILED  
         rgb_bitdist|   7|    100000|     100|0.00000000|  FAILED  
         rgb_bitdist|   8|    100000|     100|0.00000000|  FAILED  
# The file file_input_raw was rewound 1 times
         rgb_bitdist|   9|    100000|     100|0.00000000|  FAILED  
# The file file_input_raw was rewound 1 times
         rgb_bitdist|  10|    100000|     100|0.00000000|  FAILED  
# The file file_input_raw was rewound 1 times
         rgb_bitdist|  11|    100000|     100|0.00000000|  FAILED  
# The file file_input_raw was rewound 1 times
         rgb_bitdist|  12|    100000|     100|0.00000000|  FAILED  
# The file file_input_raw was rewound 1 times
rgb_minimum_distance|   2|     10000|    1000|0.00000000|  FAILED  
# The file file_input_raw was rewound 1 times
rgb_minimum_distance|   3|     10000|    1000|0.00000000|  FAILED  
# The file file_input_raw was rewound 1 times
rgb_minimum_distance|   4|     10000|    1000|0.00000000|  FAILED  
# The file file_input_raw was rewound 1 times
rgb_minimum_distance|   5|     10000|    1000|0.00000000|  FAILED  
# The file file_input_raw was rewound 1 times
    rgb_permutations|   2|    100000|     100|0.00000033|  FAILED  
# The file file_input_raw was rewound 1 times
    rgb_permutations|   3|    100000|     100|0.00000000|  FAILED  
# The file file_input_raw was rewound 1 times
    rgb_permutations|   4|    100000|     100|0.00000000|  FAILED  
# The file file_input_raw was rewound 1 times
    rgb_permutations|   5|    100000|     100|0.00000000|  FAILED  
# The file file_input_raw was rewound 1 times
      rgb_lagged_sum|   0|   1000000|     100|0.00000000|  FAILED  
# The file file_input_raw was rewound 1 times
      rgb_lagged_sum|   1|   1000000|     100|0.00000000|  FAILED  
# The file file_input_raw was rewound 1 times
      rgb_lagged_sum|   2|   1000000|     100|0.00000000|  FAILED  
# The file file_input_raw was rewound 1 times
      rgb_lagged_sum|   3|   1000000|     100|0.00000000|  FAILED  
# The file file_input_raw was rewound 1 times
      rgb_lagged_sum|   4|   1000000|     100|0.00000000|  FAILED  
# The file file_input_raw was rewound 1 times
      rgb_lagged_sum|   5|   1000000|     100|0.00000000|  FAILED  
# The file file_input_raw was rewound 1 times
      rgb_lagged_sum|   6|   1000000|     100|0.00000000|  FAILED  
# The file file_input_raw was rewound 2 times
      rgb_lagged_sum|   7|   1000000|     100|0.00000000|  FAILED  
# The file file_input_raw was rewound 2 times
      rgb_lagged_sum|   8|   1000000|     100|0.00000000|  FAILED  
# The file file_input_raw was rewound 2 times
      rgb_lagged_sum|   9|   1000000|     100|0.00000000|  FAILED  
# The file file_input_raw was rewound 2 times
      rgb_lagged_sum|  10|   1000000|     100|0.00000000|  FAILED  
# The file file_input_raw was rewound 3 times
      rgb_lagged_sum|  11|   1000000|     100|0.00000000|  FAILED  
# The file file_input_raw was rewound 3 times
      rgb_lagged_sum|  12|   1000000|     100|0.00000000|  FAILED  
# The file file_input_raw was rewound 3 times
      rgb_lagged_sum|  13|   1000000|     100|0.00000000|  FAILED  
# The file file_input_raw was rewound 4 times
      rgb_lagged_sum|  14|   1000000|     100|0.00000000|  FAILED  
# The file file_input_raw was rewound 4 times
      rgb_lagged_sum|  15|   1000000|     100|0.00000000|  FAILED  
# The file file_input_raw was rewound 5 times
      rgb_lagged_sum|  16|   1000000|     100|0.00000000|  FAILED  
# The file file_input_raw was rewound 5 times
      rgb_lagged_sum|  17|   1000000|     100|0.00000000|  FAILED  
# The file file_input_raw was rewound 5 times
      rgb_lagged_sum|  18|   1000000|     100|0.00000000|  FAILED  
# The file file_input_raw was rewound 6 times
      rgb_lagged_sum|  19|   1000000|     100|0.00000000|  FAILED  
# The file file_input_raw was rewound 7 times
      rgb_lagged_sum|  20|   1000000|     100|0.00000000|  FAILED  
# The file file_input_raw was rewound 7 times
      rgb_lagged_sum|  21|   1000000|     100|0.00000000|  FAILED  
# The file file_input_raw was rewound 8 times
      rgb_lagged_sum|  22|   1000000|     100|0.00000001|  FAILED  
# The file file_input_raw was rewound 8 times
      rgb_lagged_sum|  23|   1000000|     100|0.00000000|  FAILED  
# The file file_input_raw was rewound 9 times
      rgb_lagged_sum|  24|   1000000|     100|0.00000000|  FAILED  
# The file file_input_raw was rewound 9 times
      rgb_lagged_sum|  25|   1000000|     100|0.00000000|  FAILED  
# The file file_input_raw was rewound 10 times
      rgb_lagged_sum|  26|   1000000|     100|0.00000000|  FAILED  
# The file file_input_raw was rewound 11 times
      rgb_lagged_sum|  27|   1000000|     100|0.00000000|  FAILED  
# The file file_input_raw was rewound 12 times
      rgb_lagged_sum|  28|   1000000|     100|0.00000000|  FAILED  
# The file file_input_raw was rewound 12 times
      rgb_lagged_sum|  29|   1000000|     100|0.00000000|  FAILED  
# The file file_input_raw was rewound 13 times
      rgb_lagged_sum|  30|   1000000|     100|0.00000000|  FAILED  
# The file file_input_raw was rewound 14 times
      rgb_lagged_sum|  31|   1000000|     100|0.00000000|  FAILED  
# The file file_input_raw was rewound 15 times
      rgb_lagged_sum|  32|   1000000|     100|0.00000000|  FAILED  
# The file file_input_raw was rewound 15 times
     rgb_kstest_test|   0|     10000|    1000|0.00000000|  FAILED  
# The file file_input_raw was rewound 15 times
     dab_bytedistrib|   0|  51200000|       1|0.00000000|  FAILED  
# The file file_input_raw was rewound 15 times
             dab_dct| 256|     50000|       1|0.00000000|  FAILED  
Preparing to run test 207.  ntuple = 0
# The file file_input_raw was rewound 15 times
        dab_filltree|  32|  15000000|       1|0.00000000|  FAILED  
        dab_filltree|  32|  15000000|       1|0.00000000|  FAILED  
Preparing to run test 208.  ntuple = 0
# The file file_input_raw was rewound 15 times
       dab_filltree2|   0|   5000000|       1|0.00000000|  FAILED  
       dab_filltree2|   1|   5000000|       1|0.00000000|  FAILED  
Preparing to run test 209.  ntuple = 0
# The file file_input_raw was rewound 15 times
        dab_monobit2|  12|  65000000|       1|1.00000000|  FAILED

考察

ということでまあ、案の定なんですがほとんどすべてのテストに落ちてますね……。

ということで本当に疑似乱数としての品質はあまりよろしくありません。 素直に最新のGLSLではnoiseを使いましょう。そうじゃない場合は見栄えの問題ですから、たいていは大丈夫かもしれないと思います。 ただたとえばメトロポリス光輸送のようなガチガチの確率的レンダリングアルゴリズムを使うと変な効果が出るかもしれません。

まとめ