とりあえず貼ってみる

Hack the Cell 2009、提出しなかったけど(ぉぃ)とりあえずやったので貼ってみる。

/*
   Copyright (C) 2009, A.H.
   All rights reserved.

   Redistribution and use in source and binary forms, with or without
   modification, are permitted provided that the following conditions
   are met:

     1. Redistributions of source code must retain the above copyright
        notice, this list of conditions and the following disclaimer.

     2. Redistributions in binary form must reproduce the above copyright
        notice, this list of conditions and the following disclaimer in the
        documentation and/or other materials provided with the distribution.

     3. The names of its contributors may not be used to endorse or promote 
        products derived from this software without specific prior written 
        permission.

   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDER "AS IS" AND ANY
   EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
   WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
   DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER BE LIABLE FOR ANY
   DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
   (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
   LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
   ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/

#include <stdio.h>
#include <spu_mfcio.h>
#include "mt_mine.h"

/* Period parameters */
#define N 624
#define M 397
#define NV (N/4)
#define MV (M/4)
#define NA (NV*16)
#define MA (MV*16)
#define MATRIX_A 0x9908b0dfUL   /* constant vector a */
#define UPPER_MASK 0x80000000UL /* most significant w-r bits */
#define LOWER_MASK 0x7fffffffUL /* least significant r bits */

typedef          signed char        int8_t;
typedef        unsigned char       uint8_t;
typedef          signed int        int32_t;
typedef        unsigned int       uint32_t;
typedef vector   signed char       int8v_t;
typedef vector unsigned char      uint8v_t;
typedef vector   signed int       int32v_t;
typedef vector unsigned int      uint32v_t;

//  端数処理の際の index 処理を簡素化するため、2倍の領域を利用
static uint32_t mt[N*2] __attribute__((aligned(16)));
static int mti=N+1; /* mti==N+1 means mt[N] is not initialized */

/* initializes mt[N] with a seed */
void init_genrand_mine(unsigned long s)
{
  mt[0]= s & 0xffffffffUL;
  for (mti=1; mti<N; mti++) {
    mt[mti] =
      (1812433253UL * (mt[mti-1] ^ (mt[mti-1] >> 30)) + mti);
    /* See Knuth TAOCP Vol2. 3rd Ed. P.106 for multiplier. */
    /* In the previous versions, MSBs of the seed affect   */
    /* only MSBs of the array mt[].                        */
    /* 2002/01/09 modified by Makoto Matsumoto             */
    mt[mti] &= 0xffffffffUL;
    /* for >32 bit machines */
  }
}

uint32_t matrixes[] = {
    0x00000000, 0x00000000, 0x00000000, 0x00000000,
    0x00000000, 0x00000000, 0x00000000, 0x9908b0df,
    0x00000000, 0x00000000, 0x9908b0df, 0x00000000,
    0x00000000, 0x00000000, 0x9908b0df, 0x9908b0df,
    0x00000000, 0x9908b0df, 0x00000000, 0x00000000,
    0x00000000, 0x9908b0df, 0x00000000, 0x9908b0df,
    0x00000000, 0x9908b0df, 0x9908b0df, 0x00000000,
    0x00000000, 0x9908b0df, 0x9908b0df, 0x9908b0df,
    0x9908b0df, 0x00000000, 0x00000000, 0x00000000,
    0x9908b0df, 0x00000000, 0x00000000, 0x9908b0df,
    0x9908b0df, 0x00000000, 0x9908b0df, 0x00000000,
    0x9908b0df, 0x00000000, 0x9908b0df, 0x9908b0df,
    0x9908b0df, 0x9908b0df, 0x00000000, 0x00000000,
    0x9908b0df, 0x9908b0df, 0x00000000, 0x9908b0df,
    0x9908b0df, 0x9908b0df, 0x9908b0df, 0x00000000,
    0x9908b0df, 0x9908b0df, 0x9908b0df, 0x9908b0df,
};

// global 変数にすることで最適化を抑制
// static 化するとレジスタを消費するため遅くなる
uint32v_t *vmt = (uint32v_t*)mt;
const uint32v_t *cvmt = (uint32v_t*)mt;
// static 変数にすることで最適化(初期化用)
static uint32v_t *const svmt = (uint32v_t*)mt;

#define NL      7               // 分割ステップ数
#define ND      3               // STEP01 から STEP04 までの遅延分
#define NP      3               // 事前処理ステップ数
#define NF      (NP-ND)         // loop 初回の table offset
#define NE      (NL-NP)         // 事後処理ステップ数
#define NDA     (ND*16)         // STEP01F での table offset
#define NIDX    (-NA+(NF*16))   // STEP01F/04F 用 index 初期値

#define spu_lqx(ra,rb) *(uint32v_t*)( ra + (int32_t)spu_extract( rb, 0 ) )
#define expect(flag,dir) __builtin_expect( (flag), dir )
#define likely(flag) __builtin_expect( (flag), 1 )
#define unlikely(flag) __builtin_expect( (flag), 0 )

unsigned int genrand_mine( int num_rand ) {
    uint8_t * const mats = (uint8_t*)matrixes;
    uint8_t * const mts = (uint8_t*)mt + NA;
    uint8_t * const mte = (uint8_t*)mt + NA + NA;
    uint8_t * const mt1 = (uint8_t*)mt + NA + NDA;
    uint8_t * const mtM = (uint8_t*)mt + NA + MA + NDA;
    const uint32v_t zero = { 0 };
    const uint32v_t y07_mask = spu_splats( (uint32_t)0x9d2c5680 );
    const uint32v_t y15_mask = spu_splats( (uint32_t)0xefc60000 );
    const uint32v_t upper_mask = spu_splats( (uint32_t)0x80000000 );
    const uint8v_t ptn = { 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19 };
    volatile int t = 0;     // 最適化抑制用
    uint32v_t idx = spu_splats( (uint32_t)(NIDX) ); // STEP01F/04F での table index
    uint32v_t kk1a, kkMa, kv1a, kvMa, rlma, ygba, adra, maga, suma;
    uint32v_t kk1b, kkMb, kv1b, kvMb, rlmb, ygbb, adrb, magb, sumb;
    uint32v_t y01a, y02a, y03a, y04a, y05a, y06a, y07a, y08a, y09a, y10a, y11a, y12a, y13a, y14a;
    uint32v_t y01b, y02b, y03b, y04b, y05b, y06b, y07b, y08b, y09b, y10b, y11b, y12b, y13b, y14b;
    kk1a = kk1b = svmt[0];
    kkMa = kkMb = svmt[MV];
    y01a = y02a = y03a = y04a = y05a = y06a = y07a = zero;
    y01b = y02b = y03b = y04b = y05b = y06b = y07b = zero;
    y08a = y09a = y10a = y11a = y12a = y13a = y14a = suma = zero;
    y08b = y09b = y10b = y11b = y12b = y13b = y14b = sumb = zero;

//  SIMD 化した更新処理(13cycle)を 7step に分割 (step 間の変数依存を調整)
//  逆順ループ(07 to 01)を7回実行すると答えが出るようにし最適化を推進
//  変数を x/y 2面持つことで 2 ROUND 間でも依存関係をなくし最適化を推進
#   define STEP01(x,y,n) \
     kk1##x = cvmt[ ( (n) + 1 + NP ) % NV ]; \
     kkM##x = cvmt[ ( (n) + 1 + NP + MV ) % NV ]; \
     kv1##x = spu_shuffle( kk1##y, kk1##x, ptn ); \
     kvM##x = spu_shuffle( kkM##y, kkM##x, ptn ); \

#   define STEP02(x,y) \
    y01##x = spu_sel( kv1##x, kk1##y, upper_mask ); \
     ygb##x = spu_gather( kv1##x ); \
    rlm##x = spu_rlmask( y01##x, -1 ); \
     adr##x = spu_slqw( ygb##x, 4 ); \
    y02##x = spu_xor( rlm##x, kvM##x ); \

#   define STEP03(x) \
     mag##x = spu_lqx( mats, adr##x ); \
    y03##x = spu_xor( y02##x, mag##x ); \

#   define STEP04(x,n) \
    y04##x = spu_rlmask( y03##x, -11 ); \
    y05##x = spu_xor( y03##x, y04##x ); \
     y06##x = spu_slqw( y05##x, 7 ); \
     vmt[ ( (n) + NF ) % NV ] = y03##x; \

#   define STEP05(x) \
    y07##x = spu_and( y06##x, y07_mask ); \
    y08##x = spu_xor( y05##x, y07##x ); \
     y09##x = spu_slqw( y08##x, 15 ); \

#   define STEP06(x) \
     y10##x = spu_slqwbytebc( y09##x, 15 ); \
    y11##x = spu_and( y10##x, y15_mask ); \
    y12##x = spu_xor( y08##x, y11##x ); \

#   define STEP07(x,y) \
    y13##x = spu_rlmask( y12##x, -18 ); \
    y14##x = spu_xor( y12##x, y13##x ); \
    sum##x = spu_add( sum##y, y14##x ); \

//  上記 STEP01/04 の固定値による table access を変数参照化
#   define STEP01F(x,y) \
    idx = spu_add( idx, 16 ); \
     kk1##x = *(uint32v_t*)( mt1 + (int32_t)spu_extract( idx, 0 ) ); \
     kkM##x = *(uint32v_t*)( mtM + (int32_t)spu_extract( idx, 0 ) ); \
     kv1##x = spu_shuffle( kk1##y, kk1##x, ptn ); \
     kvM##x = spu_shuffle( kkM##y, kkM##x, ptn ); \

#   define STEP04F(x) \
    y04##x = spu_rlmask( y03##x, -11 ); \
    y05##x = spu_xor( y03##x, y04##x ); \
     y06##x = spu_slqw( y05##x, 7 ); \
     *(uint32v_t*)( mts + (int32_t)spu_extract( idx, 0 ) ) = y03##x; \
     *(uint32v_t*)( mte + (int32_t)spu_extract( idx, 0 ) ) = y03##x; \

//  事前処理する部分をマクロ化 (3step)
#   define PROLOGUE \
    STEP01(b,a,-NP+0) \
    STEP02(b,a) STEP01(a,b,-NP+1) \
    STEP03(b)   STEP02(a,b) STEP01(b,a,-NP+2) \

//  各 STEP を逆順に実行することにより、依存関係なく最適化
#   define ROUNDa(n) \
    { STEP07(a,b) STEP06(b) STEP05(a) STEP04(b,n) STEP03(a) STEP02(b,a) STEP01(a,b,n) }
#   define ROUNDb(n) \
    { STEP07(b,a) STEP06(a) STEP05(b) STEP04(a,n) STEP03(b) STEP02(a,b) STEP01(b,a,n) }

//  端数処理のための変数参照による更新処理
#   define FRACTIONa \
    { STEP07(a,b) STEP06(b) STEP05(a) STEP04F(b) STEP03(a) STEP02(b,a) STEP01F(a,b) }
#   define FRACTIONb \
    { STEP07(b,a) STEP06(a) STEP05(b) STEP04F(a) STEP03(b) STEP02(a,b) STEP01F(b,a) }

//  最適化抑制のため、volatile 変数を参照した branch
//  ※ spu-gcc 4.1.1 にて。他の version の場合は他の回避策で。
#   define RBREAK { if ( unlikely( t ) ) break; }

//  mt[624] の更新処理をまとめたマクロ (最終段のみ最適化抑制コードを除外)
#   define ROUND2(n) { ROUNDa((n)*2+0) ROUNDb((n)*2+1) }
#   define ROUND2R(n) { ROUND2(n) RBREAK }
#   define ROUND2L(n) { ROUND2(n) }
#   define ROUND6R(n) { ROUND2R((n)*3+0) ROUND2R((n)*3+1) ROUND2R((n)*3+2) }
#   define ROUND6L(n) { ROUND2R((n)*3+0) ROUND2R((n)*3+1) ROUND2L((n)*3+2) }
#   define R(n) { ROUND6R((n)*2+0) ROUND6R((n)*2+1) }
#   define L(n) { ROUND6R((n)*2+0) ROUND6L((n)*2+1) }
#   define ROUNDS { R(0) R(1) R(2) R(3) R(4) R(5) R(6) R(7) R(8) R(9) R(10) R(11) L(12) }

    num_rand = ( num_rand >> 2 ) + NE;      // SIMD 化、事後処理分を調整
    uint32_t num_main = num_rand / NV;
    uint32_t num_remain = num_rand % NV;

    PROLOGUE
    // main loop. 奇数回の場合は真ん中で break.
    uint32v_t count = spu_splats( num_main );
    do {
        ROUNDS
        uint32v_t check = spu_rlmaskqw( count, -1 );
        if ( unlikely( !spu_extract( check, 0 ) ) ) break;
        ROUNDS
        count = spu_add( count, -2 );
    } while ( likely( spu_extract( count, 0 ) ) );

    // 端数処理。4個単位で処理し、break と事後処理で結果を取捨選択
    count = spu_splats( num_remain >> 1 );
    while ( likely( spu_extract( count, 0 ) ) ) {
        FRACTIONa FRACTIONb
        uint32v_t check = spu_rlmaskqw( count, -1 );
        if ( unlikely( !( spu_extract( check, 0 ) ) ) ) break;
        FRACTIONa FRACTIONb
        count = spu_add( count, -2 );
    }
    uint32v_t sumv = !( num_remain & 1 ) ? suma : sumb;

    // SIMD 化した4要素を加算
    unsigned int sum = spu_extract( sumv, 0 );
    sum += spu_extract( sumv, 1 );
    sum += spu_extract( sumv, 2 );
    sum += spu_extract( sumv, 3 );
    return sum;
}