Loading ...
Sorry, an error occurred while loading the content.
 

Re: [Csnd] 31 bit pseudo-random number gen in C, C++ & dsPIC assembly code

Expand Messages
  • Istvan Varga
    Here is a comparison of the time it takes to generate 100,000,000 random numbers with three different generators, compiled with various compilers on Linux.
    Message 1 of 25 , Sep 27, 2005
      Here is a comparison of the time it takes to generate 100,000,000 random
      numbers with three different generators, compiled with various compilers
      on Linux.

      rand31: linear congruential generator (m = 0x7FFFFFFF, a = 742938285)
      rand: ANSI C rand()
      (note that it is much slower if not compiled with -static)
      MT: Mersenne Twister (MT19937)

      ------------------------------------------------------------------------

      GCC 4.1 beta (-O2 -march=pentium3 -static)

      rand31: 1.703, rand: 4.559, MT: 2.501

      GCC 4.1 beta (-O2 -march=pentium3 -fomit-frame-pointer -static)

      rand31: 1.731, rand: 4.561, MT: 2.321

      GCC 4.1 beta (-O2 -march=pentium3)

      rand31: 1.702, rand: 7.004, MT: 2.500

      GCC 4.0.0 (-O2 -march=pentium3 -static)

      rand31: 1.702, rand: 4.561, MT: 2.405

      GCC 4.0.0 (-O2 -march=pentium3 -fomit-frame-pointer -static)

      rand31: 1.732, rand: 4.562, MT: 2.339

      GCC 3.3.4 (-O2 -march=pentium3 -static)

      rand31: 1.863, rand: 4.172, MT: 2.579

      GCC 3.3.4 (-O2 -march=pentium3 -fomit-frame-pointer -static)

      rand31: 1.730, rand: 4.172, MT: 2.385

      ICC 9.0 (-O2 -xK -static)

      rand31: 1.841, rand: 4.410, MT: 2.474

      ------------------------------------------------------------------------

      #include <stdio.h>
      #include <stdlib.h>
      #include <stdint.h>
      #include <sys/time.h>

      #define CS_PURE __attribute__ ((__pure__))
      #define CS_NOINLINE __attribute__ ((__noinline__))
      #define PUBLIC

      typedef struct CsoundRandMTState_ {
      int mti;
      uint32_t mt[624];
      } CsoundRandMTState;

      PUBLIC CS_PURE int csoundRand31(int seedVal)
      {
      uint64_t tmp1;
      uint32_t tmp2;

      /* x = (742938285 * x) % 0x7FFFFFFF */
      tmp1 = (uint64_t) ((int32_t) seedVal * (int64_t) 742938285);
      tmp2 = (uint32_t) tmp1 & (uint32_t) 0x7FFFFFFF;
      tmp2 += (uint32_t) (tmp1 >> 31);
      if ((int32_t) tmp2 < (int32_t) 0)
      tmp2 = (tmp2 + (uint32_t) 1) & (uint32_t) 0x7FFFFFFF;
      return (int) tmp2;
      }

      /* Period parameters */

      #define N 624
      #define M 397
      #define MATRIX_A 0x9908B0DFU /* constant vector a */
      #define UPPER_MASK 0x80000000U /* most significant w-r bits */
      #define LOWER_MASK 0x7FFFFFFFU /* least significant r bits */

      /* initializes mt[N] with a seed */

      PUBLIC void csoundSeedRandMT(CsoundRandMTState *p, uint32_t seedVal)
      {
      int i;
      uint32_t x;

      p->mt[0] = x = seedVal;
      for (i = 1; i < N; i++) {
      /* See Knuth TAOCP Vol2. 3rd Ed. P.106 for multiplier. */
      /* In the previous versions, MSBs of the seed affect */
      /* only MSBs of the array mt[]. */
      /* 2002/01/09 modified by Makoto Matsumoto */
      x = ((uint32_t) 1812433253 * (x ^ (x >> 30)) + (uint32_t) i);
      p->mt[i] = x;
      }
      p->mti = N;
      }

      static CS_NOINLINE void MT_update_state(CsoundRandMTState *p)
      {
      /* mag01[x] = x * MATRIX_A for x=0,1 */
      const uint32_t mag01[2] = { (uint32_t) 0, (uint32_t) MATRIX_A };
      int i;
      uint32_t y;
      for (i = 0; i < (N - M); i++) {
      y = (p->mt[i] & UPPER_MASK) | (p->mt[i + 1] & LOWER_MASK);
      p->mt[i] = p->mt[i + M] ^ (y >> 1) ^ mag01[y & (uint32_t) 1];
      }
      for ( ; i < (N - 1); i++) {
      y = (p->mt[i] & UPPER_MASK) | (p->mt[i + 1] & LOWER_MASK);
      p->mt[i] = p->mt[i + (M - N)] ^ (y >> 1) ^ mag01[y & (uint32_t) 1];
      }
      y = (p->mt[N - 1] & UPPER_MASK) | (p->mt[0] & LOWER_MASK);
      p->mt[N - 1] = p->mt[M - 1] ^ (y >> 1) ^ mag01[y & (uint32_t) 1];
      }

      /* generates a random number on [0,0xffffffff]-interval */

      PUBLIC uint32_t csoundRandMT(CsoundRandMTState *p)
      {
      int i = p->mti;
      uint32_t y;

      if (i >= N) { /* generate N words at one time */
      MT_update_state(p);
      i = 0;
      }
      y = p->mt[i];
      p->mti = i + 1;
      /* Tempering */
      y ^= (y >> 11);
      y ^= (y << 7) & (uint32_t) 0x9D2C5680U;
      y ^= (y << 15) & (uint32_t) 0xEFC60000U;
      y ^= (y >> 18);

      return y;
      }

      volatile uint32_t seed_;

      int main(void)
      {
      CsoundRandMTState st;
      double t0, t1, t2, t3;
      struct timeval tv;
      int i;

      gettimeofday(&tv, NULL);
      t0 = (double) tv.tv_usec * 0.000001 + (double) tv.tv_sec;
      seed_ = 12345;
      for (i = 0; i < 100000000; i++)
      seed_ = csoundRand31(seed_);
      gettimeofday(&tv, NULL);
      t1 = (double) tv.tv_usec * 0.000001 + (double) tv.tv_sec;
      srand(314159);
      for (i = 0; i < 100000000; i++)
      seed_ = rand();
      gettimeofday(&tv, NULL);
      t2 = (double) tv.tv_usec * 0.000001 + (double) tv.tv_sec;
      csoundSeedRandMT(&st, 5489);
      for (i = 0; i < 100000000; i++) {
      seed_ = csoundRandMT(&st);
      /* if (i < 500) {
      printf("%10u ", seed_);
      if ((i % 5) == 4) printf("\n");
      } */
      }
      gettimeofday(&tv, NULL);
      t3 = (double) tv.tv_usec * 0.000001 + (double) tv.tv_sec;
      printf("rand31: %.3f, rand: %.3f, MT: %.3f\n",
      (t1 - t0), (t2 - t1), (t3 - t2));
      return 0;
      }
      --
      Send bugs reports to this list.
      To unsubscribe, send email to csound-unsubscribe@...
    • Michael Gogins
      Fantastic! Thanks, Mike ... From: Istvan Varga To: Sent: Tuesday, September 27, 2005 7:11 PM Subject: Re:
      Message 2 of 25 , Sep 27, 2005
        Fantastic!

        Thanks,
        Mike

        ----- Original Message -----
        From: "Istvan Varga" <istvan_v@...>
        To: <csound@...>
        Sent: Tuesday, September 27, 2005 7:11 PM
        Subject: Re: [Csnd] 31 bit pseudo-random number gen in C, C++ & dsPIC
        assembly code


        > Here is a comparison of the time it takes to generate 100,000,000 random
        > numbers with three different generators, compiled with various compilers
        > on Linux.
        >
        > rand31: linear congruential generator (m = 0x7FFFFFFF, a = 742938285)
        > rand: ANSI C rand()
        > (note that it is much slower if not compiled with -static)
        > MT: Mersenne Twister (MT19937)
        >
        > ------------------------------------------------------------------------
        >
        > GCC 4.1 beta (-O2 -march=pentium3 -static)
        >
        > rand31: 1.703, rand: 4.559, MT: 2.501
        >
        > GCC 4.1 beta (-O2 -march=pentium3 -fomit-frame-pointer -static)
        >
        > rand31: 1.731, rand: 4.561, MT: 2.321
        >
        > GCC 4.1 beta (-O2 -march=pentium3)
        >
        > rand31: 1.702, rand: 7.004, MT: 2.500
        >
        > GCC 4.0.0 (-O2 -march=pentium3 -static)
        >
        > rand31: 1.702, rand: 4.561, MT: 2.405
        >
        > GCC 4.0.0 (-O2 -march=pentium3 -fomit-frame-pointer -static)
        >
        > rand31: 1.732, rand: 4.562, MT: 2.339
        >
        > GCC 3.3.4 (-O2 -march=pentium3 -static)
        >
        > rand31: 1.863, rand: 4.172, MT: 2.579
        >
        > GCC 3.3.4 (-O2 -march=pentium3 -fomit-frame-pointer -static)
        >
        > rand31: 1.730, rand: 4.172, MT: 2.385
        >
        > ICC 9.0 (-O2 -xK -static)
        >
        > rand31: 1.841, rand: 4.410, MT: 2.474
        >
        > ------------------------------------------------------------------------
        >
        > #include <stdio.h>
        > #include <stdlib.h>
        > #include <stdint.h>
        > #include <sys/time.h>
        >
        > #define CS_PURE __attribute__ ((__pure__))
        > #define CS_NOINLINE __attribute__ ((__noinline__))
        > #define PUBLIC
        >
        > typedef struct CsoundRandMTState_ {
        > int mti;
        > uint32_t mt[624];
        > } CsoundRandMTState;
        >
        > PUBLIC CS_PURE int csoundRand31(int seedVal)
        > {
        > uint64_t tmp1;
        > uint32_t tmp2;
        >
        > /* x = (742938285 * x) % 0x7FFFFFFF */
        > tmp1 = (uint64_t) ((int32_t) seedVal * (int64_t) 742938285);
        > tmp2 = (uint32_t) tmp1 & (uint32_t) 0x7FFFFFFF;
        > tmp2 += (uint32_t) (tmp1 >> 31);
        > if ((int32_t) tmp2 < (int32_t) 0)
        > tmp2 = (tmp2 + (uint32_t) 1) & (uint32_t) 0x7FFFFFFF;
        > return (int) tmp2;
        > }
        >
        > /* Period parameters */
        >
        > #define N 624
        > #define M 397
        > #define MATRIX_A 0x9908B0DFU /* constant vector a */
        > #define UPPER_MASK 0x80000000U /* most significant w-r bits */
        > #define LOWER_MASK 0x7FFFFFFFU /* least significant r bits */
        >
        > /* initializes mt[N] with a seed */
        >
        > PUBLIC void csoundSeedRandMT(CsoundRandMTState *p, uint32_t seedVal)
        > {
        > int i;
        > uint32_t x;
        >
        > p->mt[0] = x = seedVal;
        > for (i = 1; i < N; i++) {
        > /* See Knuth TAOCP Vol2. 3rd Ed. P.106 for multiplier. */
        > /* In the previous versions, MSBs of the seed affect */
        > /* only MSBs of the array mt[]. */
        > /* 2002/01/09 modified by Makoto Matsumoto */
        > x = ((uint32_t) 1812433253 * (x ^ (x >> 30)) + (uint32_t) i);
        > p->mt[i] = x;
        > }
        > p->mti = N;
        > }
        >
        > static CS_NOINLINE void MT_update_state(CsoundRandMTState *p)
        > {
        > /* mag01[x] = x * MATRIX_A for x=0,1 */
        > const uint32_t mag01[2] = { (uint32_t) 0, (uint32_t) MATRIX_A };
        > int i;
        > uint32_t y;
        > for (i = 0; i < (N - M); i++) {
        > y = (p->mt[i] & UPPER_MASK) | (p->mt[i + 1] & LOWER_MASK);
        > p->mt[i] = p->mt[i + M] ^ (y >> 1) ^ mag01[y & (uint32_t) 1];
        > }
        > for ( ; i < (N - 1); i++) {
        > y = (p->mt[i] & UPPER_MASK) | (p->mt[i + 1] & LOWER_MASK);
        > p->mt[i] = p->mt[i + (M - N)] ^ (y >> 1) ^ mag01[y & (uint32_t) 1];
        > }
        > y = (p->mt[N - 1] & UPPER_MASK) | (p->mt[0] & LOWER_MASK);
        > p->mt[N - 1] = p->mt[M - 1] ^ (y >> 1) ^ mag01[y & (uint32_t) 1];
        > }
        >
        > /* generates a random number on [0,0xffffffff]-interval */
        >
        > PUBLIC uint32_t csoundRandMT(CsoundRandMTState *p)
        > {
        > int i = p->mti;
        > uint32_t y;
        >
        > if (i >= N) { /* generate N words at one time */
        > MT_update_state(p);
        > i = 0;
        > }
        > y = p->mt[i];
        > p->mti = i + 1;
        > /* Tempering */
        > y ^= (y >> 11);
        > y ^= (y << 7) & (uint32_t) 0x9D2C5680U;
        > y ^= (y << 15) & (uint32_t) 0xEFC60000U;
        > y ^= (y >> 18);
        >
        > return y;
        > }
        >
        > volatile uint32_t seed_;
        >
        > int main(void)
        > {
        > CsoundRandMTState st;
        > double t0, t1, t2, t3;
        > struct timeval tv;
        > int i;
        >
        > gettimeofday(&tv, NULL);
        > t0 = (double) tv.tv_usec * 0.000001 + (double) tv.tv_sec;
        > seed_ = 12345;
        > for (i = 0; i < 100000000; i++)
        > seed_ = csoundRand31(seed_);
        > gettimeofday(&tv, NULL);
        > t1 = (double) tv.tv_usec * 0.000001 + (double) tv.tv_sec;
        > srand(314159);
        > for (i = 0; i < 100000000; i++)
        > seed_ = rand();
        > gettimeofday(&tv, NULL);
        > t2 = (double) tv.tv_usec * 0.000001 + (double) tv.tv_sec;
        > csoundSeedRandMT(&st, 5489);
        > for (i = 0; i < 100000000; i++) {
        > seed_ = csoundRandMT(&st);
        > /* if (i < 500) {
        > printf("%10u ", seed_);
        > if ((i % 5) == 4) printf("\n");
        > } */
        > }
        > gettimeofday(&tv, NULL);
        > t3 = (double) tv.tv_usec * 0.000001 + (double) tv.tv_sec;
        > printf("rand31: %.3f, rand: %.3f, MT: %.3f\n",
        > (t1 - t0), (t2 - t1), (t3 - t2));
        > return 0;
        > }
        > --
        > Send bugs reports to this list.
        > To unsubscribe, send email to csound-unsubscribe@...
        >


        --
        Send bugs reports to this list.
        To unsubscribe, send email to csound-unsubscribe@...
      • Robin Whittle
        Thanks Istvan for these timing results. What CPU and clock rate were you using? Andrew Simper, on the Music-DSP mailing list (
        Message 3 of 25 , Sep 28, 2005
          Thanks Istvan for these timing results. What CPU and clock rate were
          you using?

          Andrew Simper, on the Music-DSP mailing list (
          http://shoko.calarts.edu/musicdsp ), suggested an exceedingly nifty
          optimisation to the mod(0x7FFFFFFF) for my code:

          > Change
          > if (lo > 0x7FFFFFFF) lo -= 0x7FFFFFFF;
          > to:
          > lo = (lo & 0x7FFFFFFF) + (lo >> 31);

          I think this could also be applied to the 32 x 32 = 64 bit
          multiplication approach you use in csoundRand31().

          I think it is highly likely that an add of two math operations which can
          be done in parallel will be faster than any kind of test and conditional
          branch. This should be applicable to my dsPIC assembly code too.

          Andrew Simper also plans to consider using XOR on the output of the
          basic PRNG to minimise any value correlations. I wrote the following
          about XORing:

          On XORing, I imagine that choosing a suitable fixed 31 bit value would
          do a pretty good job of messing up whatever patterns might exist in the
          value dimension of the output stream. (Shuffling the order of the whole
          output words would do a good job of messing up the time dimension, which
          would also reduce or eliminate short-term value dimension correlations.)

          To choose a fixed XOR value, maybe we could choose a binary
          representation of "pi" or "e", or scoop out some bits from a .wav file
          of Jimi Hendrix playing Purple Haze. Any such XOR factor would ensure
          that the resulting output stream included one instance of 0x7FFFFFFF and
          one instance of 0, whilst removing instances of the XOR factor and its
          complement.

          I was thinking of something more fiendish, but which would be more
          expensive than a fixed XOR value. Suppose we maintained in RAM a 32 bit
          XOR factor, we could make it a left to right shift-register, feeding it
          bit 0 of the raw PRNG result (or bit 31 of the intermediate result
          before the mod() operation). This changing XOR value would surely break
          up any value patterns, but it would introduce a number of other
          "features" which may or may not be desirable.

          Alternatively, the XOR factor could be a counter, incrementing every
          cycle, or being incremented (or added to by some factor) only when
          particular bits were set in the raw PRNG result. That would probably
          extend the cycle length by some significant factor.

          It seems the Mersenne Twister is not at all slow, so in many situations
          there's no point trying to do anything too complex. However, the
          Mersenne Twister:

          http://www.math.sci.hiroshima-u.ac.jp/~m-mat/MT/emt.html

          requires 624 x 32 bit words of RAM, which rules out its use in some
          embedded applications.

          - Robin

          --
          Send bugs reports to this list.
          To unsubscribe, send email to csound-unsubscribe@...
        • Istvan Varga
          ... A rather old Pentium III (Celeron) 1300 MHz. ... Thanks for this tip, it seems to work well, so I will use it. ... Yes, as far as the trick to avoid using
          Message 4 of 25 , Sep 29, 2005
            Robin Whittle wrote:

            > Thanks Istvan for these timing results. What CPU and clock rate were
            > you using?

            A rather old Pentium III (Celeron) 1300 MHz.

            >>Change
            >> if (lo > 0x7FFFFFFF) lo -= 0x7FFFFFFF;
            >>to:
            >> lo = (lo & 0x7FFFFFFF) + (lo >> 31);

            Thanks for this tip, it seems to work well, so I will use it.

            > I think this could also be applied to the 32 x 32 = 64 bit
            > multiplication approach you use in csoundRand31().

            Yes, as far as the trick to avoid using % is concerned, it is
            identical to the code you posted. The only difference is in the
            multiplication; by splitting the multiplication in the suggested
            way, the performance depends less on the compiler and CPU type.
            However, when 32 to 64 bit multiply is indeed available, using
            that is faster, and it also removes the limitation of having to
            use small multipliers.
            Trying to compile with the optimization to remove the conditional,
            and using a large multiplier:

            /* simple linear congruential generator */

            PUBLIC int csoundRand31(int *seedVal)
            {
            uint64_t tmp1;
            uint32_t tmp2;

            /* x = (742938285 * x) % 0x7FFFFFFF */
            tmp1 = (uint64_t) ((int32_t) (*seedVal) * (int64_t) 742938285);
            tmp2 = (uint32_t) tmp1 & (uint32_t) 0x7FFFFFFF;
            tmp2 += (uint32_t) (tmp1 >> 31);
            tmp2 = (tmp2 & (uint32_t) 0x7FFFFFFF) + (tmp2 >> 31);
            (*seedVal) = (int) tmp2;
            return (int) tmp2;
            }

            I get this with GCC 4.0 and 4.1:

            00000000 <csoundRand31>:
            0: 55 push ebp
            1: b8 ad 56 48 2c mov eax,0x2c4856ad
            6: 89 e5 mov ebp,esp
            8: 53 push ebx
            9: 8b 5d 08 mov ebx,DWORD PTR [ebp+8]
            c: f7 2b imul DWORD PTR [ebx]
            e: 89 c1 mov ecx,eax
            10: 81 e1 ff ff ff 7f and ecx,0x7fffffff
            16: 0f ac d0 1f shrd eax,edx,0x1f
            1a: 01 c1 add ecx,eax
            1c: 89 c8 mov eax,ecx
            1e: 25 ff ff ff 7f and eax,0x7fffffff
            23: c1 e9 1f shr ecx,0x1f
            26: 01 c8 add eax,ecx
            28: 89 03 mov DWORD PTR [ebx],eax
            2a: c1 ea 1f shr edx,0x1f
            2d: 5b pop ebx
            2e: 5d pop ebp
            2f: c3 ret

            So, even with a frame pointer, and taking a pointer to the seed instead
            of the seed itself (this makes the function slightly more complex, but
            easier to use), the code fits into 48 bytes.
            Using the previously posted benchmark program, I get about the same
            times when compiled with frame pointer, but the new version is apparently
            faster when -fomit-frame-pointer is used.

            > Andrew Simper also plans to consider using XOR on the output of the
            > basic PRNG to minimise any value correlations. I wrote the following
            > about XORing:

            Does using a large multiplier have an effect on this issue ?

            > It seems the Mersenne Twister is not at all slow, so in many situations
            > there's no point trying to do anything too complex. However, the
            > Mersenne Twister:
            >
            > http://www.math.sci.hiroshima-u.ac.jp/~m-mat/MT/emt.html
            >
            > requires 624 x 32 bit words of RAM, which rules out its use in some
            > embedded applications.

            The average CPU usage is indeed quite good (I found it to be better than
            the rand() of glibc on Linux, which is not bad given that rand() is also
            not of particularly high quality), the problem is the memory usage, and
            that some calls are much slower than the average, because on every 624th
            call the whole table needs to be updated.
            So, this generator may be good for use as a single global PRNG, but having
            a separate instance per random opcode is likely to be too expensive.
            I think I will use the Mersenne Twister in the "x-class" random opcodes
            (unirand, trirand, gauss, etc.) and GEN21 which is based on the same code,
            using a single shared generator for an instance of Csound, and use a 31 bit
            LCG in other opcodes where rand() is to be replaced.

            --
            Send bugs reports to this list.
            To unsubscribe, send email to csound-unsubscribe@...
          Your message has been successfully submitted and would be delivered to recipients shortly.