Disabled external gits
This commit is contained in:
16
cs440-acg/ext/pcg32/README.md
Normal file
16
cs440-acg/ext/pcg32/README.md
Normal file
@@ -0,0 +1,16 @@
|
||||
# pcg32
|
||||
This is a tiny self-contained C++ implementation of the PCG32 random number generator
|
||||
based on code by Melissa O'Neill available at http://www.pcg-random.org.
|
||||
|
||||
I decided to put together my own version because the official small
|
||||
implementation lacks a C++ interface and various important features (e.g.
|
||||
rewind/difference support, shuffling, floating point sample generation), while
|
||||
the official C++ version is extremely complex and seems to be intended
|
||||
for research on PRNGs involving the entire PCG family.
|
||||
|
||||
The file ``pcg32_8.h`` contains a vectorized implementation designed by myself
|
||||
which runs eight PCG32 PRNGs in parallel. Expect to get a ~3-4x speedup when
|
||||
generating single or double precision floats.
|
||||
|
||||
Wenzel Jakob
|
||||
June 2016
|
109
cs440-acg/ext/pcg32/pcg32-demo.cpp
Normal file
109
cs440-acg/ext/pcg32/pcg32-demo.cpp
Normal file
@@ -0,0 +1,109 @@
|
||||
/*
|
||||
* PCG Random Number Generation for C.
|
||||
*
|
||||
* Copyright 2014 Melissa O'Neill <oneill@pcg-random.org>
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*
|
||||
* For additional information about the PCG random number generation scheme,
|
||||
* including its license and other licensing options, visit
|
||||
*
|
||||
* http://www.pcg-random.org
|
||||
*/
|
||||
|
||||
/*
|
||||
* This is the original demo application from the PCG library ported to the new API
|
||||
*/
|
||||
|
||||
#include <stdio.h>
|
||||
#include <stddef.h>
|
||||
#include <stdlib.h>
|
||||
#include <stdint.h>
|
||||
#include <stdbool.h>
|
||||
#include <time.h>
|
||||
#include <string.h>
|
||||
|
||||
#include "pcg32.h"
|
||||
|
||||
/*
 * Demo driver: seeds a pcg32 generator with fixed constants and prints, for
 * each round, six raw 32-bit outputs, 65 coin tosses, 33 dice rolls and a
 * shuffled deck of 52 cards.
 *
 * The optional first command-line argument selects the number of rounds
 * (default: 5).
 */
int main(int argc, char** argv) {
    /* Lookup tables and sizes used when dealing cards */
    enum { SUITS = 4, NUMBERS = 13, CARDS = 52 };
    static const char number[] = {'A', '2', '3', '4', '5', '6', '7',
                                  '8', '9', 'T', 'J', 'Q', 'K'};
    static const char suit[] = {'h', 'c', 'd', 's'};

    // Read command-line options
    const int rounds = (argc > 1) ? atoi(argv[1]) : 5;

    // You should *always* seed the RNG. pcg32::seed takes two 64-bit
    // constants: the initial state and the rng sequence selector (rngs with
    // different sequence selectors never produce coinciding sequences).
    pcg32 rng;
    rng.seed(42u, 54u);

    printf("pcg32_random_r:\n"
           " - result: 32-bit unsigned int (uint32_t)\n"
           " - period: 2^64 (* 2^63 streams)\n"
           " - state type: pcg32_random_t (%zu bytes)\n"
           " - output func: XSH-RR\n"
           "\n",
           sizeof(pcg32));

    int round = 1;
    while (round <= rounds) {
        printf("Round %d:\n", round);

        /* Six raw 32-bit outputs */
        printf(" 32bit:");
        for (int left = 6; left > 0; --left)
            printf(" 0x%08x", rng.nextUInt());
        printf("\n");

        /* 65 coin tosses: a non-zero draw from {0, 1} counts as heads */
        printf(" Coins: ");
        for (int left = 65; left > 0; --left) {
            const char face = rng.nextUInt(2) ? 'H' : 'T';
            printf("%c", face);
        }
        printf("\n");

        /* 33 dice rolls in the range 1..6 */
        printf(" Rolls:");
        for (int left = 33; left > 0; --left) {
            const int pips = (int) rng.nextUInt(6) + 1;
            printf(" %d", pips);
        }
        printf("\n");

        /* Build an ordered deck, shuffle it, then print it */
        char deck[CARDS];
        for (int c = 0; c < CARDS; ++c)
            deck[c] = (char) c;

        rng.shuffle(deck, deck + CARDS);

        printf(" Cards:");
        for (int c = 0; c < CARDS; ++c) {
            const int card = deck[c];
            printf(" %c%c", number[card / SUITS], suit[card % SUITS]);
            /* Wrap the listing after every 22 cards */
            if ((c + 1) % 22 == 0)
                printf("\n\t");
        }
        printf("\n");

        printf("\n");

        ++round;
    }

    return 0;
}
|
46
cs440-acg/ext/pcg32/pcg32-demo.out
Normal file
46
cs440-acg/ext/pcg32/pcg32-demo.out
Normal file
@@ -0,0 +1,46 @@
|
||||
pcg32_random_r:
|
||||
- result: 32-bit unsigned int (uint32_t)
|
||||
- period: 2^64 (* 2^63 streams)
|
||||
- state type: pcg32_random_t (16 bytes)
|
||||
- output func: XSH-RR
|
||||
|
||||
Round 1:
|
||||
32bit: 0xa15c02b7 0x7b47f409 0xba1d3330 0x83d2f293 0xbfa4784b 0xcbed606e
|
||||
Coins: HHTTTHTHHHTHTTTHHHHHTTTHHHTHTHTHTTHTTTHHHHHHTTTTHHTTTTTHTTTTTTTHT
|
||||
Rolls: 3 4 1 1 2 2 3 2 4 3 2 4 3 3 5 2 3 1 3 1 5 1 4 1 5 6 4 6 6 2 6 3 3
|
||||
Cards: Qd Ks 6d 3s 3d 4c 3h Td Kc 5c Jh Kd Jd As 4s 4h Ad Th Ac Jc 7s Qs
|
||||
2s 7h Kh 2d 6c Ah 4d Qh 9h 6s 5s 2c 9c Ts 8d 9s 3c 8c Js 5d 2h 6h
|
||||
7d 8s 9d 5h 8h Qc 7c Tc
|
||||
|
||||
Round 2:
|
||||
32bit: 0x74ab93ad 0x1c1da000 0x494ff896 0x34462f2f 0xd308a3e5 0x0fa83bab
|
||||
Coins: HHHHHHHHHHTHHHTHTHTHTHTTTTHHTTTHHTHHTHTTHHTTTHHHHHHTHTTHTHTTTTTTT
|
||||
Rolls: 5 1 1 3 3 2 4 5 3 2 2 6 4 3 2 4 2 4 3 2 3 6 3 2 3 4 2 4 1 1 5 4 4
|
||||
Cards: 7d 2s 7h Td 8s 3c 3d Js 2d Tc 4h Qs 5c 9c Th 2c Jc Qd 9d Qc 7s 3s
|
||||
5s 6h 4d Jh 4c Ac 4s 5h 5d Kc 8h 8d Jd 9s Ad 6s 6c Kd 2h 3h Kh Ts
|
||||
Qh 9h 6d As 7c Ks Ah 8c
|
||||
|
||||
Round 3:
|
||||
32bit: 0x39af5f9f 0x04196b18 0xc3c3eb28 0xc076c60c 0xc693e135 0xf8f63932
|
||||
Coins: HTTHHTTTTTHTTHHHTHTTHHTTHTHHTHTHTTTTHHTTTHHTHHTTHTTHHHTHHHTHTTTHT
|
||||
Rolls: 5 1 5 3 2 2 4 5 3 3 1 3 4 6 3 2 3 4 2 2 3 1 5 2 4 6 6 4 2 4 3 3 6
|
||||
Cards: Kd Jh Kc Qh 4d Qc 4h 9d 3c Kh Qs 8h 5c Jd 7d 8d 3h 7c 8s 3s 2h Ks
|
||||
9c 9h 2c 8c Ad 7s 4s 2s 5h 6s 4c Ah 7h 5s Ac 3d 5d Qd As Tc 6h 9s
|
||||
2d 6c 6d Td Jc Ts Th Js
|
||||
|
||||
Round 4:
|
||||
32bit: 0x55ce6851 0x97a7726d 0x17e10815 0x58007d43 0x962fb148 0xb9bb55bd
|
||||
Coins: HHTHHTTTTHTHHHHHTTHHHTTTHHTHTHTHTHHTTHTHHHHHHTHHTHHTHHTTTTHHTHHTT
|
||||
Rolls: 6 6 3 2 3 4 2 6 4 2 6 3 2 3 5 5 3 4 4 6 6 2 6 5 4 4 6 1 6 1 3 6 5
|
||||
Cards: Qd 8h 5d 8s 8d Ts 7h Th Qs Js 7s Kc 6h 5s 4d Ac Jd 7d 7c Td 2c 6s
|
||||
5h 6d 3s Kd 9s Jh Kh As Ah 9h 3c Qh 9c 2d Tc 9d 2s 3d Ks 4h Qc Ad
|
||||
Jc 8c 2h 3h 4s 4c 5c 6c
|
||||
|
||||
Round 5:
|
||||
32bit: 0xfcef7cd6 0x1b488b5a 0xd0daf7ea 0x1d9a70f7 0x241a37cf 0x9a3857b7
|
||||
Coins: HHHHTHHTTHTTHHHTTTHHTHTHTTTTHTTHTHTTTHHHTHTHTTHTTHTHHTHTHHHTHTHTT
|
||||
Rolls: 5 4 1 2 6 1 3 1 5 6 3 6 2 1 4 4 5 2 1 5 6 5 6 4 4 4 5 2 6 4 3 5 6
|
||||
Cards: 4d 9s Qc 9h As Qs 7s 4c Kd 6h 6s 2c 8c 5d 7h 5h Jc 3s 7c Jh Js Ks
|
||||
Tc Jd Kc Th 3h Ts Qh Ad Td 3c Ah 2d 3d 5c Ac 8s 5s 9c 2h 6c 6d Kh
|
||||
Qd 8d 7d 2s 8h 4h 9d 4s
|
||||
|
209
cs440-acg/ext/pcg32/pcg32.h
Normal file
209
cs440-acg/ext/pcg32/pcg32.h
Normal file
@@ -0,0 +1,209 @@
|
||||
/*
|
||||
* Tiny self-contained version of the PCG Random Number Generation for C++
|
||||
* put together from pieces of the much larger C/C++ codebase.
|
||||
* Wenzel Jakob, February 2015
|
||||
*
|
||||
* The PCG random number generator was developed by Melissa O'Neill
|
||||
* <oneill@pcg-random.org>
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*
|
||||
* For additional information about the PCG random number generation scheme,
|
||||
* including its license and other licensing options, visit
|
||||
*
|
||||
* http://www.pcg-random.org
|
||||
*/
|
||||
|
||||
#ifndef __PCG32_H
|
||||
#define __PCG32_H 1
|
||||
|
||||
#define PCG32_DEFAULT_STATE 0x853c49e6748fea9bULL
|
||||
#define PCG32_DEFAULT_STREAM 0xda3e39cb94b95bdbULL
|
||||
#define PCG32_MULT 0x5851f42d4c957f2dULL
|
||||
|
||||
#include <inttypes.h>
|
||||
#include <cmath>
|
||||
#include <cassert>
|
||||
#include <algorithm>
|
||||
|
||||
/**
 * \brief PCG32 pseudorandom number generator
 *
 * The generator keeps a 64-bit \c state that is advanced by the linear
 * congruential step <tt>state * PCG32_MULT + inc</tt>; each 32-bit output is
 * derived from the previous state by a xorshift followed by a rotation.
 */
struct pcg32 {
    /// Initialize the pseudorandom number generator with default seed
    pcg32() : state(PCG32_DEFAULT_STATE), inc(PCG32_DEFAULT_STREAM) {}

    /// Initialize the pseudorandom number generator with the \ref seed() function
    pcg32(uint64_t initstate, uint64_t initseq = 1u) { seed(initstate, initseq); }

    /**
     * \brief Seed the pseudorandom number generator
     *
     * Specified in two parts: a state initializer and a sequence selection
     * constant (a.k.a. stream id)
     */
    void seed(uint64_t initstate, uint64_t initseq = 1) {
        state = 0U;
        /* Shifting left and setting the lowest bit keeps the increment odd */
        inc = (initseq << 1u) | 1u;
        nextUInt();
        state += initstate;
        nextUInt();
    }

    /// Generate a uniformly distributed unsigned 32-bit random number
    uint32_t nextUInt() {
        uint64_t oldstate = state;
        state = oldstate * PCG32_MULT + inc;
        uint32_t xorshifted = (uint32_t) (((oldstate >> 18u) ^ oldstate) >> 27u);
        uint32_t rot = (uint32_t) (oldstate >> 59u);
        /* Rotate right by 'rot'; masking the negated count with 31 keeps the
           left-shift amount below the word size even when rot == 0 */
        return (xorshifted >> rot) | (xorshifted << ((~rot + 1u) & 31));
    }

    /**
     * \brief Generate a uniformly distributed number, r, where 0 <= r < bound
     *
     * \param bound
     *     Exclusive upper bound. Must be non-zero, since the implementation
     *     computes remainders modulo \c bound.
     */
    uint32_t nextUInt(uint32_t bound) {
        assert(bound != 0u);

        // To avoid bias, we need to make the range of the RNG a multiple of
        // bound, which we do by dropping output less than a threshold.
        // A naive scheme to calculate the threshold would be to do
        //
        //     uint32_t threshold = 0x100000000ull % bound;
        //
        // but 64-bit div/mod is slower than 32-bit div/mod (especially on
        // 32-bit platforms).  In essence, we do
        //
        //     uint32_t threshold = (0x100000000ull-bound) % bound;
        //
        // because this version will calculate the same modulus, but the LHS
        // value is less than 2^32.

        uint32_t threshold = (~bound+1u) % bound;

        // Uniformity guarantees that this loop will terminate.  In practice, it
        // should usually terminate quickly; on average (assuming all bounds are
        // equally likely), 82.25% of the time, we can expect it to require just
        // one iteration.  In the worst case, someone passes a bound of 2^31 + 1
        // (i.e., 2147483649), which invalidates almost 50% of the range.  In
        // practice, bounds are typically small and only a tiny amount of the range
        // is eliminated.
        for (;;) {
            uint32_t r = nextUInt();
            if (r >= threshold)
                return r % bound;
        }
    }

    /// Generate a single precision floating point value on the interval [0, 1)
    float nextFloat() {
        /* Trick from MTGP: generate an uniformly distributed
           single precision number in [1,2) and subtract 1. */
        union {
            uint32_t u;
            float f;
        } x;
        /* Top 23 random bits become the mantissa; 0x3f800000 is the bit
           pattern of 1.0f */
        x.u = (nextUInt() >> 9) | 0x3f800000u;
        return x.f - 1.0f;
    }

    /**
     * \brief Generate a double precision floating point value on the interval [0, 1)
     *
     * \remark Since the underlying random number generator produces 32 bit output,
     * only the first 32 mantissa bits will be filled (however, the resolution is still
     * finer than in \ref nextFloat(), which only uses 23 mantissa bits)
     */
    double nextDouble() {
        /* Trick from MTGP: generate an uniformly distributed
           double precision number in [1,2) and subtract 1. */
        union {
            uint64_t u;
            double d;
        } x;
        /* The 32 random bits fill the upper part of the 52-bit mantissa;
           0x3ff0000000000000 is the bit pattern of 1.0 */
        x.u = ((uint64_t) nextUInt() << 20) | 0x3ff0000000000000ULL;
        return x.d - 1.0;
    }

    /**
     * \brief Multi-step advance function (jump-ahead, jump-back)
     *
     * The method used here is based on Brown, "Random Number Generation
     * with Arbitrary Stride", Transactions of the American Nuclear
     * Society (Nov. 1994). The algorithm is very similar to fast
     * exponentiation.
     *
     * \param delta_
     *     Number of steps to advance; negative values step backwards.
     */
    void advance(int64_t delta_) {
        uint64_t
            cur_mult = PCG32_MULT,
            cur_plus = inc,
            acc_mult = 1u,
            acc_plus = 0u;

        /* Even though delta is an unsigned integer, we can pass a signed
           integer to go backwards, it just goes "the long way round". */
        uint64_t delta = (uint64_t) delta_;

        while (delta > 0) {
            if (delta & 1) {
                acc_mult *= cur_mult;
                acc_plus = acc_plus * cur_mult + cur_plus;
            }
            /* Square the current step: applying it twice */
            cur_plus = (cur_mult + 1) * cur_plus;
            cur_mult *= cur_mult;
            delta /= 2;
        }
        state = acc_mult * state + acc_plus;
    }

    /**
     * \brief Draw uniformly distributed permutation and permute the
     * given STL container
     *
     * From: Knuth, TAoCP Vol. 2 (3rd ed.), Section 3.4.2
     *
     * The iterators must support random access (\c end - 1, \c it > begin
     * and \c begin + n are used). An empty range is left untouched and does
     * not consume any random numbers.
     */
    template <typename Iterator> void shuffle(Iterator begin, Iterator end) {
        /* 'end - 1' must not be formed for an empty range: it would point
           before 'begin', which is undefined behavior */
        if (begin == end)
            return;

        for (Iterator it = end - 1; it > begin; --it)
            std::iter_swap(it, begin + nextUInt((uint32_t) (it - begin + 1)));
    }

    /**
     * \brief Compute the distance between two PCG32 pseudorandom number generators
     *
     * Returns the number of steps that \c other has to be advanced by to
     * reach the state of this generator. Both generators must use the same
     * stream (\c inc), which is checked by an assertion.
     */
    int64_t operator-(const pcg32 &other) const {
        assert(inc == other.inc);

        uint64_t
            cur_mult = PCG32_MULT,
            cur_plus = inc,
            cur_state = other.state,
            the_bit = 1u,
            distance = 0u;

        /* Resolve the distance one bit at a time, starting with the lowest */
        while (state != cur_state) {
            if ((state & the_bit) != (cur_state & the_bit)) {
                cur_state = cur_state * cur_mult + cur_plus;
                distance |= the_bit;
            }
            assert((state & the_bit) == (cur_state & the_bit));
            the_bit <<= 1;
            cur_plus = (cur_mult + 1ULL) * cur_plus;
            cur_mult *= cur_mult;
        }

        return (int64_t) distance;
    }

    /// Equality operator
    bool operator==(const pcg32 &other) const { return state == other.state && inc == other.inc; }

    /// Inequality operator
    bool operator!=(const pcg32 &other) const { return state != other.state || inc != other.inc; }

    uint64_t state;  // RNG state.  All values are possible.
    uint64_t inc;    // Controls which RNG sequence (stream) is selected. Must *always* be odd.
};
|
||||
|
||||
#endif // __PCG32_H
|
284
cs440-acg/ext/pcg32/pcg32_8.h
Normal file
284
cs440-acg/ext/pcg32/pcg32_8.h
Normal file
@@ -0,0 +1,284 @@
|
||||
/*
|
||||
* Vectorized AVX2 version of the PCG32 random number generator developed by
|
||||
* Wenzel Jakob (June 2016)
|
||||
*
|
||||
* The PCG random number generator was developed by Melissa O'Neill
|
||||
* <oneill@pcg-random.org>
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*
|
||||
* For additional information about the PCG random number generation scheme,
|
||||
* including its license and other licensing options, visit
|
||||
*
|
||||
* http://www.pcg-random.org
|
||||
*/
|
||||
|
||||
#include "pcg32.h"
|
||||
#include <immintrin.h>
|
||||
#include <utility>
|
||||
|
||||
#if defined(_MSC_VER)
|
||||
# define PCG32_ALIGN(amt) __declspec(align(amt))
|
||||
# define PCG32_VECTORCALL __vectorcall
|
||||
# define PCG32_INLINE __forceinline
|
||||
#else
|
||||
# define PCG32_ALIGN(amt) __attribute__ ((aligned(amt)))
|
||||
# define PCG32_INLINE __attribute__ ((always_inline))
|
||||
# if defined(__clang__)
|
||||
# define PCG32_VECTORCALL __attribute__ ((vectorcall))
|
||||
# else
|
||||
# define PCG32_VECTORCALL
|
||||
# endif
|
||||
#endif
|
||||
|
||||
/// 8 parallel PCG32 pseudorandom number generators
|
||||
struct PCG32_ALIGN(32) pcg32_8 {
|
||||
|
||||
#if defined(__AVX2__)
|
||||
__m256i state[2]; // RNG state. All values are possible.
|
||||
__m256i inc[2]; // Controls which RNG sequence (stream) is selected. Must *always* be odd.
|
||||
#else
|
||||
/* Scalar fallback */
|
||||
pcg32 rng[8];
|
||||
#endif
|
||||
|
||||
/// Initialize the pseudorandom number generator with default seed
|
||||
pcg32_8() {
|
||||
PCG32_ALIGN(32) uint64_t initstate[8] = {
|
||||
PCG32_DEFAULT_STATE, PCG32_DEFAULT_STATE,
|
||||
PCG32_DEFAULT_STATE, PCG32_DEFAULT_STATE,
|
||||
PCG32_DEFAULT_STATE, PCG32_DEFAULT_STATE,
|
||||
PCG32_DEFAULT_STATE, PCG32_DEFAULT_STATE
|
||||
};
|
||||
|
||||
PCG32_ALIGN(32) uint64_t initseq[8] =
|
||||
{ 1, 2, 3, 4, 5, 6, 7, 8 };
|
||||
|
||||
seed(initstate, initseq);
|
||||
}
|
||||
|
||||
/// Initialize the pseudorandom number generator with the \ref seed() function
|
||||
pcg32_8(const uint64_t initstate[8], const uint64_t initseq[8]) {
|
||||
seed(initstate, initseq);
|
||||
}
|
||||
|
||||
|
||||
#if defined(__AVX2__)
|
||||
/**
|
||||
* \brief Seed the pseudorandom number generator
|
||||
*
|
||||
* Specified in two parts: a state initializer and a sequence selection
|
||||
* constant (a.k.a. stream id)
|
||||
*/
|
||||
void seed(const uint64_t initstate[8], const uint64_t initseq[8]) {
|
||||
const __m256i one = _mm256_set1_epi64x((long long) 1);
|
||||
|
||||
state[0] = state[1] = _mm256_setzero_si256();
|
||||
inc[0] = _mm256_or_si256(
|
||||
_mm256_slli_epi64(_mm256_load_si256((__m256i *) &initseq[0]), 1),
|
||||
one);
|
||||
inc[1] = _mm256_or_si256(
|
||||
_mm256_slli_epi64(_mm256_load_si256((__m256i *) &initseq[4]), 1),
|
||||
one);
|
||||
step();
|
||||
|
||||
state[0] = _mm256_add_epi64(state[0], _mm256_load_si256((__m256i *) &initstate[0]));
|
||||
state[1] = _mm256_add_epi64(state[1], _mm256_load_si256((__m256i *) &initstate[4]));
|
||||
|
||||
step();
|
||||
}
|
||||
|
||||
/// Generate 8 uniformly distributed unsigned 32-bit random numbers
|
||||
void nextUInt(uint32_t result[8]) {
|
||||
_mm256_store_si256((__m256i *) result, step());
|
||||
}
|
||||
|
||||
/// Generate 8 uniformly distributed unsigned 32-bit random numbers
|
||||
__m256i PCG32_VECTORCALL nextUInt() {
|
||||
return step();
|
||||
}
|
||||
|
||||
/// Generate eight single precision floating point value on the interval [0, 1)
|
||||
__m256 PCG32_VECTORCALL nextFloat() {
|
||||
/* Trick from MTGP: generate an uniformly distributed
|
||||
single precision number in [1,2) and subtract 1. */
|
||||
|
||||
const __m256i const1 = _mm256_set1_epi32((int) 0x3f800000u);
|
||||
|
||||
__m256i value = step();
|
||||
__m256i fltval = _mm256_or_si256(_mm256_srli_epi32(value, 9), const1);
|
||||
|
||||
return _mm256_sub_ps(_mm256_castsi256_ps(fltval),
|
||||
_mm256_castsi256_ps(const1));
|
||||
}
|
||||
|
||||
/// Generate eight single precision floating point value on the interval [0, 1)
|
||||
void nextFloat(float result[8]) {
|
||||
_mm256_store_ps(result, nextFloat());
|
||||
}
|
||||
|
||||
/**
|
||||
* \brief Generate eight double precision floating point value on the interval [0, 1)
|
||||
*
|
||||
* \remark Since the underlying random number generator produces 32 bit output,
|
||||
* only the first 32 mantissa bits will be filled (however, the resolution is still
|
||||
* finer than in \ref nextFloat(), which only uses 23 mantissa bits)
|
||||
*/
|
||||
std::pair<__m256d, __m256d> nextDouble() {
|
||||
/* Trick from MTGP: generate an uniformly distributed
|
||||
double precision number in [1,2) and subtract 1. */
|
||||
|
||||
const __m256i const1 =
|
||||
_mm256_set1_epi64x((long long) 0x3ff0000000000000ull);
|
||||
|
||||
__m256i value = step();
|
||||
|
||||
__m256i lo = _mm256_cvtepu32_epi64(_mm256_castsi256_si128(value));
|
||||
__m256i hi = _mm256_cvtepu32_epi64(_mm256_extractf128_si256(value, 1));
|
||||
|
||||
__m256i tlo = _mm256_or_si256(_mm256_slli_epi64(lo, 20), const1);
|
||||
__m256i thi = _mm256_or_si256(_mm256_slli_epi64(hi, 20), const1);
|
||||
|
||||
__m256d flo = _mm256_sub_pd(_mm256_castsi256_pd(tlo),
|
||||
_mm256_castsi256_pd(const1));
|
||||
|
||||
__m256d fhi = _mm256_sub_pd(_mm256_castsi256_pd(thi),
|
||||
_mm256_castsi256_pd(const1));
|
||||
|
||||
return std::make_pair(flo, fhi);
|
||||
}
|
||||
|
||||
/**
|
||||
* \brief Generate eight double precision floating point value on the interval [0, 1)
|
||||
*
|
||||
* \remark Since the underlying random number generator produces 32 bit output,
|
||||
* only the first 32 mantissa bits will be filled (however, the resolution is still
|
||||
* finer than in \ref nextFloat(), which only uses 23 mantissa bits)
|
||||
*/
|
||||
void nextDouble(double result[8]) {
|
||||
std::pair<__m256d, __m256d> value = nextDouble();
|
||||
|
||||
_mm256_store_pd(&result[0], value.first);
|
||||
_mm256_store_pd(&result[4], value.second);
|
||||
}
|
||||
|
||||
private:
|
||||
PCG32_INLINE __m256i PCG32_VECTORCALL step() {
|
||||
const __m256i pcg32_mult_l = _mm256_set1_epi64x((long long) (PCG32_MULT & 0xffffffffu));
|
||||
const __m256i pcg32_mult_h = _mm256_set1_epi64x((long long) (PCG32_MULT >> 32));
|
||||
const __m256i mask_l = _mm256_set1_epi64x((long long) 0x00000000ffffffffull);
|
||||
const __m256i shift0 = _mm256_set_epi32(7, 7, 7, 7, 6, 4, 2, 0);
|
||||
const __m256i shift1 = _mm256_set_epi32(6, 4, 2, 0, 7, 7, 7, 7);
|
||||
const __m256i const32 = _mm256_set1_epi32(32);
|
||||
|
||||
__m256i s0 = state[0], s1 = state[1];
|
||||
|
||||
/* Extract low and high words for partial products below */
|
||||
__m256i s0_l = _mm256_and_si256(s0, mask_l);
|
||||
__m256i s0_h = _mm256_srli_epi64(s0, 32);
|
||||
__m256i s1_l = _mm256_and_si256(s1, mask_l);
|
||||
__m256i s1_h = _mm256_srli_epi64(s1, 32);
|
||||
|
||||
/* Improve high bits using xorshift step */
|
||||
__m256i s0s = _mm256_srli_epi64(s0, 18);
|
||||
__m256i s1s = _mm256_srli_epi64(s1, 18);
|
||||
|
||||
__m256i s0x = _mm256_xor_si256(s0s, s0);
|
||||
__m256i s1x = _mm256_xor_si256(s1s, s1);
|
||||
|
||||
__m256i s0xs = _mm256_srli_epi64(s0x, 27);
|
||||
__m256i s1xs = _mm256_srli_epi64(s1x, 27);
|
||||
|
||||
__m256i xors0 = _mm256_and_si256(mask_l, s0xs);
|
||||
__m256i xors1 = _mm256_and_si256(mask_l, s1xs);
|
||||
|
||||
/* Use high bits to choose a bit-level rotation */
|
||||
__m256i rot0 = _mm256_srli_epi64(s0, 59);
|
||||
__m256i rot1 = _mm256_srli_epi64(s1, 59);
|
||||
|
||||
/* 64 bit multiplication using 32 bit partial products :( */
|
||||
__m256i m0_hl = _mm256_mul_epu32(s0_h, pcg32_mult_l);
|
||||
__m256i m1_hl = _mm256_mul_epu32(s1_h, pcg32_mult_l);
|
||||
__m256i m0_lh = _mm256_mul_epu32(s0_l, pcg32_mult_h);
|
||||
__m256i m1_lh = _mm256_mul_epu32(s1_l, pcg32_mult_h);
|
||||
|
||||
/* Assemble lower 32 bits, will be merged into one 256 bit vector below */
|
||||
xors0 = _mm256_permutevar8x32_epi32(xors0, shift0);
|
||||
rot0 = _mm256_permutevar8x32_epi32(rot0, shift0);
|
||||
xors1 = _mm256_permutevar8x32_epi32(xors1, shift1);
|
||||
rot1 = _mm256_permutevar8x32_epi32(rot1, shift1);
|
||||
|
||||
/* Continue with partial products */
|
||||
__m256i m0_ll = _mm256_mul_epu32(s0_l, pcg32_mult_l);
|
||||
__m256i m1_ll = _mm256_mul_epu32(s1_l, pcg32_mult_l);
|
||||
|
||||
__m256i m0h = _mm256_add_epi64(m0_hl, m0_lh);
|
||||
__m256i m1h = _mm256_add_epi64(m1_hl, m1_lh);
|
||||
|
||||
__m256i m0hs = _mm256_slli_epi64(m0h, 32);
|
||||
__m256i m1hs = _mm256_slli_epi64(m1h, 32);
|
||||
|
||||
__m256i s0n = _mm256_add_epi64(m0hs, m0_ll);
|
||||
__m256i s1n = _mm256_add_epi64(m1hs, m1_ll);
|
||||
|
||||
__m256i xors = _mm256_or_si256(xors0, xors1);
|
||||
__m256i rot = _mm256_or_si256(rot0, rot1);
|
||||
|
||||
state[0] = _mm256_add_epi64(s0n, inc[0]);
|
||||
state[1] = _mm256_add_epi64(s1n, inc[1]);
|
||||
|
||||
/* Finally, rotate and return the result */
|
||||
__m256i result = _mm256_or_si256(
|
||||
_mm256_srlv_epi32(xors, rot),
|
||||
_mm256_sllv_epi32(xors, _mm256_sub_epi32(const32, rot))
|
||||
);
|
||||
|
||||
return result;
|
||||
}
|
||||
#else
|
||||
/**
|
||||
* \brief Seed the pseudorandom number generator
|
||||
*
|
||||
* Specified in two parts: a state initializer and a sequence selection
|
||||
* constant (a.k.a. stream id)
|
||||
*/
|
||||
void seed(const uint64_t initstate[8], const uint64_t initseq[8]) {
|
||||
for (int i = 0; i < 8; ++i)
|
||||
rng[i].seed(initstate[i], initseq[i]);
|
||||
}
|
||||
|
||||
/// Generate 8 uniformly distributed unsigned 32-bit random numbers
|
||||
void nextUInt(uint32_t result[8]) {
|
||||
for (int i = 0; i < 8; ++i)
|
||||
result[i] = rng[i].nextUInt();
|
||||
}
|
||||
|
||||
/// Generate eight single precision floating point value on the interval [0, 1)
|
||||
void nextFloat(float result[8]) {
|
||||
for (int i = 0; i < 8; ++i)
|
||||
result[i] = rng[i].nextFloat();
|
||||
}
|
||||
|
||||
/**
|
||||
* \brief Generate eight double precision floating point value on the interval [0, 1)
|
||||
*
|
||||
* \remark Since the underlying random number generator produces 32 bit output,
|
||||
* only the first 32 mantissa bits will be filled (however, the resolution is still
|
||||
* finer than in \ref nextFloat(), which only uses 23 mantissa bits)
|
||||
*/
|
||||
void nextDouble(double result[8]) {
|
||||
for (int i = 0; i < 8; ++i)
|
||||
result[i] = rng[i].nextDouble();
|
||||
}
|
||||
#endif
|
||||
};
|
Reference in New Issue
Block a user