Disabled external gits

2022-04-07 18:46:57 +02:00
parent 88cb3426ad
commit 15e7120d6d
5316 changed files with 4563444 additions and 6 deletions
--- a/cs440-acg/ext/pcg32/README.md
+++ b/cs440-acg/ext/pcg32/README.md
@@ -0,0 +1,16 @@
+# pcg32
+This is a tiny self-contained C++ implementation of the PCG32 random number
+generator based on code by Melissa O'Neill available at http://www.pcg-random.org.
+
+I decided to put together my own version because the official small
+implementation lacks a C++ interface and various important features (e.g.
+rewind/difference support, shuffling, floating point sample generation), while
+the official C++ version is extremely complex and seems to be intended
+for research on PRNGs involving the entire PCG family.
+
+The file ``pcg32_8.h`` contains a vectorized implementation designed by myself
+which runs eight PCG32 PRNGs in parallel. Expect to get a ~3-4x speedup when
+generating single or double precision floats.
+
+Wenzel Jakob
+June 2016
--- a/cs440-acg/ext/pcg32/pcg32-demo.cpp
+++ b/cs440-acg/ext/pcg32/pcg32-demo.cpp
@@ -0,0 +1,109 @@
+/*
+ * PCG Random Number Generation for C.
+ *
+ * Copyright 2014 Melissa O'Neill <oneill@pcg-random.org>
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ * For additional information about the PCG random number generation scheme,
+ * including its license and other licensing options, visit
+ *
+ *     http://www.pcg-random.org
+ */
+
+/*
+ * This is the original demo application from the PCG library ported to the new API
+ */
+
+#include <stdio.h>
+#include <stddef.h>
+#include <stdlib.h>
+#include <stdint.h>
+#include <stdbool.h>
+#include <time.h>
+#include <string.h>
+
+#include "pcg32.h"
+
+int main(int argc, char** argv) {
+    // Deck dimensions and the rank/suit characters used to print a card
+    enum { SUITS = 4, NUMBERS = 13, CARDS = 52 };
+    static const char number[] = {'A', '2', '3', '4', '5', '6', '7',
+                                  '8', '9', 'T', 'J', 'Q', 'K'};
+    static const char suit[] = {'h', 'c', 'd', 's'};
+
+    // Number of rounds: 5 unless the first argument overrides it
+    const int rounds = (argc > 1) ? atoi(argv[1]) : 5;
+
+    // An RNG should *always* be seeded, ideally at the point where it is
+    // created (typically at the beginning of the program).
+    //
+    // pcg32::seed takes two 64-bit constants: the initial state and the
+    // rng sequence selector. Generators with different sequence selectors
+    // will *never* have random sequences that coincide.
+    pcg32 rng;
+    rng.seed(42u, 54u);
+
+    // Banner; the reported byte count is sizeof(pcg32)
+    printf("pcg32_random_r:\n"
+           "      -  result:      32-bit unsigned int (uint32_t)\n"
+           "      -  period:      2^64   (* 2^63 streams)\n"
+           "      -  state type:  pcg32_random_t (%zu bytes)\n"
+           "      -  output func: XSH-RR\n"
+           "\n",
+           sizeof(pcg32));
+
+    for (int pass = 1; pass <= rounds; ++pass) {
+        printf("Round %d:\n", pass);
+        /* Six raw 32-bit outputs */
+        printf("  32bit:");
+        for (int k = 0; k != 6; ++k) {
+            const uint32_t word = rng.nextUInt();
+            printf(" 0x%08x", word);
+        }
+        printf("\n");
+
+        /* 65 coin tosses: a nonzero draw from {0, 1} prints 'H' */
+        printf("  Coins: ");
+        for (int k = 0; k != 65; ++k) {
+            const bool heads = rng.nextUInt(2) != 0;
+            printf("%c", heads ? 'H' : 'T');
+        }
+        printf("\n");
+
+        /* 33 die rolls, each in the range 1..6 */
+        printf("  Rolls:");
+        for (int k = 0; k != 33; ++k)
+            printf(" %d", (int)rng.nextUInt(6) + 1);
+        printf("\n");
+
+        /* Build an ordered deck, then shuffle it with the generator */
+        char cards[CARDS];
+        for (int k = 0; k != CARDS; ++k)
+            cards[k] = k;
+        rng.shuffle(cards, cards + CARDS);
+
+        /* Print the deck, breaking the line after every 22 cards */
+        printf("  Cards:");
+        for (int k = 0; k != CARDS; ++k) {
+            const int card = cards[k];
+            printf(" %c%c", number[card / SUITS], suit[card % SUITS]);
+            if (k % 22 == 21)
+                printf("\n\t");
+        }
+        printf("\n");
+        printf("\n");
+    }
+
+    return 0;
+}
--- a/cs440-acg/ext/pcg32/pcg32-demo.out
+++ b/cs440-acg/ext/pcg32/pcg32-demo.out
@@ -0,0 +1,46 @@
+pcg32_random_r:
+      -  result:      32-bit unsigned int (uint32_t)
+      -  period:      2^64   (* 2^63 streams)
+      -  state type:  pcg32_random_t (16 bytes)
+      -  output func: XSH-RR
+
+Round 1:
+  32bit: 0xa15c02b7 0x7b47f409 0xba1d3330 0x83d2f293 0xbfa4784b 0xcbed606e
+  Coins: HHTTTHTHHHTHTTTHHHHHTTTHHHTHTHTHTTHTTTHHHHHHTTTTHHTTTTTHTTTTTTTHT
+  Rolls: 3 4 1 1 2 2 3 2 4 3 2 4 3 3 5 2 3 1 3 1 5 1 4 1 5 6 4 6 6 2 6 3 3
+  Cards: Qd Ks 6d 3s 3d 4c 3h Td Kc 5c Jh Kd Jd As 4s 4h Ad Th Ac Jc 7s Qs
+	 2s 7h Kh 2d 6c Ah 4d Qh 9h 6s 5s 2c 9c Ts 8d 9s 3c 8c Js 5d 2h 6h
+	 7d 8s 9d 5h 8h Qc 7c Tc
+
+Round 2:
+  32bit: 0x74ab93ad 0x1c1da000 0x494ff896 0x34462f2f 0xd308a3e5 0x0fa83bab
+  Coins: HHHHHHHHHHTHHHTHTHTHTHTTTTHHTTTHHTHHTHTTHHTTTHHHHHHTHTTHTHTTTTTTT
+  Rolls: 5 1 1 3 3 2 4 5 3 2 2 6 4 3 2 4 2 4 3 2 3 6 3 2 3 4 2 4 1 1 5 4 4
+  Cards: 7d 2s 7h Td 8s 3c 3d Js 2d Tc 4h Qs 5c 9c Th 2c Jc Qd 9d Qc 7s 3s
+	 5s 6h 4d Jh 4c Ac 4s 5h 5d Kc 8h 8d Jd 9s Ad 6s 6c Kd 2h 3h Kh Ts
+	 Qh 9h 6d As 7c Ks Ah 8c
+
+Round 3:
+  32bit: 0x39af5f9f 0x04196b18 0xc3c3eb28 0xc076c60c 0xc693e135 0xf8f63932
+  Coins: HTTHHTTTTTHTTHHHTHTTHHTTHTHHTHTHTTTTHHTTTHHTHHTTHTTHHHTHHHTHTTTHT
+  Rolls: 5 1 5 3 2 2 4 5 3 3 1 3 4 6 3 2 3 4 2 2 3 1 5 2 4 6 6 4 2 4 3 3 6
+  Cards: Kd Jh Kc Qh 4d Qc 4h 9d 3c Kh Qs 8h 5c Jd 7d 8d 3h 7c 8s 3s 2h Ks
+	 9c 9h 2c 8c Ad 7s 4s 2s 5h 6s 4c Ah 7h 5s Ac 3d 5d Qd As Tc 6h 9s
+	 2d 6c 6d Td Jc Ts Th Js
+
+Round 4:
+  32bit: 0x55ce6851 0x97a7726d 0x17e10815 0x58007d43 0x962fb148 0xb9bb55bd
+  Coins: HHTHHTTTTHTHHHHHTTHHHTTTHHTHTHTHTHHTTHTHHHHHHTHHTHHTHHTTTTHHTHHTT
+  Rolls: 6 6 3 2 3 4 2 6 4 2 6 3 2 3 5 5 3 4 4 6 6 2 6 5 4 4 6 1 6 1 3 6 5
+  Cards: Qd 8h 5d 8s 8d Ts 7h Th Qs Js 7s Kc 6h 5s 4d Ac Jd 7d 7c Td 2c 6s
+	 5h 6d 3s Kd 9s Jh Kh As Ah 9h 3c Qh 9c 2d Tc 9d 2s 3d Ks 4h Qc Ad
+	 Jc 8c 2h 3h 4s 4c 5c 6c
+
+Round 5:
+  32bit: 0xfcef7cd6 0x1b488b5a 0xd0daf7ea 0x1d9a70f7 0x241a37cf 0x9a3857b7
+  Coins: HHHHTHHTTHTTHHHTTTHHTHTHTTTTHTTHTHTTTHHHTHTHTTHTTHTHHTHTHHHTHTHTT
+  Rolls: 5 4 1 2 6 1 3 1 5 6 3 6 2 1 4 4 5 2 1 5 6 5 6 4 4 4 5 2 6 4 3 5 6
+  Cards: 4d 9s Qc 9h As Qs 7s 4c Kd 6h 6s 2c 8c 5d 7h 5h Jc 3s 7c Jh Js Ks
+	 Tc Jd Kc Th 3h Ts Qh Ad Td 3c Ah 2d 3d 5c Ac 8s 5s 9c 2h 6c 6d Kh
+	 Qd 8d 7d 2s 8h 4h 9d 4s
+
--- a/cs440-acg/ext/pcg32/pcg32.h
+++ b/cs440-acg/ext/pcg32/pcg32.h
@@ -0,0 +1,209 @@
+/*
+ * Tiny self-contained version of the PCG Random Number Generation for C++
+ * put together from pieces of the much larger C/C++ codebase.
+ * Wenzel Jakob, February 2015
+ *
+ * The PCG random number generator was developed by Melissa O'Neill
+ * <oneill@pcg-random.org>
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ * For additional information about the PCG random number generation scheme,
+ * including its license and other licensing options, visit
+ *
+ *     http://www.pcg-random.org
+ */
+
+#ifndef __PCG32_H
+#define __PCG32_H 1
+
+#define PCG32_DEFAULT_STATE  0x853c49e6748fea9bULL
+#define PCG32_DEFAULT_STREAM 0xda3e39cb94b95bdbULL
+#define PCG32_MULT           0x5851f42d4c957f2dULL
+
+#include <inttypes.h>
+#include <cmath>
+#include <cassert>
+#include <algorithm>
+
+/// PCG32 Pseudorandom number generator
+struct pcg32 {
+    /// Initialize the pseudorandom number generator with default seed
+    pcg32() : state(PCG32_DEFAULT_STATE), inc(PCG32_DEFAULT_STREAM) {}
+
+    /// Initialize the pseudorandom number generator with the \ref seed() function
+    pcg32(uint64_t initstate, uint64_t initseq = 1u) { seed(initstate, initseq); }
+
+    /**
+     * \brief Seed the pseudorandom number generator
+     *
+     * Specified in two parts: a state initializer and a sequence selection
+     * constant (a.k.a. stream id)
+     */
+    void seed(uint64_t initstate, uint64_t initseq = 1) {
+        state = 0U;
+        inc = (initseq << 1u) | 1u; // the increment must always be odd
+        nextUInt();
+        state += initstate;
+        nextUInt();
+    }
+
+    /// Generate a uniformly distributed unsigned 32-bit random number
+    uint32_t nextUInt() {
+        uint64_t oldstate = state;
+        state = oldstate * PCG32_MULT + inc;
+        uint32_t xorshifted = (uint32_t) (((oldstate >> 18u) ^ oldstate) >> 27u);
+        uint32_t rot = (uint32_t) (oldstate >> 59u);
+        return (xorshifted >> rot) | (xorshifted << ((~rot + 1u) & 31)); // rotate right by rot
+    }
+
+    /// Generate a uniformly distributed number, r, where 0 <= r < bound (bound != 0)
+    uint32_t nextUInt(uint32_t bound) {
+        // To avoid bias, we need to make the range of the RNG a multiple of
+        // bound, which we do by dropping output less than a threshold.
+        // A naive scheme to calculate the threshold would be to do
+        //
+        //     uint32_t threshold = 0x100000000ull % bound;
+        //
+        // but 64-bit div/mod is slower than 32-bit div/mod (especially on
+        // 32-bit platforms).  In essence, we do
+        //
+        //     uint32_t threshold = (0x100000000ull-bound) % bound;
+        //
+        // because this version will calculate the same modulus, but the LHS
+        // value is less than 2^32.
+        assert(bound != 0); // a zero bound would divide by zero below
+        uint32_t threshold = (~bound+1u) % bound;
+
+        // Uniformity guarantees that this loop will terminate.  In practice, it
+        // should usually terminate quickly; on average (assuming all bounds are
+        // equally likely), 82.25% of the time, we can expect it to require just
+        // one iteration.  In the worst case, someone passes a bound of 2^31 + 1
+        // (i.e., 2147483649), which invalidates almost 50% of the range.  In
+        // practice, bounds are typically small and only a tiny amount of the range
+        // is eliminated.
+        for (;;) {
+            uint32_t r = nextUInt();
+            if (r >= threshold)
+                return r % bound;
+        }
+    }
+
+    /// Generate a single precision floating point value on the interval [0, 1)
+    float nextFloat() {
+        /* Trick from MTGP: generate a uniformly distributed
+           single precision number in [1,2) and subtract 1. */
+        union {
+            uint32_t u;
+            float f;
+        } x;
+        x.u = (nextUInt() >> 9) | 0x3f800000u;
+        return x.f - 1.0f;
+    }
+
+    /**
+     * \brief Generate a double precision floating point value on the interval [0, 1)
+     *
+     * \remark Since the underlying random number generator produces 32 bit output,
+     * only the first 32 mantissa bits will be filled (however, the resolution is still
+     * finer than in \ref nextFloat(), which only uses 23 mantissa bits)
+     */
+    double nextDouble() {
+        /* Trick from MTGP: generate a uniformly distributed
+           double precision number in [1,2) and subtract 1. */
+        union {
+            uint64_t u;
+            double d;
+        } x;
+        x.u = ((uint64_t) nextUInt() << 20) | 0x3ff0000000000000ULL;
+        return x.d - 1.0;
+    }
+
+    /**
+     * \brief Multi-step advance function (jump-ahead, jump-back)
+     *
+     * The method used here is based on Brown, "Random Number Generation
+     * with Arbitrary Stride", Transactions of the American Nuclear
+     * Society (Nov. 1994). The algorithm is very similar to fast
+     * exponentiation.
+     */
+    void advance(int64_t delta_) {
+        uint64_t
+            cur_mult = PCG32_MULT,
+            cur_plus = inc,
+            acc_mult = 1u,
+            acc_plus = 0u;
+
+        /* Even though delta is an unsigned integer, we can pass a signed
+           integer to go backwards, it just goes "the long way round". */
+        uint64_t delta = (uint64_t) delta_;
+
+        while (delta > 0) {
+            if (delta & 1) {
+                acc_mult *= cur_mult;
+                acc_plus = acc_plus * cur_mult + cur_plus;
+            }
+            cur_plus = (cur_mult + 1) * cur_plus;
+            cur_mult *= cur_mult;
+            delta /= 2;
+        }
+        state = acc_mult * state + acc_plus;
+    }
+
+    /**
+     * \brief Draw uniformly distributed permutation and permute the given
+     * random-access range [begin, end); an empty range is left untouched
+     *
+     * From: Knuth, TAoCP Vol. 2 (3rd ed.), Section 3.4.2
+     */
+    template <typename Iterator> void shuffle(Iterator begin, Iterator end) {
+        for (Iterator it = end; it - begin > 1; --it) // never steps before begin
+            std::iter_swap(it - 1, begin + nextUInt((uint32_t) (it - begin)));
+    }
+
+    /// Compute the distance between two PCG32 pseudorandom number generators
+    int64_t operator-(const pcg32 &other) const {
+        assert(inc == other.inc);
+
+        uint64_t
+            cur_mult = PCG32_MULT,
+            cur_plus = inc,
+            cur_state = other.state,
+            the_bit = 1u,
+            distance = 0u;
+
+        while (state != cur_state) {
+            if ((state & the_bit) != (cur_state & the_bit)) {
+                cur_state = cur_state * cur_mult + cur_plus;
+                distance |= the_bit;
+            }
+            assert((state & the_bit) == (cur_state & the_bit));
+            the_bit <<= 1;
+            cur_plus = (cur_mult + 1ULL) * cur_plus;
+            cur_mult *= cur_mult;
+        }
+
+        return (int64_t) distance;
+    }
+
+    /// Equality operator
+    bool operator==(const pcg32 &other) const { return state == other.state && inc == other.inc; }
+
+    /// Inequality operator
+    bool operator!=(const pcg32 &other) const { return state != other.state || inc != other.inc; }
+
+    uint64_t state;  // RNG state.  All values are possible.
+    uint64_t inc;    // Controls which RNG sequence (stream) is selected. Must *always* be odd.
+};
+
+#endif // __PCG32_H
--- a/cs440-acg/ext/pcg32/pcg32_8.h
+++ b/cs440-acg/ext/pcg32/pcg32_8.h
@@ -0,0 +1,284 @@
+/*
+ * Vectorized AVX2 version of the PCG32 random number generator developed by
+ * Wenzel Jakob (June 2016)
+ *
+ * The PCG random number generator was developed by Melissa O'Neill
+ * <oneill@pcg-random.org>
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ * For additional information about the PCG random number generation scheme,
+ * including its license and other licensing options, visit
+ *
+ *     http://www.pcg-random.org
+ */
+
+#include "pcg32.h"
+#include <immintrin.h>
+#include <utility>
+
+#if defined(_MSC_VER)
+#  define PCG32_ALIGN(amt)    __declspec(align(amt))
+#  define PCG32_VECTORCALL    __vectorcall
+#  define PCG32_INLINE        __forceinline
+#else
+#  define PCG32_ALIGN(amt)    __attribute__ ((aligned(amt)))
+#  define PCG32_INLINE        __attribute__ ((always_inline))
+#  if defined(__clang__)
+#    define PCG32_VECTORCALL  __attribute__ ((vectorcall))
+#  else
+#    define PCG32_VECTORCALL
+#  endif
+#endif
+
+/// 8 parallel PCG32 pseudorandom number generators
+struct PCG32_ALIGN(32) pcg32_8 {
+
+#if defined(__AVX2__)
+    __m256i state[2]; // RNG state.  All values are possible.
+    __m256i inc[2];   // Controls which RNG sequence (stream) is selected. Must *always* be odd.
+#else
+    /* Scalar fallback */
+    pcg32 rng[8];
+#endif
+
+    /// Initialize the pseudorandom number generator with default seed
+    pcg32_8() {
+        PCG32_ALIGN(32) uint64_t initstate[8] = {
+            PCG32_DEFAULT_STATE, PCG32_DEFAULT_STATE,
+            PCG32_DEFAULT_STATE, PCG32_DEFAULT_STATE,
+            PCG32_DEFAULT_STATE, PCG32_DEFAULT_STATE,
+            PCG32_DEFAULT_STATE, PCG32_DEFAULT_STATE
+        };
+
+        PCG32_ALIGN(32) uint64_t initseq[8] =
+            { 1, 2, 3, 4, 5, 6, 7, 8 };
+
+        seed(initstate, initseq);
+    }
+
+    /// Initialize the pseudorandom number generator with the \ref seed() function
+    pcg32_8(const uint64_t initstate[8], const uint64_t initseq[8]) {
+        seed(initstate, initseq);
+    }
+
+
+#if defined(__AVX2__)
+    /**
+     * \brief Seed the pseudorandom number generator
+     *
+     * Specified in two parts: a state initializer and a sequence selection
+     * constant (a.k.a. stream id). The arrays need not be 32-byte aligned.
+     */
+    void seed(const uint64_t initstate[8], const uint64_t initseq[8]) {
+        const __m256i one = _mm256_set1_epi64x((long long) 1);
+
+        state[0] = state[1] = _mm256_setzero_si256();
+        inc[0] = _mm256_or_si256(
+            _mm256_slli_epi64(_mm256_loadu_si256((const __m256i *) &initseq[0]), 1),
+            one);
+        inc[1] = _mm256_or_si256(
+            _mm256_slli_epi64(_mm256_loadu_si256((const __m256i *) &initseq[4]), 1),
+            one);
+        step();
+
+        state[0] = _mm256_add_epi64(state[0], _mm256_loadu_si256((const __m256i *) &initstate[0]));
+        state[1] = _mm256_add_epi64(state[1], _mm256_loadu_si256((const __m256i *) &initstate[4]));
+
+        step();
+    }
+
+    /// Generate 8 uniformly distributed unsigned 32-bit random numbers (result may be unaligned)
+    void nextUInt(uint32_t result[8]) {
+        _mm256_storeu_si256((__m256i *) result, step());
+    }
+
+    /// Generate 8 uniformly distributed unsigned 32-bit random numbers
+    __m256i PCG32_VECTORCALL nextUInt() {
+        return step();
+    }
+
+    /// Generate eight single precision floating point values on the interval [0, 1)
+    __m256 PCG32_VECTORCALL nextFloat() {
+        /* Trick from MTGP: generate a uniformly distributed
+           single precision number in [1,2) and subtract 1. */
+
+        const __m256i const1 = _mm256_set1_epi32((int) 0x3f800000u);
+
+        __m256i value = step();
+        __m256i fltval = _mm256_or_si256(_mm256_srli_epi32(value, 9), const1);
+
+        return _mm256_sub_ps(_mm256_castsi256_ps(fltval),
+                             _mm256_castsi256_ps(const1));
+    }
+
+    /// Generate eight single precision floating point values on the interval [0, 1)
+    void nextFloat(float result[8]) {
+        _mm256_storeu_ps(result, nextFloat());
+    }
+
+    /**
+     * \brief Generate eight double precision floating point values on the interval [0, 1)
+     *
+     * \remark Since the underlying random number generator produces 32 bit output,
+     * only the first 32 mantissa bits will be filled (however, the resolution is still
+     * finer than in \ref nextFloat(), which only uses 23 mantissa bits)
+     */
+    std::pair<__m256d, __m256d> nextDouble() {
+        /* Trick from MTGP: generate a uniformly distributed
+           double precision number in [1,2) and subtract 1. */
+
+        const __m256i const1 =
+            _mm256_set1_epi64x((long long) 0x3ff0000000000000ull);
+
+        __m256i value = step();
+
+        __m256i lo = _mm256_cvtepu32_epi64(_mm256_castsi256_si128(value));
+        __m256i hi = _mm256_cvtepu32_epi64(_mm256_extractf128_si256(value, 1));
+
+        __m256i tlo = _mm256_or_si256(_mm256_slli_epi64(lo, 20), const1);
+        __m256i thi = _mm256_or_si256(_mm256_slli_epi64(hi, 20), const1);
+
+        __m256d flo = _mm256_sub_pd(_mm256_castsi256_pd(tlo),
+                                    _mm256_castsi256_pd(const1));
+
+        __m256d fhi = _mm256_sub_pd(_mm256_castsi256_pd(thi),
+                                    _mm256_castsi256_pd(const1));
+
+        return std::make_pair(flo, fhi);
+    }
+
+    /**
+     * \brief Generate eight double precision floating point values on the interval [0, 1)
+     *
+     * \remark Since the underlying random number generator produces 32 bit output,
+     * only the first 32 mantissa bits will be filled (however, the resolution is still
+     * finer than in \ref nextFloat(), which only uses 23 mantissa bits)
+     */
+    void nextDouble(double result[8]) {
+        std::pair<__m256d, __m256d> value = nextDouble();
+
+        _mm256_storeu_pd(&result[0], value.first);
+        _mm256_storeu_pd(&result[4], value.second);
+    }
+
+private:
+    PCG32_INLINE __m256i PCG32_VECTORCALL step() {
+        const __m256i pcg32_mult_l = _mm256_set1_epi64x((long long) (PCG32_MULT & 0xffffffffu));
+        const __m256i pcg32_mult_h = _mm256_set1_epi64x((long long) (PCG32_MULT >> 32));
+        const __m256i mask_l       = _mm256_set1_epi64x((long long) 0x00000000ffffffffull);
+        const __m256i shift0       = _mm256_set_epi32(7, 7, 7, 7, 6, 4, 2, 0);
+        const __m256i shift1       = _mm256_set_epi32(6, 4, 2, 0, 7, 7, 7, 7);
+        const __m256i const32      = _mm256_set1_epi32(32);
+
+        __m256i s0 = state[0], s1 = state[1];
+
+        /* Extract low and high words for partial products below */
+        __m256i s0_l = _mm256_and_si256(s0, mask_l);
+        __m256i s0_h = _mm256_srli_epi64(s0, 32);
+        __m256i s1_l = _mm256_and_si256(s1, mask_l);
+        __m256i s1_h = _mm256_srli_epi64(s1, 32);
+
+        /* Improve high bits using xorshift step */
+        __m256i s0s   = _mm256_srli_epi64(s0, 18);
+        __m256i s1s   = _mm256_srli_epi64(s1, 18);
+
+        __m256i s0x   = _mm256_xor_si256(s0s, s0);
+        __m256i s1x   = _mm256_xor_si256(s1s, s1);
+
+        __m256i s0xs  = _mm256_srli_epi64(s0x, 27);
+        __m256i s1xs  = _mm256_srli_epi64(s1x, 27);
+
+        __m256i xors0 = _mm256_and_si256(mask_l, s0xs);
+        __m256i xors1 = _mm256_and_si256(mask_l, s1xs);
+
+        /* Use high bits to choose a bit-level rotation */
+        __m256i rot0  = _mm256_srli_epi64(s0, 59);
+        __m256i rot1  = _mm256_srli_epi64(s1, 59);
+
+        /* 64 bit multiplication using 32 bit partial products :( */
+        __m256i m0_hl = _mm256_mul_epu32(s0_h, pcg32_mult_l);
+        __m256i m1_hl = _mm256_mul_epu32(s1_h, pcg32_mult_l);
+        __m256i m0_lh = _mm256_mul_epu32(s0_l, pcg32_mult_h);
+        __m256i m1_lh = _mm256_mul_epu32(s1_l, pcg32_mult_h);
+
+        /* Assemble lower 32 bits, will be merged into one 256 bit vector below */
+        xors0 = _mm256_permutevar8x32_epi32(xors0, shift0);
+        rot0  = _mm256_permutevar8x32_epi32(rot0, shift0);
+        xors1 = _mm256_permutevar8x32_epi32(xors1, shift1);
+        rot1  = _mm256_permutevar8x32_epi32(rot1, shift1);
+
+        /* Continue with partial products */
+        __m256i m0_ll = _mm256_mul_epu32(s0_l, pcg32_mult_l);
+        __m256i m1_ll = _mm256_mul_epu32(s1_l, pcg32_mult_l);
+
+        __m256i m0h   = _mm256_add_epi64(m0_hl, m0_lh);
+        __m256i m1h   = _mm256_add_epi64(m1_hl, m1_lh);
+
+        __m256i m0hs  = _mm256_slli_epi64(m0h, 32);
+        __m256i m1hs  = _mm256_slli_epi64(m1h, 32);
+
+        __m256i s0n   = _mm256_add_epi64(m0hs, m0_ll);
+        __m256i s1n   = _mm256_add_epi64(m1hs, m1_ll);
+
+        __m256i xors  = _mm256_or_si256(xors0, xors1);
+        __m256i rot   = _mm256_or_si256(rot0, rot1);
+
+        state[0] = _mm256_add_epi64(s0n, inc[0]);
+        state[1] = _mm256_add_epi64(s1n, inc[1]);
+
+        /* Finally, rotate and return the result */
+        __m256i result = _mm256_or_si256(
+            _mm256_srlv_epi32(xors, rot),
+            _mm256_sllv_epi32(xors, _mm256_sub_epi32(const32, rot))
+        );
+
+        return result;
+    }
+#else
+    /**
+     * \brief Seed the pseudorandom number generator
+     *
+     * Specified in two parts: a state initializer and a sequence selection
+     * constant (a.k.a. stream id)
+     */
+    void seed(const uint64_t initstate[8], const uint64_t initseq[8]) {
+        for (int i = 0; i < 8; ++i)
+            rng[i].seed(initstate[i], initseq[i]);
+    }
+
+    /// Generate 8 uniformly distributed unsigned 32-bit random numbers
+    void nextUInt(uint32_t result[8]) {
+        for (int i = 0; i < 8; ++i)
+            result[i] = rng[i].nextUInt();
+    }
+
+    /// Generate eight single precision floating point values on the interval [0, 1)
+    void nextFloat(float result[8]) {
+        for (int i = 0; i < 8; ++i)
+            result[i] = rng[i].nextFloat();
+    }
+
+    /**
+     * \brief Generate eight double precision floating point values on the interval [0, 1)
+     *
+     * \remark Since the underlying random number generator produces 32 bit output,
+     * only the first 32 mantissa bits will be filled (however, the resolution is still
+     * finer than in \ref nextFloat(), which only uses 23 mantissa bits)
+     */
+    void nextDouble(double result[8]) {
+        for (int i = 0; i < 8; ++i)
+            result[i] = rng[i].nextDouble();
+    }
+#endif
+};