Add SHAKE-128 vectorized 4x implementation and use in Frodo

This commit is contained in:
Douglas Stebila 2019-07-08 21:10:04 -04:00
parent 5f383ebe56
commit cdcedc6ff3
10 changed files with 1083 additions and 4 deletions

View File

@ -5,5 +5,8 @@ libsha3_la_SOURCES = sha3_c.c
if USE_SHA3_OPENSSL
libsha3_la_SOURCES += sha3_ossl.c
endif
if USE_AVX2_INSTRUCTIONS
libsha3_la_SOURCES += sha3x4.c
endif
sha3_c.c: fips202.c sp800-185.c

View File

@ -0,0 +1,427 @@
/*
Implementation by the Keccak, Keyak and Ketje Teams, namely, Guido Bertoni,
Joan Daemen, Michaël Peeters, Gilles Van Assche and Ronny Van Keer, hereby
denoted as "the implementer".
For more information, feedback or questions, please refer to our websites:
http://keccak.noekeon.org/
http://keyak.noekeon.org/
http://ketje.noekeon.org/
To the extent possible under law, the implementer has waived all copyright
and related or neighboring rights to the source code in this file.
http://creativecommons.org/publicdomain/zero/1.0/
*/
/* Simplified for liboqs: remove code that wasn't used in shake128_4x */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <smmintrin.h>
#include <wmmintrin.h>
#include <immintrin.h>
#include <emmintrin.h>
#include "align.h"
#include "KeccakP-1600-times4-SnP.h"
#include "SIMD256-config.h"
#include "brg_endian.h"
#if (PLATFORM_BYTE_ORDER != IS_LITTLE_ENDIAN)
#error Expecting a little-endian platform
#endif
typedef unsigned char UINT8;
typedef unsigned long long int UINT64;
typedef __m128i V128;
typedef __m256i V256;
#define laneIndex(instanceIndex, lanePosition) ((lanePosition)*4 + instanceIndex)
#if defined(KeccakP1600times4_useAVX2)
#define ANDnu256(a, b) _mm256_andnot_si256(a, b)
#define CONST256(a) _mm256_load_si256((const V256 *)&(a))
#define CONST256_64(a) (V256)_mm256_broadcast_sd((const double*)(&a))
#define LOAD256(a) _mm256_load_si256((const V256 *)&(a))
#define LOAD256u(a) _mm256_loadu_si256((const V256 *)&(a))
#define LOAD4_64(a, b, c, d) _mm256_set_epi64x((UINT64)(a), (UINT64)(b), (UINT64)(c), (UINT64)(d))
#define ROL64in256(d, a, o) d = _mm256_or_si256(_mm256_slli_epi64(a, o), _mm256_srli_epi64(a, 64-(o)))
#define ROL64in256_8(d, a) d = _mm256_shuffle_epi8(a, CONST256(rho8))
#define ROL64in256_56(d, a) d = _mm256_shuffle_epi8(a, CONST256(rho56))
static const UINT64 rho8[4] = {0x0605040302010007, 0x0E0D0C0B0A09080F, 0x1615141312111017, 0x1E1D1C1B1A19181F};
static const UINT64 rho56[4] = {0x0007060504030201, 0x080F0E0D0C0B0A09, 0x1017161514131211, 0x181F1E1D1C1B1A19};
#define STORE256(a, b) _mm256_store_si256((V256 *)&(a), b)
#define STORE256u(a, b) _mm256_storeu_si256((V256 *)&(a), b)
#define STORE2_128(ah, al, v) _mm256_storeu2_m128d((V128*)&(ah), (V128*)&(al), v)
#define XOR256(a, b) _mm256_xor_si256(a, b)
#define XOReq256(a, b) a = _mm256_xor_si256(a, b)
#define UNPACKL( a, b ) _mm256_unpacklo_epi64((a), (b))
#define UNPACKH( a, b ) _mm256_unpackhi_epi64((a), (b))
#define PERM128( a, b, c ) (V256)_mm256_permute2f128_ps((__m256)(a), (__m256)(b), c)
#define SHUFFLE64( a, b, c ) (V256)_mm256_shuffle_pd((__m256d)(a), (__m256d)(b), c)
#define UNINTLEAVE() lanesL01 = UNPACKL( lanes0, lanes1 ), \
lanesH01 = UNPACKH( lanes0, lanes1 ), \
lanesL23 = UNPACKL( lanes2, lanes3 ), \
lanesH23 = UNPACKH( lanes2, lanes3 ), \
lanes0 = PERM128( lanesL01, lanesL23, 0x20 ), \
lanes2 = PERM128( lanesL01, lanesL23, 0x31 ), \
lanes1 = PERM128( lanesH01, lanesH23, 0x20 ), \
lanes3 = PERM128( lanesH01, lanesH23, 0x31 )
#define INTLEAVE() lanesL01 = PERM128( lanes0, lanes2, 0x20 ), \
lanesH01 = PERM128( lanes1, lanes3, 0x20 ), \
lanesL23 = PERM128( lanes0, lanes2, 0x31 ), \
lanesH23 = PERM128( lanes1, lanes3, 0x31 ), \
lanes0 = SHUFFLE64( lanesL01, lanesH01, 0x00 ), \
lanes1 = SHUFFLE64( lanesL01, lanesH01, 0x0F ), \
lanes2 = SHUFFLE64( lanesL23, lanesH23, 0x00 ), \
lanes3 = SHUFFLE64( lanesL23, lanesH23, 0x0F )
#endif
#define SnP_laneLengthInBytes 8
#define declareABCDE \
V256 Aba, Abe, Abi, Abo, Abu; \
V256 Aga, Age, Agi, Ago, Agu; \
V256 Aka, Ake, Aki, Ako, Aku; \
V256 Ama, Ame, Ami, Amo, Amu; \
V256 Asa, Ase, Asi, Aso, Asu; \
V256 Bba, Bbe, Bbi, Bbo, Bbu; \
V256 Bga, Bge, Bgi, Bgo, Bgu; \
V256 Bka, Bke, Bki, Bko, Bku; \
V256 Bma, Bme, Bmi, Bmo, Bmu; \
V256 Bsa, Bse, Bsi, Bso, Bsu; \
V256 Ca, Ce, Ci, Co, Cu; \
V256 Ca1, Ce1, Ci1, Co1, Cu1; \
V256 Da, De, Di, Do, Du; \
V256 Eba, Ebe, Ebi, Ebo, Ebu; \
V256 Ega, Ege, Egi, Ego, Egu; \
V256 Eka, Eke, Eki, Eko, Eku; \
V256 Ema, Eme, Emi, Emo, Emu; \
V256 Esa, Ese, Esi, Eso, Esu; \
#define prepareTheta \
Ca = XOR256(Aba, XOR256(Aga, XOR256(Aka, XOR256(Ama, Asa)))); \
Ce = XOR256(Abe, XOR256(Age, XOR256(Ake, XOR256(Ame, Ase)))); \
Ci = XOR256(Abi, XOR256(Agi, XOR256(Aki, XOR256(Ami, Asi)))); \
Co = XOR256(Abo, XOR256(Ago, XOR256(Ako, XOR256(Amo, Aso)))); \
Cu = XOR256(Abu, XOR256(Agu, XOR256(Aku, XOR256(Amu, Asu)))); \
/* --- Theta Rho Pi Chi Iota Prepare-theta */
/* --- 64-bit lanes mapped to 64-bit words */
#define thetaRhoPiChiIotaPrepareTheta(i, A, E) \
ROL64in256(Ce1, Ce, 1); \
Da = XOR256(Cu, Ce1); \
ROL64in256(Ci1, Ci, 1); \
De = XOR256(Ca, Ci1); \
ROL64in256(Co1, Co, 1); \
Di = XOR256(Ce, Co1); \
ROL64in256(Cu1, Cu, 1); \
Do = XOR256(Ci, Cu1); \
ROL64in256(Ca1, Ca, 1); \
Du = XOR256(Co, Ca1); \
\
XOReq256(A##ba, Da); \
Bba = A##ba; \
XOReq256(A##ge, De); \
ROL64in256(Bbe, A##ge, 44); \
XOReq256(A##ki, Di); \
ROL64in256(Bbi, A##ki, 43); \
E##ba = XOR256(Bba, ANDnu256(Bbe, Bbi)); \
XOReq256(E##ba, CONST256_64(KeccakF1600RoundConstants[i])); \
Ca = E##ba; \
XOReq256(A##mo, Do); \
ROL64in256(Bbo, A##mo, 21); \
E##be = XOR256(Bbe, ANDnu256(Bbi, Bbo)); \
Ce = E##be; \
XOReq256(A##su, Du); \
ROL64in256(Bbu, A##su, 14); \
E##bi = XOR256(Bbi, ANDnu256(Bbo, Bbu)); \
Ci = E##bi; \
E##bo = XOR256(Bbo, ANDnu256(Bbu, Bba)); \
Co = E##bo; \
E##bu = XOR256(Bbu, ANDnu256(Bba, Bbe)); \
Cu = E##bu; \
\
XOReq256(A##bo, Do); \
ROL64in256(Bga, A##bo, 28); \
XOReq256(A##gu, Du); \
ROL64in256(Bge, A##gu, 20); \
XOReq256(A##ka, Da); \
ROL64in256(Bgi, A##ka, 3); \
E##ga = XOR256(Bga, ANDnu256(Bge, Bgi)); \
XOReq256(Ca, E##ga); \
XOReq256(A##me, De); \
ROL64in256(Bgo, A##me, 45); \
E##ge = XOR256(Bge, ANDnu256(Bgi, Bgo)); \
XOReq256(Ce, E##ge); \
XOReq256(A##si, Di); \
ROL64in256(Bgu, A##si, 61); \
E##gi = XOR256(Bgi, ANDnu256(Bgo, Bgu)); \
XOReq256(Ci, E##gi); \
E##go = XOR256(Bgo, ANDnu256(Bgu, Bga)); \
XOReq256(Co, E##go); \
E##gu = XOR256(Bgu, ANDnu256(Bga, Bge)); \
XOReq256(Cu, E##gu); \
\
XOReq256(A##be, De); \
ROL64in256(Bka, A##be, 1); \
XOReq256(A##gi, Di); \
ROL64in256(Bke, A##gi, 6); \
XOReq256(A##ko, Do); \
ROL64in256(Bki, A##ko, 25); \
E##ka = XOR256(Bka, ANDnu256(Bke, Bki)); \
XOReq256(Ca, E##ka); \
XOReq256(A##mu, Du); \
ROL64in256_8(Bko, A##mu); \
E##ke = XOR256(Bke, ANDnu256(Bki, Bko)); \
XOReq256(Ce, E##ke); \
XOReq256(A##sa, Da); \
ROL64in256(Bku, A##sa, 18); \
E##ki = XOR256(Bki, ANDnu256(Bko, Bku)); \
XOReq256(Ci, E##ki); \
E##ko = XOR256(Bko, ANDnu256(Bku, Bka)); \
XOReq256(Co, E##ko); \
E##ku = XOR256(Bku, ANDnu256(Bka, Bke)); \
XOReq256(Cu, E##ku); \
\
XOReq256(A##bu, Du); \
ROL64in256(Bma, A##bu, 27); \
XOReq256(A##ga, Da); \
ROL64in256(Bme, A##ga, 36); \
XOReq256(A##ke, De); \
ROL64in256(Bmi, A##ke, 10); \
E##ma = XOR256(Bma, ANDnu256(Bme, Bmi)); \
XOReq256(Ca, E##ma); \
XOReq256(A##mi, Di); \
ROL64in256(Bmo, A##mi, 15); \
E##me = XOR256(Bme, ANDnu256(Bmi, Bmo)); \
XOReq256(Ce, E##me); \
XOReq256(A##so, Do); \
ROL64in256_56(Bmu, A##so); \
E##mi = XOR256(Bmi, ANDnu256(Bmo, Bmu)); \
XOReq256(Ci, E##mi); \
E##mo = XOR256(Bmo, ANDnu256(Bmu, Bma)); \
XOReq256(Co, E##mo); \
E##mu = XOR256(Bmu, ANDnu256(Bma, Bme)); \
XOReq256(Cu, E##mu); \
\
XOReq256(A##bi, Di); \
ROL64in256(Bsa, A##bi, 62); \
XOReq256(A##go, Do); \
ROL64in256(Bse, A##go, 55); \
XOReq256(A##ku, Du); \
ROL64in256(Bsi, A##ku, 39); \
E##sa = XOR256(Bsa, ANDnu256(Bse, Bsi)); \
XOReq256(Ca, E##sa); \
XOReq256(A##ma, Da); \
ROL64in256(Bso, A##ma, 41); \
E##se = XOR256(Bse, ANDnu256(Bsi, Bso)); \
XOReq256(Ce, E##se); \
XOReq256(A##se, De); \
ROL64in256(Bsu, A##se, 2); \
E##si = XOR256(Bsi, ANDnu256(Bso, Bsu)); \
XOReq256(Ci, E##si); \
E##so = XOR256(Bso, ANDnu256(Bsu, Bsa)); \
XOReq256(Co, E##so); \
E##su = XOR256(Bsu, ANDnu256(Bsa, Bse)); \
XOReq256(Cu, E##su); \
\
/* --- Theta Rho Pi Chi Iota */
/* --- 64-bit lanes mapped to 64-bit words */
#define thetaRhoPiChiIota(i, A, E) \
ROL64in256(Ce1, Ce, 1); \
Da = XOR256(Cu, Ce1); \
ROL64in256(Ci1, Ci, 1); \
De = XOR256(Ca, Ci1); \
ROL64in256(Co1, Co, 1); \
Di = XOR256(Ce, Co1); \
ROL64in256(Cu1, Cu, 1); \
Do = XOR256(Ci, Cu1); \
ROL64in256(Ca1, Ca, 1); \
Du = XOR256(Co, Ca1); \
\
XOReq256(A##ba, Da); \
Bba = A##ba; \
XOReq256(A##ge, De); \
ROL64in256(Bbe, A##ge, 44); \
XOReq256(A##ki, Di); \
ROL64in256(Bbi, A##ki, 43); \
E##ba = XOR256(Bba, ANDnu256(Bbe, Bbi)); \
XOReq256(E##ba, CONST256_64(KeccakF1600RoundConstants[i])); \
XOReq256(A##mo, Do); \
ROL64in256(Bbo, A##mo, 21); \
E##be = XOR256(Bbe, ANDnu256(Bbi, Bbo)); \
XOReq256(A##su, Du); \
ROL64in256(Bbu, A##su, 14); \
E##bi = XOR256(Bbi, ANDnu256(Bbo, Bbu)); \
E##bo = XOR256(Bbo, ANDnu256(Bbu, Bba)); \
E##bu = XOR256(Bbu, ANDnu256(Bba, Bbe)); \
\
XOReq256(A##bo, Do); \
ROL64in256(Bga, A##bo, 28); \
XOReq256(A##gu, Du); \
ROL64in256(Bge, A##gu, 20); \
XOReq256(A##ka, Da); \
ROL64in256(Bgi, A##ka, 3); \
E##ga = XOR256(Bga, ANDnu256(Bge, Bgi)); \
XOReq256(A##me, De); \
ROL64in256(Bgo, A##me, 45); \
E##ge = XOR256(Bge, ANDnu256(Bgi, Bgo)); \
XOReq256(A##si, Di); \
ROL64in256(Bgu, A##si, 61); \
E##gi = XOR256(Bgi, ANDnu256(Bgo, Bgu)); \
E##go = XOR256(Bgo, ANDnu256(Bgu, Bga)); \
E##gu = XOR256(Bgu, ANDnu256(Bga, Bge)); \
\
XOReq256(A##be, De); \
ROL64in256(Bka, A##be, 1); \
XOReq256(A##gi, Di); \
ROL64in256(Bke, A##gi, 6); \
XOReq256(A##ko, Do); \
ROL64in256(Bki, A##ko, 25); \
E##ka = XOR256(Bka, ANDnu256(Bke, Bki)); \
XOReq256(A##mu, Du); \
ROL64in256_8(Bko, A##mu); \
E##ke = XOR256(Bke, ANDnu256(Bki, Bko)); \
XOReq256(A##sa, Da); \
ROL64in256(Bku, A##sa, 18); \
E##ki = XOR256(Bki, ANDnu256(Bko, Bku)); \
E##ko = XOR256(Bko, ANDnu256(Bku, Bka)); \
E##ku = XOR256(Bku, ANDnu256(Bka, Bke)); \
\
XOReq256(A##bu, Du); \
ROL64in256(Bma, A##bu, 27); \
XOReq256(A##ga, Da); \
ROL64in256(Bme, A##ga, 36); \
XOReq256(A##ke, De); \
ROL64in256(Bmi, A##ke, 10); \
E##ma = XOR256(Bma, ANDnu256(Bme, Bmi)); \
XOReq256(A##mi, Di); \
ROL64in256(Bmo, A##mi, 15); \
E##me = XOR256(Bme, ANDnu256(Bmi, Bmo)); \
XOReq256(A##so, Do); \
ROL64in256_56(Bmu, A##so); \
E##mi = XOR256(Bmi, ANDnu256(Bmo, Bmu)); \
E##mo = XOR256(Bmo, ANDnu256(Bmu, Bma)); \
E##mu = XOR256(Bmu, ANDnu256(Bma, Bme)); \
\
XOReq256(A##bi, Di); \
ROL64in256(Bsa, A##bi, 62); \
XOReq256(A##go, Do); \
ROL64in256(Bse, A##go, 55); \
XOReq256(A##ku, Du); \
ROL64in256(Bsi, A##ku, 39); \
E##sa = XOR256(Bsa, ANDnu256(Bse, Bsi)); \
XOReq256(A##ma, Da); \
ROL64in256(Bso, A##ma, 41); \
E##se = XOR256(Bse, ANDnu256(Bsi, Bso)); \
XOReq256(A##se, De); \
ROL64in256(Bsu, A##se, 2); \
E##si = XOR256(Bsi, ANDnu256(Bso, Bsu)); \
E##so = XOR256(Bso, ANDnu256(Bsu, Bsa)); \
E##su = XOR256(Bsu, ANDnu256(Bsa, Bse)); \
\
static ALIGN(KeccakP1600times4_statesAlignment) const UINT64 KeccakF1600RoundConstants[24] = {
0x0000000000000001ULL,
0x0000000000008082ULL,
0x800000000000808aULL,
0x8000000080008000ULL,
0x000000000000808bULL,
0x0000000080000001ULL,
0x8000000080008081ULL,
0x8000000000008009ULL,
0x000000000000008aULL,
0x0000000000000088ULL,
0x0000000080008009ULL,
0x000000008000000aULL,
0x000000008000808bULL,
0x800000000000008bULL,
0x8000000000008089ULL,
0x8000000000008003ULL,
0x8000000000008002ULL,
0x8000000000000080ULL,
0x000000000000800aULL,
0x800000008000000aULL,
0x8000000080008081ULL,
0x8000000000008080ULL,
0x0000000080000001ULL,
0x8000000080008008ULL};
#define copyFromState(X, state) \
X##ba = LOAD256(state[ 0]); \
X##be = LOAD256(state[ 1]); \
X##bi = LOAD256(state[ 2]); \
X##bo = LOAD256(state[ 3]); \
X##bu = LOAD256(state[ 4]); \
X##ga = LOAD256(state[ 5]); \
X##ge = LOAD256(state[ 6]); \
X##gi = LOAD256(state[ 7]); \
X##go = LOAD256(state[ 8]); \
X##gu = LOAD256(state[ 9]); \
X##ka = LOAD256(state[10]); \
X##ke = LOAD256(state[11]); \
X##ki = LOAD256(state[12]); \
X##ko = LOAD256(state[13]); \
X##ku = LOAD256(state[14]); \
X##ma = LOAD256(state[15]); \
X##me = LOAD256(state[16]); \
X##mi = LOAD256(state[17]); \
X##mo = LOAD256(state[18]); \
X##mu = LOAD256(state[19]); \
X##sa = LOAD256(state[20]); \
X##se = LOAD256(state[21]); \
X##si = LOAD256(state[22]); \
X##so = LOAD256(state[23]); \
X##su = LOAD256(state[24]); \
#define copyToState(state, X) \
STORE256(state[ 0], X##ba); \
STORE256(state[ 1], X##be); \
STORE256(state[ 2], X##bi); \
STORE256(state[ 3], X##bo); \
STORE256(state[ 4], X##bu); \
STORE256(state[ 5], X##ga); \
STORE256(state[ 6], X##ge); \
STORE256(state[ 7], X##gi); \
STORE256(state[ 8], X##go); \
STORE256(state[ 9], X##gu); \
STORE256(state[10], X##ka); \
STORE256(state[11], X##ke); \
STORE256(state[12], X##ki); \
STORE256(state[13], X##ko); \
STORE256(state[14], X##ku); \
STORE256(state[15], X##ma); \
STORE256(state[16], X##me); \
STORE256(state[17], X##mi); \
STORE256(state[18], X##mo); \
STORE256(state[19], X##mu); \
STORE256(state[20], X##sa); \
STORE256(state[21], X##se); \
STORE256(state[22], X##si); \
STORE256(state[23], X##so); \
STORE256(state[24], X##su); \
#ifdef KeccakP1600times4_fullUnrolling
#define FullUnrolling
#else
#define Unrolling KeccakP1600times4_unrolling
#endif
#include "KeccakP-1600-unrolling.macros"
/*
 * Apply the full 24-round Keccak-f[1600] permutation to 4 interleaved
 * 1600-bit states stored in `states`.
 *
 * `states` must be 32-byte aligned (copyFromState/copyToState expand to
 * aligned 256-bit loads/stores via LOAD256/STORE256), and lane j of
 * instance k lives at 256-bit word index j, 64-bit sub-lane k.
 */
static void KeccakP1600times4_PermuteAll_24rounds(void *states)
{
V256 *statesAsLanes = (V256 *)states;
/* Declares the A/B/C/D/E working registers used by the round macros. */
declareABCDE
#ifndef KeccakP1600times4_fullUnrolling
/* Loop counter only needed when rounds24 expands to a loop. */
unsigned int i;
#endif
copyFromState(A, statesAsLanes)
rounds24
copyToState(statesAsLanes, A)
}

View File

@ -0,0 +1,39 @@
/*
Implementation by the Keccak, Keyak and Ketje Teams, namely, Guido Bertoni,
Joan Daemen, Michaël Peeters, Gilles Van Assche and Ronny Van Keer, hereby
denoted as "the implementer".
For more information, feedback or questions, please refer to our websites:
http://keccak.noekeon.org/
http://keyak.noekeon.org/
http://ketje.noekeon.org/
To the extent possible under law, the implementer has waived all copyright
and related or neighboring rights to the source code in this file.
http://creativecommons.org/publicdomain/zero/1.0/
*/
/* Simplified for liboqs: remove code that wasn't used in shake128_4x */
#ifndef _KeccakP_1600_times4_SnP_h_
#define _KeccakP_1600_times4_SnP_h_
/** For the documentation, see PlSnP-documentation.h.
*/
#include "SIMD256-config.h"
#define KeccakP1600times4_implementation "256-bit SIMD implementation (" KeccakP1600times4_implementation_config ")"
#define KeccakP1600times4_statesSizeInBytes 800
#define KeccakP1600times4_statesAlignment 32
#define KeccakF1600times4_FastLoop_supported
#define KeccakP1600times4_12rounds_FastLoop_supported
#include <stddef.h>
#define KeccakP1600times4_StaticInitialize()
#define KeccakP1600times4_AddByte(states, instanceIndex, byte, offset) \
((unsigned char*)(states))[(instanceIndex)*8 + ((offset)/8)*4*8 + (offset)%8] ^= (byte)
static void KeccakP1600times4_PermuteAll_24rounds(void *states);
#endif

View File

@ -0,0 +1,198 @@
/*
Implementation by the Keccak, Keyak and Ketje Teams, namely, Guido Bertoni,
Joan Daemen, Michaël Peeters, Gilles Van Assche and Ronny Van Keer, hereby
denoted as "the implementer".
For more information, feedback or questions, please refer to our websites:
http://keccak.noekeon.org/
http://keyak.noekeon.org/
http://ketje.noekeon.org/
To the extent possible under law, the implementer has waived all copyright
and related or neighboring rights to the source code in this file.
http://creativecommons.org/publicdomain/zero/1.0/
*/
#if (defined(FullUnrolling))
#define rounds24 \
prepareTheta \
thetaRhoPiChiIotaPrepareTheta( 0, A, E) \
thetaRhoPiChiIotaPrepareTheta( 1, E, A) \
thetaRhoPiChiIotaPrepareTheta( 2, A, E) \
thetaRhoPiChiIotaPrepareTheta( 3, E, A) \
thetaRhoPiChiIotaPrepareTheta( 4, A, E) \
thetaRhoPiChiIotaPrepareTheta( 5, E, A) \
thetaRhoPiChiIotaPrepareTheta( 6, A, E) \
thetaRhoPiChiIotaPrepareTheta( 7, E, A) \
thetaRhoPiChiIotaPrepareTheta( 8, A, E) \
thetaRhoPiChiIotaPrepareTheta( 9, E, A) \
thetaRhoPiChiIotaPrepareTheta(10, A, E) \
thetaRhoPiChiIotaPrepareTheta(11, E, A) \
thetaRhoPiChiIotaPrepareTheta(12, A, E) \
thetaRhoPiChiIotaPrepareTheta(13, E, A) \
thetaRhoPiChiIotaPrepareTheta(14, A, E) \
thetaRhoPiChiIotaPrepareTheta(15, E, A) \
thetaRhoPiChiIotaPrepareTheta(16, A, E) \
thetaRhoPiChiIotaPrepareTheta(17, E, A) \
thetaRhoPiChiIotaPrepareTheta(18, A, E) \
thetaRhoPiChiIotaPrepareTheta(19, E, A) \
thetaRhoPiChiIotaPrepareTheta(20, A, E) \
thetaRhoPiChiIotaPrepareTheta(21, E, A) \
thetaRhoPiChiIotaPrepareTheta(22, A, E) \
thetaRhoPiChiIota(23, E, A) \
#define rounds12 \
prepareTheta \
thetaRhoPiChiIotaPrepareTheta(12, A, E) \
thetaRhoPiChiIotaPrepareTheta(13, E, A) \
thetaRhoPiChiIotaPrepareTheta(14, A, E) \
thetaRhoPiChiIotaPrepareTheta(15, E, A) \
thetaRhoPiChiIotaPrepareTheta(16, A, E) \
thetaRhoPiChiIotaPrepareTheta(17, E, A) \
thetaRhoPiChiIotaPrepareTheta(18, A, E) \
thetaRhoPiChiIotaPrepareTheta(19, E, A) \
thetaRhoPiChiIotaPrepareTheta(20, A, E) \
thetaRhoPiChiIotaPrepareTheta(21, E, A) \
thetaRhoPiChiIotaPrepareTheta(22, A, E) \
thetaRhoPiChiIota(23, E, A) \
#elif (Unrolling == 12)
#define rounds24 \
prepareTheta \
for(i=0; i<24; i+=12) { \
thetaRhoPiChiIotaPrepareTheta(i , A, E) \
thetaRhoPiChiIotaPrepareTheta(i+ 1, E, A) \
thetaRhoPiChiIotaPrepareTheta(i+ 2, A, E) \
thetaRhoPiChiIotaPrepareTheta(i+ 3, E, A) \
thetaRhoPiChiIotaPrepareTheta(i+ 4, A, E) \
thetaRhoPiChiIotaPrepareTheta(i+ 5, E, A) \
thetaRhoPiChiIotaPrepareTheta(i+ 6, A, E) \
thetaRhoPiChiIotaPrepareTheta(i+ 7, E, A) \
thetaRhoPiChiIotaPrepareTheta(i+ 8, A, E) \
thetaRhoPiChiIotaPrepareTheta(i+ 9, E, A) \
thetaRhoPiChiIotaPrepareTheta(i+10, A, E) \
thetaRhoPiChiIotaPrepareTheta(i+11, E, A) \
} \
#define rounds12 \
prepareTheta \
thetaRhoPiChiIotaPrepareTheta(12, A, E) \
thetaRhoPiChiIotaPrepareTheta(13, E, A) \
thetaRhoPiChiIotaPrepareTheta(14, A, E) \
thetaRhoPiChiIotaPrepareTheta(15, E, A) \
thetaRhoPiChiIotaPrepareTheta(16, A, E) \
thetaRhoPiChiIotaPrepareTheta(17, E, A) \
thetaRhoPiChiIotaPrepareTheta(18, A, E) \
thetaRhoPiChiIotaPrepareTheta(19, E, A) \
thetaRhoPiChiIotaPrepareTheta(20, A, E) \
thetaRhoPiChiIotaPrepareTheta(21, E, A) \
thetaRhoPiChiIotaPrepareTheta(22, A, E) \
thetaRhoPiChiIota(23, E, A) \
#elif (Unrolling == 6)
#define rounds24 \
prepareTheta \
for(i=0; i<24; i+=6) { \
thetaRhoPiChiIotaPrepareTheta(i , A, E) \
thetaRhoPiChiIotaPrepareTheta(i+1, E, A) \
thetaRhoPiChiIotaPrepareTheta(i+2, A, E) \
thetaRhoPiChiIotaPrepareTheta(i+3, E, A) \
thetaRhoPiChiIotaPrepareTheta(i+4, A, E) \
thetaRhoPiChiIotaPrepareTheta(i+5, E, A) \
} \
#define rounds12 \
prepareTheta \
for(i=12; i<24; i+=6) { \
thetaRhoPiChiIotaPrepareTheta(i , A, E) \
thetaRhoPiChiIotaPrepareTheta(i+1, E, A) \
thetaRhoPiChiIotaPrepareTheta(i+2, A, E) \
thetaRhoPiChiIotaPrepareTheta(i+3, E, A) \
thetaRhoPiChiIotaPrepareTheta(i+4, A, E) \
thetaRhoPiChiIotaPrepareTheta(i+5, E, A) \
} \
#elif (Unrolling == 4)
#define rounds24 \
prepareTheta \
for(i=0; i<24; i+=4) { \
thetaRhoPiChiIotaPrepareTheta(i , A, E) \
thetaRhoPiChiIotaPrepareTheta(i+1, E, A) \
thetaRhoPiChiIotaPrepareTheta(i+2, A, E) \
thetaRhoPiChiIotaPrepareTheta(i+3, E, A) \
} \
#define rounds12 \
prepareTheta \
for(i=12; i<24; i+=4) { \
thetaRhoPiChiIotaPrepareTheta(i , A, E) \
thetaRhoPiChiIotaPrepareTheta(i+1, E, A) \
thetaRhoPiChiIotaPrepareTheta(i+2, A, E) \
thetaRhoPiChiIotaPrepareTheta(i+3, E, A) \
} \
#elif (Unrolling == 3)
#define rounds24 \
prepareTheta \
for(i=0; i<24; i+=3) { \
thetaRhoPiChiIotaPrepareTheta(i , A, E) \
thetaRhoPiChiIotaPrepareTheta(i+1, E, A) \
thetaRhoPiChiIotaPrepareTheta(i+2, A, E) \
copyStateVariables(A, E) \
} \
#define rounds12 \
prepareTheta \
for(i=12; i<24; i+=3) { \
thetaRhoPiChiIotaPrepareTheta(i , A, E) \
thetaRhoPiChiIotaPrepareTheta(i+1, E, A) \
thetaRhoPiChiIotaPrepareTheta(i+2, A, E) \
copyStateVariables(A, E) \
} \
#elif (Unrolling == 2)
#define rounds24 \
prepareTheta \
for(i=0; i<24; i+=2) { \
thetaRhoPiChiIotaPrepareTheta(i , A, E) \
thetaRhoPiChiIotaPrepareTheta(i+1, E, A) \
} \
#define rounds12 \
prepareTheta \
for(i=12; i<24; i+=2) { \
thetaRhoPiChiIotaPrepareTheta(i , A, E) \
thetaRhoPiChiIotaPrepareTheta(i+1, E, A) \
} \
#elif (Unrolling == 1)
#define rounds24 \
prepareTheta \
for(i=0; i<24; i++) { \
thetaRhoPiChiIotaPrepareTheta(i , A, E) \
copyStateVariables(A, E) \
} \
#define rounds12 \
prepareTheta \
for(i=12; i<24; i++) { \
thetaRhoPiChiIotaPrepareTheta(i , A, E) \
copyStateVariables(A, E) \
} \
#else
#error "Unrolling is not correctly specified!"
#endif
#define roundsN(__nrounds) \
prepareTheta \
i = 24 - (__nrounds); \
if ((i&1) != 0) { \
thetaRhoPiChiIotaPrepareTheta(i, A, E) \
copyStateVariables(A, E) \
++i; \
} \
for( /* empty */; i<24; i+=2) { \
thetaRhoPiChiIotaPrepareTheta(i , A, E) \
thetaRhoPiChiIotaPrepareTheta(i+1, E, A) \
}

View File

@ -0,0 +1,3 @@
#define KeccakP1600times4_implementation_config "AVX2, all rounds unrolled"
#define KeccakP1600times4_fullUnrolling
#define KeccakP1600times4_useAVX2

View File

@ -0,0 +1,34 @@
/*
Implementation by the Keccak, Keyak and Ketje Teams, namely, Guido Bertoni,
Joan Daemen, Michaël Peeters, Gilles Van Assche and Ronny Van Keer, hereby
denoted as "the implementer".
For more information, feedback or questions, please refer to our websites:
http://keccak.noekeon.org/
http://keyak.noekeon.org/
http://ketje.noekeon.org/
To the extent possible under law, the implementer has waived all copyright
and related or neighboring rights to the source code in this file.
http://creativecommons.org/publicdomain/zero/1.0/
*/
#ifndef _align_h_
#define _align_h_
/* on Mac OS-X and possibly others, ALIGN(x) is defined in param.h, and -Werror chokes on the redef. */
#ifdef ALIGN
#undef ALIGN
#endif
#if defined(__GNUC__)
#define ALIGN(x) __attribute__ ((aligned(x)))
#elif defined(_MSC_VER)
#define ALIGN(x) __declspec(align(x))
#elif defined(__ARMCC_VERSION)
#define ALIGN(x) __align(x)
#else
#define ALIGN(x)
#endif
#endif

View File

@ -0,0 +1,142 @@
/*
---------------------------------------------------------------------------
Copyright (c) 1998-2008, Brian Gladman, Worcester, UK. All rights reserved.
LICENSE TERMS
The redistribution and use of this software (with or without changes)
is allowed without the payment of fees or royalties provided that:
1. source code distributions include the above copyright notice, this
list of conditions and the following disclaimer;
2. binary distributions include the above copyright notice, this list
of conditions and the following disclaimer in their documentation;
3. the name of the copyright holder is not used to endorse products
built using this software without specific written permission.
DISCLAIMER
This software is provided 'as is' with no explicit or implied warranties
in respect of its properties, including, but not limited to, correctness
and/or fitness for purpose.
---------------------------------------------------------------------------
Issue Date: 20/12/2007
Changes for ARM 9/9/2010
*/
#ifndef _BRG_ENDIAN_H
#define _BRG_ENDIAN_H
#define IS_BIG_ENDIAN 4321 /* byte 0 is most significant (mc68k) */
#define IS_LITTLE_ENDIAN 1234 /* byte 0 is least significant (i386) */
#if 0
/* Include files where endian defines and byteswap functions may reside */
#if defined( __sun )
# include <sys/isa_defs.h>
#elif defined( __FreeBSD__ ) || defined( __OpenBSD__ ) || defined( __NetBSD__ )
# include <sys/endian.h>
#elif defined( BSD ) && ( BSD >= 199103 ) || defined( __APPLE__ ) || \
defined( __CYGWIN32__ ) || defined( __DJGPP__ ) || defined( __osf__ )
# include <machine/endian.h>
#elif defined( __linux__ ) || defined( __GNUC__ ) || defined( __GNU_LIBRARY__ )
# if !defined( __MINGW32__ ) && !defined( _AIX )
# include <endian.h>
# if !defined( __BEOS__ )
# include <byteswap.h>
# endif
# endif
#endif
#endif
/* Now attempt to set the define for platform byte order using any */
/* of the four forms SYMBOL, _SYMBOL, __SYMBOL & __SYMBOL__, which */
/* seem to encompass most endian symbol definitions */
#if defined( BIG_ENDIAN ) && defined( LITTLE_ENDIAN )
# if defined( BYTE_ORDER ) && BYTE_ORDER == BIG_ENDIAN
# define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN
# elif defined( BYTE_ORDER ) && BYTE_ORDER == LITTLE_ENDIAN
# define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN
# endif
#elif defined( BIG_ENDIAN )
# define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN
#elif defined( LITTLE_ENDIAN )
# define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN
#endif
#if defined( _BIG_ENDIAN ) && defined( _LITTLE_ENDIAN )
# if defined( _BYTE_ORDER ) && _BYTE_ORDER == _BIG_ENDIAN
# define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN
# elif defined( _BYTE_ORDER ) && _BYTE_ORDER == _LITTLE_ENDIAN
# define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN
# endif
#elif defined( _BIG_ENDIAN )
# define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN
#elif defined( _LITTLE_ENDIAN )
# define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN
#endif
#if defined( __BIG_ENDIAN ) && defined( __LITTLE_ENDIAN )
# if defined( __BYTE_ORDER ) && __BYTE_ORDER == __BIG_ENDIAN
# define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN
# elif defined( __BYTE_ORDER ) && __BYTE_ORDER == __LITTLE_ENDIAN
# define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN
# endif
#elif defined( __BIG_ENDIAN )
# define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN
#elif defined( __LITTLE_ENDIAN )
# define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN
#endif
#if defined( __BIG_ENDIAN__ ) && defined( __LITTLE_ENDIAN__ )
# if defined( __BYTE_ORDER__ ) && __BYTE_ORDER__ == __BIG_ENDIAN__
# define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN
# elif defined( __BYTE_ORDER__ ) && __BYTE_ORDER__ == __LITTLE_ENDIAN__
# define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN
# endif
#elif defined( __BIG_ENDIAN__ )
# define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN
#elif defined( __LITTLE_ENDIAN__ )
# define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN
#endif
/* if the platform byte order could not be determined, then try to */
/* set this define using common machine defines */
#if !defined(PLATFORM_BYTE_ORDER)
#if defined( __alpha__ ) || defined( __alpha ) || defined( i386 ) || \
defined( __i386__ ) || defined( _M_I86 ) || defined( _M_IX86 ) || \
defined( __OS2__ ) || defined( sun386 ) || defined( __TURBOC__ ) || \
defined( vax ) || defined( vms ) || defined( VMS ) || \
defined( __VMS ) || defined( _M_X64 )
# define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN
#elif defined( AMIGA ) || defined( applec ) || defined( __AS400__ ) || \
defined( _CRAY ) || defined( __hppa ) || defined( __hp9000 ) || \
defined( ibm370 ) || defined( mc68000 ) || defined( m68k ) || \
defined( __MRC__ ) || defined( __MVS__ ) || defined( __MWERKS__ ) || \
defined( sparc ) || defined( __sparc) || defined( SYMANTEC_C ) || \
defined( __VOS__ ) || defined( __TIGCC__ ) || defined( __TANDEM ) || \
defined( THINK_C ) || defined( __VMCMS__ ) || defined( _AIX )
# define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN
#elif defined(__arm__)
# ifdef __BIG_ENDIAN
# define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN
# else
# define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN
# endif
#elif 1 /* **** EDIT HERE IF NECESSARY **** */
# define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN
#elif 0 /* **** EDIT HERE IF NECESSARY **** */
# define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN
#else
# error Please edit lines 132 or 134 in brg_endian.h to set the platform byte order
#endif
#endif
#endif

View File

@ -499,6 +499,28 @@ void OQS_SHA3_cshake128_simple(uint8_t *output, size_t outlen, uint16_t cstm, co
*/
void OQS_SHA3_cshake256_simple(uint8_t *output, size_t outlen, uint16_t cstm, const uint8_t *input, size_t inplen);
#ifdef USE_AVX2_INSTRUCTIONS
/**
* \brief Seed 4 parallel SHAKE-128 instances, and generate 4 arrays of pseudo-random bytes.
*
* Uses a vectorized (AVX2) implementation of SHAKE-128.
*
* \warning The output array length must not be zero.
*
* \param output0 The first output byte array
* \param output1 The second output byte array
* \param output2 The third output byte array
* \param output3 The fourth output byte array
* \param outlen The number of output bytes to generate in every output array
* \param in0 The first input seed byte array
* \param in1 The second input seed byte array
* \param in2 The third input seed byte array
* \param in3 The fourth input seed byte array
* \param inlen The number of seed bytes to process from every input array
*/
void OQS_SHA3_shake128_4x(uint8_t *output0, uint8_t *output1, uint8_t *output2, uint8_t *output3, size_t outlen, const uint8_t *in0, const uint8_t *in1, const uint8_t *in2, const uint8_t *in3, size_t inlen);
#endif
#if defined(__cplusplus)
} // extern "C"
#endif

170
src/crypto/sha3/sha3x4.c Executable file
View File

@ -0,0 +1,170 @@
#include <immintrin.h>
#include <stdint.h>
#include <assert.h>
#define SHAKE128_RATE 168
#define NROUNDS 24
#define ROL(a, offset) ((a << offset) ^ (a >> (64-offset)))
/* Read 8 bytes at x as a little-endian 64-bit integer. */
static uint64_t load64(const unsigned char *x)
{
    uint64_t r = 0;
    /* Accumulate from the most significant byte down so each shift
     * makes room for the next lower byte. */
    for (int i = 7; i >= 0; --i) {
        r = (r << 8) | (uint64_t)x[i];
    }
    return r;
}
/* Write u to x as 8 bytes in little-endian order. */
static void store64(uint8_t *x, uint64_t u)
{
    for (unsigned int i = 0; i < 8; ++i) {
        x[i] = (uint8_t)(u >> (8 * i));
    }
}
/* Use implementation from the Keccak Code Package */
#include "keccak4x/KeccakP-1600-times4-SIMD256.c"
#define KeccakF1600_StatePermute4x KeccakP1600times4_PermuteAll_24rounds
/*
 * Absorb four equal-length messages (m0..m3, mlen bytes each) into four
 * interleaved Keccak-1600 sponge states at once.
 *
 * s    : interleaved 4-way state; viewed as 64-bit words, lane j of
 *        instance k is at word index 4*j + k
 * r    : rate in bytes (must be a multiple of 8, and <= 200)
 * p    : domain-separation / first padding byte (0x1F for SHAKE)
 *
 * Full r-byte blocks are XORed in and permuted; the tail is padded into
 * temporary buffers and XORed in WITHOUT a trailing permutation (the
 * squeeze step permutes before reading, so this is the pad-then-permute
 * scheme split across absorb/squeeze).
 */
static void keccak_absorb4x(__m256i *s, unsigned int r, const unsigned char *m0, const unsigned char *m1, const unsigned char *m2, const unsigned char *m3,
unsigned long long int mlen, unsigned char p)
{
unsigned long long i;
/* One padded block per instance; 200 bytes = full Keccak-1600 state size,
 * which is >= any valid rate r. */
unsigned char t0[200];
unsigned char t1[200];
unsigned char t2[200];
unsigned char t3[200];
/* 64-bit view of the interleaved state (lane j, instance k -> ss[4*j+k]). */
unsigned long long *ss = (unsigned long long *)s;
/* Absorb all complete rate-size blocks, permuting after each one. */
while (mlen >= r)
{
for (i = 0; i < r / 8; ++i)
{
ss[4*i+0] ^= load64(m0 + 8 * i);
ss[4*i+1] ^= load64(m1 + 8 * i);
ss[4*i+2] ^= load64(m2 + 8 * i);
ss[4*i+3] ^= load64(m3 + 8 * i);
}
KeccakF1600_StatePermute4x(s);
mlen -= r;
m0 += r;
m1 += r;
m2 += r;
m3 += r;
}
/* Build the final padded block: zero-fill the whole rate... */
for (i = 0; i < r; ++i)
{
t0[i] = 0;
t1[i] = 0;
t2[i] = 0;
t3[i] = 0;
}
/* ...copy the remaining message tail... */
for (i = 0; i < mlen; ++i)
{
t0[i] = m0[i];
t1[i] = m1[i];
t2[i] = m2[i];
t3[i] = m3[i];
}
/* ...append the domain-separation byte right after the tail
 * (here i == mlen < r, so this index is in range)... */
t0[i] = p;
t1[i] = p;
t2[i] = p;
t3[i] = p;
/* ...and set the final pad bit (0x80) in the last byte of the rate.
 * When mlen == r-1 this ORs into the same byte as p, which is the
 * correct pad10*1 behavior. */
t0[r - 1] |= 128;
t1[r - 1] |= 128;
t2[r - 1] |= 128;
t3[r - 1] |= 128;
/* XOR the padded block into the states; the squeeze step permutes. */
for (i = 0; i < r / 8; ++i)
{
ss[4*i+0] ^= load64(t0 + 8 * i);
ss[4*i+1] ^= load64(t1 + 8 * i);
ss[4*i+2] ^= load64(t2 + 8 * i);
ss[4*i+3] ^= load64(t3 + 8 * i);
}
}
/*
 * Squeeze `nblocks` full rate-size blocks out of the 4-way interleaved
 * Keccak state `s`, writing one output stream to each of h0..h3.
 * r is the rate in bytes (a multiple of 8). The permutation runs BEFORE
 * each block is read, matching the pad-without-permute done in absorb.
 */
static void keccak_squeezeblocks4x(unsigned char *h0, unsigned char *h1, unsigned char *h2, unsigned char *h3, unsigned long long int nblocks, __m256i *s, unsigned int r)
{
    unsigned int lane;
    /* 64-bit view: lane j of instance k sits at words[4*j + k]. */
    unsigned long long *words = (unsigned long long *)s;
    for (; nblocks > 0; --nblocks)
    {
        KeccakF1600_StatePermute4x(s);
        /* De-interleave the rate portion into the four output streams. */
        for (lane = 0; lane < r / 8; ++lane)
        {
            store64(h0 + 8 * lane, words[4 * lane + 0]);
            store64(h1 + 8 * lane, words[4 * lane + 1]);
            store64(h2 + 8 * lane, words[4 * lane + 2]);
            store64(h3 + 8 * lane, words[4 * lane + 3]);
        }
        h0 += r;
        h1 += r;
        h2 += r;
        h3 += r;
    }
}
/********** SHAKE128 ***********/
/*
 * Initialize the interleaved 4-way state to all zeros and absorb the
 * four equal-length inputs with the SHAKE domain-separation byte 0x1F.
 */
static void shake128_absorb4x(__m256i *s, const unsigned char *in0, const unsigned char *in1, const unsigned char *in2, const unsigned char *in3, unsigned long long inlen)
{
    unsigned int lane;
    /* Start from an all-zero state (capacity and rate). */
    for (lane = 0; lane < 25; ++lane)
    {
        s[lane] = _mm256_setzero_si256();
    }
    /* 0x1F = SHAKE suffix bits 1111 plus the first pad bit. */
    keccak_absorb4x(s, SHAKE128_RATE, in0, in1, in2, in3, inlen, 0x1F);
}
/* N is assumed to be empty; S is assumed to have at most 2 characters */
/*
 * Seed 4 parallel SHAKE-128 instances and generate outlen pseudo-random
 * bytes into each of output0..output3 from seeds in0..in3 (inlen bytes
 * each).
 *
 * Parameter types (uint8_t * / size_t) match the public declaration in
 * the OQS_SHA3 header; the previous unsigned char * / unsigned long long
 * signature differed from the declared prototype, which is undefined
 * behavior on platforms where size_t is not unsigned long long.
 */
void OQS_SHA3_shake128_4x(uint8_t *output0, uint8_t *output1, uint8_t *output2, uint8_t *output3, size_t outlen,
                          const uint8_t *in0, const uint8_t *in1, const uint8_t *in2, const uint8_t *in3, size_t inlen)
{
    __m256i s[25];
    /* Scratch block for the final partial squeeze, one per instance. */
    uint8_t t0[SHAKE128_RATE];
    uint8_t t1[SHAKE128_RATE];
    uint8_t t2[SHAKE128_RATE];
    uint8_t t3[SHAKE128_RATE];
    size_t i;
    size_t whole = outlen / SHAKE128_RATE; /* number of full output blocks */
    size_t rem = outlen % SHAKE128_RATE;   /* leftover bytes */

    shake128_absorb4x(s, in0, in1, in2, in3, inlen);

    /* Squeeze all complete blocks directly into the output buffers. */
    keccak_squeezeblocks4x(output0, output1, output2, output3, whole, s, SHAKE128_RATE);
    output0 += whole * SHAKE128_RATE;
    output1 += whole * SHAKE128_RATE;
    output2 += whole * SHAKE128_RATE;
    output3 += whole * SHAKE128_RATE;

    /* Squeeze one more block into scratch and copy only the tail. */
    if (rem)
    {
        keccak_squeezeblocks4x(t0, t1, t2, t3, 1, s, SHAKE128_RATE);
        for (i = 0; i < rem; i++)
        {
            output0[i] = t0[i];
            output1[i] = t1[i];
            output2[i] = t2[i];
            output3[i] = t3[i];
        }
    }
}

View File

@ -50,7 +50,8 @@ int frodo_mul_add_as_plus_e(uint16_t *out, const uint16_t *s, const uint16_t *e,
}
OQS_AES128_ECB_enc_sch((uint8_t*)a_row_temp, 4*PARAMS_N*sizeof(int16_t), aes_key_schedule, (uint8_t*)a_row);
#elif defined (USE_SHAKE128_FOR_A)
#elif defined (USE_SHAKE128_FOR_A)
#ifndef USE_AVX2_INSTRUCTIONS
uint8_t seed_A_separated[2 + BYTES_SEED_A];
uint16_t* seed_A_origin = (uint16_t*)&seed_A_separated;
memcpy(&seed_A_separated[2], seed_A, BYTES_SEED_A);
@ -63,6 +64,27 @@ int frodo_mul_add_as_plus_e(uint16_t *out, const uint16_t *s, const uint16_t *e,
OQS_SHA3_shake128((unsigned char*)(a_row + 2*PARAMS_N), (unsigned long long)(2*PARAMS_N), seed_A_separated, 2 + BYTES_SEED_A);
seed_A_origin[0] = UINT16_TO_LE(i + 3);
OQS_SHA3_shake128((unsigned char*)(a_row + 3*PARAMS_N), (unsigned long long)(2*PARAMS_N), seed_A_separated, 2 + BYTES_SEED_A);
#else
uint8_t seed_A_separated_0[2 + BYTES_SEED_A];
uint8_t seed_A_separated_1[2 + BYTES_SEED_A];
uint8_t seed_A_separated_2[2 + BYTES_SEED_A];
uint8_t seed_A_separated_3[2 + BYTES_SEED_A];
uint16_t* seed_A_origin_0 = (uint16_t*)&seed_A_separated_0;
uint16_t* seed_A_origin_1 = (uint16_t*)&seed_A_separated_1;
uint16_t* seed_A_origin_2 = (uint16_t*)&seed_A_separated_2;
uint16_t* seed_A_origin_3 = (uint16_t*)&seed_A_separated_3;
memcpy(&seed_A_separated_0[2], seed_A, BYTES_SEED_A);
memcpy(&seed_A_separated_1[2], seed_A, BYTES_SEED_A);
memcpy(&seed_A_separated_2[2], seed_A, BYTES_SEED_A);
memcpy(&seed_A_separated_3[2], seed_A, BYTES_SEED_A);
for (i = 0; i < PARAMS_N; i += 4) {
seed_A_origin_0[0] = UINT16_TO_LE(i + 0);
seed_A_origin_1[0] = UINT16_TO_LE(i + 1);
seed_A_origin_2[0] = UINT16_TO_LE(i + 2);
seed_A_origin_3[0] = UINT16_TO_LE(i + 3);
OQS_SHA3_shake128_4x((unsigned char*)(a_row), (unsigned char*)(a_row + PARAMS_N), (unsigned char*)(a_row + 2*PARAMS_N), (unsigned char*)(a_row + 3*PARAMS_N),
(unsigned long long)(2*PARAMS_N), seed_A_separated_0, seed_A_separated_1, seed_A_separated_2, seed_A_separated_3, 2 + BYTES_SEED_A);
#endif
#endif
for (k = 0; k < 4 * PARAMS_N; k++) {
a_row[k] = LE_TO_UINT16(a_row[k]);
@ -184,6 +206,7 @@ int frodo_mul_add_sa_plus_e(uint16_t *out, const uint16_t *s, const uint16_t *e,
int t=0;
ALIGN_HEADER(32) uint16_t a_cols[4*PARAMS_N] ALIGN_FOOTER(32) = {0};
#ifndef USE_AVX2_INSTRUCTIONS
int k;
uint8_t seed_A_separated[2 + BYTES_SEED_A];
uint16_t* seed_A_origin = (uint16_t*)&seed_A_separated;
@ -201,7 +224,6 @@ int frodo_mul_add_sa_plus_e(uint16_t *out, const uint16_t *s, const uint16_t *e,
a_cols[i] = LE_TO_UINT16(a_cols[i]);
}
#ifndef USE_AVX2_INSTRUCTIONS
for (i = 0; i < PARAMS_NBAR; i++) {
uint16_t sum[PARAMS_N] = {0};
for (j = 0; j < 4; j++) {
@ -215,6 +237,25 @@ int frodo_mul_add_sa_plus_e(uint16_t *out, const uint16_t *s, const uint16_t *e,
}
}
#else
uint8_t seed_A_separated_0[2 + BYTES_SEED_A];
uint8_t seed_A_separated_1[2 + BYTES_SEED_A];
uint8_t seed_A_separated_2[2 + BYTES_SEED_A];
uint8_t seed_A_separated_3[2 + BYTES_SEED_A];
uint16_t* seed_A_origin_0 = (uint16_t*)&seed_A_separated_0;
uint16_t* seed_A_origin_1 = (uint16_t*)&seed_A_separated_1;
uint16_t* seed_A_origin_2 = (uint16_t*)&seed_A_separated_2;
uint16_t* seed_A_origin_3 = (uint16_t*)&seed_A_separated_3;
memcpy(&seed_A_separated_0[2], seed_A, BYTES_SEED_A);
memcpy(&seed_A_separated_1[2], seed_A, BYTES_SEED_A);
memcpy(&seed_A_separated_2[2], seed_A, BYTES_SEED_A);
memcpy(&seed_A_separated_3[2], seed_A, BYTES_SEED_A);
for (kk = 0; kk < PARAMS_N; kk+=4) {
seed_A_origin_0[0] = UINT16_TO_LE(kk + 0);
seed_A_origin_1[0] = UINT16_TO_LE(kk + 1);
seed_A_origin_2[0] = UINT16_TO_LE(kk + 2);
seed_A_origin_3[0] = UINT16_TO_LE(kk + 3);
OQS_SHA3_shake128_4x((unsigned char*)(a_cols), (unsigned char*)(a_cols + PARAMS_N), (unsigned char*)(a_cols + 2*PARAMS_N), (unsigned char*)(a_cols + 3*PARAMS_N),
(unsigned long long)(2*PARAMS_N), seed_A_separated_0, seed_A_separated_1, seed_A_separated_2, seed_A_separated_3, 2 + BYTES_SEED_A);
for (i = 0; i < PARAMS_NBAR; i++) {
__m256i a, b0, b1, b2, b3, acc[PARAMS_N/16];
b0 = _mm256_set1_epi16(s[i*PARAMS_N + kk + 0]);
@ -237,8 +278,8 @@ int frodo_mul_add_sa_plus_e(uint16_t *out, const uint16_t *s, const uint16_t *e,
acc[j/16] = _mm256_add_epi16(a, acc[j/16]);
}
for (k = 0; k < PARAMS_N/16; k++) {
_mm256_store_si256((__m256i*)&out[i*PARAMS_N + 16*k], acc[k]);
for (j = 0; j < PARAMS_N/16; j++) {
_mm256_store_si256((__m256i*)&out[i*PARAMS_N + 16*j], acc[j]);
}
}
#endif