Add SHAKE-128 vectorized 4x implementation and use in Frodo

This commit is contained in:
Douglas Stebila 2019-07-08 21:10:04 -04:00
parent 5f383ebe56
commit cdcedc6ff3
10 changed files with 1083 additions and 4 deletions

View File

@ -5,5 +5,8 @@ libsha3_la_SOURCES = sha3_c.c
if USE_SHA3_OPENSSL
libsha3_la_SOURCES += sha3_ossl.c
endif
if USE_AVX2_INSTRUCTIONS
libsha3_la_SOURCES += sha3x4.c
endif
sha3_c.c: fips202.c sp800-185.c

View File

@ -0,0 +1,427 @@
/*
Implementation by the Keccak, Keyak and Ketje Teams, namely, Guido Bertoni,
Joan Daemen, Michaël Peeters, Gilles Van Assche and Ronny Van Keer, hereby
denoted as "the implementer".
For more information, feedback or questions, please refer to our websites:
http://keccak.noekeon.org/
http://keyak.noekeon.org/
http://ketje.noekeon.org/
To the extent possible under law, the implementer has waived all copyright
and related or neighboring rights to the source code in this file.
http://creativecommons.org/publicdomain/zero/1.0/
*/
/* Simplified for liboqs: remove code that wasn't used in shake128_4x */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <smmintrin.h>
#include <wmmintrin.h>
#include <immintrin.h>
#include <emmintrin.h>
#include "align.h"
#include "KeccakP-1600-times4-SnP.h"
#include "SIMD256-config.h"
#include "brg_endian.h"
#if (PLATFORM_BYTE_ORDER != IS_LITTLE_ENDIAN)
#error Expecting a little-endian platform
#endif
typedef unsigned char UINT8;
typedef unsigned long long int UINT64;
typedef __m128i V128;
typedef __m256i V256;
#define laneIndex(instanceIndex, lanePosition) ((lanePosition)*4 + instanceIndex)
#if defined(KeccakP1600times4_useAVX2)
#define ANDnu256(a, b) _mm256_andnot_si256(a, b)
#define CONST256(a) _mm256_load_si256((const V256 *)&(a))
#define CONST256_64(a) (V256)_mm256_broadcast_sd((const double*)(&a))
#define LOAD256(a) _mm256_load_si256((const V256 *)&(a))
#define LOAD256u(a) _mm256_loadu_si256((const V256 *)&(a))
#define LOAD4_64(a, b, c, d) _mm256_set_epi64x((UINT64)(a), (UINT64)(b), (UINT64)(c), (UINT64)(d))
#define ROL64in256(d, a, o) d = _mm256_or_si256(_mm256_slli_epi64(a, o), _mm256_srli_epi64(a, 64-(o)))
#define ROL64in256_8(d, a) d = _mm256_shuffle_epi8(a, CONST256(rho8))
#define ROL64in256_56(d, a) d = _mm256_shuffle_epi8(a, CONST256(rho56))
static const UINT64 rho8[4] = {0x0605040302010007, 0x0E0D0C0B0A09080F, 0x1615141312111017, 0x1E1D1C1B1A19181F};
static const UINT64 rho56[4] = {0x0007060504030201, 0x080F0E0D0C0B0A09, 0x1017161514131211, 0x181F1E1D1C1B1A19};
#define STORE256(a, b) _mm256_store_si256((V256 *)&(a), b)
#define STORE256u(a, b) _mm256_storeu_si256((V256 *)&(a), b)
#define STORE2_128(ah, al, v) _mm256_storeu2_m128d((V128*)&(ah), (V128*)&(al), v)
#define XOR256(a, b) _mm256_xor_si256(a, b)
#define XOReq256(a, b) a = _mm256_xor_si256(a, b)
#define UNPACKL( a, b ) _mm256_unpacklo_epi64((a), (b))
#define UNPACKH( a, b ) _mm256_unpackhi_epi64((a), (b))
#define PERM128( a, b, c ) (V256)_mm256_permute2f128_ps((__m256)(a), (__m256)(b), c)
#define SHUFFLE64( a, b, c ) (V256)_mm256_shuffle_pd((__m256d)(a), (__m256d)(b), c)
#define UNINTLEAVE() lanesL01 = UNPACKL( lanes0, lanes1 ), \
lanesH01 = UNPACKH( lanes0, lanes1 ), \
lanesL23 = UNPACKL( lanes2, lanes3 ), \
lanesH23 = UNPACKH( lanes2, lanes3 ), \
lanes0 = PERM128( lanesL01, lanesL23, 0x20 ), \
lanes2 = PERM128( lanesL01, lanesL23, 0x31 ), \
lanes1 = PERM128( lanesH01, lanesH23, 0x20 ), \
lanes3 = PERM128( lanesH01, lanesH23, 0x31 )
#define INTLEAVE() lanesL01 = PERM128( lanes0, lanes2, 0x20 ), \
lanesH01 = PERM128( lanes1, lanes3, 0x20 ), \
lanesL23 = PERM128( lanes0, lanes2, 0x31 ), \
lanesH23 = PERM128( lanes1, lanes3, 0x31 ), \
lanes0 = SHUFFLE64( lanesL01, lanesH01, 0x00 ), \
lanes1 = SHUFFLE64( lanesL01, lanesH01, 0x0F ), \
lanes2 = SHUFFLE64( lanesL23, lanesH23, 0x00 ), \
lanes3 = SHUFFLE64( lanesL23, lanesH23, 0x0F )
#endif
#define SnP_laneLengthInBytes 8
#define declareABCDE \
V256 Aba, Abe, Abi, Abo, Abu; \
V256 Aga, Age, Agi, Ago, Agu; \
V256 Aka, Ake, Aki, Ako, Aku; \
V256 Ama, Ame, Ami, Amo, Amu; \
V256 Asa, Ase, Asi, Aso, Asu; \
V256 Bba, Bbe, Bbi, Bbo, Bbu; \
V256 Bga, Bge, Bgi, Bgo, Bgu; \
V256 Bka, Bke, Bki, Bko, Bku; \
V256 Bma, Bme, Bmi, Bmo, Bmu; \
V256 Bsa, Bse, Bsi, Bso, Bsu; \
V256 Ca, Ce, Ci, Co, Cu; \
V256 Ca1, Ce1, Ci1, Co1, Cu1; \
V256 Da, De, Di, Do, Du; \
V256 Eba, Ebe, Ebi, Ebo, Ebu; \
V256 Ega, Ege, Egi, Ego, Egu; \
V256 Eka, Eke, Eki, Eko, Eku; \
V256 Ema, Eme, Emi, Emo, Emu; \
V256 Esa, Ese, Esi, Eso, Esu; \
#define prepareTheta \
Ca = XOR256(Aba, XOR256(Aga, XOR256(Aka, XOR256(Ama, Asa)))); \
Ce = XOR256(Abe, XOR256(Age, XOR256(Ake, XOR256(Ame, Ase)))); \
Ci = XOR256(Abi, XOR256(Agi, XOR256(Aki, XOR256(Ami, Asi)))); \
Co = XOR256(Abo, XOR256(Ago, XOR256(Ako, XOR256(Amo, Aso)))); \
Cu = XOR256(Abu, XOR256(Agu, XOR256(Aku, XOR256(Amu, Asu)))); \
/* --- Theta Rho Pi Chi Iota Prepare-theta */
/* --- 64-bit lanes mapped to 64-bit words */
#define thetaRhoPiChiIotaPrepareTheta(i, A, E) \
ROL64in256(Ce1, Ce, 1); \
Da = XOR256(Cu, Ce1); \
ROL64in256(Ci1, Ci, 1); \
De = XOR256(Ca, Ci1); \
ROL64in256(Co1, Co, 1); \
Di = XOR256(Ce, Co1); \
ROL64in256(Cu1, Cu, 1); \
Do = XOR256(Ci, Cu1); \
ROL64in256(Ca1, Ca, 1); \
Du = XOR256(Co, Ca1); \
\
XOReq256(A##ba, Da); \
Bba = A##ba; \
XOReq256(A##ge, De); \
ROL64in256(Bbe, A##ge, 44); \
XOReq256(A##ki, Di); \
ROL64in256(Bbi, A##ki, 43); \
E##ba = XOR256(Bba, ANDnu256(Bbe, Bbi)); \
XOReq256(E##ba, CONST256_64(KeccakF1600RoundConstants[i])); \
Ca = E##ba; \
XOReq256(A##mo, Do); \
ROL64in256(Bbo, A##mo, 21); \
E##be = XOR256(Bbe, ANDnu256(Bbi, Bbo)); \
Ce = E##be; \
XOReq256(A##su, Du); \
ROL64in256(Bbu, A##su, 14); \
E##bi = XOR256(Bbi, ANDnu256(Bbo, Bbu)); \
Ci = E##bi; \
E##bo = XOR256(Bbo, ANDnu256(Bbu, Bba)); \
Co = E##bo; \
E##bu = XOR256(Bbu, ANDnu256(Bba, Bbe)); \
Cu = E##bu; \
\
XOReq256(A##bo, Do); \
ROL64in256(Bga, A##bo, 28); \
XOReq256(A##gu, Du); \
ROL64in256(Bge, A##gu, 20); \
XOReq256(A##ka, Da); \
ROL64in256(Bgi, A##ka, 3); \
E##ga = XOR256(Bga, ANDnu256(Bge, Bgi)); \
XOReq256(Ca, E##ga); \
XOReq256(A##me, De); \
ROL64in256(Bgo, A##me, 45); \
E##ge = XOR256(Bge, ANDnu256(Bgi, Bgo)); \
XOReq256(Ce, E##ge); \
XOReq256(A##si, Di); \
ROL64in256(Bgu, A##si, 61); \
E##gi = XOR256(Bgi, ANDnu256(Bgo, Bgu)); \
XOReq256(Ci, E##gi); \
E##go = XOR256(Bgo, ANDnu256(Bgu, Bga)); \
XOReq256(Co, E##go); \
E##gu = XOR256(Bgu, ANDnu256(Bga, Bge)); \
XOReq256(Cu, E##gu); \
\
XOReq256(A##be, De); \
ROL64in256(Bka, A##be, 1); \
XOReq256(A##gi, Di); \
ROL64in256(Bke, A##gi, 6); \
XOReq256(A##ko, Do); \
ROL64in256(Bki, A##ko, 25); \
E##ka = XOR256(Bka, ANDnu256(Bke, Bki)); \
XOReq256(Ca, E##ka); \
XOReq256(A##mu, Du); \
ROL64in256_8(Bko, A##mu); \
E##ke = XOR256(Bke, ANDnu256(Bki, Bko)); \
XOReq256(Ce, E##ke); \
XOReq256(A##sa, Da); \
ROL64in256(Bku, A##sa, 18); \
E##ki = XOR256(Bki, ANDnu256(Bko, Bku)); \
XOReq256(Ci, E##ki); \
E##ko = XOR256(Bko, ANDnu256(Bku, Bka)); \
XOReq256(Co, E##ko); \
E##ku = XOR256(Bku, ANDnu256(Bka, Bke)); \
XOReq256(Cu, E##ku); \
\
XOReq256(A##bu, Du); \
ROL64in256(Bma, A##bu, 27); \
XOReq256(A##ga, Da); \
ROL64in256(Bme, A##ga, 36); \
XOReq256(A##ke, De); \
ROL64in256(Bmi, A##ke, 10); \
E##ma = XOR256(Bma, ANDnu256(Bme, Bmi)); \
XOReq256(Ca, E##ma); \
XOReq256(A##mi, Di); \
ROL64in256(Bmo, A##mi, 15); \
E##me = XOR256(Bme, ANDnu256(Bmi, Bmo)); \
XOReq256(Ce, E##me); \
XOReq256(A##so, Do); \
ROL64in256_56(Bmu, A##so); \
E##mi = XOR256(Bmi, ANDnu256(Bmo, Bmu)); \
XOReq256(Ci, E##mi); \
E##mo = XOR256(Bmo, ANDnu256(Bmu, Bma)); \
XOReq256(Co, E##mo); \
E##mu = XOR256(Bmu, ANDnu256(Bma, Bme)); \
XOReq256(Cu, E##mu); \
\
XOReq256(A##bi, Di); \
ROL64in256(Bsa, A##bi, 62); \
XOReq256(A##go, Do); \
ROL64in256(Bse, A##go, 55); \
XOReq256(A##ku, Du); \
ROL64in256(Bsi, A##ku, 39); \
E##sa = XOR256(Bsa, ANDnu256(Bse, Bsi)); \
XOReq256(Ca, E##sa); \
XOReq256(A##ma, Da); \
ROL64in256(Bso, A##ma, 41); \
E##se = XOR256(Bse, ANDnu256(Bsi, Bso)); \
XOReq256(Ce, E##se); \
XOReq256(A##se, De); \
ROL64in256(Bsu, A##se, 2); \
E##si = XOR256(Bsi, ANDnu256(Bso, Bsu)); \
XOReq256(Ci, E##si); \
E##so = XOR256(Bso, ANDnu256(Bsu, Bsa)); \
XOReq256(Co, E##so); \
E##su = XOR256(Bsu, ANDnu256(Bsa, Bse)); \
XOReq256(Cu, E##su); \
\
/* --- Theta Rho Pi Chi Iota */
/* --- 64-bit lanes mapped to 64-bit words */
#define thetaRhoPiChiIota(i, A, E) \
ROL64in256(Ce1, Ce, 1); \
Da = XOR256(Cu, Ce1); \
ROL64in256(Ci1, Ci, 1); \
De = XOR256(Ca, Ci1); \
ROL64in256(Co1, Co, 1); \
Di = XOR256(Ce, Co1); \
ROL64in256(Cu1, Cu, 1); \
Do = XOR256(Ci, Cu1); \
ROL64in256(Ca1, Ca, 1); \
Du = XOR256(Co, Ca1); \
\
XOReq256(A##ba, Da); \
Bba = A##ba; \
XOReq256(A##ge, De); \
ROL64in256(Bbe, A##ge, 44); \
XOReq256(A##ki, Di); \
ROL64in256(Bbi, A##ki, 43); \
E##ba = XOR256(Bba, ANDnu256(Bbe, Bbi)); \
XOReq256(E##ba, CONST256_64(KeccakF1600RoundConstants[i])); \
XOReq256(A##mo, Do); \
ROL64in256(Bbo, A##mo, 21); \
E##be = XOR256(Bbe, ANDnu256(Bbi, Bbo)); \
XOReq256(A##su, Du); \
ROL64in256(Bbu, A##su, 14); \
E##bi = XOR256(Bbi, ANDnu256(Bbo, Bbu)); \
E##bo = XOR256(Bbo, ANDnu256(Bbu, Bba)); \
E##bu = XOR256(Bbu, ANDnu256(Bba, Bbe)); \
\
XOReq256(A##bo, Do); \
ROL64in256(Bga, A##bo, 28); \
XOReq256(A##gu, Du); \
ROL64in256(Bge, A##gu, 20); \
XOReq256(A##ka, Da); \
ROL64in256(Bgi, A##ka, 3); \
E##ga = XOR256(Bga, ANDnu256(Bge, Bgi)); \
XOReq256(A##me, De); \
ROL64in256(Bgo, A##me, 45); \
E##ge = XOR256(Bge, ANDnu256(Bgi, Bgo)); \
XOReq256(A##si, Di); \
ROL64in256(Bgu, A##si, 61); \
E##gi = XOR256(Bgi, ANDnu256(Bgo, Bgu)); \
E##go = XOR256(Bgo, ANDnu256(Bgu, Bga)); \
E##gu = XOR256(Bgu, ANDnu256(Bga, Bge)); \
\
XOReq256(A##be, De); \
ROL64in256(Bka, A##be, 1); \
XOReq256(A##gi, Di); \
ROL64in256(Bke, A##gi, 6); \
XOReq256(A##ko, Do); \
ROL64in256(Bki, A##ko, 25); \
E##ka = XOR256(Bka, ANDnu256(Bke, Bki)); \
XOReq256(A##mu, Du); \
ROL64in256_8(Bko, A##mu); \
E##ke = XOR256(Bke, ANDnu256(Bki, Bko)); \
XOReq256(A##sa, Da); \
ROL64in256(Bku, A##sa, 18); \
E##ki = XOR256(Bki, ANDnu256(Bko, Bku)); \
E##ko = XOR256(Bko, ANDnu256(Bku, Bka)); \
E##ku = XOR256(Bku, ANDnu256(Bka, Bke)); \
\
XOReq256(A##bu, Du); \
ROL64in256(Bma, A##bu, 27); \
XOReq256(A##ga, Da); \
ROL64in256(Bme, A##ga, 36); \
XOReq256(A##ke, De); \
ROL64in256(Bmi, A##ke, 10); \
E##ma = XOR256(Bma, ANDnu256(Bme, Bmi)); \
XOReq256(A##mi, Di); \
ROL64in256(Bmo, A##mi, 15); \
E##me = XOR256(Bme, ANDnu256(Bmi, Bmo)); \
XOReq256(A##so, Do); \
ROL64in256_56(Bmu, A##so); \
E##mi = XOR256(Bmi, ANDnu256(Bmo, Bmu)); \
E##mo = XOR256(Bmo, ANDnu256(Bmu, Bma)); \
E##mu = XOR256(Bmu, ANDnu256(Bma, Bme)); \
\
XOReq256(A##bi, Di); \
ROL64in256(Bsa, A##bi, 62); \
XOReq256(A##go, Do); \
ROL64in256(Bse, A##go, 55); \
XOReq256(A##ku, Du); \
ROL64in256(Bsi, A##ku, 39); \
E##sa = XOR256(Bsa, ANDnu256(Bse, Bsi)); \
XOReq256(A##ma, Da); \
ROL64in256(Bso, A##ma, 41); \
E##se = XOR256(Bse, ANDnu256(Bsi, Bso)); \
XOReq256(A##se, De); \
ROL64in256(Bsu, A##se, 2); \
E##si = XOR256(Bsi, ANDnu256(Bso, Bsu)); \
E##so = XOR256(Bso, ANDnu256(Bsu, Bsa)); \
E##su = XOR256(Bsu, ANDnu256(Bsa, Bse)); \
\
static ALIGN(KeccakP1600times4_statesAlignment) const UINT64 KeccakF1600RoundConstants[24] = {
0x0000000000000001ULL,
0x0000000000008082ULL,
0x800000000000808aULL,
0x8000000080008000ULL,
0x000000000000808bULL,
0x0000000080000001ULL,
0x8000000080008081ULL,
0x8000000000008009ULL,
0x000000000000008aULL,
0x0000000000000088ULL,
0x0000000080008009ULL,
0x000000008000000aULL,
0x000000008000808bULL,
0x800000000000008bULL,
0x8000000000008089ULL,
0x8000000000008003ULL,
0x8000000000008002ULL,
0x8000000000000080ULL,
0x000000000000800aULL,
0x800000008000000aULL,
0x8000000080008081ULL,
0x8000000000008080ULL,
0x0000000080000001ULL,
0x8000000080008008ULL};
#define copyFromState(X, state) \
X##ba = LOAD256(state[ 0]); \
X##be = LOAD256(state[ 1]); \
X##bi = LOAD256(state[ 2]); \
X##bo = LOAD256(state[ 3]); \
X##bu = LOAD256(state[ 4]); \
X##ga = LOAD256(state[ 5]); \
X##ge = LOAD256(state[ 6]); \
X##gi = LOAD256(state[ 7]); \
X##go = LOAD256(state[ 8]); \
X##gu = LOAD256(state[ 9]); \
X##ka = LOAD256(state[10]); \
X##ke = LOAD256(state[11]); \
X##ki = LOAD256(state[12]); \
X##ko = LOAD256(state[13]); \
X##ku = LOAD256(state[14]); \
X##ma = LOAD256(state[15]); \
X##me = LOAD256(state[16]); \
X##mi = LOAD256(state[17]); \
X##mo = LOAD256(state[18]); \
X##mu = LOAD256(state[19]); \
X##sa = LOAD256(state[20]); \
X##se = LOAD256(state[21]); \
X##si = LOAD256(state[22]); \
X##so = LOAD256(state[23]); \
X##su = LOAD256(state[24]); \
#define copyToState(state, X) \
STORE256(state[ 0], X##ba); \
STORE256(state[ 1], X##be); \
STORE256(state[ 2], X##bi); \
STORE256(state[ 3], X##bo); \
STORE256(state[ 4], X##bu); \
STORE256(state[ 5], X##ga); \
STORE256(state[ 6], X##ge); \
STORE256(state[ 7], X##gi); \
STORE256(state[ 8], X##go); \
STORE256(state[ 9], X##gu); \
STORE256(state[10], X##ka); \
STORE256(state[11], X##ke); \
STORE256(state[12], X##ki); \
STORE256(state[13], X##ko); \
STORE256(state[14], X##ku); \
STORE256(state[15], X##ma); \
STORE256(state[16], X##me); \
STORE256(state[17], X##mi); \
STORE256(state[18], X##mo); \
STORE256(state[19], X##mu); \
STORE256(state[20], X##sa); \
STORE256(state[21], X##se); \
STORE256(state[22], X##si); \
STORE256(state[23], X##so); \
STORE256(state[24], X##su); \
#ifdef KeccakP1600times4_fullUnrolling
#define FullUnrolling
#else
#define Unrolling KeccakP1600times4_unrolling
#endif
#include "KeccakP-1600-unrolling.macros"
/*
 * Apply the full 24-round Keccak-f[1600] permutation to 4 interleaved
 * 1600-bit states stored in `states`.
 *
 * `states` must be 32-byte aligned (copyFromState/copyToState expand to
 * aligned 256-bit loads/stores via LOAD256/STORE256), and lane j of
 * instance k lives at 256-bit word index j, 64-bit sub-lane k.
 */
static void KeccakP1600times4_PermuteAll_24rounds(void *states)
{
V256 *statesAsLanes = (V256 *)states;
/* Declares the A/B/C/D/E working registers used by the round macros. */
declareABCDE
#ifndef KeccakP1600times4_fullUnrolling
/* Loop counter only needed when rounds24 expands to a loop. */
unsigned int i;
#endif
copyFromState(A, statesAsLanes)
rounds24
copyToState(statesAsLanes, A)
}

View File

@ -0,0 +1,39 @@
/*
Implementation by the Keccak, Keyak and Ketje Teams, namely, Guido Bertoni,
Joan Daemen, Michaël Peeters, Gilles Van Assche and Ronny Van Keer, hereby
denoted as "the implementer".
For more information, feedback or questions, please refer to our websites:
http://keccak.noekeon.org/
http://keyak.noekeon.org/
http://ketje.noekeon.org/
To the extent possible under law, the implementer has waived all copyright
and related or neighboring rights to the source code in this file.
http://creativecommons.org/publicdomain/zero/1.0/
*/
/* Simplified for liboqs: remove code that wasn't used in shake128_4x */
#ifndef _KeccakP_1600_times4_SnP_h_
#define _KeccakP_1600_times4_SnP_h_
/** For the documentation, see PlSnP-documentation.h.
*/
#include "SIMD256-config.h"
#define KeccakP1600times4_implementation "256-bit SIMD implementation (" KeccakP1600times4_implementation_config ")"
#define KeccakP1600times4_statesSizeInBytes 800
#define KeccakP1600times4_statesAlignment 32
#define KeccakF1600times4_FastLoop_supported
#define KeccakP1600times4_12rounds_FastLoop_supported
#include <stddef.h>
#define KeccakP1600times4_StaticInitialize()
#define KeccakP1600times4_AddByte(states, instanceIndex, byte, offset) \
((unsigned char*)(states))[(instanceIndex)*8 + ((offset)/8)*4*8 + (offset)%8] ^= (byte)
static void KeccakP1600times4_PermuteAll_24rounds(void *states);
#endif

View File

@ -0,0 +1,198 @@
/*
Implementation by the Keccak, Keyak and Ketje Teams, namely, Guido Bertoni,
Joan Daemen, Michaël Peeters, Gilles Van Assche and Ronny Van Keer, hereby
denoted as "the implementer".
For more information, feedback or questions, please refer to our websites:
http://keccak.noekeon.org/
http://keyak.noekeon.org/
http://ketje.noekeon.org/
To the extent possible under law, the implementer has waived all copyright
and related or neighboring rights to the source code in this file.
http://creativecommons.org/publicdomain/zero/1.0/
*/
#if (defined(FullUnrolling))
#define rounds24 \
prepareTheta \
thetaRhoPiChiIotaPrepareTheta( 0, A, E) \
thetaRhoPiChiIotaPrepareTheta( 1, E, A) \
thetaRhoPiChiIotaPrepareTheta( 2, A, E) \
thetaRhoPiChiIotaPrepareTheta( 3, E, A) \
thetaRhoPiChiIotaPrepareTheta( 4, A, E) \
thetaRhoPiChiIotaPrepareTheta( 5, E, A) \
thetaRhoPiChiIotaPrepareTheta( 6, A, E) \
thetaRhoPiChiIotaPrepareTheta( 7, E, A) \
thetaRhoPiChiIotaPrepareTheta( 8, A, E) \
thetaRhoPiChiIotaPrepareTheta( 9, E, A) \
thetaRhoPiChiIotaPrepareTheta(10, A, E) \
thetaRhoPiChiIotaPrepareTheta(11, E, A) \
thetaRhoPiChiIotaPrepareTheta(12, A, E) \
thetaRhoPiChiIotaPrepareTheta(13, E, A) \
thetaRhoPiChiIotaPrepareTheta(14, A, E) \
thetaRhoPiChiIotaPrepareTheta(15, E, A) \
thetaRhoPiChiIotaPrepareTheta(16, A, E) \
thetaRhoPiChiIotaPrepareTheta(17, E, A) \
thetaRhoPiChiIotaPrepareTheta(18, A, E) \
thetaRhoPiChiIotaPrepareTheta(19, E, A) \
thetaRhoPiChiIotaPrepareTheta(20, A, E) \
thetaRhoPiChiIotaPrepareTheta(21, E, A) \
thetaRhoPiChiIotaPrepareTheta(22, A, E) \
thetaRhoPiChiIota(23, E, A) \
#define rounds12 \
prepareTheta \
thetaRhoPiChiIotaPrepareTheta(12, A, E) \
thetaRhoPiChiIotaPrepareTheta(13, E, A) \
thetaRhoPiChiIotaPrepareTheta(14, A, E) \
thetaRhoPiChiIotaPrepareTheta(15, E, A) \
thetaRhoPiChiIotaPrepareTheta(16, A, E) \
thetaRhoPiChiIotaPrepareTheta(17, E, A) \
thetaRhoPiChiIotaPrepareTheta(18, A, E) \
thetaRhoPiChiIotaPrepareTheta(19, E, A) \
thetaRhoPiChiIotaPrepareTheta(20, A, E) \
thetaRhoPiChiIotaPrepareTheta(21, E, A) \
thetaRhoPiChiIotaPrepareTheta(22, A, E) \
thetaRhoPiChiIota(23, E, A) \
#elif (Unrolling == 12)
#define rounds24 \
prepareTheta \
for(i=0; i<24; i+=12) { \
thetaRhoPiChiIotaPrepareTheta(i , A, E) \
thetaRhoPiChiIotaPrepareTheta(i+ 1, E, A) \
thetaRhoPiChiIotaPrepareTheta(i+ 2, A, E) \
thetaRhoPiChiIotaPrepareTheta(i+ 3, E, A) \
thetaRhoPiChiIotaPrepareTheta(i+ 4, A, E) \
thetaRhoPiChiIotaPrepareTheta(i+ 5, E, A) \
thetaRhoPiChiIotaPrepareTheta(i+ 6, A, E) \
thetaRhoPiChiIotaPrepareTheta(i+ 7, E, A) \
thetaRhoPiChiIotaPrepareTheta(i+ 8, A, E) \
thetaRhoPiChiIotaPrepareTheta(i+ 9, E, A) \
thetaRhoPiChiIotaPrepareTheta(i+10, A, E) \
thetaRhoPiChiIotaPrepareTheta(i+11, E, A) \
} \
#define rounds12 \
prepareTheta \
thetaRhoPiChiIotaPrepareTheta(12, A, E) \
thetaRhoPiChiIotaPrepareTheta(13, E, A) \
thetaRhoPiChiIotaPrepareTheta(14, A, E) \
thetaRhoPiChiIotaPrepareTheta(15, E, A) \
thetaRhoPiChiIotaPrepareTheta(16, A, E) \
thetaRhoPiChiIotaPrepareTheta(17, E, A) \
thetaRhoPiChiIotaPrepareTheta(18, A, E) \
thetaRhoPiChiIotaPrepareTheta(19, E, A) \
thetaRhoPiChiIotaPrepareTheta(20, A, E) \
thetaRhoPiChiIotaPrepareTheta(21, E, A) \
thetaRhoPiChiIotaPrepareTheta(22, A, E) \
thetaRhoPiChiIota(23, E, A) \
#elif (Unrolling == 6)
#define rounds24 \
prepareTheta \
for(i=0; i<24; i+=6) { \
thetaRhoPiChiIotaPrepareTheta(i , A, E) \
thetaRhoPiChiIotaPrepareTheta(i+1, E, A) \
thetaRhoPiChiIotaPrepareTheta(i+2, A, E) \
thetaRhoPiChiIotaPrepareTheta(i+3, E, A) \
thetaRhoPiChiIotaPrepareTheta(i+4, A, E) \
thetaRhoPiChiIotaPrepareTheta(i+5, E, A) \
} \
#define rounds12 \
prepareTheta \
for(i=12; i<24; i+=6) { \
thetaRhoPiChiIotaPrepareTheta(i , A, E) \
thetaRhoPiChiIotaPrepareTheta(i+1, E, A) \
thetaRhoPiChiIotaPrepareTheta(i+2, A, E) \
thetaRhoPiChiIotaPrepareTheta(i+3, E, A) \
thetaRhoPiChiIotaPrepareTheta(i+4, A, E) \
thetaRhoPiChiIotaPrepareTheta(i+5, E, A) \
} \
#elif (Unrolling == 4)
#define rounds24 \
prepareTheta \
for(i=0; i<24; i+=4) { \
thetaRhoPiChiIotaPrepareTheta(i , A, E) \
thetaRhoPiChiIotaPrepareTheta(i+1, E, A) \
thetaRhoPiChiIotaPrepareTheta(i+2, A, E) \
thetaRhoPiChiIotaPrepareTheta(i+3, E, A) \
} \
#define rounds12 \
prepareTheta \
for(i=12; i<24; i+=4) { \
thetaRhoPiChiIotaPrepareTheta(i , A, E) \
thetaRhoPiChiIotaPrepareTheta(i+1, E, A) \
thetaRhoPiChiIotaPrepareTheta(i+2, A, E) \
thetaRhoPiChiIotaPrepareTheta(i+3, E, A) \
} \
#elif (Unrolling == 3)
#define rounds24 \
prepareTheta \
for(i=0; i<24; i+=3) { \
thetaRhoPiChiIotaPrepareTheta(i , A, E) \
thetaRhoPiChiIotaPrepareTheta(i+1, E, A) \
thetaRhoPiChiIotaPrepareTheta(i+2, A, E) \
copyStateVariables(A, E) \
} \
#define rounds12 \
prepareTheta \
for(i=12; i<24; i+=3) { \
thetaRhoPiChiIotaPrepareTheta(i , A, E) \
thetaRhoPiChiIotaPrepareTheta(i+1, E, A) \
thetaRhoPiChiIotaPrepareTheta(i+2, A, E) \
copyStateVariables(A, E) \
} \
#elif (Unrolling == 2)
#define rounds24 \
prepareTheta \
for(i=0; i<24; i+=2) { \
thetaRhoPiChiIotaPrepareTheta(i , A, E) \
thetaRhoPiChiIotaPrepareTheta(i+1, E, A) \
} \
#define rounds12 \
prepareTheta \
for(i=12; i<24; i+=2) { \
thetaRhoPiChiIotaPrepareTheta(i , A, E) \
thetaRhoPiChiIotaPrepareTheta(i+1, E, A) \
} \
#elif (Unrolling == 1)
#define rounds24 \
prepareTheta \
for(i=0; i<24; i++) { \
thetaRhoPiChiIotaPrepareTheta(i , A, E) \
copyStateVariables(A, E) \
} \
#define rounds12 \
prepareTheta \
for(i=12; i<24; i++) { \
thetaRhoPiChiIotaPrepareTheta(i , A, E) \
copyStateVariables(A, E) \
} \
#else
#error "Unrolling is not correctly specified!"
#endif
#define roundsN(__nrounds) \
prepareTheta \
i = 24 - (__nrounds); \
if ((i&1) != 0) { \
thetaRhoPiChiIotaPrepareTheta(i, A, E) \
copyStateVariables(A, E) \
++i; \
} \
for( /* empty */; i<24; i+=2) { \
thetaRhoPiChiIotaPrepareTheta(i , A, E) \
thetaRhoPiChiIotaPrepareTheta(i+1, E, A) \
}

View File

@ -0,0 +1,3 @@
#define KeccakP1600times4_implementation_config "AVX2, all rounds unrolled"
#define KeccakP1600times4_fullUnrolling
#define KeccakP1600times4_useAVX2

View File

@ -0,0 +1,34 @@
/*
Implementation by the Keccak, Keyak and Ketje Teams, namely, Guido Bertoni,
Joan Daemen, Michaël Peeters, Gilles Van Assche and Ronny Van Keer, hereby
denoted as "the implementer".
For more information, feedback or questions, please refer to our websites:
http://keccak.noekeon.org/
http://keyak.noekeon.org/
http://ketje.noekeon.org/
To the extent possible under law, the implementer has waived all copyright
and related or neighboring rights to the source code in this file.
http://creativecommons.org/publicdomain/zero/1.0/
*/
#ifndef _align_h_
#define _align_h_
/* on Mac OS-X and possibly others, ALIGN(x) is defined in param.h, and -Werror chokes on the redef. */
#ifdef ALIGN
#undef ALIGN
#endif
#if defined(__GNUC__)
#define ALIGN(x) __attribute__ ((aligned(x)))
#elif defined(_MSC_VER)
#define ALIGN(x) __declspec(align(x))
#elif defined(__ARMCC_VERSION)
#define ALIGN(x) __align(x)
#else
#define ALIGN(x)
#endif
#endif

View File

@ -0,0 +1,142 @@
/*
---------------------------------------------------------------------------
Copyright (c) 1998-2008, Brian Gladman, Worcester, UK. All rights reserved.
LICENSE TERMS
The redistribution and use of this software (with or without changes)
is allowed without the payment of fees or royalties provided that:
1. source code distributions include the above copyright notice, this
list of conditions and the following disclaimer;
2. binary distributions include the above copyright notice, this list
of conditions and the following disclaimer in their documentation;
3. the name of the copyright holder is not used to endorse products
built using this software without specific written permission.
DISCLAIMER
This software is provided 'as is' with no explicit or implied warranties
in respect of its properties, including, but not limited to, correctness
and/or fitness for purpose.
---------------------------------------------------------------------------
Issue Date: 20/12/2007
Changes for ARM 9/9/2010
*/
#ifndef _BRG_ENDIAN_H
#define _BRG_ENDIAN_H
#define IS_BIG_ENDIAN 4321 /* byte 0 is most significant (mc68k) */
#define IS_LITTLE_ENDIAN 1234 /* byte 0 is least significant (i386) */
#if 0
/* Include files where endian defines and byteswap functions may reside */
#if defined( __sun )
# include <sys/isa_defs.h>
#elif defined( __FreeBSD__ ) || defined( __OpenBSD__ ) || defined( __NetBSD__ )
# include <sys/endian.h>
#elif defined( BSD ) && ( BSD >= 199103 ) || defined( __APPLE__ ) || \
defined( __CYGWIN32__ ) || defined( __DJGPP__ ) || defined( __osf__ )
# include <machine/endian.h>
#elif defined( __linux__ ) || defined( __GNUC__ ) || defined( __GNU_LIBRARY__ )
# if !defined( __MINGW32__ ) && !defined( _AIX )
# include <endian.h>
# if !defined( __BEOS__ )
# include <byteswap.h>
# endif
# endif
#endif
#endif
/* Now attempt to set the define for platform byte order using any */
/* of the four forms SYMBOL, _SYMBOL, __SYMBOL & __SYMBOL__, which */
/* seem to encompass most endian symbol definitions */
#if defined( BIG_ENDIAN ) && defined( LITTLE_ENDIAN )
# if defined( BYTE_ORDER ) && BYTE_ORDER == BIG_ENDIAN
# define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN
# elif defined( BYTE_ORDER ) && BYTE_ORDER == LITTLE_ENDIAN
# define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN
# endif
#elif defined( BIG_ENDIAN )
# define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN
#elif defined( LITTLE_ENDIAN )
# define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN
#endif
#if defined( _BIG_ENDIAN ) && defined( _LITTLE_ENDIAN )
# if defined( _BYTE_ORDER ) && _BYTE_ORDER == _BIG_ENDIAN
# define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN
# elif defined( _BYTE_ORDER ) && _BYTE_ORDER == _LITTLE_ENDIAN
# define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN
# endif
#elif defined( _BIG_ENDIAN )
# define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN
#elif defined( _LITTLE_ENDIAN )
# define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN
#endif
#if defined( __BIG_ENDIAN ) && defined( __LITTLE_ENDIAN )
# if defined( __BYTE_ORDER ) && __BYTE_ORDER == __BIG_ENDIAN
# define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN
# elif defined( __BYTE_ORDER ) && __BYTE_ORDER == __LITTLE_ENDIAN
# define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN
# endif
#elif defined( __BIG_ENDIAN )
# define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN
#elif defined( __LITTLE_ENDIAN )
# define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN
#endif
#if defined( __BIG_ENDIAN__ ) && defined( __LITTLE_ENDIAN__ )
# if defined( __BYTE_ORDER__ ) && __BYTE_ORDER__ == __BIG_ENDIAN__
# define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN
# elif defined( __BYTE_ORDER__ ) && __BYTE_ORDER__ == __LITTLE_ENDIAN__
# define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN
# endif
#elif defined( __BIG_ENDIAN__ )
# define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN
#elif defined( __LITTLE_ENDIAN__ )
# define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN
#endif
/* if the platform byte order could not be determined, then try to */
/* set this define using common machine defines */
#if !defined(PLATFORM_BYTE_ORDER)
#if defined( __alpha__ ) || defined( __alpha ) || defined( i386 ) || \
defined( __i386__ ) || defined( _M_I86 ) || defined( _M_IX86 ) || \
defined( __OS2__ ) || defined( sun386 ) || defined( __TURBOC__ ) || \
defined( vax ) || defined( vms ) || defined( VMS ) || \
defined( __VMS ) || defined( _M_X64 )
# define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN
#elif defined( AMIGA ) || defined( applec ) || defined( __AS400__ ) || \
defined( _CRAY ) || defined( __hppa ) || defined( __hp9000 ) || \
defined( ibm370 ) || defined( mc68000 ) || defined( m68k ) || \
defined( __MRC__ ) || defined( __MVS__ ) || defined( __MWERKS__ ) || \
defined( sparc ) || defined( __sparc) || defined( SYMANTEC_C ) || \
defined( __VOS__ ) || defined( __TIGCC__ ) || defined( __TANDEM ) || \
defined( THINK_C ) || defined( __VMCMS__ ) || defined( _AIX )
# define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN
#elif defined(__arm__)
# ifdef __BIG_ENDIAN
# define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN
# else
# define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN
# endif
#elif 1 /* **** EDIT HERE IF NECESSARY **** */
# define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN
#elif 0 /* **** EDIT HERE IF NECESSARY **** */
# define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN
#else
# error Please edit lines 132 or 134 in brg_endian.h to set the platform byte order
#endif
#endif
#endif

View File

@ -499,6 +499,28 @@ void OQS_SHA3_cshake128_simple(uint8_t *output, size_t outlen, uint16_t cstm, co
*/
void OQS_SHA3_cshake256_simple(uint8_t *output, size_t outlen, uint16_t cstm, const uint8_t *input, size_t inplen);
#ifdef USE_AVX2_INSTRUCTIONS
/**
* \brief Seed 4 parallel SHAKE-128 instances, and generate 4 arrays of pseudo-random bytes.
*
* Uses a vectorized (AVX2) implementation of SHAKE-128.
*
* \warning The output array length must not be zero.
*
* \param output0 The first output byte array
* \param output1 The second output byte array
* \param output2 The third output byte array
* \param output3 The fourth output byte array
* \param outlen The number of output bytes to generate in every output array
* \param in0 The first input seed byte array
* \param in1 The second input seed byte array
* \param in2 The third input seed byte array
* \param in3 The fourth input seed byte array
* \param inlen The number of seed bytes to process from every input array
*/
void OQS_SHA3_shake128_4x(uint8_t *output0, uint8_t *output1, uint8_t *output2, uint8_t *output3, size_t outlen, const uint8_t *in0, const uint8_t *in1, const uint8_t *in2, const uint8_t *in3, size_t inlen);
#endif
#if defined(__cplusplus)
} // extern "C"
#endif

170
src/crypto/sha3/sha3x4.c Executable file
View File

@ -0,0 +1,170 @@
#include <immintrin.h>
#include <stdint.h>
#include <assert.h>
#define SHAKE128_RATE 168
#define NROUNDS 24
#define ROL(a, offset) ((a << offset) ^ (a >> (64-offset)))
/* Read 8 bytes at x as a little-endian 64-bit integer. */
static uint64_t load64(const unsigned char *x)
{
    uint64_t r = 0;
    /* Accumulate from the most significant byte down so each shift
     * makes room for the next lower byte. */
    for (int i = 7; i >= 0; --i) {
        r = (r << 8) | (uint64_t)x[i];
    }
    return r;
}
/* Write u to x as 8 bytes in little-endian order. */
static void store64(uint8_t *x, uint64_t u)
{
    for (unsigned int i = 0; i < 8; ++i) {
        x[i] = (uint8_t)(u >> (8 * i));
    }
}
/* Use implementation from the Keccak Code Package */
#include "keccak4x/KeccakP-1600-times4-SIMD256.c"
#define KeccakF1600_StatePermute4x KeccakP1600times4_PermuteAll_24rounds
/*
 * Absorb four equal-length messages (m0..m3, mlen bytes each) into four
 * interleaved Keccak-1600 sponge states at once.
 *
 * s    : interleaved 4-way state; viewed as 64-bit words, lane j of
 *        instance k is at word index 4*j + k
 * r    : rate in bytes (must be a multiple of 8, and <= 200)
 * p    : domain-separation / first padding byte (0x1F for SHAKE)
 *
 * Full r-byte blocks are XORed in and permuted; the tail is padded into
 * temporary buffers and XORed in WITHOUT a trailing permutation (the
 * squeeze step permutes before reading, so this is the pad-then-permute
 * scheme split across absorb/squeeze).
 */
static void keccak_absorb4x(__m256i *s, unsigned int r, const unsigned char *m0, const unsigned char *m1, const unsigned char *m2, const unsigned char *m3,
unsigned long long int mlen, unsigned char p)
{
unsigned long long i;
/* One padded block per instance; 200 bytes = full Keccak-1600 state size,
 * which is >= any valid rate r. */
unsigned char t0[200];
unsigned char t1[200];
unsigned char t2[200];
unsigned char t3[200];
/* 64-bit view of the interleaved state (lane j, instance k -> ss[4*j+k]). */
unsigned long long *ss = (unsigned long long *)s;
/* Absorb all complete rate-size blocks, permuting after each one. */
while (mlen >= r)
{
for (i = 0; i < r / 8; ++i)
{
ss[4*i+0] ^= load64(m0 + 8 * i);
ss[4*i+1] ^= load64(m1 + 8 * i);
ss[4*i+2] ^= load64(m2 + 8 * i);
ss[4*i+3] ^= load64(m3 + 8 * i);
}
KeccakF1600_StatePermute4x(s);
mlen -= r;
m0 += r;
m1 += r;
m2 += r;
m3 += r;
}
/* Build the final padded block: zero-fill the whole rate... */
for (i = 0; i < r; ++i)
{
t0[i] = 0;
t1[i] = 0;
t2[i] = 0;
t3[i] = 0;
}
/* ...copy the remaining message tail... */
for (i = 0; i < mlen; ++i)
{
t0[i] = m0[i];
t1[i] = m1[i];
t2[i] = m2[i];
t3[i] = m3[i];
}
/* ...append the domain-separation byte right after the tail
 * (here i == mlen < r, so this index is in range)... */
t0[i] = p;
t1[i] = p;
t2[i] = p;
t3[i] = p;
/* ...and set the final pad bit (0x80) in the last byte of the rate.
 * When mlen == r-1 this ORs into the same byte as p, which is the
 * correct pad10*1 behavior. */
t0[r - 1] |= 128;
t1[r - 1] |= 128;
t2[r - 1] |= 128;
t3[r - 1] |= 128;
/* XOR the padded block into the states; the squeeze step permutes. */
for (i = 0; i < r / 8; ++i)
{
ss[4*i+0] ^= load64(t0 + 8 * i);
ss[4*i+1] ^= load64(t1 + 8 * i);
ss[4*i+2] ^= load64(t2 + 8 * i);
ss[4*i+3] ^= load64(t3 + 8 * i);
}
}
/*
 * Squeeze `nblocks` full rate-size blocks out of the 4-way interleaved
 * Keccak state `s`, writing one output stream to each of h0..h3.
 * r is the rate in bytes (a multiple of 8). The permutation runs BEFORE
 * each block is read, matching the pad-without-permute done in absorb.
 */
static void keccak_squeezeblocks4x(unsigned char *h0, unsigned char *h1, unsigned char *h2, unsigned char *h3, unsigned long long int nblocks, __m256i *s, unsigned int r)
{
    unsigned int lane;
    /* 64-bit view: lane j of instance k sits at words[4*j + k]. */
    unsigned long long *words = (unsigned long long *)s;
    for (; nblocks > 0; --nblocks)
    {
        KeccakF1600_StatePermute4x(s);
        /* De-interleave the rate portion into the four output streams. */
        for (lane = 0; lane < r / 8; ++lane)
        {
            store64(h0 + 8 * lane, words[4 * lane + 0]);
            store64(h1 + 8 * lane, words[4 * lane + 1]);
            store64(h2 + 8 * lane, words[4 * lane + 2]);
            store64(h3 + 8 * lane, words[4 * lane + 3]);
        }
        h0 += r;
        h1 += r;
        h2 += r;
        h3 += r;
    }
}
/********** SHAKE128 ***********/
/*
 * Initialize the interleaved 4-way state to all zeros and absorb the
 * four equal-length inputs with the SHAKE domain-separation byte 0x1F.
 */
static void shake128_absorb4x(__m256i *s, const unsigned char *in0, const unsigned char *in1, const unsigned char *in2, const unsigned char *in3, unsigned long long inlen)
{
    unsigned int lane;
    /* Start from an all-zero state (capacity and rate). */
    for (lane = 0; lane < 25; ++lane)
    {
        s[lane] = _mm256_setzero_si256();
    }
    /* 0x1F = SHAKE suffix bits 1111 plus the first pad bit. */
    keccak_absorb4x(s, SHAKE128_RATE, in0, in1, in2, in3, inlen, 0x1F);
}
/* N is assumed to be empty; S is assumed to have at most 2 characters */
/*
 * Seed 4 parallel SHAKE-128 instances and generate outlen pseudo-random
 * bytes into each of output0..output3 from seeds in0..in3 (inlen bytes
 * each).
 *
 * Parameter types (uint8_t * / size_t) match the public declaration in
 * the OQS_SHA3 header; the previous unsigned char * / unsigned long long
 * signature differed from the declared prototype, which is undefined
 * behavior on platforms where size_t is not unsigned long long.
 */
void OQS_SHA3_shake128_4x(uint8_t *output0, uint8_t *output1, uint8_t *output2, uint8_t *output3, size_t outlen,
                          const uint8_t *in0, const uint8_t *in1, const uint8_t *in2, const uint8_t *in3, size_t inlen)
{
    __m256i s[25];
    /* Scratch block for the final partial squeeze, one per instance. */
    uint8_t t0[SHAKE128_RATE];
    uint8_t t1[SHAKE128_RATE];
    uint8_t t2[SHAKE128_RATE];
    uint8_t t3[SHAKE128_RATE];
    size_t i;
    size_t whole = outlen / SHAKE128_RATE; /* number of full output blocks */
    size_t rem = outlen % SHAKE128_RATE;   /* leftover bytes */

    shake128_absorb4x(s, in0, in1, in2, in3, inlen);

    /* Squeeze all complete blocks directly into the output buffers. */
    keccak_squeezeblocks4x(output0, output1, output2, output3, whole, s, SHAKE128_RATE);
    output0 += whole * SHAKE128_RATE;
    output1 += whole * SHAKE128_RATE;
    output2 += whole * SHAKE128_RATE;
    output3 += whole * SHAKE128_RATE;

    /* Squeeze one more block into scratch and copy only the tail. */
    if (rem)
    {
        keccak_squeezeblocks4x(t0, t1, t2, t3, 1, s, SHAKE128_RATE);
        for (i = 0; i < rem; i++)
        {
            output0[i] = t0[i];
            output1[i] = t1[i];
            output2[i] = t2[i];
            output3[i] = t3[i];
        }
    }
}

View File

@ -50,7 +50,8 @@ int frodo_mul_add_as_plus_e(uint16_t *out, const uint16_t *s, const uint16_t *e,
}
OQS_AES128_ECB_enc_sch((uint8_t*)a_row_temp, 4*PARAMS_N*sizeof(int16_t), aes_key_schedule, (uint8_t*)a_row);
#elif defined (USE_SHAKE128_FOR_A)
#elif defined (USE_SHAKE128_FOR_A)
#ifndef USE_AVX2_INSTRUCTIONS
uint8_t seed_A_separated[2 + BYTES_SEED_A];
uint16_t* seed_A_origin = (uint16_t*)&seed_A_separated;
memcpy(&seed_A_separated[2], seed_A, BYTES_SEED_A);
@ -63,6 +64,27 @@ int frodo_mul_add_as_plus_e(uint16_t *out, const uint16_t *s, const uint16_t *e,
OQS_SHA3_shake128((unsigned char*)(a_row + 2*PARAMS_N), (unsigned long long)(2*PARAMS_N), seed_A_separated, 2 + BYTES_SEED_A);
seed_A_origin[0] = UINT16_TO_LE(i + 3);
OQS_SHA3_shake128((unsigned char*)(a_row + 3*PARAMS_N), (unsigned long long)(2*PARAMS_N), seed_A_separated, 2 + BYTES_SEED_A);
#else
uint8_t seed_A_separated_0[2 + BYTES_SEED_A];
uint8_t seed_A_separated_1[2 + BYTES_SEED_A];
uint8_t seed_A_separated_2[2 + BYTES_SEED_A];
uint8_t seed_A_separated_3[2 + BYTES_SEED_A];
uint16_t* seed_A_origin_0 = (uint16_t*)&seed_A_separated_0;
uint16_t* seed_A_origin_1 = (uint16_t*)&seed_A_separated_1;
uint16_t* seed_A_origin_2 = (uint16_t*)&seed_A_separated_2;
uint16_t* seed_A_origin_3 = (uint16_t*)&seed_A_separated_3;
memcpy(&seed_A_separated_0[2], seed_A, BYTES_SEED_A);
memcpy(&seed_A_separated_1[2], seed_A, BYTES_SEED_A);
memcpy(&seed_A_separated_2[2], seed_A, BYTES_SEED_A);
memcpy(&seed_A_separated_3[2], seed_A, BYTES_SEED_A);
for (i = 0; i < PARAMS_N; i += 4) {
seed_A_origin_0[0] = UINT16_TO_LE(i + 0);
seed_A_origin_1[0] = UINT16_TO_LE(i + 1);
seed_A_origin_2[0] = UINT16_TO_LE(i + 2);
seed_A_origin_3[0] = UINT16_TO_LE(i + 3);
OQS_SHA3_shake128_4x((unsigned char*)(a_row), (unsigned char*)(a_row + PARAMS_N), (unsigned char*)(a_row + 2*PARAMS_N), (unsigned char*)(a_row + 3*PARAMS_N),
(unsigned long long)(2*PARAMS_N), seed_A_separated_0, seed_A_separated_1, seed_A_separated_2, seed_A_separated_3, 2 + BYTES_SEED_A);
#endif
#endif
for (k = 0; k < 4 * PARAMS_N; k++) {
a_row[k] = LE_TO_UINT16(a_row[k]);
@ -184,6 +206,7 @@ int frodo_mul_add_sa_plus_e(uint16_t *out, const uint16_t *s, const uint16_t *e,
int t=0;
ALIGN_HEADER(32) uint16_t a_cols[4*PARAMS_N] ALIGN_FOOTER(32) = {0};
#ifndef USE_AVX2_INSTRUCTIONS
int k;
uint8_t seed_A_separated[2 + BYTES_SEED_A];
uint16_t* seed_A_origin = (uint16_t*)&seed_A_separated;
@ -201,7 +224,6 @@ int frodo_mul_add_sa_plus_e(uint16_t *out, const uint16_t *s, const uint16_t *e,
a_cols[i] = LE_TO_UINT16(a_cols[i]);
}
#ifndef USE_AVX2_INSTRUCTIONS
for (i = 0; i < PARAMS_NBAR; i++) {
uint16_t sum[PARAMS_N] = {0};
for (j = 0; j < 4; j++) {
@ -215,6 +237,25 @@ int frodo_mul_add_sa_plus_e(uint16_t *out, const uint16_t *s, const uint16_t *e,
}
}
#else
uint8_t seed_A_separated_0[2 + BYTES_SEED_A];
uint8_t seed_A_separated_1[2 + BYTES_SEED_A];
uint8_t seed_A_separated_2[2 + BYTES_SEED_A];
uint8_t seed_A_separated_3[2 + BYTES_SEED_A];
uint16_t* seed_A_origin_0 = (uint16_t*)&seed_A_separated_0;
uint16_t* seed_A_origin_1 = (uint16_t*)&seed_A_separated_1;
uint16_t* seed_A_origin_2 = (uint16_t*)&seed_A_separated_2;
uint16_t* seed_A_origin_3 = (uint16_t*)&seed_A_separated_3;
memcpy(&seed_A_separated_0[2], seed_A, BYTES_SEED_A);
memcpy(&seed_A_separated_1[2], seed_A, BYTES_SEED_A);
memcpy(&seed_A_separated_2[2], seed_A, BYTES_SEED_A);
memcpy(&seed_A_separated_3[2], seed_A, BYTES_SEED_A);
for (kk = 0; kk < PARAMS_N; kk+=4) {
seed_A_origin_0[0] = UINT16_TO_LE(kk + 0);
seed_A_origin_1[0] = UINT16_TO_LE(kk + 1);
seed_A_origin_2[0] = UINT16_TO_LE(kk + 2);
seed_A_origin_3[0] = UINT16_TO_LE(kk + 3);
OQS_SHA3_shake128_4x((unsigned char*)(a_cols), (unsigned char*)(a_cols + PARAMS_N), (unsigned char*)(a_cols + 2*PARAMS_N), (unsigned char*)(a_cols + 3*PARAMS_N),
(unsigned long long)(2*PARAMS_N), seed_A_separated_0, seed_A_separated_1, seed_A_separated_2, seed_A_separated_3, 2 + BYTES_SEED_A);
for (i = 0; i < PARAMS_NBAR; i++) {
__m256i a, b0, b1, b2, b3, acc[PARAMS_N/16];
b0 = _mm256_set1_epi16(s[i*PARAMS_N + kk + 0]);
@ -237,8 +278,8 @@ int frodo_mul_add_sa_plus_e(uint16_t *out, const uint16_t *s, const uint16_t *e,
acc[j/16] = _mm256_add_epi16(a, acc[j/16]);
}
for (k = 0; k < PARAMS_N/16; k++) {
_mm256_store_si256((__m256i*)&out[i*PARAMS_N + 16*k], acc[k]);
for (j = 0; j < PARAMS_N/16; j++) {
_mm256_store_si256((__m256i*)&out[i*PARAMS_N + 16*j], acc[j]);
}
}
#endif