diff --git a/src/kem/kyber/pqcrystals-kyber_common_avx2/fips202.c b/src/kem/kyber/pqcrystals-kyber_common_avx2/fips202.c deleted file mode 100644 index ab3d2a121..000000000 --- a/src/kem/kyber/pqcrystals-kyber_common_avx2/fips202.c +++ /dev/null @@ -1,774 +0,0 @@ -/* Based on the public domain implementation in crypto_hash/keccakc512/simple/ from - * http://bench.cr.yp.to/supercop.html by Ronny Van Keer and the public domain "TweetFips202" - * implementation from https://twitter.com/tweetfips202 by Gilles Van Assche, Daniel J. Bernstein, - * and Peter Schwabe */ - -#include -#include -#include "fips202.h" - -#define NROUNDS 24 -#define ROL(a, offset) ((a << offset) ^ (a >> (64-offset))) - -/************************************************* -* Name: load64 -* -* Description: Load 8 bytes into uint64_t in little-endian order -* -* Arguments: - const uint8_t *x: pointer to input byte array -* -* Returns the loaded 64-bit unsigned integer -**************************************************/ -static uint64_t load64(const uint8_t x[8]) { - unsigned int i; - uint64_t r = 0; - - for(i=0;i<8;i++) - r |= (uint64_t)x[i] << 8*i; - - return r; -} - -/************************************************* -* Name: store64 -* -* Description: Store a 64-bit integer to array of 8 bytes in little-endian order -* -* Arguments: - uint8_t *x: pointer to the output byte array (allocated) -* - uint64_t u: input 64-bit unsigned integer -**************************************************/ -static void store64(uint8_t x[8], uint64_t u) { - unsigned int i; - - for(i=0;i<8;i++) - x[i] = u >> 8*i; -} - -/* Keccak round constants */ -static const uint64_t KeccakF_RoundConstants[NROUNDS] = { - (uint64_t)0x0000000000000001ULL, - (uint64_t)0x0000000000008082ULL, - (uint64_t)0x800000000000808aULL, - (uint64_t)0x8000000080008000ULL, - (uint64_t)0x000000000000808bULL, - (uint64_t)0x0000000080000001ULL, - (uint64_t)0x8000000080008081ULL, - (uint64_t)0x8000000000008009ULL, - (uint64_t)0x000000000000008aULL, - (uint64_t)0x0000000000000088ULL, - (uint64_t)0x0000000080008009ULL, - (uint64_t)0x000000008000000aULL, - (uint64_t)0x000000008000808bULL, - (uint64_t)0x800000000000008bULL, - (uint64_t)0x8000000000008089ULL, - (uint64_t)0x8000000000008003ULL, - (uint64_t)0x8000000000008002ULL, - (uint64_t)0x8000000000000080ULL, - (uint64_t)0x000000000000800aULL, - (uint64_t)0x800000008000000aULL, - (uint64_t)0x8000000080008081ULL, - (uint64_t)0x8000000000008080ULL, - (uint64_t)0x0000000080000001ULL, - (uint64_t)0x8000000080008008ULL -}; - -/************************************************* -* Name: KeccakF1600_StatePermute -* -* Description: The Keccak F1600 Permutation -* -* Arguments: - uint64_t *state: pointer to input/output Keccak state -**************************************************/ -static void KeccakF1600_StatePermute(uint64_t state[25]) -{ - int round; - - uint64_t Aba, Abe, Abi, Abo, Abu; - uint64_t Aga, Age, Agi, Ago, Agu; - uint64_t Aka, Ake, Aki, Ako, Aku; - uint64_t Ama, Ame, Ami, Amo, Amu; - uint64_t Asa, Ase, Asi, Aso, Asu; - uint64_t BCa, BCe, BCi, BCo, BCu; - uint64_t Da, De, Di, Do, Du; - uint64_t Eba, Ebe, Ebi, Ebo, Ebu; - uint64_t Ega, Ege, Egi, Ego, Egu; - uint64_t Eka, Eke, Eki, Eko, Eku; - uint64_t Ema, Eme, Emi, Emo, Emu; - uint64_t Esa, Ese, Esi, Eso, Esu; - - //copyFromState(A, state) - Aba = state[ 0]; - Abe = state[ 1]; - Abi = state[ 2]; - Abo = state[ 3]; - Abu = state[ 4]; - Aga = state[ 5]; - Age = state[ 6]; - Agi = state[ 7]; - Ago = state[ 8]; - Agu = state[ 9]; - Aka = state[10]; - Ake = state[11]; - Aki = state[12]; - Ako = state[13]; - Aku = state[14]; - Ama = state[15]; - Ame = state[16]; - Ami = state[17]; - Amo = state[18]; - Amu = state[19]; - Asa = state[20]; - Ase = state[21]; - Asi = state[22]; - Aso = state[23]; - Asu = state[24]; - - for(round = 0; round < NROUNDS; round += 2) { - // prepareTheta - BCa = Aba^Aga^Aka^Ama^Asa; - BCe = Abe^Age^Ake^Ame^Ase; - BCi = Abi^Agi^Aki^Ami^Asi; - BCo = Abo^Ago^Ako^Amo^Aso; - BCu = Abu^Agu^Aku^Amu^Asu; - - //thetaRhoPiChiIotaPrepareTheta(round, A, E) - Da = BCu^ROL(BCe, 1); - De = BCa^ROL(BCi, 1); - Di = BCe^ROL(BCo, 1); - Do = BCi^ROL(BCu, 1); - Du = BCo^ROL(BCa, 1); - - Aba ^= Da; - BCa = Aba; - Age ^= De; - BCe = ROL(Age, 44); - Aki ^= Di; - BCi = ROL(Aki, 43); - Amo ^= Do; - BCo = ROL(Amo, 21); - Asu ^= Du; - BCu = ROL(Asu, 14); - Eba = BCa ^((~BCe)& BCi ); - Eba ^= (uint64_t)KeccakF_RoundConstants[round]; - Ebe = BCe ^((~BCi)& BCo ); - Ebi = BCi ^((~BCo)& BCu ); - Ebo = BCo ^((~BCu)& BCa ); - Ebu = BCu ^((~BCa)& BCe ); - - Abo ^= Do; - BCa = ROL(Abo, 28); - Agu ^= Du; - BCe = ROL(Agu, 20); - Aka ^= Da; - BCi = ROL(Aka, 3); - Ame ^= De; - BCo = ROL(Ame, 45); - Asi ^= Di; - BCu = ROL(Asi, 61); - Ega = BCa ^((~BCe)& BCi ); - Ege = BCe ^((~BCi)& BCo ); - Egi = BCi ^((~BCo)& BCu ); - Ego = BCo ^((~BCu)& BCa ); - Egu = BCu ^((~BCa)& BCe ); - - Abe ^= De; - BCa = ROL(Abe, 1); - Agi ^= Di; - BCe = ROL(Agi, 6); - Ako ^= Do; - BCi = ROL(Ako, 25); - Amu ^= Du; - BCo = ROL(Amu, 8); - Asa ^= Da; - BCu = ROL(Asa, 18); - Eka = BCa ^((~BCe)& BCi ); - Eke = BCe ^((~BCi)& BCo ); - Eki = BCi ^((~BCo)& BCu ); - Eko = BCo ^((~BCu)& BCa ); - Eku = BCu ^((~BCa)& BCe ); - - Abu ^= Du; - BCa = ROL(Abu, 27); - Aga ^= Da; - BCe = ROL(Aga, 36); - Ake ^= De; - BCi = ROL(Ake, 10); - Ami ^= Di; - BCo = ROL(Ami, 15); - Aso ^= Do; - BCu = ROL(Aso, 56); - Ema = BCa ^((~BCe)& BCi ); - Eme = BCe ^((~BCi)& BCo ); - Emi = BCi ^((~BCo)& BCu ); - Emo = BCo ^((~BCu)& BCa ); - Emu = BCu ^((~BCa)& BCe ); - - Abi ^= Di; - BCa = ROL(Abi, 62); - Ago ^= Do; - BCe = ROL(Ago, 55); - Aku ^= Du; - BCi = ROL(Aku, 39); - Ama ^= Da; - BCo = ROL(Ama, 41); - Ase ^= De; - BCu = ROL(Ase, 2); - Esa = BCa ^((~BCe)& BCi ); - Ese = BCe ^((~BCi)& BCo ); - Esi = BCi ^((~BCo)& BCu ); - Eso = BCo ^((~BCu)& BCa ); - Esu = BCu ^((~BCa)& BCe ); - - // prepareTheta - BCa = Eba^Ega^Eka^Ema^Esa; - BCe = Ebe^Ege^Eke^Eme^Ese; - BCi = Ebi^Egi^Eki^Emi^Esi; - BCo = Ebo^Ego^Eko^Emo^Eso; - BCu = Ebu^Egu^Eku^Emu^Esu; - - //thetaRhoPiChiIotaPrepareTheta(round+1, E, A) - Da = BCu^ROL(BCe, 1); - De = BCa^ROL(BCi, 1); - Di = BCe^ROL(BCo, 1); - Do = BCi^ROL(BCu, 1); - Du = BCo^ROL(BCa, 1); - - Eba ^= Da; - BCa = Eba; - Ege ^= De; - BCe = ROL(Ege, 44); - Eki ^= Di; - BCi = ROL(Eki, 43); - Emo ^= Do; - BCo = ROL(Emo, 21); - Esu ^= Du; - BCu = ROL(Esu, 14); - Aba = BCa ^((~BCe)& BCi ); - Aba ^= (uint64_t)KeccakF_RoundConstants[round+1]; - Abe = BCe ^((~BCi)& BCo ); - Abi = BCi ^((~BCo)& BCu ); - Abo = BCo ^((~BCu)& BCa ); - Abu = BCu ^((~BCa)& BCe ); - - Ebo ^= Do; - BCa = ROL(Ebo, 28); - Egu ^= Du; - BCe = ROL(Egu, 20); - Eka ^= Da; - BCi = ROL(Eka, 3); - Eme ^= De; - BCo = ROL(Eme, 45); - Esi ^= Di; - BCu = ROL(Esi, 61); - Aga = BCa ^((~BCe)& BCi ); - Age = BCe ^((~BCi)& BCo ); - Agi = BCi ^((~BCo)& BCu ); - Ago = BCo ^((~BCu)& BCa ); - Agu = BCu ^((~BCa)& BCe ); - - Ebe ^= De; - BCa = ROL(Ebe, 1); - Egi ^= Di; - BCe = ROL(Egi, 6); - Eko ^= Do; - BCi = ROL(Eko, 25); - Emu ^= Du; - BCo = ROL(Emu, 8); - Esa ^= Da; - BCu = ROL(Esa, 18); - Aka = BCa ^((~BCe)& BCi ); - Ake = BCe ^((~BCi)& BCo ); - Aki = BCi ^((~BCo)& BCu ); - Ako = BCo ^((~BCu)& BCa ); - Aku = BCu ^((~BCa)& BCe ); - - Ebu ^= Du; - BCa = ROL(Ebu, 27); - Ega ^= Da; - BCe = ROL(Ega, 36); - Eke ^= De; - BCi = ROL(Eke, 10); - Emi ^= Di; - BCo = ROL(Emi, 15); - Eso ^= Do; - BCu = ROL(Eso, 56); - Ama = BCa ^((~BCe)& BCi ); - Ame = BCe ^((~BCi)& BCo ); - Ami = BCi ^((~BCo)& BCu ); - Amo = BCo ^((~BCu)& BCa ); - Amu = BCu ^((~BCa)& BCe ); - - Ebi ^= Di; - BCa = ROL(Ebi, 62); - Ego ^= Do; - BCe = ROL(Ego, 55); - Eku ^= Du; - BCi = ROL(Eku, 39); - Ema ^= Da; - BCo = ROL(Ema, 41); - Ese ^= De; - BCu = ROL(Ese, 2); - Asa = BCa ^((~BCe)& BCi ); - Ase = BCe ^((~BCi)& BCo ); - Asi = BCi ^((~BCo)& BCu ); - Aso = BCo ^((~BCu)& BCa ); - Asu = BCu ^((~BCa)& BCe ); - } - - //copyToState(state, A) - state[ 0] = Aba; - state[ 1] = Abe; - state[ 2] = Abi; - state[ 3] = Abo; - state[ 4] = Abu; - state[ 5] = Aga; - state[ 6] = Age; - state[ 7] = Agi; - state[ 8] = Ago; - state[ 9] = Agu; - state[10] = Aka; - state[11] = Ake; - state[12] = Aki; - state[13] = Ako; - state[14] = Aku; - state[15] = Ama; - state[16] = Ame; - state[17] = Ami; - state[18] = Amo; - state[19] = Amu; - state[20] = Asa; - state[21] = Ase; - state[22] = Asi; - state[23] = Aso; - state[24] = Asu; -} - -/************************************************* -* Name: keccak_init -* -* Description: Initializes the Keccak state. -* -* Arguments: - uint64_t *s: pointer to Keccak state -**************************************************/ -static void keccak_init(uint64_t s[25]) -{ - unsigned int i; - for(i=0;i<25;i++) - s[i] = 0; -} - -/************************************************* -* Name: keccak_absorb -* -* Description: Absorb step of Keccak; incremental. -* -* Arguments: - uint64_t *s: pointer to Keccak state -* - unsigned int pos: position in current block to be absorbed -* - unsigned int r: rate in bytes (e.g., 168 for SHAKE128) -* - const uint8_t *in: pointer to input to be absorbed into s -* - size_t inlen: length of input in bytes -* -* Returns new position pos in current block -**************************************************/ -static unsigned int keccak_absorb(uint64_t s[25], - unsigned int pos, - unsigned int r, - const uint8_t *in, - size_t inlen) -{ - unsigned int i; - - while(pos+inlen >= r) { - for(i=pos;i> 8*(i%8); - outlen -= i-pos; - pos = i; - } - - return pos; -} - - -/************************************************* -* Name: keccak_absorb_once -* -* Description: Absorb step of Keccak; -* non-incremental, starts by zeroeing the state. -* -* Arguments: - uint64_t *s: pointer to (uninitialized) output Keccak state -* - unsigned int r: rate in bytes (e.g., 168 for SHAKE128) -* - const uint8_t *in: pointer to input to be absorbed into s -* - size_t inlen: length of input in bytes -* - uint8_t p: domain-separation byte for different Keccak-derived functions -**************************************************/ -static void keccak_absorb_once(uint64_t s[25], - unsigned int r, - const uint8_t *in, - size_t inlen, - uint8_t p) -{ - unsigned int i; - - for(i=0;i<25;i++) - s[i] = 0; - - while(inlen >= r) { - for(i=0;is); - state->pos = 0; -} - -/************************************************* -* Name: shake128_absorb -* -* Description: Absorb step of the SHAKE128 XOF; incremental. -* -* Arguments: - keccak_state *state: pointer to (initialized) output Keccak state -* - const uint8_t *in: pointer to input to be absorbed into s -* - size_t inlen: length of input in bytes -**************************************************/ -void shake128_absorb(keccak_state *state, const uint8_t *in, size_t inlen) -{ - state->pos = keccak_absorb(state->s, state->pos, SHAKE128_RATE, in, inlen); -} - -/************************************************* -* Name: shake128_finalize -* -* Description: Finalize absorb step of the SHAKE128 XOF. -* -* Arguments: - keccak_state *state: pointer to Keccak state -**************************************************/ -void shake128_finalize(keccak_state *state) -{ - keccak_finalize(state->s, state->pos, SHAKE128_RATE, 0x1F); - state->pos = SHAKE128_RATE; -} - -/************************************************* -* Name: shake128_squeeze -* -* Description: Squeeze step of SHAKE128 XOF. Squeezes arbitraily many -* bytes. Can be called multiple times to keep squeezing. -* -* Arguments: - uint8_t *out: pointer to output blocks -* - size_t outlen : number of bytes to be squeezed (written to output) -* - keccak_state *s: pointer to input/output Keccak state -**************************************************/ -void shake128_squeeze(uint8_t *out, size_t outlen, keccak_state *state) -{ - state->pos = keccak_squeeze(out, outlen, state->s, state->pos, SHAKE128_RATE); -} - -/************************************************* -* Name: shake128_absorb_once -* -* Description: Initialize, absorb into and finalize SHAKE128 XOF; non-incremental. -* -* Arguments: - keccak_state *state: pointer to (uninitialized) output Keccak state -* - const uint8_t *in: pointer to input to be absorbed into s -* - size_t inlen: length of input in bytes -**************************************************/ -void shake128_absorb_once(keccak_state *state, const uint8_t *in, size_t inlen) -{ - keccak_absorb_once(state->s, SHAKE128_RATE, in, inlen, 0x1F); - state->pos = SHAKE128_RATE; -} - -/************************************************* -* Name: shake128_squeezeblocks -* -* Description: Squeeze step of SHAKE128 XOF. Squeezes full blocks of -* SHAKE128_RATE bytes each. Can be called multiple times -* to keep squeezing. Assumes new block has not yet been -* started (state->pos = SHAKE128_RATE). -* -* Arguments: - uint8_t *out: pointer to output blocks -* - size_t nblocks: number of blocks to be squeezed (written to output) -* - keccak_state *s: pointer to input/output Keccak state -**************************************************/ -void shake128_squeezeblocks(uint8_t *out, size_t nblocks, keccak_state *state) -{ - keccak_squeezeblocks(out, nblocks, state->s, SHAKE128_RATE); -} - -/************************************************* -* Name: shake256_init -* -* Description: Initilizes Keccak state for use as SHAKE256 XOF -* -* Arguments: - keccak_state *state: pointer to (uninitialized) Keccak state -**************************************************/ -void shake256_init(keccak_state *state) -{ - keccak_init(state->s); - state->pos = 0; -} - -/************************************************* -* Name: shake256_absorb -* -* Description: Absorb step of the SHAKE256 XOF; incremental. -* -* Arguments: - keccak_state *state: pointer to (initialized) output Keccak state -* - const uint8_t *in: pointer to input to be absorbed into s -* - size_t inlen: length of input in bytes -**************************************************/ -void shake256_absorb(keccak_state *state, const uint8_t *in, size_t inlen) -{ - state->pos = keccak_absorb(state->s, state->pos, SHAKE256_RATE, in, inlen); -} - -/************************************************* -* Name: shake256_finalize -* -* Description: Finalize absorb step of the SHAKE256 XOF. -* -* Arguments: - keccak_state *state: pointer to Keccak state -**************************************************/ -void shake256_finalize(keccak_state *state) -{ - keccak_finalize(state->s, state->pos, SHAKE256_RATE, 0x1F); - state->pos = SHAKE256_RATE; -} - -/************************************************* -* Name: shake256_squeeze -* -* Description: Squeeze step of SHAKE256 XOF. Squeezes arbitraily many -* bytes. Can be called multiple times to keep squeezing. -* -* Arguments: - uint8_t *out: pointer to output blocks -* - size_t outlen : number of bytes to be squeezed (written to output) -* - keccak_state *s: pointer to input/output Keccak state -**************************************************/ -void shake256_squeeze(uint8_t *out, size_t outlen, keccak_state *state) -{ - state->pos = keccak_squeeze(out, outlen, state->s, state->pos, SHAKE256_RATE); -} - -/************************************************* -* Name: shake256_absorb_once -* -* Description: Initialize, absorb into and finalize SHAKE256 XOF; non-incremental. -* -* Arguments: - keccak_state *state: pointer to (uninitialized) output Keccak state -* - const uint8_t *in: pointer to input to be absorbed into s -* - size_t inlen: length of input in bytes -**************************************************/ -void shake256_absorb_once(keccak_state *state, const uint8_t *in, size_t inlen) -{ - keccak_absorb_once(state->s, SHAKE256_RATE, in, inlen, 0x1F); - state->pos = SHAKE256_RATE; -} - -/************************************************* -* Name: shake256_squeezeblocks -* -* Description: Squeeze step of SHAKE256 XOF. Squeezes full blocks of -* SHAKE256_RATE bytes each. Can be called multiple times -* to keep squeezing. Assumes next block has not yet been -* started (state->pos = SHAKE256_RATE). -* -* Arguments: - uint8_t *out: pointer to output blocks -* - size_t nblocks: number of blocks to be squeezed (written to output) -* - keccak_state *s: pointer to input/output Keccak state -**************************************************/ -void shake256_squeezeblocks(uint8_t *out, size_t nblocks, keccak_state *state) -{ - keccak_squeezeblocks(out, nblocks, state->s, SHAKE256_RATE); -} - -/************************************************* -* Name: shake128 -* -* Description: SHAKE128 XOF with non-incremental API -* -* Arguments: - uint8_t *out: pointer to output -* - size_t outlen: requested output length in bytes -* - const uint8_t *in: pointer to input -* - size_t inlen: length of input in bytes -**************************************************/ -void shake128(uint8_t *out, size_t outlen, const uint8_t *in, size_t inlen) -{ - size_t nblocks; - keccak_state state; - - shake128_absorb_once(&state, in, inlen); - nblocks = outlen/SHAKE128_RATE; - shake128_squeezeblocks(out, nblocks, &state); - outlen -= nblocks*SHAKE128_RATE; - out += nblocks*SHAKE128_RATE; - shake128_squeeze(out, outlen, &state); -} - -/************************************************* -* Name: shake256 -* -* Description: SHAKE256 XOF with non-incremental API -* -* Arguments: - uint8_t *out: pointer to output -* - size_t outlen: requested output length in bytes -* - const uint8_t *in: pointer to input -* - size_t inlen: length of input in bytes -**************************************************/ -void shake256(uint8_t *out, size_t outlen, const uint8_t *in, size_t inlen) -{ - size_t nblocks; - keccak_state state; - - shake256_absorb_once(&state, in, inlen); - nblocks = outlen/SHAKE256_RATE; - shake256_squeezeblocks(out, nblocks, &state); - outlen -= nblocks*SHAKE256_RATE; - out += nblocks*SHAKE256_RATE; - shake256_squeeze(out, outlen, &state); -} - -/************************************************* -* Name: sha3_256 -* -* Description: SHA3-256 with non-incremental API -* -* Arguments: - uint8_t *h: pointer to output (32 bytes) -* - const uint8_t *in: pointer to input -* - size_t inlen: length of input in bytes -**************************************************/ -void sha3_256(uint8_t h[32], const uint8_t *in, size_t inlen) -{ - unsigned int i; - uint64_t s[25]; - - keccak_absorb_once(s, SHA3_256_RATE, in, inlen, 0x06); - KeccakF1600_StatePermute(s); - for(i=0;i<4;i++) - store64(h+8*i,s[i]); -} - -/************************************************* -* Name: sha3_512 -* -* Description: SHA3-512 with non-incremental API -* -* Arguments: - uint8_t *h: pointer to output (64 bytes) -* - const uint8_t *in: pointer to input -* - size_t inlen: length of input in bytes -**************************************************/ -void sha3_512(uint8_t h[64], const uint8_t *in, size_t inlen) -{ - unsigned int i; - uint64_t s[25]; - - keccak_absorb_once(s, SHA3_512_RATE, in, inlen, 0x06); - KeccakF1600_StatePermute(s); - for(i=0;i<8;i++) - store64(h+8*i,s[i]); -} diff --git a/src/kem/kyber/pqcrystals-kyber_common_avx2/fips202.h b/src/kem/kyber/pqcrystals-kyber_common_avx2/fips202.h deleted file mode 100644 index 36689e3d0..000000000 --- a/src/kem/kyber/pqcrystals-kyber_common_avx2/fips202.h +++ /dev/null @@ -1,54 +0,0 @@ -#ifndef FIPS202_H -#define FIPS202_H - -#include -#include - -#define SHAKE128_RATE 168 -#define SHAKE256_RATE 136 -#define SHA3_256_RATE 136 -#define SHA3_512_RATE 72 - -#define FIPS202_NAMESPACE(s) pqcrystals_kyber_fips202_avx2_##s - -typedef struct { - uint64_t s[25]; - unsigned int pos; -} keccak_state; - -#define shake128_init FIPS202_NAMESPACE(shake128_init) -void shake128_init(keccak_state *state); -#define shake128_absorb FIPS202_NAMESPACE(shake128_absorb) -void shake128_absorb(keccak_state *state, const uint8_t *in, size_t inlen); -#define shake128_finalize FIPS202_NAMESPACE(shake128_finalize) -void shake128_finalize(keccak_state *state); -#define shake128_squeeze FIPS202_NAMESPACE(shake128_squeeze) -void shake128_squeeze(uint8_t *out, size_t outlen, keccak_state *state); -#define shake128_absorb_once FIPS202_NAMESPACE(shake128_absorb_once) -void shake128_absorb_once(keccak_state *state, const uint8_t *in, size_t inlen); -#define shake128_squeezeblocks FIPS202_NAMESPACE(shake128_squeezeblocks) -void shake128_squeezeblocks(uint8_t *out, size_t nblocks, keccak_state *state); - -#define shake256_init FIPS202_NAMESPACE(shake256_init) -void shake256_init(keccak_state *state); -#define shake256_absorb FIPS202_NAMESPACE(shake256_absorb) -void shake256_absorb(keccak_state *state, const uint8_t *in, size_t inlen); -#define shake256_finalize FIPS202_NAMESPACE(shake256_finalize) -void shake256_finalize(keccak_state *state); -#define shake256_squeeze FIPS202_NAMESPACE(shake256_squeeze) -void shake256_squeeze(uint8_t *out, size_t outlen, keccak_state *state); -#define shake256_absorb_once FIPS202_NAMESPACE(shake256_absorb_once) -void shake256_absorb_once(keccak_state *state, const uint8_t *in, size_t inlen); -#define shake256_squeezeblocks FIPS202_NAMESPACE(shake256_squeezeblocks) -void shake256_squeezeblocks(uint8_t *out, size_t nblocks, keccak_state *state); - -#define shake128 FIPS202_NAMESPACE(shake128) -void shake128(uint8_t *out, size_t outlen, const uint8_t *in, size_t inlen); -#define shake256 FIPS202_NAMESPACE(shake256) -void shake256(uint8_t *out, size_t outlen, const uint8_t *in, size_t inlen); -#define sha3_256 FIPS202_NAMESPACE(sha3_256) -void sha3_256(uint8_t h[32], const uint8_t *in, size_t inlen); -#define sha3_512 FIPS202_NAMESPACE(sha3_512) -void sha3_512(uint8_t h[64], const uint8_t *in, size_t inlen); - -#endif diff --git a/src/kem/kyber/pqcrystals-kyber_common_avx2/fips202x4.c b/src/kem/kyber/pqcrystals-kyber_common_avx2/fips202x4.c deleted file mode 100644 index 6bc6d1a5c..000000000 --- a/src/kem/kyber/pqcrystals-kyber_common_avx2/fips202x4.c +++ /dev/null @@ -1,200 +0,0 @@ -#include -#include -#include -#include -#include "fips202.h" -#include "fips202x4.h" - -/* Use implementation from the Keccak Code Package */ -#define KeccakF1600_StatePermute4x FIPS202X4_NAMESPACE(KeccakP1600times4_PermuteAll_24rounds) -extern void KeccakF1600_StatePermute4x(__m256i *s); - -static void keccakx4_absorb_once(__m256i s[25], - unsigned int r, - const uint8_t *in0, - const uint8_t *in1, - const uint8_t *in2, - const uint8_t *in3, - size_t inlen, - uint8_t p) -{ - size_t i; - uint64_t pos = 0; - __m256i t, idx; - - for(i = 0; i < 25; ++i) - s[i] = _mm256_setzero_si256(); - - idx = _mm256_set_epi64x((long long)in3, (long long)in2, (long long)in1, (long long)in0); - while(inlen >= r) { - for(i = 0; i < r/8; ++i) { - t = _mm256_i64gather_epi64((long long *)pos, idx, 1); - s[i] = _mm256_xor_si256(s[i], t); - pos += 8; - } - inlen -= r; - - KeccakF1600_StatePermute4x(s); - } - - for(i = 0; i < inlen/8; ++i) { - t = _mm256_i64gather_epi64((long long *)pos, idx, 1); - s[i] = _mm256_xor_si256(s[i], t); - pos += 8; - } - inlen -= 8*i; - - if(inlen) { - t = _mm256_i64gather_epi64((long long *)pos, idx, 1); - idx = _mm256_set1_epi64x((1ULL << (8*inlen)) - 1); - t = _mm256_and_si256(t, idx); - s[i] = _mm256_xor_si256(s[i], t); - } - - t = _mm256_set1_epi64x((uint64_t)p << 8*inlen); - s[i] = _mm256_xor_si256(s[i], t); - t = _mm256_set1_epi64x(1ULL << 63); - s[r/8 - 1] = _mm256_xor_si256(s[r/8 - 1], t); -} - -static void keccakx4_squeezeblocks(uint8_t *out0, - uint8_t *out1, - uint8_t *out2, - uint8_t *out3, - size_t nblocks, - unsigned int r, - __m256i s[25]) -{ - unsigned int i; - __m128d t; - - while(nblocks > 0) { - KeccakF1600_StatePermute4x(s); - for(i=0; i < r/8; ++i) { - t = _mm_castsi128_pd(_mm256_castsi256_si128(s[i])); - _mm_storel_pd((__attribute__((__may_alias__)) double *)&out0[8*i], t); - _mm_storeh_pd((__attribute__((__may_alias__)) double *)&out1[8*i], t); - t = _mm_castsi128_pd(_mm256_extracti128_si256(s[i],1)); - _mm_storel_pd((__attribute__((__may_alias__)) double *)&out2[8*i], t); - _mm_storeh_pd((__attribute__((__may_alias__)) double *)&out3[8*i], t); - } - - out0 += r; - out1 += r; - out2 += r; - out3 += r; - --nblocks; - } -} - -void shake128x4_absorb_once(keccakx4_state *state, - const uint8_t *in0, - const uint8_t *in1, - const uint8_t *in2, - const uint8_t *in3, - size_t inlen) -{ - keccakx4_absorb_once(state->s, SHAKE128_RATE, in0, in1, in2, in3, inlen, 0x1F); -} - -void shake128x4_squeezeblocks(uint8_t *out0, - uint8_t *out1, - uint8_t *out2, - uint8_t *out3, - size_t nblocks, - keccakx4_state *state) -{ - keccakx4_squeezeblocks(out0, out1, out2, out3, nblocks, SHAKE128_RATE, state->s); -} - -void shake256x4_absorb_once(keccakx4_state *state, - const uint8_t *in0, - const uint8_t *in1, - const uint8_t *in2, - const uint8_t *in3, - size_t inlen) -{ - keccakx4_absorb_once(state->s, SHAKE256_RATE, in0, in1, in2, in3, inlen, 0x1F); -} - -void shake256x4_squeezeblocks(uint8_t *out0, - uint8_t *out1, - uint8_t *out2, - uint8_t *out3, - size_t nblocks, - keccakx4_state *state) -{ - keccakx4_squeezeblocks(out0, out1, out2, out3, nblocks, SHAKE256_RATE, state->s); -} - -void shake128x4(uint8_t *out0, - uint8_t *out1, - uint8_t *out2, - uint8_t *out3, - size_t outlen, - const uint8_t *in0, - const uint8_t *in1, - const uint8_t *in2, - const uint8_t *in3, - size_t inlen) -{ - unsigned int i; - size_t nblocks = outlen/SHAKE128_RATE; - uint8_t t[4][SHAKE128_RATE]; - keccakx4_state state; - - shake128x4_absorb_once(&state, in0, in1, in2, in3, inlen); - shake128x4_squeezeblocks(out0, out1, out2, out3, nblocks, &state); - - out0 += nblocks*SHAKE128_RATE; - out1 += nblocks*SHAKE128_RATE; - out2 += nblocks*SHAKE128_RATE; - out3 += nblocks*SHAKE128_RATE; - outlen -= nblocks*SHAKE128_RATE; - - if(outlen) { - shake128x4_squeezeblocks(t[0], t[1], t[2], t[3], 1, &state); - for(i = 0; i < outlen; ++i) { - out0[i] = t[0][i]; - out1[i] = t[1][i]; - out2[i] = t[2][i]; - out3[i] = t[3][i]; - } - } -} - -void shake256x4(uint8_t *out0, - uint8_t *out1, - uint8_t *out2, - uint8_t *out3, - size_t outlen, - const uint8_t *in0, - const uint8_t *in1, - const uint8_t *in2, - const uint8_t *in3, - size_t inlen) -{ - unsigned int i; - size_t nblocks = outlen/SHAKE256_RATE; - uint8_t t[4][SHAKE256_RATE]; - keccakx4_state state; - - shake256x4_absorb_once(&state, in0, in1, in2, in3, inlen); - shake256x4_squeezeblocks(out0, out1, out2, out3, nblocks, &state); - - out0 += nblocks*SHAKE256_RATE; - out1 += nblocks*SHAKE256_RATE; - out2 += nblocks*SHAKE256_RATE; - out3 += nblocks*SHAKE256_RATE; - outlen -= nblocks*SHAKE256_RATE; - - if(outlen) { - shake256x4_squeezeblocks(t[0], t[1], t[2], t[3], 1, &state); - for(i = 0; i < outlen; ++i) { - out0[i] = t[0][i]; - out1[i] = t[1][i]; - out2[i] = t[2][i]; - out3[i] = t[3][i]; - } - } -} diff --git a/src/kem/kyber/pqcrystals-kyber_common_avx2/fips202x4.h b/src/kem/kyber/pqcrystals-kyber_common_avx2/fips202x4.h deleted file mode 100644 index f2121f3b3..000000000 --- a/src/kem/kyber/pqcrystals-kyber_common_avx2/fips202x4.h +++ /dev/null @@ -1,70 +0,0 @@ -#ifndef FIPS202X4_H -#define FIPS202X4_H - -#include -#include -#include - -#define FIPS202X4_NAMESPACE(s) pqcrystals_kyber_fips202x4_avx2_##s - -typedef struct { - __m256i s[25]; -} keccakx4_state; - -#define shake128x4_absorb_once FIPS202X4_NAMESPACE(shake128x4_absorb_once) -void shake128x4_absorb_once(keccakx4_state *state, - const uint8_t *in0, - const uint8_t *in1, - const uint8_t *in2, - const uint8_t *in3, - size_t inlen); - -#define shake128x4_squeezeblocks FIPS202X4_NAMESPACE(shake128x4_squeezeblocks) -void shake128x4_squeezeblocks(uint8_t *out0, - uint8_t *out1, - uint8_t *out2, - uint8_t *out3, - size_t nblocks, - keccakx4_state *state); - -#define shake256x4_absorb_once FIPS202X4_NAMESPACE(shake256x4_absorb_once) -void shake256x4_absorb_once(keccakx4_state *state, - const uint8_t *in0, - const uint8_t *in1, - const uint8_t *in2, - const uint8_t *in3, - size_t inlen); - -#define shake256x4_squeezeblocks FIPS202X4_NAMESPACE(shake256x4_squeezeblocks) -void shake256x4_squeezeblocks(uint8_t *out0, - uint8_t *out1, - uint8_t *out2, - uint8_t *out3, - size_t nblocks, - keccakx4_state *state); - -#define shake128x4 FIPS202X4_NAMESPACE(shake128x4) -void shake128x4(uint8_t *out0, - uint8_t *out1, - uint8_t *out2, - uint8_t *out3, - size_t outlen, - const uint8_t *in0, - const uint8_t *in1, - const uint8_t *in2, - const uint8_t *in3, - size_t inlen); - -#define shake256x4 FIPS202X4_NAMESPACE(shake256x4) -void shake256x4(uint8_t *out0, - uint8_t *out1, - uint8_t *out2, - uint8_t *out3, - size_t outlen, - const uint8_t *in0, - const uint8_t *in1, - const uint8_t *in2, - const uint8_t *in3, - size_t inlen); - -#endif diff --git a/src/kem/kyber/pqcrystals-kyber_common_keccak4x_avx2/fips202x4.h b/src/kem/kyber/pqcrystals-kyber_common_keccak4x_avx2/fips202x4.h deleted file mode 100644 index f2121f3b3..000000000 --- a/src/kem/kyber/pqcrystals-kyber_common_keccak4x_avx2/fips202x4.h +++ /dev/null @@ -1,70 +0,0 @@ -#ifndef FIPS202X4_H -#define FIPS202X4_H - -#include -#include -#include - -#define FIPS202X4_NAMESPACE(s) pqcrystals_kyber_fips202x4_avx2_##s - -typedef struct { - __m256i s[25]; -} keccakx4_state; - -#define shake128x4_absorb_once FIPS202X4_NAMESPACE(shake128x4_absorb_once) -void shake128x4_absorb_once(keccakx4_state *state, - const uint8_t *in0, - const uint8_t *in1, - const uint8_t *in2, - const uint8_t *in3, - size_t inlen); - -#define shake128x4_squeezeblocks FIPS202X4_NAMESPACE(shake128x4_squeezeblocks) -void shake128x4_squeezeblocks(uint8_t *out0, - uint8_t *out1, - uint8_t *out2, - uint8_t *out3, - size_t nblocks, - keccakx4_state *state); - -#define shake256x4_absorb_once FIPS202X4_NAMESPACE(shake256x4_absorb_once) -void shake256x4_absorb_once(keccakx4_state *state, - const uint8_t *in0, - const uint8_t *in1, - const uint8_t *in2, - const uint8_t *in3, - size_t inlen); - -#define shake256x4_squeezeblocks FIPS202X4_NAMESPACE(shake256x4_squeezeblocks) -void shake256x4_squeezeblocks(uint8_t *out0, - uint8_t *out1, - uint8_t *out2, - uint8_t *out3, - size_t nblocks, - keccakx4_state *state); - -#define shake128x4 FIPS202X4_NAMESPACE(shake128x4) -void shake128x4(uint8_t *out0, - uint8_t *out1, - uint8_t *out2, - uint8_t *out3, - size_t outlen, - const uint8_t *in0, - const uint8_t *in1, - const uint8_t *in2, - const uint8_t *in3, - size_t inlen); - -#define shake256x4 FIPS202X4_NAMESPACE(shake256x4) -void shake256x4(uint8_t *out0, - uint8_t *out1, - uint8_t *out2, - uint8_t *out3, - size_t outlen, - const uint8_t *in0, - const uint8_t *in1, - const uint8_t *in2, - const uint8_t *in3, - size_t inlen); - -#endif diff --git a/src/kem/kyber/pqcrystals-kyber_common_keccak4x_avx2/keccak4x/KeccakP-1600-times4-SIMD256.c b/src/kem/kyber/pqcrystals-kyber_common_keccak4x_avx2/keccak4x/KeccakP-1600-times4-SIMD256.c deleted file mode 100644 index 54c4a1ee5..000000000 --- a/src/kem/kyber/pqcrystals-kyber_common_keccak4x_avx2/keccak4x/KeccakP-1600-times4-SIMD256.c +++ /dev/null @@ -1,1030 +0,0 @@ -/* -Implementation by the Keccak, Keyak and Ketje Teams, namely, Guido Bertoni, -Joan Daemen, Michaƫl Peeters, Gilles Van Assche and Ronny Van Keer, hereby -denoted as "the implementer". - -For more information, feedback or questions, please refer to our websites: -http://keccak.noekeon.org/ -http://keyak.noekeon.org/ -http://ketje.noekeon.org/ - -To the extent possible under law, the implementer has waived all copyright -and related or neighboring rights to the source code in this file. -http://creativecommons.org/publicdomain/zero/1.0/ -*/ - -#include -#include -#include -#include -#include -#include -#include -#include "KeccakP-align.h" -#include "KeccakP-1600-times4-SnP.h" -#include "KeccakP-SIMD256-config.h" - -#include "KeccakP-brg_endian.h" -#if (PLATFORM_BYTE_ORDER != IS_LITTLE_ENDIAN) -#error Expecting a little-endian platform -#endif - -typedef unsigned char UINT8; -typedef unsigned long long int UINT64; -typedef __m128i V128; -typedef __m256i V256; - -#define laneIndex(instanceIndex, lanePosition) ((lanePosition)*4 + instanceIndex) - -#if defined(KeccakP1600times4_useAVX2) - #define ANDnu256(a, b) _mm256_andnot_si256(a, b) - #define CONST256(a) _mm256_load_si256((const V256 *)&(a)) - #define CONST256_64(a) (V256)_mm256_broadcast_sd((const double*)(&a)) - #define LOAD256(a) _mm256_load_si256((const V256 *)&(a)) - #define LOAD256u(a) _mm256_loadu_si256((const V256 *)&(a)) - #define LOAD4_64(a, b, c, d) _mm256_set_epi64x((UINT64)(a), (UINT64)(b), (UINT64)(c), (UINT64)(d)) - #define ROL64in256(d, a, o) d = _mm256_or_si256(_mm256_slli_epi64(a, o), _mm256_srli_epi64(a, 64-(o))) - #define ROL64in256_8(d, a) d = _mm256_shuffle_epi8(a, CONST256(rho8)) - #define ROL64in256_56(d, a) d = _mm256_shuffle_epi8(a, CONST256(rho56)) -static const UINT64 rho8[4] = {0x0605040302010007, 0x0E0D0C0B0A09080F, 0x1615141312111017, 0x1E1D1C1B1A19181F}; -static const UINT64 rho56[4] = {0x0007060504030201, 0x080F0E0D0C0B0A09, 0x1017161514131211, 0x181F1E1D1C1B1A19}; - #define STORE256(a, b) _mm256_store_si256((V256 *)&(a), b) - #define STORE256u(a, b) _mm256_storeu_si256((V256 *)&(a), b) - #define STORE2_128(ah, al, v) _mm256_storeu2_m128d((V128*)&(ah), (V128*)&(al), v) - #define XOR256(a, b) _mm256_xor_si256(a, b) - #define XOReq256(a, b) a = _mm256_xor_si256(a, b) - #define UNPACKL( a, b ) _mm256_unpacklo_epi64((a), (b)) - #define UNPACKH( a, b ) _mm256_unpackhi_epi64((a), (b)) - #define PERM128( a, b, c ) (V256)_mm256_permute2f128_ps((__m256)(a), (__m256)(b), c) - #define SHUFFLE64( a, b, c ) (V256)_mm256_shuffle_pd((__m256d)(a), (__m256d)(b), c) - - #define UNINTLEAVE() lanesL01 = UNPACKL( lanes0, lanes1 ), \ - lanesH01 = UNPACKH( lanes0, lanes1 ), \ - lanesL23 = UNPACKL( lanes2, lanes3 ), \ - lanesH23 = UNPACKH( lanes2, lanes3 ), \ - lanes0 = PERM128( lanesL01, lanesL23, 0x20 ), \ - lanes2 = PERM128( lanesL01, lanesL23, 0x31 ), \ - lanes1 = PERM128( lanesH01, lanesH23, 0x20 ), \ - lanes3 = PERM128( lanesH01, lanesH23, 0x31 ) - - #define INTLEAVE() lanesL01 = PERM128( lanes0, lanes2, 0x20 ), \ - lanesH01 = PERM128( lanes1, lanes3, 0x20 ), \ - lanesL23 = PERM128( lanes0, lanes2, 0x31 ), \ - lanesH23 = PERM128( lanes1, lanes3, 0x31 ), \ - lanes0 = SHUFFLE64( lanesL01, lanesH01, 0x00 ), \ - lanes1 = SHUFFLE64( lanesL01, lanesH01, 0x0F ), \ - lanes2 = SHUFFLE64( lanesL23, lanesH23, 0x00 ), \ - lanes3 = SHUFFLE64( lanesL23, lanesH23, 0x0F ) - -#endif - -#define SnP_laneLengthInBytes 8 - -void KeccakP1600times4_InitializeAll(void *states) -{ - memset(states, 0, KeccakP1600times4_statesSizeInBytes); -} - -void KeccakP1600times4_AddBytes(void *states, unsigned int instanceIndex, const unsigned char *data, unsigned int offset, unsigned int length) -{ - unsigned int sizeLeft = length; - unsigned int lanePosition = offset/SnP_laneLengthInBytes; - unsigned int offsetInLane = offset%SnP_laneLengthInBytes; - const unsigned char *curData = data; - UINT64 *statesAsLanes = (UINT64 *)states; - - if ((sizeLeft > 0) && (offsetInLane != 0)) { - unsigned int bytesInLane = SnP_laneLengthInBytes - offsetInLane; - UINT64 lane = 0; - if (bytesInLane > sizeLeft) - bytesInLane = sizeLeft; - memcpy((unsigned char*)&lane + offsetInLane, curData, bytesInLane); - statesAsLanes[laneIndex(instanceIndex, lanePosition)] ^= lane; - sizeLeft -= bytesInLane; - lanePosition++; - curData += bytesInLane; - } - - while(sizeLeft >= SnP_laneLengthInBytes) { - UINT64 lane = *((const UINT64*)curData); - statesAsLanes[laneIndex(instanceIndex, lanePosition)] ^= lane; - sizeLeft -= SnP_laneLengthInBytes; - lanePosition++; - curData += SnP_laneLengthInBytes; - } - - if (sizeLeft > 0) { - UINT64 lane = 0; - memcpy(&lane, curData, sizeLeft); - statesAsLanes[laneIndex(instanceIndex, lanePosition)] ^= lane; - } -} - -void KeccakP1600times4_AddLanesAll(void *states, const unsigned char *data, unsigned int laneCount, unsigned int laneOffset) -{ - V256 *stateAsLanes = (V256 *)states; - unsigned int i; - const UINT64 *curData0 = (const UINT64 *)data; - const UINT64 *curData1 = (const UINT64 *)(data+laneOffset*SnP_laneLengthInBytes); - const UINT64 *curData2 = (const UINT64 *)(data+laneOffset*2*SnP_laneLengthInBytes); - const UINT64 *curData3 = (const UINT64 *)(data+laneOffset*3*SnP_laneLengthInBytes); - V256 lanes0, lanes1, lanes2, lanes3, lanesL01, lanesL23, lanesH01, lanesH23; - - #define Xor_In( argIndex ) XOReq256(stateAsLanes[argIndex], LOAD4_64(curData3[argIndex], curData2[argIndex], curData1[argIndex], curData0[argIndex])) - - #define Xor_In4( argIndex ) lanes0 = LOAD256u( curData0[argIndex]),\ - lanes1 = LOAD256u( curData1[argIndex]),\ - lanes2 = LOAD256u( curData2[argIndex]),\ - lanes3 = LOAD256u( curData3[argIndex]),\ - INTLEAVE(),\ - XOReq256( stateAsLanes[argIndex+0], lanes0 ),\ - XOReq256( stateAsLanes[argIndex+1], lanes1 ),\ - XOReq256( stateAsLanes[argIndex+2], lanes2 ),\ - XOReq256( stateAsLanes[argIndex+3], lanes3 ) - - if ( laneCount >= 16 ) { - Xor_In4( 0 ); - Xor_In4( 4 ); - Xor_In4( 8 ); - Xor_In4( 12 ); - if ( laneCount >= 20 ) { - Xor_In4( 16 ); - for(i=20; i 0) && (offsetInLane != 0)) { - unsigned int bytesInLane = SnP_laneLengthInBytes - offsetInLane; - if (bytesInLane > sizeLeft) - bytesInLane = sizeLeft; - memcpy( ((unsigned char *)&statesAsLanes[laneIndex(instanceIndex, lanePosition)]) + offsetInLane, curData, bytesInLane); - sizeLeft -= bytesInLane; - lanePosition++; - curData += bytesInLane; - } - - while(sizeLeft >= SnP_laneLengthInBytes) { - UINT64 lane = *((const UINT64*)curData); - statesAsLanes[laneIndex(instanceIndex, lanePosition)] = lane; - sizeLeft -= SnP_laneLengthInBytes; - lanePosition++; - curData += SnP_laneLengthInBytes; - } - - if (sizeLeft > 0) { - memcpy(&statesAsLanes[laneIndex(instanceIndex, lanePosition)], curData, sizeLeft); - } -} - -void KeccakP1600times4_OverwriteLanesAll(void *states, const unsigned char *data, unsigned int laneCount, unsigned int laneOffset) -{ - V256 *stateAsLanes = (V256 *)states; - unsigned int i; - const UINT64 *curData0 = (const UINT64 *)data; - const UINT64 *curData1 = (const UINT64 *)(data+laneOffset*SnP_laneLengthInBytes); - const UINT64 *curData2 = (const UINT64 *)(data+laneOffset*2*SnP_laneLengthInBytes); - const UINT64 *curData3 = (const UINT64 *)(data+laneOffset*3*SnP_laneLengthInBytes); - V256 lanes0, lanes1, lanes2, lanes3, lanesL01, lanesL23, lanesH01, lanesH23; - - #define OverWr( argIndex ) STORE256(stateAsLanes[argIndex], LOAD4_64(curData3[argIndex], curData2[argIndex], curData1[argIndex], curData0[argIndex])) - - #define OverWr4( argIndex ) lanes0 = LOAD256u( curData0[argIndex]),\ - lanes1 = LOAD256u( curData1[argIndex]),\ - lanes2 = LOAD256u( curData2[argIndex]),\ - lanes3 = LOAD256u( curData3[argIndex]),\ - INTLEAVE(),\ - STORE256( stateAsLanes[argIndex+0], lanes0 ),\ - STORE256( stateAsLanes[argIndex+1], lanes1 ),\ - STORE256( stateAsLanes[argIndex+2], lanes2 ),\ - STORE256( stateAsLanes[argIndex+3], lanes3 ) - - if ( laneCount >= 16 ) { - OverWr4( 0 ); - OverWr4( 4 ); - OverWr4( 8 ); - OverWr4( 12 ); - if ( laneCount >= 20 ) { - OverWr4( 16 ); - for(i=20; i= SnP_laneLengthInBytes) { - statesAsLanes[laneIndex(instanceIndex, lanePosition)] = 0; - sizeLeft -= SnP_laneLengthInBytes; - lanePosition++; - } - - if (sizeLeft > 0) { - memset(&statesAsLanes[laneIndex(instanceIndex, lanePosition)], 0, sizeLeft); - } -} - -void KeccakP1600times4_ExtractBytes(const void *states, unsigned int instanceIndex, unsigned char *data, unsigned int offset, unsigned int length) -{ - unsigned int sizeLeft = length; - unsigned int lanePosition = offset/SnP_laneLengthInBytes; - unsigned int offsetInLane = offset%SnP_laneLengthInBytes; - unsigned char *curData = data; - const UINT64 *statesAsLanes = (const UINT64 *)states; - - if ((sizeLeft > 0) && (offsetInLane != 0)) { - unsigned int bytesInLane = SnP_laneLengthInBytes - offsetInLane; - if (bytesInLane > sizeLeft) - bytesInLane = sizeLeft; - memcpy( curData, ((unsigned char *)&statesAsLanes[laneIndex(instanceIndex, lanePosition)]) + offsetInLane, bytesInLane); - sizeLeft -= bytesInLane; - lanePosition++; - curData += bytesInLane; - } - - while(sizeLeft >= SnP_laneLengthInBytes) { - *(UINT64*)curData = statesAsLanes[laneIndex(instanceIndex, lanePosition)]; - sizeLeft -= SnP_laneLengthInBytes; - lanePosition++; - curData += SnP_laneLengthInBytes; - } - - if (sizeLeft > 0) { - memcpy( curData, &statesAsLanes[laneIndex(instanceIndex, lanePosition)], sizeLeft); - } -} - -void KeccakP1600times4_ExtractLanesAll(const void *states, unsigned char *data, unsigned int laneCount, unsigned int laneOffset) -{ - UINT64 *curData0 = (UINT64 *)data; - UINT64 *curData1 = (UINT64 *)(data+laneOffset*1*SnP_laneLengthInBytes); - UINT64 *curData2 = (UINT64 *)(data+laneOffset*2*SnP_laneLengthInBytes); - UINT64 *curData3 = (UINT64 *)(data+laneOffset*3*SnP_laneLengthInBytes); - - const V256 *stateAsLanes = (const V256 *)states; - const UINT64 *stateAsLanes64 = (const UINT64*)states; - V256 lanes0, lanes1, lanes2, lanes3, lanesL01, lanesL23, lanesH01, lanesH23; - unsigned int i; - - #define Extr( argIndex ) curData0[argIndex] = stateAsLanes64[4*(argIndex)], \ - curData1[argIndex] = stateAsLanes64[4*(argIndex)+1], \ - curData2[argIndex] = stateAsLanes64[4*(argIndex)+2], \ - curData3[argIndex] = stateAsLanes64[4*(argIndex)+3] - - #define Extr4( argIndex ) lanes0 = LOAD256( stateAsLanes[argIndex+0] ), \ - lanes1 = LOAD256( stateAsLanes[argIndex+1] ), \ - lanes2 = LOAD256( stateAsLanes[argIndex+2] ), \ - lanes3 = LOAD256( stateAsLanes[argIndex+3] ), \ - UNINTLEAVE(), \ - STORE256u( curData0[argIndex], lanes0 ), \ - STORE256u( curData1[argIndex], lanes1 ), \ - STORE256u( curData2[argIndex], lanes2 ), \ - STORE256u( curData3[argIndex], lanes3 ) - - if ( laneCount >= 16 ) { - Extr4( 0 ); - Extr4( 4 ); - Extr4( 8 ); - Extr4( 12 ); - if ( laneCount >= 20 ) { - Extr4( 16 ); - for(i=20; i 0) && (offsetInLane != 0)) { - unsigned int bytesInLane = SnP_laneLengthInBytes - offsetInLane; - UINT64 lane = statesAsLanes[laneIndex(instanceIndex, lanePosition)] >> (8 * offsetInLane); - if (bytesInLane > sizeLeft) - bytesInLane = sizeLeft; - sizeLeft -= bytesInLane; - do { - *(curOutput++) = *(curInput++) ^ (unsigned char)lane; - lane >>= 8; - } while ( --bytesInLane != 0); - lanePosition++; - } - - while(sizeLeft >= SnP_laneLengthInBytes) { - *((UINT64*)curOutput) = *((UINT64*)curInput) ^ statesAsLanes[laneIndex(instanceIndex, lanePosition)]; - sizeLeft -= SnP_laneLengthInBytes; - lanePosition++; - curInput += SnP_laneLengthInBytes; - curOutput += SnP_laneLengthInBytes; - } - - if (sizeLeft != 0) { - UINT64 lane = statesAsLanes[laneIndex(instanceIndex, lanePosition)]; - do { - *(curOutput++) = *(curInput++) ^ (unsigned char)lane; - lane >>= 8; - } while ( --sizeLeft != 0); - } -} - -void KeccakP1600times4_ExtractAndAddLanesAll(const void *states, const unsigned char *input, unsigned char *output, unsigned int laneCount, unsigned int laneOffset) -{ - const UINT64 *curInput0 = (UINT64 *)input; - const UINT64 *curInput1 = (UINT64 *)(input+laneOffset*1*SnP_laneLengthInBytes); - const UINT64 *curInput2 = (UINT64 *)(input+laneOffset*2*SnP_laneLengthInBytes); - const UINT64 *curInput3 = (UINT64 *)(input+laneOffset*3*SnP_laneLengthInBytes); - UINT64 *curOutput0 = (UINT64 *)output; - UINT64 *curOutput1 = (UINT64 *)(output+laneOffset*1*SnP_laneLengthInBytes); - UINT64 *curOutput2 = (UINT64 *)(output+laneOffset*2*SnP_laneLengthInBytes); - UINT64 *curOutput3 = (UINT64 *)(output+laneOffset*3*SnP_laneLengthInBytes); - - const V256 *stateAsLanes = (const V256 *)states; - const UINT64 *stateAsLanes64 = (const UINT64*)states; - V256 lanes0, lanes1, lanes2, lanes3, lanesL01, lanesL23, lanesH01, lanesH23; - unsigned int i; - - #define ExtrXor( argIndex ) \ - curOutput0[argIndex] = curInput0[argIndex] ^ stateAsLanes64[4*(argIndex)],\ - curOutput1[argIndex] = curInput1[argIndex] ^ stateAsLanes64[4*(argIndex)+1],\ - curOutput2[argIndex] = curInput2[argIndex] ^ stateAsLanes64[4*(argIndex)+2],\ - curOutput3[argIndex] = curInput3[argIndex] ^ stateAsLanes64[4*(argIndex)+3] - - #define ExtrXor4( argIndex ) \ - lanes0 = LOAD256( stateAsLanes[argIndex+0] ),\ - lanes1 = LOAD256( stateAsLanes[argIndex+1] ),\ - lanes2 = LOAD256( stateAsLanes[argIndex+2] ),\ - lanes3 = LOAD256( stateAsLanes[argIndex+3] ),\ - UNINTLEAVE(),\ - lanesL01 = LOAD256u( curInput0[argIndex]),\ - lanesH01 = LOAD256u( curInput1[argIndex]),\ - lanesL23 = LOAD256u( curInput2[argIndex]),\ - lanesH23 = LOAD256u( curInput3[argIndex]),\ - XOReq256( lanes0, lanesL01 ),\ - XOReq256( lanes1, lanesH01 ),\ - XOReq256( lanes2, lanesL23 ),\ - XOReq256( lanes3, lanesH23 ),\ - STORE256u( curOutput0[argIndex], lanes0 ),\ - STORE256u( curOutput1[argIndex], lanes1 ),\ - STORE256u( curOutput2[argIndex], lanes2 ),\ - STORE256u( curOutput3[argIndex], lanes3 ) - - if ( laneCount >= 16 ) { - ExtrXor4( 0 ); - ExtrXor4( 4 ); - ExtrXor4( 8 ); - ExtrXor4( 12 ); - if ( laneCount >= 20 ) { - ExtrXor4( 16 ); - for(i=20; i= (laneOffsetParallel*3 + laneCount)*8) { - V256 *stateAsLanes = (V256 *)states; - V256 lanes0, lanes1, lanes2, lanes3, lanesL01, lanesL23, lanesH01, lanesH23; - #define Xor_In( argIndex ) \ - XOReq256(stateAsLanes[argIndex], LOAD4_64(curData3[argIndex], curData2[argIndex], curData1[argIndex], curData0[argIndex])) - #define Xor_In4( argIndex ) \ - lanes0 = LOAD256u( curData0[argIndex]),\ - lanes1 = LOAD256u( curData1[argIndex]),\ - lanes2 = LOAD256u( curData2[argIndex]),\ - lanes3 = LOAD256u( curData3[argIndex]),\ - INTLEAVE(),\ - XOReq256( stateAsLanes[argIndex+0], lanes0 ),\ - XOReq256( stateAsLanes[argIndex+1], lanes1 ),\ - XOReq256( stateAsLanes[argIndex+2], lanes2 ),\ - XOReq256( stateAsLanes[argIndex+3], lanes3 ) - Xor_In4( 0 ); - Xor_In4( 4 ); - Xor_In4( 8 ); - Xor_In4( 12 ); - Xor_In4( 16 ); - Xor_In( 20 ); - #undef Xor_In - #undef Xor_In4 - KeccakP1600times4_PermuteAll_24rounds(states); - curData0 += laneOffsetSerial; - curData1 += laneOffsetSerial; - curData2 += laneOffsetSerial; - curData3 += laneOffsetSerial; - dataByteLen -= laneOffsetSerial*8; - } - return (const unsigned char *)curData0 - dataStart; -#else -// unsigned int i; - const unsigned char *dataStart = data; - const UINT64 *curData0 = (const UINT64 *)data; - const UINT64 *curData1 = (const UINT64 *)(data+laneOffsetParallel*1*SnP_laneLengthInBytes); - const UINT64 *curData2 = (const UINT64 *)(data+laneOffsetParallel*2*SnP_laneLengthInBytes); - const UINT64 *curData3 = (const UINT64 *)(data+laneOffsetParallel*3*SnP_laneLengthInBytes); - V256 *statesAsLanes = (V256 *)states; - declareABCDE - - copyFromState(A, statesAsLanes) - while(dataByteLen >= (laneOffsetParallel*3 + laneCount)*8) { - #define XOR_In( Xxx, argIndex ) \ - XOReq256(Xxx, LOAD4_64(curData3[argIndex], curData2[argIndex], curData1[argIndex], curData0[argIndex])) - XOR_In( Aba, 0 ); - XOR_In( Abe, 1 ); - XOR_In( Abi, 2 ); - XOR_In( Abo, 3 ); - XOR_In( Abu, 4 ); - XOR_In( Aga, 5 ); - XOR_In( Age, 6 ); - XOR_In( Agi, 7 ); - XOR_In( Ago, 8 ); - XOR_In( Agu, 9 ); - XOR_In( Aka, 10 ); - XOR_In( Ake, 11 ); - XOR_In( Aki, 12 ); - XOR_In( Ako, 13 ); - XOR_In( Aku, 14 ); - XOR_In( Ama, 15 ); - XOR_In( Ame, 16 ); - XOR_In( Ami, 17 ); - XOR_In( Amo, 18 ); - XOR_In( Amu, 19 ); - XOR_In( Asa, 20 ); - #undef XOR_In - rounds24 - curData0 += laneOffsetSerial; - curData1 += laneOffsetSerial; - curData2 += laneOffsetSerial; - curData3 += laneOffsetSerial; - dataByteLen -= laneOffsetSerial*8; - } - copyToState(statesAsLanes, A) - return (const unsigned char *)curData0 - dataStart; -#endif - } - else { -// unsigned int i; - const unsigned char *dataStart = data; - - while(dataByteLen >= (laneOffsetParallel*3 + laneCount)*8) { - KeccakP1600times4_AddLanesAll(states, data, laneCount, laneOffsetParallel); - KeccakP1600times4_PermuteAll_24rounds(states); - data += laneOffsetSerial*8; - dataByteLen -= laneOffsetSerial*8; - } - return data - dataStart; - } -} - -size_t KeccakP1600times4_12rounds_FastLoop_Absorb(void *states, unsigned int laneCount, unsigned int laneOffsetParallel, unsigned int laneOffsetSerial, const unsigned char *data, size_t dataByteLen) -{ - if (laneCount == 21) { -#if 0 - const unsigned char *dataStart = data; - const UINT64 *curData0 = (const UINT64 *)data; - const UINT64 *curData1 = (const UINT64 *)(data+laneOffsetParallel*1*SnP_laneLengthInBytes); - const UINT64 *curData2 = (const UINT64 *)(data+laneOffsetParallel*2*SnP_laneLengthInBytes); - const UINT64 *curData3 = (const UINT64 *)(data+laneOffsetParallel*3*SnP_laneLengthInBytes); - - while(dataByteLen >= (laneOffsetParallel*3 + laneCount)*8) { - V256 *stateAsLanes = states; - V256 lanes0, lanes1, lanes2, lanes3, lanesL01, lanesL23, lanesH01, lanesH23; - #define Xor_In( argIndex ) \ - XOReq256(stateAsLanes[argIndex], LOAD4_64(curData3[argIndex], curData2[argIndex], curData1[argIndex], curData0[argIndex])) - #define Xor_In4( argIndex ) \ - lanes0 = LOAD256u( curData0[argIndex]),\ - lanes1 = LOAD256u( curData1[argIndex]),\ - lanes2 = LOAD256u( curData2[argIndex]),\ - lanes3 = LOAD256u( curData3[argIndex]),\ - INTLEAVE(),\ - XOReq256( stateAsLanes[argIndex+0], lanes0 ),\ - XOReq256( stateAsLanes[argIndex+1], lanes1 ),\ - XOReq256( stateAsLanes[argIndex+2], lanes2 ),\ - XOReq256( stateAsLanes[argIndex+3], lanes3 ) - Xor_In4( 0 ); - Xor_In4( 4 ); - Xor_In4( 8 ); - Xor_In4( 12 ); - Xor_In4( 16 ); - Xor_In( 20 ); - #undef Xor_In - #undef Xor_In4 - KeccakP1600times4_PermuteAll_12rounds(states); - curData0 += laneOffsetSerial; - curData1 += laneOffsetSerial; - curData2 += laneOffsetSerial; - curData3 += laneOffsetSerial; - dataByteLen -= laneOffsetSerial*8; - } - return (const unsigned char *)curData0 - dataStart; -#else -// unsigned int i; - const unsigned char *dataStart = data; - const UINT64 *curData0 = (const UINT64 *)data; - const UINT64 *curData1 = (const UINT64 *)(data+laneOffsetParallel*1*SnP_laneLengthInBytes); - const UINT64 *curData2 = (const UINT64 *)(data+laneOffsetParallel*2*SnP_laneLengthInBytes); - const UINT64 *curData3 = (const UINT64 *)(data+laneOffsetParallel*3*SnP_laneLengthInBytes); - V256 *statesAsLanes = states; - declareABCDE - - copyFromState(A, statesAsLanes) - while(dataByteLen >= (laneOffsetParallel*3 + laneCount)*8) { - #define XOR_In( Xxx, argIndex ) \ - XOReq256(Xxx, LOAD4_64(curData3[argIndex], curData2[argIndex], curData1[argIndex], curData0[argIndex])) - XOR_In( Aba, 0 ); - XOR_In( Abe, 1 ); - XOR_In( Abi, 2 ); - XOR_In( Abo, 3 ); - XOR_In( Abu, 4 ); - XOR_In( Aga, 5 ); - XOR_In( Age, 6 ); - XOR_In( Agi, 7 ); - XOR_In( Ago, 8 ); - XOR_In( Agu, 9 ); - XOR_In( Aka, 10 ); - XOR_In( Ake, 11 ); - XOR_In( Aki, 12 ); - XOR_In( Ako, 13 ); - XOR_In( Aku, 14 ); - XOR_In( Ama, 15 ); - XOR_In( Ame, 16 ); - XOR_In( Ami, 17 ); - XOR_In( Amo, 18 ); - XOR_In( Amu, 19 ); - XOR_In( Asa, 20 ); - #undef XOR_In - rounds12 - curData0 += laneOffsetSerial; - curData1 += laneOffsetSerial; - curData2 += laneOffsetSerial; - curData3 += laneOffsetSerial; - dataByteLen -= laneOffsetSerial*8; - } - copyToState(statesAsLanes, A) - return (const unsigned char *)curData0 - dataStart; -#endif - } - else { -// unsigned int i; - const unsigned char *dataStart = data; - - while(dataByteLen >= (laneOffsetParallel*3 + laneCount)*8) { - KeccakP1600times4_AddLanesAll(states, data, laneCount, laneOffsetParallel); - KeccakP1600times4_PermuteAll_12rounds(states); - data += laneOffsetSerial*8; - dataByteLen -= laneOffsetSerial*8; - } - return data - dataStart; - } -} diff --git a/src/kem/kyber/pqcrystals-kyber_common_keccak4x_avx2/keccak4x/KeccakP-1600-times4-SnP.h b/src/kem/kyber/pqcrystals-kyber_common_keccak4x_avx2/keccak4x/KeccakP-1600-times4-SnP.h deleted file mode 100644 index 4cc5d6a26..000000000 --- a/src/kem/kyber/pqcrystals-kyber_common_keccak4x_avx2/keccak4x/KeccakP-1600-times4-SnP.h +++ /dev/null @@ -1,65 +0,0 @@ -/* -Implementation by the Keccak, Keyak and Ketje Teams, namely, Guido Bertoni, -Joan Daemen, Michaƫl Peeters, Gilles Van Assche and Ronny Van Keer, hereby -denoted as "the implementer". - -For more information, feedback or questions, please refer to our websites: -http://keccak.noekeon.org/ -http://keyak.noekeon.org/ -http://ketje.noekeon.org/ - -To the extent possible under law, the implementer has waived all copyright -and related or neighboring rights to the source code in this file. -http://creativecommons.org/publicdomain/zero/1.0/ -*/ - -#ifndef _KeccakP_1600_times4_SnP_h_ -#define _KeccakP_1600_times4_SnP_h_ - -/** For the documentation, see PlSnP-documentation.h. - */ - -#include "KeccakP-SIMD256-config.h" -#include "../fips202x4.h" - -#define KeccakP1600times4_implementation "256-bit SIMD implementation (" KeccakP1600times4_implementation_config ")" -#define KeccakP1600times4_statesSizeInBytes 800 -#define KeccakP1600times4_statesAlignment 32 -#define KeccakF1600times4_FastLoop_supported -#define KeccakP1600times4_12rounds_FastLoop_supported - -#include - -#define KeccakP1600times4_StaticInitialize() -#define KeccakP1600times4_InitializeAll FIPS202X4_NAMESPACE(KeccakP1600times4_InitializeAll) -void KeccakP1600times4_InitializeAll(void *states); -#define KeccakP1600times4_AddByte(states, instanceIndex, byte, offset) \ - ((unsigned char*)(states))[(instanceIndex)*8 + ((offset)/8)*4*8 + (offset)%8] ^= (byte) -#define KeccakP1600times4_AddBytes FIPS202X4_NAMESPACE(KeccakP1600times4_AddBytes) -void KeccakP1600times4_AddBytes(void *states, unsigned int instanceIndex, const unsigned char *data, unsigned int offset, unsigned int length); -#define KeccakP1600times4_AddLanesAll FIPS202X4_NAMESPACE(KeccakP1600times4_AddLanesAll) -void KeccakP1600times4_AddLanesAll(void *states, const unsigned char *data, unsigned int laneCount, unsigned int laneOffset); -#define KeccakP1600times4_OverwriteBytes FIPS202X4_NAMESPACE(KeccakP1600times4_OverwriteBytes) -void KeccakP1600times4_OverwriteBytes(void *states, unsigned int instanceIndex, const unsigned char *data, unsigned int offset, unsigned int length); -#define KeccakP1600times4_OverwriteLanesAll FIPS202X4_NAMESPACE(KeccakP1600times4_OverwriteLanesAll) -void KeccakP1600times4_OverwriteLanesAll(void *states, const unsigned char *data, unsigned int laneCount, unsigned int laneOffset); -#define KeccakP1600times4_OverwriteWithZeroes FIPS202X4_NAMESPACE(KeccakP1600times4_OverwriteWithZeroes) -void KeccakP1600times4_OverwriteWithZeroes(void *states, unsigned int instanceIndex, unsigned int byteCount); -#define KeccakP1600times4_PermuteAll_12rounds FIPS202X4_NAMESPACE(KeccakP1600times4_PermuteAll_12rounds) -void KeccakP1600times4_PermuteAll_12rounds(void *states); -#define KeccakP1600times4_PermuteAll_24rounds FIPS202X4_NAMESPACE(KeccakP1600times4_PermuteAll_24rounds) -void KeccakP1600times4_PermuteAll_24rounds(void *states); -#define KeccakP1600times4_ExtractBytes FIPS202X4_NAMESPACE(KeccakP1600times4_ExtractBytes) -void KeccakP1600times4_ExtractBytes(const void *states, unsigned int instanceIndex, unsigned char *data, unsigned int offset, unsigned int length); -#define KeccakP1600times4_ExtractLanesAll FIPS202X4_NAMESPACE(KeccakP1600times4_ExtractLanesAll) -void KeccakP1600times4_ExtractLanesAll(const void *states, unsigned char *data, unsigned int laneCount, unsigned int laneOffset); -#define KeccakP1600times4_ExtractAndAddBytes FIPS202X4_NAMESPACE(KeccakP1600times4_ExtractAndAddBytes) -void KeccakP1600times4_ExtractAndAddBytes(const void *states, unsigned int instanceIndex, const unsigned char *input, unsigned char *output, unsigned int offset, unsigned int length); -#define KeccakP1600times4_ExtractAndAddLanesAll FIPS202X4_NAMESPACE(KeccakP1600times4_ExtractAndAddLanesAll) -void KeccakP1600times4_ExtractAndAddLanesAll(const void *states, const unsigned char *input, unsigned char *output, unsigned int laneCount, unsigned int laneOffset); -#define KeccakF1600times4_FastLoop_Absorb FIPS202X4_NAMESPACE(KeccakF1600times4_FastLoop_Absorb) -size_t KeccakF1600times4_FastLoop_Absorb(void *states, unsigned int laneCount, unsigned int laneOffsetParallel, unsigned int laneOffsetSerial, const unsigned char *data, size_t dataByteLen); -#define KeccakP1600times4_12rounds_FastLoop_Absorb FIPS202X4_NAMESPACE(KeccakP1600times4_12rounds_FastLoop_Absorb) -size_t KeccakP1600times4_12rounds_FastLoop_Absorb(void *states, unsigned int laneCount, unsigned int laneOffsetParallel, unsigned int laneOffsetSerial, const unsigned char *data, size_t dataByteLen); - -#endif diff --git a/src/kem/kyber/pqcrystals-kyber_common_keccak4x_avx2/keccak4x/KeccakP-1600-unrolling.macros b/src/kem/kyber/pqcrystals-kyber_common_keccak4x_avx2/keccak4x/KeccakP-1600-unrolling.macros deleted file mode 100644 index 3180bb063..000000000 --- a/src/kem/kyber/pqcrystals-kyber_common_keccak4x_avx2/keccak4x/KeccakP-1600-unrolling.macros +++ /dev/null @@ -1,198 +0,0 @@ -/* -Implementation by the Keccak, Keyak and Ketje Teams, namely, Guido Bertoni, -Joan Daemen, Michaƫl Peeters, Gilles Van Assche and Ronny Van Keer, hereby -denoted as "the implementer". - -For more information, feedback or questions, please refer to our websites: -http://keccak.noekeon.org/ -http://keyak.noekeon.org/ -http://ketje.noekeon.org/ - -To the extent possible under law, the implementer has waived all copyright -and related or neighboring rights to the source code in this file. -http://creativecommons.org/publicdomain/zero/1.0/ -*/ - -#if (defined(FullUnrolling)) -#define rounds24 \ - prepareTheta \ - thetaRhoPiChiIotaPrepareTheta( 0, A, E) \ - thetaRhoPiChiIotaPrepareTheta( 1, E, A) \ - thetaRhoPiChiIotaPrepareTheta( 2, A, E) \ - thetaRhoPiChiIotaPrepareTheta( 3, E, A) \ - thetaRhoPiChiIotaPrepareTheta( 4, A, E) \ - thetaRhoPiChiIotaPrepareTheta( 5, E, A) \ - thetaRhoPiChiIotaPrepareTheta( 6, A, E) \ - thetaRhoPiChiIotaPrepareTheta( 7, E, A) \ - thetaRhoPiChiIotaPrepareTheta( 8, A, E) \ - thetaRhoPiChiIotaPrepareTheta( 9, E, A) \ - thetaRhoPiChiIotaPrepareTheta(10, A, E) \ - thetaRhoPiChiIotaPrepareTheta(11, E, A) \ - thetaRhoPiChiIotaPrepareTheta(12, A, E) \ - thetaRhoPiChiIotaPrepareTheta(13, E, A) \ - thetaRhoPiChiIotaPrepareTheta(14, A, E) \ - thetaRhoPiChiIotaPrepareTheta(15, E, A) \ - thetaRhoPiChiIotaPrepareTheta(16, A, E) \ - thetaRhoPiChiIotaPrepareTheta(17, E, A) \ - thetaRhoPiChiIotaPrepareTheta(18, A, E) \ - thetaRhoPiChiIotaPrepareTheta(19, E, A) \ - thetaRhoPiChiIotaPrepareTheta(20, A, E) \ - thetaRhoPiChiIotaPrepareTheta(21, E, A) \ - thetaRhoPiChiIotaPrepareTheta(22, A, E) \ - thetaRhoPiChiIota(23, E, A) \ - -#define rounds12 \ - prepareTheta \ - thetaRhoPiChiIotaPrepareTheta(12, A, E) \ - thetaRhoPiChiIotaPrepareTheta(13, E, A) \ - thetaRhoPiChiIotaPrepareTheta(14, A, E) \ - thetaRhoPiChiIotaPrepareTheta(15, E, A) \ - thetaRhoPiChiIotaPrepareTheta(16, A, E) \ - thetaRhoPiChiIotaPrepareTheta(17, E, A) \ - thetaRhoPiChiIotaPrepareTheta(18, A, E) \ - thetaRhoPiChiIotaPrepareTheta(19, E, A) \ - thetaRhoPiChiIotaPrepareTheta(20, A, E) \ - thetaRhoPiChiIotaPrepareTheta(21, E, A) \ - thetaRhoPiChiIotaPrepareTheta(22, A, E) \ - thetaRhoPiChiIota(23, E, A) \ - -#elif (Unrolling == 12) -#define rounds24 \ - prepareTheta \ - for(i=0; i<24; i+=12) { \ - thetaRhoPiChiIotaPrepareTheta(i , A, E) \ - thetaRhoPiChiIotaPrepareTheta(i+ 1, E, A) \ - thetaRhoPiChiIotaPrepareTheta(i+ 2, A, E) \ - thetaRhoPiChiIotaPrepareTheta(i+ 3, E, A) \ - thetaRhoPiChiIotaPrepareTheta(i+ 4, A, E) \ - thetaRhoPiChiIotaPrepareTheta(i+ 5, E, A) \ - thetaRhoPiChiIotaPrepareTheta(i+ 6, A, E) \ - thetaRhoPiChiIotaPrepareTheta(i+ 7, E, A) \ - thetaRhoPiChiIotaPrepareTheta(i+ 8, A, E) \ - thetaRhoPiChiIotaPrepareTheta(i+ 9, E, A) \ - thetaRhoPiChiIotaPrepareTheta(i+10, A, E) \ - thetaRhoPiChiIotaPrepareTheta(i+11, E, A) \ - } \ - -#define rounds12 \ - prepareTheta \ - thetaRhoPiChiIotaPrepareTheta(12, A, E) \ - thetaRhoPiChiIotaPrepareTheta(13, E, A) \ - thetaRhoPiChiIotaPrepareTheta(14, A, E) \ - thetaRhoPiChiIotaPrepareTheta(15, E, A) \ - thetaRhoPiChiIotaPrepareTheta(16, A, E) \ - thetaRhoPiChiIotaPrepareTheta(17, E, A) \ - thetaRhoPiChiIotaPrepareTheta(18, A, E) \ - thetaRhoPiChiIotaPrepareTheta(19, E, A) \ - thetaRhoPiChiIotaPrepareTheta(20, A, E) \ - thetaRhoPiChiIotaPrepareTheta(21, E, A) \ - thetaRhoPiChiIotaPrepareTheta(22, A, E) \ - thetaRhoPiChiIota(23, E, A) \ - -#elif (Unrolling == 6) -#define rounds24 \ - prepareTheta \ - for(i=0; i<24; i+=6) { \ - thetaRhoPiChiIotaPrepareTheta(i , A, E) \ - thetaRhoPiChiIotaPrepareTheta(i+1, E, A) \ - thetaRhoPiChiIotaPrepareTheta(i+2, A, E) \ - thetaRhoPiChiIotaPrepareTheta(i+3, E, A) \ - thetaRhoPiChiIotaPrepareTheta(i+4, A, E) \ - thetaRhoPiChiIotaPrepareTheta(i+5, E, A) \ - } \ - -#define rounds12 \ - prepareTheta \ - for(i=12; i<24; i+=6) { \ - thetaRhoPiChiIotaPrepareTheta(i , A, E) \ - thetaRhoPiChiIotaPrepareTheta(i+1, E, A) \ - thetaRhoPiChiIotaPrepareTheta(i+2, A, E) \ - thetaRhoPiChiIotaPrepareTheta(i+3, E, A) \ - thetaRhoPiChiIotaPrepareTheta(i+4, A, E) \ - thetaRhoPiChiIotaPrepareTheta(i+5, E, A) \ - } \ - -#elif (Unrolling == 4) -#define rounds24 \ - prepareTheta \ - for(i=0; i<24; i+=4) { \ - thetaRhoPiChiIotaPrepareTheta(i , A, E) \ - thetaRhoPiChiIotaPrepareTheta(i+1, E, A) \ - thetaRhoPiChiIotaPrepareTheta(i+2, A, E) \ - thetaRhoPiChiIotaPrepareTheta(i+3, E, A) \ - } \ - -#define rounds12 \ - prepareTheta \ - for(i=12; i<24; i+=4) { \ - thetaRhoPiChiIotaPrepareTheta(i , A, E) \ - thetaRhoPiChiIotaPrepareTheta(i+1, E, A) \ - thetaRhoPiChiIotaPrepareTheta(i+2, A, E) \ - thetaRhoPiChiIotaPrepareTheta(i+3, E, A) \ - } \ - -#elif (Unrolling == 3) -#define rounds24 \ - prepareTheta \ - for(i=0; i<24; i+=3) { \ - thetaRhoPiChiIotaPrepareTheta(i , A, E) \ - thetaRhoPiChiIotaPrepareTheta(i+1, E, A) \ - thetaRhoPiChiIotaPrepareTheta(i+2, A, E) \ - copyStateVariables(A, E) \ - } \ - -#define rounds12 \ - prepareTheta \ - for(i=12; i<24; i+=3) { \ - thetaRhoPiChiIotaPrepareTheta(i , A, E) \ - thetaRhoPiChiIotaPrepareTheta(i+1, E, A) \ - thetaRhoPiChiIotaPrepareTheta(i+2, A, E) \ - copyStateVariables(A, E) \ - } \ - -#elif (Unrolling == 2) -#define rounds24 \ - prepareTheta \ - for(i=0; i<24; i+=2) { \ - thetaRhoPiChiIotaPrepareTheta(i , A, E) \ - thetaRhoPiChiIotaPrepareTheta(i+1, E, A) \ - } \ - -#define rounds12 \ - prepareTheta \ - for(i=12; i<24; i+=2) { \ - thetaRhoPiChiIotaPrepareTheta(i , A, E) \ - thetaRhoPiChiIotaPrepareTheta(i+1, E, A) \ - } \ - -#elif (Unrolling == 1) -#define rounds24 \ - prepareTheta \ - for(i=0; i<24; i++) { \ - thetaRhoPiChiIotaPrepareTheta(i , A, E) \ - copyStateVariables(A, E) \ - } \ - -#define rounds12 \ - prepareTheta \ - for(i=12; i<24; i++) { \ - thetaRhoPiChiIotaPrepareTheta(i , A, E) \ - copyStateVariables(A, E) \ - } \ - -#else -#error "Unrolling is not correctly specified!" -#endif - -#define roundsN(__nrounds) \ - prepareTheta \ - i = 24 - (__nrounds); \ - if ((i&1) != 0) { \ - thetaRhoPiChiIotaPrepareTheta(i, A, E) \ - copyStateVariables(A, E) \ - ++i; \ - } \ - for( /* empty */; i<24; i+=2) { \ - thetaRhoPiChiIotaPrepareTheta(i , A, E) \ - thetaRhoPiChiIotaPrepareTheta(i+1, E, A) \ - } diff --git a/src/kem/kyber/pqcrystals-kyber_common_keccak4x_avx2/keccak4x/KeccakP-SIMD256-config.h b/src/kem/kyber/pqcrystals-kyber_common_keccak4x_avx2/keccak4x/KeccakP-SIMD256-config.h deleted file mode 100644 index 1c65fe29b..000000000 --- a/src/kem/kyber/pqcrystals-kyber_common_keccak4x_avx2/keccak4x/KeccakP-SIMD256-config.h +++ /dev/null @@ -1,3 +0,0 @@ -#define KeccakP1600times4_implementation_config "AVX2, all rounds unrolled" -#define KeccakP1600times4_fullUnrolling -#define KeccakP1600times4_useAVX2 diff --git a/src/kem/kyber/pqcrystals-kyber_common_keccak4x_avx2/keccak4x/KeccakP-align.h b/src/kem/kyber/pqcrystals-kyber_common_keccak4x_avx2/keccak4x/KeccakP-align.h deleted file mode 100644 index 9ee95ceeb..000000000 --- a/src/kem/kyber/pqcrystals-kyber_common_keccak4x_avx2/keccak4x/KeccakP-align.h +++ /dev/null @@ -1,34 +0,0 @@ -/* -Implementation by the Keccak, Keyak and Ketje Teams, namely, Guido Bertoni, -Joan Daemen, Michaƫl Peeters, Gilles Van Assche and Ronny Van Keer, hereby -denoted as "the implementer". - -For more information, feedback or questions, please refer to our websites: -http://keccak.noekeon.org/ -http://keyak.noekeon.org/ -http://ketje.noekeon.org/ - -To the extent possible under law, the implementer has waived all copyright -and related or neighboring rights to the source code in this file. -http://creativecommons.org/publicdomain/zero/1.0/ -*/ - -#ifndef _keccakp_align_h_ -#define _keccakp_align_h_ - -/* on Mac OS-X and possibly others, ALIGN(x) is defined in param.h, and -Werror chokes on the redef. */ -#ifdef ALIGN -#undef ALIGN -#endif - -#if defined(__GNUC__) -#define ALIGN(x) __attribute__ ((aligned(x))) -#elif defined(_MSC_VER) -#define ALIGN(x) __declspec(align(x)) -#elif defined(__ARMCC_VERSION) -#define ALIGN(x) __align(x) -#else -#define ALIGN(x) -#endif - -#endif diff --git a/src/kem/kyber/pqcrystals-kyber_common_keccak4x_avx2/keccak4x/KeccakP-brg_endian.h b/src/kem/kyber/pqcrystals-kyber_common_keccak4x_avx2/keccak4x/KeccakP-brg_endian.h deleted file mode 100644 index f04027a1f..000000000 --- a/src/kem/kyber/pqcrystals-kyber_common_keccak4x_avx2/keccak4x/KeccakP-brg_endian.h +++ /dev/null @@ -1,142 +0,0 @@ -/* - --------------------------------------------------------------------------- - Copyright (c) 1998-2008, Brian Gladman, Worcester, UK. All rights reserved. - - LICENSE TERMS - - The redistribution and use of this software (with or without changes) - is allowed without the payment of fees or royalties provided that: - - 1. source code distributions include the above copyright notice, this - list of conditions and the following disclaimer; - - 2. binary distributions include the above copyright notice, this list - of conditions and the following disclaimer in their documentation; - - 3. the name of the copyright holder is not used to endorse products - built using this software without specific written permission. - - DISCLAIMER - - This software is provided 'as is' with no explicit or implied warranties - in respect of its properties, including, but not limited to, correctness - and/or fitness for purpose. - --------------------------------------------------------------------------- - Issue Date: 20/12/2007 - Changes for ARM 9/9/2010 -*/ - -#ifndef _KECCAKP_BRG_ENDIAN_H -#define _KECCAKP_BRG_ENDIAN_H - -#define IS_BIG_ENDIAN 4321 /* byte 0 is most significant (mc68k) */ -#define IS_LITTLE_ENDIAN 1234 /* byte 0 is least significant (i386) */ - -#if 0 -/* Include files where endian defines and byteswap functions may reside */ -#if defined( __sun ) -# include -#elif defined( __FreeBSD__ ) || defined( __OpenBSD__ ) || defined( __NetBSD__ ) -# include -#elif defined( BSD ) && ( BSD >= 199103 ) || defined( __APPLE__ ) || \ - defined( __CYGWIN32__ ) || defined( __DJGPP__ ) || defined( __osf__ ) -# include -#elif defined( __linux__ ) || defined( __GNUC__ ) || defined( __GNU_LIBRARY__ ) -# if !defined( __MINGW32__ ) && !defined( _AIX ) -# include -# if !defined( __BEOS__ ) -# include -# endif -# endif -#endif -#endif - -/* Now attempt to set the define for platform byte order using any */ -/* of the four forms SYMBOL, _SYMBOL, __SYMBOL & __SYMBOL__, which */ -/* seem to encompass most endian symbol definitions */ - -#if defined( BIG_ENDIAN ) && defined( LITTLE_ENDIAN ) -# if defined( BYTE_ORDER ) && BYTE_ORDER == BIG_ENDIAN -# define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN -# elif defined( BYTE_ORDER ) && BYTE_ORDER == LITTLE_ENDIAN -# define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN -# endif -#elif defined( BIG_ENDIAN ) -# define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN -#elif defined( LITTLE_ENDIAN ) -# define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN -#endif - -#if defined( _BIG_ENDIAN ) && defined( _LITTLE_ENDIAN ) -# if defined( _BYTE_ORDER ) && _BYTE_ORDER == _BIG_ENDIAN -# define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN -# elif defined( _BYTE_ORDER ) && _BYTE_ORDER == _LITTLE_ENDIAN -# define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN -# endif -#elif defined( _BIG_ENDIAN ) -# define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN -#elif defined( _LITTLE_ENDIAN ) -# define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN -#endif - -#if defined( __BIG_ENDIAN ) && defined( __LITTLE_ENDIAN ) -# if defined( __BYTE_ORDER ) && __BYTE_ORDER == __BIG_ENDIAN -# define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN -# elif defined( __BYTE_ORDER ) && __BYTE_ORDER == __LITTLE_ENDIAN -# define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN -# endif -#elif defined( __BIG_ENDIAN ) -# define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN -#elif defined( __LITTLE_ENDIAN ) -# define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN -#endif - -#if defined( __BIG_ENDIAN__ ) && defined( __LITTLE_ENDIAN__ ) -# if defined( __BYTE_ORDER__ ) && __BYTE_ORDER__ == __BIG_ENDIAN__ -# define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN -# elif defined( __BYTE_ORDER__ ) && __BYTE_ORDER__ == __LITTLE_ENDIAN__ -# define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN -# endif -#elif defined( __BIG_ENDIAN__ ) -# define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN -#elif defined( __LITTLE_ENDIAN__ ) -# define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN -#endif - -/* if the platform byte order could not be determined, then try to */ -/* set this define using common machine defines */ -#if !defined(PLATFORM_BYTE_ORDER) - -#if defined( __alpha__ ) || defined( __alpha ) || defined( i386 ) || \ - defined( __i386__ ) || defined( _M_I86 ) || defined( _M_IX86 ) || \ - defined( __OS2__ ) || defined( sun386 ) || defined( __TURBOC__ ) || \ - defined( vax ) || defined( vms ) || defined( VMS ) || \ - defined( __VMS ) || defined( _M_X64 ) -# define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN - -#elif defined( AMIGA ) || defined( applec ) || defined( __AS400__ ) || \ - defined( _CRAY ) || defined( __hppa ) || defined( __hp9000 ) || \ - defined( ibm370 ) || defined( mc68000 ) || defined( m68k ) || \ - defined( __MRC__ ) || defined( __MVS__ ) || defined( __MWERKS__ ) || \ - defined( sparc ) || defined( __sparc) || defined( SYMANTEC_C ) || \ - defined( __VOS__ ) || defined( __TIGCC__ ) || defined( __TANDEM ) || \ - defined( THINK_C ) || defined( __VMCMS__ ) || defined( _AIX ) -# define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN - -#elif defined(__arm__) -# ifdef __BIG_ENDIAN -# define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN -# else -# define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN -# endif -#elif 1 /* **** EDIT HERE IF NECESSARY **** */ -# define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN -#elif 0 /* **** EDIT HERE IF NECESSARY **** */ -# define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN -#else -# error Please edit lines 132 or 134 in brg_endian.h to set the platform byte order -#endif - -#endif - -#endif diff --git a/src/sig/dilithium/pqcrystals-dilithium_common_avx2/f1600x4.S b/src/sig/dilithium/pqcrystals-dilithium_common_avx2/f1600x4.S deleted file mode 100644 index 497b8cafa..000000000 --- a/src/sig/dilithium/pqcrystals-dilithium_common_avx2/f1600x4.S +++ /dev/null @@ -1,907 +0,0 @@ -/* Taken from Bas Westerbaan's new 4-way SHAKE implementation - * for Sphincs+ (https://github.com/sphincs/sphincsplus/pull/14/), - * but uses vpshufb for byte-granular rotations as in the Keccak Code Package. */ - -#include "fips202x4.h" - -.data -.p2align 5 -rho8: -.byte 7,0,1,2,3,4,5,6,15,8,9,10,11,12,13,14,7,0,1,2,3,4,5,6,15,8,9,10,11,12,13,14 -rho56: -.byte 1,2,3,4,5,6,7,0,9,10,11,12,13,14,15,8,1,2,3,4,5,6,7,0,9,10,11,12,13,14,15,8 - -.text -.global cdecl(f1600x4) -cdecl(f1600x4): -vmovdqa rho8(%rip), %ymm0 -movq $6, %rax -looptop: -vmovdqa 0(%rdi), %ymm8 -vmovdqa 32(%rdi), %ymm9 -vmovdqa 64(%rdi), %ymm10 -vmovdqa 96(%rdi), %ymm11 -vmovdqa 128(%rdi), %ymm12 -vpxor 160(%rdi), %ymm8, %ymm8 -vpxor 192(%rdi), %ymm9, %ymm9 -vpxor 224(%rdi), %ymm10, %ymm10 -vpxor 256(%rdi), %ymm11, %ymm11 -vpxor 288(%rdi), %ymm12, %ymm12 -vpxor 320(%rdi), %ymm8, %ymm8 -vpxor 352(%rdi), %ymm9, %ymm9 -vpxor 384(%rdi), %ymm10, %ymm10 -vpxor 416(%rdi), %ymm11, %ymm11 -vpxor 448(%rdi), %ymm12, %ymm12 -vpxor 480(%rdi), %ymm8, %ymm8 -vpxor 512(%rdi), %ymm9, %ymm9 -vpxor 544(%rdi), %ymm10, %ymm10 -vpxor 576(%rdi), %ymm11, %ymm11 -vpxor 608(%rdi), %ymm12, %ymm12 -vpxor 640(%rdi), %ymm8, %ymm8 -vpxor 672(%rdi), %ymm9, %ymm9 -vpxor 704(%rdi), %ymm10, %ymm10 -vpxor 736(%rdi), %ymm11, %ymm11 -vpxor 768(%rdi), %ymm12, %ymm12 -vpsllq $1, %ymm9, %ymm13 -vpsllq $1, %ymm10, %ymm14 -vpsllq $1, %ymm11, %ymm15 -vpsllq $1, %ymm12, %ymm7 -vpsllq $1, %ymm8, %ymm6 -vpsrlq $63, %ymm9, %ymm5 -vpsrlq $63, %ymm10, %ymm4 -vpsrlq $63, %ymm11, %ymm3 -vpsrlq $63, %ymm12, %ymm2 -vpsrlq $63, %ymm8, %ymm1 -vpor %ymm13, %ymm5, %ymm5 -vpor %ymm14, %ymm4, %ymm4 -vpor %ymm15, %ymm3, %ymm3 -vpor %ymm7, %ymm2, %ymm2 -vpor %ymm6, %ymm1, %ymm1 -vpxor %ymm5, %ymm12, %ymm5 -vpxor %ymm4, %ymm8, %ymm4 -vpxor %ymm3, %ymm9, %ymm3 -vpxor %ymm2, %ymm10, %ymm2 -vpxor %ymm1, %ymm11, %ymm1 -vpxor 0(%rdi), %ymm5, %ymm8 -vpxor 192(%rdi), %ymm4, %ymm9 -vpxor 384(%rdi), %ymm3, %ymm10 -vpxor 576(%rdi), %ymm2, %ymm11 -vpxor 768(%rdi), %ymm1, %ymm12 -vpsllq $44, %ymm9, %ymm14 -vpsllq $43, %ymm10, %ymm15 -vpsllq $21, %ymm11, %ymm7 -vpsllq $14, %ymm12, %ymm6 -vpsrlq $20, %ymm9, %ymm9 -vpsrlq $21, %ymm10, %ymm10 -vpsrlq $43, %ymm11, %ymm11 -vpsrlq $50, %ymm12, %ymm12 -vpor %ymm14, %ymm9, %ymm9 -vpor %ymm15, %ymm10, %ymm10 -vpor %ymm7, %ymm11, %ymm11 -vpor %ymm6, %ymm12, %ymm12 -vpandn %ymm10, %ymm9, %ymm13 -vpandn %ymm11, %ymm10, %ymm14 -vpandn %ymm12, %ymm11, %ymm15 -vpandn %ymm8, %ymm12, %ymm7 -vpandn %ymm9, %ymm8, %ymm6 -vpxor %ymm8, %ymm13, %ymm13 -vpxor %ymm9, %ymm14, %ymm14 -vpxor %ymm10, %ymm15, %ymm15 -vpxor %ymm11, %ymm7, %ymm7 -vpxor %ymm12, %ymm6, %ymm6 -vpbroadcastq 0(%rsi), %ymm8 -vpxor %ymm8, %ymm13, %ymm13 -vmovdqa %ymm13, 0(%rdi) -vmovdqa %ymm14, 192(%rdi) -vmovdqa %ymm15, 384(%rdi) -vmovdqa %ymm7, 576(%rdi) -vmovdqa %ymm6, 768(%rdi) -vpxor 96(%rdi), %ymm2, %ymm8 -vpxor 288(%rdi), %ymm1, %ymm9 -vpxor 320(%rdi), %ymm5, %ymm10 -vpxor 512(%rdi), %ymm4, %ymm11 -vpxor 704(%rdi), %ymm3, %ymm12 -vpsllq $28, %ymm8, %ymm13 -vpsllq $20, %ymm9, %ymm14 -vpsllq $3, %ymm10, %ymm15 -vpsllq $45, %ymm11, %ymm7 -vpsllq $61, %ymm12, %ymm6 -vpsrlq $36, %ymm8, %ymm8 -vpsrlq $44, %ymm9, %ymm9 -vpsrlq $61, %ymm10, %ymm10 -vpsrlq $19, %ymm11, %ymm11 -vpsrlq $3, %ymm12, %ymm12 -vpor %ymm13, %ymm8, %ymm8 -vpor %ymm14, %ymm9, %ymm9 -vpor %ymm15, %ymm10, %ymm10 -vpor %ymm7, %ymm11, %ymm11 -vpor %ymm6, %ymm12, %ymm12 -vpandn %ymm10, %ymm9, %ymm13 -vpandn %ymm11, %ymm10, %ymm14 -vpandn %ymm12, %ymm11, %ymm15 -vpandn %ymm8, %ymm12, %ymm7 -vpandn %ymm9, %ymm8, %ymm6 -vpxor %ymm8, %ymm13, %ymm13 -vpxor %ymm9, %ymm14, %ymm14 -vpxor %ymm10, %ymm15, %ymm15 -vpxor %ymm11, %ymm7, %ymm7 -vpxor %ymm12, %ymm6, %ymm6 -vmovdqa %ymm13, 320(%rdi) -vmovdqa %ymm14, 512(%rdi) -vmovdqa %ymm15, 704(%rdi) -vmovdqa %ymm7, 96(%rdi) -vmovdqa %ymm6, 288(%rdi) -vpxor 32(%rdi), %ymm4, %ymm8 -vpxor 224(%rdi), %ymm3, %ymm9 -vpxor 416(%rdi), %ymm2, %ymm10 -vpxor 608(%rdi), %ymm1, %ymm11 -vpxor 640(%rdi), %ymm5, %ymm12 -vpsllq $1, %ymm8, %ymm13 -vpsllq $6, %ymm9, %ymm14 -vpsllq $25, %ymm10, %ymm15 -#vpsllq $8, %ymm11, %ymm7 -vpsllq $18, %ymm12, %ymm6 -vpsrlq $63, %ymm8, %ymm8 -vpsrlq $58, %ymm9, %ymm9 -vpsrlq $39, %ymm10, %ymm10 -#vpsrlq $56, %ymm11, %ymm11 -vpsrlq $46, %ymm12, %ymm12 -vpor %ymm13, %ymm8, %ymm8 -vpor %ymm14, %ymm9, %ymm9 -vpor %ymm15, %ymm10, %ymm10 -#vpor %ymm7, %ymm11, %ymm11 -vpshufb %ymm0, %ymm11, %ymm11 -vpor %ymm6, %ymm12, %ymm12 -vpandn %ymm10, %ymm9, %ymm13 -vpandn %ymm11, %ymm10, %ymm14 -vpandn %ymm12, %ymm11, %ymm15 -vpandn %ymm8, %ymm12, %ymm7 -vpandn %ymm9, %ymm8, %ymm6 -vpxor %ymm8, %ymm13, %ymm13 -vpxor %ymm9, %ymm14, %ymm14 -vpxor %ymm10, %ymm15, %ymm15 -vpxor %ymm11, %ymm7, %ymm7 -vpxor %ymm12, %ymm6, %ymm6 -vmovdqa %ymm13, 640(%rdi) -vmovdqa %ymm14, 32(%rdi) -vmovdqa %ymm15, 224(%rdi) -vmovdqa %ymm7, 416(%rdi) -vmovdqa %ymm6, 608(%rdi) -vpxor 128(%rdi), %ymm1, %ymm8 -vpxor 160(%rdi), %ymm5, %ymm9 -vpxor 352(%rdi), %ymm4, %ymm10 -vpxor 544(%rdi), %ymm3, %ymm11 -vpxor 736(%rdi), %ymm2, %ymm12 -vpsllq $27, %ymm8, %ymm13 -vpsllq $36, %ymm9, %ymm14 -vpsllq $10, %ymm10, %ymm15 -vpsllq $15, %ymm11, %ymm7 -#vpsllq $56, %ymm12, %ymm6 -vpsrlq $37, %ymm8, %ymm8 -vpsrlq $28, %ymm9, %ymm9 -vpsrlq $54, %ymm10, %ymm10 -vpsrlq $49, %ymm11, %ymm11 -#vpsrlq $8, %ymm12, %ymm12 -vpor %ymm13, %ymm8, %ymm8 -vpor %ymm14, %ymm9, %ymm9 -vpor %ymm15, %ymm10, %ymm10 -vpor %ymm7, %ymm11, %ymm11 -#vpor %ymm6, %ymm12, %ymm12 -vpshufb rho56(%rip), %ymm12, %ymm12 -vpandn %ymm10, %ymm9, %ymm13 -vpandn %ymm11, %ymm10, %ymm14 -vpandn %ymm12, %ymm11, %ymm15 -vpandn %ymm8, %ymm12, %ymm7 -vpandn %ymm9, %ymm8, %ymm6 -vpxor %ymm8, %ymm13, %ymm13 -vpxor %ymm9, %ymm14, %ymm14 -vpxor %ymm10, %ymm15, %ymm15 -vpxor %ymm11, %ymm7, %ymm7 -vpxor %ymm12, %ymm6, %ymm6 -vmovdqa %ymm13, 160(%rdi) -vmovdqa %ymm14, 352(%rdi) -vmovdqa %ymm15, 544(%rdi) -vmovdqa %ymm7, 736(%rdi) -vmovdqa %ymm6, 128(%rdi) -vpxor 64(%rdi), %ymm3, %ymm8 -vpxor 256(%rdi), %ymm2, %ymm9 -vpxor 448(%rdi), %ymm1, %ymm10 -vpxor 480(%rdi), %ymm5, %ymm11 -vpxor 672(%rdi), %ymm4, %ymm12 -vpsllq $62, %ymm8, %ymm13 -vpsllq $55, %ymm9, %ymm14 -vpsllq $39, %ymm10, %ymm15 -vpsllq $41, %ymm11, %ymm7 -vpsllq $2, %ymm12, %ymm6 -vpsrlq $2, %ymm8, %ymm8 -vpsrlq $9, %ymm9, %ymm9 -vpsrlq $25, %ymm10, %ymm10 -vpsrlq $23, %ymm11, %ymm11 -vpsrlq $62, %ymm12, %ymm12 -vpor %ymm13, %ymm8, %ymm8 -vpor %ymm14, %ymm9, %ymm9 -vpor %ymm15, %ymm10, %ymm10 -vpor %ymm7, %ymm11, %ymm11 -vpor %ymm6, %ymm12, %ymm12 -vpandn %ymm10, %ymm9, %ymm13 -vpandn %ymm11, %ymm10, %ymm14 -vpandn %ymm12, %ymm11, %ymm15 -vpandn %ymm8, %ymm12, %ymm7 -vpandn %ymm9, %ymm8, %ymm6 -vpxor %ymm8, %ymm13, %ymm13 -vpxor %ymm9, %ymm14, %ymm14 -vpxor %ymm10, %ymm15, %ymm15 -vpxor %ymm11, %ymm7, %ymm7 -vpxor %ymm12, %ymm6, %ymm6 -vmovdqa %ymm13, 480(%rdi) -vmovdqa %ymm14, 672(%rdi) -vmovdqa %ymm15, 64(%rdi) -vmovdqa %ymm7, 256(%rdi) -vmovdqa %ymm6, 448(%rdi) -vmovdqa 0(%rdi), %ymm8 -vmovdqa 32(%rdi), %ymm9 -vmovdqa 64(%rdi), %ymm10 -vmovdqa 96(%rdi), %ymm11 -vmovdqa 128(%rdi), %ymm12 -vpxor 160(%rdi), %ymm8, %ymm8 -vpxor 192(%rdi), %ymm9, %ymm9 -vpxor 224(%rdi), %ymm10, %ymm10 -vpxor 256(%rdi), %ymm11, %ymm11 -vpxor 288(%rdi), %ymm12, %ymm12 -vpxor 320(%rdi), %ymm8, %ymm8 -vpxor 352(%rdi), %ymm9, %ymm9 -vpxor 384(%rdi), %ymm10, %ymm10 -vpxor 416(%rdi), %ymm11, %ymm11 -vpxor 448(%rdi), %ymm12, %ymm12 -vpxor 480(%rdi), %ymm8, %ymm8 -vpxor 512(%rdi), %ymm9, %ymm9 -vpxor 544(%rdi), %ymm10, %ymm10 -vpxor 576(%rdi), %ymm11, %ymm11 -vpxor 608(%rdi), %ymm12, %ymm12 -vpxor 640(%rdi), %ymm8, %ymm8 -vpxor 672(%rdi), %ymm9, %ymm9 -vpxor 704(%rdi), %ymm10, %ymm10 -vpxor 736(%rdi), %ymm11, %ymm11 -vpxor 768(%rdi), %ymm12, %ymm12 -vpsllq $1, %ymm9, %ymm13 -vpsllq $1, %ymm10, %ymm14 -vpsllq $1, %ymm11, %ymm15 -vpsllq $1, %ymm12, %ymm7 -vpsllq $1, %ymm8, %ymm6 -vpsrlq $63, %ymm9, %ymm5 -vpsrlq $63, %ymm10, %ymm4 -vpsrlq $63, %ymm11, %ymm3 -vpsrlq $63, %ymm12, %ymm2 -vpsrlq $63, %ymm8, %ymm1 -vpor %ymm13, %ymm5, %ymm5 -vpor %ymm14, %ymm4, %ymm4 -vpor %ymm15, %ymm3, %ymm3 -vpor %ymm7, %ymm2, %ymm2 -vpor %ymm6, %ymm1, %ymm1 -vpxor %ymm5, %ymm12, %ymm5 -vpxor %ymm4, %ymm8, %ymm4 -vpxor %ymm3, %ymm9, %ymm3 -vpxor %ymm2, %ymm10, %ymm2 -vpxor %ymm1, %ymm11, %ymm1 -vpxor 0(%rdi), %ymm5, %ymm8 -vpxor 512(%rdi), %ymm4, %ymm9 -vpxor 224(%rdi), %ymm3, %ymm10 -vpxor 736(%rdi), %ymm2, %ymm11 -vpxor 448(%rdi), %ymm1, %ymm12 -vpsllq $44, %ymm9, %ymm14 -vpsllq $43, %ymm10, %ymm15 -vpsllq $21, %ymm11, %ymm7 -vpsllq $14, %ymm12, %ymm6 -vpsrlq $20, %ymm9, %ymm9 -vpsrlq $21, %ymm10, %ymm10 -vpsrlq $43, %ymm11, %ymm11 -vpsrlq $50, %ymm12, %ymm12 -vpor %ymm14, %ymm9, %ymm9 -vpor %ymm15, %ymm10, %ymm10 -vpor %ymm7, %ymm11, %ymm11 -vpor %ymm6, %ymm12, %ymm12 -vpandn %ymm10, %ymm9, %ymm13 -vpandn %ymm11, %ymm10, %ymm14 -vpandn %ymm12, %ymm11, %ymm15 -vpandn %ymm8, %ymm12, %ymm7 -vpandn %ymm9, %ymm8, %ymm6 -vpxor %ymm8, %ymm13, %ymm13 -vpxor %ymm9, %ymm14, %ymm14 -vpxor %ymm10, %ymm15, %ymm15 -vpxor %ymm11, %ymm7, %ymm7 -vpxor %ymm12, %ymm6, %ymm6 -vpbroadcastq 8(%rsi), %ymm8 -vpxor %ymm8, %ymm13, %ymm13 -vmovdqa %ymm13, 0(%rdi) -vmovdqa %ymm14, 512(%rdi) -vmovdqa %ymm15, 224(%rdi) -vmovdqa %ymm7, 736(%rdi) -vmovdqa %ymm6, 448(%rdi) -vpxor 576(%rdi), %ymm2, %ymm8 -vpxor 288(%rdi), %ymm1, %ymm9 -vpxor 640(%rdi), %ymm5, %ymm10 -vpxor 352(%rdi), %ymm4, %ymm11 -vpxor 64(%rdi), %ymm3, %ymm12 -vpsllq $28, %ymm8, %ymm13 -vpsllq $20, %ymm9, %ymm14 -vpsllq $3, %ymm10, %ymm15 -vpsllq $45, %ymm11, %ymm7 -vpsllq $61, %ymm12, %ymm6 -vpsrlq $36, %ymm8, %ymm8 -vpsrlq $44, %ymm9, %ymm9 -vpsrlq $61, %ymm10, %ymm10 -vpsrlq $19, %ymm11, %ymm11 -vpsrlq $3, %ymm12, %ymm12 -vpor %ymm13, %ymm8, %ymm8 -vpor %ymm14, %ymm9, %ymm9 -vpor %ymm15, %ymm10, %ymm10 -vpor %ymm7, %ymm11, %ymm11 -vpor %ymm6, %ymm12, %ymm12 -vpandn %ymm10, %ymm9, %ymm13 -vpandn %ymm11, %ymm10, %ymm14 -vpandn %ymm12, %ymm11, %ymm15 -vpandn %ymm8, %ymm12, %ymm7 -vpandn %ymm9, %ymm8, %ymm6 -vpxor %ymm8, %ymm13, %ymm13 -vpxor %ymm9, %ymm14, %ymm14 -vpxor %ymm10, %ymm15, %ymm15 -vpxor %ymm11, %ymm7, %ymm7 -vpxor %ymm12, %ymm6, %ymm6 -vmovdqa %ymm13, 640(%rdi) -vmovdqa %ymm14, 352(%rdi) -vmovdqa %ymm15, 64(%rdi) -vmovdqa %ymm7, 576(%rdi) -vmovdqa %ymm6, 288(%rdi) -vpxor 192(%rdi), %ymm4, %ymm8 -vpxor 704(%rdi), %ymm3, %ymm9 -vpxor 416(%rdi), %ymm2, %ymm10 -vpxor 128(%rdi), %ymm1, %ymm11 -vpxor 480(%rdi), %ymm5, %ymm12 -vpsllq $1, %ymm8, %ymm13 -vpsllq $6, %ymm9, %ymm14 -vpsllq $25, %ymm10, %ymm15 -#vpsllq $8, %ymm11, %ymm7 -vpsllq $18, %ymm12, %ymm6 -vpsrlq $63, %ymm8, %ymm8 -vpsrlq $58, %ymm9, %ymm9 -vpsrlq $39, %ymm10, %ymm10 -#vpsrlq $56, %ymm11, %ymm11 -vpsrlq $46, %ymm12, %ymm12 -vpor %ymm13, %ymm8, %ymm8 -vpor %ymm14, %ymm9, %ymm9 -vpor %ymm15, %ymm10, %ymm10 -#vpor %ymm7, %ymm11, %ymm11 -vpshufb %ymm0, %ymm11, %ymm11 -vpor %ymm6, %ymm12, %ymm12 -vpandn %ymm10, %ymm9, %ymm13 -vpandn %ymm11, %ymm10, %ymm14 -vpandn %ymm12, %ymm11, %ymm15 -vpandn %ymm8, %ymm12, %ymm7 -vpandn %ymm9, %ymm8, %ymm6 -vpxor %ymm8, %ymm13, %ymm13 -vpxor %ymm9, %ymm14, %ymm14 -vpxor %ymm10, %ymm15, %ymm15 -vpxor %ymm11, %ymm7, %ymm7 -vpxor %ymm12, %ymm6, %ymm6 -vmovdqa %ymm13, 480(%rdi) -vmovdqa %ymm14, 192(%rdi) -vmovdqa %ymm15, 704(%rdi) -vmovdqa %ymm7, 416(%rdi) -vmovdqa %ymm6, 128(%rdi) -vpxor 768(%rdi), %ymm1, %ymm8 -vpxor 320(%rdi), %ymm5, %ymm9 -vpxor 32(%rdi), %ymm4, %ymm10 -vpxor 544(%rdi), %ymm3, %ymm11 -vpxor 256(%rdi), %ymm2, %ymm12 -vpsllq $27, %ymm8, %ymm13 -vpsllq $36, %ymm9, %ymm14 -vpsllq $10, %ymm10, %ymm15 -vpsllq $15, %ymm11, %ymm7 -#vpsllq $56, %ymm12, %ymm6 -vpsrlq $37, %ymm8, %ymm8 -vpsrlq $28, %ymm9, %ymm9 -vpsrlq $54, %ymm10, %ymm10 -vpsrlq $49, %ymm11, %ymm11 -#vpsrlq $8, %ymm12, %ymm12 -vpor %ymm13, %ymm8, %ymm8 -vpor %ymm14, %ymm9, %ymm9 -vpor %ymm15, %ymm10, %ymm10 -vpor %ymm7, %ymm11, %ymm11 -#vpor %ymm6, %ymm12, %ymm12 -vpshufb rho56(%rip), %ymm12, %ymm12 -vpandn %ymm10, %ymm9, %ymm13 -vpandn %ymm11, %ymm10, %ymm14 -vpandn %ymm12, %ymm11, %ymm15 -vpandn %ymm8, %ymm12, %ymm7 -vpandn %ymm9, %ymm8, %ymm6 -vpxor %ymm8, %ymm13, %ymm13 -vpxor %ymm9, %ymm14, %ymm14 -vpxor %ymm10, %ymm15, %ymm15 -vpxor %ymm11, %ymm7, %ymm7 -vpxor %ymm12, %ymm6, %ymm6 -vmovdqa %ymm13, 320(%rdi) -vmovdqa %ymm14, 32(%rdi) -vmovdqa %ymm15, 544(%rdi) -vmovdqa %ymm7, 256(%rdi) -vmovdqa %ymm6, 768(%rdi) -vpxor 384(%rdi), %ymm3, %ymm8 -vpxor 96(%rdi), %ymm2, %ymm9 -vpxor 608(%rdi), %ymm1, %ymm10 -vpxor 160(%rdi), %ymm5, %ymm11 -vpxor 672(%rdi), %ymm4, %ymm12 -vpsllq $62, %ymm8, %ymm13 -vpsllq $55, %ymm9, %ymm14 -vpsllq $39, %ymm10, %ymm15 -vpsllq $41, %ymm11, %ymm7 -vpsllq $2, %ymm12, %ymm6 -vpsrlq $2, %ymm8, %ymm8 -vpsrlq $9, %ymm9, %ymm9 -vpsrlq $25, %ymm10, %ymm10 -vpsrlq $23, %ymm11, %ymm11 -vpsrlq $62, %ymm12, %ymm12 -vpor %ymm13, %ymm8, %ymm8 -vpor %ymm14, %ymm9, %ymm9 -vpor %ymm15, %ymm10, %ymm10 -vpor %ymm7, %ymm11, %ymm11 -vpor %ymm6, %ymm12, %ymm12 -vpandn %ymm10, %ymm9, %ymm13 -vpandn %ymm11, %ymm10, %ymm14 -vpandn %ymm12, %ymm11, %ymm15 -vpandn %ymm8, %ymm12, %ymm7 -vpandn %ymm9, %ymm8, %ymm6 -vpxor %ymm8, %ymm13, %ymm13 -vpxor %ymm9, %ymm14, %ymm14 -vpxor %ymm10, %ymm15, %ymm15 -vpxor %ymm11, %ymm7, %ymm7 -vpxor %ymm12, %ymm6, %ymm6 -vmovdqa %ymm13, 160(%rdi) -vmovdqa %ymm14, 672(%rdi) -vmovdqa %ymm15, 384(%rdi) -vmovdqa %ymm7, 96(%rdi) -vmovdqa %ymm6, 608(%rdi) -vmovdqa 0(%rdi), %ymm8 -vmovdqa 32(%rdi), %ymm9 -vmovdqa 64(%rdi), %ymm10 -vmovdqa 96(%rdi), %ymm11 -vmovdqa 128(%rdi), %ymm12 -vpxor 160(%rdi), %ymm8, %ymm8 -vpxor 192(%rdi), %ymm9, %ymm9 -vpxor 224(%rdi), %ymm10, %ymm10 -vpxor 256(%rdi), %ymm11, %ymm11 -vpxor 288(%rdi), %ymm12, %ymm12 -vpxor 320(%rdi), %ymm8, %ymm8 -vpxor 352(%rdi), %ymm9, %ymm9 -vpxor 384(%rdi), %ymm10, %ymm10 -vpxor 416(%rdi), %ymm11, %ymm11 -vpxor 448(%rdi), %ymm12, %ymm12 -vpxor 480(%rdi), %ymm8, %ymm8 -vpxor 512(%rdi), %ymm9, %ymm9 -vpxor 544(%rdi), %ymm10, %ymm10 -vpxor 576(%rdi), %ymm11, %ymm11 -vpxor 608(%rdi), %ymm12, %ymm12 -vpxor 640(%rdi), %ymm8, %ymm8 -vpxor 672(%rdi), %ymm9, %ymm9 -vpxor 704(%rdi), %ymm10, %ymm10 -vpxor 736(%rdi), %ymm11, %ymm11 -vpxor 768(%rdi), %ymm12, %ymm12 -vpsllq $1, %ymm9, %ymm13 -vpsllq $1, %ymm10, %ymm14 -vpsllq $1, %ymm11, %ymm15 -vpsllq $1, %ymm12, %ymm7 -vpsllq $1, %ymm8, %ymm6 -vpsrlq $63, %ymm9, %ymm5 -vpsrlq $63, %ymm10, %ymm4 -vpsrlq $63, %ymm11, %ymm3 -vpsrlq $63, %ymm12, %ymm2 -vpsrlq $63, %ymm8, %ymm1 -vpor %ymm13, %ymm5, %ymm5 -vpor %ymm14, %ymm4, %ymm4 -vpor %ymm15, %ymm3, %ymm3 -vpor %ymm7, %ymm2, %ymm2 -vpor %ymm6, %ymm1, %ymm1 -vpxor %ymm5, %ymm12, %ymm5 -vpxor %ymm4, %ymm8, %ymm4 -vpxor %ymm3, %ymm9, %ymm3 -vpxor %ymm2, %ymm10, %ymm2 -vpxor %ymm1, %ymm11, %ymm1 -vpxor 0(%rdi), %ymm5, %ymm8 -vpxor 352(%rdi), %ymm4, %ymm9 -vpxor 704(%rdi), %ymm3, %ymm10 -vpxor 256(%rdi), %ymm2, %ymm11 -vpxor 608(%rdi), %ymm1, %ymm12 -vpsllq $44, %ymm9, %ymm14 -vpsllq $43, %ymm10, %ymm15 -vpsllq $21, %ymm11, %ymm7 -vpsllq $14, %ymm12, %ymm6 -vpsrlq $20, %ymm9, %ymm9 -vpsrlq $21, %ymm10, %ymm10 -vpsrlq $43, %ymm11, %ymm11 -vpsrlq $50, %ymm12, %ymm12 -vpor %ymm14, %ymm9, %ymm9 -vpor %ymm15, %ymm10, %ymm10 -vpor %ymm7, %ymm11, %ymm11 -vpor %ymm6, %ymm12, %ymm12 -vpandn %ymm10, %ymm9, %ymm13 -vpandn %ymm11, %ymm10, %ymm14 -vpandn %ymm12, %ymm11, %ymm15 -vpandn %ymm8, %ymm12, %ymm7 -vpandn %ymm9, %ymm8, %ymm6 -vpxor %ymm8, %ymm13, %ymm13 -vpxor %ymm9, %ymm14, %ymm14 -vpxor %ymm10, %ymm15, %ymm15 -vpxor %ymm11, %ymm7, %ymm7 -vpxor %ymm12, %ymm6, %ymm6 -vpbroadcastq 16(%rsi), %ymm8 -vpxor %ymm8, %ymm13, %ymm13 -vmovdqa %ymm13, 0(%rdi) -vmovdqa %ymm14, 352(%rdi) -vmovdqa %ymm15, 704(%rdi) -vmovdqa %ymm7, 256(%rdi) -vmovdqa %ymm6, 608(%rdi) -vpxor 736(%rdi), %ymm2, %ymm8 -vpxor 288(%rdi), %ymm1, %ymm9 -vpxor 480(%rdi), %ymm5, %ymm10 -vpxor 32(%rdi), %ymm4, %ymm11 -vpxor 384(%rdi), %ymm3, %ymm12 -vpsllq $28, %ymm8, %ymm13 -vpsllq $20, %ymm9, %ymm14 -vpsllq $3, %ymm10, %ymm15 -vpsllq $45, %ymm11, %ymm7 -vpsllq $61, %ymm12, %ymm6 -vpsrlq $36, %ymm8, %ymm8 -vpsrlq $44, %ymm9, %ymm9 -vpsrlq $61, %ymm10, %ymm10 -vpsrlq $19, %ymm11, %ymm11 -vpsrlq $3, %ymm12, %ymm12 -vpor %ymm13, %ymm8, %ymm8 -vpor %ymm14, %ymm9, %ymm9 -vpor %ymm15, %ymm10, %ymm10 -vpor %ymm7, %ymm11, %ymm11 -vpor %ymm6, %ymm12, %ymm12 -vpandn %ymm10, %ymm9, %ymm13 -vpandn %ymm11, %ymm10, %ymm14 -vpandn %ymm12, %ymm11, %ymm15 -vpandn %ymm8, %ymm12, %ymm7 -vpandn %ymm9, %ymm8, %ymm6 -vpxor %ymm8, %ymm13, %ymm13 -vpxor %ymm9, %ymm14, %ymm14 -vpxor %ymm10, %ymm15, %ymm15 -vpxor %ymm11, %ymm7, %ymm7 -vpxor %ymm12, %ymm6, %ymm6 -vmovdqa %ymm13, 480(%rdi) -vmovdqa %ymm14, 32(%rdi) -vmovdqa %ymm15, 384(%rdi) -vmovdqa %ymm7, 736(%rdi) -vmovdqa %ymm6, 288(%rdi) -vpxor 512(%rdi), %ymm4, %ymm8 -vpxor 64(%rdi), %ymm3, %ymm9 -vpxor 416(%rdi), %ymm2, %ymm10 -vpxor 768(%rdi), %ymm1, %ymm11 -vpxor 160(%rdi), %ymm5, %ymm12 -vpsllq $1, %ymm8, %ymm13 -vpsllq $6, %ymm9, %ymm14 -vpsllq $25, %ymm10, %ymm15 -#vpsllq $8, %ymm11, %ymm7 -vpsllq $18, %ymm12, %ymm6 -vpsrlq $63, %ymm8, %ymm8 -vpsrlq $58, %ymm9, %ymm9 -vpsrlq $39, %ymm10, %ymm10 -#vpsrlq $56, %ymm11, %ymm11 -vpsrlq $46, %ymm12, %ymm12 -vpor %ymm13, %ymm8, %ymm8 -vpor %ymm14, %ymm9, %ymm9 -vpor %ymm15, %ymm10, %ymm10 -#vpor %ymm7, %ymm11, %ymm11 -vpshufb %ymm0, %ymm11, %ymm11 -vpor %ymm6, %ymm12, %ymm12 -vpandn %ymm10, %ymm9, %ymm13 -vpandn %ymm11, %ymm10, %ymm14 -vpandn %ymm12, %ymm11, %ymm15 -vpandn %ymm8, %ymm12, %ymm7 -vpandn %ymm9, %ymm8, %ymm6 -vpxor %ymm8, %ymm13, %ymm13 -vpxor %ymm9, %ymm14, %ymm14 -vpxor %ymm10, %ymm15, %ymm15 -vpxor %ymm11, %ymm7, %ymm7 -vpxor %ymm12, %ymm6, %ymm6 -vmovdqa %ymm13, 160(%rdi) -vmovdqa %ymm14, 512(%rdi) -vmovdqa %ymm15, 64(%rdi) -vmovdqa %ymm7, 416(%rdi) -vmovdqa %ymm6, 768(%rdi) -vpxor 448(%rdi), %ymm1, %ymm8 -vpxor 640(%rdi), %ymm5, %ymm9 -vpxor 192(%rdi), %ymm4, %ymm10 -vpxor 544(%rdi), %ymm3, %ymm11 -vpxor 96(%rdi), %ymm2, %ymm12 -vpsllq $27, %ymm8, %ymm13 -vpsllq $36, %ymm9, %ymm14 -vpsllq $10, %ymm10, %ymm15 -vpsllq $15, %ymm11, %ymm7 -#vpsllq $56, %ymm12, %ymm6 -vpsrlq $37, %ymm8, %ymm8 -vpsrlq $28, %ymm9, %ymm9 -vpsrlq $54, %ymm10, %ymm10 -vpsrlq $49, %ymm11, %ymm11 -#vpsrlq $8, %ymm12, %ymm12 -vpor %ymm13, %ymm8, %ymm8 -vpor %ymm14, %ymm9, %ymm9 -vpor %ymm15, %ymm10, %ymm10 -vpor %ymm7, %ymm11, %ymm11 -#vpor %ymm6, %ymm12, %ymm12 -vpshufb rho56(%rip), %ymm12, %ymm12 -vpandn %ymm10, %ymm9, %ymm13 -vpandn %ymm11, %ymm10, %ymm14 -vpandn %ymm12, %ymm11, %ymm15 -vpandn %ymm8, %ymm12, %ymm7 -vpandn %ymm9, %ymm8, %ymm6 -vpxor %ymm8, %ymm13, %ymm13 -vpxor %ymm9, %ymm14, %ymm14 -vpxor %ymm10, %ymm15, %ymm15 -vpxor %ymm11, %ymm7, %ymm7 -vpxor %ymm12, %ymm6, %ymm6 -vmovdqa %ymm13, 640(%rdi) -vmovdqa %ymm14, 192(%rdi) -vmovdqa %ymm15, 544(%rdi) -vmovdqa %ymm7, 96(%rdi) -vmovdqa %ymm6, 448(%rdi) -vpxor 224(%rdi), %ymm3, %ymm8 -vpxor 576(%rdi), %ymm2, %ymm9 -vpxor 128(%rdi), %ymm1, %ymm10 -vpxor 320(%rdi), %ymm5, %ymm11 -vpxor 672(%rdi), %ymm4, %ymm12 -vpsllq $62, %ymm8, %ymm13 -vpsllq $55, %ymm9, %ymm14 -vpsllq $39, %ymm10, %ymm15 -vpsllq $41, %ymm11, %ymm7 -vpsllq $2, %ymm12, %ymm6 -vpsrlq $2, %ymm8, %ymm8 -vpsrlq $9, %ymm9, %ymm9 -vpsrlq $25, %ymm10, %ymm10 -vpsrlq $23, %ymm11, %ymm11 -vpsrlq $62, %ymm12, %ymm12 -vpor %ymm13, %ymm8, %ymm8 -vpor %ymm14, %ymm9, %ymm9 -vpor %ymm15, %ymm10, %ymm10 -vpor %ymm7, %ymm11, %ymm11 -vpor %ymm6, %ymm12, %ymm12 -vpandn %ymm10, %ymm9, %ymm13 -vpandn %ymm11, %ymm10, %ymm14 -vpandn %ymm12, %ymm11, %ymm15 -vpandn %ymm8, %ymm12, %ymm7 -vpandn %ymm9, %ymm8, %ymm6 -vpxor %ymm8, %ymm13, %ymm13 -vpxor %ymm9, %ymm14, %ymm14 -vpxor %ymm10, %ymm15, %ymm15 -vpxor %ymm11, %ymm7, %ymm7 -vpxor %ymm12, %ymm6, %ymm6 -vmovdqa %ymm13, 320(%rdi) -vmovdqa %ymm14, 672(%rdi) -vmovdqa %ymm15, 224(%rdi) -vmovdqa %ymm7, 576(%rdi) -vmovdqa %ymm6, 128(%rdi) -vmovdqa 0(%rdi), %ymm8 -vmovdqa 32(%rdi), %ymm9 -vmovdqa 64(%rdi), %ymm10 -vmovdqa 96(%rdi), %ymm11 -vmovdqa 128(%rdi), %ymm12 -vpxor 160(%rdi), %ymm8, %ymm8 -vpxor 192(%rdi), %ymm9, %ymm9 -vpxor 224(%rdi), %ymm10, %ymm10 -vpxor 256(%rdi), %ymm11, %ymm11 -vpxor 288(%rdi), %ymm12, %ymm12 -vpxor 320(%rdi), %ymm8, %ymm8 -vpxor 352(%rdi), %ymm9, %ymm9 -vpxor 384(%rdi), %ymm10, %ymm10 -vpxor 416(%rdi), %ymm11, %ymm11 -vpxor 448(%rdi), %ymm12, %ymm12 -vpxor 480(%rdi), %ymm8, %ymm8 -vpxor 512(%rdi), %ymm9, %ymm9 -vpxor 544(%rdi), %ymm10, %ymm10 -vpxor 576(%rdi), %ymm11, %ymm11 -vpxor 608(%rdi), %ymm12, %ymm12 -vpxor 640(%rdi), %ymm8, %ymm8 -vpxor 672(%rdi), %ymm9, %ymm9 -vpxor 704(%rdi), %ymm10, %ymm10 -vpxor 736(%rdi), %ymm11, %ymm11 -vpxor 768(%rdi), %ymm12, %ymm12 -vpsllq $1, %ymm9, %ymm13 -vpsllq $1, %ymm10, %ymm14 -vpsllq $1, %ymm11, %ymm15 -vpsllq $1, %ymm12, %ymm7 -vpsllq $1, %ymm8, %ymm6 -vpsrlq $63, %ymm9, %ymm5 -vpsrlq $63, %ymm10, %ymm4 -vpsrlq $63, %ymm11, %ymm3 -vpsrlq $63, %ymm12, %ymm2 -vpsrlq $63, %ymm8, %ymm1 -vpor %ymm13, %ymm5, %ymm5 -vpor %ymm14, %ymm4, %ymm4 -vpor %ymm15, %ymm3, %ymm3 -vpor %ymm7, %ymm2, %ymm2 -vpor %ymm6, %ymm1, %ymm1 -vpxor %ymm5, %ymm12, %ymm5 -vpxor %ymm4, %ymm8, %ymm4 -vpxor %ymm3, %ymm9, %ymm3 -vpxor %ymm2, %ymm10, %ymm2 -vpxor %ymm1, %ymm11, %ymm1 -vpxor 0(%rdi), %ymm5, %ymm8 -vpxor 32(%rdi), %ymm4, %ymm9 -vpxor 64(%rdi), %ymm3, %ymm10 -vpxor 96(%rdi), %ymm2, %ymm11 -vpxor 128(%rdi), %ymm1, %ymm12 -vpsllq $44, %ymm9, %ymm14 -vpsllq $43, %ymm10, %ymm15 -vpsllq $21, %ymm11, %ymm7 -vpsllq $14, %ymm12, %ymm6 -vpsrlq $20, %ymm9, %ymm9 -vpsrlq $21, %ymm10, %ymm10 -vpsrlq $43, %ymm11, %ymm11 -vpsrlq $50, %ymm12, %ymm12 -vpor %ymm14, %ymm9, %ymm9 -vpor %ymm15, %ymm10, %ymm10 -vpor %ymm7, %ymm11, %ymm11 -vpor %ymm6, %ymm12, %ymm12 -vpandn %ymm10, %ymm9, %ymm13 -vpandn %ymm11, %ymm10, %ymm14 -vpandn %ymm12, %ymm11, %ymm15 -vpandn %ymm8, %ymm12, %ymm7 -vpandn %ymm9, %ymm8, %ymm6 -vpxor %ymm8, %ymm13, %ymm13 -vpxor %ymm9, %ymm14, %ymm14 -vpxor %ymm10, %ymm15, %ymm15 -vpxor %ymm11, %ymm7, %ymm7 -vpxor %ymm12, %ymm6, %ymm6 -vpbroadcastq 24(%rsi), %ymm8 -vpxor %ymm8, %ymm13, %ymm13 -vmovdqa %ymm13, 0(%rdi) -vmovdqa %ymm14, 32(%rdi) -vmovdqa %ymm15, 64(%rdi) -vmovdqa %ymm7, 96(%rdi) -vmovdqa %ymm6, 128(%rdi) -vpxor 256(%rdi), %ymm2, %ymm8 -vpxor 288(%rdi), %ymm1, %ymm9 -vpxor 160(%rdi), %ymm5, %ymm10 -vpxor 192(%rdi), %ymm4, %ymm11 -vpxor 224(%rdi), %ymm3, %ymm12 -vpsllq $28, %ymm8, %ymm13 -vpsllq $20, %ymm9, %ymm14 -vpsllq $3, %ymm10, %ymm15 -vpsllq $45, %ymm11, %ymm7 -vpsllq $61, %ymm12, %ymm6 -vpsrlq $36, %ymm8, %ymm8 -vpsrlq $44, %ymm9, %ymm9 -vpsrlq $61, %ymm10, %ymm10 -vpsrlq $19, %ymm11, %ymm11 -vpsrlq $3, %ymm12, %ymm12 -vpor %ymm13, %ymm8, %ymm8 -vpor %ymm14, %ymm9, %ymm9 -vpor %ymm15, %ymm10, %ymm10 -vpor %ymm7, %ymm11, %ymm11 -vpor %ymm6, %ymm12, %ymm12 -vpandn %ymm10, %ymm9, %ymm13 -vpandn %ymm11, %ymm10, %ymm14 -vpandn %ymm12, %ymm11, %ymm15 -vpandn %ymm8, %ymm12, %ymm7 -vpandn %ymm9, %ymm8, %ymm6 -vpxor %ymm8, %ymm13, %ymm13 -vpxor %ymm9, %ymm14, %ymm14 -vpxor %ymm10, %ymm15, %ymm15 -vpxor %ymm11, %ymm7, %ymm7 -vpxor %ymm12, %ymm6, %ymm6 -vmovdqa %ymm13, 160(%rdi) -vmovdqa %ymm14, 192(%rdi) -vmovdqa %ymm15, 224(%rdi) -vmovdqa %ymm7, 256(%rdi) -vmovdqa %ymm6, 288(%rdi) -vpxor 352(%rdi), %ymm4, %ymm8 -vpxor 384(%rdi), %ymm3, %ymm9 -vpxor 416(%rdi), %ymm2, %ymm10 -vpxor 448(%rdi), %ymm1, %ymm11 -vpxor 320(%rdi), %ymm5, %ymm12 -vpsllq $1, %ymm8, %ymm13 -vpsllq $6, %ymm9, %ymm14 -vpsllq $25, %ymm10, %ymm15 -#vpsllq $8, %ymm11, %ymm7 -vpsllq $18, %ymm12, %ymm6 -vpsrlq $63, %ymm8, %ymm8 -vpsrlq $58, %ymm9, %ymm9 -vpsrlq $39, %ymm10, %ymm10 -#vpsrlq $56, %ymm11, %ymm11 -vpsrlq $46, %ymm12, %ymm12 -vpor %ymm13, %ymm8, %ymm8 -vpor %ymm14, %ymm9, %ymm9 -vpor %ymm15, %ymm10, %ymm10 -#vpor %ymm7, %ymm11, %ymm11 -vpshufb %ymm0, %ymm11, %ymm11 -vpor %ymm6, %ymm12, %ymm12 -vpandn %ymm10, %ymm9, %ymm13 -vpandn %ymm11, %ymm10, %ymm14 -vpandn %ymm12, %ymm11, %ymm15 -vpandn %ymm8, %ymm12, %ymm7 -vpandn %ymm9, %ymm8, %ymm6 -vpxor %ymm8, %ymm13, %ymm13 -vpxor %ymm9, %ymm14, %ymm14 -vpxor %ymm10, %ymm15, %ymm15 -vpxor %ymm11, %ymm7, %ymm7 -vpxor %ymm12, %ymm6, %ymm6 -vmovdqa %ymm13, 320(%rdi) -vmovdqa %ymm14, 352(%rdi) -vmovdqa %ymm15, 384(%rdi) -vmovdqa %ymm7, 416(%rdi) -vmovdqa %ymm6, 448(%rdi) -vpxor 608(%rdi), %ymm1, %ymm8 -vpxor 480(%rdi), %ymm5, %ymm9 -vpxor 512(%rdi), %ymm4, %ymm10 -vpxor 544(%rdi), %ymm3, %ymm11 -vpxor 576(%rdi), %ymm2, %ymm12 -vpsllq $27, %ymm8, %ymm13 -vpsllq $36, %ymm9, %ymm14 -vpsllq $10, %ymm10, %ymm15 -vpsllq $15, %ymm11, %ymm7 -#vpsllq $56, %ymm12, %ymm6 -vpsrlq $37, %ymm8, %ymm8 -vpsrlq $28, %ymm9, %ymm9 -vpsrlq $54, %ymm10, %ymm10 -vpsrlq $49, %ymm11, %ymm11 -#vpsrlq $8, %ymm12, %ymm12 -vpor %ymm13, %ymm8, %ymm8 -vpor %ymm14, %ymm9, %ymm9 -vpor %ymm15, %ymm10, %ymm10 -vpor %ymm7, %ymm11, %ymm11 -#vpor %ymm6, %ymm12, %ymm12 -vpshufb rho56(%rip), %ymm12, %ymm12 -vpandn %ymm10, %ymm9, %ymm13 -vpandn %ymm11, %ymm10, %ymm14 -vpandn %ymm12, %ymm11, %ymm15 -vpandn %ymm8, %ymm12, %ymm7 -vpandn %ymm9, %ymm8, %ymm6 -vpxor %ymm8, %ymm13, %ymm13 -vpxor %ymm9, %ymm14, %ymm14 -vpxor %ymm10, %ymm15, %ymm15 -vpxor %ymm11, %ymm7, %ymm7 -vpxor %ymm12, %ymm6, %ymm6 -vmovdqa %ymm13, 480(%rdi) -vmovdqa %ymm14, 512(%rdi) -vmovdqa %ymm15, 544(%rdi) -vmovdqa %ymm7, 576(%rdi) -vmovdqa %ymm6, 608(%rdi) -vpxor 704(%rdi), %ymm3, %ymm8 -vpxor 736(%rdi), %ymm2, %ymm9 -vpxor 768(%rdi), %ymm1, %ymm10 -vpxor 640(%rdi), %ymm5, %ymm11 -vpxor 672(%rdi), %ymm4, %ymm12 -vpsllq $62, %ymm8, %ymm13 -vpsllq $55, %ymm9, %ymm14 -vpsllq $39, %ymm10, %ymm15 -vpsllq $41, %ymm11, %ymm7 -vpsllq $2, %ymm12, %ymm6 -vpsrlq $2, %ymm8, %ymm8 -vpsrlq $9, %ymm9, %ymm9 -vpsrlq $25, %ymm10, %ymm10 -vpsrlq $23, %ymm11, %ymm11 -vpsrlq $62, %ymm12, %ymm12 -vpor %ymm13, %ymm8, %ymm8 -vpor %ymm14, %ymm9, %ymm9 -vpor %ymm15, %ymm10, %ymm10 -vpor %ymm7, %ymm11, %ymm11 -vpor %ymm6, %ymm12, %ymm12 -vpandn %ymm10, %ymm9, %ymm13 -vpandn %ymm11, %ymm10, %ymm14 -vpandn %ymm12, %ymm11, %ymm15 -vpandn %ymm8, %ymm12, %ymm7 -vpandn %ymm9, %ymm8, %ymm6 -vpxor %ymm8, %ymm13, %ymm13 -vpxor %ymm9, %ymm14, %ymm14 -vpxor %ymm10, %ymm15, %ymm15 -vpxor %ymm11, %ymm7, %ymm7 -vpxor %ymm12, %ymm6, %ymm6 -vmovdqa %ymm13, 640(%rdi) -vmovdqa %ymm14, 672(%rdi) -vmovdqa %ymm15, 704(%rdi) -vmovdqa %ymm7, 736(%rdi) -vmovdqa %ymm6, 768(%rdi) -addq $32, %rsi -subq $1, %rax -jnz looptop -ret diff --git a/src/sig/dilithium/pqcrystals-dilithium_common_avx2/fips202.c b/src/sig/dilithium/pqcrystals-dilithium_common_avx2/fips202.c deleted file mode 100644 index 2afe799ea..000000000 --- a/src/sig/dilithium/pqcrystals-dilithium_common_avx2/fips202.c +++ /dev/null @@ -1,774 +0,0 @@ -/* Based on the public domain implementation in crypto_hash/keccakc512/simple/ from - * http://bench.cr.yp.to/supercop.html by Ronny Van Keer and the public domain "TweetFips202" - * implementation from https://twitter.com/tweetfips202 by Gilles Van Assche, Daniel J. Bernstein, - * and Peter Schwabe */ - -#include -#include -#include "fips202.h" - -#define NROUNDS 24 -#define ROL(a, offset) ((a << offset) ^ (a >> (64-offset))) - -/************************************************* -* Name: load64 -* -* Description: Load 8 bytes into uint64_t in little-endian order -* -* Arguments: - const uint8_t *x: pointer to input byte array -* -* Returns the loaded 64-bit unsigned integer -**************************************************/ -static uint64_t load64(const uint8_t x[8]) { - unsigned int i; - uint64_t r = 0; - - for(i=0;i<8;i++) - r |= (uint64_t)x[i] << 8*i; - - return r; -} - -/************************************************* -* Name: store64 -* -* Description: Store a 64-bit integer to array of 8 bytes in little-endian order -* -* Arguments: - uint8_t *x: pointer to the output byte array (allocated) -* - uint64_t u: input 64-bit unsigned integer -**************************************************/ -static void store64(uint8_t x[8], uint64_t u) { - unsigned int i; - - for(i=0;i<8;i++) - x[i] = u >> 8*i; -} - -/* Keccak round constants */ -const uint64_t KeccakF_RoundConstants[NROUNDS] = { - (uint64_t)0x0000000000000001ULL, - (uint64_t)0x0000000000008082ULL, - (uint64_t)0x800000000000808aULL, - (uint64_t)0x8000000080008000ULL, - (uint64_t)0x000000000000808bULL, - (uint64_t)0x0000000080000001ULL, - (uint64_t)0x8000000080008081ULL, - (uint64_t)0x8000000000008009ULL, - (uint64_t)0x000000000000008aULL, - (uint64_t)0x0000000000000088ULL, - (uint64_t)0x0000000080008009ULL, - (uint64_t)0x000000008000000aULL, - (uint64_t)0x000000008000808bULL, - (uint64_t)0x800000000000008bULL, - (uint64_t)0x8000000000008089ULL, - (uint64_t)0x8000000000008003ULL, - (uint64_t)0x8000000000008002ULL, - (uint64_t)0x8000000000000080ULL, - (uint64_t)0x000000000000800aULL, - (uint64_t)0x800000008000000aULL, - (uint64_t)0x8000000080008081ULL, - (uint64_t)0x8000000000008080ULL, - (uint64_t)0x0000000080000001ULL, - (uint64_t)0x8000000080008008ULL -}; - -/************************************************* -* Name: KeccakF1600_StatePermute -* -* Description: The Keccak F1600 Permutation -* -* Arguments: - uint64_t *state: pointer to input/output Keccak state -**************************************************/ -static void KeccakF1600_StatePermute(uint64_t state[25]) -{ - int round; - - uint64_t Aba, Abe, Abi, Abo, Abu; - uint64_t Aga, Age, Agi, Ago, Agu; - uint64_t Aka, Ake, Aki, Ako, Aku; - uint64_t Ama, Ame, Ami, Amo, Amu; - uint64_t Asa, Ase, Asi, Aso, Asu; - uint64_t BCa, BCe, BCi, BCo, BCu; - uint64_t Da, De, Di, Do, Du; - uint64_t Eba, Ebe, Ebi, Ebo, Ebu; - uint64_t Ega, Ege, Egi, Ego, Egu; - uint64_t Eka, Eke, Eki, Eko, Eku; - uint64_t Ema, Eme, Emi, Emo, Emu; - uint64_t Esa, Ese, Esi, Eso, Esu; - - //copyFromState(A, state) - Aba = state[ 0]; - Abe = state[ 1]; - Abi = state[ 2]; - Abo = state[ 3]; - Abu = state[ 4]; - Aga = state[ 5]; - Age = state[ 6]; - Agi = state[ 7]; - Ago = state[ 8]; - Agu = state[ 9]; - Aka = state[10]; - Ake = state[11]; - Aki = state[12]; - Ako = state[13]; - Aku = state[14]; - Ama = state[15]; - Ame = state[16]; - Ami = state[17]; - Amo = state[18]; - Amu = state[19]; - Asa = state[20]; - Ase = state[21]; - Asi = state[22]; - Aso = state[23]; - Asu = state[24]; - - for(round = 0; round < NROUNDS; round += 2) { - // prepareTheta - BCa = Aba^Aga^Aka^Ama^Asa; - BCe = Abe^Age^Ake^Ame^Ase; - BCi = Abi^Agi^Aki^Ami^Asi; - BCo = Abo^Ago^Ako^Amo^Aso; - BCu = Abu^Agu^Aku^Amu^Asu; - - //thetaRhoPiChiIotaPrepareTheta(round, A, E) - Da = BCu^ROL(BCe, 1); - De = BCa^ROL(BCi, 1); - Di = BCe^ROL(BCo, 1); - Do = BCi^ROL(BCu, 1); - Du = BCo^ROL(BCa, 1); - - Aba ^= Da; - BCa = Aba; - Age ^= De; - BCe = ROL(Age, 44); - Aki ^= Di; - BCi = ROL(Aki, 43); - Amo ^= Do; - BCo = ROL(Amo, 21); - Asu ^= Du; - BCu = ROL(Asu, 14); - Eba = BCa ^((~BCe)& BCi ); - Eba ^= (uint64_t)KeccakF_RoundConstants[round]; - Ebe = BCe ^((~BCi)& BCo ); - Ebi = BCi ^((~BCo)& BCu ); - Ebo = BCo ^((~BCu)& BCa ); - Ebu = BCu ^((~BCa)& BCe ); - - Abo ^= Do; - BCa = ROL(Abo, 28); - Agu ^= Du; - BCe = ROL(Agu, 20); - Aka ^= Da; - BCi = ROL(Aka, 3); - Ame ^= De; - BCo = ROL(Ame, 45); - Asi ^= Di; - BCu = ROL(Asi, 61); - Ega = BCa ^((~BCe)& BCi ); - Ege = BCe ^((~BCi)& BCo ); - Egi = BCi ^((~BCo)& BCu ); - Ego = BCo ^((~BCu)& BCa ); - Egu = BCu ^((~BCa)& BCe ); - - Abe ^= De; - BCa = ROL(Abe, 1); - Agi ^= Di; - BCe = ROL(Agi, 6); - Ako ^= Do; - BCi = ROL(Ako, 25); - Amu ^= Du; - BCo = ROL(Amu, 8); - Asa ^= Da; - BCu = ROL(Asa, 18); - Eka = BCa ^((~BCe)& BCi ); - Eke = BCe ^((~BCi)& BCo ); - Eki = BCi ^((~BCo)& BCu ); - Eko = BCo ^((~BCu)& BCa ); - Eku = BCu ^((~BCa)& BCe ); - - Abu ^= Du; - BCa = ROL(Abu, 27); - Aga ^= Da; - BCe = ROL(Aga, 36); - Ake ^= De; - BCi = ROL(Ake, 10); - Ami ^= Di; - BCo = ROL(Ami, 15); - Aso ^= Do; - BCu = ROL(Aso, 56); - Ema = BCa ^((~BCe)& BCi ); - Eme = BCe ^((~BCi)& BCo ); - Emi = BCi ^((~BCo)& BCu ); - Emo = BCo ^((~BCu)& BCa ); - Emu = BCu ^((~BCa)& BCe ); - - Abi ^= Di; - BCa = ROL(Abi, 62); - Ago ^= Do; - BCe = ROL(Ago, 55); - Aku ^= Du; - BCi = ROL(Aku, 39); - Ama ^= Da; - BCo = ROL(Ama, 41); - Ase ^= De; - BCu = ROL(Ase, 2); - Esa = BCa ^((~BCe)& BCi ); - Ese = BCe ^((~BCi)& BCo ); - Esi = BCi ^((~BCo)& BCu ); - Eso = BCo ^((~BCu)& BCa ); - Esu = BCu ^((~BCa)& BCe ); - - // prepareTheta - BCa = Eba^Ega^Eka^Ema^Esa; - BCe = Ebe^Ege^Eke^Eme^Ese; - BCi = Ebi^Egi^Eki^Emi^Esi; - BCo = Ebo^Ego^Eko^Emo^Eso; - BCu = Ebu^Egu^Eku^Emu^Esu; - - //thetaRhoPiChiIotaPrepareTheta(round+1, E, A) - Da = BCu^ROL(BCe, 1); - De = BCa^ROL(BCi, 1); - Di = BCe^ROL(BCo, 1); - Do = BCi^ROL(BCu, 1); - Du = BCo^ROL(BCa, 1); - - Eba ^= Da; - BCa = Eba; - Ege ^= De; - BCe = ROL(Ege, 44); - Eki ^= Di; - BCi = ROL(Eki, 43); - Emo ^= Do; - BCo = ROL(Emo, 21); - Esu ^= Du; - BCu = ROL(Esu, 14); - Aba = BCa ^((~BCe)& BCi ); - Aba ^= (uint64_t)KeccakF_RoundConstants[round+1]; - Abe = BCe ^((~BCi)& BCo ); - Abi = BCi ^((~BCo)& BCu ); - Abo = BCo ^((~BCu)& BCa ); - Abu = BCu ^((~BCa)& BCe ); - - Ebo ^= Do; - BCa = ROL(Ebo, 28); - Egu ^= Du; - BCe = ROL(Egu, 20); - Eka ^= Da; - BCi = ROL(Eka, 3); - Eme ^= De; - BCo = ROL(Eme, 45); - Esi ^= Di; - BCu = ROL(Esi, 61); - Aga = BCa ^((~BCe)& BCi ); - Age = BCe ^((~BCi)& BCo ); - Agi = BCi ^((~BCo)& BCu ); - Ago = BCo ^((~BCu)& BCa ); - Agu = BCu ^((~BCa)& BCe ); - - Ebe ^= De; - BCa = ROL(Ebe, 1); - Egi ^= Di; - BCe = ROL(Egi, 6); - Eko ^= Do; - BCi = ROL(Eko, 25); - Emu ^= Du; - BCo = ROL(Emu, 8); - Esa ^= Da; - BCu = ROL(Esa, 18); - Aka = BCa ^((~BCe)& BCi ); - Ake = BCe ^((~BCi)& BCo ); - Aki = BCi ^((~BCo)& BCu ); - Ako = BCo ^((~BCu)& BCa ); - Aku = BCu ^((~BCa)& BCe ); - - Ebu ^= Du; - BCa = ROL(Ebu, 27); - Ega ^= Da; - BCe = ROL(Ega, 36); - Eke ^= De; - BCi = ROL(Eke, 10); - Emi ^= Di; - BCo = ROL(Emi, 15); - Eso ^= Do; - BCu = ROL(Eso, 56); - Ama = BCa ^((~BCe)& BCi ); - Ame = BCe ^((~BCi)& BCo ); - Ami = BCi ^((~BCo)& BCu ); - Amo = BCo ^((~BCu)& BCa ); - Amu = BCu ^((~BCa)& BCe ); - - Ebi ^= Di; - BCa = ROL(Ebi, 62); - Ego ^= Do; - BCe = ROL(Ego, 55); - Eku ^= Du; - BCi = ROL(Eku, 39); - Ema ^= Da; - BCo = ROL(Ema, 41); - Ese ^= De; - BCu = ROL(Ese, 2); - Asa = BCa ^((~BCe)& BCi ); - Ase = BCe ^((~BCi)& BCo ); - Asi = BCi ^((~BCo)& BCu ); - Aso = BCo ^((~BCu)& BCa ); - Asu = BCu ^((~BCa)& BCe ); - } - - //copyToState(state, A) - state[ 0] = Aba; - state[ 1] = Abe; - state[ 2] = Abi; - state[ 3] = Abo; - state[ 4] = Abu; - state[ 5] = Aga; - state[ 6] = Age; - state[ 7] = Agi; - state[ 8] = Ago; - state[ 9] = Agu; - state[10] = Aka; - state[11] = Ake; - state[12] = Aki; - state[13] = Ako; - state[14] = Aku; - state[15] = Ama; - state[16] = Ame; - state[17] = Ami; - state[18] = Amo; - state[19] = Amu; - state[20] = Asa; - state[21] = Ase; - state[22] = Asi; - state[23] = Aso; - state[24] = Asu; -} - -/************************************************* -* Name: keccak_init -* -* Description: Initializes the Keccak state. -* -* Arguments: - uint64_t *s: pointer to Keccak state -**************************************************/ -static void keccak_init(uint64_t s[25]) -{ - unsigned int i; - for(i=0;i<25;i++) - s[i] = 0; -} - -/************************************************* -* Name: keccak_absorb -* -* Description: Absorb step of Keccak; incremental. -* -* Arguments: - uint64_t *s: pointer to Keccak state -* - unsigned int pos: position in current block to be absorbed -* - unsigned int r: rate in bytes (e.g., 168 for SHAKE128) -* - const uint8_t *in: pointer to input to be absorbed into s -* - size_t inlen: length of input in bytes -* -* Returns new position pos in current block -**************************************************/ -static unsigned int keccak_absorb(uint64_t s[25], - unsigned int pos, - unsigned int r, - const uint8_t *in, - size_t inlen) -{ - unsigned int i; - - while(pos+inlen >= r) { - for(i=pos;i> 8*(i%8); - outlen -= i-pos; - pos = i; - } - - return pos; -} - - -/************************************************* -* Name: keccak_absorb_once -* -* Description: Absorb step of Keccak; -* non-incremental, starts by zeroeing the state. -* -* Arguments: - uint64_t *s: pointer to (uninitialized) output Keccak state -* - unsigned int r: rate in bytes (e.g., 168 for SHAKE128) -* - const uint8_t *in: pointer to input to be absorbed into s -* - size_t inlen: length of input in bytes -* - uint8_t p: domain-separation byte for different Keccak-derived functions -**************************************************/ -static void keccak_absorb_once(uint64_t s[25], - unsigned int r, - const uint8_t *in, - size_t inlen, - uint8_t p) -{ - unsigned int i; - - for(i=0;i<25;i++) - s[i] = 0; - - while(inlen >= r) { - for(i=0;is); - state->pos = 0; -} - -/************************************************* -* Name: shake128_absorb -* -* Description: Absorb step of the SHAKE128 XOF; incremental. -* -* Arguments: - keccak_state *state: pointer to (initialized) output Keccak state -* - const uint8_t *in: pointer to input to be absorbed into s -* - size_t inlen: length of input in bytes -**************************************************/ -void shake128_absorb(keccak_state *state, const uint8_t *in, size_t inlen) -{ - state->pos = keccak_absorb(state->s, state->pos, SHAKE128_RATE, in, inlen); -} - -/************************************************* -* Name: shake128_finalize -* -* Description: Finalize absorb step of the SHAKE128 XOF. -* -* Arguments: - keccak_state *state: pointer to Keccak state -**************************************************/ -void shake128_finalize(keccak_state *state) -{ - keccak_finalize(state->s, state->pos, SHAKE128_RATE, 0x1F); - state->pos = SHAKE128_RATE; -} - -/************************************************* -* Name: shake128_squeeze -* -* Description: Squeeze step of SHAKE128 XOF. Squeezes arbitraily many -* bytes. Can be called multiple times to keep squeezing. -* -* Arguments: - uint8_t *out: pointer to output blocks -* - size_t outlen : number of bytes to be squeezed (written to output) -* - keccak_state *s: pointer to input/output Keccak state -**************************************************/ -void shake128_squeeze(uint8_t *out, size_t outlen, keccak_state *state) -{ - state->pos = keccak_squeeze(out, outlen, state->s, state->pos, SHAKE128_RATE); -} - -/************************************************* -* Name: shake128_absorb_once -* -* Description: Initialize, absorb into and finalize SHAKE128 XOF; non-incremental. -* -* Arguments: - keccak_state *state: pointer to (uninitialized) output Keccak state -* - const uint8_t *in: pointer to input to be absorbed into s -* - size_t inlen: length of input in bytes -**************************************************/ -void shake128_absorb_once(keccak_state *state, const uint8_t *in, size_t inlen) -{ - keccak_absorb_once(state->s, SHAKE128_RATE, in, inlen, 0x1F); - state->pos = SHAKE128_RATE; -} - -/************************************************* -* Name: shake128_squeezeblocks -* -* Description: Squeeze step of SHAKE128 XOF. Squeezes full blocks of -* SHAKE128_RATE bytes each. Can be called multiple times -* to keep squeezing. Assumes new block has not yet been -* started (state->pos = SHAKE128_RATE). -* -* Arguments: - uint8_t *out: pointer to output blocks -* - size_t nblocks: number of blocks to be squeezed (written to output) -* - keccak_state *s: pointer to input/output Keccak state -**************************************************/ -void shake128_squeezeblocks(uint8_t *out, size_t nblocks, keccak_state *state) -{ - keccak_squeezeblocks(out, nblocks, state->s, SHAKE128_RATE); -} - -/************************************************* -* Name: shake256_init -* -* Description: Initilizes Keccak state for use as SHAKE256 XOF -* -* Arguments: - keccak_state *state: pointer to (uninitialized) Keccak state -**************************************************/ -void shake256_init(keccak_state *state) -{ - keccak_init(state->s); - state->pos = 0; -} - -/************************************************* -* Name: shake256_absorb -* -* Description: Absorb step of the SHAKE256 XOF; incremental. -* -* Arguments: - keccak_state *state: pointer to (initialized) output Keccak state -* - const uint8_t *in: pointer to input to be absorbed into s -* - size_t inlen: length of input in bytes -**************************************************/ -void shake256_absorb(keccak_state *state, const uint8_t *in, size_t inlen) -{ - state->pos = keccak_absorb(state->s, state->pos, SHAKE256_RATE, in, inlen); -} - -/************************************************* -* Name: shake256_finalize -* -* Description: Finalize absorb step of the SHAKE256 XOF. -* -* Arguments: - keccak_state *state: pointer to Keccak state -**************************************************/ -void shake256_finalize(keccak_state *state) -{ - keccak_finalize(state->s, state->pos, SHAKE256_RATE, 0x1F); - state->pos = SHAKE256_RATE; -} - -/************************************************* -* Name: shake256_squeeze -* -* Description: Squeeze step of SHAKE256 XOF. Squeezes arbitraily many -* bytes. Can be called multiple times to keep squeezing. -* -* Arguments: - uint8_t *out: pointer to output blocks -* - size_t outlen : number of bytes to be squeezed (written to output) -* - keccak_state *s: pointer to input/output Keccak state -**************************************************/ -void shake256_squeeze(uint8_t *out, size_t outlen, keccak_state *state) -{ - state->pos = keccak_squeeze(out, outlen, state->s, state->pos, SHAKE256_RATE); -} - -/************************************************* -* Name: shake256_absorb_once -* -* Description: Initialize, absorb into and finalize SHAKE256 XOF; non-incremental. -* -* Arguments: - keccak_state *state: pointer to (uninitialized) output Keccak state -* - const uint8_t *in: pointer to input to be absorbed into s -* - size_t inlen: length of input in bytes -**************************************************/ -void shake256_absorb_once(keccak_state *state, const uint8_t *in, size_t inlen) -{ - keccak_absorb_once(state->s, SHAKE256_RATE, in, inlen, 0x1F); - state->pos = SHAKE256_RATE; -} - -/************************************************* -* Name: shake256_squeezeblocks -* -* Description: Squeeze step of SHAKE256 XOF. Squeezes full blocks of -* SHAKE256_RATE bytes each. Can be called multiple times -* to keep squeezing. Assumes next block has not yet been -* started (state->pos = SHAKE256_RATE). -* -* Arguments: - uint8_t *out: pointer to output blocks -* - size_t nblocks: number of blocks to be squeezed (written to output) -* - keccak_state *s: pointer to input/output Keccak state -**************************************************/ -void shake256_squeezeblocks(uint8_t *out, size_t nblocks, keccak_state *state) -{ - keccak_squeezeblocks(out, nblocks, state->s, SHAKE256_RATE); -} - -/************************************************* -* Name: shake128 -* -* Description: SHAKE128 XOF with non-incremental API -* -* Arguments: - uint8_t *out: pointer to output -* - size_t outlen: requested output length in bytes -* - const uint8_t *in: pointer to input -* - size_t inlen: length of input in bytes -**************************************************/ -void shake128(uint8_t *out, size_t outlen, const uint8_t *in, size_t inlen) -{ - size_t nblocks; - keccak_state state; - - shake128_absorb_once(&state, in, inlen); - nblocks = outlen/SHAKE128_RATE; - shake128_squeezeblocks(out, nblocks, &state); - outlen -= nblocks*SHAKE128_RATE; - out += nblocks*SHAKE128_RATE; - shake128_squeeze(out, outlen, &state); -} - -/************************************************* -* Name: shake256 -* -* Description: SHAKE256 XOF with non-incremental API -* -* Arguments: - uint8_t *out: pointer to output -* - size_t outlen: requested output length in bytes -* - const uint8_t *in: pointer to input -* - size_t inlen: length of input in bytes -**************************************************/ -void shake256(uint8_t *out, size_t outlen, const uint8_t *in, size_t inlen) -{ - size_t nblocks; - keccak_state state; - - shake256_absorb_once(&state, in, inlen); - nblocks = outlen/SHAKE256_RATE; - shake256_squeezeblocks(out, nblocks, &state); - outlen -= nblocks*SHAKE256_RATE; - out += nblocks*SHAKE256_RATE; - shake256_squeeze(out, outlen, &state); -} - -/************************************************* -* Name: sha3_256 -* -* Description: SHA3-256 with non-incremental API -* -* Arguments: - uint8_t *h: pointer to output (32 bytes) -* - const uint8_t *in: pointer to input -* - size_t inlen: length of input in bytes -**************************************************/ -void sha3_256(uint8_t h[32], const uint8_t *in, size_t inlen) -{ - unsigned int i; - uint64_t s[25]; - - keccak_absorb_once(s, SHA3_256_RATE, in, inlen, 0x06); - KeccakF1600_StatePermute(s); - for(i=0;i<4;i++) - store64(h+8*i,s[i]); -} - -/************************************************* -* Name: sha3_512 -* -* Description: SHA3-512 with non-incremental API -* -* Arguments: - uint8_t *h: pointer to output (64 bytes) -* - const uint8_t *in: pointer to input -* - size_t inlen: length of input in bytes -**************************************************/ -void sha3_512(uint8_t h[64], const uint8_t *in, size_t inlen) -{ - unsigned int i; - uint64_t s[25]; - - keccak_absorb_once(s, SHA3_512_RATE, in, inlen, 0x06); - KeccakF1600_StatePermute(s); - for(i=0;i<8;i++) - store64(h+8*i,s[i]); -} diff --git a/src/sig/dilithium/pqcrystals-dilithium_common_avx2/fips202.h b/src/sig/dilithium/pqcrystals-dilithium_common_avx2/fips202.h deleted file mode 100644 index 72fb2c242..000000000 --- a/src/sig/dilithium/pqcrystals-dilithium_common_avx2/fips202.h +++ /dev/null @@ -1,57 +0,0 @@ -#ifndef FIPS202_H -#define FIPS202_H - -#include -#include - -#define SHAKE128_RATE 168 -#define SHAKE256_RATE 136 -#define SHA3_256_RATE 136 -#define SHA3_512_RATE 72 - -#define FIPS202_NAMESPACE(s) pqcrystals_dilithium_fips202_avx2_##s - -typedef struct { - uint64_t s[25]; - unsigned int pos; -} keccak_state; - -#define KeccakF_RoundConstants FIPS202_NAMESPACE(KeccakF_RoundConstants) -extern const uint64_t KeccakF_RoundConstants[]; - -#define shake128_init FIPS202_NAMESPACE(shake128_init) -void shake128_init(keccak_state *state); -#define shake128_absorb FIPS202_NAMESPACE(shake128_absorb) -void shake128_absorb(keccak_state *state, const uint8_t *in, size_t inlen); -#define shake128_finalize FIPS202_NAMESPACE(shake128_finalize) -void shake128_finalize(keccak_state *state); -#define shake128_squeeze FIPS202_NAMESPACE(shake128_squeeze) -void shake128_squeeze(uint8_t *out, size_t outlen, keccak_state *state); -#define shake128_absorb_once FIPS202_NAMESPACE(shake128_absorb_once) -void shake128_absorb_once(keccak_state *state, const uint8_t *in, size_t inlen); -#define shake128_squeezeblocks FIPS202_NAMESPACE(shake128_squeezeblocks) -void shake128_squeezeblocks(uint8_t *out, size_t nblocks, keccak_state *state); - -#define shake256_init FIPS202_NAMESPACE(shake256_init) -void shake256_init(keccak_state *state); -#define shake256_absorb FIPS202_NAMESPACE(shake256_absorb) -void shake256_absorb(keccak_state *state, const uint8_t *in, size_t inlen); -#define shake256_finalize FIPS202_NAMESPACE(shake256_finalize) -void shake256_finalize(keccak_state *state); -#define shake256_squeeze FIPS202_NAMESPACE(shake256_squeeze) -void shake256_squeeze(uint8_t *out, size_t outlen, keccak_state *state); -#define shake256_absorb_once FIPS202_NAMESPACE(shake256_absorb_once) -void shake256_absorb_once(keccak_state *state, const uint8_t *in, size_t inlen); -#define shake256_squeezeblocks FIPS202_NAMESPACE(shake256_squeezeblocks) -void shake256_squeezeblocks(uint8_t *out, size_t nblocks, keccak_state *state); - -#define shake128 FIPS202_NAMESPACE(shake128) -void shake128(uint8_t *out, size_t outlen, const uint8_t *in, size_t inlen); -#define shake256 FIPS202_NAMESPACE(shake256) -void shake256(uint8_t *out, size_t outlen, const uint8_t *in, size_t inlen); -#define sha3_256 FIPS202_NAMESPACE(sha3_256) -void sha3_256(uint8_t h[32], const uint8_t *in, size_t inlen); -#define sha3_512 FIPS202_NAMESPACE(sha3_512) -void sha3_512(uint8_t h[64], const uint8_t *in, size_t inlen); - -#endif diff --git a/src/sig/dilithium/pqcrystals-dilithium_common_avx2/fips202x4.c b/src/sig/dilithium/pqcrystals-dilithium_common_avx2/fips202x4.c deleted file mode 100644 index 2ffa69102..000000000 --- a/src/sig/dilithium/pqcrystals-dilithium_common_avx2/fips202x4.c +++ /dev/null @@ -1,196 +0,0 @@ -#include -#include -#include -#include -#include "fips202.h" -#include "fips202x4.h" - -static void keccakx4_absorb_once(__m256i s[25], - unsigned int r, - const uint8_t *in0, - const uint8_t *in1, - const uint8_t *in2, - const uint8_t *in3, - size_t inlen, - uint8_t p) -{ - size_t i; - uint64_t pos = 0; - __m256i t, idx; - - for(i = 0; i < 25; ++i) - s[i] = _mm256_setzero_si256(); - - idx = _mm256_set_epi64x((long long)in3, (long long)in2, (long long)in1, (long long)in0); - while(inlen >= r) { - for(i = 0; i < r/8; ++i) { - t = _mm256_i64gather_epi64((long long *)pos, idx, 1); - s[i] = _mm256_xor_si256(s[i], t); - pos += 8; - } - inlen -= r; - - f1600x4(s, KeccakF_RoundConstants); - } - - for(i = 0; i < inlen/8; ++i) { - t = _mm256_i64gather_epi64((long long *)pos, idx, 1); - s[i] = _mm256_xor_si256(s[i], t); - pos += 8; - } - inlen -= 8*i; - - if(inlen) { - t = _mm256_i64gather_epi64((long long *)pos, idx, 1); - idx = _mm256_set1_epi64x((1ULL << (8*inlen)) - 1); - t = _mm256_and_si256(t, idx); - s[i] = _mm256_xor_si256(s[i], t); - } - - t = _mm256_set1_epi64x((uint64_t)p << 8*inlen); - s[i] = _mm256_xor_si256(s[i], t); - t = _mm256_set1_epi64x(1ULL << 63); - s[r/8 - 1] = _mm256_xor_si256(s[r/8 - 1], t); -} - -static void keccakx4_squeezeblocks(uint8_t *out0, - uint8_t *out1, - uint8_t *out2, - uint8_t *out3, - size_t nblocks, - unsigned int r, - __m256i s[25]) -{ - unsigned int i; - __m128d t; - - while(nblocks > 0) { - f1600x4(s, KeccakF_RoundConstants); - for(i=0; i < r/8; ++i) { - t = _mm_castsi128_pd(_mm256_castsi256_si128(s[i])); - _mm_storel_pd((__attribute__((__may_alias__)) double *)&out0[8*i], t); - _mm_storeh_pd((__attribute__((__may_alias__)) double *)&out1[8*i], t); - t = _mm_castsi128_pd(_mm256_extracti128_si256(s[i],1)); - _mm_storel_pd((__attribute__((__may_alias__)) double *)&out2[8*i], t); - _mm_storeh_pd((__attribute__((__may_alias__)) double *)&out3[8*i], t); - } - - out0 += r; - out1 += r; - out2 += r; - out3 += r; - --nblocks; - } -} - -void shake128x4_absorb_once(keccakx4_state *state, - const uint8_t *in0, - const uint8_t *in1, - const uint8_t *in2, - const uint8_t *in3, - size_t inlen) -{ - keccakx4_absorb_once(state->s, SHAKE128_RATE, in0, in1, in2, in3, inlen, 0x1F); -} - -void shake128x4_squeezeblocks(uint8_t *out0, - uint8_t *out1, - uint8_t *out2, - uint8_t *out3, - size_t nblocks, - keccakx4_state *state) -{ - keccakx4_squeezeblocks(out0, out1, out2, out3, nblocks, SHAKE128_RATE, state->s); -} - -void shake256x4_absorb_once(keccakx4_state *state, - const uint8_t *in0, - const uint8_t *in1, - const uint8_t *in2, - const uint8_t *in3, - size_t inlen) -{ - keccakx4_absorb_once(state->s, SHAKE256_RATE, in0, in1, in2, in3, inlen, 0x1F); -} - -void shake256x4_squeezeblocks(uint8_t *out0, - uint8_t *out1, - uint8_t *out2, - uint8_t *out3, - size_t nblocks, - keccakx4_state *state) -{ - keccakx4_squeezeblocks(out0, out1, out2, out3, nblocks, SHAKE256_RATE, state->s); -} - -void shake128x4(uint8_t *out0, - uint8_t *out1, - uint8_t *out2, - uint8_t *out3, - size_t outlen, - const uint8_t *in0, - const uint8_t *in1, - const uint8_t *in2, - const uint8_t *in3, - size_t inlen) -{ - unsigned int i; - size_t nblocks = outlen/SHAKE128_RATE; - uint8_t t[4][SHAKE128_RATE]; - keccakx4_state state; - - shake128x4_absorb_once(&state, in0, in1, in2, in3, inlen); - shake128x4_squeezeblocks(out0, out1, out2, out3, nblocks, &state); - - out0 += nblocks*SHAKE128_RATE; - out1 += nblocks*SHAKE128_RATE; - out2 += nblocks*SHAKE128_RATE; - out3 += nblocks*SHAKE128_RATE; - outlen -= nblocks*SHAKE128_RATE; - - if(outlen) { - shake128x4_squeezeblocks(t[0], t[1], t[2], t[3], 1, &state); - for(i = 0; i < outlen; ++i) { - out0[i] = t[0][i]; - out1[i] = t[1][i]; - out2[i] = t[2][i]; - out3[i] = t[3][i]; - } - } -} - -void shake256x4(uint8_t *out0, - uint8_t *out1, - uint8_t *out2, - uint8_t *out3, - size_t outlen, - const uint8_t *in0, - const uint8_t *in1, - const uint8_t *in2, - const uint8_t *in3, - size_t inlen) -{ - unsigned int i; - size_t nblocks = outlen/SHAKE256_RATE; - uint8_t t[4][SHAKE256_RATE]; - keccakx4_state state; - - shake256x4_absorb_once(&state, in0, in1, in2, in3, inlen); - shake256x4_squeezeblocks(out0, out1, out2, out3, nblocks, &state); - - out0 += nblocks*SHAKE256_RATE; - out1 += nblocks*SHAKE256_RATE; - out2 += nblocks*SHAKE256_RATE; - out3 += nblocks*SHAKE256_RATE; - outlen -= nblocks*SHAKE256_RATE; - - if(outlen) { - shake256x4_squeezeblocks(t[0], t[1], t[2], t[3], 1, &state); - for(i = 0; i < outlen; ++i) { - out0[i] = t[0][i]; - out1[i] = t[1][i]; - out2[i] = t[2][i]; - out3[i] = t[3][i]; - } - } -} diff --git a/src/sig/dilithium/pqcrystals-dilithium_common_avx2/fips202x4.h b/src/sig/dilithium/pqcrystals-dilithium_common_avx2/fips202x4.h deleted file mode 100644 index 3288a3a3f..000000000 --- a/src/sig/dilithium/pqcrystals-dilithium_common_avx2/fips202x4.h +++ /dev/null @@ -1,91 +0,0 @@ -#ifndef FIPS202X4_H -#define FIPS202X4_H - -#define FIPS202X4_NAMESPACE(s) pqcrystals_dilithium_fips202x4_avx2_##s - -#ifdef __ASSEMBLER__ -/* The C ABI on MacOS exports all symbols with a leading - * underscore. This means that any symbols we refer to from - * C files (functions) can't be found, and all symbols we - * refer to from ASM also can't be found. - * - * This define helps us get around this - */ -#if defined(__WIN32__) || defined(__APPLE__) -#define decorate(s) _##s -#define _cdecl(s) decorate(s) -#define cdecl(s) _cdecl(FIPS202X4_NAMESPACE(##s)) -#else -#define cdecl(s) FIPS202X4_NAMESPACE(##s) -#endif - -#else -#include -#include -#include - -typedef struct { - __m256i s[25]; -} keccakx4_state; - -#define f1600x4 FIPS202X4_NAMESPACE(f1600x4) -void f1600x4(__m256i *s, const uint64_t *rc); - -#define shake128x4_absorb_once FIPS202X4_NAMESPACE(shake128x4_absorb_once) -void shake128x4_absorb_once(keccakx4_state *state, - const uint8_t *in0, - const uint8_t *in1, - const uint8_t *in2, - const uint8_t *in3, - size_t inlen); - -#define shake128x4_squeezeblocks FIPS202X4_NAMESPACE(shake128x4_squeezeblocks) -void shake128x4_squeezeblocks(uint8_t *out0, - uint8_t *out1, - uint8_t *out2, - uint8_t *out3, - size_t nblocks, - keccakx4_state *state); - -#define shake256x4_absorb_once FIPS202X4_NAMESPACE(shake256x4_absorb_once) -void shake256x4_absorb_once(keccakx4_state *state, - const uint8_t *in0, - const uint8_t *in1, - const uint8_t *in2, - const uint8_t *in3, - size_t inlen); - -#define shake256x4_squeezeblocks FIPS202X4_NAMESPACE(shake256x4_squeezeblocks) -void shake256x4_squeezeblocks(uint8_t *out0, - uint8_t *out1, - uint8_t *out2, - uint8_t *out3, - size_t nblocks, - keccakx4_state *state); - -#define shake128x4 FIPS202X4_NAMESPACE(shake128x4) -void shake128x4(uint8_t *out0, - uint8_t *out1, - uint8_t *out2, - uint8_t *out3, - size_t outlen, - const uint8_t *in0, - const uint8_t *in1, - const uint8_t *in2, - const uint8_t *in3, - size_t inlen); - -#define shake256x4 FIPS202X4_NAMESPACE(shake256x4) -void shake256x4(uint8_t *out0, - uint8_t *out1, - uint8_t *out2, - uint8_t *out3, - size_t outlen, - const uint8_t *in0, - const uint8_t *in1, - const uint8_t *in2, - const uint8_t *in3, - size_t inlen); - -#endif -#endif