diff --git a/src/crypto/sha3/keccak4x/KeccakP-1600-times4-SIMD256.c b/src/crypto/sha3/keccak4x/KeccakP-1600-times4-SIMD256.c
old mode 100755
new mode 100644
index f5461d448..3ce581937
--- a/src/crypto/sha3/keccak4x/KeccakP-1600-times4-SIMD256.c
+++ b/src/crypto/sha3/keccak4x/KeccakP-1600-times4-SIMD256.c
@@ -36,295 +36,293 @@
 typedef unsigned long long int UINT64;
 typedef __m128i V128;
 typedef __m256i V256;
 
-#define laneIndex(instanceIndex, lanePosition) ((lanePosition)*4 + instanceIndex)
+#define laneIndex(instanceIndex, lanePosition) ((lanePosition) * 4 + instanceIndex)
 
 #if defined(KeccakP1600times4_useAVX2)
-    #define ANDnu256(a, b) _mm256_andnot_si256(a, b)
-    #define CONST256(a) _mm256_load_si256((const V256 *)&(a))
-    #define CONST256_64(a) (V256)_mm256_broadcast_sd((const double*)(&a))
-    #define LOAD256(a) _mm256_load_si256((const V256 *)&(a))
-    #define LOAD256u(a) _mm256_loadu_si256((const V256 *)&(a))
-    #define LOAD4_64(a, b, c, d) _mm256_set_epi64x((UINT64)(a), (UINT64)(b), (UINT64)(c), (UINT64)(d))
-    #define ROL64in256(d, a, o) d = _mm256_or_si256(_mm256_slli_epi64(a, o), _mm256_srli_epi64(a, 64-(o)))
-    #define ROL64in256_8(d, a) d = _mm256_shuffle_epi8(a, CONST256(rho8))
-    #define ROL64in256_56(d, a) d = _mm256_shuffle_epi8(a, CONST256(rho56))
+#define ANDnu256(a, b) _mm256_andnot_si256(a, b)
+#define CONST256(a) _mm256_load_si256((const V256 *) &(a))
+#define CONST256_64(a) (V256) _mm256_broadcast_sd((const double *) (&a))
+#define LOAD256(a) _mm256_load_si256((const V256 *) &(a))
+#define LOAD256u(a) _mm256_loadu_si256((const V256 *) &(a))
+#define LOAD4_64(a, b, c, d) _mm256_set_epi64x((UINT64)(a), (UINT64)(b), (UINT64)(c), (UINT64)(d))
+#define ROL64in256(d, a, o) d = _mm256_or_si256(_mm256_slli_epi64(a, o), _mm256_srli_epi64(a, 64 - (o)))
+#define ROL64in256_8(d, a) d = _mm256_shuffle_epi8(a, CONST256(rho8))
+#define ROL64in256_56(d, a) d = _mm256_shuffle_epi8(a, CONST256(rho56))
 static const UINT64 rho8[4] = {0x0605040302010007, 0x0E0D0C0B0A09080F, 0x1615141312111017, 0x1E1D1C1B1A19181F};
 static const UINT64 rho56[4] = {0x0007060504030201, 0x080F0E0D0C0B0A09, 0x1017161514131211, 0x181F1E1D1C1B1A19};
-    #define STORE256(a, b) _mm256_store_si256((V256 *)&(a), b)
-    #define STORE256u(a, b) _mm256_storeu_si256((V256 *)&(a), b)
-    #define STORE2_128(ah, al, v) _mm256_storeu2_m128d((V128*)&(ah), (V128*)&(al), v)
-    #define XOR256(a, b) _mm256_xor_si256(a, b)
-    #define XOReq256(a, b) a = _mm256_xor_si256(a, b)
-    #define UNPACKL( a, b ) _mm256_unpacklo_epi64((a), (b))
-    #define UNPACKH( a, b ) _mm256_unpackhi_epi64((a), (b))
-    #define PERM128( a, b, c ) (V256)_mm256_permute2f128_ps((__m256)(a), (__m256)(b), c)
-    #define SHUFFLE64( a, b, c ) (V256)_mm256_shuffle_pd((__m256d)(a), (__m256d)(b), c)
+#define STORE256(a, b) _mm256_store_si256((V256 *) &(a), b)
+#define STORE256u(a, b) _mm256_storeu_si256((V256 *) &(a), b)
+#define STORE2_128(ah, al, v) _mm256_storeu2_m128d((V128 *) &(ah), (V128 *) &(al), v)
+#define XOR256(a, b) _mm256_xor_si256(a, b)
+#define XOReq256(a, b) a = _mm256_xor_si256(a, b)
+#define UNPACKL(a, b) _mm256_unpacklo_epi64((a), (b))
+#define UNPACKH(a, b) _mm256_unpackhi_epi64((a), (b))
+#define PERM128(a, b, c) (V256) _mm256_permute2f128_ps((__m256)(a), (__m256)(b), c)
+#define SHUFFLE64(a, b, c) (V256) _mm256_shuffle_pd((__m256d)(a), (__m256d)(b), c)
 
-    #define UNINTLEAVE() lanesL01 = UNPACKL( lanes0, lanes1 ), \
-                         lanesH01 = UNPACKH( lanes0, lanes1 ), \
-                         lanesL23 = UNPACKL( lanes2, lanes3 ), \
-                         lanesH23 = UNPACKH( lanes2, lanes3 ), \
-                         lanes0 = PERM128( lanesL01, lanesL23, 0x20 ), \
-                         lanes2 = PERM128( lanesL01, lanesL23, 0x31 ), \
-                         lanes1 = PERM128( lanesH01, lanesH23, 0x20 ), \
-                         lanes3 = PERM128( lanesH01, lanesH23, 0x31 )
+#define UNINTLEAVE() lanesL01 = UNPACKL(lanes0, lanes1), \
+                     lanesH01 = UNPACKH(lanes0, lanes1), \
+                     lanesL23 = UNPACKL(lanes2, lanes3), \
+                     lanesH23 = UNPACKH(lanes2, lanes3), \
+                     lanes0 = PERM128(lanesL01, lanesL23, 0x20), \
+                     lanes2 = PERM128(lanesL01, lanesL23, 0x31), \
+                     lanes1 = PERM128(lanesH01, lanesH23, 0x20), \
+                     lanes3 = PERM128(lanesH01, lanesH23, 0x31)
 
-    #define INTLEAVE() lanesL01 = PERM128( lanes0, lanes2, 0x20 ), \
-                       lanesH01 = PERM128( lanes1, lanes3, 0x20 ), \
-                       lanesL23 = PERM128( lanes0, lanes2, 0x31 ), \
-                       lanesH23 = PERM128( lanes1, lanes3, 0x31 ), \
-                       lanes0 = SHUFFLE64( lanesL01, lanesH01, 0x00 ), \
-                       lanes1 = SHUFFLE64( lanesL01, lanesH01, 0x0F ), \
-                       lanes2 = SHUFFLE64( lanesL23, lanesH23, 0x00 ), \
-                       lanes3 = SHUFFLE64( lanesL23, lanesH23, 0x0F )
+#define INTLEAVE() lanesL01 = PERM128(lanes0, lanes2, 0x20), \
+                   lanesH01 = PERM128(lanes1, lanes3, 0x20), \
+                   lanesL23 = PERM128(lanes0, lanes2, 0x31), \
+                   lanesH23 = PERM128(lanes1, lanes3, 0x31), \
+                   lanes0 = SHUFFLE64(lanesL01, lanesH01, 0x00), \
+                   lanes1 = SHUFFLE64(lanesL01, lanesH01, 0x0F), \
+                   lanes2 = SHUFFLE64(lanesL23, lanesH23, 0x00), \
+                   lanes3 = SHUFFLE64(lanesL23, lanesH23, 0x0F)
 #endif
 
 #define SnP_laneLengthInBytes 8
 
-#define declareABCDE \
-    V256 Aba, Abe, Abi, Abo, Abu; \
-    V256 Aga, Age, Agi, Ago, Agu; \
-    V256 Aka, Ake, Aki, Ako, Aku; \
-    V256 Ama, Ame, Ami, Amo, Amu; \
-    V256 Asa, Ase, Asi, Aso, Asu; \
-    V256 Bba, Bbe, Bbi, Bbo, Bbu; \
-    V256 Bga, Bge, Bgi, Bgo, Bgu; \
-    V256 Bka, Bke, Bki, Bko, Bku; \
-    V256 Bma, Bme, Bmi, Bmo, Bmu; \
-    V256 Bsa, Bse, Bsi, Bso, Bsu; \
-    V256 Ca, Ce, Ci, Co, Cu; \
-    V256 Ca1, Ce1, Ci1, Co1, Cu1; \
-    V256 Da, De, Di, Do, Du; \
-    V256 Eba, Ebe, Ebi, Ebo, Ebu; \
-    V256 Ega, Ege, Egi, Ego, Egu; \
-    V256 Eka, Eke, Eki, Eko, Eku; \
-    V256 Ema, Eme, Emi, Emo, Emu; \
-    V256 Esa, Ese, Esi, Eso, Esu; \
+#define declareABCDE \
+    V256 Aba, Abe, Abi, Abo, Abu; \
+    V256 Aga, Age, Agi, Ago, Agu; \
+    V256 Aka, Ake, Aki, Ako, Aku; \
+    V256 Ama, Ame, Ami, Amo, Amu; \
+    V256 Asa, Ase, Asi, Aso, Asu; \
+    V256 Bba, Bbe, Bbi, Bbo, Bbu; \
+    V256 Bga, Bge, Bgi, Bgo, Bgu; \
+    V256 Bka, Bke, Bki, Bko, Bku; \
+    V256 Bma, Bme, Bmi, Bmo, Bmu; \
+    V256 Bsa, Bse, Bsi, Bso, Bsu; \
+    V256 Ca, Ce, Ci, Co, Cu; \
+    V256 Ca1, Ce1, Ci1, Co1, Cu1; \
+    V256 Da, De, Di, Do, Du; \
+    V256 Eba, Ebe, Ebi, Ebo, Ebu; \
+    V256 Ega, Ege, Egi, Ego, Egu; \
+    V256 Eka, Eke, Eki, Eko, Eku; \
+    V256 Ema, Eme, Emi, Emo, Emu; \
+    V256 Esa, Ese, Esi, Eso, Esu;
 
-#define prepareTheta \
-    Ca = XOR256(Aba, XOR256(Aga, XOR256(Aka, XOR256(Ama, Asa)))); \
-    Ce = XOR256(Abe, XOR256(Age, XOR256(Ake, XOR256(Ame, Ase)))); \
-    Ci = XOR256(Abi, XOR256(Agi, XOR256(Aki, XOR256(Ami, Asi)))); \
-    Co = XOR256(Abo, XOR256(Ago, XOR256(Ako, XOR256(Amo, Aso)))); \
-    Cu = XOR256(Abu, XOR256(Agu, XOR256(Aku, XOR256(Amu, Asu)))); \
+#define prepareTheta \
+    Ca = XOR256(Aba, XOR256(Aga, XOR256(Aka, XOR256(Ama, Asa)))); \
+    Ce = XOR256(Abe, XOR256(Age, XOR256(Ake, XOR256(Ame, Ase)))); \
+    Ci = XOR256(Abi, XOR256(Agi, XOR256(Aki, XOR256(Ami, Asi)))); \
+    Co = XOR256(Abo, XOR256(Ago, XOR256(Ako, XOR256(Amo, Aso)))); \
+    Cu = XOR256(Abu, XOR256(Agu, XOR256(Aku, XOR256(Amu, Asu))));
 
 /* --- Theta Rho Pi Chi Iota Prepare-theta */
 /* --- 64-bit lanes mapped to 64-bit words */
 
-#define thetaRhoPiChiIotaPrepareTheta(i, A, E) \
-    ROL64in256(Ce1, Ce, 1); \
-    Da = XOR256(Cu, Ce1); \
-    ROL64in256(Ci1, Ci, 1); \
-    De = XOR256(Ca, Ci1); \
-    ROL64in256(Co1, Co, 1); \
-    Di = XOR256(Ce, Co1); \
-    ROL64in256(Cu1, Cu, 1); \
-    Do = XOR256(Ci, Cu1); \
-    ROL64in256(Ca1, Ca, 1); \
-    Du = XOR256(Co, Ca1); \
-\
-    XOReq256(A##ba, Da); \
-    Bba = A##ba; \
-    XOReq256(A##ge, De); \
-    ROL64in256(Bbe, A##ge, 44); \
-    XOReq256(A##ki, Di); \
-    ROL64in256(Bbi, A##ki, 43); \
-    E##ba = XOR256(Bba, ANDnu256(Bbe, Bbi)); \
-    XOReq256(E##ba, CONST256_64(KeccakF1600RoundConstants[i])); \
-    Ca = E##ba; \
-    XOReq256(A##mo, Do); \
-    ROL64in256(Bbo, A##mo, 21); \
-    E##be = XOR256(Bbe, ANDnu256(Bbi, Bbo)); \
-    Ce = E##be; \
-    XOReq256(A##su, Du); \
-    ROL64in256(Bbu, A##su, 14); \
-    E##bi = XOR256(Bbi, ANDnu256(Bbo, Bbu)); \
-    Ci = E##bi; \
-    E##bo = XOR256(Bbo, ANDnu256(Bbu, Bba)); \
-    Co = E##bo; \
-    E##bu = XOR256(Bbu, ANDnu256(Bba, Bbe)); \
-    Cu = E##bu; \
-\
-    XOReq256(A##bo, Do); \
-    ROL64in256(Bga, A##bo, 28); \
-    XOReq256(A##gu, Du); \
-    ROL64in256(Bge, A##gu, 20); \
-    XOReq256(A##ka, Da); \
-    ROL64in256(Bgi, A##ka, 3); \
-    E##ga = XOR256(Bga, ANDnu256(Bge, Bgi)); \
-    XOReq256(Ca, E##ga); \
-    XOReq256(A##me, De); \
-    ROL64in256(Bgo, A##me, 45); \
-    E##ge = XOR256(Bge, ANDnu256(Bgi, Bgo)); \
-    XOReq256(Ce, E##ge); \
-    XOReq256(A##si, Di); \
-    ROL64in256(Bgu, A##si, 61); \
-    E##gi = XOR256(Bgi, ANDnu256(Bgo, Bgu)); \
-    XOReq256(Ci, E##gi); \
-    E##go = XOR256(Bgo, ANDnu256(Bgu, Bga)); \
-    XOReq256(Co, E##go); \
-    E##gu = XOR256(Bgu, ANDnu256(Bga, Bge)); \
-    XOReq256(Cu, E##gu); \
-\
-    XOReq256(A##be, De); \
-    ROL64in256(Bka, A##be, 1); \
-    XOReq256(A##gi, Di); \
-    ROL64in256(Bke, A##gi, 6); \
-    XOReq256(A##ko, Do); \
-    ROL64in256(Bki, A##ko, 25); \
-    E##ka = XOR256(Bka, ANDnu256(Bke, Bki)); \
-    XOReq256(Ca, E##ka); \
-    XOReq256(A##mu, Du); \
-    ROL64in256_8(Bko, A##mu); \
-    E##ke = XOR256(Bke, ANDnu256(Bki, Bko)); \
-    XOReq256(Ce, E##ke); \
-    XOReq256(A##sa, Da); \
-    ROL64in256(Bku, A##sa, 18); \
-    E##ki = XOR256(Bki, ANDnu256(Bko, Bku)); \
-    XOReq256(Ci, E##ki); \
-    E##ko = XOR256(Bko, ANDnu256(Bku, Bka)); \
-    XOReq256(Co, E##ko); \
-    E##ku = XOR256(Bku, ANDnu256(Bka, Bke)); \
-    XOReq256(Cu, E##ku); \
-\
-    XOReq256(A##bu, Du); \
-    ROL64in256(Bma, A##bu, 27); \
-    XOReq256(A##ga, Da); \
-    ROL64in256(Bme, A##ga, 36); \
-    XOReq256(A##ke, De); \
-    ROL64in256(Bmi, A##ke, 10); \
-    E##ma = XOR256(Bma, ANDnu256(Bme, Bmi)); \
-    XOReq256(Ca, E##ma); \
-    XOReq256(A##mi, Di); \
-    ROL64in256(Bmo, A##mi, 15); \
-    E##me = XOR256(Bme, ANDnu256(Bmi, Bmo)); \
-    XOReq256(Ce, E##me); \
-    XOReq256(A##so, Do); \
-    ROL64in256_56(Bmu, A##so); \
-    E##mi = XOR256(Bmi, ANDnu256(Bmo, Bmu)); \
-    XOReq256(Ci, E##mi); \
-    E##mo = XOR256(Bmo, ANDnu256(Bmu, Bma)); \
-    XOReq256(Co, E##mo); \
-    E##mu = XOR256(Bmu, ANDnu256(Bma, Bme)); \
-    XOReq256(Cu, E##mu); \
-\
-    XOReq256(A##bi, Di); \
-    ROL64in256(Bsa, A##bi, 62); \
-    XOReq256(A##go, Do); \
-    ROL64in256(Bse, A##go, 55); \
-    XOReq256(A##ku, Du); \
-    ROL64in256(Bsi, A##ku, 39); \
-    E##sa = XOR256(Bsa, ANDnu256(Bse, Bsi)); \
-    XOReq256(Ca, E##sa); \
-    XOReq256(A##ma, Da); \
-    ROL64in256(Bso, A##ma, 41); \
-    E##se = XOR256(Bse, ANDnu256(Bsi, Bso)); \
-    XOReq256(Ce, E##se); \
-    XOReq256(A##se, De); \
-    ROL64in256(Bsu, A##se, 2); \
-    E##si = XOR256(Bsi, ANDnu256(Bso, Bsu)); \
-    XOReq256(Ci, E##si); \
-    E##so = XOR256(Bso, ANDnu256(Bsu, Bsa)); \
-    XOReq256(Co, E##so); \
-    E##su = XOR256(Bsu, ANDnu256(Bsa, Bse)); \
-    XOReq256(Cu, E##su); \
-\
+#define thetaRhoPiChiIotaPrepareTheta(i, A, E) \
+    ROL64in256(Ce1, Ce, 1); \
+    Da = XOR256(Cu, Ce1); \
+    ROL64in256(Ci1, Ci, 1); \
+    De = XOR256(Ca, Ci1); \
+    ROL64in256(Co1, Co, 1); \
+    Di = XOR256(Ce, Co1); \
+    ROL64in256(Cu1, Cu, 1); \
+    Do = XOR256(Ci, Cu1); \
+    ROL64in256(Ca1, Ca, 1); \
+    Du = XOR256(Co, Ca1); \
+    \
+    XOReq256(A##ba, Da); \
+    Bba = A##ba; \
+    XOReq256(A##ge, De); \
+    ROL64in256(Bbe, A##ge, 44); \
+    XOReq256(A##ki, Di); \
+    ROL64in256(Bbi, A##ki, 43); \
+    E##ba = XOR256(Bba, ANDnu256(Bbe, Bbi)); \
+    XOReq256(E##ba, CONST256_64(KeccakF1600RoundConstants[i])); \
+    Ca = E##ba; \
+    XOReq256(A##mo, Do); \
+    ROL64in256(Bbo, A##mo, 21); \
+    E##be = XOR256(Bbe, ANDnu256(Bbi, Bbo)); \
+    Ce = E##be; \
+    XOReq256(A##su, Du); \
+    ROL64in256(Bbu, A##su, 14); \
+    E##bi = XOR256(Bbi, ANDnu256(Bbo, Bbu)); \
+    Ci = E##bi; \
+    E##bo = XOR256(Bbo, ANDnu256(Bbu, Bba)); \
+    Co = E##bo; \
+    E##bu = XOR256(Bbu, ANDnu256(Bba, Bbe)); \
+    Cu = E##bu; \
+    \
+    XOReq256(A##bo, Do); \
+    ROL64in256(Bga, A##bo, 28); \
+    XOReq256(A##gu, Du); \
+    ROL64in256(Bge, A##gu, 20); \
+    XOReq256(A##ka, Da); \
+    ROL64in256(Bgi, A##ka, 3); \
+    E##ga = XOR256(Bga, ANDnu256(Bge, Bgi)); \
+    XOReq256(Ca, E##ga); \
+    XOReq256(A##me, De); \
+    ROL64in256(Bgo, A##me, 45); \
+    E##ge = XOR256(Bge, ANDnu256(Bgi, Bgo)); \
+    XOReq256(Ce, E##ge); \
+    XOReq256(A##si, Di); \
+    ROL64in256(Bgu, A##si, 61); \
+    E##gi = XOR256(Bgi, ANDnu256(Bgo, Bgu)); \
+    XOReq256(Ci, E##gi); \
+    E##go = XOR256(Bgo, ANDnu256(Bgu, Bga)); \
+    XOReq256(Co, E##go); \
+    E##gu = XOR256(Bgu, ANDnu256(Bga, Bge)); \
+    XOReq256(Cu, E##gu); \
+    \
+    XOReq256(A##be, De); \
+    ROL64in256(Bka, A##be, 1); \
+    XOReq256(A##gi, Di); \
+    ROL64in256(Bke, A##gi, 6); \
+    XOReq256(A##ko, Do); \
+    ROL64in256(Bki, A##ko, 25); \
+    E##ka = XOR256(Bka, ANDnu256(Bke, Bki)); \
+    XOReq256(Ca, E##ka); \
+    XOReq256(A##mu, Du); \
+    ROL64in256_8(Bko, A##mu); \
+    E##ke = XOR256(Bke, ANDnu256(Bki, Bko)); \
+    XOReq256(Ce, E##ke); \
+    XOReq256(A##sa, Da); \
+    ROL64in256(Bku, A##sa, 18); \
+    E##ki = XOR256(Bki, ANDnu256(Bko, Bku)); \
+    XOReq256(Ci, E##ki); \
+    E##ko = XOR256(Bko, ANDnu256(Bku, Bka)); \
+    XOReq256(Co, E##ko); \
+    E##ku = XOR256(Bku, ANDnu256(Bka, Bke)); \
+    XOReq256(Cu, E##ku); \
+    \
+    XOReq256(A##bu, Du); \
+    ROL64in256(Bma, A##bu, 27); \
+    XOReq256(A##ga, Da); \
+    ROL64in256(Bme, A##ga, 36); \
+    XOReq256(A##ke, De); \
+    ROL64in256(Bmi, A##ke, 10); \
+    E##ma = XOR256(Bma, ANDnu256(Bme, Bmi)); \
+    XOReq256(Ca, E##ma); \
+    XOReq256(A##mi, Di); \
+    ROL64in256(Bmo, A##mi, 15); \
+    E##me = XOR256(Bme, ANDnu256(Bmi, Bmo)); \
+    XOReq256(Ce, E##me); \
+    XOReq256(A##so, Do); \
+    ROL64in256_56(Bmu, A##so); \
+    E##mi = XOR256(Bmi, ANDnu256(Bmo, Bmu)); \
+    XOReq256(Ci, E##mi); \
+    E##mo = XOR256(Bmo, ANDnu256(Bmu, Bma)); \
+    XOReq256(Co, E##mo); \
+    E##mu = XOR256(Bmu, ANDnu256(Bma, Bme)); \
+    XOReq256(Cu, E##mu); \
+    \
+    XOReq256(A##bi, Di); \
+    ROL64in256(Bsa, A##bi, 62); \
+    XOReq256(A##go, Do); \
+    ROL64in256(Bse, A##go, 55); \
+    XOReq256(A##ku, Du); \
+    ROL64in256(Bsi, A##ku, 39); \
+    E##sa = XOR256(Bsa, ANDnu256(Bse, Bsi)); \
+    XOReq256(Ca, E##sa); \
+    XOReq256(A##ma, Da); \
+    ROL64in256(Bso, A##ma, 41); \
+    E##se = XOR256(Bse, ANDnu256(Bsi, Bso)); \
+    XOReq256(Ce, E##se); \
+    XOReq256(A##se, De); \
+    ROL64in256(Bsu, A##se, 2); \
+    E##si = XOR256(Bsi, ANDnu256(Bso, Bsu)); \
+    XOReq256(Ci, E##si); \
+    E##so = XOR256(Bso, ANDnu256(Bsu, Bsa)); \
+    XOReq256(Co, E##so); \
+    E##su = XOR256(Bsu, ANDnu256(Bsa, Bse)); \
+    XOReq256(Cu, E##su);
 
 /* --- Theta Rho Pi Chi Iota */
 /* --- 64-bit lanes mapped to 64-bit words */
 
-#define thetaRhoPiChiIota(i, A, E) \
-    ROL64in256(Ce1, Ce, 1); \
-    Da = XOR256(Cu, Ce1); \
-    ROL64in256(Ci1, Ci, 1); \
-    De = XOR256(Ca, Ci1); \
-    ROL64in256(Co1, Co, 1); \
-    Di = XOR256(Ce, Co1); \
-    ROL64in256(Cu1, Cu, 1); \
-    Do = XOR256(Ci, Cu1); \
-    ROL64in256(Ca1, Ca, 1); \
-    Du = XOR256(Co, Ca1); \
-\
-    XOReq256(A##ba, Da); \
-    Bba = A##ba; \
-    XOReq256(A##ge, De); \
-    ROL64in256(Bbe, A##ge, 44); \
-    XOReq256(A##ki, Di); \
-    ROL64in256(Bbi, A##ki, 43); \
-    E##ba = XOR256(Bba, ANDnu256(Bbe, Bbi)); \
-    XOReq256(E##ba, CONST256_64(KeccakF1600RoundConstants[i])); \
-    XOReq256(A##mo, Do); \
-    ROL64in256(Bbo, A##mo, 21); \
-    E##be = XOR256(Bbe, ANDnu256(Bbi, Bbo)); \
-    XOReq256(A##su, Du); \
-    ROL64in256(Bbu, A##su, 14); \
-    E##bi = XOR256(Bbi, ANDnu256(Bbo, Bbu)); \
-    E##bo = XOR256(Bbo, ANDnu256(Bbu, Bba)); \
-    E##bu = XOR256(Bbu, ANDnu256(Bba, Bbe)); \
-\
-    XOReq256(A##bo, Do); \
-    ROL64in256(Bga, A##bo, 28); \
-    XOReq256(A##gu, Du); \
-    ROL64in256(Bge, A##gu, 20); \
-    XOReq256(A##ka, Da); \
-    ROL64in256(Bgi, A##ka, 3); \
-    E##ga = XOR256(Bga, ANDnu256(Bge, Bgi)); \
-    XOReq256(A##me, De); \
-    ROL64in256(Bgo, A##me, 45); \
-    E##ge = XOR256(Bge, ANDnu256(Bgi, Bgo)); \
-    XOReq256(A##si, Di); \
-    ROL64in256(Bgu, A##si, 61); \
-    E##gi = XOR256(Bgi, ANDnu256(Bgo, Bgu)); \
-    E##go = XOR256(Bgo, ANDnu256(Bgu, Bga)); \
-    E##gu = XOR256(Bgu, ANDnu256(Bga, Bge)); \
-\
-    XOReq256(A##be, De); \
-    ROL64in256(Bka, A##be, 1); \
-    XOReq256(A##gi, Di); \
-    ROL64in256(Bke, A##gi, 6); \
-    XOReq256(A##ko, Do); \
-    ROL64in256(Bki, A##ko, 25); \
-    E##ka = XOR256(Bka, ANDnu256(Bke, Bki)); \
-    XOReq256(A##mu, Du); \
-    ROL64in256_8(Bko, A##mu); \
-    E##ke = XOR256(Bke, ANDnu256(Bki, Bko)); \
-    XOReq256(A##sa, Da); \
-    ROL64in256(Bku, A##sa, 18); \
-    E##ki = XOR256(Bki, ANDnu256(Bko, Bku)); \
-    E##ko = XOR256(Bko, ANDnu256(Bku, Bka)); \
-    E##ku = XOR256(Bku, ANDnu256(Bka, Bke)); \
-\
-    XOReq256(A##bu, Du); \
-    ROL64in256(Bma, A##bu, 27); \
-    XOReq256(A##ga, Da); \
-    ROL64in256(Bme, A##ga, 36); \
-    XOReq256(A##ke, De); \
-    ROL64in256(Bmi, A##ke, 10); \
-    E##ma = XOR256(Bma, ANDnu256(Bme, Bmi)); \
-    XOReq256(A##mi, Di); \
-    ROL64in256(Bmo, A##mi, 15); \
-    E##me = XOR256(Bme, ANDnu256(Bmi, Bmo)); \
-    XOReq256(A##so, Do); \
-    ROL64in256_56(Bmu, A##so); \
-    E##mi = XOR256(Bmi, ANDnu256(Bmo, Bmu)); \
-    E##mo = XOR256(Bmo, ANDnu256(Bmu, Bma)); \
-    E##mu = XOR256(Bmu, ANDnu256(Bma, Bme)); \
-\
-    XOReq256(A##bi, Di); \
-    ROL64in256(Bsa, A##bi, 62); \
-    XOReq256(A##go, Do); \
-    ROL64in256(Bse, A##go, 55); \
-    XOReq256(A##ku, Du); \
-    ROL64in256(Bsi, A##ku, 39); \
-    E##sa = XOR256(Bsa, ANDnu256(Bse, Bsi)); \
-    XOReq256(A##ma, Da); \
-    ROL64in256(Bso, A##ma, 41); \
-    E##se = XOR256(Bse, ANDnu256(Bsi, Bso)); \
-    XOReq256(A##se, De); \
-    ROL64in256(Bsu, A##se, 2); \
-    E##si = XOR256(Bsi, ANDnu256(Bso, Bsu)); \
-    E##so = XOR256(Bso, ANDnu256(Bsu, Bsa)); \
-    E##su = XOR256(Bsu, ANDnu256(Bsa, Bse)); \
-\
+#define thetaRhoPiChiIota(i, A, E) \
+    ROL64in256(Ce1, Ce, 1); \
+    Da = XOR256(Cu, Ce1); \
+    ROL64in256(Ci1, Ci, 1); \
+    De = XOR256(Ca, Ci1); \
+    ROL64in256(Co1, Co, 1); \
+    Di = XOR256(Ce, Co1); \
+    ROL64in256(Cu1, Cu, 1); \
+    Do = XOR256(Ci, Cu1); \
+    ROL64in256(Ca1, Ca, 1); \
+    Du = XOR256(Co, Ca1); \
+    \
+    XOReq256(A##ba, Da); \
+    Bba = A##ba; \
+    XOReq256(A##ge, De); \
+    ROL64in256(Bbe, A##ge, 44); \
+    XOReq256(A##ki, Di); \
+    ROL64in256(Bbi, A##ki, 43); \
+    E##ba = XOR256(Bba, ANDnu256(Bbe, Bbi)); \
+    XOReq256(E##ba, CONST256_64(KeccakF1600RoundConstants[i])); \
+    XOReq256(A##mo, Do); \
+    ROL64in256(Bbo, A##mo, 21); \
+    E##be = XOR256(Bbe, ANDnu256(Bbi, Bbo)); \
+    XOReq256(A##su, Du); \
+    ROL64in256(Bbu, A##su, 14); \
+    E##bi = XOR256(Bbi, ANDnu256(Bbo, Bbu)); \
+    E##bo = XOR256(Bbo, ANDnu256(Bbu, Bba)); \
+    E##bu = XOR256(Bbu, ANDnu256(Bba, Bbe)); \
+    \
+    XOReq256(A##bo, Do); \
+    ROL64in256(Bga, A##bo, 28); \
+    XOReq256(A##gu, Du); \
+    ROL64in256(Bge, A##gu, 20); \
+    XOReq256(A##ka, Da); \
+    ROL64in256(Bgi, A##ka, 3); \
+    E##ga = XOR256(Bga, ANDnu256(Bge, Bgi)); \
+    XOReq256(A##me, De); \
+    ROL64in256(Bgo, A##me, 45); \
+    E##ge = XOR256(Bge, ANDnu256(Bgi, Bgo)); \
+    XOReq256(A##si, Di); \
+    ROL64in256(Bgu, A##si, 61); \
+    E##gi = XOR256(Bgi, ANDnu256(Bgo, Bgu)); \
+    E##go = XOR256(Bgo, ANDnu256(Bgu, Bga)); \
+    E##gu = XOR256(Bgu, ANDnu256(Bga, Bge)); \
+    \
+    XOReq256(A##be, De); \
+    ROL64in256(Bka, A##be, 1); \
+    XOReq256(A##gi, Di); \
+    ROL64in256(Bke, A##gi, 6); \
+    XOReq256(A##ko, Do); \
+    ROL64in256(Bki, A##ko, 25); \
+    E##ka = XOR256(Bka, ANDnu256(Bke, Bki)); \
+    XOReq256(A##mu, Du); \
+    ROL64in256_8(Bko, A##mu); \
+    E##ke = XOR256(Bke, ANDnu256(Bki, Bko)); \
+    XOReq256(A##sa, Da); \
+    ROL64in256(Bku, A##sa, 18); \
+    E##ki = XOR256(Bki, ANDnu256(Bko, Bku)); \
+    E##ko = XOR256(Bko, ANDnu256(Bku, Bka)); \
+    E##ku = XOR256(Bku, ANDnu256(Bka, Bke)); \
+    \
+    XOReq256(A##bu, Du); \
+    ROL64in256(Bma, A##bu, 27); \
+    XOReq256(A##ga, Da); \
+    ROL64in256(Bme, A##ga, 36); \
+    XOReq256(A##ke, De); \
+    ROL64in256(Bmi, A##ke, 10); \
+    E##ma = XOR256(Bma, ANDnu256(Bme, Bmi)); \
+    XOReq256(A##mi, Di); \
+    ROL64in256(Bmo, A##mi, 15); \
+    E##me = XOR256(Bme, ANDnu256(Bmi, Bmo)); \
+    XOReq256(A##so, Do); \
+    ROL64in256_56(Bmu, A##so); \
+    E##mi = XOR256(Bmi, ANDnu256(Bmo, Bmu)); \
+    E##mo = XOR256(Bmo, ANDnu256(Bmu, Bma)); \
+    E##mu = XOR256(Bmu, ANDnu256(Bma, Bme)); \
+    \
+    XOReq256(A##bi, Di); \
+    ROL64in256(Bsa, A##bi, 62); \
+    XOReq256(A##go, Do); \
+    ROL64in256(Bse, A##go, 55); \
+    XOReq256(A##ku, Du); \
+    ROL64in256(Bsi, A##ku, 39); \
+    E##sa = XOR256(Bsa, ANDnu256(Bse, Bsi)); \
+    XOReq256(A##ma, Da); \
+    ROL64in256(Bso, A##ma, 41); \
+    E##se = XOR256(Bse, ANDnu256(Bsi, Bso)); \
+    XOReq256(A##se, De); \
+    ROL64in256(Bsu, A##se, 2); \
+    E##si = XOR256(Bsi, ANDnu256(Bso, Bsu)); \
+    E##so = XOR256(Bso, ANDnu256(Bsu, Bsa)); \
+    E##su = XOR256(Bsu, ANDnu256(Bsa, Bse));
 
 static ALIGN(KeccakP1600times4_statesAlignment) const UINT64 KeccakF1600RoundConstants[24] = {
     0x0000000000000001ULL,
@@ -353,75 +351,74 @@ static ALIGN(KeccakP1600times4_statesAlignment) const UINT64 KeccakF1600RoundCon
     0x8000000080008008ULL};
 
 #define copyFromState(X, state) \
-    X##ba = LOAD256(state[ 0]); \
-    X##be = LOAD256(state[ 1]); \
-    X##bi = LOAD256(state[ 2]); \
-    X##bo = LOAD256(state[ 3]); \
-    X##bu = LOAD256(state[ 4]); \
-    X##ga = LOAD256(state[ 5]); \
-    X##ge = LOAD256(state[ 6]); \
-    X##gi = LOAD256(state[ 7]); \
-    X##go = LOAD256(state[ 8]); \
-    X##gu = LOAD256(state[ 9]); \
-    X##ka = LOAD256(state[10]); \
-    X##ke = LOAD256(state[11]); \
-    X##ki = LOAD256(state[12]); \
-    X##ko = LOAD256(state[13]); \
-    X##ku = LOAD256(state[14]); \
-    X##ma = LOAD256(state[15]); \
-    X##me = LOAD256(state[16]); \
-    X##mi = LOAD256(state[17]); \
-    X##mo = LOAD256(state[18]); \
-    X##mu = LOAD256(state[19]); \
-    X##sa = LOAD256(state[20]); \
-    X##se = LOAD256(state[21]); \
-    X##si = LOAD256(state[22]); \
-    X##so = LOAD256(state[23]); \
-    X##su = LOAD256(state[24]); \
+    X##ba = LOAD256(state[0]); \
+    X##be = LOAD256(state[1]); \
+    X##bi = LOAD256(state[2]); \
+    X##bo = LOAD256(state[3]); \
+    X##bu = LOAD256(state[4]); \
+    X##ga = LOAD256(state[5]); \
+    X##ge = LOAD256(state[6]); \
+    X##gi = LOAD256(state[7]); \
+    X##go = LOAD256(state[8]); \
+    X##gu = LOAD256(state[9]); \
+    X##ka = LOAD256(state[10]); \
+    X##ke = LOAD256(state[11]); \
+    X##ki = LOAD256(state[12]); \
+    X##ko = LOAD256(state[13]); \
+    X##ku = LOAD256(state[14]); \
+    X##ma = LOAD256(state[15]); \
+    X##me = LOAD256(state[16]); \
+    X##mi = LOAD256(state[17]); \
+    X##mo = LOAD256(state[18]); \
+    X##mu = LOAD256(state[19]); \
+    X##sa = LOAD256(state[20]); \
+    X##se = LOAD256(state[21]); \
+    X##si = LOAD256(state[22]); \
+    X##so = LOAD256(state[23]); \
+    X##su = LOAD256(state[24]);
 
-#define copyToState(state, X) \
-    STORE256(state[ 0], X##ba); \
-    STORE256(state[ 1], X##be); \
-    STORE256(state[ 2], X##bi); \
-    STORE256(state[ 3], X##bo); \
-    STORE256(state[ 4], X##bu); \
-    STORE256(state[ 5], X##ga); \
-    STORE256(state[ 6], X##ge); \
-    STORE256(state[ 7], X##gi); \
-    STORE256(state[ 8], X##go); \
-    STORE256(state[ 9], X##gu); \
-    STORE256(state[10], X##ka); \
-    STORE256(state[11], X##ke); \
-    STORE256(state[12], X##ki); \
-    STORE256(state[13], X##ko); \
-    STORE256(state[14], X##ku); \
-    STORE256(state[15], X##ma); \
-    STORE256(state[16], X##me); \
-    STORE256(state[17], X##mi); \
-    STORE256(state[18], X##mo); \
-    STORE256(state[19], X##mu); \
-    STORE256(state[20], X##sa); \
-    STORE256(state[21], X##se); \
-    STORE256(state[22], X##si); \
-    STORE256(state[23], X##so); \
-    STORE256(state[24], X##su); \
+#define copyToState(state, X) \
+    STORE256(state[0], X##ba); \
+    STORE256(state[1], X##be); \
+    STORE256(state[2], X##bi); \
+    STORE256(state[3], X##bo); \
+    STORE256(state[4], X##bu); \
+    STORE256(state[5], X##ga); \
+    STORE256(state[6], X##ge); \
+    STORE256(state[7], X##gi); \
+    STORE256(state[8], X##go); \
+    STORE256(state[9], X##gu); \
+    STORE256(state[10], X##ka); \
+    STORE256(state[11], X##ke); \
+    STORE256(state[12], X##ki); \
+    STORE256(state[13], X##ko); \
+    STORE256(state[14], X##ku); \
+    STORE256(state[15], X##ma); \
+    STORE256(state[16], X##me); \
+    STORE256(state[17], X##mi); \
+    STORE256(state[18], X##mo); \
+    STORE256(state[19], X##mu); \
+    STORE256(state[20], X##sa); \
+    STORE256(state[21], X##se); \
+    STORE256(state[22], X##si); \
+    STORE256(state[23], X##so); \
+    STORE256(state[24], X##su);
 
-    #ifdef KeccakP1600times4_fullUnrolling
+#ifdef KeccakP1600times4_fullUnrolling
 #define FullUnrolling
 #else
 #define Unrolling KeccakP1600times4_unrolling
 #endif
 
 #include "KeccakP-1600-unrolling.macros"
 
-static void KeccakP1600times4_PermuteAll_24rounds(void *states)
-{
-    V256 *statesAsLanes = (V256 *)states;
-    declareABCDE
-    #ifndef KeccakP1600times4_fullUnrolling
-    unsigned int i;
-    #endif
+static void KeccakP1600times4_PermuteAll_24rounds(void *states) {
+    V256 *statesAsLanes = (V256 *) states;
+    declareABCDE
+#ifndef KeccakP1600times4_fullUnrolling
+    unsigned int i;
+#endif
 
-    copyFromState(A, statesAsLanes)
-    rounds24
-    copyToState(statesAsLanes, A)
+    copyFromState(A, statesAsLanes)
+    rounds24
+    copyToState(statesAsLanes, A)
 }
diff --git a/src/crypto/sha3/keccak4x/KeccakP-1600-times4-SnP.h b/src/crypto/sha3/keccak4x/KeccakP-1600-times4-SnP.h
old mode 100755
new mode 100644
index 10850749f..38a8f45ab
--- a/src/crypto/sha3/keccak4x/KeccakP-1600-times4-SnP.h
+++ b/src/crypto/sha3/keccak4x/KeccakP-1600-times4-SnP.h
@@ -23,9 +23,9 @@ http://creativecommons.org/publicdomain/zero/1.0/
 
 #include "SIMD256-config.h"
 
-#define KeccakP1600times4_implementation "256-bit SIMD implementation (" KeccakP1600times4_implementation_config ")"
-#define KeccakP1600times4_statesSizeInBytes 800
-#define KeccakP1600times4_statesAlignment 32
+#define KeccakP1600times4_implementation "256-bit SIMD implementation (" KeccakP1600times4_implementation_config ")"
implementation (" KeccakP1600times4_implementation_config ")" +#define KeccakP1600times4_statesSizeInBytes 800 +#define KeccakP1600times4_statesAlignment 32 #define KeccakF1600times4_FastLoop_supported #define KeccakP1600times4_12rounds_FastLoop_supported @@ -33,7 +33,7 @@ http://creativecommons.org/publicdomain/zero/1.0/ #define KeccakP1600times4_StaticInitialize() #define KeccakP1600times4_AddByte(states, instanceIndex, byte, offset) \ - ((unsigned char*)(states))[(instanceIndex)*8 + ((offset)/8)*4*8 + (offset)%8] ^= (byte) + ((unsigned char *) (states))[(instanceIndex) *8 + ((offset) / 8) * 4 * 8 + (offset) % 8] ^= (byte) static void KeccakP1600times4_PermuteAll_24rounds(void *states); #endif diff --git a/src/crypto/sha3/keccak4x/align.h b/src/crypto/sha3/keccak4x/align.h old mode 100755 new mode 100644 index e29771ed3..c066850e8 --- a/src/crypto/sha3/keccak4x/align.h +++ b/src/crypto/sha3/keccak4x/align.h @@ -22,7 +22,7 @@ http://creativecommons.org/publicdomain/zero/1.0/ #endif #if defined(__GNUC__) -#define ALIGN(x) __attribute__ ((aligned(x))) +#define ALIGN(x) __attribute__((aligned(x))) #elif defined(_MSC_VER) #define ALIGN(x) __declspec(align(x)) #elif defined(__ARMCC_VERSION) diff --git a/src/crypto/sha3/keccak4x/brg_endian.h b/src/crypto/sha3/keccak4x/brg_endian.h old mode 100755 new mode 100644 index 7226eb3be..c02f6ec7e --- a/src/crypto/sha3/keccak4x/brg_endian.h +++ b/src/crypto/sha3/keccak4x/brg_endian.h @@ -29,25 +29,25 @@ #ifndef _BRG_ENDIAN_H #define _BRG_ENDIAN_H -#define IS_BIG_ENDIAN 4321 /* byte 0 is most significant (mc68k) */ -#define IS_LITTLE_ENDIAN 1234 /* byte 0 is least significant (i386) */ +#define IS_BIG_ENDIAN 4321 /* byte 0 is most significant (mc68k) */ +#define IS_LITTLE_ENDIAN 1234 /* byte 0 is least significant (i386) */ #if 0 /* Include files where endian defines and byteswap functions may reside */ -#if defined( __sun ) -# include -#elif defined( __FreeBSD__ ) || defined( __OpenBSD__ ) || defined( __NetBSD__ ) -# include -#elif defined( BSD ) && ( BSD >= 199103 ) || defined( __APPLE__ ) || \ - defined( __CYGWIN32__ ) || defined( __DJGPP__ ) || defined( __osf__ ) -# include -#elif defined( __linux__ ) || defined( __GNUC__ ) || defined( __GNU_LIBRARY__ ) -# if !defined( __MINGW32__ ) && !defined( _AIX ) -# include -# if !defined( __BEOS__ ) -# include -# endif -# endif +#if defined(__sun) +#include +#elif defined(__FreeBSD__) || defined(__OpenBSD__) || defined(__NetBSD__) +#include +#elif defined(BSD) && (BSD >= 199103) || defined(__APPLE__) || \ + defined(__CYGWIN32__) || defined(__DJGPP__) || defined(__osf__) +#include +#elif defined(__linux__) || defined(__GNUC__) || defined(__GNU_LIBRARY__) +#if !defined(__MINGW32__) && !defined(_AIX) +#include +#if !defined(__BEOS__) +#include +#endif +#endif #endif #endif @@ -55,86 +55,86 @@ /* of the four forms SYMBOL, _SYMBOL, __SYMBOL & __SYMBOL__, which */ /* seem to encompass most endian symbol definitions */ -#if defined( BIG_ENDIAN ) && defined( LITTLE_ENDIAN ) -# if defined( BYTE_ORDER ) && BYTE_ORDER == BIG_ENDIAN -# define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN -# elif defined( BYTE_ORDER ) && BYTE_ORDER == LITTLE_ENDIAN -# define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN -# endif -#elif defined( BIG_ENDIAN ) -# define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN -#elif defined( LITTLE_ENDIAN ) -# define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN +#if defined(BIG_ENDIAN) && defined(LITTLE_ENDIAN) +#if defined(BYTE_ORDER) && BYTE_ORDER == BIG_ENDIAN +#define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN +#elif defined(BYTE_ORDER) && 
BYTE_ORDER == LITTLE_ENDIAN +#define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN +#endif +#elif defined(BIG_ENDIAN) +#define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN +#elif defined(LITTLE_ENDIAN) +#define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN #endif -#if defined( _BIG_ENDIAN ) && defined( _LITTLE_ENDIAN ) -# if defined( _BYTE_ORDER ) && _BYTE_ORDER == _BIG_ENDIAN -# define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN -# elif defined( _BYTE_ORDER ) && _BYTE_ORDER == _LITTLE_ENDIAN -# define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN -# endif -#elif defined( _BIG_ENDIAN ) -# define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN -#elif defined( _LITTLE_ENDIAN ) -# define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN +#if defined(_BIG_ENDIAN) && defined(_LITTLE_ENDIAN) +#if defined(_BYTE_ORDER) && _BYTE_ORDER == _BIG_ENDIAN +#define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN +#elif defined(_BYTE_ORDER) && _BYTE_ORDER == _LITTLE_ENDIAN +#define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN +#endif +#elif defined(_BIG_ENDIAN) +#define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN +#elif defined(_LITTLE_ENDIAN) +#define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN #endif -#if defined( __BIG_ENDIAN ) && defined( __LITTLE_ENDIAN ) -# if defined( __BYTE_ORDER ) && __BYTE_ORDER == __BIG_ENDIAN -# define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN -# elif defined( __BYTE_ORDER ) && __BYTE_ORDER == __LITTLE_ENDIAN -# define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN -# endif -#elif defined( __BIG_ENDIAN ) -# define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN -#elif defined( __LITTLE_ENDIAN ) -# define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN +#if defined(__BIG_ENDIAN) && defined(__LITTLE_ENDIAN) +#if defined(__BYTE_ORDER) && __BYTE_ORDER == __BIG_ENDIAN +#define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN +#elif defined(__BYTE_ORDER) && __BYTE_ORDER == __LITTLE_ENDIAN +#define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN +#endif +#elif defined(__BIG_ENDIAN) +#define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN +#elif defined(__LITTLE_ENDIAN) +#define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN #endif -#if defined( __BIG_ENDIAN__ ) && defined( __LITTLE_ENDIAN__ ) -# if defined( __BYTE_ORDER__ ) && __BYTE_ORDER__ == __BIG_ENDIAN__ -# define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN -# elif defined( __BYTE_ORDER__ ) && __BYTE_ORDER__ == __LITTLE_ENDIAN__ -# define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN -# endif -#elif defined( __BIG_ENDIAN__ ) -# define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN -#elif defined( __LITTLE_ENDIAN__ ) -# define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN +#if defined(__BIG_ENDIAN__) && defined(__LITTLE_ENDIAN__) +#if defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __BIG_ENDIAN__ +#define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN +#elif defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __LITTLE_ENDIAN__ +#define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN +#endif +#elif defined(__BIG_ENDIAN__) +#define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN +#elif defined(__LITTLE_ENDIAN__) +#define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN #endif /* if the platform byte order could not be determined, then try to */ /* set this define using common machine defines */ #if !defined(PLATFORM_BYTE_ORDER) -#if defined( __alpha__ ) || defined( __alpha ) || defined( i386 ) || \ - defined( __i386__ ) || defined( _M_I86 ) || defined( _M_IX86 ) || \ - defined( __OS2__ ) || defined( sun386 ) || defined( __TURBOC__ ) || \ - defined( vax ) || defined( vms ) || defined( VMS ) || \ - defined( __VMS ) || defined( _M_X64 ) -# define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN +#if defined(__alpha__) || defined(__alpha) || defined(i386) || \ + defined(__i386__) || defined(_M_I86) || defined(_M_IX86) || \ + defined(__OS2__) || 
defined(sun386) || defined(__TURBOC__) || \ + defined(vax) || defined(vms) || defined(VMS) || \ + defined(__VMS) || defined(_M_X64) +#define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN -#elif defined( AMIGA ) || defined( applec ) || defined( __AS400__ ) || \ - defined( _CRAY ) || defined( __hppa ) || defined( __hp9000 ) || \ - defined( ibm370 ) || defined( mc68000 ) || defined( m68k ) || \ - defined( __MRC__ ) || defined( __MVS__ ) || defined( __MWERKS__ ) || \ - defined( sparc ) || defined( __sparc) || defined( SYMANTEC_C ) || \ - defined( __VOS__ ) || defined( __TIGCC__ ) || defined( __TANDEM ) || \ - defined( THINK_C ) || defined( __VMCMS__ ) || defined( _AIX ) -# define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN +#elif defined(AMIGA) || defined(applec) || defined(__AS400__) || \ + defined(_CRAY) || defined(__hppa) || defined(__hp9000) || \ + defined(ibm370) || defined(mc68000) || defined(m68k) || \ + defined(__MRC__) || defined(__MVS__) || defined(__MWERKS__) || \ + defined(sparc) || defined(__sparc) || defined(SYMANTEC_C) || \ + defined(__VOS__) || defined(__TIGCC__) || defined(__TANDEM) || \ + defined(THINK_C) || defined(__VMCMS__) || defined(_AIX) +#define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN #elif defined(__arm__) -# ifdef __BIG_ENDIAN -# define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN -# else -# define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN -# endif -#elif 1 /* **** EDIT HERE IF NECESSARY **** */ -# define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN -#elif 0 /* **** EDIT HERE IF NECESSARY **** */ -# define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN +#ifdef __BIG_ENDIAN +#define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN #else -# error Please edit lines 132 or 134 in brg_endian.h to set the platform byte order +#define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN +#endif +#elif 1 /* **** EDIT HERE IF NECESSARY **** */ +#define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN +#elif 0 /* **** EDIT HERE IF NECESSARY **** */ +#define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN +#else +#error Please edit lines 132 or 134 in brg_endian.h to set the platform byte order #endif #endif diff --git a/src/crypto/sha3/sha3x4.c b/src/crypto/sha3/sha3x4.c old mode 100755 new mode 100644 index 82abc3abe..f3c2cc669 --- a/src/crypto/sha3/sha3x4.c +++ b/src/crypto/sha3/sha3x4.c @@ -5,166 +5,144 @@ #define SHAKE128_RATE 168 #define NROUNDS 24 -#define ROL(a, offset) ((a << offset) ^ (a >> (64-offset))) +#define ROL(a, offset) ((a << offset) ^ (a >> (64 - offset))) +static uint64_t load64(const unsigned char *x) { + unsigned long long r = 0, i; -static uint64_t load64(const unsigned char *x) -{ - unsigned long long r = 0, i; - - for (i = 0; i < 8; ++i) { - r |= (unsigned long long)x[i] << 8 * i; - } - return r; + for (i = 0; i < 8; ++i) { + r |= (unsigned long long) x[i] << 8 * i; + } + return r; } +static void store64(uint8_t *x, uint64_t u) { + unsigned int i; -static void store64(uint8_t *x, uint64_t u) -{ - unsigned int i; - - for (i = 0; i < 8; ++i) { - x[i] = u; - u >>= 8; - } + for (i = 0; i < 8; ++i) { + x[i] = u; + u >>= 8; + } } - /* Use implementation from the Keccak Code Package */ #include "keccak4x/KeccakP-1600-times4-SIMD256.c" #define KeccakF1600_StatePermute4x KeccakP1600times4_PermuteAll_24rounds +static void keccak_absorb4x(__m256i *s, unsigned int r, const unsigned char *m0, const unsigned char *m1, const unsigned char *m2, const unsigned char *m3, + unsigned long long int mlen, unsigned char p) { + unsigned long long i; + unsigned char t0[200]; + unsigned char t1[200]; + unsigned char t2[200]; + unsigned char t3[200]; + unsigned long long *ss = (unsigned long 
 
-static void keccak_absorb4x(__m256i *s, unsigned int r, const unsigned char *m0, const unsigned char *m1, const unsigned char *m2, const unsigned char *m3,
-                            unsigned long long int mlen, unsigned char p)
-{
-    unsigned long long i;
-    unsigned char t0[200];
-    unsigned char t1[200];
-    unsigned char t2[200];
-    unsigned char t3[200];
-    unsigned long long *ss = (unsigned long long *)s;
-
-    while (mlen >= r)
-    {
-        for (i = 0; i < r / 8; ++i)
-        {
-            ss[4*i+0] ^= load64(m0 + 8 * i);
-            ss[4*i+1] ^= load64(m1 + 8 * i);
-            ss[4*i+2] ^= load64(m2 + 8 * i);
-            ss[4*i+3] ^= load64(m3 + 8 * i);
-        }
-
-        KeccakF1600_StatePermute4x(s);
-        mlen -= r;
-        m0 += r;
-        m1 += r;
-        m2 += r;
-        m3 += r;
-    }
+    while (mlen >= r) {
+        for (i = 0; i < r / 8; ++i) {
+            ss[4 * i + 0] ^= load64(m0 + 8 * i);
+            ss[4 * i + 1] ^= load64(m1 + 8 * i);
+            ss[4 * i + 2] ^= load64(m2 + 8 * i);
+            ss[4 * i + 3] ^= load64(m3 + 8 * i);
+        }
 
-    for (i = 0; i < r; ++i)
-    {
-        t0[i] = 0;
-        t1[i] = 0;
-        t2[i] = 0;
-        t3[i] = 0;
-    }
-    for (i = 0; i < mlen; ++i)
-    {
-        t0[i] = m0[i];
-        t1[i] = m1[i];
-        t2[i] = m2[i];
-        t3[i] = m3[i];
-    }
+        KeccakF1600_StatePermute4x(s);
+        mlen -= r;
+        m0 += r;
+        m1 += r;
+        m2 += r;
+        m3 += r;
+    }
 
-    t0[i] = p;
-    t1[i] = p;
-    t2[i] = p;
-    t3[i] = p;
+    for (i = 0; i < r; ++i) {
+        t0[i] = 0;
+        t1[i] = 0;
+        t2[i] = 0;
+        t3[i] = 0;
+    }
+    for (i = 0; i < mlen; ++i) {
+        t0[i] = m0[i];
+        t1[i] = m1[i];
+        t2[i] = m2[i];
+        t3[i] = m3[i];
+    }
 
-    t0[r - 1] |= 128;
-    t1[r - 1] |= 128;
-    t2[r - 1] |= 128;
-    t3[r - 1] |= 128;
+    t0[i] = p;
+    t1[i] = p;
+    t2[i] = p;
+    t3[i] = p;
 
-    for (i = 0; i < r / 8; ++i)
-    {
-        ss[4*i+0] ^= load64(t0 + 8 * i);
-        ss[4*i+1] ^= load64(t1 + 8 * i);
-        ss[4*i+2] ^= load64(t2 + 8 * i);
-        ss[4*i+3] ^= load64(t3 + 8 * i);
-    }
+    t0[r - 1] |= 128;
+    t1[r - 1] |= 128;
+    t2[r - 1] |= 128;
+    t3[r - 1] |= 128;
+
+    for (i = 0; i < r / 8; ++i) {
+        ss[4 * i + 0] ^= load64(t0 + 8 * i);
+        ss[4 * i + 1] ^= load64(t1 + 8 * i);
+        ss[4 * i + 2] ^= load64(t2 + 8 * i);
+        ss[4 * i + 3] ^= load64(t3 + 8 * i);
+    }
 }
 
+static void keccak_squeezeblocks4x(unsigned char *h0, unsigned char *h1, unsigned char *h2, unsigned char *h3, unsigned long long int nblocks, __m256i *s, unsigned int r) {
+    unsigned int i;
+    unsigned long long *ss = (unsigned long long *) s;
 
-static void keccak_squeezeblocks4x(unsigned char *h0, unsigned char *h1, unsigned char *h2, unsigned char *h3, unsigned long long int nblocks, __m256i *s, unsigned int r)
-{
-    unsigned int i;
-    unsigned long long *ss = (unsigned long long *)s;
-
-    while (nblocks > 0)
-    {
-        KeccakF1600_StatePermute4x(s);
-        for (i = 0; i < (r>>3); i++)
-        {
-            store64(h0+8*i, ss[4*i+0]);
-            store64(h1+8*i, ss[4*i+1]);
-            store64(h2+8*i, ss[4*i+2]);
-            store64(h3+8*i, ss[4*i+3]);
-        }
-        h0 += r;
-        h1 += r;
-        h2 += r;
-        h3 += r;
-        nblocks--;
-    }
+    while (nblocks > 0) {
+        KeccakF1600_StatePermute4x(s);
+        for (i = 0; i < (r >> 3); i++) {
+            store64(h0 + 8 * i, ss[4 * i + 0]);
+            store64(h1 + 8 * i, ss[4 * i + 1]);
+            store64(h2 + 8 * i, ss[4 * i + 2]);
+            store64(h3 + 8 * i, ss[4 * i + 3]);
+        }
+        h0 += r;
+        h1 += r;
+        h2 += r;
+        h3 += r;
+        nblocks--;
+    }
 }
 
-
 /********** SHAKE128 ***********/
-static void shake128_absorb4x(__m256i *s, const unsigned char *in0, const unsigned char *in1, const unsigned char *in2, const unsigned char *in3, unsigned long long inlen)
-{
-    unsigned int i;
+static void shake128_absorb4x(__m256i *s, const unsigned char *in0, const unsigned char *in1, const unsigned char *in2, const unsigned char *in3, unsigned long long inlen) {
+    unsigned int i;
 
-    for (i = 0; i < 25; i++)
-        s[i] = _mm256_xor_si256(s[i], s[i]); // zero state
+    for (i = 0; i < 25; i++)
+        s[i] = _mm256_xor_si256(s[i], s[i]); // zero state
 
-    /* Absorb input */
-    keccak_absorb4x(s, SHAKE128_RATE, in0, in1, in2, in3, inlen, 0x1F);
+    /* Absorb input */
+    keccak_absorb4x(s, SHAKE128_RATE, in0, in1, in2, in3, inlen, 0x1F);
 }
 
-
 /* N is assumed to be empty; S is assumed to have at most 2 characters */
-void OQS_SHA3_shake128_4x(unsigned char *output0, unsigned char *output1, unsigned char *output2, unsigned char *output3, unsigned long long outlen,
-    const unsigned char *in0, const unsigned char *in1, const unsigned char *in2, const unsigned char *in3, unsigned long long inlen)
-{
-    __m256i s[25];
-    unsigned char t0[SHAKE128_RATE];
-    unsigned char t1[SHAKE128_RATE];
-    unsigned char t2[SHAKE128_RATE];
-    unsigned char t3[SHAKE128_RATE];
-    unsigned int i;
+void OQS_SHA3_shake128_4x(unsigned char *output0, unsigned char *output1, unsigned char *output2, unsigned char *output3, unsigned long long outlen,
+                          const unsigned char *in0, const unsigned char *in1, const unsigned char *in2, const unsigned char *in3, unsigned long long inlen) {
+    __m256i s[25];
+    unsigned char t0[SHAKE128_RATE];
+    unsigned char t1[SHAKE128_RATE];
+    unsigned char t2[SHAKE128_RATE];
+    unsigned char t3[SHAKE128_RATE];
+    unsigned int i;
 
-    shake128_absorb4x(s, in0, in1, in2, in3, inlen);
+    shake128_absorb4x(s, in0, in1, in2, in3, inlen);
 
-    /* Squeeze output */
-    keccak_squeezeblocks4x(output0, output1, output2, output3, outlen/SHAKE128_RATE, s, SHAKE128_RATE);
-    output0 += (outlen/SHAKE128_RATE)*SHAKE128_RATE;
-    output1 += (outlen/SHAKE128_RATE)*SHAKE128_RATE;
-    output2 += (outlen/SHAKE128_RATE)*SHAKE128_RATE;
-    output3 += (outlen/SHAKE128_RATE)*SHAKE128_RATE;
+    /* Squeeze output */
+    keccak_squeezeblocks4x(output0, output1, output2, output3, outlen / SHAKE128_RATE, s, SHAKE128_RATE);
+    output0 += (outlen / SHAKE128_RATE) * SHAKE128_RATE;
+    output1 += (outlen / SHAKE128_RATE) * SHAKE128_RATE;
+    output2 += (outlen / SHAKE128_RATE) * SHAKE128_RATE;
+    output3 += (outlen / SHAKE128_RATE) * SHAKE128_RATE;
 
-    if (outlen%SHAKE128_RATE)
-    {
-        keccak_squeezeblocks4x(t0, t1, t2, t3, 1, s, SHAKE128_RATE);
-        for (i = 0; i < outlen%SHAKE128_RATE; i++)
-        {
-            output0[i] = t0[i];
-            output1[i] = t1[i];
-            output2[i] = t2[i];
-            output3[i] = t3[i];
-        }
-    }
+    if (outlen % SHAKE128_RATE) {
+        keccak_squeezeblocks4x(t0, t1, t2, t3, 1, s, SHAKE128_RATE);
+        for (i = 0; i < outlen % SHAKE128_RATE; i++) {
+            output0[i] = t0[i];
+            output1[i] = t1[i];
+            output2[i] = t2[i];
+            output3[i] = t3[i];
+        }
+    }
 }
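
Note (not part of the patch): a minimal caller sketch for the 4-way SHAKE128 entry point touched above, useful as a smoke test that the reformatted code still behaves. The prototype is copied from sha3x4.c; everything else (the four "laneN" inputs, the 32-byte output length, the main() harness) is hypothetical. It assumes a build that compiles sha3x4.c for an AVX2-capable target and links it in. All four inputs share one length (inlen), and each output lane receives outlen bytes.

    #include <stdio.h>

    /* Prototype as defined in src/crypto/sha3/sha3x4.c. */
    void OQS_SHA3_shake128_4x(unsigned char *output0, unsigned char *output1,
                              unsigned char *output2, unsigned char *output3,
                              unsigned long long outlen,
                              const unsigned char *in0, const unsigned char *in1,
                              const unsigned char *in2, const unsigned char *in3,
                              unsigned long long inlen);

    int main(void) {
        /* Four same-length messages, hashed in one pass (one per SIMD lane). */
        const unsigned char *in[4] = {
            (const unsigned char *) "lane0", (const unsigned char *) "lane1",
            (const unsigned char *) "lane2", (const unsigned char *) "lane3"};
        unsigned char out[4][32];

        OQS_SHA3_shake128_4x(out[0], out[1], out[2], out[3], sizeof out[0],
                             in[0], in[1], in[2], in[3], 5);

        for (int lane = 0; lane < 4; lane++) {
            printf("lane %d: ", lane);
            for (int j = 0; j < 16; j++) { /* first 16 of 32 squeezed bytes */
                printf("%02x", out[lane][j]);
            }
            printf("\n");
        }
        return 0;
    }

Each lane's output should match a single-lane SHAKE128 of the same input, which is the usual way to sanity-check the interleaved state layout (laneIndex maps lane l of Keccak lane position p to 64-bit word p * 4 + l).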