Prettyprint

This commit is contained in:
Douglas Stebila 2019-07-08 21:10:49 -04:00
parent cdcedc6ff3
commit decb6b198c
5 changed files with 524 additions and 549 deletions

155
src/crypto/sha3/keccak4x/KeccakP-1600-times4-SIMD256.c Executable file → Normal file
View File

@ -36,47 +36,47 @@ typedef unsigned long long int UINT64;
typedef __m128i V128;
typedef __m256i V256;
#define laneIndex(instanceIndex, lanePosition) ((lanePosition)*4 + instanceIndex)
#define laneIndex(instanceIndex, lanePosition) ((lanePosition) *4 + instanceIndex)
#if defined(KeccakP1600times4_useAVX2)
#define ANDnu256(a, b) _mm256_andnot_si256(a, b)
#define CONST256(a) _mm256_load_si256((const V256 *)&(a))
#define CONST256_64(a) (V256)_mm256_broadcast_sd((const double*)(&a))
#define LOAD256(a) _mm256_load_si256((const V256 *)&(a))
#define LOAD256u(a) _mm256_loadu_si256((const V256 *)&(a))
#define LOAD4_64(a, b, c, d) _mm256_set_epi64x((UINT64)(a), (UINT64)(b), (UINT64)(c), (UINT64)(d))
#define ROL64in256(d, a, o) d = _mm256_or_si256(_mm256_slli_epi64(a, o), _mm256_srli_epi64(a, 64-(o)))
#define ROL64in256_8(d, a) d = _mm256_shuffle_epi8(a, CONST256(rho8))
#define ROL64in256_56(d, a) d = _mm256_shuffle_epi8(a, CONST256(rho56))
#define ANDnu256(a, b) _mm256_andnot_si256(a, b)
#define CONST256(a) _mm256_load_si256((const V256 *) &(a))
#define CONST256_64(a) (V256) _mm256_broadcast_sd((const double *) (&a))
#define LOAD256(a) _mm256_load_si256((const V256 *) &(a))
#define LOAD256u(a) _mm256_loadu_si256((const V256 *) &(a))
#define LOAD4_64(a, b, c, d) _mm256_set_epi64x((UINT64)(a), (UINT64)(b), (UINT64)(c), (UINT64)(d))
#define ROL64in256(d, a, o) d = _mm256_or_si256(_mm256_slli_epi64(a, o), _mm256_srli_epi64(a, 64 - (o)))
#define ROL64in256_8(d, a) d = _mm256_shuffle_epi8(a, CONST256(rho8))
#define ROL64in256_56(d, a) d = _mm256_shuffle_epi8(a, CONST256(rho56))
static const UINT64 rho8[4] = {0x0605040302010007, 0x0E0D0C0B0A09080F, 0x1615141312111017, 0x1E1D1C1B1A19181F};
static const UINT64 rho56[4] = {0x0007060504030201, 0x080F0E0D0C0B0A09, 0x1017161514131211, 0x181F1E1D1C1B1A19};
#define STORE256(a, b) _mm256_store_si256((V256 *)&(a), b)
#define STORE256u(a, b) _mm256_storeu_si256((V256 *)&(a), b)
#define STORE2_128(ah, al, v) _mm256_storeu2_m128d((V128*)&(ah), (V128*)&(al), v)
#define XOR256(a, b) _mm256_xor_si256(a, b)
#define XOReq256(a, b) a = _mm256_xor_si256(a, b)
#define UNPACKL( a, b ) _mm256_unpacklo_epi64((a), (b))
#define UNPACKH( a, b ) _mm256_unpackhi_epi64((a), (b))
#define PERM128( a, b, c ) (V256)_mm256_permute2f128_ps((__m256)(a), (__m256)(b), c)
#define SHUFFLE64( a, b, c ) (V256)_mm256_shuffle_pd((__m256d)(a), (__m256d)(b), c)
#define STORE256(a, b) _mm256_store_si256((V256 *) &(a), b)
#define STORE256u(a, b) _mm256_storeu_si256((V256 *) &(a), b)
#define STORE2_128(ah, al, v) _mm256_storeu2_m128d((V128 *) &(ah), (V128 *) &(al), v)
#define XOR256(a, b) _mm256_xor_si256(a, b)
#define XOReq256(a, b) a = _mm256_xor_si256(a, b)
#define UNPACKL(a, b) _mm256_unpacklo_epi64((a), (b))
#define UNPACKH(a, b) _mm256_unpackhi_epi64((a), (b))
#define PERM128(a, b, c) (V256) _mm256_permute2f128_ps((__m256)(a), (__m256)(b), c)
#define SHUFFLE64(a, b, c) (V256) _mm256_shuffle_pd((__m256d)(a), (__m256d)(b), c)
#define UNINTLEAVE() lanesL01 = UNPACKL( lanes0, lanes1 ), \
lanesH01 = UNPACKH( lanes0, lanes1 ), \
lanesL23 = UNPACKL( lanes2, lanes3 ), \
lanesH23 = UNPACKH( lanes2, lanes3 ), \
lanes0 = PERM128( lanesL01, lanesL23, 0x20 ), \
lanes2 = PERM128( lanesL01, lanesL23, 0x31 ), \
lanes1 = PERM128( lanesH01, lanesH23, 0x20 ), \
lanes3 = PERM128( lanesH01, lanesH23, 0x31 )
#define UNINTLEAVE() lanesL01 = UNPACKL(lanes0, lanes1), \
lanesH01 = UNPACKH(lanes0, lanes1), \
lanesL23 = UNPACKL(lanes2, lanes3), \
lanesH23 = UNPACKH(lanes2, lanes3), \
lanes0 = PERM128(lanesL01, lanesL23, 0x20), \
lanes2 = PERM128(lanesL01, lanesL23, 0x31), \
lanes1 = PERM128(lanesH01, lanesH23, 0x20), \
lanes3 = PERM128(lanesH01, lanesH23, 0x31)
#define INTLEAVE() lanesL01 = PERM128( lanes0, lanes2, 0x20 ), \
lanesH01 = PERM128( lanes1, lanes3, 0x20 ), \
lanesL23 = PERM128( lanes0, lanes2, 0x31 ), \
lanesH23 = PERM128( lanes1, lanes3, 0x31 ), \
lanes0 = SHUFFLE64( lanesL01, lanesH01, 0x00 ), \
lanes1 = SHUFFLE64( lanesL01, lanesH01, 0x0F ), \
lanes2 = SHUFFLE64( lanesL23, lanesH23, 0x00 ), \
lanes3 = SHUFFLE64( lanesL23, lanesH23, 0x0F )
#define INTLEAVE() lanesL01 = PERM128(lanes0, lanes2, 0x20), \
lanesH01 = PERM128(lanes1, lanes3, 0x20), \
lanesL23 = PERM128(lanes0, lanes2, 0x31), \
lanesH23 = PERM128(lanes1, lanes3, 0x31), \
lanes0 = SHUFFLE64(lanesL01, lanesH01, 0x00), \
lanes1 = SHUFFLE64(lanesL01, lanesH01, 0x0F), \
lanes2 = SHUFFLE64(lanesL23, lanesH23, 0x00), \
lanes3 = SHUFFLE64(lanesL23, lanesH23, 0x0F)
#endif
@ -100,14 +100,14 @@ static const UINT64 rho56[4] = {0x0007060504030201, 0x080F0E0D0C0B0A09, 0x101716
V256 Ega, Ege, Egi, Ego, Egu; \
V256 Eka, Eke, Eki, Eko, Eku; \
V256 Ema, Eme, Emi, Emo, Emu; \
V256 Esa, Ese, Esi, Eso, Esu; \
V256 Esa, Ese, Esi, Eso, Esu;
#define prepareTheta \
Ca = XOR256(Aba, XOR256(Aga, XOR256(Aka, XOR256(Ama, Asa)))); \
Ce = XOR256(Abe, XOR256(Age, XOR256(Ake, XOR256(Ame, Ase)))); \
Ci = XOR256(Abi, XOR256(Agi, XOR256(Aki, XOR256(Ami, Asi)))); \
Co = XOR256(Abo, XOR256(Ago, XOR256(Ako, XOR256(Amo, Aso)))); \
Cu = XOR256(Abu, XOR256(Agu, XOR256(Aku, XOR256(Amu, Asu)))); \
Cu = XOR256(Abu, XOR256(Agu, XOR256(Aku, XOR256(Amu, Asu))));
/* --- Theta Rho Pi Chi Iota Prepare-theta */
/* --- 64-bit lanes mapped to 64-bit words */
@ -122,7 +122,7 @@ static const UINT64 rho56[4] = {0x0007060504030201, 0x080F0E0D0C0B0A09, 0x101716
Do = XOR256(Ci, Cu1); \
ROL64in256(Ca1, Ca, 1); \
Du = XOR256(Co, Ca1); \
\
\
XOReq256(A##ba, Da); \
Bba = A##ba; \
XOReq256(A##ge, De); \
@ -144,7 +144,7 @@ static const UINT64 rho56[4] = {0x0007060504030201, 0x080F0E0D0C0B0A09, 0x101716
Co = E##bo; \
E##bu = XOR256(Bbu, ANDnu256(Bba, Bbe)); \
Cu = E##bu; \
\
\
XOReq256(A##bo, Do); \
ROL64in256(Bga, A##bo, 28); \
XOReq256(A##gu, Du); \
@ -165,7 +165,7 @@ static const UINT64 rho56[4] = {0x0007060504030201, 0x080F0E0D0C0B0A09, 0x101716
XOReq256(Co, E##go); \
E##gu = XOR256(Bgu, ANDnu256(Bga, Bge)); \
XOReq256(Cu, E##gu); \
\
\
XOReq256(A##be, De); \
ROL64in256(Bka, A##be, 1); \
XOReq256(A##gi, Di); \
@ -186,7 +186,7 @@ static const UINT64 rho56[4] = {0x0007060504030201, 0x080F0E0D0C0B0A09, 0x101716
XOReq256(Co, E##ko); \
E##ku = XOR256(Bku, ANDnu256(Bka, Bke)); \
XOReq256(Cu, E##ku); \
\
\
XOReq256(A##bu, Du); \
ROL64in256(Bma, A##bu, 27); \
XOReq256(A##ga, Da); \
@ -207,7 +207,7 @@ static const UINT64 rho56[4] = {0x0007060504030201, 0x080F0E0D0C0B0A09, 0x101716
XOReq256(Co, E##mo); \
E##mu = XOR256(Bmu, ANDnu256(Bma, Bme)); \
XOReq256(Cu, E##mu); \
\
\
XOReq256(A##bi, Di); \
ROL64in256(Bsa, A##bi, 62); \
XOReq256(A##go, Do); \
@ -227,8 +227,7 @@ static const UINT64 rho56[4] = {0x0007060504030201, 0x080F0E0D0C0B0A09, 0x101716
E##so = XOR256(Bso, ANDnu256(Bsu, Bsa)); \
XOReq256(Co, E##so); \
E##su = XOR256(Bsu, ANDnu256(Bsa, Bse)); \
XOReq256(Cu, E##su); \
\
XOReq256(Cu, E##su);
/* --- Theta Rho Pi Chi Iota */
/* --- 64-bit lanes mapped to 64-bit words */
@ -243,7 +242,7 @@ static const UINT64 rho56[4] = {0x0007060504030201, 0x080F0E0D0C0B0A09, 0x101716
Do = XOR256(Ci, Cu1); \
ROL64in256(Ca1, Ca, 1); \
Du = XOR256(Co, Ca1); \
\
\
XOReq256(A##ba, Da); \
Bba = A##ba; \
XOReq256(A##ge, De); \
@ -260,7 +259,7 @@ static const UINT64 rho56[4] = {0x0007060504030201, 0x080F0E0D0C0B0A09, 0x101716
E##bi = XOR256(Bbi, ANDnu256(Bbo, Bbu)); \
E##bo = XOR256(Bbo, ANDnu256(Bbu, Bba)); \
E##bu = XOR256(Bbu, ANDnu256(Bba, Bbe)); \
\
\
XOReq256(A##bo, Do); \
ROL64in256(Bga, A##bo, 28); \
XOReq256(A##gu, Du); \
@ -276,7 +275,7 @@ static const UINT64 rho56[4] = {0x0007060504030201, 0x080F0E0D0C0B0A09, 0x101716
E##gi = XOR256(Bgi, ANDnu256(Bgo, Bgu)); \
E##go = XOR256(Bgo, ANDnu256(Bgu, Bga)); \
E##gu = XOR256(Bgu, ANDnu256(Bga, Bge)); \
\
\
XOReq256(A##be, De); \
ROL64in256(Bka, A##be, 1); \
XOReq256(A##gi, Di); \
@ -292,7 +291,7 @@ static const UINT64 rho56[4] = {0x0007060504030201, 0x080F0E0D0C0B0A09, 0x101716
E##ki = XOR256(Bki, ANDnu256(Bko, Bku)); \
E##ko = XOR256(Bko, ANDnu256(Bku, Bka)); \
E##ku = XOR256(Bku, ANDnu256(Bka, Bke)); \
\
\
XOReq256(A##bu, Du); \
ROL64in256(Bma, A##bu, 27); \
XOReq256(A##ga, Da); \
@ -308,7 +307,7 @@ static const UINT64 rho56[4] = {0x0007060504030201, 0x080F0E0D0C0B0A09, 0x101716
E##mi = XOR256(Bmi, ANDnu256(Bmo, Bmu)); \
E##mo = XOR256(Bmo, ANDnu256(Bmu, Bma)); \
E##mu = XOR256(Bmu, ANDnu256(Bma, Bme)); \
\
\
XOReq256(A##bi, Di); \
ROL64in256(Bsa, A##bi, 62); \
XOReq256(A##go, Do); \
@ -323,8 +322,7 @@ static const UINT64 rho56[4] = {0x0007060504030201, 0x080F0E0D0C0B0A09, 0x101716
ROL64in256(Bsu, A##se, 2); \
E##si = XOR256(Bsi, ANDnu256(Bso, Bsu)); \
E##so = XOR256(Bso, ANDnu256(Bsu, Bsa)); \
E##su = XOR256(Bsu, ANDnu256(Bsa, Bse)); \
\
E##su = XOR256(Bsu, ANDnu256(Bsa, Bse));
static ALIGN(KeccakP1600times4_statesAlignment) const UINT64 KeccakF1600RoundConstants[24] = {
0x0000000000000001ULL,
@ -353,16 +351,16 @@ static ALIGN(KeccakP1600times4_statesAlignment) const UINT64 KeccakF1600RoundCon
0x8000000080008008ULL};
#define copyFromState(X, state) \
X##ba = LOAD256(state[ 0]); \
X##be = LOAD256(state[ 1]); \
X##bi = LOAD256(state[ 2]); \
X##bo = LOAD256(state[ 3]); \
X##bu = LOAD256(state[ 4]); \
X##ga = LOAD256(state[ 5]); \
X##ge = LOAD256(state[ 6]); \
X##gi = LOAD256(state[ 7]); \
X##go = LOAD256(state[ 8]); \
X##gu = LOAD256(state[ 9]); \
X##ba = LOAD256(state[0]); \
X##be = LOAD256(state[1]); \
X##bi = LOAD256(state[2]); \
X##bo = LOAD256(state[3]); \
X##bu = LOAD256(state[4]); \
X##ga = LOAD256(state[5]); \
X##ge = LOAD256(state[6]); \
X##gi = LOAD256(state[7]); \
X##go = LOAD256(state[8]); \
X##gu = LOAD256(state[9]); \
X##ka = LOAD256(state[10]); \
X##ke = LOAD256(state[11]); \
X##ki = LOAD256(state[12]); \
@ -377,19 +375,19 @@ static ALIGN(KeccakP1600times4_statesAlignment) const UINT64 KeccakF1600RoundCon
X##se = LOAD256(state[21]); \
X##si = LOAD256(state[22]); \
X##so = LOAD256(state[23]); \
X##su = LOAD256(state[24]); \
X##su = LOAD256(state[24]);
#define copyToState(state, X) \
STORE256(state[ 0], X##ba); \
STORE256(state[ 1], X##be); \
STORE256(state[ 2], X##bi); \
STORE256(state[ 3], X##bo); \
STORE256(state[ 4], X##bu); \
STORE256(state[ 5], X##ga); \
STORE256(state[ 6], X##ge); \
STORE256(state[ 7], X##gi); \
STORE256(state[ 8], X##go); \
STORE256(state[ 9], X##gu); \
STORE256(state[0], X##ba); \
STORE256(state[1], X##be); \
STORE256(state[2], X##bi); \
STORE256(state[3], X##bo); \
STORE256(state[4], X##bu); \
STORE256(state[5], X##ga); \
STORE256(state[6], X##ge); \
STORE256(state[7], X##gi); \
STORE256(state[8], X##go); \
STORE256(state[9], X##gu); \
STORE256(state[10], X##ka); \
STORE256(state[11], X##ke); \
STORE256(state[12], X##ki); \
@ -404,22 +402,21 @@ static ALIGN(KeccakP1600times4_statesAlignment) const UINT64 KeccakF1600RoundCon
STORE256(state[21], X##se); \
STORE256(state[22], X##si); \
STORE256(state[23], X##so); \
STORE256(state[24], X##su); \
STORE256(state[24], X##su);
#ifdef KeccakP1600times4_fullUnrolling
#ifdef KeccakP1600times4_fullUnrolling
#define FullUnrolling
#else
#define Unrolling KeccakP1600times4_unrolling
#endif
#include "KeccakP-1600-unrolling.macros"
static void KeccakP1600times4_PermuteAll_24rounds(void *states)
{
V256 *statesAsLanes = (V256 *)states;
static void KeccakP1600times4_PermuteAll_24rounds(void *states) {
V256 *statesAsLanes = (V256 *) states;
declareABCDE
#ifndef KeccakP1600times4_fullUnrolling
#ifndef KeccakP1600times4_fullUnrolling
unsigned int i;
#endif
#endif
copyFromState(A, statesAsLanes)
rounds24

2
src/crypto/sha3/keccak4x/KeccakP-1600-times4-SnP.h Executable file → Normal file
View File

@ -33,7 +33,7 @@ http://creativecommons.org/publicdomain/zero/1.0/
#define KeccakP1600times4_StaticInitialize()
#define KeccakP1600times4_AddByte(states, instanceIndex, byte, offset) \
((unsigned char*)(states))[(instanceIndex)*8 + ((offset)/8)*4*8 + (offset)%8] ^= (byte)
((unsigned char *) (states))[(instanceIndex) *8 + ((offset) / 8) * 4 * 8 + (offset) % 8] ^= (byte)
static void KeccakP1600times4_PermuteAll_24rounds(void *states);
#endif

2
src/crypto/sha3/keccak4x/align.h Executable file → Normal file
View File

@ -22,7 +22,7 @@ http://creativecommons.org/publicdomain/zero/1.0/
#endif
#if defined(__GNUC__)
#define ALIGN(x) __attribute__ ((aligned(x)))
#define ALIGN(x) __attribute__((aligned(x)))
#elif defined(_MSC_VER)
#define ALIGN(x) __declspec(align(x))
#elif defined(__ARMCC_VERSION)

156
src/crypto/sha3/keccak4x/brg_endian.h Executable file → Normal file
View File

@ -34,20 +34,20 @@
#if 0
/* Include files where endian defines and byteswap functions may reside */
#if defined( __sun )
# include <sys/isa_defs.h>
#elif defined( __FreeBSD__ ) || defined( __OpenBSD__ ) || defined( __NetBSD__ )
# include <sys/endian.h>
#elif defined( BSD ) && ( BSD >= 199103 ) || defined( __APPLE__ ) || \
defined( __CYGWIN32__ ) || defined( __DJGPP__ ) || defined( __osf__ )
# include <machine/endian.h>
#elif defined( __linux__ ) || defined( __GNUC__ ) || defined( __GNU_LIBRARY__ )
# if !defined( __MINGW32__ ) && !defined( _AIX )
# include <endian.h>
# if !defined( __BEOS__ )
# include <byteswap.h>
# endif
# endif
#if defined(__sun)
#include <sys/isa_defs.h>
#elif defined(__FreeBSD__) || defined(__OpenBSD__) || defined(__NetBSD__)
#include <sys/endian.h>
#elif defined(BSD) && (BSD >= 199103) || defined(__APPLE__) || \
defined(__CYGWIN32__) || defined(__DJGPP__) || defined(__osf__)
#include <machine/endian.h>
#elif defined(__linux__) || defined(__GNUC__) || defined(__GNU_LIBRARY__)
#if !defined(__MINGW32__) && !defined(_AIX)
#include <endian.h>
#if !defined(__BEOS__)
#include <byteswap.h>
#endif
#endif
#endif
#endif
@ -55,86 +55,86 @@
/* of the four forms SYMBOL, _SYMBOL, __SYMBOL & __SYMBOL__, which */
/* seem to encompass most endian symbol definitions */
#if defined( BIG_ENDIAN ) && defined( LITTLE_ENDIAN )
# if defined( BYTE_ORDER ) && BYTE_ORDER == BIG_ENDIAN
# define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN
# elif defined( BYTE_ORDER ) && BYTE_ORDER == LITTLE_ENDIAN
# define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN
# endif
#elif defined( BIG_ENDIAN )
# define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN
#elif defined( LITTLE_ENDIAN )
# define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN
#if defined(BIG_ENDIAN) && defined(LITTLE_ENDIAN)
#if defined(BYTE_ORDER) && BYTE_ORDER == BIG_ENDIAN
#define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN
#elif defined(BYTE_ORDER) && BYTE_ORDER == LITTLE_ENDIAN
#define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN
#endif
#elif defined(BIG_ENDIAN)
#define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN
#elif defined(LITTLE_ENDIAN)
#define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN
#endif
#if defined( _BIG_ENDIAN ) && defined( _LITTLE_ENDIAN )
# if defined( _BYTE_ORDER ) && _BYTE_ORDER == _BIG_ENDIAN
# define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN
# elif defined( _BYTE_ORDER ) && _BYTE_ORDER == _LITTLE_ENDIAN
# define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN
# endif
#elif defined( _BIG_ENDIAN )
# define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN
#elif defined( _LITTLE_ENDIAN )
# define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN
#if defined(_BIG_ENDIAN) && defined(_LITTLE_ENDIAN)
#if defined(_BYTE_ORDER) && _BYTE_ORDER == _BIG_ENDIAN
#define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN
#elif defined(_BYTE_ORDER) && _BYTE_ORDER == _LITTLE_ENDIAN
#define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN
#endif
#elif defined(_BIG_ENDIAN)
#define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN
#elif defined(_LITTLE_ENDIAN)
#define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN
#endif
#if defined( __BIG_ENDIAN ) && defined( __LITTLE_ENDIAN )
# if defined( __BYTE_ORDER ) && __BYTE_ORDER == __BIG_ENDIAN
# define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN
# elif defined( __BYTE_ORDER ) && __BYTE_ORDER == __LITTLE_ENDIAN
# define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN
# endif
#elif defined( __BIG_ENDIAN )
# define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN
#elif defined( __LITTLE_ENDIAN )
# define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN
#if defined(__BIG_ENDIAN) && defined(__LITTLE_ENDIAN)
#if defined(__BYTE_ORDER) && __BYTE_ORDER == __BIG_ENDIAN
#define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN
#elif defined(__BYTE_ORDER) && __BYTE_ORDER == __LITTLE_ENDIAN
#define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN
#endif
#elif defined(__BIG_ENDIAN)
#define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN
#elif defined(__LITTLE_ENDIAN)
#define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN
#endif
#if defined( __BIG_ENDIAN__ ) && defined( __LITTLE_ENDIAN__ )
# if defined( __BYTE_ORDER__ ) && __BYTE_ORDER__ == __BIG_ENDIAN__
# define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN
# elif defined( __BYTE_ORDER__ ) && __BYTE_ORDER__ == __LITTLE_ENDIAN__
# define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN
# endif
#elif defined( __BIG_ENDIAN__ )
# define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN
#elif defined( __LITTLE_ENDIAN__ )
# define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN
#if defined(__BIG_ENDIAN__) && defined(__LITTLE_ENDIAN__)
#if defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __BIG_ENDIAN__
#define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN
#elif defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __LITTLE_ENDIAN__
#define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN
#endif
#elif defined(__BIG_ENDIAN__)
#define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN
#elif defined(__LITTLE_ENDIAN__)
#define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN
#endif
/* if the platform byte order could not be determined, then try to */
/* set this define using common machine defines */
#if !defined(PLATFORM_BYTE_ORDER)
#if defined( __alpha__ ) || defined( __alpha ) || defined( i386 ) || \
defined( __i386__ ) || defined( _M_I86 ) || defined( _M_IX86 ) || \
defined( __OS2__ ) || defined( sun386 ) || defined( __TURBOC__ ) || \
defined( vax ) || defined( vms ) || defined( VMS ) || \
defined( __VMS ) || defined( _M_X64 )
# define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN
#if defined(__alpha__) || defined(__alpha) || defined(i386) || \
defined(__i386__) || defined(_M_I86) || defined(_M_IX86) || \
defined(__OS2__) || defined(sun386) || defined(__TURBOC__) || \
defined(vax) || defined(vms) || defined(VMS) || \
defined(__VMS) || defined(_M_X64)
#define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN
#elif defined( AMIGA ) || defined( applec ) || defined( __AS400__ ) || \
defined( _CRAY ) || defined( __hppa ) || defined( __hp9000 ) || \
defined( ibm370 ) || defined( mc68000 ) || defined( m68k ) || \
defined( __MRC__ ) || defined( __MVS__ ) || defined( __MWERKS__ ) || \
defined( sparc ) || defined( __sparc) || defined( SYMANTEC_C ) || \
defined( __VOS__ ) || defined( __TIGCC__ ) || defined( __TANDEM ) || \
defined( THINK_C ) || defined( __VMCMS__ ) || defined( _AIX )
# define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN
#elif defined(AMIGA) || defined(applec) || defined(__AS400__) || \
defined(_CRAY) || defined(__hppa) || defined(__hp9000) || \
defined(ibm370) || defined(mc68000) || defined(m68k) || \
defined(__MRC__) || defined(__MVS__) || defined(__MWERKS__) || \
defined(sparc) || defined(__sparc) || defined(SYMANTEC_C) || \
defined(__VOS__) || defined(__TIGCC__) || defined(__TANDEM) || \
defined(THINK_C) || defined(__VMCMS__) || defined(_AIX)
#define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN
#elif defined(__arm__)
# ifdef __BIG_ENDIAN
# define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN
# else
# define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN
# endif
#elif 1 /* **** EDIT HERE IF NECESSARY **** */
# define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN
#elif 0 /* **** EDIT HERE IF NECESSARY **** */
# define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN
#ifdef __BIG_ENDIAN
#define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN
#else
# error Please edit lines 132 or 134 in brg_endian.h to set the platform byte order
#define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN
#endif
#elif 1 /* **** EDIT HERE IF NECESSARY **** */
#define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN
#elif 0 /* **** EDIT HERE IF NECESSARY **** */
#define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN
#else
#error Please edit lines 132 or 134 in brg_endian.h to set the platform byte order
#endif
#endif

94
src/crypto/sha3/sha3x4.c Executable file → Normal file
View File

@ -5,22 +5,18 @@
#define SHAKE128_RATE 168
#define NROUNDS 24
#define ROL(a, offset) ((a << offset) ^ (a >> (64-offset)))
#define ROL(a, offset) ((a << offset) ^ (a >> (64 - offset)))
static uint64_t load64(const unsigned char *x)
{
static uint64_t load64(const unsigned char *x) {
unsigned long long r = 0, i;
for (i = 0; i < 8; ++i) {
r |= (unsigned long long)x[i] << 8 * i;
r |= (unsigned long long) x[i] << 8 * i;
}
return r;
}
static void store64(uint8_t *x, uint64_t u)
{
static void store64(uint8_t *x, uint64_t u) {
unsigned int i;
for (i = 0; i < 8; ++i) {
@ -29,30 +25,25 @@ static void store64(uint8_t *x, uint64_t u)
}
}
/* Use implementation from the Keccak Code Package */
#include "keccak4x/KeccakP-1600-times4-SIMD256.c"
#define KeccakF1600_StatePermute4x KeccakP1600times4_PermuteAll_24rounds
static void keccak_absorb4x(__m256i *s, unsigned int r, const unsigned char *m0, const unsigned char *m1, const unsigned char *m2, const unsigned char *m3,
unsigned long long int mlen, unsigned char p)
{
unsigned long long int mlen, unsigned char p) {
unsigned long long i;
unsigned char t0[200];
unsigned char t1[200];
unsigned char t2[200];
unsigned char t3[200];
unsigned long long *ss = (unsigned long long *)s;
unsigned long long *ss = (unsigned long long *) s;
while (mlen >= r)
{
for (i = 0; i < r / 8; ++i)
{
ss[4*i+0] ^= load64(m0 + 8 * i);
ss[4*i+1] ^= load64(m1 + 8 * i);
ss[4*i+2] ^= load64(m2 + 8 * i);
ss[4*i+3] ^= load64(m3 + 8 * i);
while (mlen >= r) {
for (i = 0; i < r / 8; ++i) {
ss[4 * i + 0] ^= load64(m0 + 8 * i);
ss[4 * i + 1] ^= load64(m1 + 8 * i);
ss[4 * i + 2] ^= load64(m2 + 8 * i);
ss[4 * i + 3] ^= load64(m3 + 8 * i);
}
KeccakF1600_StatePermute4x(s);
@ -63,15 +54,13 @@ static void keccak_absorb4x(__m256i *s, unsigned int r, const unsigned char *m0,
m3 += r;
}
for (i = 0; i < r; ++i)
{
for (i = 0; i < r; ++i) {
t0[i] = 0;
t1[i] = 0;
t2[i] = 0;
t3[i] = 0;
}
for (i = 0; i < mlen; ++i)
{
for (i = 0; i < mlen; ++i) {
t0[i] = m0[i];
t1[i] = m1[i];
t2[i] = m2[i];
@ -88,30 +77,25 @@ static void keccak_absorb4x(__m256i *s, unsigned int r, const unsigned char *m0,
t2[r - 1] |= 128;
t3[r - 1] |= 128;
for (i = 0; i < r / 8; ++i)
{
ss[4*i+0] ^= load64(t0 + 8 * i);
ss[4*i+1] ^= load64(t1 + 8 * i);
ss[4*i+2] ^= load64(t2 + 8 * i);
ss[4*i+3] ^= load64(t3 + 8 * i);
for (i = 0; i < r / 8; ++i) {
ss[4 * i + 0] ^= load64(t0 + 8 * i);
ss[4 * i + 1] ^= load64(t1 + 8 * i);
ss[4 * i + 2] ^= load64(t2 + 8 * i);
ss[4 * i + 3] ^= load64(t3 + 8 * i);
}
}
static void keccak_squeezeblocks4x(unsigned char *h0, unsigned char *h1, unsigned char *h2, unsigned char *h3, unsigned long long int nblocks, __m256i *s, unsigned int r)
{
static void keccak_squeezeblocks4x(unsigned char *h0, unsigned char *h1, unsigned char *h2, unsigned char *h3, unsigned long long int nblocks, __m256i *s, unsigned int r) {
unsigned int i;
unsigned long long *ss = (unsigned long long *)s;
unsigned long long *ss = (unsigned long long *) s;
while (nblocks > 0)
{
while (nblocks > 0) {
KeccakF1600_StatePermute4x(s);
for (i = 0; i < (r>>3); i++)
{
store64(h0+8*i, ss[4*i+0]);
store64(h1+8*i, ss[4*i+1]);
store64(h2+8*i, ss[4*i+2]);
store64(h3+8*i, ss[4*i+3]);
for (i = 0; i < (r >> 3); i++) {
store64(h0 + 8 * i, ss[4 * i + 0]);
store64(h1 + 8 * i, ss[4 * i + 1]);
store64(h2 + 8 * i, ss[4 * i + 2]);
store64(h3 + 8 * i, ss[4 * i + 3]);
}
h0 += r;
h1 += r;
@ -121,11 +105,9 @@ static void keccak_squeezeblocks4x(unsigned char *h0, unsigned char *h1, unsigne
}
}
/********** SHAKE128 ***********/
static void shake128_absorb4x(__m256i *s, const unsigned char *in0, const unsigned char *in1, const unsigned char *in2, const unsigned char *in3, unsigned long long inlen)
{
static void shake128_absorb4x(__m256i *s, const unsigned char *in0, const unsigned char *in1, const unsigned char *in2, const unsigned char *in3, unsigned long long inlen) {
unsigned int i;
for (i = 0; i < 25; i++)
@ -135,11 +117,9 @@ static void shake128_absorb4x(__m256i *s, const unsigned char *in0, const unsign
keccak_absorb4x(s, SHAKE128_RATE, in0, in1, in2, in3, inlen, 0x1F);
}
/* N is assumed to be empty; S is assumed to have at most 2 characters */
void OQS_SHA3_shake128_4x(unsigned char *output0, unsigned char *output1, unsigned char *output2, unsigned char *output3, unsigned long long outlen,
const unsigned char *in0, const unsigned char *in1, const unsigned char *in2, const unsigned char *in3, unsigned long long inlen)
{
const unsigned char *in0, const unsigned char *in1, const unsigned char *in2, const unsigned char *in3, unsigned long long inlen) {
__m256i s[25];
unsigned char t0[SHAKE128_RATE];
unsigned char t1[SHAKE128_RATE];
@ -150,17 +130,15 @@ void OQS_SHA3_shake128_4x(unsigned char *output0, unsigned char *output1, unsign
shake128_absorb4x(s, in0, in1, in2, in3, inlen);
/* Squeeze output */
keccak_squeezeblocks4x(output0, output1, output2, output3, outlen/SHAKE128_RATE, s, SHAKE128_RATE);
output0 += (outlen/SHAKE128_RATE)*SHAKE128_RATE;
output1 += (outlen/SHAKE128_RATE)*SHAKE128_RATE;
output2 += (outlen/SHAKE128_RATE)*SHAKE128_RATE;
output3 += (outlen/SHAKE128_RATE)*SHAKE128_RATE;
keccak_squeezeblocks4x(output0, output1, output2, output3, outlen / SHAKE128_RATE, s, SHAKE128_RATE);
output0 += (outlen / SHAKE128_RATE) * SHAKE128_RATE;
output1 += (outlen / SHAKE128_RATE) * SHAKE128_RATE;
output2 += (outlen / SHAKE128_RATE) * SHAKE128_RATE;
output3 += (outlen / SHAKE128_RATE) * SHAKE128_RATE;
if (outlen%SHAKE128_RATE)
{
if (outlen % SHAKE128_RATE) {
keccak_squeezeblocks4x(t0, t1, t2, t3, 1, s, SHAKE128_RATE);
for (i = 0; i < outlen%SHAKE128_RATE; i++)
{
for (i = 0; i < outlen % SHAKE128_RATE; i++) {
output0[i] = t0[i];
output1[i] = t1[i];
output2[i] = t2[i];