diff --git a/.CMake/alg_support.cmake b/.CMake/alg_support.cmake index 24ba70e52..99fd8243f 100644 --- a/.CMake/alg_support.cmake +++ b/.CMake/alg_support.cmake @@ -21,6 +21,12 @@ cmake_dependent_option(OQS_USE_SHA2_OPENSSL "" ON "OQS_USE_OPENSSL" OFF) # enough to support our incremental API. cmake_dependent_option(OQS_USE_SHA3_OPENSSL "" OFF "OQS_USE_OPENSSL" OFF) +if(CMAKE_SYSTEM_NAME MATCHES "Linux|Darwin") +if(OQS_DIST_X86_64_BUILD OR OQS_USE_AVX2_INSTRUCTIONS) + cmake_dependent_option(OQS_ENABLE_SHA3_xkcp_low_avx2 "" ON "NOT OQS_USE_SHA3_OPENSSL" OFF) +endif() +endif() + # BIKE is not supported on Windows cmake_dependent_option(OQS_ENABLE_KEM_BIKE "" ON "NOT WIN32" OFF) cmake_dependent_option(OQS_ENABLE_KEM_bike1_l1_cpa "" ON "OQS_ENABLE_KEM_BIKE" OFF) diff --git a/scripts/copy_from_xkcp/CMakeLists.txt b/scripts/copy_from_xkcp/CMakeLists.txt new file mode 100644 index 000000000..06125e0a9 --- /dev/null +++ b/scripts/copy_from_xkcp/CMakeLists.txt @@ -0,0 +1,51 @@ +# SPDX-License-Identifier: MIT + +set(_XKCP_LOW_OBJS "") + +# Determine which of the implementations we're building +if(OQS_DIST_X86_64_BUILD) + set(BUILD_PLAIN64 ON) + set(BUILD_AVX2 ON) +elseif(OQS_USE_AVX2_INSTRUCTIONS) + set(BUILD_AVX2 ON) +else() + set(BUILD_PLAIN64 ON) +endif() + +if(BUILD_PLAIN64) + add_library(xkcp_low_keccakp_1600_plain64 OBJECT KeccakP-1600/plain-64bits/KeccakP-1600-opt64.c) + + add_library(xkcp_low_keccakp_1600times4_serial OBJECT KeccakP-1600times4/serial/KeccakP-1600-times4-on1.c) + target_include_directories(xkcp_low_keccakp_1600times4_serial PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/KeccakP-1600/plain-64bits) + + if(OQS_DIST_X86_64_BUILD) + target_compile_definitions(xkcp_low_keccakp_1600_plain64 PRIVATE ADD_SYMBOL_SUFFIX) + target_compile_definitions(xkcp_low_keccakp_1600times4_serial PRIVATE ADD_SYMBOL_SUFFIX) + endif() + + set(_XKCP_LOW_OBJS ${_XKCP_LOW_OBJS} $ + $) +endif() + +# We currently do not have a SHA3 AVX2 implementation that works on Windows +if(BUILD_AVX2 AND CMAKE_SYSTEM_NAME MATCHES "Linux|Darwin") + add_library(xkcp_low_keccakp_1600_avx2 OBJECT KeccakP-1600/avx2/KeccakP-1600-AVX2.S) + + add_library(xkcp_low_keccakp_1600times4_avx2 OBJECT KeccakP-1600times4/avx2/KeccakP-1600-times4-SIMD256.c) + target_include_directories(xkcp_low_keccakp_1600times4_avx2 PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/KeccakP-1600/avx2) + target_compile_options(xkcp_low_keccakp_1600times4_avx2 PRIVATE -mavx2) + + if(CMAKE_SYSTEM_NAME STREQUAL "Darwin") + target_compile_options(xkcp_low_keccakp_1600_avx2 PRIVATE -Wa,-defsym,macOS=1) + endif() + + if(OQS_DIST_X86_64_BUILD) + target_compile_definitions(xkcp_low_keccakp_1600_avx2 PRIVATE ADD_SYMBOL_SUFFIX) + target_compile_definitions(xkcp_low_keccakp_1600times4_avx2 PRIVATE ADD_SYMBOL_SUFFIX) + endif() + + set(_XKCP_LOW_OBJS ${_XKCP_LOW_OBJS} $ + $) +endif() + +set(XKCP_LOW_OBJS ${_XKCP_LOW_OBJS} PARENT_SCOPE) diff --git a/scripts/copy_from_xkcp/package.sh b/scripts/copy_from_xkcp/package.sh index 5a868bcc3..8fff06e31 100755 --- a/scripts/copy_from_xkcp/package.sh +++ b/scripts/copy_from_xkcp/package.sh @@ -58,8 +58,8 @@ cp -Lp "${BUILD_UPSTREAM}/lib/common/brg_endian.h" \ OUT="${BUILD_XKCP}/KeccakP-1600/avx2" mkdir -p "${OUT}" +cp -Lp "${BUILD_UPSTREAM}/lib/low/KeccakP-1600/AVX2/KeccakP-1600-AVX2.s" "${OUT}/KeccakP-1600-AVX2.S" cp -Lp "${BUILD_UPSTREAM}/lib/common/align.h" \ - "${BUILD_UPSTREAM}/lib/low/KeccakP-1600/AVX2/KeccakP-1600-AVX2.s" \ "${BUILD_UPSTREAM}/lib/low/KeccakP-1600/AVX2/KeccakP-1600-SnP.h" \ "${OUT}" @@ -92,39 +92,7 @@ unifdef -k -m \ -DKeccakP1600times4_useAVX2 \ {} \; -cat << EOF > "${BUILD_XKCP}/CMakeLists.txt" -# SPDX-License-Identifier: MIT - -set(_XKCP_LOW_OBJS "") -set(_XKCP_LOW_INCLUDE_DIRS "") - -if(OQS_USE_AVX2_INSTRUCTIONS AND NOT OQS_DIST_BUILD) - add_library(xkcp_low_keccakp_1600_avx2 OBJECT KeccakP-1600/avx2/KeccakP-1600-AVX2.s) - target_compile_options(xkcp_low_keccakp_1600_avx2 PRIVATE -mavx2) - target_include_directories(xkcp_low_keccakp_1600_avx2 PRIVATE \${CMAKE_CURRENT_SOURCE_DIR}/KeccakP-1600/avx2) - set(_XKCP_LOW_OBJS \${_XKCP_LOW_OBJS} \$) - set(_XKCP_LOW_INCLUDE_DIRS \${_XKCP_LOW_INCLUDE_DIRS} \${CMAKE_CURRENT_SOURCE_DIR}/KeccakP-1600/avx2) - - add_library(xkcp_low_keccakp_1600times4_avx2 OBJECT KeccakP-1600times4/avx2/KeccakP-1600-times4-SIMD256.c) - target_compile_options(xkcp_low_keccakp_1600times4_avx2 PRIVATE -mavx2) - target_include_directories(xkcp_low_keccakp_1600times4_avx2 PRIVATE \${_XKCP_LOW_INCLUDE_DIRS} \${CMAKE_CURRENT_SOURCE_DIR}/KeccakP-1600times4/avx2) - set(_XKCP_LOW_OBJS \${_XKCP_LOW_OBJS} \$) - set(_XKCP_LOW_INCLUDE_DIRS \${_XKCP_LOW_INCLUDE_DIRS} \${CMAKE_CURRENT_SOURCE_DIR}/KeccakP-1600times4/avx2) -else() - add_library(xkcp_low_keccakp_1600_plain64 OBJECT KeccakP-1600/plain-64bits/KeccakP-1600-opt64.c) - target_include_directories(xkcp_low_keccakp_1600_plain64 PRIVATE \${CMAKE_CURRENT_SOURCE_DIR}/KeccakP-1600/plain-64bits) - set(_XKCP_LOW_OBJS \${_XKCP_LOW_OBJS} \$) - set(_XKCP_LOW_INCLUDE_DIRS \${_XKCP_LOW_INCLUDE_DIRS} \${CMAKE_CURRENT_SOURCE_DIR}/KeccakP-1600/plain-64bits) - - add_library(xkcp_low_keccakp_1600times4_plain64 OBJECT KeccakP-1600times4/serial/KeccakP-1600-times4-on1.c) - target_include_directories(xkcp_low_keccakp_1600times4_plain64 PRIVATE \${_XKCP_LOW_INCLUDE_DIRS} \${CMAKE_CURRENT_SOURCE_DIR}/KeccakP-1600times4/serial) - set(_XKCP_LOW_OBJS \${_XKCP_LOW_OBJS} \$) - set(_XKCP_LOW_INCLUDE_DIRS \${_XKCP_LOW_INCLUDE_DIRS} \${CMAKE_CURRENT_SOURCE_DIR}/KeccakP-1600times4/serial) -endif() - -set(XKCP_LOW_OBJS \${_XKCP_LOW_OBJS} PARENT_SCOPE) -set(XKCP_LOW_INCLUDE_DIRS \${_XKCP_LOW_INCLUDE_DIRS} PARENT_SCOPE) -EOF +cp "${BASE}/CMakeLists.txt" "${BUILD_XKCP}/CMakeLists.txt" # Apply liboqs formatting astyle \ diff --git a/scripts/copy_from_xkcp/patches/lib_low_KeccakP-1600-times4_AVX2_KeccakP-1600-times4-SIMD256.c b/scripts/copy_from_xkcp/patches/lib_low_KeccakP-1600-times4_AVX2_KeccakP-1600-times4-SIMD256.c index afd588944..0245b0a1d 100644 --- a/scripts/copy_from_xkcp/patches/lib_low_KeccakP-1600-times4_AVX2_KeccakP-1600-times4-SIMD256.c +++ b/scripts/copy_from_xkcp/patches/lib_low_KeccakP-1600-times4_AVX2_KeccakP-1600-times4-SIMD256.c @@ -23,7 +23,7 @@ #define STORE256(a, b) _mm256_store_si256((V256 *)&(a), b) #define STORE256u(a, b) _mm256_storeu_si256((V256 *)&(a), b) #define STORE2_128(ah, al, v) _mm256_storeu2_m128i(&(ah), &(al), v) -@@ -87,6 +86,23 @@ +@@ -87,9 +86,30 @@ #define SnP_laneLengthInBytes 8 @@ -46,8 +46,16 @@ + void KeccakP1600times4_InitializeAll(void *states) { - memset(states, 0, KeccakP1600times4_statesSizeInBytes); -@@ -97,14 +113,16 @@ +- memset(states, 0, KeccakP1600times4_statesSizeInBytes); ++ memset(states, 0, KeccakP1600times4_statesSizeInBytes_avx2); ++} ++ ++void KeccakP1600times4_AddByte(void *states, unsigned int instanceIndex, unsigned char byte, unsigned int offset) { ++ ((unsigned char*)states)[instanceIndex*8 + (offset/8)*4*8 + offset%8] ^= byte; + } + + void KeccakP1600times4_AddBytes(void *states, unsigned int instanceIndex, const unsigned char *data, unsigned int offset, unsigned int length) +@@ -97,14 +117,16 @@ unsigned int sizeLeft = length; unsigned int lanePosition = offset/SnP_laneLengthInBytes; unsigned int offsetInLane = offset%SnP_laneLengthInBytes; @@ -66,7 +74,7 @@ memcpy((unsigned char*)&lane + offsetInLane, curData, bytesInLane); statesAsLanes[laneIndex(instanceIndex, lanePosition)] ^= lane; sizeLeft -= bytesInLane; -@@ -113,7 +131,7 @@ +@@ -113,7 +135,7 @@ } while(sizeLeft >= SnP_laneLengthInBytes) { @@ -75,7 +83,7 @@ statesAsLanes[laneIndex(instanceIndex, lanePosition)] ^= lane; sizeLeft -= SnP_laneLengthInBytes; lanePosition++; -@@ -121,7 +139,7 @@ +@@ -121,7 +143,7 @@ } if (sizeLeft > 0) { @@ -84,7 +92,7 @@ memcpy(&lane, curData, sizeLeft); statesAsLanes[laneIndex(instanceIndex, lanePosition)] ^= lane; } -@@ -131,18 +149,21 @@ +@@ -131,18 +153,21 @@ { V256 *stateAsLanes = (V256 *)states; unsigned int i; @@ -116,7 +124,7 @@ INTLEAVE(),\ XOReq256( stateAsLanes[argIndex+0], lanes0 ),\ XOReq256( stateAsLanes[argIndex+1], lanes1 ),\ -@@ -191,7 +212,7 @@ +@@ -191,7 +216,7 @@ } while(sizeLeft >= SnP_laneLengthInBytes) { @@ -125,7 +133,7 @@ statesAsLanes[laneIndex(instanceIndex, lanePosition)] = lane; sizeLeft -= SnP_laneLengthInBytes; lanePosition++; -@@ -207,23 +228,26 @@ +@@ -207,23 +232,26 @@ { V256 *stateAsLanes = (V256 *)states; unsigned int i; @@ -167,7 +175,7 @@ if ( laneCount >= 16 ) { OverWr4( 0 ); -@@ -277,14 +301,14 @@ +@@ -277,14 +305,14 @@ unsigned int bytesInLane = SnP_laneLengthInBytes - offsetInLane; if (bytesInLane > sizeLeft) bytesInLane = sizeLeft; @@ -184,7 +192,7 @@ sizeLeft -= SnP_laneLengthInBytes; lanePosition++; curData += SnP_laneLengthInBytes; -@@ -297,20 +321,20 @@ +@@ -297,20 +325,20 @@ void KeccakP1600times4_ExtractLanesAll(const void *states, unsigned char *data, unsigned int laneCount, unsigned int laneOffset) { @@ -213,7 +221,7 @@ #define Extr4( argIndex ) lanes0 = LOAD256( stateAsLanes[argIndex+0] ), \ lanes1 = LOAD256( stateAsLanes[argIndex+1] ), \ -@@ -368,7 +392,7 @@ +@@ -368,7 +396,7 @@ } while(sizeLeft >= SnP_laneLengthInBytes) { @@ -222,7 +230,7 @@ sizeLeft -= SnP_laneLengthInBytes; lanePosition++; curInput += SnP_laneLengthInBytes; -@@ -386,25 +410,24 @@ +@@ -386,25 +414,24 @@ void KeccakP1600times4_ExtractAndAddLanesAll(const void *states, const unsigned char *input, unsigned char *output, unsigned int laneCount, unsigned int laneOffset) { @@ -260,7 +268,16 @@ #define ExtrXor4( argIndex ) \ lanes0 = LOAD256( stateAsLanes[argIndex+0] ),\ -@@ -900,19 +923,17 @@ +@@ -692,7 +719,7 @@ + E##su = XOR256(Bsu, ANDnu256(Bsa, Bse)); \ + \ + +-static ALIGN(KeccakP1600times4_statesAlignment) const uint64_t KeccakF1600RoundConstants[24] = { ++static ALIGN(KeccakP1600times4_statesAlignment_avx2) const uint64_t KeccakF1600RoundConstants[24] = { + 0x0000000000000001ULL, + 0x0000000000008082ULL, + 0x800000000000808aULL, +@@ -900,19 +927,17 @@ } return (const unsigned char *)curData0 - dataStart; #else @@ -285,7 +302,7 @@ XOR_In( Aba, 0 ); XOR_In( Abe, 1 ); XOR_In( Abi, 2 ); -@@ -943,11 +964,10 @@ +@@ -943,11 +968,10 @@ dataByteLen -= laneOffsetSerial*8; } copyToState(statesAsLanes, A) @@ -298,7 +315,7 @@ const unsigned char *dataStart = data; while(dataByteLen >= (laneOffsetParallel*3 + laneCount)*8) { -@@ -956,7 +976,7 @@ +@@ -956,7 +980,7 @@ data += laneOffsetSerial*8; dataByteLen -= laneOffsetSerial*8; } @@ -307,7 +324,7 @@ } } -@@ -1002,19 +1022,17 @@ +@@ -1002,19 +1026,17 @@ } return (const unsigned char *)curData0 - dataStart; #else @@ -332,7 +349,7 @@ XOR_In( Aba, 0 ); XOR_In( Abe, 1 ); XOR_In( Abi, 2 ); -@@ -1045,11 +1063,10 @@ +@@ -1045,11 +1067,10 @@ dataByteLen -= laneOffsetSerial*8; } copyToState(statesAsLanes, A) @@ -345,7 +362,7 @@ const unsigned char *dataStart = data; while(dataByteLen >= (laneOffsetParallel*3 + laneCount)*8) { -@@ -1058,10 +1075,11 @@ +@@ -1058,10 +1079,11 @@ data += laneOffsetSerial*8; dataByteLen -= laneOffsetSerial*8; } @@ -358,7 +375,7 @@ /* ------------------------------------------------------------------------- */ #define UNINTLEAVEa(lanes0, lanes1, lanes2, lanes3) \ -@@ -1319,3 +1337,4 @@ +@@ -1319,3 +1341,4 @@ #undef Kravatte_Roll #undef UNINTLEAVEa #undef INTLEAVEa diff --git a/scripts/copy_from_xkcp/patches/lib_low_KeccakP-1600-times4_AVX2_KeccakP-1600-times4-SnP.h b/scripts/copy_from_xkcp/patches/lib_low_KeccakP-1600-times4_AVX2_KeccakP-1600-times4-SnP.h index 8d32537e1..7bc62808e 100644 --- a/scripts/copy_from_xkcp/patches/lib_low_KeccakP-1600-times4_AVX2_KeccakP-1600-times4-SnP.h +++ b/scripts/copy_from_xkcp/patches/lib_low_KeccakP-1600-times4_AVX2_KeccakP-1600-times4-SnP.h @@ -1,13 +1,93 @@ --- upstream/lib/low/KeccakP-1600-times4/AVX2/KeccakP-1600-times4-SnP.h +++ upstream-patched/lib/low/KeccakP-1600-times4/AVX2/KeccakP-1600-times4-SnP.h -@@ -49,7 +49,9 @@ - void KeccakP1600times4_ExtractAndAddLanesAll(const void *states, const unsigned char *input, unsigned char *output, unsigned int laneCount, unsigned int laneOffset); - size_t KeccakF1600times4_FastLoop_Absorb(void *states, unsigned int laneCount, unsigned int laneOffsetParallel, unsigned int laneOffsetSerial, const unsigned char *data, size_t dataByteLen); - size_t KeccakP1600times4_12rounds_FastLoop_Absorb(void *states, unsigned int laneCount, unsigned int laneOffsetParallel, unsigned int laneOffsetSerial, const unsigned char *data, size_t dataByteLen); -+#if 0 - size_t KeccakP1600times4_KravatteCompress(uint64_t *xAccu, uint64_t *kRoll, const unsigned char *input, size_t inputByteLen); - size_t KeccakP1600times4_KravatteExpand(uint64_t *yAccu, const uint64_t *kRoll, unsigned char *output, size_t outputByteLen); +@@ -21,35 +21,78 @@ + #include + #include "SIMD256-config.h" + +-#define KeccakP1600times4_implementation "256-bit SIMD implementation (" KeccakP1600times4_implementation_config ")" +-#define KeccakP1600times4_statesSizeInBytes 800 +-#define KeccakP1600times4_statesAlignment 32 +-#define KeccakF1600times4_FastLoop_supported +-#define KeccakP1600times4_12rounds_FastLoop_supported +-#define KeccakF1600times4_FastKravatte_supported +- + #include + ++#define KeccakP1600times4_implementation_avx2 "256-bit SIMD implementation (" KeccakP1600times4_implementation_config ")" ++#define KeccakP1600times4_statesSizeInBytes_avx2 800 ++#define KeccakP1600times4_statesAlignment_avx2 32 ++#define KeccakF1600times4_FastLoop_supported_avx2 ++#define KeccakP1600times4_12rounds_FastLoop_supported_avx2 ++#define KeccakF1600times4_FastKravatte_supported_avx2 ++ ++#if defined(ADD_SYMBOL_SUFFIX) ++#define KECCAKTIMES4_IMPL_NAMESPACE(x) x##_avx2 ++#else ++#define KECCAKTIMES4_IMPL_NAMESPACE(x) x ++#define KeccakP1600times4_implementation KeccakP1600times4_implementation_avx2 ++#define KeccakP1600times4_statesSizeInBytes KeccakP1600times4_statesSizeInBytes_avx2 ++#define KeccakP1600times4_statesAlignment KeccakP1600times4_statesAlignment_avx2 ++#define KeccakF1600times4_FastLoop_supported KeccakF1600times4_FastLoop_supported_avx2 ++#define KeccakP1600times4_12rounds_FastLoop_supported KeccakP1600times4_12rounds_FastLoop_supported_avx2 ++#define KeccakF1600times4_FastKravatte_supported KeccakF1600times4_FastKravatte_supported_avx2 +#endif ++ + #define KeccakP1600times4_StaticInitialize() ++ ++#define KeccakP1600times4_InitializeAll KECCAKTIMES4_IMPL_NAMESPACE(KeccakP1600times4_InitializeAll) + void KeccakP1600times4_InitializeAll(void *states); +-#define KeccakP1600times4_AddByte(states, instanceIndex, byte, offset) \ +- ((unsigned char*)(states))[(instanceIndex)*8 + ((offset)/8)*4*8 + (offset)%8] ^= (byte) ++ ++#define KeccakP1600times4_AddByte KECCAKTIMES4_IMPL_NAMESPACE(KeccakP1600times4_AddByte) ++void KeccakP1600times4_AddByte(void *states, unsigned int instanceIndex, unsigned char byte, unsigned int offset); ++ ++#define KeccakP1600times4_AddBytes KECCAKTIMES4_IMPL_NAMESPACE(KeccakP1600times4_AddBytes) + void KeccakP1600times4_AddBytes(void *states, unsigned int instanceIndex, const unsigned char *data, unsigned int offset, unsigned int length); ++ ++#define KeccakP1600times4_AddLanesAll KECCAKTIMES4_IMPL_NAMESPACE(KeccakP1600times4_AddLanesAll) + void KeccakP1600times4_AddLanesAll(void *states, const unsigned char *data, unsigned int laneCount, unsigned int laneOffset); ++ ++#define KeccakP1600times4_OverwriteBytes KECCAKTIMES4_IMPL_NAMESPACE(KeccakP1600times4_OverwriteBytes) + void KeccakP1600times4_OverwriteBytes(void *states, unsigned int instanceIndex, const unsigned char *data, unsigned int offset, unsigned int length); ++ ++#define KeccakP1600times4_OverwriteLanesAll KECCAKTIMES4_IMPL_NAMESPACE(KeccakP1600times4_OverwriteLanesAll) + void KeccakP1600times4_OverwriteLanesAll(void *states, const unsigned char *data, unsigned int laneCount, unsigned int laneOffset); ++ ++#define KeccakP1600times4_OverwriteWithZeroes KECCAKTIMES4_IMPL_NAMESPACE(KeccakP1600times4_OverwriteWithZeroes) + void KeccakP1600times4_OverwriteWithZeroes(void *states, unsigned int instanceIndex, unsigned int byteCount); ++ ++#define KeccakP1600times4_PermuteAll_4rounds KECCAKTIMES4_IMPL_NAMESPACE(KeccakP1600times4_PermuteAll_4rounds) + void KeccakP1600times4_PermuteAll_4rounds(void *states); ++ ++#define KeccakP1600times4_PermuteAll_6rounds KECCAKTIMES4_IMPL_NAMESPACE(KeccakP1600times4_PermuteAll_6rounds) + void KeccakP1600times4_PermuteAll_6rounds(void *states); ++ ++#define KeccakP1600times4_PermuteAll_12rounds KECCAKTIMES4_IMPL_NAMESPACE(KeccakP1600times4_PermuteAll_12rounds) + void KeccakP1600times4_PermuteAll_12rounds(void *states); ++ ++#define KeccakP1600times4_PermuteAll_24rounds KECCAKTIMES4_IMPL_NAMESPACE(KeccakP1600times4_PermuteAll_24rounds) + void KeccakP1600times4_PermuteAll_24rounds(void *states); ++ ++#define KeccakP1600times4_ExtractBytes KECCAKTIMES4_IMPL_NAMESPACE(KeccakP1600times4_ExtractBytes) + void KeccakP1600times4_ExtractBytes(const void *states, unsigned int instanceIndex, unsigned char *data, unsigned int offset, unsigned int length); ++ ++#define KeccakP1600times4_ExtractLanesAll KECCAKTIMES4_IMPL_NAMESPACE(KeccakP1600times4_ExtractLanesAll) + void KeccakP1600times4_ExtractLanesAll(const void *states, unsigned char *data, unsigned int laneCount, unsigned int laneOffset); ++ ++#define KeccakP1600times4_ExtractAndAddBytes KECCAKTIMES4_IMPL_NAMESPACE(KeccakP1600times4_ExtractAndAddBytes) + void KeccakP1600times4_ExtractAndAddBytes(const void *states, unsigned int instanceIndex, const unsigned char *input, unsigned char *output, unsigned int offset, unsigned int length); ++ ++#define KeccakP1600times4_ExtractAndAddLanesAll KECCAKTIMES4_IMPL_NAMESPACE(KeccakP1600times4_ExtractAndAddLanesAll) + void KeccakP1600times4_ExtractAndAddLanesAll(const void *states, const unsigned char *input, unsigned char *output, unsigned int laneCount, unsigned int laneOffset); ++ ++#define KeccakF1600times4_FastLoop_Absorb KECCAKTIMES4_IMPL_NAMESPACE(KeccakF1600times4_FastLoop_Absorb) + size_t KeccakF1600times4_FastLoop_Absorb(void *states, unsigned int laneCount, unsigned int laneOffsetParallel, unsigned int laneOffsetSerial, const unsigned char *data, size_t dataByteLen); ++ ++#define KeccakP1600times4_12rounds_FastLoop_Absorb KECCAKTIMES4_IMPL_NAMESPACE(KeccakP1600times4_12rounds_FastLoop_Absorb) + size_t KeccakP1600times4_12rounds_FastLoop_Absorb(void *states, unsigned int laneCount, unsigned int laneOffsetParallel, unsigned int laneOffsetSerial, const unsigned char *data, size_t dataByteLen); +-size_t KeccakP1600times4_KravatteCompress(uint64_t *xAccu, uint64_t *kRoll, const unsigned char *input, size_t inputByteLen); +-size_t KeccakP1600times4_KravatteExpand(uint64_t *yAccu, const uint64_t *kRoll, unsigned char *output, size_t outputByteLen); #endif diff --git a/scripts/copy_from_xkcp/patches/lib_low_KeccakP-1600-times4_fallback-on1_KeccakP-1600-times4-SnP.h b/scripts/copy_from_xkcp/patches/lib_low_KeccakP-1600-times4_fallback-on1_KeccakP-1600-times4-SnP.h new file mode 100644 index 000000000..029b8e2ab --- /dev/null +++ b/scripts/copy_from_xkcp/patches/lib_low_KeccakP-1600-times4_fallback-on1_KeccakP-1600-times4-SnP.h @@ -0,0 +1,69 @@ +--- upstream/lib/low/KeccakP-1600-times4/fallback-on1/KeccakP-1600-times4-SnP.h ++++ upstream-patched/lib/low/KeccakP-1600-times4/fallback-on1/KeccakP-1600-times4-SnP.h +@@ -20,26 +20,63 @@ + + #include "KeccakP-1600-SnP.h" + ++#if defined(ADD_SYMBOL_SUFFIX) ++#define KECCAKTIMES4_NAMESPACE(x) KeccakP1600times4_##x##_serial ++#else ++#define KECCAKTIMES4_NAMESPACE(x) KeccakP1600times4_##x ++#endif ++ + #define KeccakP1600times4_implementation "fallback on serial implementation (" KeccakP1600_implementation ")" +-#define KeccakP1600times4_statesSizeInBytes (((KeccakP1600_stateSizeInBytes+(KeccakP1600_stateAlignment-1))/KeccakP1600_stateAlignment)*KeccakP1600_stateAlignment*4) +-#define KeccakP1600times4_statesAlignment KeccakP1600_stateAlignment + #define KeccakP1600times4_isFallback ++#define KeccakP1600times4_statesAlignment KeccakP1600_stateAlignment ++#define KeccakP1600times4_statesSizeInBytes (((KeccakP1600_stateSizeInBytes+(KeccakP1600_stateAlignment-1))/KeccakP1600_stateAlignment)*KeccakP1600_stateAlignment*4) + ++#define KeccakP1600times4_StaticInitialize KECCAKTIMES4_NAMESPACE(KeccakP1600times4_StaticInitialize) + void KeccakP1600times4_StaticInitialize( void ); ++ ++#define KeccakP1600times4_InitializeAll KECCAKTIMES4_NAMESPACE(KeccakP1600times4_InitializeAll) + void KeccakP1600times4_InitializeAll(void *states); ++ ++#define KeccakP1600times4_AddByte KECCAKTIMES4_NAMESPACE(KeccakP1600times4_AddByte) + void KeccakP1600times4_AddByte(void *states, unsigned int instanceIndex, unsigned char data, unsigned int offset); ++ ++#define KeccakP1600times4_AddBytes KECCAKTIMES4_NAMESPACE(KeccakP1600times4_AddBytes) + void KeccakP1600times4_AddBytes(void *states, unsigned int instanceIndex, const unsigned char *data, unsigned int offset, unsigned int length); ++ ++#define KeccakP1600times4_AddLanesAll KECCAKTIMES4_NAMESPACE(KeccakP1600times4_AddLanesAll) + void KeccakP1600times4_AddLanesAll(void *states, const unsigned char *data, unsigned int laneCount, unsigned int laneOffset); ++ ++#define KeccakP1600times4_OverwriteBytes KECCAKTIMES4_NAMESPACE(KeccakP1600times4_OverwriteBytes) + void KeccakP1600times4_OverwriteBytes(void *states, unsigned int instanceIndex, const unsigned char *data, unsigned int offset, unsigned int length); ++ ++#define KeccakP1600times4_OverwriteLanesAll KECCAKTIMES4_NAMESPACE(KeccakP1600times4_OverwriteLanesAll) + void KeccakP1600times4_OverwriteLanesAll(void *states, const unsigned char *data, unsigned int laneCount, unsigned int laneOffset); ++ ++#define KeccakP1600times4_OverwriteWithZeroes KECCAKTIMES4_NAMESPACE(KeccakP1600times4_OverwriteWithZeroes) + void KeccakP1600times4_OverwriteWithZeroes(void *states, unsigned int instanceIndex, unsigned int byteCount); ++ ++#define KeccakP1600times4_PermuteAll_4rounds KECCAKTIMES4_NAMESPACE(KeccakP1600times4_PermuteAll_4rounds) + void KeccakP1600times4_PermuteAll_4rounds(void *states); ++ ++#define KeccakP1600times4_PermuteAll_6rounds KECCAKTIMES4_NAMESPACE(KeccakP1600times4_PermuteAll_6rounds) + void KeccakP1600times4_PermuteAll_6rounds(void *states); ++ ++#define KeccakP1600times4_PermuteAll_12rounds KECCAKTIMES4_NAMESPACE(KeccakP1600times4_PermuteAll_12rounds) + void KeccakP1600times4_PermuteAll_12rounds(void *states); ++ ++#define KeccakP1600times4_PermuteAll_24rounds KECCAKTIMES4_NAMESPACE(KeccakP1600times4_PermuteAll_24rounds) + void KeccakP1600times4_PermuteAll_24rounds(void *states); ++ ++#define KeccakP1600times4_ExtractBytes KECCAKTIMES4_NAMESPACE(KeccakP1600times4_ExtractBytes) + void KeccakP1600times4_ExtractBytes(const void *states, unsigned int instanceIndex, unsigned char *data, unsigned int offset, unsigned int length); ++ ++#define KeccakP1600times4_ExtractLanesAll KECCAKTIMES4_NAMESPACE(KeccakP1600times4_ExtractLanesAll) + void KeccakP1600times4_ExtractLanesAll(const void *states, unsigned char *data, unsigned int laneCount, unsigned int laneOffset); ++ ++#define KeccakP1600times4_ExtractAndAddBytes KECCAKTIMES4_NAMESPACE(KeccakP1600times4_ExtractAndAddBytes) + void KeccakP1600times4_ExtractAndAddBytes(const void *states, unsigned int instanceIndex, const unsigned char *input, unsigned char *output, unsigned int offset, unsigned int length); ++ ++#define KeccakP1600times4_ExtractAndAddLanesAll KECCAKTIMES4_NAMESPACE(KeccakP1600times4_ExtractAndAddLanesAll) + void KeccakP1600times4_ExtractAndAddLanesAll(const void *states, const unsigned char *input, unsigned char *output, unsigned int laneCount, unsigned int laneOffset); + + #endif + diff --git a/scripts/copy_from_xkcp/patches/lib_low_KeccakP-1600-times4_fallback-on1_KeccakP-1600-times4-on1.c b/scripts/copy_from_xkcp/patches/lib_low_KeccakP-1600-times4_fallback-on1_KeccakP-1600-times4-on1.c new file mode 100644 index 000000000..bdd7d45a0 --- /dev/null +++ b/scripts/copy_from_xkcp/patches/lib_low_KeccakP-1600-times4_fallback-on1_KeccakP-1600-times4-on1.c @@ -0,0 +1,40 @@ +--- upstream/lib/low/KeccakP-1600-times4/fallback-on1/KeccakP-1600-times4-on1.c ++++ upstream-patched/lib/low/KeccakP-1600-times4/fallback-on1/KeccakP-1600-times4-on1.c +@@ -22,10 +22,28 @@ + #include "KeccakP-1600-SnP.h" + + #define prefix KeccakP1600times4 ++#define suffix serial + #define PlSnP_baseParallelism 1 + #define PlSnP_targetParallelism 4 + #define SnP_laneLengthInBytes 8 +-#define SnP KeccakP1600 ++#define SnP_prefix KeccakP1600 ++#define SnP_suffix KECCAK_SYMBOL_SUFFIX ++ ++#if defined(ADD_SYMBOL_SUFFIX) ++#define JOIN0(a,b,c) a##_##b##_##c ++#define JOIN(a,b) JOIN0(a,b,c) ++ ++#define SnP_Permute JOIN(SnP_prefix, Permute_24rounds, SnP_suffix) ++#define SnP_Permute_12rounds JOIN(SnP_prefix, Permute_12rounds, SnP_suffix) ++#define SnP_Permute_Nrounds JOIN(SnP_prefix, Permute_Nrounds, SnP_suffix) ++#define PlSnP_PermuteAll JOIN(prefix, PermuteAll_24rounds, suffix) ++#define PlSnP_PermuteAll_12rounds JOIN(prefix, PermuteAll_12rounds, suffix) ++#define PlSnP_PermuteAll_6rounds JOIN(prefix, PermuteAll_6rounds, suffix) ++#define PlSnP_PermuteAll_4rounds JOIN(prefix, PermuteAll_4rounds, suffix) ++ ++#undef JOIN0 ++#undef JOIN ++#else + #define SnP_Permute KeccakP1600_Permute_24rounds + #define SnP_Permute_12rounds KeccakP1600_Permute_12rounds + #define SnP_Permute_Nrounds KeccakP1600_Permute_Nrounds +@@ -33,5 +51,6 @@ + #define PlSnP_PermuteAll_12rounds KeccakP1600times4_PermuteAll_12rounds + #define PlSnP_PermuteAll_6rounds KeccakP1600times4_PermuteAll_6rounds + #define PlSnP_PermuteAll_4rounds KeccakP1600times4_PermuteAll_4rounds ++#endif + + #include "PlSnP-Fallback.inc" + diff --git a/scripts/copy_from_xkcp/patches/lib_low_KeccakP-1600_AVX2_KeccakP-1600-AVX2.s b/scripts/copy_from_xkcp/patches/lib_low_KeccakP-1600_AVX2_KeccakP-1600-AVX2.s new file mode 100644 index 000000000..62387961d --- /dev/null +++ b/scripts/copy_from_xkcp/patches/lib_low_KeccakP-1600_AVX2_KeccakP-1600-AVX2.s @@ -0,0 +1,38 @@ +--- upstream/lib/low/KeccakP-1600/AVX2/KeccakP-1600-AVX2.s ++++ upstream-patched/lib/low/KeccakP-1600/AVX2/KeccakP-1600-AVX2.s +@@ -15,6 +15,34 @@ + # The rest of the code was written by Ronny Van Keer. + # Adaptations for macOS by Stéphane Léon. + ++#ifdef ADD_SYMBOL_SUFFIX ++#define KeccakP1600_Initialize KeccakP1600_Initialize_avx2 ++#define KeccakP1600_AddByte KeccakP1600_AddByte_avx2 ++#define KeccakP1600_AddBytes KeccakP1600_AddBytes_avx2 ++#define KeccakP1600_OverwriteBytes KeccakP1600_OverwriteBytes_avx2 ++#define KeccakP1600_OverwriteWithZeroes KeccakP1600_OverwriteWithZeroes_avx2 ++#define KeccakP1600_Permute_Nrounds KeccakP1600_Permute_Nrounds_avx2 ++#define KeccakP1600_Permute_12rounds KeccakP1600_Permute_12rounds_avx2 ++#define KeccakP1600_Permute_24rounds KeccakP1600_Permute_24rounds_avx2 ++#define KeccakP1600_ExtractBytes KeccakP1600_ExtractBytes_avx2 ++#define KeccakP1600_ExtractAndAddBytes KeccakP1600_ExtractAndAddBytes_avx2 ++#define KeccakF1600_FastLoop_Absorb KeccakF1600_FastLoop_Absorb_avx2 ++#define KeccakP1600_12rounds_FastLoop_Absorb KeccakP1600_12rounds_FastLoop_Absorb_avx2 ++ ++#define _KeccakP1600_Initialize _KeccakP1600_Initialize_avx2 ++#define _KeccakP1600_AddByte _KeccakP1600_AddByte_avx2 ++#define _KeccakP1600_AddBytes _KeccakP1600_AddBytes_avx2 ++#define _KeccakP1600_OverwriteBytes _KeccakP1600_OverwriteBytes_avx2 ++#define _KeccakP1600_OverwriteWithZeroes _KeccakP1600_OverwriteWithZeroes_avx2 ++#define _KeccakP1600_Permute_Nrounds _KeccakP1600_Permute_Nrounds_avx2 ++#define _KeccakP1600_Permute_12rounds _KeccakP1600_Permute_12rounds_avx2 ++#define _KeccakP1600_Permute_24rounds _KeccakP1600_Permute_24rounds_avx2 ++#define _KeccakP1600_ExtractBytes _KeccakP1600_ExtractBytes_avx2 ++#define _KeccakP1600_ExtractAndAddBytes _KeccakP1600_ExtractAndAddBytes_avx2 ++#define _KeccakF1600_FastLoop_Absorb _KeccakF1600_FastLoop_Absorb_avx2 ++#define _KeccakP1600_12rounds_FastLoop_Absorb _KeccakP1600_12rounds_FastLoop_Absorb_avx2 ++#endif ++ + .text + + # ----------------------------------------------------------------------------- + diff --git a/scripts/copy_from_xkcp/patches/lib_low_KeccakP-1600_AVX2_KeccakP-1600-SnP.h b/scripts/copy_from_xkcp/patches/lib_low_KeccakP-1600_AVX2_KeccakP-1600-SnP.h new file mode 100644 index 000000000..42d29cd83 --- /dev/null +++ b/scripts/copy_from_xkcp/patches/lib_low_KeccakP-1600_AVX2_KeccakP-1600-SnP.h @@ -0,0 +1,69 @@ +--- upstream/lib/low/KeccakP-1600/AVX2/KeccakP-1600-SnP.h ++++ upstream-patched/lib/low/KeccakP-1600/AVX2/KeccakP-1600-SnP.h +@@ -23,24 +23,59 @@ + + #include + +-#define KeccakP1600_implementation "AVX2 optimized implementation" +-#define KeccakP1600_stateSizeInBytes 200 +-#define KeccakP1600_stateAlignment 32 +-#define KeccakF1600_FastLoop_supported +-#define KeccakP1600_12rounds_FastLoop_supported ++#define KeccakP1600_implementation_avx2 "AVX2 optimized implementation" ++#define KeccakP1600_stateSizeInBytes_avx2 200 ++#define KeccakP1600_stateAlignment_avx2 32 ++#define KeccakF1600_FastLoop_supported_avx2 + +-#define KeccakP1600_StaticInitialize() ++#if defined(ADD_SYMBOL_SUFFIX) ++#define KECCAK_SYMBOL_SUFFIX avx2 ++#define KECCAK_IMPL_NAMESPACE(x) x##_avx2 ++#else ++#define KECCAK_IMPL_NAMESPACE(x) x ++#define KeccakP1600_implementation KeccakP1600_implementation_avx2 ++#define KeccakP1600_stateSizeInBytes KeccakP1600_stateSizeInBytes_avx2 ++#define KeccakP1600_stateAlignment KeccakP1600_stateAlignment_avx2 ++#define KeccakF1600_FastLoop_supported KeccakF1600_FastLoop_supported_avx2 ++#endif ++ ++#define KeccakP1600_StaticInitialize KECCAK_IMPL_NAMESPACE(KeccakP1600_StaticInitialize) ++void KeccakP1600_StaticInitialize(void); ++ ++#define KeccakP1600_Initialize KECCAK_IMPL_NAMESPACE(KeccakP1600_Initialize) + void KeccakP1600_Initialize(void *state); ++ ++#define KeccakP1600_AddByte KECCAK_IMPL_NAMESPACE(KeccakP1600_AddByte) + void KeccakP1600_AddByte(void *state, unsigned char data, unsigned int offset); ++ ++#define KeccakP1600_AddBytes KECCAK_IMPL_NAMESPACE(KeccakP1600_AddBytes) + void KeccakP1600_AddBytes(void *state, const unsigned char *data, unsigned int offset, unsigned int length); ++ ++#define KeccakP1600_OverwriteBytes KECCAK_IMPL_NAMESPACE(KeccakP1600_OverwriteBytes) + void KeccakP1600_OverwriteBytes(void *state, const unsigned char *data, unsigned int offset, unsigned int length); ++ ++#define KeccakP1600_OverwriteWithZeroes KECCAK_IMPL_NAMESPACE(KeccakP1600_OverwriteWithZeroes) + void KeccakP1600_OverwriteWithZeroes(void *state, unsigned int byteCount); ++ ++#define KeccakP1600_Permute_Nrounds KECCAK_IMPL_NAMESPACE(KeccakP1600_Permute_Nrounds) + void KeccakP1600_Permute_Nrounds(void *state, unsigned int nrounds); ++ ++#define KeccakP1600_Permute_12rounds KECCAK_IMPL_NAMESPACE(KeccakP1600_Permute_12rounds) + void KeccakP1600_Permute_12rounds(void *state); ++ ++#define KeccakP1600_Permute_24rounds KECCAK_IMPL_NAMESPACE(KeccakP1600_Permute_24rounds) + void KeccakP1600_Permute_24rounds(void *state); ++ ++#define KeccakP1600_ExtractBytes KECCAK_IMPL_NAMESPACE(KeccakP1600_ExtractBytes) + void KeccakP1600_ExtractBytes(const void *state, unsigned char *data, unsigned int offset, unsigned int length); ++ ++#define KeccakP1600_ExtractAndAddBytes KECCAK_IMPL_NAMESPACE(KeccakP1600_ExtractAndAddBytes) + void KeccakP1600_ExtractAndAddBytes(const void *state, const unsigned char *input, unsigned char *output, unsigned int offset, unsigned int length); ++ ++#define KeccakF1600_FastLoop_Absorb KECCAK_IMPL_NAMESPACE(KeccakF1600_FastLoop_Absorb) + size_t KeccakF1600_FastLoop_Absorb(void *state, unsigned int laneCount, const unsigned char *data, size_t dataByteLen); ++ ++#define KeccakP1600_12rounds_FastLoop_Absorb KECCAK_IMPL_NAMESPACE(KeccakP1600_12rounds_FastLoop_Absorb) + size_t KeccakP1600_12rounds_FastLoop_Absorb(void *state, unsigned int laneCount, const unsigned char *data, size_t dataByteLen); + + #endif + diff --git a/scripts/copy_from_xkcp/patches/lib_low_KeccakP-1600_plain-64bits_KeccakP-1600-SnP.h b/scripts/copy_from_xkcp/patches/lib_low_KeccakP-1600_plain-64bits_KeccakP-1600-SnP.h new file mode 100644 index 000000000..96bde8970 --- /dev/null +++ b/scripts/copy_from_xkcp/patches/lib_low_KeccakP-1600_plain-64bits_KeccakP-1600-SnP.h @@ -0,0 +1,79 @@ +--- upstream/lib/low/KeccakP-1600/plain-64bits/KeccakP-1600-SnP.h ++++ upstream-patched/lib/low/KeccakP-1600/plain-64bits/KeccakP-1600-SnP.h +@@ -24,31 +24,63 @@ + #include "brg_endian.h" + #include "KeccakP-1600-opt64-config.h" + +-#define KeccakP1600_implementation "generic 64-bit optimized implementation (" KeccakP1600_implementation_config ")" +-#define KeccakP1600_stateSizeInBytes 200 +-#define KeccakP1600_stateAlignment 8 +-#define KeccakF1600_FastLoop_supported +-#define KeccakP1600_12rounds_FastLoop_supported +- + #include + +-#define KeccakP1600_StaticInitialize() +-void KeccakP1600_Initialize(void *state); +-#if (PLATFORM_BYTE_ORDER == IS_LITTLE_ENDIAN) +-#define KeccakP1600_AddByte(state, byte, offset) \ +- ((unsigned char*)(state))[(offset)] ^= (byte) ++#define KeccakP1600_implementation_plain64 "generic 64-bit optimized implementation (" KeccakP1600_implementation_config ")" ++#define KeccakP1600_stateSizeInBytes_plain64 200 ++#define KeccakP1600_stateAlignment_plain64 8 ++#define KeccakF1600_FastLoop_supported_plain64 ++#define KeccakP1600_12rounds_FastLoop_supported_plain64 ++ ++#if defined(ADD_SYMBOL_SUFFIX) ++#define KECCAK_SYMBOL_SUFFIX plain64 ++#define KECCAK_IMPL_NAMESPACE(x) x##_plain64 + #else +-void KeccakP1600_AddByte(void *state, unsigned char data, unsigned int offset); ++#define KECCAK_IMPL_NAMESPACE(x) x ++#define KeccakP1600_implementation KeccakP1600_implementation_plain64 ++#define KeccakP1600_stateSizeInBytes KeccakP1600_stateSizeInBytes_plain64 ++#define KeccakP1600_stateAlignment KeccakP1600_stateAlignment_plain64 ++#define KeccakF1600_FastLoop_supported KeccakF1600_FastLoop_supported_plain64 ++#define KeccakP1600_12rounds_FastLoop_supported KeccakP1600_12rounds_FastLoop_supported_plain64 + #endif ++ ++#define KeccakP1600_StaticInitialize KECCAK_IMPL_NAMESPACE(KeccakP1600_StaticInitialize) ++void KeccakP1600_StaticInitialize(void); ++ ++#define KeccakP1600_Initialize KECCAK_IMPL_NAMESPACE(KeccakP1600_Initialize) ++void KeccakP1600_Initialize(void *state); ++ ++#define KeccakP1600_AddByte KECCAK_IMPL_NAMESPACE(KeccakP1600_AddByte) ++void KeccakP1600_AddByte(void *state, unsigned char data, unsigned int offset); ++ ++#define KeccakP1600_AddBytes KECCAK_IMPL_NAMESPACE(KeccakP1600_AddBytes) + void KeccakP1600_AddBytes(void *state, const unsigned char *data, unsigned int offset, unsigned int length); ++ ++#define KeccakP1600_OverwriteBytes KECCAK_IMPL_NAMESPACE(KeccakP1600_OverwriteBytes) + void KeccakP1600_OverwriteBytes(void *state, const unsigned char *data, unsigned int offset, unsigned int length); ++ ++#define KeccakP1600_OverwriteWithZeroes KECCAK_IMPL_NAMESPACE(KeccakP1600_OverwriteWithZeroes) + void KeccakP1600_OverwriteWithZeroes(void *state, unsigned int byteCount); ++ ++#define KeccakP1600_Permute_Nrounds KECCAK_IMPL_NAMESPACE(KeccakP1600_Permute_Nrounds) + void KeccakP1600_Permute_Nrounds(void *state, unsigned int nrounds); ++ ++#define KeccakP1600_Permute_12rounds KECCAK_IMPL_NAMESPACE(KeccakP1600_Permute_12rounds) + void KeccakP1600_Permute_12rounds(void *state); ++ ++#define KeccakP1600_Permute_24rounds KECCAK_IMPL_NAMESPACE(KeccakP1600_Permute_24rounds) + void KeccakP1600_Permute_24rounds(void *state); ++ ++#define KeccakP1600_ExtractBytes KECCAK_IMPL_NAMESPACE(KeccakP1600_ExtractBytes) + void KeccakP1600_ExtractBytes(const void *state, unsigned char *data, unsigned int offset, unsigned int length); ++ ++#define KeccakP1600_ExtractAndAddBytes KECCAK_IMPL_NAMESPACE(KeccakP1600_ExtractAndAddBytes) + void KeccakP1600_ExtractAndAddBytes(const void *state, const unsigned char *input, unsigned char *output, unsigned int offset, unsigned int length); ++ ++#define KeccakF1600_FastLoop_Absorb KECCAK_IMPL_NAMESPACE(KeccakF1600_FastLoop_Absorb) + size_t KeccakF1600_FastLoop_Absorb(void *state, unsigned int laneCount, const unsigned char *data, size_t dataByteLen); ++ ++#define KeccakP1600_12rounds_FastLoop_Absorb KECCAK_IMPL_NAMESPACE(KeccakP1600_12rounds_FastLoop_Absorb) + size_t KeccakP1600_12rounds_FastLoop_Absorb(void *state, unsigned int laneCount, const unsigned char *data, size_t dataByteLen); + + #endif + diff --git a/scripts/copy_from_xkcp/patches/lib_low_KeccakP-1600_plain-64bits_KeccakP-1600-opt64.c b/scripts/copy_from_xkcp/patches/lib_low_KeccakP-1600_plain-64bits_KeccakP-1600-opt64.c index 560afd2c3..cbd44697e 100644 --- a/scripts/copy_from_xkcp/patches/lib_low_KeccakP-1600_plain-64bits_KeccakP-1600-opt64.c +++ b/scripts/copy_from_xkcp/patches/lib_low_KeccakP-1600_plain-64bits_KeccakP-1600-opt64.c @@ -1,6 +1,25 @@ --- upstream/lib/low/KeccakP-1600/plain-64bits/KeccakP-1600-opt64.c +++ upstream-patched/lib/low/KeccakP-1600/plain-64bits/KeccakP-1600-opt64.c -@@ -137,27 +137,27 @@ +@@ -26,6 +26,7 @@ + #include + #include + #include "brg_endian.h" ++#include "KeccakP-1600-SnP.h" + #include "KeccakP-1600-opt64-config.h" + + #if defined(KeccakP1600_useLaneComplementing) +@@ -82,6 +83,10 @@ + + /* ---------------------------------------------------------------- */ + ++void KeccakP1600_StaticInitialize(void) { } ++ ++/* ---------------------------------------------------------------- */ ++ + void KeccakP1600_Initialize(void *state) + { + memset(state, 0, 200); +@@ -137,27 +142,27 @@ { /* Otherwise... */ for( ; (i+8)<=laneCount; i+=8) { @@ -43,7 +62,20 @@ } } #else -@@ -235,11 +235,11 @@ +@@ -179,7 +184,11 @@ + + /* ---------------------------------------------------------------- */ + +-#if (PLATFORM_BYTE_ORDER != IS_LITTLE_ENDIAN) ++#if (PLATFORM_BYTE_ORDER == IS_LITTLE_ENDIAN) ++void KeccakP1600_AddByte(void *state, unsigned char byte, unsigned int offset) { ++ ((unsigned char*)state)[offset] ^= byte; ++} ++#else + void KeccakP1600_AddByte(void *state, unsigned char byte, unsigned int offset) + { + uint64_t lane = byte; +@@ -235,11 +244,11 @@ #ifdef KeccakP1600_useLaneComplementing unsigned int lanePosition; @@ -59,7 +91,7 @@ #else memcpy(state, data, laneCount*8); #endif -@@ -282,7 +282,7 @@ +@@ -282,7 +291,7 @@ for(lanePosition=0; lanePosition + $) +endif() + +# We currently do not have a SHA3 AVX2 implementation that works on Windows +if(BUILD_AVX2 AND CMAKE_SYSTEM_NAME MATCHES "Linux|Darwin") + add_library(xkcp_low_keccakp_1600_avx2 OBJECT KeccakP-1600/avx2/KeccakP-1600-AVX2.S) + + add_library(xkcp_low_keccakp_1600times4_avx2 OBJECT KeccakP-1600times4/avx2/KeccakP-1600-times4-SIMD256.c) + target_include_directories(xkcp_low_keccakp_1600times4_avx2 PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/KeccakP-1600/avx2) + target_compile_options(xkcp_low_keccakp_1600times4_avx2 PRIVATE -mavx2) + if(CMAKE_SYSTEM_NAME STREQUAL "Darwin") target_compile_options(xkcp_low_keccakp_1600_avx2 PRIVATE -Wa,-defsym,macOS=1) endif() - target_include_directories(xkcp_low_keccakp_1600_avx2 PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/KeccakP-1600/avx2) - set(_XKCP_LOW_OBJS ${_XKCP_LOW_OBJS} $) - set(_XKCP_LOW_INCLUDE_DIRS ${_XKCP_LOW_INCLUDE_DIRS} ${CMAKE_CURRENT_SOURCE_DIR}/KeccakP-1600/avx2) - add_library(xkcp_low_keccakp_1600times4_avx2 OBJECT KeccakP-1600times4/avx2/KeccakP-1600-times4-SIMD256.c) - target_compile_options(xkcp_low_keccakp_1600times4_avx2 PRIVATE -mavx2) - target_include_directories(xkcp_low_keccakp_1600times4_avx2 PRIVATE ${_XKCP_LOW_INCLUDE_DIRS} ${CMAKE_CURRENT_SOURCE_DIR}/KeccakP-1600times4/avx2) - set(_XKCP_LOW_OBJS ${_XKCP_LOW_OBJS} $) - set(_XKCP_LOW_INCLUDE_DIRS ${_XKCP_LOW_INCLUDE_DIRS} ${CMAKE_CURRENT_SOURCE_DIR}/KeccakP-1600times4/avx2) -else() - add_library(xkcp_low_keccakp_1600_plain64 OBJECT KeccakP-1600/plain-64bits/KeccakP-1600-opt64.c) - target_include_directories(xkcp_low_keccakp_1600_plain64 PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/KeccakP-1600/plain-64bits) - set(_XKCP_LOW_OBJS ${_XKCP_LOW_OBJS} $) - set(_XKCP_LOW_INCLUDE_DIRS ${_XKCP_LOW_INCLUDE_DIRS} ${CMAKE_CURRENT_SOURCE_DIR}/KeccakP-1600/plain-64bits) + if(OQS_DIST_X86_64_BUILD) + target_compile_definitions(xkcp_low_keccakp_1600_avx2 PRIVATE ADD_SYMBOL_SUFFIX) + target_compile_definitions(xkcp_low_keccakp_1600times4_avx2 PRIVATE ADD_SYMBOL_SUFFIX) + endif() - add_library(xkcp_low_keccakp_1600times4_plain64 OBJECT KeccakP-1600times4/serial/KeccakP-1600-times4-on1.c) - target_include_directories(xkcp_low_keccakp_1600times4_plain64 PRIVATE ${_XKCP_LOW_INCLUDE_DIRS} ${CMAKE_CURRENT_SOURCE_DIR}/KeccakP-1600times4/serial) - set(_XKCP_LOW_OBJS ${_XKCP_LOW_OBJS} $) - set(_XKCP_LOW_INCLUDE_DIRS ${_XKCP_LOW_INCLUDE_DIRS} ${CMAKE_CURRENT_SOURCE_DIR}/KeccakP-1600times4/serial) + set(_XKCP_LOW_OBJS ${_XKCP_LOW_OBJS} $ + $) endif() set(XKCP_LOW_OBJS ${_XKCP_LOW_OBJS} PARENT_SCOPE) -set(XKCP_LOW_INCLUDE_DIRS ${_XKCP_LOW_INCLUDE_DIRS} PARENT_SCOPE) diff --git a/src/common/sha3/xkcp_low/KeccakP-1600/avx2/KeccakP-1600-AVX2.s b/src/common/sha3/xkcp_low/KeccakP-1600/avx2/KeccakP-1600-AVX2.S similarity index 96% rename from src/common/sha3/xkcp_low/KeccakP-1600/avx2/KeccakP-1600-AVX2.s rename to src/common/sha3/xkcp_low/KeccakP-1600/avx2/KeccakP-1600-AVX2.S index 55c7a7d3c..26a649014 100644 --- a/src/common/sha3/xkcp_low/KeccakP-1600/avx2/KeccakP-1600-AVX2.s +++ b/src/common/sha3/xkcp_low/KeccakP-1600/avx2/KeccakP-1600-AVX2.S @@ -15,6 +15,34 @@ # The rest of the code was written by Ronny Van Keer. # Adaptations for macOS by Stéphane Léon. +#ifdef ADD_SYMBOL_SUFFIX +#define KeccakP1600_Initialize KeccakP1600_Initialize_avx2 +#define KeccakP1600_AddByte KeccakP1600_AddByte_avx2 +#define KeccakP1600_AddBytes KeccakP1600_AddBytes_avx2 +#define KeccakP1600_OverwriteBytes KeccakP1600_OverwriteBytes_avx2 +#define KeccakP1600_OverwriteWithZeroes KeccakP1600_OverwriteWithZeroes_avx2 +#define KeccakP1600_Permute_Nrounds KeccakP1600_Permute_Nrounds_avx2 +#define KeccakP1600_Permute_12rounds KeccakP1600_Permute_12rounds_avx2 +#define KeccakP1600_Permute_24rounds KeccakP1600_Permute_24rounds_avx2 +#define KeccakP1600_ExtractBytes KeccakP1600_ExtractBytes_avx2 +#define KeccakP1600_ExtractAndAddBytes KeccakP1600_ExtractAndAddBytes_avx2 +#define KeccakF1600_FastLoop_Absorb KeccakF1600_FastLoop_Absorb_avx2 +#define KeccakP1600_12rounds_FastLoop_Absorb KeccakP1600_12rounds_FastLoop_Absorb_avx2 + +#define _KeccakP1600_Initialize _KeccakP1600_Initialize_avx2 +#define _KeccakP1600_AddByte _KeccakP1600_AddByte_avx2 +#define _KeccakP1600_AddBytes _KeccakP1600_AddBytes_avx2 +#define _KeccakP1600_OverwriteBytes _KeccakP1600_OverwriteBytes_avx2 +#define _KeccakP1600_OverwriteWithZeroes _KeccakP1600_OverwriteWithZeroes_avx2 +#define _KeccakP1600_Permute_Nrounds _KeccakP1600_Permute_Nrounds_avx2 +#define _KeccakP1600_Permute_12rounds _KeccakP1600_Permute_12rounds_avx2 +#define _KeccakP1600_Permute_24rounds _KeccakP1600_Permute_24rounds_avx2 +#define _KeccakP1600_ExtractBytes _KeccakP1600_ExtractBytes_avx2 +#define _KeccakP1600_ExtractAndAddBytes _KeccakP1600_ExtractAndAddBytes_avx2 +#define _KeccakF1600_FastLoop_Absorb _KeccakF1600_FastLoop_Absorb_avx2 +#define _KeccakP1600_12rounds_FastLoop_Absorb _KeccakP1600_12rounds_FastLoop_Absorb_avx2 +#endif + .text # ----------------------------------------------------------------------------- diff --git a/src/common/sha3/xkcp_low/KeccakP-1600/avx2/KeccakP-1600-SnP.h b/src/common/sha3/xkcp_low/KeccakP-1600/avx2/KeccakP-1600-SnP.h index 1877ca72b..ca31ffbd5 100644 --- a/src/common/sha3/xkcp_low/KeccakP-1600/avx2/KeccakP-1600-SnP.h +++ b/src/common/sha3/xkcp_low/KeccakP-1600/avx2/KeccakP-1600-SnP.h @@ -23,24 +23,59 @@ Please refer to SnP-documentation.h for more details. #include -#define KeccakP1600_implementation "AVX2 optimized implementation" -#define KeccakP1600_stateSizeInBytes 200 -#define KeccakP1600_stateAlignment 32 -#define KeccakF1600_FastLoop_supported -#define KeccakP1600_12rounds_FastLoop_supported +#define KeccakP1600_implementation_avx2 "AVX2 optimized implementation" +#define KeccakP1600_stateSizeInBytes_avx2 200 +#define KeccakP1600_stateAlignment_avx2 32 +#define KeccakF1600_FastLoop_supported_avx2 -#define KeccakP1600_StaticInitialize() +#if defined(ADD_SYMBOL_SUFFIX) +#define KECCAK_SYMBOL_SUFFIX avx2 +#define KECCAK_IMPL_NAMESPACE(x) x##_avx2 +#else +#define KECCAK_IMPL_NAMESPACE(x) x +#define KeccakP1600_implementation KeccakP1600_implementation_avx2 +#define KeccakP1600_stateSizeInBytes KeccakP1600_stateSizeInBytes_avx2 +#define KeccakP1600_stateAlignment KeccakP1600_stateAlignment_avx2 +#define KeccakF1600_FastLoop_supported KeccakF1600_FastLoop_supported_avx2 +#endif + +#define KeccakP1600_StaticInitialize KECCAK_IMPL_NAMESPACE(KeccakP1600_StaticInitialize) +void KeccakP1600_StaticInitialize(void); + +#define KeccakP1600_Initialize KECCAK_IMPL_NAMESPACE(KeccakP1600_Initialize) void KeccakP1600_Initialize(void *state); + +#define KeccakP1600_AddByte KECCAK_IMPL_NAMESPACE(KeccakP1600_AddByte) void KeccakP1600_AddByte(void *state, unsigned char data, unsigned int offset); + +#define KeccakP1600_AddBytes KECCAK_IMPL_NAMESPACE(KeccakP1600_AddBytes) void KeccakP1600_AddBytes(void *state, const unsigned char *data, unsigned int offset, unsigned int length); + +#define KeccakP1600_OverwriteBytes KECCAK_IMPL_NAMESPACE(KeccakP1600_OverwriteBytes) void KeccakP1600_OverwriteBytes(void *state, const unsigned char *data, unsigned int offset, unsigned int length); + +#define KeccakP1600_OverwriteWithZeroes KECCAK_IMPL_NAMESPACE(KeccakP1600_OverwriteWithZeroes) void KeccakP1600_OverwriteWithZeroes(void *state, unsigned int byteCount); + +#define KeccakP1600_Permute_Nrounds KECCAK_IMPL_NAMESPACE(KeccakP1600_Permute_Nrounds) void KeccakP1600_Permute_Nrounds(void *state, unsigned int nrounds); + +#define KeccakP1600_Permute_12rounds KECCAK_IMPL_NAMESPACE(KeccakP1600_Permute_12rounds) void KeccakP1600_Permute_12rounds(void *state); + +#define KeccakP1600_Permute_24rounds KECCAK_IMPL_NAMESPACE(KeccakP1600_Permute_24rounds) void KeccakP1600_Permute_24rounds(void *state); + +#define KeccakP1600_ExtractBytes KECCAK_IMPL_NAMESPACE(KeccakP1600_ExtractBytes) void KeccakP1600_ExtractBytes(const void *state, unsigned char *data, unsigned int offset, unsigned int length); + +#define KeccakP1600_ExtractAndAddBytes KECCAK_IMPL_NAMESPACE(KeccakP1600_ExtractAndAddBytes) void KeccakP1600_ExtractAndAddBytes(const void *state, const unsigned char *input, unsigned char *output, unsigned int offset, unsigned int length); + +#define KeccakF1600_FastLoop_Absorb KECCAK_IMPL_NAMESPACE(KeccakF1600_FastLoop_Absorb) size_t KeccakF1600_FastLoop_Absorb(void *state, unsigned int laneCount, const unsigned char *data, size_t dataByteLen); + +#define KeccakP1600_12rounds_FastLoop_Absorb KECCAK_IMPL_NAMESPACE(KeccakP1600_12rounds_FastLoop_Absorb) size_t KeccakP1600_12rounds_FastLoop_Absorb(void *state, unsigned int laneCount, const unsigned char *data, size_t dataByteLen); #endif diff --git a/src/common/sha3/xkcp_low/KeccakP-1600/plain-64bits/KeccakP-1600-SnP.h b/src/common/sha3/xkcp_low/KeccakP-1600/plain-64bits/KeccakP-1600-SnP.h index 1f811b0bf..078fbc36a 100644 --- a/src/common/sha3/xkcp_low/KeccakP-1600/plain-64bits/KeccakP-1600-SnP.h +++ b/src/common/sha3/xkcp_low/KeccakP-1600/plain-64bits/KeccakP-1600-SnP.h @@ -24,31 +24,63 @@ Please refer to SnP-documentation.h for more details. #include "brg_endian.h" #include "KeccakP-1600-opt64-config.h" -#define KeccakP1600_implementation "generic 64-bit optimized implementation (" KeccakP1600_implementation_config ")" -#define KeccakP1600_stateSizeInBytes 200 -#define KeccakP1600_stateAlignment 8 -#define KeccakF1600_FastLoop_supported -#define KeccakP1600_12rounds_FastLoop_supported - #include -#define KeccakP1600_StaticInitialize() -void KeccakP1600_Initialize(void *state); -#if (PLATFORM_BYTE_ORDER == IS_LITTLE_ENDIAN) -#define KeccakP1600_AddByte(state, byte, offset) \ - ((unsigned char*)(state))[(offset)] ^= (byte) +#define KeccakP1600_implementation_plain64 "generic 64-bit optimized implementation (" KeccakP1600_implementation_config ")" +#define KeccakP1600_stateSizeInBytes_plain64 200 +#define KeccakP1600_stateAlignment_plain64 8 +#define KeccakF1600_FastLoop_supported_plain64 +#define KeccakP1600_12rounds_FastLoop_supported_plain64 + +#if defined(ADD_SYMBOL_SUFFIX) +#define KECCAK_SYMBOL_SUFFIX plain64 +#define KECCAK_IMPL_NAMESPACE(x) x##_plain64 #else -void KeccakP1600_AddByte(void *state, unsigned char data, unsigned int offset); +#define KECCAK_IMPL_NAMESPACE(x) x +#define KeccakP1600_implementation KeccakP1600_implementation_plain64 +#define KeccakP1600_stateSizeInBytes KeccakP1600_stateSizeInBytes_plain64 +#define KeccakP1600_stateAlignment KeccakP1600_stateAlignment_plain64 +#define KeccakF1600_FastLoop_supported KeccakF1600_FastLoop_supported_plain64 +#define KeccakP1600_12rounds_FastLoop_supported KeccakP1600_12rounds_FastLoop_supported_plain64 #endif + +#define KeccakP1600_StaticInitialize KECCAK_IMPL_NAMESPACE(KeccakP1600_StaticInitialize) +void KeccakP1600_StaticInitialize(void); + +#define KeccakP1600_Initialize KECCAK_IMPL_NAMESPACE(KeccakP1600_Initialize) +void KeccakP1600_Initialize(void *state); + +#define KeccakP1600_AddByte KECCAK_IMPL_NAMESPACE(KeccakP1600_AddByte) +void KeccakP1600_AddByte(void *state, unsigned char data, unsigned int offset); + +#define KeccakP1600_AddBytes KECCAK_IMPL_NAMESPACE(KeccakP1600_AddBytes) void KeccakP1600_AddBytes(void *state, const unsigned char *data, unsigned int offset, unsigned int length); + +#define KeccakP1600_OverwriteBytes KECCAK_IMPL_NAMESPACE(KeccakP1600_OverwriteBytes) void KeccakP1600_OverwriteBytes(void *state, const unsigned char *data, unsigned int offset, unsigned int length); + +#define KeccakP1600_OverwriteWithZeroes KECCAK_IMPL_NAMESPACE(KeccakP1600_OverwriteWithZeroes) void KeccakP1600_OverwriteWithZeroes(void *state, unsigned int byteCount); + +#define KeccakP1600_Permute_Nrounds KECCAK_IMPL_NAMESPACE(KeccakP1600_Permute_Nrounds) void KeccakP1600_Permute_Nrounds(void *state, unsigned int nrounds); + +#define KeccakP1600_Permute_12rounds KECCAK_IMPL_NAMESPACE(KeccakP1600_Permute_12rounds) void KeccakP1600_Permute_12rounds(void *state); + +#define KeccakP1600_Permute_24rounds KECCAK_IMPL_NAMESPACE(KeccakP1600_Permute_24rounds) void KeccakP1600_Permute_24rounds(void *state); + +#define KeccakP1600_ExtractBytes KECCAK_IMPL_NAMESPACE(KeccakP1600_ExtractBytes) void KeccakP1600_ExtractBytes(const void *state, unsigned char *data, unsigned int offset, unsigned int length); + +#define KeccakP1600_ExtractAndAddBytes KECCAK_IMPL_NAMESPACE(KeccakP1600_ExtractAndAddBytes) void KeccakP1600_ExtractAndAddBytes(const void *state, const unsigned char *input, unsigned char *output, unsigned int offset, unsigned int length); + +#define KeccakF1600_FastLoop_Absorb KECCAK_IMPL_NAMESPACE(KeccakF1600_FastLoop_Absorb) size_t KeccakF1600_FastLoop_Absorb(void *state, unsigned int laneCount, const unsigned char *data, size_t dataByteLen); + +#define KeccakP1600_12rounds_FastLoop_Absorb KECCAK_IMPL_NAMESPACE(KeccakP1600_12rounds_FastLoop_Absorb) size_t KeccakP1600_12rounds_FastLoop_Absorb(void *state, unsigned int laneCount, const unsigned char *data, size_t dataByteLen); #endif diff --git a/src/common/sha3/xkcp_low/KeccakP-1600/plain-64bits/KeccakP-1600-opt64.c b/src/common/sha3/xkcp_low/KeccakP-1600/plain-64bits/KeccakP-1600-opt64.c index bc96ea64b..a01221fcd 100644 --- a/src/common/sha3/xkcp_low/KeccakP-1600/plain-64bits/KeccakP-1600-opt64.c +++ b/src/common/sha3/xkcp_low/KeccakP-1600/plain-64bits/KeccakP-1600-opt64.c @@ -26,6 +26,7 @@ Please refer to LowLevel.build for the exact list of other files it must be comb #include #include #include "brg_endian.h" +#include "KeccakP-1600-SnP.h" #include "KeccakP-1600-opt64-config.h" #define UseBebigokimisa @@ -77,6 +78,10 @@ static const uint64_t KeccakF1600RoundConstants[24] = { /* ---------------------------------------------------------------- */ +void KeccakP1600_StaticInitialize(void) { } + +/* ---------------------------------------------------------------- */ + void KeccakP1600_Initialize(void *state) { memset(state, 0, 200); ((uint64_t *)state)[ 1] = ~(uint64_t)0; @@ -167,7 +172,11 @@ void KeccakP1600_AddLanes(void *state, const unsigned char *data, unsigned int l /* ---------------------------------------------------------------- */ -#if (PLATFORM_BYTE_ORDER != IS_LITTLE_ENDIAN) +#if (PLATFORM_BYTE_ORDER == IS_LITTLE_ENDIAN) +void KeccakP1600_AddByte(void *state, unsigned char byte, unsigned int offset) { + ((unsigned char *)state)[offset] ^= byte; +} +#else void KeccakP1600_AddByte(void *state, unsigned char byte, unsigned int offset) { uint64_t lane = byte; lane <<= (offset % 8) * 8; diff --git a/src/common/sha3/xkcp_low/KeccakP-1600times4/avx2/KeccakP-1600-times4-SIMD256.c b/src/common/sha3/xkcp_low/KeccakP-1600times4/avx2/KeccakP-1600-times4-SIMD256.c index f7a8c94b8..db054ba37 100644 --- a/src/common/sha3/xkcp_low/KeccakP-1600times4/avx2/KeccakP-1600-times4-SIMD256.c +++ b/src/common/sha3/xkcp_low/KeccakP-1600times4/avx2/KeccakP-1600-times4-SIMD256.c @@ -106,7 +106,11 @@ static void store64(unsigned char *out, uint64_t in) { } void KeccakP1600times4_InitializeAll(void *states) { - memset(states, 0, KeccakP1600times4_statesSizeInBytes); + memset(states, 0, KeccakP1600times4_statesSizeInBytes_avx2); +} + +void KeccakP1600times4_AddByte(void *states, unsigned int instanceIndex, unsigned char byte, unsigned int offset) { + ((unsigned char *)states)[instanceIndex * 8 + (offset / 8) * 4 * 8 + offset % 8] ^= byte; } void KeccakP1600times4_AddBytes(void *states, unsigned int instanceIndex, const unsigned char *data, unsigned int offset, unsigned int length) { @@ -715,7 +719,7 @@ void KeccakP1600times4_ExtractAndAddLanesAll(const void *states, const unsigned E##su = XOR256(Bsu, ANDnu256(Bsa, Bse)); \ \ -static ALIGN(KeccakP1600times4_statesAlignment) const uint64_t KeccakF1600RoundConstants[24] = { +static ALIGN(KeccakP1600times4_statesAlignment_avx2) const uint64_t KeccakF1600RoundConstants[24] = { 0x0000000000000001ULL, 0x0000000000008082ULL, 0x800000000000808aULL, diff --git a/src/common/sha3/xkcp_low/KeccakP-1600times4/avx2/KeccakP-1600-times4-SnP.h b/src/common/sha3/xkcp_low/KeccakP-1600times4/avx2/KeccakP-1600-times4-SnP.h index 5782ec859..de24ea2f7 100644 --- a/src/common/sha3/xkcp_low/KeccakP-1600times4/avx2/KeccakP-1600-times4-SnP.h +++ b/src/common/sha3/xkcp_low/KeccakP-1600times4/avx2/KeccakP-1600-times4-SnP.h @@ -21,33 +21,78 @@ Please refer to PlSnP-documentation.h for more details. #include #include "SIMD256-config.h" -#define KeccakP1600times4_implementation "256-bit SIMD implementation (" KeccakP1600times4_implementation_config ")" -#define KeccakP1600times4_statesSizeInBytes 800 -#define KeccakP1600times4_statesAlignment 32 -#define KeccakF1600times4_FastLoop_supported -#define KeccakP1600times4_12rounds_FastLoop_supported -#define KeccakF1600times4_FastKravatte_supported - #include +#define KeccakP1600times4_implementation_avx2 "256-bit SIMD implementation (" KeccakP1600times4_implementation_config ")" +#define KeccakP1600times4_statesSizeInBytes_avx2 800 +#define KeccakP1600times4_statesAlignment_avx2 32 +#define KeccakF1600times4_FastLoop_supported_avx2 +#define KeccakP1600times4_12rounds_FastLoop_supported_avx2 +#define KeccakF1600times4_FastKravatte_supported_avx2 + +#if defined(ADD_SYMBOL_SUFFIX) +#define KECCAKTIMES4_IMPL_NAMESPACE(x) x##_avx2 +#else +#define KECCAKTIMES4_IMPL_NAMESPACE(x) x +#define KeccakP1600times4_implementation KeccakP1600times4_implementation_avx2 +#define KeccakP1600times4_statesSizeInBytes KeccakP1600times4_statesSizeInBytes_avx2 +#define KeccakP1600times4_statesAlignment KeccakP1600times4_statesAlignment_avx2 +#define KeccakF1600times4_FastLoop_supported KeccakF1600times4_FastLoop_supported_avx2 +#define KeccakP1600times4_12rounds_FastLoop_supported KeccakP1600times4_12rounds_FastLoop_supported_avx2 +#define KeccakF1600times4_FastKravatte_supported KeccakF1600times4_FastKravatte_supported_avx2 +#endif + #define KeccakP1600times4_StaticInitialize() + +#define KeccakP1600times4_InitializeAll KECCAKTIMES4_IMPL_NAMESPACE(KeccakP1600times4_InitializeAll) void KeccakP1600times4_InitializeAll(void *states); -#define KeccakP1600times4_AddByte(states, instanceIndex, byte, offset) \ - ((unsigned char*)(states))[(instanceIndex)*8 + ((offset)/8)*4*8 + (offset)%8] ^= (byte) + +#define KeccakP1600times4_AddByte KECCAKTIMES4_IMPL_NAMESPACE(KeccakP1600times4_AddByte) +void KeccakP1600times4_AddByte(void *states, unsigned int instanceIndex, unsigned char byte, unsigned int offset); + +#define KeccakP1600times4_AddBytes KECCAKTIMES4_IMPL_NAMESPACE(KeccakP1600times4_AddBytes) void KeccakP1600times4_AddBytes(void *states, unsigned int instanceIndex, const unsigned char *data, unsigned int offset, unsigned int length); + +#define KeccakP1600times4_AddLanesAll KECCAKTIMES4_IMPL_NAMESPACE(KeccakP1600times4_AddLanesAll) void KeccakP1600times4_AddLanesAll(void *states, const unsigned char *data, unsigned int laneCount, unsigned int laneOffset); + +#define KeccakP1600times4_OverwriteBytes KECCAKTIMES4_IMPL_NAMESPACE(KeccakP1600times4_OverwriteBytes) void KeccakP1600times4_OverwriteBytes(void *states, unsigned int instanceIndex, const unsigned char *data, unsigned int offset, unsigned int length); + +#define KeccakP1600times4_OverwriteLanesAll KECCAKTIMES4_IMPL_NAMESPACE(KeccakP1600times4_OverwriteLanesAll) void KeccakP1600times4_OverwriteLanesAll(void *states, const unsigned char *data, unsigned int laneCount, unsigned int laneOffset); + +#define KeccakP1600times4_OverwriteWithZeroes KECCAKTIMES4_IMPL_NAMESPACE(KeccakP1600times4_OverwriteWithZeroes) void KeccakP1600times4_OverwriteWithZeroes(void *states, unsigned int instanceIndex, unsigned int byteCount); + +#define KeccakP1600times4_PermuteAll_4rounds KECCAKTIMES4_IMPL_NAMESPACE(KeccakP1600times4_PermuteAll_4rounds) void KeccakP1600times4_PermuteAll_4rounds(void *states); + +#define KeccakP1600times4_PermuteAll_6rounds KECCAKTIMES4_IMPL_NAMESPACE(KeccakP1600times4_PermuteAll_6rounds) void KeccakP1600times4_PermuteAll_6rounds(void *states); + +#define KeccakP1600times4_PermuteAll_12rounds KECCAKTIMES4_IMPL_NAMESPACE(KeccakP1600times4_PermuteAll_12rounds) void KeccakP1600times4_PermuteAll_12rounds(void *states); + +#define KeccakP1600times4_PermuteAll_24rounds KECCAKTIMES4_IMPL_NAMESPACE(KeccakP1600times4_PermuteAll_24rounds) void KeccakP1600times4_PermuteAll_24rounds(void *states); + +#define KeccakP1600times4_ExtractBytes KECCAKTIMES4_IMPL_NAMESPACE(KeccakP1600times4_ExtractBytes) void KeccakP1600times4_ExtractBytes(const void *states, unsigned int instanceIndex, unsigned char *data, unsigned int offset, unsigned int length); + +#define KeccakP1600times4_ExtractLanesAll KECCAKTIMES4_IMPL_NAMESPACE(KeccakP1600times4_ExtractLanesAll) void KeccakP1600times4_ExtractLanesAll(const void *states, unsigned char *data, unsigned int laneCount, unsigned int laneOffset); + +#define KeccakP1600times4_ExtractAndAddBytes KECCAKTIMES4_IMPL_NAMESPACE(KeccakP1600times4_ExtractAndAddBytes) void KeccakP1600times4_ExtractAndAddBytes(const void *states, unsigned int instanceIndex, const unsigned char *input, unsigned char *output, unsigned int offset, unsigned int length); + +#define KeccakP1600times4_ExtractAndAddLanesAll KECCAKTIMES4_IMPL_NAMESPACE(KeccakP1600times4_ExtractAndAddLanesAll) void KeccakP1600times4_ExtractAndAddLanesAll(const void *states, const unsigned char *input, unsigned char *output, unsigned int laneCount, unsigned int laneOffset); + +#define KeccakF1600times4_FastLoop_Absorb KECCAKTIMES4_IMPL_NAMESPACE(KeccakF1600times4_FastLoop_Absorb) size_t KeccakF1600times4_FastLoop_Absorb(void *states, unsigned int laneCount, unsigned int laneOffsetParallel, unsigned int laneOffsetSerial, const unsigned char *data, size_t dataByteLen); + +#define KeccakP1600times4_12rounds_FastLoop_Absorb KECCAKTIMES4_IMPL_NAMESPACE(KeccakP1600times4_12rounds_FastLoop_Absorb) size_t KeccakP1600times4_12rounds_FastLoop_Absorb(void *states, unsigned int laneCount, unsigned int laneOffsetParallel, unsigned int laneOffsetSerial, const unsigned char *data, size_t dataByteLen); #endif diff --git a/src/common/sha3/xkcp_low/KeccakP-1600times4/serial/KeccakP-1600-times4-SnP.h b/src/common/sha3/xkcp_low/KeccakP-1600times4/serial/KeccakP-1600-times4-SnP.h index dec0c59c3..9698a922f 100644 --- a/src/common/sha3/xkcp_low/KeccakP-1600times4/serial/KeccakP-1600-times4-SnP.h +++ b/src/common/sha3/xkcp_low/KeccakP-1600times4/serial/KeccakP-1600-times4-SnP.h @@ -20,26 +20,63 @@ Please refer to PlSnP-documentation.h for more details. #include "KeccakP-1600-SnP.h" -#define KeccakP1600times4_implementation "fallback on serial implementation (" KeccakP1600_implementation ")" -#define KeccakP1600times4_statesSizeInBytes (((KeccakP1600_stateSizeInBytes+(KeccakP1600_stateAlignment-1))/KeccakP1600_stateAlignment)*KeccakP1600_stateAlignment*4) -#define KeccakP1600times4_statesAlignment KeccakP1600_stateAlignment -#define KeccakP1600times4_isFallback +#if defined(ADD_SYMBOL_SUFFIX) +#define KECCAKTIMES4_NAMESPACE(x) KeccakP1600times4_##x##_serial +#else +#define KECCAKTIMES4_NAMESPACE(x) KeccakP1600times4_##x +#endif +#define KeccakP1600times4_implementation "fallback on serial implementation (" KeccakP1600_implementation ")" +#define KeccakP1600times4_isFallback +#define KeccakP1600times4_statesAlignment KeccakP1600_stateAlignment +#define KeccakP1600times4_statesSizeInBytes (((KeccakP1600_stateSizeInBytes+(KeccakP1600_stateAlignment-1))/KeccakP1600_stateAlignment)*KeccakP1600_stateAlignment*4) + +#define KeccakP1600times4_StaticInitialize KECCAKTIMES4_NAMESPACE(KeccakP1600times4_StaticInitialize) void KeccakP1600times4_StaticInitialize( void ); + +#define KeccakP1600times4_InitializeAll KECCAKTIMES4_NAMESPACE(KeccakP1600times4_InitializeAll) void KeccakP1600times4_InitializeAll(void *states); + +#define KeccakP1600times4_AddByte KECCAKTIMES4_NAMESPACE(KeccakP1600times4_AddByte) void KeccakP1600times4_AddByte(void *states, unsigned int instanceIndex, unsigned char data, unsigned int offset); + +#define KeccakP1600times4_AddBytes KECCAKTIMES4_NAMESPACE(KeccakP1600times4_AddBytes) void KeccakP1600times4_AddBytes(void *states, unsigned int instanceIndex, const unsigned char *data, unsigned int offset, unsigned int length); + +#define KeccakP1600times4_AddLanesAll KECCAKTIMES4_NAMESPACE(KeccakP1600times4_AddLanesAll) void KeccakP1600times4_AddLanesAll(void *states, const unsigned char *data, unsigned int laneCount, unsigned int laneOffset); + +#define KeccakP1600times4_OverwriteBytes KECCAKTIMES4_NAMESPACE(KeccakP1600times4_OverwriteBytes) void KeccakP1600times4_OverwriteBytes(void *states, unsigned int instanceIndex, const unsigned char *data, unsigned int offset, unsigned int length); + +#define KeccakP1600times4_OverwriteLanesAll KECCAKTIMES4_NAMESPACE(KeccakP1600times4_OverwriteLanesAll) void KeccakP1600times4_OverwriteLanesAll(void *states, const unsigned char *data, unsigned int laneCount, unsigned int laneOffset); + +#define KeccakP1600times4_OverwriteWithZeroes KECCAKTIMES4_NAMESPACE(KeccakP1600times4_OverwriteWithZeroes) void KeccakP1600times4_OverwriteWithZeroes(void *states, unsigned int instanceIndex, unsigned int byteCount); + +#define KeccakP1600times4_PermuteAll_4rounds KECCAKTIMES4_NAMESPACE(KeccakP1600times4_PermuteAll_4rounds) void KeccakP1600times4_PermuteAll_4rounds(void *states); + +#define KeccakP1600times4_PermuteAll_6rounds KECCAKTIMES4_NAMESPACE(KeccakP1600times4_PermuteAll_6rounds) void KeccakP1600times4_PermuteAll_6rounds(void *states); + +#define KeccakP1600times4_PermuteAll_12rounds KECCAKTIMES4_NAMESPACE(KeccakP1600times4_PermuteAll_12rounds) void KeccakP1600times4_PermuteAll_12rounds(void *states); + +#define KeccakP1600times4_PermuteAll_24rounds KECCAKTIMES4_NAMESPACE(KeccakP1600times4_PermuteAll_24rounds) void KeccakP1600times4_PermuteAll_24rounds(void *states); + +#define KeccakP1600times4_ExtractBytes KECCAKTIMES4_NAMESPACE(KeccakP1600times4_ExtractBytes) void KeccakP1600times4_ExtractBytes(const void *states, unsigned int instanceIndex, unsigned char *data, unsigned int offset, unsigned int length); + +#define KeccakP1600times4_ExtractLanesAll KECCAKTIMES4_NAMESPACE(KeccakP1600times4_ExtractLanesAll) void KeccakP1600times4_ExtractLanesAll(const void *states, unsigned char *data, unsigned int laneCount, unsigned int laneOffset); + +#define KeccakP1600times4_ExtractAndAddBytes KECCAKTIMES4_NAMESPACE(KeccakP1600times4_ExtractAndAddBytes) void KeccakP1600times4_ExtractAndAddBytes(const void *states, unsigned int instanceIndex, const unsigned char *input, unsigned char *output, unsigned int offset, unsigned int length); + +#define KeccakP1600times4_ExtractAndAddLanesAll KECCAKTIMES4_NAMESPACE(KeccakP1600times4_ExtractAndAddLanesAll) void KeccakP1600times4_ExtractAndAddLanesAll(const void *states, const unsigned char *input, unsigned char *output, unsigned int laneCount, unsigned int laneOffset); #endif diff --git a/src/common/sha3/xkcp_low/KeccakP-1600times4/serial/KeccakP-1600-times4-on1.c b/src/common/sha3/xkcp_low/KeccakP-1600times4/serial/KeccakP-1600-times4-on1.c index fb3888c19..629757054 100644 --- a/src/common/sha3/xkcp_low/KeccakP-1600times4/serial/KeccakP-1600-times4-on1.c +++ b/src/common/sha3/xkcp_low/KeccakP-1600times4/serial/KeccakP-1600-times4-on1.c @@ -22,10 +22,28 @@ Please refer to LowLevel.build for the exact list of other files it must be comb #include "KeccakP-1600-SnP.h" #define prefix KeccakP1600times4 +#define suffix serial #define PlSnP_baseParallelism 1 #define PlSnP_targetParallelism 4 #define SnP_laneLengthInBytes 8 -#define SnP KeccakP1600 +#define SnP_prefix KeccakP1600 +#define SnP_suffix KECCAK_SYMBOL_SUFFIX + +#if defined(ADD_SYMBOL_SUFFIX) +#define JOIN0(a,b,c) a##_##b##_##c +#define JOIN(a,b) JOIN0(a,b,c) + +#define SnP_Permute JOIN(SnP_prefix, Permute_24rounds, SnP_suffix) +#define SnP_Permute_12rounds JOIN(SnP_prefix, Permute_12rounds, SnP_suffix) +#define SnP_Permute_Nrounds JOIN(SnP_prefix, Permute_Nrounds, SnP_suffix) +#define PlSnP_PermuteAll JOIN(prefix, PermuteAll_24rounds, suffix) +#define PlSnP_PermuteAll_12rounds JOIN(prefix, PermuteAll_12rounds, suffix) +#define PlSnP_PermuteAll_6rounds JOIN(prefix, PermuteAll_6rounds, suffix) +#define PlSnP_PermuteAll_4rounds JOIN(prefix, PermuteAll_4rounds, suffix) + +#undef JOIN0 +#undef JOIN +#else #define SnP_Permute KeccakP1600_Permute_24rounds #define SnP_Permute_12rounds KeccakP1600_Permute_12rounds #define SnP_Permute_Nrounds KeccakP1600_Permute_Nrounds @@ -33,5 +51,6 @@ Please refer to LowLevel.build for the exact list of other files it must be comb #define PlSnP_PermuteAll_12rounds KeccakP1600times4_PermuteAll_12rounds #define PlSnP_PermuteAll_6rounds KeccakP1600times4_PermuteAll_6rounds #define PlSnP_PermuteAll_4rounds KeccakP1600times4_PermuteAll_4rounds +#endif #include "PlSnP-Fallback.inc" diff --git a/src/common/sha3/xkcp_low/KeccakP-1600times4/serial/PlSnP-Fallback.inc b/src/common/sha3/xkcp_low/KeccakP-1600times4/serial/PlSnP-Fallback.inc index e3f36f97b..7006c126b 100644 --- a/src/common/sha3/xkcp_low/KeccakP-1600times4/serial/PlSnP-Fallback.inc +++ b/src/common/sha3/xkcp_low/KeccakP-1600times4/serial/PlSnP-Fallback.inc @@ -25,46 +25,51 @@ Please refer to PlSnP-documentation.h for more details. /* expect prefix */ /* expect SnP_* */ -#define JOIN0(a, b) a ## b -#define JOIN(a, b) JOIN0(a, b) +#if defined(ADD_SYMBOL_SUFFIX) +#define JOIN0(a, b, c) a ## _ ## b ## _ ## c +#define JOIN(a, b, c) JOIN0(a, b, c) +#else +#define JOIN0(a, b) a ## _ ## b +#define JOIN(a, b, c) JOIN0(a, b) +#endif -#define PlSnP_StaticInitialize JOIN(prefix, _StaticInitialize) -#define PlSnP_InitializeAll JOIN(prefix, _InitializeAll) -#define PlSnP_AddByte JOIN(prefix, _AddByte) -#define PlSnP_AddBytes JOIN(prefix, _AddBytes) -#define PlSnP_AddLanesAll JOIN(prefix, _AddLanesAll) -#define PlSnP_OverwriteBytes JOIN(prefix, _OverwriteBytes) -#define PlSnP_OverwriteLanesAll JOIN(prefix, _OverwriteLanesAll) -#define PlSnP_OverwriteWithZeroes JOIN(prefix, _OverwriteWithZeroes) -#define PlSnP_ExtractBytes JOIN(prefix, _ExtractBytes) -#define PlSnP_ExtractLanesAll JOIN(prefix, _ExtractLanesAll) -#define PlSnP_ExtractAndAddBytes JOIN(prefix, _ExtractAndAddBytes) -#define PlSnP_ExtractAndAddLanesAll JOIN(prefix, _ExtractAndAddLanesAll) +#define PlSnP_StaticInitialize JOIN(prefix, StaticInitialize, suffix) +#define PlSnP_InitializeAll JOIN(prefix, InitializeAll, suffix) +#define PlSnP_AddByte JOIN(prefix, AddByte, suffix) +#define PlSnP_AddBytes JOIN(prefix, AddBytes, suffix) +#define PlSnP_AddLanesAll JOIN(prefix, AddLanesAll, suffix) +#define PlSnP_OverwriteBytes JOIN(prefix, OverwriteBytes, suffix) +#define PlSnP_OverwriteLanesAll JOIN(prefix, OverwriteLanesAll, suffix) +#define PlSnP_OverwriteWithZeroes JOIN(prefix, OverwriteWithZeroes, suffix) +#define PlSnP_ExtractBytes JOIN(prefix, ExtractBytes, suffix) +#define PlSnP_ExtractLanesAll JOIN(prefix, ExtractLanesAll, suffix) +#define PlSnP_ExtractAndAddBytes JOIN(prefix, ExtractAndAddBytes, suffix) +#define PlSnP_ExtractAndAddLanesAll JOIN(prefix, ExtractAndAddLanesAll, suffix) #if (PlSnP_baseParallelism == 1) - #define SnP_stateSizeInBytes JOIN(SnP, _stateSizeInBytes) - #define SnP_stateAlignment JOIN(SnP, _stateAlignment) + #define SnP_stateSizeInBytes JOIN(SnP_prefix, stateSizeInBytes, SnP_suffix) + #define SnP_stateAlignment JOIN(SnP_prefix, stateAlignment, SnP_suffix) #else - #define SnP_stateSizeInBytes JOIN(SnP, _statesSizeInBytes) - #define SnP_stateAlignment JOIN(SnP, _statesAlignment) + #define SnP_stateSizeInBytes JOIN(SnP_prefix, statesSizeInBytes, SnP_suffix) + #define SnP_stateAlignment JOIN(SnP_prefix, statesAlignment, SnP_suffix) #endif #define PlSnP_factor ((PlSnP_targetParallelism)/(PlSnP_baseParallelism)) #define SnP_stateOffset (((SnP_stateSizeInBytes+(SnP_stateAlignment-1))/SnP_stateAlignment)*SnP_stateAlignment) #define stateWithIndex(i) ((unsigned char *)states+((i)*SnP_stateOffset)) -#define SnP_StaticInitialize JOIN(SnP, _StaticInitialize) -#define SnP_Initialize JOIN(SnP, _Initialize) -#define SnP_InitializeAll JOIN(SnP, _InitializeAll) -#define SnP_AddByte JOIN(SnP, _AddByte) -#define SnP_AddBytes JOIN(SnP, _AddBytes) -#define SnP_AddLanesAll JOIN(SnP, _AddLanesAll) -#define SnP_OverwriteBytes JOIN(SnP, _OverwriteBytes) -#define SnP_OverwriteLanesAll JOIN(SnP, _OverwriteLanesAll) -#define SnP_OverwriteWithZeroes JOIN(SnP, _OverwriteWithZeroes) -#define SnP_ExtractBytes JOIN(SnP, _ExtractBytes) -#define SnP_ExtractLanesAll JOIN(SnP, _ExtractLanesAll) -#define SnP_ExtractAndAddBytes JOIN(SnP, _ExtractAndAddBytes) -#define SnP_ExtractAndAddLanesAll JOIN(SnP, _ExtractAndAddLanesAll) +#define SnP_StaticInitialize JOIN(SnP_prefix, StaticInitialize, SnP_suffix) +#define SnP_Initialize JOIN(SnP_prefix, Initialize, SnP_suffix) +#define SnP_InitializeAll JOIN(SnP_prefix, InitializeAll, SnP_suffix) +#define SnP_AddByte JOIN(SnP_prefix, AddByte, SnP_suffix) +#define SnP_AddBytes JOIN(SnP_prefix, AddBytes, SnP_suffix) +#define SnP_AddLanesAll JOIN(SnP_prefix, AddLanesAll, SnP_suffix) +#define SnP_OverwriteBytes JOIN(SnP_prefix, OverwriteBytes, SnP_suffix) +#define SnP_OverwriteLanesAll JOIN(SnP_prefix, OverwriteLanesAll, SnP_suffix) +#define SnP_OverwriteWithZeroes JOIN(SnP_prefix, OverwriteWithZeroes, SnP_suffix) +#define SnP_ExtractBytes JOIN(SnP_prefix, ExtractBytes, SnP_suffix) +#define SnP_ExtractLanesAll JOIN(SnP_prefix, ExtractLanesAll, SnP_suffix) +#define SnP_ExtractAndAddBytes JOIN(SnP_prefix, ExtractAndAddBytes, SnP_suffix) +#define SnP_ExtractAndAddLanesAll JOIN(SnP_prefix, ExtractAndAddLanesAll, SnP_suffix) void PlSnP_StaticInitialize( void ) { diff --git a/src/common/sha3/xkcp_sha3.c b/src/common/sha3/xkcp_sha3.c index 6056b0b91..504de05b1 100644 --- a/src/common/sha3/xkcp_sha3.c +++ b/src/common/sha3/xkcp_sha3.c @@ -7,9 +7,10 @@ * SPDX-License-Identifier: MIT */ -#include "KeccakP-1600-SnP.h" #include "sha3.h" +#include "xkcp_dispatch.h" + #include #include @@ -17,21 +18,60 @@ #include #include -#define KeccakF1600_Initialize KeccakP1600_Initialize -#define KeccakF1600_ExtractBytes KeccakP1600_ExtractBytes -#define KeccakF1600_AddByte KeccakP1600_AddByte -#define KeccakF1600_AddBytes KeccakP1600_AddBytes -#define KeccakF1600_StatePermute KeccakP1600_Permute_24rounds - -#define KECCAK_CTX_ALIGNMENT KeccakP1600_stateAlignment -#if KeccakP1600_stateSizeInBytes == 200 +#define KECCAK_CTX_ALIGNMENT 32 #define _KECCAK_CTX_BYTES (200+sizeof(uint64_t)) -// Round up to a multiple of alignment for C11 aligned_alloc #define KECCAK_CTX_BYTES (KECCAK_CTX_ALIGNMENT * \ ((_KECCAK_CTX_BYTES + KECCAK_CTX_ALIGNMENT - 1)/KECCAK_CTX_ALIGNMENT)) -#else -#error sha3_xkcp assumes 200 byte KeccakP1600 state + +/* The first call to Keccak_Initialize will be routed through dispatch, which + * updates all of the function pointers used below. + */ +static KeccakInitFn Keccak_Dispatch; +static KeccakInitFn *Keccak_Initialize_ptr = &Keccak_Dispatch; +static KeccakAddByteFn *Keccak_AddByte_ptr = NULL; +static KeccakAddBytesFn *Keccak_AddBytes_ptr = NULL; +static KeccakPermuteFn *Keccak_Permute_ptr = NULL; +static KeccakExtractBytesFn *Keccak_ExtractBytes_ptr = NULL; +static KeccakFastLoopAbsorbFn *Keccak_FastLoopAbsorb_ptr = NULL; + +static void Keccak_Dispatch(void *state) { +// TODO: Simplify this when we have a Windows-compatible AVX2 implementation of SHA3 +#if defined(OQS_DIST_X86_64_BUILD) +#if defined(OQS_ENABLE_SHA3_xkcp_low_avx2) + if (OQS_CPU_has_extension(OQS_CPU_EXT_AVX2)) { + Keccak_Initialize_ptr = &KeccakP1600_Initialize_avx2; + Keccak_AddByte_ptr = &KeccakP1600_AddByte_avx2; + Keccak_AddBytes_ptr = &KeccakP1600_AddBytes_avx2; + Keccak_Permute_ptr = &KeccakP1600_Permute_24rounds_avx2; + Keccak_ExtractBytes_ptr = &KeccakP1600_ExtractBytes_avx2; + Keccak_FastLoopAbsorb_ptr = &KeccakF1600_FastLoop_Absorb_avx2; + } else { + Keccak_Initialize_ptr = &KeccakP1600_Initialize_plain64; + Keccak_AddByte_ptr = &KeccakP1600_AddByte_plain64; + Keccak_AddBytes_ptr = &KeccakP1600_AddBytes_plain64; + Keccak_Permute_ptr = &KeccakP1600_Permute_24rounds_plain64; + Keccak_ExtractBytes_ptr = &KeccakP1600_ExtractBytes_plain64; + Keccak_FastLoopAbsorb_ptr = &KeccakF1600_FastLoop_Absorb_plain64; + } +#else // Windows + Keccak_Initialize_ptr = &KeccakP1600_Initialize_plain64; + Keccak_AddByte_ptr = &KeccakP1600_AddByte_plain64; + Keccak_AddBytes_ptr = &KeccakP1600_AddBytes_plain64; + Keccak_Permute_ptr = &KeccakP1600_Permute_24rounds_plain64; + Keccak_ExtractBytes_ptr = &KeccakP1600_ExtractBytes_plain64; + Keccak_FastLoopAbsorb_ptr = &KeccakF1600_FastLoop_Absorb_plain64; #endif +#else + Keccak_Initialize_ptr = &KeccakP1600_Initialize; + Keccak_AddByte_ptr = &KeccakP1600_AddByte; + Keccak_AddBytes_ptr = &KeccakP1600_AddBytes; + Keccak_Permute_ptr = &KeccakP1600_Permute_24rounds; + Keccak_ExtractBytes_ptr = &KeccakP1600_ExtractBytes; + Keccak_FastLoopAbsorb_ptr = &KeccakF1600_FastLoop_Absorb; +#endif + + (*Keccak_Initialize_ptr)(state); +} /************************************************* * Name: keccak_inc_reset @@ -44,7 +84,7 @@ * that have not been permuted, or not-yet-squeezed bytes. **************************************************/ static void keccak_inc_reset(uint64_t *s) { - KeccakF1600_Initialize(s); + (*Keccak_Initialize_ptr)(s); s[25] = 0; } @@ -68,8 +108,8 @@ static void keccak_inc_absorb(uint64_t *s, uint32_t r, const uint8_t *m, if (s[25] && mlen + s[25] >= r) { c = r - s[25]; - KeccakF1600_AddBytes(s, m, (unsigned int)s[25], (unsigned int)c); - KeccakF1600_StatePermute(s); + (*Keccak_AddBytes_ptr)(s, m, (unsigned int)s[25], (unsigned int)c); + (*Keccak_Permute_ptr)(s); mlen -= c; m += c; s[25] = 0; @@ -77,20 +117,20 @@ static void keccak_inc_absorb(uint64_t *s, uint32_t r, const uint8_t *m, #ifdef KeccakF1600_FastLoop_supported if (mlen >= r) { - c = KeccakF1600_FastLoop_Absorb(s, r / 8, m, mlen); + c = (*Keccak_FastLoop_Absorb_ptr)(s, r / 8, m, mlen); mlen -= c; m += c; } #else while (mlen >= r) { - KeccakF1600_AddBytes(s, m, 0, r); - KeccakF1600_StatePermute(s); + (*Keccak_AddBytes_ptr)(s, m, 0, r); + (*Keccak_Permute_ptr)(s); mlen -= r; m += r; } #endif - KeccakF1600_AddBytes(s, m, (unsigned int)s[25], (unsigned int)mlen); + (*Keccak_AddBytes_ptr)(s, m, (unsigned int)s[25], (unsigned int)mlen); s[25] += mlen; } @@ -110,8 +150,8 @@ static void keccak_inc_absorb(uint64_t *s, uint32_t r, const uint8_t *m, static void keccak_inc_finalize(uint64_t *s, uint32_t r, uint8_t p) { /* After keccak_inc_absorb, we are guaranteed that s[25] < r, so we can always use one more byte for p in the current state. */ - KeccakF1600_AddByte(s, p, (unsigned int)s[25]); - KeccakF1600_AddByte(s, 0x80, (unsigned int)(r - 1)); + (*Keccak_AddByte_ptr)(s, p, (unsigned int)s[25]); + (*Keccak_AddByte_ptr)(s, 0x80, (unsigned int)(r - 1)); s[25] = 0; } @@ -131,13 +171,13 @@ static void keccak_inc_finalize(uint64_t *s, uint32_t r, uint8_t p) { static void keccak_inc_squeeze(uint8_t *h, size_t outlen, uint64_t *s, uint32_t r) { while (outlen > s[25]) { - KeccakF1600_ExtractBytes(s, h, (unsigned int)(r - s[25]), (unsigned int)s[25]); - KeccakF1600_StatePermute(s); + (*Keccak_ExtractBytes_ptr)(s, h, (unsigned int)(r - s[25]), (unsigned int)s[25]); + (*Keccak_Permute_ptr)(s); h += s[25]; outlen -= s[25]; s[25] = r; } - KeccakF1600_ExtractBytes(s, h, (unsigned int)(r - s[25]), (unsigned int)outlen); + (*Keccak_ExtractBytes_ptr)(s, h, (unsigned int)(r - s[25]), (unsigned int)outlen); s[25] -= outlen; } diff --git a/src/common/sha3/xkcp_sha3x4.c b/src/common/sha3/xkcp_sha3x4.c index 3f1efbbc7..2a6d404f9 100644 --- a/src/common/sha3/xkcp_sha3x4.c +++ b/src/common/sha3/xkcp_sha3x4.c @@ -1,9 +1,10 @@ // SPDX-License-Identifier: MIT -#include "KeccakP-1600-times4-SnP.h" #include "sha3.h" #include "sha3x4.h" +#include "xkcp_dispatch.h" + #include #include @@ -12,25 +13,58 @@ #include #include -#define KECCAK_X4_CTX_ALIGNMENT KeccakP1600times4_statesAlignment - -#if KeccakP1600times4_statesSizeInBytes == 800 +#define KECCAK_X4_CTX_ALIGNMENT 32 #define _KECCAK_X4_CTX_BYTES (800+sizeof(uint64_t)) -// Round up to a multiple of alignment for C11 aligned_alloc #define KECCAK_X4_CTX_BYTES (KECCAK_X4_CTX_ALIGNMENT * \ ((_KECCAK_X4_CTX_BYTES + KECCAK_X4_CTX_ALIGNMENT - 1)/KECCAK_X4_CTX_ALIGNMENT)) + +/* The first call to Keccak_Initialize will be routed through dispatch, which + * updates all of the function pointers used below. + */ +static KeccakX4InitFn Keccak_X4_Dispatch; +static KeccakX4InitFn *Keccak_X4_Initialize_ptr = &Keccak_X4_Dispatch; +static KeccakX4AddByteFn *Keccak_X4_AddByte_ptr = NULL; +static KeccakX4AddBytesFn *Keccak_X4_AddBytes_ptr = NULL; +static KeccakX4PermuteFn *Keccak_X4_Permute_ptr = NULL; +static KeccakX4ExtractBytesFn *Keccak_X4_ExtractBytes_ptr = NULL; + +static void Keccak_X4_Dispatch(void *state) { +// TODO: Simplify this when we have a Windows-compatible AVX2 implementation of SHA3 +#if defined(OQS_DIST_X86_64_BUILD) +#if defined(OQS_ENABLE_SHA3_xkcp_low_avx2) + if (OQS_CPU_has_extension(OQS_CPU_EXT_AVX2)) { + Keccak_X4_Initialize_ptr = &KeccakP1600times4_InitializeAll_avx2; + Keccak_X4_AddByte_ptr = &KeccakP1600times4_AddByte_avx2; + Keccak_X4_AddBytes_ptr = &KeccakP1600times4_AddBytes_avx2; + Keccak_X4_Permute_ptr = &KeccakP1600times4_PermuteAll_24rounds_avx2; + Keccak_X4_ExtractBytes_ptr = &KeccakP1600times4_ExtractBytes_avx2; + } else { + Keccak_X4_Initialize_ptr = &KeccakP1600times4_InitializeAll_serial; + Keccak_X4_AddByte_ptr = &KeccakP1600times4_AddByte_serial; + Keccak_X4_AddBytes_ptr = &KeccakP1600times4_AddBytes_serial; + Keccak_X4_Permute_ptr = &KeccakP1600times4_PermuteAll_24rounds_serial; + Keccak_X4_ExtractBytes_ptr = &KeccakP1600times4_ExtractBytes_serial; + } +#else // Windows + Keccak_X4_Initialize_ptr = &KeccakP1600times4_InitializeAll_serial; + Keccak_X4_AddByte_ptr = &KeccakP1600times4_AddByte_serial; + Keccak_X4_AddBytes_ptr = &KeccakP1600times4_AddBytes_serial; + Keccak_X4_Permute_ptr = &KeccakP1600times4_PermuteAll_24rounds_serial; + Keccak_X4_ExtractBytes_ptr = &KeccakP1600times4_ExtractBytes_serial; +#endif #else -#error sha3x4_xkcp assumes 800 byte KeccakP1600times4 state + Keccak_X4_Initialize_ptr = &KeccakP1600times4_InitializeAll; + Keccak_X4_AddByte_ptr = &KeccakP1600times4_AddByte; + Keccak_X4_AddBytes_ptr = &KeccakP1600times4_AddBytes; + Keccak_X4_Permute_ptr = &KeccakP1600times4_PermuteAll_24rounds; + Keccak_X4_ExtractBytes_ptr = &KeccakP1600times4_ExtractBytes; #endif -#define KeccakF1600times4_InitializeAll KeccakP1600times4_InitializeAll -#define KeccakF1600times4_ExtractBytes KeccakP1600times4_ExtractBytes -#define KeccakF1600times4_AddByte KeccakP1600times4_AddByte -#define KeccakF1600times4_AddBytes KeccakP1600times4_AddBytes -#define KeccakF1600times4_StatePermuteAll KeccakP1600times4_PermuteAll_24rounds + (*Keccak_X4_Initialize_ptr)(state); +} static void keccak_x4_inc_reset(uint64_t *s) { - KeccakF1600times4_InitializeAll(s); + (*Keccak_X4_Initialize_ptr)(s); s[100] = 0; } @@ -40,11 +74,11 @@ static void keccak_x4_inc_absorb(uint64_t *s, uint32_t r, if (s[100] && inlen + s[100] >= r) { c = r - s[100]; - KeccakF1600times4_AddBytes(s, 0, in0, (unsigned int)s[100], (unsigned int)c); - KeccakF1600times4_AddBytes(s, 1, in1, (unsigned int)s[100], (unsigned int)c); - KeccakF1600times4_AddBytes(s, 2, in2, (unsigned int)s[100], (unsigned int)c); - KeccakF1600times4_AddBytes(s, 3, in3, (unsigned int)s[100], (unsigned int)c); - KeccakF1600times4_StatePermuteAll(s); + (*Keccak_X4_AddBytes_ptr)(s, 0, in0, (unsigned int)s[100], (unsigned int)c); + (*Keccak_X4_AddBytes_ptr)(s, 1, in1, (unsigned int)s[100], (unsigned int)c); + (*Keccak_X4_AddBytes_ptr)(s, 2, in2, (unsigned int)s[100], (unsigned int)c); + (*Keccak_X4_AddBytes_ptr)(s, 3, in3, (unsigned int)s[100], (unsigned int)c); + (*Keccak_X4_Permute_ptr)(s); inlen -= c; in0 += c; in1 += c; @@ -54,11 +88,11 @@ static void keccak_x4_inc_absorb(uint64_t *s, uint32_t r, } while (inlen >= r) { - KeccakF1600times4_AddBytes(s, 0, in0, 0, (unsigned int)r); - KeccakF1600times4_AddBytes(s, 1, in1, 0, (unsigned int)r); - KeccakF1600times4_AddBytes(s, 2, in2, 0, (unsigned int)r); - KeccakF1600times4_AddBytes(s, 3, in3, 0, (unsigned int)r); - KeccakF1600times4_StatePermuteAll(s); + (*Keccak_X4_AddBytes_ptr)(s, 0, in0, 0, (unsigned int)r); + (*Keccak_X4_AddBytes_ptr)(s, 1, in1, 0, (unsigned int)r); + (*Keccak_X4_AddBytes_ptr)(s, 2, in2, 0, (unsigned int)r); + (*Keccak_X4_AddBytes_ptr)(s, 3, in3, 0, (unsigned int)r); + (*Keccak_X4_Permute_ptr)(s); inlen -= r; in0 += r; in1 += r; @@ -66,23 +100,23 @@ static void keccak_x4_inc_absorb(uint64_t *s, uint32_t r, in3 += r; } - KeccakF1600times4_AddBytes(s, 0, in0, (unsigned int)s[100], (unsigned int)inlen); - KeccakF1600times4_AddBytes(s, 1, in1, (unsigned int)s[100], (unsigned int)inlen); - KeccakF1600times4_AddBytes(s, 2, in2, (unsigned int)s[100], (unsigned int)inlen); - KeccakF1600times4_AddBytes(s, 3, in3, (unsigned int)s[100], (unsigned int)inlen); + (*Keccak_X4_AddBytes_ptr)(s, 0, in0, (unsigned int)s[100], (unsigned int)inlen); + (*Keccak_X4_AddBytes_ptr)(s, 1, in1, (unsigned int)s[100], (unsigned int)inlen); + (*Keccak_X4_AddBytes_ptr)(s, 2, in2, (unsigned int)s[100], (unsigned int)inlen); + (*Keccak_X4_AddBytes_ptr)(s, 3, in3, (unsigned int)s[100], (unsigned int)inlen); s[100] += inlen; } static void keccak_x4_inc_finalize(uint64_t *s, uint32_t r, uint8_t p) { - KeccakF1600times4_AddByte(s, 0, p, (unsigned int)s[100]); - KeccakF1600times4_AddByte(s, 1, p, (unsigned int)s[100]); - KeccakF1600times4_AddByte(s, 2, p, (unsigned int)s[100]); - KeccakF1600times4_AddByte(s, 3, p, (unsigned int)s[100]); + (*Keccak_X4_AddByte_ptr)(s, 0, p, (unsigned int)s[100]); + (*Keccak_X4_AddByte_ptr)(s, 1, p, (unsigned int)s[100]); + (*Keccak_X4_AddByte_ptr)(s, 2, p, (unsigned int)s[100]); + (*Keccak_X4_AddByte_ptr)(s, 3, p, (unsigned int)s[100]); - KeccakF1600times4_AddByte(s, 0, 0x80, (unsigned int)(r - 1)); - KeccakF1600times4_AddByte(s, 1, 0x80, (unsigned int)(r - 1)); - KeccakF1600times4_AddByte(s, 2, 0x80, (unsigned int)(r - 1)); - KeccakF1600times4_AddByte(s, 3, 0x80, (unsigned int)(r - 1)); + (*Keccak_X4_AddByte_ptr)(s, 0, 0x80, (unsigned int)(r - 1)); + (*Keccak_X4_AddByte_ptr)(s, 1, 0x80, (unsigned int)(r - 1)); + (*Keccak_X4_AddByte_ptr)(s, 2, 0x80, (unsigned int)(r - 1)); + (*Keccak_X4_AddByte_ptr)(s, 3, 0x80, (unsigned int)(r - 1)); s[100] = 0; } @@ -91,11 +125,11 @@ static void keccak_x4_inc_squeeze(uint8_t *out0, uint8_t *out1, uint8_t *out2, u size_t outlen, uint64_t *s, uint32_t r) { while (outlen > s[100]) { - KeccakF1600times4_ExtractBytes(s, 0, out0, (unsigned int)(r - s[100]), (unsigned int)s[100]); - KeccakF1600times4_ExtractBytes(s, 1, out1, (unsigned int)(r - s[100]), (unsigned int)s[100]); - KeccakF1600times4_ExtractBytes(s, 2, out2, (unsigned int)(r - s[100]), (unsigned int)s[100]); - KeccakF1600times4_ExtractBytes(s, 3, out3, (unsigned int)(r - s[100]), (unsigned int)s[100]); - KeccakF1600times4_StatePermuteAll(s); + (*Keccak_X4_ExtractBytes_ptr)(s, 0, out0, (unsigned int)(r - s[100]), (unsigned int)s[100]); + (*Keccak_X4_ExtractBytes_ptr)(s, 1, out1, (unsigned int)(r - s[100]), (unsigned int)s[100]); + (*Keccak_X4_ExtractBytes_ptr)(s, 2, out2, (unsigned int)(r - s[100]), (unsigned int)s[100]); + (*Keccak_X4_ExtractBytes_ptr)(s, 3, out3, (unsigned int)(r - s[100]), (unsigned int)s[100]); + (*Keccak_X4_Permute_ptr)(s); out0 += s[100]; out1 += s[100]; out2 += s[100]; @@ -104,10 +138,10 @@ static void keccak_x4_inc_squeeze(uint8_t *out0, uint8_t *out1, uint8_t *out2, u s[100] = r; } - KeccakF1600times4_ExtractBytes(s, 0, out0, (unsigned int)(r - s[100]), (unsigned int)outlen); - KeccakF1600times4_ExtractBytes(s, 1, out1, (unsigned int)(r - s[100]), (unsigned int)outlen); - KeccakF1600times4_ExtractBytes(s, 2, out2, (unsigned int)(r - s[100]), (unsigned int)outlen); - KeccakF1600times4_ExtractBytes(s, 3, out3, (unsigned int)(r - s[100]), (unsigned int)outlen); + (*Keccak_X4_ExtractBytes_ptr)(s, 0, out0, (unsigned int)(r - s[100]), (unsigned int)outlen); + (*Keccak_X4_ExtractBytes_ptr)(s, 1, out1, (unsigned int)(r - s[100]), (unsigned int)outlen); + (*Keccak_X4_ExtractBytes_ptr)(s, 2, out2, (unsigned int)(r - s[100]), (unsigned int)outlen); + (*Keccak_X4_ExtractBytes_ptr)(s, 3, out3, (unsigned int)(r - s[100]), (unsigned int)outlen); s[100] -= outlen; }