diff --git a/.CMake/compiler_opts.cmake b/.CMake/compiler_opts.cmake
index a648303fc..baf593dd3 100644
--- a/.CMake/compiler_opts.cmake
+++ b/.CMake/compiler_opts.cmake
@@ -1,5 +1,8 @@
 # SPDX-License-Identifier: MIT
 
+option(OQS_PORTABLE_BUILD "Ensure the resulting library is portable. This implies having run-time checks for CPU extensions." ON)
+option(OQS_BUILD_ONLY_LIB "Build only liboqs and do not expose build targets for tests, documentation, and pretty-printing available." OFF)
+
 if(CMAKE_C_COMPILER_ID MATCHES "Clang")
     add_compile_options(-Werror)
     add_compile_options(-Wall)
@@ -13,6 +16,7 @@ if(CMAKE_C_COMPILER_ID MATCHES "Clang")
         set(OQS_USE_PTHREADS_IN_TESTS 1)
     endif()
 
+    option(OQS_USE_CPU_EXTENSIONS "Enable compile and run-time support for CPU extensions such as AVX2, SSE, etc." ON)
     if(OQS_USE_CPU_EXTENSIONS)
         include(${CMAKE_CURRENT_LIST_DIR}/gcc_clang_intrinsics.cmake)
     endif()
@@ -67,6 +71,7 @@ elseif(CMAKE_C_COMPILER_ID STREQUAL "GNU")
         set(OQS_USE_PTHREADS_IN_TESTS 1)
     endif()
 
+    option(OQS_USE_CPU_EXTENSIONS "Enable compile and run-time support for CPU extensions such as AVX2, SSE, etc." ON)
     if(OQS_USE_CPU_EXTENSIONS)
         include(${CMAKE_CURRENT_LIST_DIR}/gcc_clang_intrinsics.cmake)
     endif()
diff --git a/CMakeLists.txt b/CMakeLists.txt
index a26247176..7ad2707b3 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -35,14 +35,9 @@ endif()
 
 if(WIN32)
     set(CMAKE_GENERATOR_CC cl)
 endif()
-
-option(OQS_USE_CPU_EXTENSIONS "Enable compile and run-time support for CPU extensions such as AVX2, SSE, etc." ON)
-option(OQS_PORTABLE_BUILD "Ensure the resulting library is portable. This implies having run-time checks for CPU extensions." ON)
-option(OQS_BUILD_ONLY_LIB "Build only liboqs and do not expose build targets for tests, documentation, and pretty-printing available." OFF)
 include(.CMake/compiler_opts.cmake)
 include(.CMake/alg_support.cmake)
-
 if(OQS_USE_OPENSSL)
     if(NOT DEFINED OPENSSL_ROOT_DIR)
         if(${CMAKE_HOST_SYSTEM_NAME} STREQUAL "Darwin")
diff --git a/README.md b/README.md
index f699d520f..b9f7ebf6a 100644
--- a/README.md
+++ b/README.md
@@ -172,6 +172,7 @@ liboqs includes some third party libraries or modules that are licensed differen
 - `.CMake/CMakeDependentOption.cmake`: BSD 3-Clause License
 - `src/common/common.c`: includes portions which are Apache License v2.0
 - `src/common/crypto/aes/aes_c.c`: public domain or any OSI-approved license
+- `src/common/aes/aes*_ni.c`: public domain
 - `src/common/crypto/sha2/sha2_c.c`: public domain
 - `src/common/crypto/sha3/fips202.c`: public domain
 - `src/common/crypto/sha3/keccak4x`: CC0 (public domain), except `brg_endian.h`
diff --git a/src/common/CMakeLists.txt b/src/common/CMakeLists.txt
index c0dcd3706..6ba925e3c 100644
--- a/src/common/CMakeLists.txt
+++ b/src/common/CMakeLists.txt
@@ -13,7 +13,13 @@ endif()
 if(OQS_USE_AES_OPENSSL)
     set(AES_IMPL aes/aes_ossl.c)
 else()
-    set(AES_IMPL aes/aes_c.c)
+    set(AES_IMPL aes/aes.c aes/aes_c.c)
+    if(OQS_USE_AES_INSTRUCTIONS)
+        # The AES-NI back end uses AES intrinsics, so only these two sources are
+        # compiled with -maes; aes.c selects between the back ends.
+        list(APPEND AES_IMPL aes/aes128_ni.c aes/aes256_ni.c)
+        set_source_files_properties(aes/aes128_ni.c aes/aes256_ni.c PROPERTIES COMPILE_FLAGS -maes)
+    endif()
 endif()
 
 if(OQS_USE_SHA2_OPENSSL)
diff --git a/src/common/aes/aes.c b/src/common/aes/aes.c
new file mode 100644
index 000000000..e6df793ce
--- /dev/null
+++ b/src/common/aes/aes.c
@@ -0,0 +1,246 @@
+// SPDX-License-Identifier: MIT
+
+#include <assert.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include <oqs/common.h>
+
+#include "aes.h"
+#include "aes_local.h"
+
+/*
+ * The AES-NI back end (aes128_ni.c, aes256_ni.c) is only added to the build
+ * when CMake sets OQS_USE_AES_INSTRUCTIONS, so its symbols must only be
+ * referenced in that case; otherwise the library would fail to link.
+ * NOTE(review): assumes OQS_USE_AES_INSTRUCTIONS is exported to C through the
+ * generated configuration header -- confirm.
+ */
+#if defined(OQS_USE_CPU_EXTENSIONS) && defined(OQS_USE_AES_INSTRUCTIONS)
+#define AES_NI_COMPILED_IN 1
+#endif
+
+/*
+ * C_OR_NI(stmt_c, stmt_ni) executes exactly one of its two statements:
+ *  - AES-NI compiled in, portable build: chosen at run time from the CPU;
+ *  - AES-NI compiled in, non-portable build: always stmt_ni;
+ *  - AES-NI not compiled in: always stmt_c.
+ * The portable variant declares a local variable, so use it at most once per
+ * scope.
+ */
+#if defined(AES_NI_COMPILED_IN) && defined(OQS_PORTABLE_BUILD)
+#define C_OR_NI(stmt_c, stmt_ni) \
+	OQS_CPU_EXTENSIONS available_cpu_extensions = OQS_get_available_CPU_extensions(); \
+	if (available_cpu_extensions.AES_ENABLED) { \
+		stmt_ni; \
+	} else { \
+		stmt_c; \
+	}
+#elif defined(AES_NI_COMPILED_IN) /* && !defined(OQS_PORTABLE_BUILD) */
+#define C_OR_NI(stmt_c, stmt_ni) \
+	stmt_ni;
+#else /* !defined(AES_NI_COMPILED_IN) */
+#define C_OR_NI(stmt_c, stmt_ni) \
+	stmt_c;
+#endif
+
+void OQS_AES128_ECB_load_schedule(const uint8_t *key, void **_schedule, int for_encryption) {
+	C_OR_NI(
+	    oqs_aes128_load_schedule_c(key, _schedule, for_encryption),
+	    oqs_aes128_load_schedule_ni(key, _schedule, for_encryption)
+	)
+}
+
+void OQS_AES128_free_schedule(void *schedule) {
+	C_OR_NI(
+	    oqs_aes128_free_schedule_c(schedule),
+	    oqs_aes128_free_schedule_ni(schedule)
+	)
+}
+
+void OQS_AES256_ECB_load_schedule(const uint8_t *key, void **_schedule, int for_encryption) {
+	C_OR_NI(
+	    oqs_aes256_load_schedule_c(key, _schedule, for_encryption),
+	    oqs_aes256_load_schedule_ni(key, _schedule, for_encryption)
+	)
+}
+
+void OQS_AES256_CTR_load_schedule(const uint8_t *key, void **_schedule) {
+	OQS_AES256_ECB_load_schedule(key, _schedule, 1);
+}
+
+void OQS_AES256_free_schedule(void *schedule) {
+	C_OR_NI(
+	    oqs_aes256_free_schedule_c(schedule),
+	    oqs_aes256_free_schedule_ni(schedule)
+	)
+}
+
+void OQS_AES128_ECB_enc(const uint8_t *plaintext, const size_t plaintext_len, const uint8_t *key, uint8_t *ciphertext) {
+	void *schedule = NULL;
+	OQS_AES128_ECB_load_schedule(key, &schedule, 1);
+	OQS_AES128_ECB_enc_sch(plaintext, plaintext_len, schedule, ciphertext);
+	OQS_AES128_free_schedule(schedule);
+}
+
+inline void oqs_aes128_ecb_enc_sch_c(const uint8_t *plaintext, const size_t plaintext_len, const void *schedule, uint8_t *ciphertext) {
+	assert(plaintext_len % 16 == 0);
+	for (size_t block = 0; block < plaintext_len / 16; block++) {
+		oqs_aes128_enc_sch_block_c(plaintext + (16 * block), schedule, ciphertext + (16 * block));
+	}
+}
+
+#if defined(AES_NI_COMPILED_IN)
+inline void oqs_aes128_ecb_enc_sch_ni(const uint8_t *plaintext, const size_t plaintext_len, const void *schedule, uint8_t *ciphertext) {
+	assert(plaintext_len % 16 == 0);
+	for (size_t block = 0; block < plaintext_len / 16; block++) {
+		oqs_aes128_enc_sch_block_ni(plaintext + (16 * block), schedule, ciphertext + (16 * block));
+	}
+}
+#endif
+
+void OQS_AES128_ECB_enc_sch(const uint8_t *plaintext, const size_t plaintext_len, const void *schedule, uint8_t *ciphertext) {
+	C_OR_NI(
+	    oqs_aes128_ecb_enc_sch_c(plaintext, plaintext_len, schedule, ciphertext),
+	    oqs_aes128_ecb_enc_sch_ni(plaintext, plaintext_len, schedule, ciphertext)
+	)
+}
+
+inline void oqs_aes128_ecb_dec_sch_c(const uint8_t *ciphertext, const size_t ciphertext_len, const void *schedule, uint8_t *plaintext) {
+	assert(ciphertext_len % 16 == 0);
+	for (size_t block = 0; block < ciphertext_len / 16; block++) {
+		oqs_aes128_dec_sch_block_c(ciphertext + (16 * block), schedule, plaintext + (16 * block));
+	}
+}
+
+#if defined(AES_NI_COMPILED_IN)
+inline void oqs_aes128_ecb_dec_sch_ni(const uint8_t *ciphertext, const size_t ciphertext_len, const void *schedule, uint8_t *plaintext) {
+	assert(ciphertext_len % 16 == 0);
+	for (size_t block = 0; block < ciphertext_len / 16; block++) {
+		oqs_aes128_dec_sch_block_ni(ciphertext + (16 * block), schedule, plaintext + (16 * block));
+	}
+}
+#endif
+
+void OQS_AES128_ECB_dec(const uint8_t *ciphertext, const size_t ciphertext_len, const uint8_t *key, uint8_t *plaintext) {
+	void *schedule = NULL;
+	OQS_AES128_ECB_load_schedule(key, &schedule, 0);
+	OQS_AES128_ECB_dec_sch(ciphertext, ciphertext_len, schedule, plaintext);
+	OQS_AES128_free_schedule(schedule);
+}
+
+void OQS_AES128_ECB_dec_sch(const uint8_t *ciphertext, const size_t ciphertext_len, const void *schedule, uint8_t *plaintext) {
+	C_OR_NI(
+	    oqs_aes128_ecb_dec_sch_c(ciphertext, ciphertext_len, schedule, plaintext),
+	    oqs_aes128_ecb_dec_sch_ni(ciphertext, ciphertext_len, schedule, plaintext)
+	)
+}
+
+void OQS_AES256_ECB_enc(const uint8_t *plaintext, const size_t plaintext_len, const uint8_t *key, uint8_t *ciphertext) {
+	void *schedule = NULL;
+	OQS_AES256_ECB_load_schedule(key, &schedule, 1);
+	OQS_AES256_ECB_enc_sch(plaintext, plaintext_len, schedule, ciphertext);
+	OQS_AES256_free_schedule(schedule);
+}
+
+inline void oqs_aes256_ecb_enc_sch_c(const uint8_t *plaintext, const size_t plaintext_len, const void *schedule, uint8_t *ciphertext) {
+	assert(plaintext_len % 16 == 0);
+	for (size_t block = 0; block < plaintext_len / 16; block++) {
+		oqs_aes256_enc_sch_block_c(plaintext + (16 * block), schedule, ciphertext + (16 * block));
+	}
+}
+
+#if defined(AES_NI_COMPILED_IN)
+inline void oqs_aes256_ecb_enc_sch_ni(const uint8_t *plaintext, const size_t plaintext_len, const void *schedule, uint8_t *ciphertext) {
+	assert(plaintext_len % 16 == 0);
+	for (size_t block = 0; block < plaintext_len / 16; block++) {
+		oqs_aes256_enc_sch_block_ni(plaintext + (16 * block), schedule, ciphertext + (16 * block));
+	}
+}
+#endif
+
+void OQS_AES256_ECB_enc_sch(const uint8_t *plaintext, const size_t plaintext_len, const void *schedule, uint8_t *ciphertext) {
+	C_OR_NI(
+	    oqs_aes256_ecb_enc_sch_c(plaintext, plaintext_len, schedule, ciphertext),
+	    oqs_aes256_ecb_enc_sch_ni(plaintext, plaintext_len, schedule, ciphertext)
+	)
+}
+
+void OQS_AES256_ECB_dec(const uint8_t *ciphertext, const size_t ciphertext_len, const uint8_t *key, uint8_t *plaintext) {
+	void *schedule = NULL;
+	OQS_AES256_ECB_load_schedule(key, &schedule, 0);
+	OQS_AES256_ECB_dec_sch(ciphertext, ciphertext_len, schedule, plaintext);
+	OQS_AES256_free_schedule(schedule);
+}
+
+inline void oqs_aes256_ecb_dec_sch_c(const uint8_t *ciphertext, const size_t ciphertext_len, const void *schedule, uint8_t *plaintext) {
+	assert(ciphertext_len % 16 == 0);
+	for (size_t block = 0; block < ciphertext_len / 16; block++) {
+		oqs_aes256_dec_sch_block_c(ciphertext + (16 * block), schedule, plaintext + (16 * block));
+	}
+}
+
+#if defined(AES_NI_COMPILED_IN)
+inline void oqs_aes256_ecb_dec_sch_ni(const uint8_t *ciphertext, const size_t ciphertext_len, const void *schedule, uint8_t *plaintext) {
+	assert(ciphertext_len % 16 == 0);
+	for (size_t block = 0; block < ciphertext_len / 16; block++) {
+		oqs_aes256_dec_sch_block_ni(ciphertext + (16 * block), schedule, plaintext + (16 * block));
+	}
+}
+#endif
+
+void OQS_AES256_ECB_dec_sch(const uint8_t *ciphertext, const size_t ciphertext_len, const void *schedule, uint8_t *plaintext) {
+	C_OR_NI(
+	    oqs_aes256_ecb_dec_sch_c(ciphertext, ciphertext_len, schedule, plaintext),
+	    oqs_aes256_ecb_dec_sch_ni(ciphertext, ciphertext_len, schedule, plaintext)
+	)
+}
+
+static inline uint32_t UINT32_TO_BE(const uint32_t x) {
+	union {
+		uint32_t val;
+		uint8_t bytes[4];
+	} y;
+	y.bytes[0] = (x >> 24) & 0xFF;
+	y.bytes[1] = (x >> 16) & 0xFF;
+	y.bytes[2] = (x >> 8) & 0xFF;
+	y.bytes[3] = x & 0xFF;
+	return y.val;
+}
+#define BE_TO_UINT32(n) (uint32_t)((((uint8_t *) &(n))[0] << 24) | (((uint8_t *) &(n))[1] << 16) | (((uint8_t *) &(n))[2] << 8) | (((uint8_t *) &(n))[3] << 0))
+
+void OQS_AES256_CTR_sch(const uint8_t *iv, size_t iv_len, const void *schedule, uint8_t *out, size_t out_len) {
+	uint8_t block[16];
+	uint32_t ctr;
+	uint32_t ctr_be;
+	memcpy(block, iv, 12);
+	if (iv_len == 12) {
+		ctr = 0;
+	} else if (iv_len == 16) {
+		memcpy(&ctr_be, &iv[12], 4);
+		ctr = BE_TO_UINT32(ctr_be);
+	} else {
+		exit(EXIT_FAILURE);
+	}
+	while (out_len >= 16) {
+		ctr_be = UINT32_TO_BE(ctr);
+		memcpy(&block[12], (uint8_t *) &ctr_be, 4);
+		C_OR_NI(
+		    oqs_aes256_enc_sch_block_c(block, schedule, out),
+		    oqs_aes256_enc_sch_block_ni(block, schedule, out)
+		)
+		out += 16;
+		out_len -= 16;
+		ctr++;
+	}
+	if (out_len > 0) {
+		uint8_t tmp[16];
+		ctr_be = UINT32_TO_BE(ctr);
+		memcpy(&block[12], (uint8_t *) &ctr_be, 4);
+		C_OR_NI(
+		    oqs_aes256_enc_sch_block_c(block, schedule, tmp),
+		    oqs_aes256_enc_sch_block_ni(block, schedule, tmp)
+		)
+		memcpy(out, tmp, out_len);
+	}
+}
diff --git a/src/common/aes/aes128_ni.c b/src/common/aes/aes128_ni.c
new file mode 100644
index 000000000..b655235bb
--- /dev/null
+++ b/src/common/aes/aes128_ni.c
@@ -0,0 +1,128 @@
+// SPDX-License-Identifier: Public domain
+// Based on public domain code by Romain Dolbeau
+// http://dolbeau.name/dolbeau/crypto/crypto.html
+
+#include <stdint.h>
+#include <stdlib.h>
+
+#include <immintrin.h>
+
+#include <oqs/common.h>
+
+#include "aes_local.h"
+
+// From crypto_core/aes128encrypt/dolbeau/aesenc-int
+static inline void aes128ni_setkey_encrypt(const unsigned char *key, __m128i rkeys[11]) {
+	__m128i key0 = _mm_loadu_si128((const __m128i *)(key + 0));
+	__m128i temp0, temp1, temp4;
+	int idx = 0;
+
+	temp0 = key0;
+
+	/* blockshift-based block by Cedric Bourrasset */
+#define BLOCK1(IMM)                              \
+	temp1 = _mm_aeskeygenassist_si128(temp0, IMM); \
+	rkeys[idx++] = temp0;                          \
+	temp4 = _mm_slli_si128(temp0,4);               \
+	temp0 = _mm_xor_si128(temp0,temp4);            \
+	temp4 = _mm_slli_si128(temp0,8);               \
+	temp0 = _mm_xor_si128(temp0,temp4);            \
+	temp1 = _mm_shuffle_epi32(temp1,0xff);         \
+	temp0 = _mm_xor_si128(temp0,temp1)
+
+	BLOCK1(0x01);
+	BLOCK1(0x02);
+	BLOCK1(0x04);
+	BLOCK1(0x08);
+	BLOCK1(0x10);
+	BLOCK1(0x20);
+	BLOCK1(0x40);
+	BLOCK1(0x80);
+	BLOCK1(0x1b);
+	BLOCK1(0x36);
+	rkeys[idx++] = temp0;
+}
+
+// From crypto_core/aes128decrypt/dolbeau/aesenc-int
+static inline void aes128ni_setkey_decrypt(const unsigned char *key, __m128i rkeys[11]) {
+	__m128i tkeys[11];
+	aes128ni_setkey_encrypt(key, tkeys);
+	rkeys[0] = tkeys[10];
+	rkeys[1] = _mm_aesimc_si128(tkeys[9]);
+	rkeys[2] = _mm_aesimc_si128(tkeys[8]);
+	rkeys[3] = _mm_aesimc_si128(tkeys[7]);
+	rkeys[4] = _mm_aesimc_si128(tkeys[6]);
+	rkeys[5] = _mm_aesimc_si128(tkeys[5]);
+	rkeys[6] = _mm_aesimc_si128(tkeys[4]);
+	rkeys[7] = _mm_aesimc_si128(tkeys[3]);
+	rkeys[8] = _mm_aesimc_si128(tkeys[2]);
+	rkeys[9] = _mm_aesimc_si128(tkeys[1]);
+	rkeys[10] = tkeys[0];
+}
+
+// Allocates the 11 round keys and expands key into them: the encryption
+// schedule when for_encryption is non-zero, the decryption schedule otherwise.
+// NOTE(review): the round keys are accessed as __m128i, which presumes malloc
+// returns 16-byte-aligned memory -- confirm for every supported target.
+void oqs_aes128_load_schedule_ni(const uint8_t *key, void **_schedule, int for_encryption) {
+	*_schedule = malloc(11 * sizeof(__m128i));
+	OQS_EXIT_IF_NULLPTR(*_schedule);
+	__m128i *schedule = (__m128i *) *_schedule;
+	if (for_encryption) {
+		aes128ni_setkey_encrypt(key, schedule);
+	} else {
+		aes128ni_setkey_decrypt(key, schedule);
+	}
+}
+
+void oqs_aes128_free_schedule_ni(void *schedule) {
+	if (schedule != NULL) {
+		OQS_MEM_secure_free(schedule, 11 * sizeof(__m128i));
+	}
+}
+
+// From crypto_core/aes128encrypt/dolbeau/aesenc-int
+// n and out are caller-supplied byte buffers, so use unaligned load/store.
+static inline void aes128ni_encrypt(const __m128i rkeys[11], const unsigned char *n, unsigned char *out) {
+	__m128i nv = _mm_loadu_si128((const __m128i *)n);
+	__m128i temp = _mm_xor_si128(nv, rkeys[0]);
+	temp = _mm_aesenc_si128(temp, rkeys[1]);
+	temp = _mm_aesenc_si128(temp, rkeys[2]);
+	temp = _mm_aesenc_si128(temp, rkeys[3]);
+	temp = _mm_aesenc_si128(temp, rkeys[4]);
+	temp = _mm_aesenc_si128(temp, rkeys[5]);
+	temp = _mm_aesenc_si128(temp, rkeys[6]);
+	temp = _mm_aesenc_si128(temp, rkeys[7]);
+	temp = _mm_aesenc_si128(temp, rkeys[8]);
+	temp = _mm_aesenc_si128(temp, rkeys[9]);
+	temp = _mm_aesenclast_si128(temp, rkeys[10]);
+	_mm_storeu_si128((__m128i *)(out), temp);
+}
+
+void oqs_aes128_enc_sch_block_ni(const uint8_t *plaintext, const void *_schedule, uint8_t *ciphertext) {
+	const __m128i *schedule = (const __m128i *) _schedule;
+	aes128ni_encrypt(schedule, plaintext, ciphertext);
+}
+
+// From crypto_core/aes128decrypt/dolbeau/aesenc-int
+// n and out are caller-supplied byte buffers, so use unaligned load/store.
+static inline void aes128ni_decrypt(const __m128i rkeys[11], const unsigned char *n, unsigned char *out) {
+	__m128i nv = _mm_loadu_si128((const __m128i *)n);
+	__m128i temp = _mm_xor_si128(nv, rkeys[0]);
+	temp = _mm_aesdec_si128(temp, rkeys[1]);
+	temp = _mm_aesdec_si128(temp, rkeys[2]);
+	temp = _mm_aesdec_si128(temp, rkeys[3]);
+	temp = _mm_aesdec_si128(temp, rkeys[4]);
+	temp = _mm_aesdec_si128(temp, rkeys[5]);
+	temp = _mm_aesdec_si128(temp, rkeys[6]);
+	temp = _mm_aesdec_si128(temp, rkeys[7]);
+	temp = _mm_aesdec_si128(temp, rkeys[8]);
+	temp = _mm_aesdec_si128(temp, rkeys[9]);
+	temp = _mm_aesdeclast_si128(temp, rkeys[10]);
+	_mm_storeu_si128((__m128i *)(out), temp);
+}
+
+void oqs_aes128_dec_sch_block_ni(const uint8_t *ciphertext, const void *_schedule, uint8_t *plaintext) {
+	const __m128i *schedule = (const __m128i *) _schedule;
+	aes128ni_decrypt(schedule, ciphertext, plaintext);
+}
diff --git a/src/common/aes/aes256_ni.c b/src/common/aes/aes256_ni.c
new file mode 100644
index 000000000..0e95c380c
--- /dev/null
+++ b/src/common/aes/aes256_ni.c
@@ -0,0 +1,162 @@
+// SPDX-License-Identifier: Public domain
+// Based on public domain code by Romain Dolbeau
+// http://dolbeau.name/dolbeau/crypto/crypto.html
+
+#include <stdint.h>
+#include <stdlib.h>
+
+#include <immintrin.h>
+
+#include <oqs/common.h>
+
+#include "aes_local.h"
+
+// From crypto_core/aes256encrypt/dolbeau/aesenc-int
+static inline void aes256ni_setkey_encrypt(const unsigned char *key, __m128i rkeys[15]) {
+	__m128i key0 = _mm_loadu_si128((const __m128i *)(key + 0));
+	__m128i key1 = _mm_loadu_si128((const __m128i *)(key + 16));
+	__m128i temp0, temp1, temp2, temp4;
+	int idx = 0;
+
+	rkeys[idx++] = key0;
+	temp0 = key0;
+	temp2 = key1;
+
+	/* blockshift-based block by Cedric Bourrasset & Romain Dolbeau */
+#define BLOCK1(IMM)                              \
+	temp1 = _mm_aeskeygenassist_si128(temp2, IMM); \
+	rkeys[idx++] = temp2;                          \
+	temp4 = _mm_slli_si128(temp0,4);               \
+	temp0 = _mm_xor_si128(temp0,temp4);            \
+	temp4 = _mm_slli_si128(temp0,8);               \
+	temp0 = _mm_xor_si128(temp0,temp4);            \
+	temp1 = _mm_shuffle_epi32(temp1,0xff);         \
+	temp0 = _mm_xor_si128(temp0,temp1)
+
+#define BLOCK2(IMM)                              \
+	temp1 = _mm_aeskeygenassist_si128(temp0, IMM); \
+	rkeys[idx++] = temp0;                          \
+	temp4 = _mm_slli_si128(temp2,4);               \
+	temp2 = _mm_xor_si128(temp2,temp4);            \
+	temp4 = _mm_slli_si128(temp2,8);               \
+	temp2 = _mm_xor_si128(temp2,temp4);            \
+	temp1 = _mm_shuffle_epi32(temp1,0xaa);         \
+	temp2 = _mm_xor_si128(temp2,temp1)
+
+	BLOCK1(0x01);
+	BLOCK2(0x01);
+
+	BLOCK1(0x02);
+	BLOCK2(0x02);
+
+	BLOCK1(0x04);
+	BLOCK2(0x04);
+
+	BLOCK1(0x08);
+	BLOCK2(0x08);
+
+	BLOCK1(0x10);
+	BLOCK2(0x10);
+
+	BLOCK1(0x20);
+	BLOCK2(0x20);
+
+	BLOCK1(0x40);
+	rkeys[idx++] = temp0;
+}
+
+// From crypto_core/aes256decrypt/dolbeau/aesenc-int
+static inline void aes256ni_setkey_decrypt(const unsigned char *key, __m128i rkeys[15]) {
+	__m128i tkeys[15];
+	aes256ni_setkey_encrypt(key, tkeys);
+	rkeys[0] = tkeys[14];
+	rkeys[1] = _mm_aesimc_si128(tkeys[13]);
+	rkeys[2] = _mm_aesimc_si128(tkeys[12]);
+	rkeys[3] = _mm_aesimc_si128(tkeys[11]);
+	rkeys[4] = _mm_aesimc_si128(tkeys[10]);
+	rkeys[5] = _mm_aesimc_si128(tkeys[9]);
+	rkeys[6] = _mm_aesimc_si128(tkeys[8]);
+	rkeys[7] = _mm_aesimc_si128(tkeys[7]);
+	rkeys[8] = _mm_aesimc_si128(tkeys[6]);
+	rkeys[9] = _mm_aesimc_si128(tkeys[5]);
+	rkeys[10] = _mm_aesimc_si128(tkeys[4]);
+	rkeys[11] = _mm_aesimc_si128(tkeys[3]);
+	rkeys[12] = _mm_aesimc_si128(tkeys[2]);
+	rkeys[13] = _mm_aesimc_si128(tkeys[1]);
+	rkeys[14] = tkeys[0];
+}
+
+// Allocates the 15 round keys and expands key into them: the encryption
+// schedule when for_encryption is non-zero, the decryption schedule otherwise.
+// NOTE(review): the round keys are accessed as __m128i, which presumes malloc
+// returns 16-byte-aligned memory -- confirm for every supported target.
+void oqs_aes256_load_schedule_ni(const uint8_t *key, void **_schedule, int for_encryption) {
+	*_schedule = malloc(15 * sizeof(__m128i));
+	OQS_EXIT_IF_NULLPTR(*_schedule);
+	__m128i *schedule = (__m128i *) *_schedule;
+	if (for_encryption) {
+		aes256ni_setkey_encrypt(key, schedule);
+	} else {
+		aes256ni_setkey_decrypt(key, schedule);
+	}
+}
+
+void oqs_aes256_free_schedule_ni(void *schedule) {
+	if (schedule != NULL) {
+		OQS_MEM_secure_free(schedule, 15 * sizeof(__m128i));
+	}
+}
+
+// From crypto_core/aes256encrypt/dolbeau/aesenc-int
+// n and out are caller-supplied byte buffers, so use unaligned load/store.
+static inline void aes256ni_encrypt(const __m128i rkeys[15], const unsigned char *n, unsigned char *out) {
+	__m128i nv = _mm_loadu_si128((const __m128i *)n);
+	__m128i temp = _mm_xor_si128(nv, rkeys[0]);
+	temp = _mm_aesenc_si128(temp, rkeys[1]);
+	temp = _mm_aesenc_si128(temp, rkeys[2]);
+	temp = _mm_aesenc_si128(temp, rkeys[3]);
+	temp = _mm_aesenc_si128(temp, rkeys[4]);
+	temp = _mm_aesenc_si128(temp, rkeys[5]);
+	temp = _mm_aesenc_si128(temp, rkeys[6]);
+	temp = _mm_aesenc_si128(temp, rkeys[7]);
+	temp = _mm_aesenc_si128(temp, rkeys[8]);
+	temp = _mm_aesenc_si128(temp, rkeys[9]);
+	temp = _mm_aesenc_si128(temp, rkeys[10]);
+	temp = _mm_aesenc_si128(temp, rkeys[11]);
+	temp = _mm_aesenc_si128(temp, rkeys[12]);
+	temp = _mm_aesenc_si128(temp, rkeys[13]);
+	temp = _mm_aesenclast_si128(temp, rkeys[14]);
+	_mm_storeu_si128((__m128i *)(out), temp);
+}
+
+void oqs_aes256_enc_sch_block_ni(const uint8_t *plaintext, const void *_schedule, uint8_t *ciphertext) {
+	const __m128i *schedule = (const __m128i *) _schedule;
+	aes256ni_encrypt(schedule, plaintext, ciphertext);
+}
+
+// From crypto_core/aes256decrypt/dolbeau/aesenc-int
+// n and out are caller-supplied byte buffers, so use unaligned load/store.
+static inline void aes256ni_decrypt(const __m128i rkeys[15], const unsigned char *n, unsigned char *out) {
+	__m128i nv = _mm_loadu_si128((const __m128i *)n);
+	__m128i temp = _mm_xor_si128(nv, rkeys[0]);
+	temp = _mm_aesdec_si128(temp, rkeys[1]);
+	temp = _mm_aesdec_si128(temp, rkeys[2]);
+	temp = _mm_aesdec_si128(temp, rkeys[3]);
+	temp = _mm_aesdec_si128(temp, rkeys[4]);
+	temp = _mm_aesdec_si128(temp, rkeys[5]);
+	temp = _mm_aesdec_si128(temp, rkeys[6]);
+	temp = _mm_aesdec_si128(temp, rkeys[7]);
+	temp = _mm_aesdec_si128(temp, rkeys[8]);
+	temp = _mm_aesdec_si128(temp, rkeys[9]);
+	temp = _mm_aesdec_si128(temp, rkeys[10]);
+	temp = _mm_aesdec_si128(temp, rkeys[11]);
+	temp = _mm_aesdec_si128(temp, rkeys[12]);
+	temp = _mm_aesdec_si128(temp, rkeys[13]);
+	temp = _mm_aesdeclast_si128(temp, rkeys[14]);
+	_mm_storeu_si128((__m128i *)(out), temp);
+}
+
+void oqs_aes256_dec_sch_block_ni(const uint8_t *ciphertext, const void *_schedule, uint8_t *plaintext) {
+	const __m128i *schedule = (const __m128i *) _schedule;
+	aes256ni_decrypt(schedule, ciphertext, plaintext);
+}
diff --git a/src/common/aes/aes_c.c b/src/common/aes/aes_c.c
index c075faef8..2678c8a0f 100644
--- a/src/common/aes/aes_c.c
+++ b/src/common/aes/aes_c.c
@@ -220,7 +220,7 @@ static void key_schedule_core(byte *a, int i) {
 
 // Expand the 16-byte key to 11 round keys (176 bytes)
 // http://en.wikipedia.org/wiki/Rijndael_key_schedule#The_key_schedule
-void OQS_AES128_ECB_load_schedule(const uint8_t *key, void **_schedule, UNUSED int for_encryption) {
+void oqs_aes128_load_schedule_c(const uint8_t *key, void **_schedule, UNUSED int for_encryption) {
 	*_schedule = malloc(16 * 11);
 	OQS_EXIT_IF_NULLPTR(*_schedule);
 	uint8_t *schedule = (uint8_t *) *_schedule;
@@ -248,7 +248,7 @@ void OQS_AES128_ECB_load_schedule(const uint8_t *key, void **_schedule, UNUSED i
 	}
 }
 
-void OQS_AES128_free_schedule(void *schedule) {
+void oqs_aes128_free_schedule_c(void *schedule) {
 	if (schedule != NULL) {
 		OQS_MEM_secure_free(schedule, 176);
 	}
@@ -256,7 +256,7 @@ void OQS_AES128_free_schedule(void *schedule) {
 
 // Expand the 16-byte key to 15 round keys (240 bytes)
 // http://en.wikipedia.org/wiki/Rijndael_key_schedule#The_key_schedule
-void OQS_AES256_ECB_load_schedule(const uint8_t *key, void **_schedule, UNUSED int for_encryption) {
+void oqs_aes256_load_schedule_c(const uint8_t *key, void **_schedule, UNUSED int for_encryption) {
 	*_schedule = malloc(16 * 15);
 	OQS_EXIT_IF_NULLPTR(*_schedule);
 	uint8_t *schedule = (uint8_t *) *_schedule;
@@ -288,11 +288,7 @@ void OQS_AES256_ECB_load_schedule(const uint8_t *key, void **_schedule, UNUSED i
 	}
 }
 
-void OQS_AES256_CTR_load_schedule(const uint8_t *key, void **_schedule) {
-	OQS_AES256_ECB_load_schedule(key, _schedule, 1);
-}
-
-void OQS_AES256_free_schedule(void *schedule) {
+void oqs_aes256_free_schedule_c(void *schedule) {
 	if (schedule != NULL) {
 		OQS_MEM_secure_free(schedule, 16 * 15);
 	}
@@ -446,102 +442,3 @@ void oqs_aes256_dec_sch_block_c(const uint8_t *ciphertext, const void *_schedule
 	// Reverse the first Round
 	xor_round_key(plaintext, schedule, 0);
 }
-
-void OQS_AES128_ECB_enc(const uint8_t *plaintext, const size_t plaintext_len, const uint8_t *key, uint8_t *ciphertext) {
-	void *schedule = NULL;
-	OQS_AES128_ECB_load_schedule(key, &schedule, 1);
-	OQS_AES128_ECB_enc_sch(plaintext, plaintext_len, schedule, ciphertext);
-	OQS_AES128_free_schedule(schedule);
-}
-
-void OQS_AES128_ECB_enc_sch(const uint8_t *plaintext, const size_t plaintext_len, const void *schedule, uint8_t *ciphertext) {
-	assert(plaintext_len % 16 == 0);
-	for (size_t block = 0; block < plaintext_len / 16; block++) {
-		oqs_aes128_enc_sch_block_c(plaintext + (16 * block), schedule, ciphertext + (16 * block));
-	}
-}
-
-void OQS_AES128_ECB_dec(const uint8_t *ciphertext, const size_t ciphertext_len, const uint8_t *key, uint8_t *plaintext) {
-	void *schedule = NULL;
-	OQS_AES128_ECB_load_schedule(key, &schedule, 0);
-	OQS_AES128_ECB_dec_sch(ciphertext, ciphertext_len, schedule, plaintext);
-	OQS_AES128_free_schedule(schedule);
-}
-
-void OQS_AES128_ECB_dec_sch(const uint8_t *ciphertext, const size_t ciphertext_len, const void *schedule, uint8_t *plaintext) {
-	assert(ciphertext_len % 16 == 0);
-	for (size_t block = 0; block < ciphertext_len / 16; block++) {
-		oqs_aes128_dec_sch_block_c(ciphertext + (16 * block), schedule, plaintext + (16 * block));
-	}
-}
-
-void OQS_AES256_ECB_enc(const uint8_t *plaintext, const size_t plaintext_len, const uint8_t *key, uint8_t *ciphertext) {
-	void *schedule = NULL;
-	OQS_AES256_ECB_load_schedule(key, &schedule, 1);
-	OQS_AES256_ECB_enc_sch(plaintext, plaintext_len, schedule, ciphertext);
-	OQS_AES256_free_schedule(schedule);
-}
-
-void OQS_AES256_ECB_enc_sch(const uint8_t *plaintext, const size_t plaintext_len, const void *schedule, uint8_t *ciphertext) {
-	assert(plaintext_len % 16 == 0);
-	for (size_t block = 0; block < plaintext_len / 16; block++) {
-		oqs_aes256_enc_sch_block_c(plaintext + (16 * block), schedule, ciphertext + (16 * block));
-	}
-}
-
-void OQS_AES256_ECB_dec(const uint8_t *ciphertext, const size_t ciphertext_len, const uint8_t *key, uint8_t *plaintext) {
-	void *schedule = NULL;
-	OQS_AES256_ECB_load_schedule(key, &schedule, 0);
-	OQS_AES256_ECB_dec_sch(ciphertext, ciphertext_len, schedule, plaintext);
-	OQS_AES256_free_schedule(schedule);
-}
-
-void OQS_AES256_ECB_dec_sch(const uint8_t *ciphertext, const size_t ciphertext_len, const void *schedule, uint8_t *plaintext) {
-	assert(ciphertext_len % 16 == 0);
-	for (size_t block = 0; block < ciphertext_len / 16; block++) {
-		oqs_aes256_dec_sch_block_c(ciphertext + (16 * block), schedule, plaintext + (16 * block));
-	}
-}
-
-static inline uint32_t UINT32_TO_BE(const uint32_t x) {
-	union {
-		uint32_t val;
-		uint8_t bytes[4];
-	} y;
-	y.bytes[0] = (x >> 24) & 0xFF;
-	y.bytes[1] = (x >> 16) & 0xFF;
-	y.bytes[2] = (x >> 8) & 0xFF;
-	y.bytes[3] = x & 0xFF;
-	return y.val;
-}
-#define BE_TO_UINT32(n) (uint32_t)((((uint8_t *) &(n))[0] << 24) | (((uint8_t *) &(n))[1] << 16) | (((uint8_t *) &(n))[2] << 8) | (((uint8_t *) &(n))[3] << 0))
-
-void OQS_AES256_CTR_sch(const uint8_t *iv, size_t iv_len, const void *schedule, uint8_t *out, size_t out_len) {
-	uint8_t block[16];
-	uint32_t ctr;
-	uint32_t ctr_be;
-	memcpy(block, iv, 12);
-	if (iv_len == 12) {
-		ctr = 0;
-	} else if (iv_len == 16) {
-		memcpy(&ctr_be, &iv[12], 4);
-		ctr = BE_TO_UINT32(ctr_be);
-	} else {
-		exit(EXIT_FAILURE);
-	}
-	while (out_len >= 16) {
-		ctr_be = UINT32_TO_BE(ctr);
-		memcpy(&block[12], (uint8_t *) &ctr_be, 4);
-		oqs_aes256_enc_sch_block_c(block, schedule, out);
-		out += 16;
-		out_len -= 16;
-		ctr++;
-	}
-	if (out_len > 0) {
-		uint8_t tmp[16];
-		ctr_be = UINT32_TO_BE(ctr);
-		memcpy(&block[12], (uint8_t *) &ctr_be, 4);
-		oqs_aes256_enc_sch_block_c(block, schedule, tmp);
-		memcpy(out, tmp, out_len);
-	}
-}
diff --git a/src/common/aes/aes_local.h b/src/common/aes/aes_local.h
new file mode 100644
index 000000000..04fb954b4
--- /dev/null
+++ b/src/common/aes/aes_local.h
@@ -0,0 +1,39 @@
+// SPDX-License-Identifier: MIT
+
+// Declarations of the C and AES-NI back-end functions that aes.c dispatches to.
+
+#ifndef OQS_AES_LOCAL_H
+#define OQS_AES_LOCAL_H
+
+#include <stddef.h>
+#include <stdint.h>
+
+void oqs_aes128_load_schedule_ni(const uint8_t *key, void **_schedule, int for_encryption);
+void oqs_aes128_free_schedule_ni(void *schedule);
+void oqs_aes128_enc_sch_block_ni(const uint8_t *plaintext, const void *_schedule, uint8_t *ciphertext);
+void oqs_aes128_dec_sch_block_ni(const uint8_t *ciphertext, const void *_schedule, uint8_t *plaintext);
+void oqs_aes128_ecb_enc_sch_ni(const uint8_t *plaintext, const size_t plaintext_len, const void *schedule, uint8_t *ciphertext);
+void oqs_aes128_ecb_dec_sch_ni(const uint8_t *ciphertext, const size_t ciphertext_len, const void *schedule, uint8_t *plaintext);
+
+void oqs_aes128_load_schedule_c(const uint8_t *key, void **_schedule, int for_encryption);
+void oqs_aes128_free_schedule_c(void *schedule);
+void oqs_aes128_enc_sch_block_c(const uint8_t *plaintext, const void *_schedule, uint8_t *ciphertext);
+void oqs_aes128_dec_sch_block_c(const uint8_t *ciphertext, const void *_schedule, uint8_t *plaintext);
+void oqs_aes128_ecb_enc_sch_c(const uint8_t *plaintext, const size_t plaintext_len, const void *schedule, uint8_t *ciphertext);
+void oqs_aes128_ecb_dec_sch_c(const uint8_t *ciphertext, const size_t ciphertext_len, const void *schedule, uint8_t *plaintext);
+
+void oqs_aes256_load_schedule_ni(const uint8_t *key, void **_schedule, int for_encryption);
+void oqs_aes256_free_schedule_ni(void *schedule);
+void oqs_aes256_enc_sch_block_ni(const uint8_t *plaintext, const void *_schedule, uint8_t *ciphertext);
+void oqs_aes256_dec_sch_block_ni(const uint8_t *ciphertext, const void *_schedule, uint8_t *plaintext);
+void oqs_aes256_ecb_enc_sch_ni(const uint8_t *plaintext, const size_t plaintext_len, const void *schedule, uint8_t *ciphertext);
+void oqs_aes256_ecb_dec_sch_ni(const uint8_t *ciphertext, const size_t ciphertext_len, const void *schedule, uint8_t *plaintext);
+
+void oqs_aes256_load_schedule_c(const uint8_t *key, void **_schedule, int for_encryption);
+void oqs_aes256_free_schedule_c(void *schedule);
+void oqs_aes256_enc_sch_block_c(const uint8_t *plaintext, const void *_schedule, uint8_t *ciphertext);
+void oqs_aes256_dec_sch_block_c(const uint8_t *ciphertext, const void *_schedule, uint8_t *plaintext);
+void oqs_aes256_ecb_enc_sch_c(const uint8_t *plaintext, const size_t plaintext_len, const void *schedule, uint8_t *ciphertext);
+void oqs_aes256_ecb_dec_sch_c(const uint8_t *ciphertext, const size_t ciphertext_len, const void *schedule, uint8_t *plaintext);
+
+#endif // OQS_AES_LOCAL_H
diff --git a/tests/system_info.c b/tests/system_info.c
index c2a8704c5..59875febe 100644
--- a/tests/system_info.c
+++ b/tests/system_info.c
@@ -38,6 +38,22 @@ static void print_platform_info(void) {
 #include <openssl/opensslv.h>
 #endif
 
+#if defined(OQS_USE_CPU_EXTENSIONS) && defined(OQS_USE_AES_INSTRUCTIONS) && defined(OQS_PORTABLE_BUILD)
+#define C_OR_NI(stmt_c, stmt_ni) \
+	OQS_CPU_EXTENSIONS available_cpu_extensions = OQS_get_available_CPU_extensions(); \
+	if (available_cpu_extensions.AES_ENABLED) { \
+		stmt_ni; \
+	} else { \
+		stmt_c; \
+	}
+#elif defined(OQS_USE_CPU_EXTENSIONS) && defined(OQS_USE_AES_INSTRUCTIONS) /* && !defined(OQS_PORTABLE_BUILD) */
+#define C_OR_NI(stmt_c, stmt_ni) \
+	stmt_ni;
+#else /* AES-NI back end not compiled in */
+#define C_OR_NI(stmt_c, stmt_ni) \
+	stmt_c;
+#endif
+
 static void print_oqs_configuration(void) {
 	printf("OQS version: %s\n", OQS_VERSION_TEXT);
 #if defined(OQS_COMPILE_GIT_COMMIT)
@@ -51,7 +67,10 @@ static void print_oqs_configuration(void) {
 #if defined(OQS_USE_AES_OPENSSL)
 	printf("AES: OpenSSL\n");
 #else
-	printf("AES: C\n");
+	C_OR_NI(
+	    printf("AES: C\n"),
+	    printf("AES: NI\n")
+	)
 #endif
 #if defined(OQS_USE_SHA2_OPENSSL)
 	printf("SHA-2: OpenSSL\n");
diff --git a/tests/test_aes.c b/tests/test_aes.c
index 3d3ab336d..928379ffd 100644
--- a/tests/test_aes.c
+++ b/tests/test_aes.c
@@ -109,7 +109,7 @@ static int test_aes256ctr_correctness(void) {
 static void speed_aes128(void) {
 	uint8_t plaintext[16], ciphertext[16];
 	void *schedule = NULL, *schedule_dec = NULL;
-	TIME_OPERATION_SECONDS({ OQS_AES128_ECB_load_schedule(test_aes128_key, &schedule, 1); OQS_AES128_free_schedule(schedule); }, "OQS_AES128_ECB_load+free_schedule", BENCH_DURATION);
+	TIME_OPERATION_SECONDS({ OQS_AES128_ECB_load_schedule(test_aes128_key, &schedule, 1); OQS_AES128_free_schedule(schedule); }, "OQS_AES128_ECB_load+free_sch", BENCH_DURATION);
 	OQS_AES128_ECB_load_schedule(test_aes128_key, &schedule, 1);
 	OQS_AES128_ECB_load_schedule(test_aes128_key, &schedule_dec, 0);
 
@@ -124,7 +124,7 @@ static void speed_aes128(void) {
 static void speed_aes256(void) {
 	uint8_t plaintext[16], ciphertext[16];
 	void *schedule = NULL, *schedule_dec = NULL;
-	TIME_OPERATION_SECONDS({ OQS_AES256_ECB_load_schedule(test_aes256_key, &schedule, 1); OQS_AES256_free_schedule(schedule); }, "OQS_AES256_ECB_load+free_schedule", BENCH_DURATION);
+	TIME_OPERATION_SECONDS({ OQS_AES256_ECB_load_schedule(test_aes256_key, &schedule, 1); OQS_AES256_free_schedule(schedule); }, "OQS_AES256_ECB_load+free_sch", BENCH_DURATION);
 	OQS_AES256_ECB_load_schedule(test_aes256_key, &schedule, 1);
 	OQS_AES256_ECB_load_schedule(test_aes256_key, &schedule_dec, 0);
 