Re-add AES-NI with CPU feature detection (#770)

* Start re-adding AES-NI with CPU feature detection

* Follow Goutam's feedback on CPU feature detection

* Macrify CPU feature detection logic in AES

* AES128 and AES256 using NI based on Romain Dolbeau's public domain code

* Fewer calls to C_OR_NI

* Restricting setting of OQS_USE_CPU_EXTENSIONS.

* Unroll AES loops

Co-authored-by: xvzcf <xvzcf@users.noreply.github.com>
This commit is contained in:
Douglas Stebila 2020-07-24 13:53:14 -04:00 committed by GitHub
parent a3a5347202
commit 34ec4b83a7
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
11 changed files with 564 additions and 116 deletions

View File

@ -1,5 +1,8 @@
# SPDX-License-Identifier: MIT
option(OQS_PORTABLE_BUILD "Ensure the resulting library is portable. This implies having run-time checks for CPU extensions." ON)
option(OQS_BUILD_ONLY_LIB "Build only liboqs and do not expose build targets for tests, documentation, and pretty-printing available." OFF)
if(CMAKE_C_COMPILER_ID MATCHES "Clang")
add_compile_options(-Werror)
add_compile_options(-Wall)
@ -13,6 +16,7 @@ if(CMAKE_C_COMPILER_ID MATCHES "Clang")
set(OQS_USE_PTHREADS_IN_TESTS 1)
endif()
option(OQS_USE_CPU_EXTENSIONS "Enable compile and run-time support for CPU extensions such as AVX2, SSE, etc." ON)
if(OQS_USE_CPU_EXTENSIONS)
include(${CMAKE_CURRENT_LIST_DIR}/gcc_clang_intrinsics.cmake)
endif()
@ -67,6 +71,7 @@ elseif(CMAKE_C_COMPILER_ID STREQUAL "GNU")
set(OQS_USE_PTHREADS_IN_TESTS 1)
endif()
option(OQS_USE_CPU_EXTENSIONS "Enable compile and run-time support for CPU extensions such as AVX2, SSE, etc." ON)
if(OQS_USE_CPU_EXTENSIONS)
include(${CMAKE_CURRENT_LIST_DIR}/gcc_clang_intrinsics.cmake)
endif()

View File

@ -35,14 +35,9 @@ endif()
if(WIN32)
set(CMAKE_GENERATOR_CC cl)
endif()
option(OQS_USE_CPU_EXTENSIONS "Enable compile and run-time support for CPU extensions such as AVX2, SSE, etc." ON)
option(OQS_PORTABLE_BUILD "Ensure the resulting library is portable. This implies having run-time checks for CPU extensions." ON)
option(OQS_BUILD_ONLY_LIB "Build only liboqs and do not expose build targets for tests, documentation, and pretty-printing available." OFF)
include(.CMake/compiler_opts.cmake)
include(.CMake/alg_support.cmake)
if(OQS_USE_OPENSSL)
if(NOT DEFINED OPENSSL_ROOT_DIR)
if(${CMAKE_HOST_SYSTEM_NAME} STREQUAL "Darwin")

View File

@ -172,6 +172,7 @@ liboqs includes some third party libraries or modules that are licensed differen
- `.CMake/CMakeDependentOption.cmake`: BSD 3-Clause License
- `src/common/common.c`: includes portions which are Apache License v2.0
- `src/common/aes/aes_c.c`: public domain or any OSI-approved license
- `src/common/aes/aes*_ni.c`: public domain
- `src/common/crypto/sha2/sha2_c.c`: public domain
- `src/common/crypto/sha3/fips202.c`: public domain
- `src/common/crypto/sha3/keccak4x`: CC0 (public domain), except `brg_endian.h`

View File

@ -13,7 +13,13 @@ endif()
# Select the source files that implement AES.
#   - OpenSSL backend: a single wrapper source.
#   - Built-in backend: the dispatcher (aes.c) and the portable C code,
#     plus the AES-NI sources when OQS_USE_AES_INSTRUCTIONS is set.
if(OQS_USE_AES_OPENSSL)
    set(AES_IMPL aes/aes_ossl.c)
else()
    set(AES_IMPL aes/aes.c aes/aes_c.c)
    if(OQS_USE_AES_INSTRUCTIONS)
        list(APPEND AES_IMPL aes/aes128_ni.c aes/aes256_ni.c)
        # -maes is attached to the two AES-NI sources only, not to the
        # whole target.
        set_source_files_properties(aes/aes128_ni.c aes/aes256_ni.c
                                    PROPERTIES COMPILE_FLAGS -maes)
    endif()
endif()
if(OQS_USE_SHA2_OPENSSL)

219
src/common/aes/aes.c Normal file
View File

@ -0,0 +1,219 @@
// SPDX-License-Identifier: MIT
#include <assert.h>
#include <stdio.h>
#include <string.h>
#include <oqs/common.h>
#include "aes.h"
#include "aes_local.h"
/*
 * C_OR_NI(stmt_c, stmt_ni): execute exactly one of the two statements.
 *
 *   - CPU extensions + portable build: query the CPU at run time and run
 *     stmt_ni when AES is reported as enabled, stmt_c otherwise.
 *   - CPU extensions, non-portable build: always run stmt_ni.
 *   - No CPU extensions: always run stmt_c.
 *
 * Every variant expands to a single compound statement, so the temporary
 * used for the run-time query never leaks into the caller's scope. The
 * macro can therefore appear more than once in a scope and is safe as the
 * body of an unbraced if/while. Invoke it without a trailing semicolon.
 *
 * NOTE(review): the build adds aes128_ni.c/aes256_ni.c only when
 * OQS_USE_AES_INSTRUCTIONS is set, while this macro keys on
 * OQS_USE_CPU_EXTENSIONS. Confirm the former is always set when the
 * latter is, otherwise the *_ni symbols will not link.
 */
#if defined(OQS_USE_CPU_EXTENSIONS) && defined(OQS_PORTABLE_BUILD)
#define C_OR_NI(stmt_c, stmt_ni) \
    { \
        OQS_CPU_EXTENSIONS available_cpu_extensions = OQS_get_available_CPU_extensions(); \
        if (available_cpu_extensions.AES_ENABLED) { \
            stmt_ni; \
        } else { \
            stmt_c; \
        } \
    }
#elif defined(OQS_USE_CPU_EXTENSIONS) /* && !defined(OQS_PORTABLE_BUILD) */
#define C_OR_NI(stmt_c, stmt_ni) \
    { \
        stmt_ni; \
    }
#else /* !defined(OQS_USE_CPU_EXTENSIONS) */
#define C_OR_NI(stmt_c, stmt_ni) \
    { \
        stmt_c; \
    }
#endif
// Builds an AES-128 key schedule from `key` and stores it in *_schedule.
// C_OR_NI selects the portable C routine or the AES-NI routine.
// NOTE(review): the AES-NI call below passes only two arguments and drops
// for_encryption. Confirm this matches the definition of
// oqs_aes128_load_schedule_ni; if that routine takes the flag to choose
// between an encryption and a decryption schedule, it must be forwarded
// here (and the prototype updated to match).
void OQS_AES128_ECB_load_schedule(const uint8_t *key, void **_schedule, UNUSED int for_encryption) {
C_OR_NI(
oqs_aes128_load_schedule_c(key, _schedule, for_encryption),
oqs_aes128_load_schedule_ni(key, _schedule)
)
}
// Releases an AES-128 key schedule by forwarding to the C or AES-NI free
// routine selected by C_OR_NI.
void OQS_AES128_free_schedule(void *schedule) {
C_OR_NI(
oqs_aes128_free_schedule_c(schedule),
oqs_aes128_free_schedule_ni(schedule)
)
}
// Builds an AES-256 key schedule from `key` and stores it in *_schedule.
// for_encryption is forwarded unchanged to whichever implementation
// (portable C or AES-NI) C_OR_NI selects.
void OQS_AES256_ECB_load_schedule(const uint8_t *key, void **_schedule, int for_encryption) {
C_OR_NI(
oqs_aes256_load_schedule_c(key, _schedule, for_encryption),
oqs_aes256_load_schedule_ni(key, _schedule, for_encryption)
)
}
// Builds the key schedule used for AES-256 CTR. This is the ECB schedule
// requested with for_encryption = 1.
void OQS_AES256_CTR_load_schedule(const uint8_t *key, void **_schedule) {
OQS_AES256_ECB_load_schedule(key, _schedule, 1);
}
// Releases an AES-256 key schedule by forwarding to the C or AES-NI free
// routine selected by C_OR_NI.
void OQS_AES256_free_schedule(void *schedule) {
C_OR_NI(
oqs_aes256_free_schedule_c(schedule),
oqs_aes256_free_schedule_ni(schedule)
)
}
// One-shot AES-128 ECB encryption: expand `key` into a temporary encryption
// schedule, encrypt plaintext_len bytes into `ciphertext`, then release
// the schedule.
void OQS_AES128_ECB_enc(const uint8_t *plaintext, const size_t plaintext_len, const uint8_t *key, uint8_t *ciphertext) {
    void *round_keys = NULL;

    OQS_AES128_ECB_load_schedule(key, &round_keys, 1);
    OQS_AES128_ECB_enc_sch(plaintext, plaintext_len, round_keys, ciphertext);
    OQS_AES128_free_schedule(round_keys);
}
// AES-128 ECB encryption with the portable C block function: encrypts each
// complete 16-byte block of `plaintext` independently into `ciphertext`.
// plaintext_len is asserted to be a multiple of 16.
inline void oqs_aes128_ecb_enc_sch_c(const uint8_t *plaintext, const size_t plaintext_len, const void *schedule, uint8_t *ciphertext) {
    assert(plaintext_len % 16 == 0);
    const size_t n_blocks = plaintext_len / 16;
    size_t offset = 0;
    for (size_t i = 0; i < n_blocks; i++, offset += 16) {
        oqs_aes128_enc_sch_block_c(plaintext + offset, schedule, ciphertext + offset);
    }
}
// AES-128 ECB encryption with the AES-NI block function: encrypts each
// complete 16-byte block of `plaintext` independently into `ciphertext`.
// plaintext_len is asserted to be a multiple of 16.
inline void oqs_aes128_ecb_enc_sch_ni(const uint8_t *plaintext, const size_t plaintext_len, const void *schedule, uint8_t *ciphertext) {
    assert(plaintext_len % 16 == 0);
    const size_t n_blocks = plaintext_len / 16;
    size_t offset = 0;
    for (size_t i = 0; i < n_blocks; i++, offset += 16) {
        oqs_aes128_enc_sch_block_ni(plaintext + offset, schedule, ciphertext + offset);
    }
}
// AES-128 ECB encryption with a caller-supplied key schedule. C_OR_NI
// selects the portable C loop or the AES-NI loop.
void OQS_AES128_ECB_enc_sch(const uint8_t *plaintext, const size_t plaintext_len, const void *schedule, uint8_t *ciphertext) {
C_OR_NI(
oqs_aes128_ecb_enc_sch_c(plaintext, plaintext_len, schedule, ciphertext),
oqs_aes128_ecb_enc_sch_ni(plaintext, plaintext_len, schedule, ciphertext)
)
}
// AES-128 ECB decryption with the portable C block function: decrypts each
// complete 16-byte block of `ciphertext` independently into `plaintext`.
// ciphertext_len is asserted to be a multiple of 16.
inline void oqs_aes128_ecb_dec_sch_c(const uint8_t *ciphertext, const size_t ciphertext_len, const void *schedule, uint8_t *plaintext) {
    assert(ciphertext_len % 16 == 0);
    const size_t n_blocks = ciphertext_len / 16;
    size_t offset = 0;
    for (size_t i = 0; i < n_blocks; i++, offset += 16) {
        oqs_aes128_dec_sch_block_c(ciphertext + offset, schedule, plaintext + offset);
    }
}
// AES-128 ECB decryption with the AES-NI block function: decrypts each
// complete 16-byte block of `ciphertext` independently into `plaintext`.
// ciphertext_len is asserted to be a multiple of 16.
inline void oqs_aes128_ecb_dec_sch_ni(const uint8_t *ciphertext, const size_t ciphertext_len, const void *schedule, uint8_t *plaintext) {
    assert(ciphertext_len % 16 == 0);
    const size_t n_blocks = ciphertext_len / 16;
    size_t offset = 0;
    for (size_t i = 0; i < n_blocks; i++, offset += 16) {
        oqs_aes128_dec_sch_block_ni(ciphertext + offset, schedule, plaintext + offset);
    }
}
// One-shot AES-128 ECB decryption: expand `key` into a temporary schedule
// requested with for_encryption = 0, decrypt ciphertext_len bytes into
// `plaintext`, then release the schedule.
void OQS_AES128_ECB_dec(const uint8_t *ciphertext, const size_t ciphertext_len, const uint8_t *key, uint8_t *plaintext) {
    void *round_keys = NULL;

    OQS_AES128_ECB_load_schedule(key, &round_keys, 0);
    OQS_AES128_ECB_dec_sch(ciphertext, ciphertext_len, round_keys, plaintext);
    OQS_AES128_free_schedule(round_keys);
}
// AES-128 ECB decryption with a caller-supplied key schedule. C_OR_NI
// selects the portable C loop or the AES-NI loop.
void OQS_AES128_ECB_dec_sch(const uint8_t *ciphertext, const size_t ciphertext_len, const void *schedule, uint8_t *plaintext) {
C_OR_NI(
oqs_aes128_ecb_dec_sch_c(ciphertext, ciphertext_len, schedule, plaintext),
oqs_aes128_ecb_dec_sch_ni(ciphertext, ciphertext_len, schedule, plaintext)
)
}
// One-shot AES-256 ECB encryption: expand `key` into a temporary encryption
// schedule, encrypt plaintext_len bytes into `ciphertext`, then release
// the schedule.
void OQS_AES256_ECB_enc(const uint8_t *plaintext, const size_t plaintext_len, const uint8_t *key, uint8_t *ciphertext) {
    void *round_keys = NULL;

    OQS_AES256_ECB_load_schedule(key, &round_keys, 1);
    OQS_AES256_ECB_enc_sch(plaintext, plaintext_len, round_keys, ciphertext);
    OQS_AES256_free_schedule(round_keys);
}
// AES-256 ECB encryption with the portable C block function: encrypts each
// complete 16-byte block of `plaintext` independently into `ciphertext`.
// plaintext_len is asserted to be a multiple of 16.
inline void oqs_aes256_ecb_enc_sch_c(const uint8_t *plaintext, const size_t plaintext_len, const void *schedule, uint8_t *ciphertext) {
    assert(plaintext_len % 16 == 0);
    const size_t n_blocks = plaintext_len / 16;
    size_t offset = 0;
    for (size_t i = 0; i < n_blocks; i++, offset += 16) {
        oqs_aes256_enc_sch_block_c(plaintext + offset, schedule, ciphertext + offset);
    }
}
// AES-256 ECB encryption with the AES-NI block function: encrypts each
// complete 16-byte block of `plaintext` independently into `ciphertext`.
// plaintext_len is asserted to be a multiple of 16.
inline void oqs_aes256_ecb_enc_sch_ni(const uint8_t *plaintext, const size_t plaintext_len, const void *schedule, uint8_t *ciphertext) {
    assert(plaintext_len % 16 == 0);
    const size_t n_blocks = plaintext_len / 16;
    size_t offset = 0;
    for (size_t i = 0; i < n_blocks; i++, offset += 16) {
        oqs_aes256_enc_sch_block_ni(plaintext + offset, schedule, ciphertext + offset);
    }
}
// AES-256 ECB encryption with a caller-supplied key schedule. C_OR_NI
// selects the portable C loop or the AES-NI loop.
void OQS_AES256_ECB_enc_sch(const uint8_t *plaintext, const size_t plaintext_len, const void *schedule, uint8_t *ciphertext) {
C_OR_NI(
oqs_aes256_ecb_enc_sch_c(plaintext, plaintext_len, schedule, ciphertext),
oqs_aes256_ecb_enc_sch_ni(plaintext, plaintext_len, schedule, ciphertext)
)
}
// One-shot AES-256 ECB decryption: expand `key` into a temporary schedule
// requested with for_encryption = 0, decrypt ciphertext_len bytes into
// `plaintext`, then release the schedule.
void OQS_AES256_ECB_dec(const uint8_t *ciphertext, const size_t ciphertext_len, const uint8_t *key, uint8_t *plaintext) {
    void *round_keys = NULL;

    OQS_AES256_ECB_load_schedule(key, &round_keys, 0);
    OQS_AES256_ECB_dec_sch(ciphertext, ciphertext_len, round_keys, plaintext);
    OQS_AES256_free_schedule(round_keys);
}
// AES-256 ECB decryption with the portable C block function: decrypts each
// complete 16-byte block of `ciphertext` independently into `plaintext`.
// ciphertext_len is asserted to be a multiple of 16.
inline void oqs_aes256_ecb_dec_sch_c(const uint8_t *ciphertext, const size_t ciphertext_len, const void *schedule, uint8_t *plaintext) {
    assert(ciphertext_len % 16 == 0);
    const size_t n_blocks = ciphertext_len / 16;
    size_t offset = 0;
    for (size_t i = 0; i < n_blocks; i++, offset += 16) {
        oqs_aes256_dec_sch_block_c(ciphertext + offset, schedule, plaintext + offset);
    }
}
// AES-256 ECB decryption with the AES-NI block function: decrypts each
// complete 16-byte block of `ciphertext` independently into `plaintext`.
// ciphertext_len is asserted to be a multiple of 16.
inline void oqs_aes256_ecb_dec_sch_ni(const uint8_t *ciphertext, const size_t ciphertext_len, const void *schedule, uint8_t *plaintext) {
    assert(ciphertext_len % 16 == 0);
    const size_t n_blocks = ciphertext_len / 16;
    size_t offset = 0;
    for (size_t i = 0; i < n_blocks; i++, offset += 16) {
        oqs_aes256_dec_sch_block_ni(ciphertext + offset, schedule, plaintext + offset);
    }
}
// AES-256 ECB decryption with a caller-supplied key schedule. C_OR_NI
// selects the portable C loop or the AES-NI loop.
void OQS_AES256_ECB_dec_sch(const uint8_t *ciphertext, const size_t ciphertext_len, const void *schedule, uint8_t *plaintext) {
C_OR_NI(
oqs_aes256_ecb_dec_sch_c(ciphertext, ciphertext_len, schedule, plaintext),
oqs_aes256_ecb_dec_sch_ni(ciphertext, ciphertext_len, schedule, plaintext)
)
}
// Returns a uint32_t whose in-memory bytes, lowest address first, are the
// big-endian encoding of x (most significant byte first).
static inline uint32_t UINT32_TO_BE(const uint32_t x) {
    const uint8_t be_bytes[4] = {
        (uint8_t) ((x >> 24) & 0xFF),
        (uint8_t) ((x >> 16) & 0xFF),
        (uint8_t) ((x >> 8) & 0xFF),
        (uint8_t) (x & 0xFF)
    };
    uint32_t packed;
    memcpy(&packed, be_bytes, sizeof(packed));
    return packed;
}
// Reads the four in-memory bytes of lvalue `n`, lowest address first, as a
// big-endian 32-bit integer. Each byte is widened to uint32_t before it is
// shifted: a uint8_t would otherwise promote to int, and shifting a byte
// >= 0x80 left by 24 would overflow into the sign bit (undefined behaviour).
#define BE_TO_UINT32(n) ((uint32_t)(((uint32_t)((uint8_t *) &(n))[0] << 24) | ((uint32_t)((uint8_t *) &(n))[1] << 16) | ((uint32_t)((uint8_t *) &(n))[2] << 8) | ((uint32_t)((uint8_t *) &(n))[3] << 0)))
// Writes out_len bytes of AES-256 encrypted counter blocks to `out`.
//
// Each 16-byte input block is the first 12 bytes of `iv` followed by a
// 32-bit big-endian counter:
//   - iv_len == 12: the counter starts at 0;
//   - iv_len == 16: the counter starts at the big-endian value in iv[12..15];
//   - any other iv_len terminates the process with EXIT_FAILURE.
// The counter is a uint32_t incremented once per block, so it wraps
// modulo 2^32. `schedule` is the AES-256 key schedule handed to the
// block encryption routine.
void OQS_AES256_CTR_sch(const uint8_t *iv, size_t iv_len, const void *schedule, uint8_t *out, size_t out_len) {
    uint8_t block[16];
    uint32_t ctr;
    uint32_t ctr_be;
    // Validate iv_len before touching iv, so that an unsupported (possibly
    // shorter than 12 bytes) IV is never read.
    if (iv_len == 12) {
        ctr = 0;
    } else if (iv_len == 16) {
        memcpy(&ctr_be, &iv[12], 4);
        ctr = BE_TO_UINT32(ctr_be);
    } else {
        exit(EXIT_FAILURE);
    }
    memcpy(block, iv, 12);
    // Full 16-byte blocks are encrypted straight into the output buffer.
    while (out_len >= 16) {
        ctr_be = UINT32_TO_BE(ctr);
        memcpy(&block[12], (uint8_t *) &ctr_be, 4);
        C_OR_NI(
            oqs_aes256_enc_sch_block_c(block, schedule, out),
            oqs_aes256_enc_sch_block_ni(block, schedule, out)
        )
        out += 16;
        out_len -= 16;
        ctr++;
    }
    // Trailing partial block: encrypt into a scratch buffer and copy only
    // the bytes that were requested.
    if (out_len > 0) {
        uint8_t tmp[16];
        ctr_be = UINT32_TO_BE(ctr);
        memcpy(&block[12], (uint8_t *) &ctr_be, 4);
        C_OR_NI(
            oqs_aes256_enc_sch_block_c(block, schedule, tmp),
            oqs_aes256_enc_sch_block_ni(block, schedule, tmp)
        )
        memcpy(out, tmp, out_len);
    }
}

120
src/common/aes/aes128_ni.c Normal file
View File

@ -0,0 +1,120 @@
// SPDX-License-Identifier: Public domain
// Based on public domain code by Romain Dolbeau
// http://dolbeau.name/dolbeau/crypto/crypto.html
#include <assert.h>
#include <stdint.h>
#include <stdio.h>
#include <oqs/common.h>
#include <wmmintrin.h>
// From crypto_core/aes128ncrypt/dolbeau/aesenc-int
// Expands the 16-byte AES-128 key at `key` into the 11 encryption round
// keys rkeys[0..10]. `key` is read with an unaligned load.
static inline void aes128ni_setkey_encrypt(const unsigned char *key, __m128i rkeys[11]) {
    // Cast through the documented __m128i pointer type; the __m128i_u
    // typedef is a compiler-specific extension that not every toolchain's
    // intrinsics headers provide.
    __m128i key0 = _mm_loadu_si128((const __m128i *)(key + 0));
    __m128i temp0, temp1, temp4;
    int idx = 0;
    temp0 = key0;
    /* blockshift-based block by Cedric Bourrasset */
    // Each BLOCK1 stores the current round key, then derives the next one
    // in temp0 from it and the key-generation-assist result for IMM.
#define BLOCK1(IMM) \
    temp1 = _mm_aeskeygenassist_si128(temp0, IMM); \
    rkeys[idx++] = temp0; \
    temp4 = _mm_slli_si128(temp0,4); \
    temp0 = _mm_xor_si128(temp0,temp4); \
    temp4 = _mm_slli_si128(temp0,8); \
    temp0 = _mm_xor_si128(temp0,temp4); \
    temp1 = _mm_shuffle_epi32(temp1,0xff); \
    temp0 = _mm_xor_si128(temp0,temp1)
    BLOCK1(0x01);
    BLOCK1(0x02);
    BLOCK1(0x04);
    BLOCK1(0x08);
    BLOCK1(0x10);
    BLOCK1(0x20);
    BLOCK1(0x40);
    BLOCK1(0x80);
    BLOCK1(0x1b);
    BLOCK1(0x36);
    // Ten BLOCK1 steps stored rkeys[0..9]; the last derived key is rkeys[10].
    rkeys[idx++] = temp0;
    // Keep the helper macro local to this function.
#undef BLOCK1
}
// From crypto_core/aes128decrypt/dolbeau/aesenc-int
// Derives the 11 AES-128 decryption round keys from `key`: the encryption
// round keys in reverse order, with _mm_aesimc_si128 applied to every key
// except the first and the last.
static inline void aes128ni_setkey_decrypt(const unsigned char *key, __m128i rkeys[11]) {
    __m128i enc_keys[11];
    aes128ni_setkey_encrypt(key, enc_keys);
    rkeys[0] = enc_keys[10];
    for (int r = 1; r <= 9; r++) {
        rkeys[r] = _mm_aesimc_si128(enc_keys[10 - r]);
    }
    rkeys[10] = enc_keys[0];
}
// Allocates room for 11 round keys, stores the pointer in *_schedule and
// fills it with the AES-128 encryption schedule (for_encryption != 0) or
// the decryption schedule (for_encryption == 0) derived from `key`.
// Terminates the process with EXIT_FAILURE if the allocation fails.
void oqs_aes128_load_schedule_ni(const uint8_t *key, void **_schedule, int for_encryption) {
    *_schedule = malloc(11 * sizeof(__m128i));
    // An assert() would be compiled out under NDEBUG and let a failed
    // allocation fall through to a NULL write; check unconditionally.
    if (*_schedule == NULL) {
        exit(EXIT_FAILURE);
    }
    __m128i *schedule = (__m128i *) *_schedule;
    if (for_encryption) {
        aes128ni_setkey_encrypt(key, schedule);
    } else {
        aes128ni_setkey_decrypt(key, schedule);
    }
}
// Releases an AES-NI AES-128 schedule (11 round keys) through
// OQS_MEM_secure_free. A NULL schedule is ignored.
void oqs_aes128_free_schedule_ni(void *schedule) {
    if (schedule == NULL) {
        return;
    }
    OQS_MEM_secure_free(schedule, 11 * sizeof(__m128i));
}
// From crypto_core/aes128encrypt/dolbeau/aesenc-int
// Encrypts the 16-byte block at `n` with the 11 AES-128 round keys in
// `rkeys` and writes the 16-byte result to `out`.
// `n` and `out` are plain byte pointers with no alignment guarantee, so
// they are accessed with the unaligned load/store intrinsics; the aligned
// forms fault when the address is not 16-byte aligned.
static inline void aes128ni_encrypt(const __m128i rkeys[11], const unsigned char *n, unsigned char *out) {
    __m128i nv = _mm_loadu_si128((const __m128i *)n);
    // Initial round-key addition.
    __m128i temp = _mm_xor_si128(nv, rkeys[0]);
    // Nine full rounds (kept unrolled).
    temp = _mm_aesenc_si128(temp, rkeys[1]);
    temp = _mm_aesenc_si128(temp, rkeys[2]);
    temp = _mm_aesenc_si128(temp, rkeys[3]);
    temp = _mm_aesenc_si128(temp, rkeys[4]);
    temp = _mm_aesenc_si128(temp, rkeys[5]);
    temp = _mm_aesenc_si128(temp, rkeys[6]);
    temp = _mm_aesenc_si128(temp, rkeys[7]);
    temp = _mm_aesenc_si128(temp, rkeys[8]);
    temp = _mm_aesenc_si128(temp, rkeys[9]);
    // Final round.
    temp = _mm_aesenclast_si128(temp, rkeys[10]);
    _mm_storeu_si128((__m128i *)(out), temp);
}
// Encrypts one 16-byte block with AES-128 via AES-NI. `_schedule` is the
// opaque schedule pointer, reinterpreted here as an array of round keys.
void oqs_aes128_enc_sch_block_ni(const uint8_t *plaintext, const void *_schedule, uint8_t *ciphertext) {
    aes128ni_encrypt((const __m128i *) _schedule, plaintext, ciphertext);
}
// From crypto_core/aes128decrypt/dolbeau/aesenc-int
// Decrypts the 16-byte block at `n` with the 11 AES-128 decryption round
// keys in `rkeys` and writes the 16-byte result to `out`.
// `n` and `out` are plain byte pointers with no alignment guarantee, so
// they are accessed with the unaligned load/store intrinsics; the aligned
// forms fault when the address is not 16-byte aligned.
static inline void aes128ni_decrypt(const __m128i rkeys[11], const unsigned char *n, unsigned char *out) {
    __m128i nv = _mm_loadu_si128((const __m128i *)n);
    // Initial round-key addition.
    __m128i temp = _mm_xor_si128(nv, rkeys[0]);
    // Nine full inverse rounds (kept unrolled).
    temp = _mm_aesdec_si128(temp, rkeys[1]);
    temp = _mm_aesdec_si128(temp, rkeys[2]);
    temp = _mm_aesdec_si128(temp, rkeys[3]);
    temp = _mm_aesdec_si128(temp, rkeys[4]);
    temp = _mm_aesdec_si128(temp, rkeys[5]);
    temp = _mm_aesdec_si128(temp, rkeys[6]);
    temp = _mm_aesdec_si128(temp, rkeys[7]);
    temp = _mm_aesdec_si128(temp, rkeys[8]);
    temp = _mm_aesdec_si128(temp, rkeys[9]);
    // Final inverse round.
    temp = _mm_aesdeclast_si128(temp, rkeys[10]);
    _mm_storeu_si128((__m128i *)(out), temp);
}
// Decrypts one 16-byte block with AES-128 via AES-NI. `_schedule` is the
// opaque schedule pointer, reinterpreted here as an array of round keys.
void oqs_aes128_dec_sch_block_ni(const uint8_t *ciphertext, const void *_schedule, uint8_t *plaintext) {
    aes128ni_decrypt((const __m128i *) _schedule, ciphertext, plaintext);
}

155
src/common/aes/aes256_ni.c Normal file
View File

@ -0,0 +1,155 @@
// SPDX-License-Identifier: Public domain
// Based on public domain code by Romain Dolbeau
// http://dolbeau.name/dolbeau/crypto/crypto.html
#include <assert.h>
#include <stdint.h>
#include <stdio.h>
#include <oqs/common.h>
#include <wmmintrin.h>
// From crypto_core/aes256encrypt/dolbeau/aesenc-int
// Expands the 32-byte AES-256 key at `key` into the 15 encryption round
// keys rkeys[0..14]. `key` is read with unaligned loads.
static inline void aes256ni_setkey_encrypt(const unsigned char *key, __m128i rkeys[15]) {
    // Cast through the documented __m128i pointer type; the __m128i_u
    // typedef is a compiler-specific extension that not every toolchain's
    // intrinsics headers provide.
    __m128i key0 = _mm_loadu_si128((const __m128i *)(key + 0));
    __m128i key1 = _mm_loadu_si128((const __m128i *)(key + 16));
    __m128i temp0, temp1, temp2, temp4;
    int idx = 0;
    rkeys[idx++] = key0;
    temp0 = key0;
    temp2 = key1;
    /* blockshift-based block by Cedric Bourrasset & Romain Dolbeau */
    // BLOCK1 stores temp2 as the next round key and updates temp0;
    // BLOCK2 stores temp0 as the next round key and updates temp2.
#define BLOCK1(IMM) \
    temp1 = _mm_aeskeygenassist_si128(temp2, IMM); \
    rkeys[idx++] = temp2; \
    temp4 = _mm_slli_si128(temp0,4); \
    temp0 = _mm_xor_si128(temp0,temp4); \
    temp4 = _mm_slli_si128(temp0,8); \
    temp0 = _mm_xor_si128(temp0,temp4); \
    temp1 = _mm_shuffle_epi32(temp1,0xff); \
    temp0 = _mm_xor_si128(temp0,temp1)
#define BLOCK2(IMM) \
    temp1 = _mm_aeskeygenassist_si128(temp0, IMM); \
    rkeys[idx++] = temp0; \
    temp4 = _mm_slli_si128(temp2,4); \
    temp2 = _mm_xor_si128(temp2,temp4); \
    temp4 = _mm_slli_si128(temp2,8); \
    temp2 = _mm_xor_si128(temp2,temp4); \
    temp1 = _mm_shuffle_epi32(temp1,0xaa); \
    temp2 = _mm_xor_si128(temp2,temp1)
    BLOCK1(0x01);
    BLOCK2(0x01);
    BLOCK1(0x02);
    BLOCK2(0x02);
    BLOCK1(0x04);
    BLOCK2(0x04);
    BLOCK1(0x08);
    BLOCK2(0x08);
    BLOCK1(0x10);
    BLOCK2(0x10);
    BLOCK1(0x20);
    BLOCK2(0x20);
    BLOCK1(0x40);
    // rkeys[0] plus thirteen macro steps filled rkeys[0..13]; the last
    // derived key is rkeys[14].
    rkeys[idx++] = temp0;
    // Keep the helper macros local to this function.
#undef BLOCK1
#undef BLOCK2
}
// From crypto_core/aes256decrypt/dolbeau/aesenc-int
// Derives the 15 AES-256 decryption round keys from `key`: the encryption
// round keys in reverse order, with _mm_aesimc_si128 applied to every key
// except the first and the last.
static inline void aes256ni_setkey_decrypt(const unsigned char *key, __m128i rkeys[15]) {
    __m128i enc_keys[15];
    aes256ni_setkey_encrypt(key, enc_keys);
    rkeys[0] = enc_keys[14];
    for (int r = 1; r <= 13; r++) {
        rkeys[r] = _mm_aesimc_si128(enc_keys[14 - r]);
    }
    rkeys[14] = enc_keys[0];
}
// Allocates room for 15 round keys, stores the pointer in *_schedule and
// fills it with the AES-256 encryption schedule (for_encryption != 0) or
// the decryption schedule (for_encryption == 0) derived from `key`.
// Terminates the process with EXIT_FAILURE if the allocation fails.
void oqs_aes256_load_schedule_ni(const uint8_t *key, void **_schedule, int for_encryption) {
    *_schedule = malloc(15 * sizeof(__m128i));
    // An assert() would be compiled out under NDEBUG and let a failed
    // allocation fall through to a NULL write; check unconditionally.
    if (*_schedule == NULL) {
        exit(EXIT_FAILURE);
    }
    __m128i *schedule = (__m128i *) *_schedule;
    if (for_encryption) {
        aes256ni_setkey_encrypt(key, schedule);
    } else {
        aes256ni_setkey_decrypt(key, schedule);
    }
}
// Releases an AES-NI AES-256 schedule (15 round keys) through
// OQS_MEM_secure_free. A NULL schedule is ignored.
void oqs_aes256_free_schedule_ni(void *schedule) {
    if (schedule == NULL) {
        return;
    }
    OQS_MEM_secure_free(schedule, 15 * sizeof(__m128i));
}
// From crypto_core/aes256encrypt/dolbeau/aesenc-int
// Encrypts the 16-byte block at `n` with the 15 AES-256 round keys in
// `rkeys` and writes the 16-byte result to `out`.
// `n` and `out` are plain byte pointers with no alignment guarantee, so
// they are accessed with the unaligned load/store intrinsics; the aligned
// forms fault when the address is not 16-byte aligned.
static inline void aes256ni_encrypt(const __m128i rkeys[15], const unsigned char *n, unsigned char *out) {
    __m128i nv = _mm_loadu_si128((const __m128i *)n);
    // Initial round-key addition.
    __m128i temp = _mm_xor_si128(nv, rkeys[0]);
    // Thirteen full rounds (kept unrolled).
    temp = _mm_aesenc_si128(temp, rkeys[1]);
    temp = _mm_aesenc_si128(temp, rkeys[2]);
    temp = _mm_aesenc_si128(temp, rkeys[3]);
    temp = _mm_aesenc_si128(temp, rkeys[4]);
    temp = _mm_aesenc_si128(temp, rkeys[5]);
    temp = _mm_aesenc_si128(temp, rkeys[6]);
    temp = _mm_aesenc_si128(temp, rkeys[7]);
    temp = _mm_aesenc_si128(temp, rkeys[8]);
    temp = _mm_aesenc_si128(temp, rkeys[9]);
    temp = _mm_aesenc_si128(temp, rkeys[10]);
    temp = _mm_aesenc_si128(temp, rkeys[11]);
    temp = _mm_aesenc_si128(temp, rkeys[12]);
    temp = _mm_aesenc_si128(temp, rkeys[13]);
    // Final round.
    temp = _mm_aesenclast_si128(temp, rkeys[14]);
    _mm_storeu_si128((__m128i *)(out), temp);
}
// Encrypts one 16-byte block with AES-256 via AES-NI. `_schedule` is the
// opaque schedule pointer, reinterpreted here as an array of round keys.
void oqs_aes256_enc_sch_block_ni(const uint8_t *plaintext, const void *_schedule, uint8_t *ciphertext) {
    aes256ni_encrypt((const __m128i *) _schedule, plaintext, ciphertext);
}
// From crypto_core/aes256decrypt/dolbeau/aesenc-int
// Decrypts the 16-byte block at `n` with the 15 AES-256 decryption round
// keys in `rkeys` and writes the 16-byte result to `out`.
// `n` and `out` are plain byte pointers with no alignment guarantee, so
// they are accessed with the unaligned load/store intrinsics; the aligned
// forms fault when the address is not 16-byte aligned.
static inline void aes256ni_decrypt(const __m128i rkeys[15], const unsigned char *n, unsigned char *out) {
    __m128i nv = _mm_loadu_si128((const __m128i *)n);
    // Initial round-key addition.
    __m128i temp = _mm_xor_si128(nv, rkeys[0]);
    // Thirteen full inverse rounds (kept unrolled).
    temp = _mm_aesdec_si128(temp, rkeys[1]);
    temp = _mm_aesdec_si128(temp, rkeys[2]);
    temp = _mm_aesdec_si128(temp, rkeys[3]);
    temp = _mm_aesdec_si128(temp, rkeys[4]);
    temp = _mm_aesdec_si128(temp, rkeys[5]);
    temp = _mm_aesdec_si128(temp, rkeys[6]);
    temp = _mm_aesdec_si128(temp, rkeys[7]);
    temp = _mm_aesdec_si128(temp, rkeys[8]);
    temp = _mm_aesdec_si128(temp, rkeys[9]);
    temp = _mm_aesdec_si128(temp, rkeys[10]);
    temp = _mm_aesdec_si128(temp, rkeys[11]);
    temp = _mm_aesdec_si128(temp, rkeys[12]);
    temp = _mm_aesdec_si128(temp, rkeys[13]);
    // Final inverse round.
    temp = _mm_aesdeclast_si128(temp, rkeys[14]);
    _mm_storeu_si128((__m128i *)(out), temp);
}
// Decrypts one 16-byte block with AES-256 via AES-NI. `_schedule` is the
// opaque schedule pointer, reinterpreted here as an array of round keys.
void oqs_aes256_dec_sch_block_ni(const uint8_t *ciphertext, const void *_schedule, uint8_t *plaintext) {
    aes256ni_decrypt((const __m128i *) _schedule, ciphertext, plaintext);
}

View File

@ -220,7 +220,7 @@ static void key_schedule_core(byte *a, int i) {
// Expand the 16-byte key to 11 round keys (176 bytes)
// http://en.wikipedia.org/wiki/Rijndael_key_schedule#The_key_schedule
void OQS_AES128_ECB_load_schedule(const uint8_t *key, void **_schedule, UNUSED int for_encryption) {
void oqs_aes128_load_schedule_c(const uint8_t *key, void **_schedule, UNUSED int for_encryption) {
*_schedule = malloc(16 * 11);
OQS_EXIT_IF_NULLPTR(*_schedule);
uint8_t *schedule = (uint8_t *) *_schedule;
@ -248,7 +248,7 @@ void OQS_AES128_ECB_load_schedule(const uint8_t *key, void **_schedule, UNUSED i
}
}
void OQS_AES128_free_schedule(void *schedule) {
void oqs_aes128_free_schedule_c(void *schedule) {
if (schedule != NULL) {
OQS_MEM_secure_free(schedule, 176);
}
@ -256,7 +256,7 @@ void OQS_AES128_free_schedule(void *schedule) {
// Expand the 16-byte key to 15 round keys (240 bytes)
// http://en.wikipedia.org/wiki/Rijndael_key_schedule#The_key_schedule
void OQS_AES256_ECB_load_schedule(const uint8_t *key, void **_schedule, UNUSED int for_encryption) {
void oqs_aes256_load_schedule_c(const uint8_t *key, void **_schedule, UNUSED int for_encryption) {
*_schedule = malloc(16 * 15);
OQS_EXIT_IF_NULLPTR(*_schedule);
uint8_t *schedule = (uint8_t *) *_schedule;
@ -288,11 +288,7 @@ void OQS_AES256_ECB_load_schedule(const uint8_t *key, void **_schedule, UNUSED i
}
}
void OQS_AES256_CTR_load_schedule(const uint8_t *key, void **_schedule) {
OQS_AES256_ECB_load_schedule(key, _schedule, 1);
}
void OQS_AES256_free_schedule(void *schedule) {
void oqs_aes256_free_schedule_c(void *schedule) {
if (schedule != NULL) {
OQS_MEM_secure_free(schedule, 16 * 15);
}
@ -446,102 +442,3 @@ void oqs_aes256_dec_sch_block_c(const uint8_t *ciphertext, const void *_schedule
// Reverse the first Round
xor_round_key(plaintext, schedule, 0);
}
void OQS_AES128_ECB_enc(const uint8_t *plaintext, const size_t plaintext_len, const uint8_t *key, uint8_t *ciphertext) {
void *schedule = NULL;
OQS_AES128_ECB_load_schedule(key, &schedule, 1);
OQS_AES128_ECB_enc_sch(plaintext, plaintext_len, schedule, ciphertext);
OQS_AES128_free_schedule(schedule);
}
void OQS_AES128_ECB_enc_sch(const uint8_t *plaintext, const size_t plaintext_len, const void *schedule, uint8_t *ciphertext) {
assert(plaintext_len % 16 == 0);
for (size_t block = 0; block < plaintext_len / 16; block++) {
oqs_aes128_enc_sch_block_c(plaintext + (16 * block), schedule, ciphertext + (16 * block));
}
}
void OQS_AES128_ECB_dec(const uint8_t *ciphertext, const size_t ciphertext_len, const uint8_t *key, uint8_t *plaintext) {
void *schedule = NULL;
OQS_AES128_ECB_load_schedule(key, &schedule, 0);
OQS_AES128_ECB_dec_sch(ciphertext, ciphertext_len, schedule, plaintext);
OQS_AES128_free_schedule(schedule);
}
void OQS_AES128_ECB_dec_sch(const uint8_t *ciphertext, const size_t ciphertext_len, const void *schedule, uint8_t *plaintext) {
assert(ciphertext_len % 16 == 0);
for (size_t block = 0; block < ciphertext_len / 16; block++) {
oqs_aes128_dec_sch_block_c(ciphertext + (16 * block), schedule, plaintext + (16 * block));
}
}
void OQS_AES256_ECB_enc(const uint8_t *plaintext, const size_t plaintext_len, const uint8_t *key, uint8_t *ciphertext) {
void *schedule = NULL;
OQS_AES256_ECB_load_schedule(key, &schedule, 1);
OQS_AES256_ECB_enc_sch(plaintext, plaintext_len, schedule, ciphertext);
OQS_AES256_free_schedule(schedule);
}
void OQS_AES256_ECB_enc_sch(const uint8_t *plaintext, const size_t plaintext_len, const void *schedule, uint8_t *ciphertext) {
assert(plaintext_len % 16 == 0);
for (size_t block = 0; block < plaintext_len / 16; block++) {
oqs_aes256_enc_sch_block_c(plaintext + (16 * block), schedule, ciphertext + (16 * block));
}
}
void OQS_AES256_ECB_dec(const uint8_t *ciphertext, const size_t ciphertext_len, const uint8_t *key, uint8_t *plaintext) {
void *schedule = NULL;
OQS_AES256_ECB_load_schedule(key, &schedule, 0);
OQS_AES256_ECB_dec_sch(ciphertext, ciphertext_len, schedule, plaintext);
OQS_AES256_free_schedule(schedule);
}
void OQS_AES256_ECB_dec_sch(const uint8_t *ciphertext, const size_t ciphertext_len, const void *schedule, uint8_t *plaintext) {
assert(ciphertext_len % 16 == 0);
for (size_t block = 0; block < ciphertext_len / 16; block++) {
oqs_aes256_dec_sch_block_c(ciphertext + (16 * block), schedule, plaintext + (16 * block));
}
}
static inline uint32_t UINT32_TO_BE(const uint32_t x) {
union {
uint32_t val;
uint8_t bytes[4];
} y;
y.bytes[0] = (x >> 24) & 0xFF;
y.bytes[1] = (x >> 16) & 0xFF;
y.bytes[2] = (x >> 8) & 0xFF;
y.bytes[3] = x & 0xFF;
return y.val;
}
#define BE_TO_UINT32(n) (uint32_t)((((uint8_t *) &(n))[0] << 24) | (((uint8_t *) &(n))[1] << 16) | (((uint8_t *) &(n))[2] << 8) | (((uint8_t *) &(n))[3] << 0))
void OQS_AES256_CTR_sch(const uint8_t *iv, size_t iv_len, const void *schedule, uint8_t *out, size_t out_len) {
uint8_t block[16];
uint32_t ctr;
uint32_t ctr_be;
memcpy(block, iv, 12);
if (iv_len == 12) {
ctr = 0;
} else if (iv_len == 16) {
memcpy(&ctr_be, &iv[12], 4);
ctr = BE_TO_UINT32(ctr_be);
} else {
exit(EXIT_FAILURE);
}
while (out_len >= 16) {
ctr_be = UINT32_TO_BE(ctr);
memcpy(&block[12], (uint8_t *) &ctr_be, 4);
oqs_aes256_enc_sch_block_c(block, schedule, out);
out += 16;
out_len -= 16;
ctr++;
}
if (out_len > 0) {
uint8_t tmp[16];
ctr_be = UINT32_TO_BE(ctr);
memcpy(&block[12], (uint8_t *) &ctr_be, 4);
oqs_aes256_enc_sch_block_c(block, schedule, tmp);
memcpy(out, tmp, out_len);
}
}

View File

@ -0,0 +1,31 @@
// SPDX-License-Identifier: MIT
// Internal prototypes shared by the AES dispatcher and its two back ends:
// the portable C implementation (*_c) and the AES-NI implementation (*_ni).
#ifndef OQS_AES_LOCAL_H
#define OQS_AES_LOCAL_H

#include <stddef.h> /* size_t */
#include <stdint.h>

// The *_c load-schedule prototypes use the UNUSED annotation, which this
// header does not define; presumably it comes from <oqs/common.h>, which
// must then be included before this header. TODO confirm.

/* ---- AES-128, AES-NI ---- */
// NOTE(review): this prototype has no for_encryption parameter, unlike its
// AES-256 counterpart below. Confirm it matches the definition of
// oqs_aes128_load_schedule_ni and update the prototype and its caller
// together if it does not.
void oqs_aes128_load_schedule_ni(const uint8_t *key, void **_schedule);
void oqs_aes128_free_schedule_ni(void *schedule);
void oqs_aes128_enc_sch_block_ni(const uint8_t *plaintext, const void *_schedule, uint8_t *ciphertext);
void oqs_aes128_dec_sch_block_ni(const uint8_t *ciphertext, const void *_schedule, uint8_t *plaintext);
void oqs_aes128_ecb_enc_sch_ni(const uint8_t *plaintext, const size_t plaintext_len, const void *schedule, uint8_t *ciphertext);
void oqs_aes128_ecb_dec_sch_ni(const uint8_t *ciphertext, const size_t ciphertext_len, const void *schedule, uint8_t *plaintext);

/* ---- AES-128, portable C ---- */
void oqs_aes128_load_schedule_c(const uint8_t *key, void **_schedule, UNUSED int for_encryption);
void oqs_aes128_free_schedule_c(void *schedule);
void oqs_aes128_enc_sch_block_c(const uint8_t *plaintext, const void *_schedule, uint8_t *ciphertext);
void oqs_aes128_dec_sch_block_c(const uint8_t *ciphertext, const void *_schedule, uint8_t *plaintext);
void oqs_aes128_ecb_enc_sch_c(const uint8_t *plaintext, const size_t plaintext_len, const void *schedule, uint8_t *ciphertext);
void oqs_aes128_ecb_dec_sch_c(const uint8_t *ciphertext, const size_t ciphertext_len, const void *schedule, uint8_t *plaintext);

/* ---- AES-256, AES-NI ---- */
void oqs_aes256_load_schedule_ni(const uint8_t *key, void **_schedule, int for_encryption);
void oqs_aes256_free_schedule_ni(void *schedule);
void oqs_aes256_enc_sch_block_ni(const uint8_t *plaintext, const void *_schedule, uint8_t *ciphertext);
void oqs_aes256_dec_sch_block_ni(const uint8_t *ciphertext, const void *_schedule, uint8_t *plaintext);
void oqs_aes256_ecb_enc_sch_ni(const uint8_t *plaintext, const size_t plaintext_len, const void *schedule, uint8_t *ciphertext);
void oqs_aes256_ecb_dec_sch_ni(const uint8_t *ciphertext, const size_t ciphertext_len, const void *schedule, uint8_t *plaintext);

/* ---- AES-256, portable C ---- */
void oqs_aes256_load_schedule_c(const uint8_t *key, void **_schedule, UNUSED int for_encryption);
void oqs_aes256_free_schedule_c(void *schedule);
void oqs_aes256_enc_sch_block_c(const uint8_t *plaintext, const void *_schedule, uint8_t *ciphertext);
void oqs_aes256_dec_sch_block_c(const uint8_t *ciphertext, const void *_schedule, uint8_t *plaintext);
void oqs_aes256_ecb_enc_sch_c(const uint8_t *plaintext, const size_t plaintext_len, const void *schedule, uint8_t *ciphertext);
void oqs_aes256_ecb_dec_sch_c(const uint8_t *ciphertext, const size_t ciphertext_len, const void *schedule, uint8_t *plaintext);

#endif /* OQS_AES_LOCAL_H */

View File

@ -38,6 +38,22 @@ static void print_platform_info(void) {
#include <openssl/opensslv.h>
#endif
/*
 * C_OR_NI(stmt_c, stmt_ni): execute exactly one of the two statements.
 *
 *   - CPU extensions + portable build: query the CPU at run time and run
 *     stmt_ni when AES is reported as enabled, stmt_c otherwise.
 *   - CPU extensions, non-portable build: always run stmt_ni.
 *   - No CPU extensions: always run stmt_c.
 *
 * Every variant expands to a single compound statement, so the temporary
 * used for the run-time query stays out of the caller's scope. Invoke the
 * macro without a trailing semicolon.
 */
#if defined(OQS_USE_CPU_EXTENSIONS) && defined(OQS_PORTABLE_BUILD)
#define C_OR_NI(stmt_c, stmt_ni) \
    { \
        OQS_CPU_EXTENSIONS available_cpu_extensions = OQS_get_available_CPU_extensions(); \
        if (available_cpu_extensions.AES_ENABLED) { \
            stmt_ni; \
        } else { \
            stmt_c; \
        } \
    }
#elif defined(OQS_USE_CPU_EXTENSIONS) /* && !defined(OQS_PORTABLE_BUILD) */
#define C_OR_NI(stmt_c, stmt_ni) \
    { \
        stmt_ni; \
    }
#else /* !defined(OQS_USE_CPU_EXTENSIONS) */
#define C_OR_NI(stmt_c, stmt_ni) \
    { \
        stmt_c; \
    }
#endif
static void print_oqs_configuration(void) {
printf("OQS version: %s\n", OQS_VERSION_TEXT);
#if defined(OQS_COMPILE_GIT_COMMIT)
@ -51,7 +67,10 @@ static void print_oqs_configuration(void) {
#if defined(OQS_USE_AES_OPENSSL)
printf("AES: OpenSSL\n");
#else
printf("AES: C\n");
C_OR_NI(
printf("AES: C\n"),
printf("AES: NI\n")
)
#endif
#if defined(OQS_USE_SHA2_OPENSSL)
printf("SHA-2: OpenSSL\n");

View File

@ -109,7 +109,7 @@ static int test_aes256ctr_correctness(void) {
static void speed_aes128(void) {
uint8_t plaintext[16], ciphertext[16];
void *schedule = NULL, *schedule_dec = NULL;
TIME_OPERATION_SECONDS({ OQS_AES128_ECB_load_schedule(test_aes128_key, &schedule, 1); OQS_AES128_free_schedule(schedule); }, "OQS_AES128_ECB_load+free_schedule", BENCH_DURATION);
TIME_OPERATION_SECONDS({ OQS_AES128_ECB_load_schedule(test_aes128_key, &schedule, 1); OQS_AES128_free_schedule(schedule); }, "OQS_AES128_ECB_load+free_sch", BENCH_DURATION);
OQS_AES128_ECB_load_schedule(test_aes128_key, &schedule, 1);
OQS_AES128_ECB_load_schedule(test_aes128_key, &schedule_dec, 0);
@ -124,7 +124,7 @@ static void speed_aes128(void) {
static void speed_aes256(void) {
uint8_t plaintext[16], ciphertext[16];
void *schedule = NULL, *schedule_dec = NULL;
TIME_OPERATION_SECONDS({ OQS_AES256_ECB_load_schedule(test_aes256_key, &schedule, 1); OQS_AES256_free_schedule(schedule); }, "OQS_AES256_ECB_load+free_schedule", BENCH_DURATION);
TIME_OPERATION_SECONDS({ OQS_AES256_ECB_load_schedule(test_aes256_key, &schedule, 1); OQS_AES256_free_schedule(schedule); }, "OQS_AES256_ECB_load+free_sch", BENCH_DURATION);
OQS_AES256_ECB_load_schedule(test_aes256_key, &schedule, 1);
OQS_AES256_ECB_load_schedule(test_aes256_key, &schedule_dec, 0);