From d91030d31ed1add9fb89c382b21ebd3355529636 Mon Sep 17 00:00:00 2001 From: Douglas Stebila Date: Thu, 27 Jun 2019 14:35:08 -0400 Subject: [PATCH] Use Frodo optimized C implementation --- src/kem/frodokem/external/frodo1344aes.c | 2 +- src/kem/frodokem/external/frodo1344shake.c | 2 +- src/kem/frodokem/external/frodo640aes.c | 2 +- src/kem/frodokem/external/frodo640shake.c | 2 +- src/kem/frodokem/external/frodo976aes.c | 2 +- src/kem/frodokem/external/frodo976shake.c | 2 +- .../external/frodo_macrify_optimized.c | 277 ++++++++++++++++++ 7 files changed, 283 insertions(+), 6 deletions(-) create mode 100644 src/kem/frodokem/external/frodo_macrify_optimized.c diff --git a/src/kem/frodokem/external/frodo1344aes.c b/src/kem/frodokem/external/frodo1344aes.c index d043c215d..2aaefae92 100644 --- a/src/kem/frodokem/external/frodo1344aes.c +++ b/src/kem/frodokem/external/frodo1344aes.c @@ -52,4 +52,4 @@ #include "kem.c" #include "noise.c" #include "util.c" -#include "frodo_macrify_reference.c" +#include "frodo_macrify_optimized.c" diff --git a/src/kem/frodokem/external/frodo1344shake.c b/src/kem/frodokem/external/frodo1344shake.c index 5e00a3143..cedaf4f58 100644 --- a/src/kem/frodokem/external/frodo1344shake.c +++ b/src/kem/frodokem/external/frodo1344shake.c @@ -52,4 +52,4 @@ #include "kem.c" #include "noise.c" #include "util.c" -#include "frodo_macrify_reference.c" +#include "frodo_macrify_optimized.c" diff --git a/src/kem/frodokem/external/frodo640aes.c b/src/kem/frodokem/external/frodo640aes.c index 53c0e783e..2f7d897d2 100644 --- a/src/kem/frodokem/external/frodo640aes.c +++ b/src/kem/frodokem/external/frodo640aes.c @@ -52,4 +52,4 @@ #include "kem.c" #include "noise.c" #include "util.c" -#include "frodo_macrify_reference.c" +#include "frodo_macrify_optimized.c" diff --git a/src/kem/frodokem/external/frodo640shake.c b/src/kem/frodokem/external/frodo640shake.c index c4c890858..ff6802822 100644 --- a/src/kem/frodokem/external/frodo640shake.c +++ 
b/src/kem/frodokem/external/frodo640shake.c @@ -52,4 +52,4 @@ #include "kem.c" #include "noise.c" #include "util.c" -#include "frodo_macrify_reference.c" +#include "frodo_macrify_optimized.c" diff --git a/src/kem/frodokem/external/frodo976aes.c b/src/kem/frodokem/external/frodo976aes.c index c33a4306c..b54a4cbfb 100644 --- a/src/kem/frodokem/external/frodo976aes.c +++ b/src/kem/frodokem/external/frodo976aes.c @@ -52,4 +52,4 @@ #include "kem.c" #include "noise.c" #include "util.c" -#include "frodo_macrify_reference.c" +#include "frodo_macrify_optimized.c" diff --git a/src/kem/frodokem/external/frodo976shake.c b/src/kem/frodokem/external/frodo976shake.c index b8f9b3e20..d27ceecf3 100644 --- a/src/kem/frodokem/external/frodo976shake.c +++ b/src/kem/frodokem/external/frodo976shake.c @@ -52,4 +52,4 @@ #include "kem.c" #include "noise.c" #include "util.c" -#include "frodo_macrify_reference.c" +#include "frodo_macrify_optimized.c" diff --git a/src/kem/frodokem/external/frodo_macrify_optimized.c b/src/kem/frodokem/external/frodo_macrify_optimized.c new file mode 100644 index 000000000..d2654c746 --- /dev/null +++ b/src/kem/frodokem/external/frodo_macrify_optimized.c @@ -0,0 +1,277 @@ +/******************************************************************************************** +* FrodoKEM: Learning with Errors Key Encapsulation +* +* Abstract: matrix arithmetic functions used by the KEM +*********************************************************************************************/ + +#include +#include +#include + +#include + +#include "frodo_internal.h" + +int frodo_mul_add_as_plus_e(uint16_t *out, const uint16_t *s, const uint16_t *e, const uint8_t *seed_A) +{ // Generate-and-multiply: generate matrix A (N x N) row-wise, multiply by s on the right. 
+ // Inputs: s, e (N x N_BAR); s is read with PARAMS_N consecutive values per output column k, i.e. in transposed storage order + // Output: out = A*s + e (N x N_BAR); the function always returns 1 + int i, j, k; + ALIGN_HEADER(32) int16_t a_row[4*PARAMS_N] ALIGN_FOOTER(32) = {0}; + + for (i = 0; i < (PARAMS_N*PARAMS_NBAR); i += 2) { + *((uint32_t*)&out[i]) = *((uint32_t*)&e[i]); // out = e, copied two uint16_t per step through uint32_t casts. NOTE(review): this type-puns uint16_t storage and needs out/e to be 4-byte aligned; confirm callers guarantee it + } + +#if defined(USE_AES128_FOR_A) + int16_t a_row_temp[4*PARAMS_N] = {0}; // Take four lines of A at once + uint8_t *aes_key_schedule; + OQS_AES128_load_schedule(seed_A, (void **) &aes_key_schedule, 1); + + for (j = 0; j < PARAMS_N; j += PARAMS_STRIPE_STEP) { + a_row_temp[j + 1 + 0*PARAMS_N] = UINT16_TO_LE(j); // Column index j, stored little-endian in the second 16-bit word of each PARAMS_STRIPE_STEP-word group (set once, reused for every i) + a_row_temp[j + 1 + 1*PARAMS_N] = UINT16_TO_LE(j); + a_row_temp[j + 1 + 2*PARAMS_N] = UINT16_TO_LE(j); + a_row_temp[j + 1 + 3*PARAMS_N] = UINT16_TO_LE(j); + } + + for (i = 0; i < PARAMS_N; i += 4) { + for (j = 0; j < PARAMS_N; j += PARAMS_STRIPE_STEP) { // Go through A, four rows at a time + a_row_temp[j + 0*PARAMS_N] = UINT16_TO_LE(i+0); // Row index, stored little-endian in the first 16-bit word of each group + a_row_temp[j + 1*PARAMS_N] = UINT16_TO_LE(i+1); + a_row_temp[j + 2*PARAMS_N] = UINT16_TO_LE(i+2); + a_row_temp[j + 3*PARAMS_N] = UINT16_TO_LE(i+3); + } + + OQS_AES128_ECB_enc_sch((uint8_t*)a_row_temp, 4*PARAMS_N*sizeof(int16_t), aes_key_schedule, (uint8_t*)a_row); +#elif defined (USE_SHAKE128_FOR_A) + uint8_t seed_A_separated[2 + BYTES_SEED_A]; + uint16_t* seed_A_origin = (uint16_t*)&seed_A_separated; + memcpy(&seed_A_separated[2], seed_A, BYTES_SEED_A); // SHAKE128 input is a 2-byte little-endian row index (written through seed_A_origin) followed by seed_A + for (i = 0; i < PARAMS_N; i += 4) { + seed_A_origin[0] = UINT16_TO_LE(i + 0); + OQS_SHA3_shake128((unsigned char*)(a_row + 0*PARAMS_N), (unsigned long long)(2*PARAMS_N), seed_A_separated, 2 + BYTES_SEED_A); + seed_A_origin[0] = UINT16_TO_LE(i + 1); + OQS_SHA3_shake128((unsigned char*)(a_row + 1*PARAMS_N), (unsigned long long)(2*PARAMS_N), seed_A_separated, 2 + BYTES_SEED_A); + seed_A_origin[0] = UINT16_TO_LE(i + 2); + OQS_SHA3_shake128((unsigned char*)(a_row + 2*PARAMS_N), (unsigned long long)(2*PARAMS_N), seed_A_separated, 2 + 
BYTES_SEED_A); + seed_A_origin[0] = UINT16_TO_LE(i + 3); + OQS_SHA3_shake128((unsigned char*)(a_row + 3*PARAMS_N), (unsigned long long)(2*PARAMS_N), seed_A_separated, 2 + BYTES_SEED_A); +#endif + for (k = 0; k < 4 * PARAMS_N; k++) { + a_row[k] = LE_TO_UINT16(a_row[k]); + } + for (k = 0; k < PARAMS_NBAR; k++) { + uint16_t sum[4] = {0}; + for (j = 0; j < PARAMS_N; j++) { // Matrix-vector multiplication + uint16_t sp = s[k*PARAMS_N + j]; + sum[0] += a_row[0*PARAMS_N + j] * sp; // Go through four lines with same s + sum[1] += a_row[1*PARAMS_N + j] * sp; + sum[2] += a_row[2*PARAMS_N + j] * sp; + sum[3] += a_row[3*PARAMS_N + j] * sp; + } + out[(i+0)*PARAMS_NBAR + k] += sum[0]; + out[(i+2)*PARAMS_NBAR + k] += sum[2]; + out[(i+1)*PARAMS_NBAR + k] += sum[1]; + out[(i+3)*PARAMS_NBAR + k] += sum[3]; + } + } + +#if defined(USE_AES128_FOR_A) + OQS_AES128_free_schedule(aes_key_schedule); +#endif + return 1; +} + + +int frodo_mul_add_sa_plus_e(uint16_t *out, const uint16_t *s, const uint16_t *e, const uint8_t *seed_A) +{ // Generate-and-multiply: generate matrix A (N x N) column-wise, multiply by s' on the left. + // Inputs: s', e' (N_BAR x N) + // Output: out = s'*A + e' (N_BAR x N); the function always returns 1 + int i, j, kk; + + for (i = 0; i < (PARAMS_N*PARAMS_NBAR); i += 2) { + *((uint32_t*)&out[i]) = *((uint32_t*)&e[i]); // out = e, two uint16_t per step via uint32_t casts. NOTE(review): type-punned access that needs 4-byte alignment of out/e; confirm + } + +#if defined(USE_AES128_FOR_A) + int k; + uint16_t a_cols[PARAMS_N*PARAMS_STRIPE_STEP] = {0}; + ALIGN_HEADER(32) uint16_t a_cols_t[PARAMS_N*PARAMS_STRIPE_STEP] ALIGN_FOOTER(32) = {0}; + uint16_t a_cols_temp[PARAMS_N*PARAMS_STRIPE_STEP] = {0}; + uint8_t *aes_key_schedule; + OQS_AES128_load_schedule(seed_A, (void **) &aes_key_schedule, 1); + + for (i = 0, j = 0; i < PARAMS_N; i++, j += PARAMS_STRIPE_STEP) { + a_cols_temp[j] = UINT16_TO_LE(i); // Loading values in the little-endian order + } + + for (kk = 0; kk < PARAMS_N; kk += PARAMS_STRIPE_STEP) { // Go through A's columns, 8 (== PARAMS_STRIPE_STEP) columns at a time. 
+ for (i = 0; i < (PARAMS_N*PARAMS_STRIPE_STEP); i += PARAMS_STRIPE_STEP) { + a_cols_temp[i + 1] = UINT16_TO_LE(kk); // Loading values in the little-endian order + } + + OQS_AES128_ECB_enc_sch((uint8_t*)a_cols_temp, PARAMS_N*PARAMS_STRIPE_STEP*sizeof(int16_t), aes_key_schedule, (uint8_t*)a_cols); + + for (i = 0; i < PARAMS_N; i++) { // Transpose a_cols to have access to it in the column-major order. + for (k = 0; k < PARAMS_STRIPE_STEP; k++) { + a_cols_t[k*PARAMS_N + i] = LE_TO_UINT16(a_cols[i*PARAMS_STRIPE_STEP + k]); + } + } + + for (i = 0; i < PARAMS_NBAR; i++) { + for (k = 0; k < PARAMS_STRIPE_STEP; k += PARAMS_PARALLEL) { + uint16_t sum[PARAMS_PARALLEL] = {0}; // NOTE(review): the body below hard-codes four accumulators (sum[0..3]), so PARAMS_PARALLEL is presumably 4; confirm in the parameter header + for (j = 0; j < PARAMS_N; j++) { // Matrix-vector multiplication + uint16_t sp = s[i*PARAMS_N + j]; + sum[0] += sp * a_cols_t[(k+0)*PARAMS_N + j]; + sum[1] += sp * a_cols_t[(k+1)*PARAMS_N + j]; + sum[2] += sp * a_cols_t[(k+2)*PARAMS_N + j]; + sum[3] += sp * a_cols_t[(k+3)*PARAMS_N + j]; + } + out[i*PARAMS_N + kk + k + 0] += sum[0]; + out[i*PARAMS_N + kk + k + 2] += sum[2]; + out[i*PARAMS_N + kk + k + 1] += sum[1]; + out[i*PARAMS_N + kk + k + 3] += sum[3]; + } + } + } + OQS_AES128_free_schedule(aes_key_schedule); + +#elif defined (USE_SHAKE128_FOR_A) // SHAKE128 + int t=0; // Offset into a_cols used by the multiply below; it is never modified in this function + ALIGN_HEADER(32) uint16_t a_cols[4*PARAMS_N] ALIGN_FOOTER(32) = {0}; + + int k; + uint8_t seed_A_separated[2 + BYTES_SEED_A]; + uint16_t* seed_A_origin = (uint16_t*)&seed_A_separated; + memcpy(&seed_A_separated[2], seed_A, BYTES_SEED_A); + for (kk = 0; kk < PARAMS_N; kk+=4) { // Generates the four vectors of A indexed kk..kk+3 (PARAMS_N values each); the product below consumes them as rows of A despite the a_cols name + seed_A_origin[0] = UINT16_TO_LE(kk + 0); + OQS_SHA3_shake128((unsigned char*)(a_cols + 0*PARAMS_N), (unsigned long long)(2*PARAMS_N), seed_A_separated, 2 + BYTES_SEED_A); + seed_A_origin[0] = UINT16_TO_LE(kk + 1); + OQS_SHA3_shake128((unsigned char*)(a_cols + 1*PARAMS_N), (unsigned long long)(2*PARAMS_N), seed_A_separated, 2 + BYTES_SEED_A); + seed_A_origin[0] = UINT16_TO_LE(kk + 2); + OQS_SHA3_shake128((unsigned char*)(a_cols + 2*PARAMS_N), (unsigned long 
long)(2*PARAMS_N), seed_A_separated, 2 + BYTES_SEED_A); + seed_A_origin[0] = UINT16_TO_LE(kk + 3); + OQS_SHA3_shake128((unsigned char*)(a_cols + 3*PARAMS_N), (unsigned long long)(2*PARAMS_N), seed_A_separated, 2 + BYTES_SEED_A); + for (i = 0; i < 4 * PARAMS_N; i++) { + a_cols[i] = LE_TO_UINT16(a_cols[i]); + } + + for (i = 0; i < PARAMS_NBAR; i++) { + uint16_t sum[PARAMS_N] = {0}; + for (j = 0; j < 4; j++) { + uint16_t sp = s[i*PARAMS_N + kk + j]; + for (k = 0; k < PARAMS_N; k++) { // Matrix-vector multiplication + sum[k] += sp * a_cols[(t+j)*PARAMS_N + k]; + } + } + for(k = 0; k < PARAMS_N; k++){ + out[i*PARAMS_N + k] += sum[k]; + } + } + } +#endif + + return 1; +} + + +void frodo_mul_bs(uint16_t *out, const uint16_t *b, const uint16_t *s) +{ // Multiply by s on the right + // Inputs: b (N_BAR x N), s (N x N_BAR) + // Output: out = b*s (N_BAR x N_BAR) + int i, j, k; + + for (i = 0; i < PARAMS_NBAR; i++) { + for (j = 0; j < PARAMS_NBAR; j++) { + out[i*PARAMS_NBAR + j] = 0; + for (k = 0; k < PARAMS_N; k++) { + out[i*PARAMS_NBAR + j] += b[i*PARAMS_N + k] * s[j*PARAMS_N + k]; + } + out[i*PARAMS_NBAR + j] = (uint32_t)(out[i*PARAMS_NBAR + j]) & ((1<>= PARAMS_EXTRACTED_BITS; + pos++; + } + } +} + + +void frodo_key_decode(uint16_t *out, const uint16_t *in) +{ // Decoding: rounds each input word (taken mod 2^PARAMS_LOGQ) to PARAMS_EXTRACTED_BITS bits and packs the results, eight values per group, into out as bytes + unsigned int i, j, index = 0, npieces_word = 8; + unsigned int nwords = (PARAMS_NBAR * PARAMS_NBAR) / 8; + uint16_t temp, maskex=((uint16_t)1 << PARAMS_EXTRACTED_BITS) -1, maskq =((uint16_t)1 << PARAMS_LOGQ) -1; + uint8_t *pos = (uint8_t*)out; + uint64_t templong; + + for (i = 0; i < nwords; i++) { + templong = 0; + for (j = 0; j < npieces_word; j++) { // temp = floor(in*2^{-(PARAMS_LOGQ-PARAMS_EXTRACTED_BITS)}+0.5) + temp = ((in[index] & maskq) + (1 << (PARAMS_LOGQ - PARAMS_EXTRACTED_BITS - 1))) >> (PARAMS_LOGQ - PARAMS_EXTRACTED_BITS); + templong |= ((uint64_t)(temp & maskex)) << (PARAMS_EXTRACTED_BITS * j); + index++; + } + for(j = 0; j < PARAMS_EXTRACTED_BITS; j++) + pos[i*PARAMS_EXTRACTED_BITS + j] = (templong >> (8*j)) & 0xFF; + } 
+} \ No newline at end of file