liboqs/scripts/copy_from_upstream/patches/pqcrystals-kyber-avx2-shake-aes.patch

254 lines
9.2 KiB
Diff

c6a44a0dbb6735caf40ad4856063282feab56d98
diff --git a/avx2/indcpa.c b/avx2/indcpa.c
index 926f6e87..b8840863 100644
--- a/avx2/indcpa.c
+++ b/avx2/indcpa.c
@@ -178,7 +178,7 @@ void gen_matrix(polyvec *a, const uint8_t seed[32], int transposed)
ALIGNED_UINT8(REJ_UNIFORM_AVX_NBLOCKS*AES256CTR_BLOCKBYTES) buf;
aes256ctr_ctx state;
- aes256ctr_init(&state, seed, 0);
+ aes256ctr_init_key(&state, seed);
for(i=0;i<KYBER_K;i++) {
for(j=0;j<KYBER_K;j++) {
@@ -187,7 +187,7 @@ void gen_matrix(polyvec *a, const uint8_t seed[32], int transposed)
else
nonce = (i << 8) | j;
- state.n = _mm_loadl_epi64((__m128i *)&nonce);
+ aes256ctr_init_iv_u64(&state, nonce);
aes256ctr_squeezeblocks(buf.coeffs, REJ_UNIFORM_AVX_NBLOCKS, &state);
buflen = REJ_UNIFORM_AVX_NBLOCKS*AES256CTR_BLOCKBYTES;
ctr = rej_uniform_avx(a[i].vec[j].coeffs, buf.coeffs);
@@ -204,6 +204,7 @@ void gen_matrix(polyvec *a, const uint8_t seed[32], int transposed)
poly_nttunpack(&a[i].vec[j]);
}
}
+ aes256_ctx_release(&state);
}
#else
#if KYBER_K == 2
@@ -212,7 +213,7 @@ void gen_matrix(polyvec *a, const uint8_t seed[32], int transposed)
unsigned int ctr0, ctr1, ctr2, ctr3;
ALIGNED_UINT8(REJ_UNIFORM_AVX_NBLOCKS*SHAKE128_RATE) buf[4];
__m256i f;
- keccakx4_state state;
+ shake128x4incctx state;
f = _mm256_loadu_si256((__m256i *)seed);
_mm256_store_si256(buf[0].vec, f);
@@ -241,6 +242,7 @@ void gen_matrix(polyvec *a, const uint8_t seed[32], int transposed)
buf[3].coeffs[33] = 1;
}
+ shake128x4_inc_init(&state);
shake128x4_absorb_once(&state, buf[0].coeffs, buf[1].coeffs, buf[2].coeffs, buf[3].coeffs, 34);
shake128x4_squeezeblocks(buf[0].coeffs, buf[1].coeffs, buf[2].coeffs, buf[3].coeffs, REJ_UNIFORM_AVX_NBLOCKS, &state);
@@ -262,6 +264,7 @@ void gen_matrix(polyvec *a, const uint8_t seed[32], int transposed)
poly_nttunpack(&a[0].vec[1]);
poly_nttunpack(&a[1].vec[0]);
poly_nttunpack(&a[1].vec[1]);
+ shake128x4_inc_ctx_release(&state);
}
#elif KYBER_K == 3
void gen_matrix(polyvec *a, const uint8_t seed[32], int transposed)
@@ -269,8 +272,8 @@ void gen_matrix(polyvec *a, const uint8_t seed[32], int transposed)
unsigned int ctr0, ctr1, ctr2, ctr3;
ALIGNED_UINT8(REJ_UNIFORM_AVX_NBLOCKS*SHAKE128_RATE) buf[4];
__m256i f;
- keccakx4_state state;
- keccak_state state1x;
+ shake128x4incctx state;
+ shake128incctx state1x;
f = _mm256_loadu_si256((__m256i *)seed);
_mm256_store_si256(buf[0].vec, f);
@@ -299,6 +302,7 @@ void gen_matrix(polyvec *a, const uint8_t seed[32], int transposed)
buf[3].coeffs[33] = 1;
}
+ shake128x4_inc_init(&state);
shake128x4_absorb_once(&state, buf[0].coeffs, buf[1].coeffs, buf[2].coeffs, buf[3].coeffs, 34);
shake128x4_squeezeblocks(buf[0].coeffs, buf[1].coeffs, buf[2].coeffs, buf[3].coeffs, REJ_UNIFORM_AVX_NBLOCKS, &state);
@@ -364,6 +368,7 @@ void gen_matrix(polyvec *a, const uint8_t seed[32], int transposed)
ctr2 += rej_uniform(a[2].vec[0].coeffs + ctr2, KYBER_N - ctr2, buf[2].coeffs, SHAKE128_RATE);
ctr3 += rej_uniform(a[2].vec[1].coeffs + ctr3, KYBER_N - ctr3, buf[3].coeffs, SHAKE128_RATE);
}
+ shake128x4_inc_ctx_release(&state);
poly_nttunpack(&a[1].vec[1]);
poly_nttunpack(&a[1].vec[2]);
@@ -374,6 +379,8 @@ void gen_matrix(polyvec *a, const uint8_t seed[32], int transposed)
_mm256_store_si256(buf[0].vec, f);
buf[0].coeffs[32] = 2;
buf[0].coeffs[33] = 2;
+
+ shake128_inc_init(&state1x);
shake128_absorb_once(&state1x, buf[0].coeffs, 34);
shake128_squeezeblocks(buf[0].coeffs, REJ_UNIFORM_AVX_NBLOCKS, &state1x);
ctr0 = rej_uniform_avx(a[2].vec[2].coeffs, buf[0].coeffs);
@@ -381,6 +388,7 @@ void gen_matrix(polyvec *a, const uint8_t seed[32], int transposed)
shake128_squeezeblocks(buf[0].coeffs, 1, &state1x);
ctr0 += rej_uniform(a[2].vec[2].coeffs + ctr0, KYBER_N - ctr0, buf[0].coeffs, SHAKE128_RATE);
}
+ shake128_inc_ctx_release(&state1x);
poly_nttunpack(&a[2].vec[2]);
}
@@ -390,7 +398,8 @@ void gen_matrix(polyvec *a, const uint8_t seed[32], int transposed)
unsigned int i, ctr0, ctr1, ctr2, ctr3;
ALIGNED_UINT8(REJ_UNIFORM_AVX_NBLOCKS*SHAKE128_RATE) buf[4];
__m256i f;
- keccakx4_state state;
+ shake128x4incctx state;
+ shake128x4_inc_init(&state);
for(i=0;i<4;i++) {
f = _mm256_loadu_si256((__m256i *)seed);
@@ -442,6 +451,7 @@ void gen_matrix(polyvec *a, const uint8_t seed[32], int transposed)
poly_nttunpack(&a[i].vec[2]);
poly_nttunpack(&a[i].vec[3]);
}
+ shake128x4_inc_ctx_release(&state);
}
#endif
#endif
@@ -476,19 +486,20 @@ void indcpa_keypair(uint8_t pk[KYBER_INDCPA_PUBLICKEYBYTES],
uint64_t nonce = 0;
ALIGNED_UINT8(NOISE_NBLOCKS*AES256CTR_BLOCKBYTES+32) coins; // +32 bytes as required by poly_cbd_eta1
aes256ctr_ctx state;
- aes256ctr_init(&state, noiseseed, nonce++);
+ aes256ctr_init_u64(&state, noiseseed, nonce++);
for(i=0;i<KYBER_K;i++) {
aes256ctr_squeezeblocks(coins.coeffs, NOISE_NBLOCKS, &state);
- state.n = _mm_loadl_epi64((__m128i *)&nonce);
+ aes256ctr_init_iv_u64(&state, nonce);
nonce += 1;
poly_cbd_eta1(&skpv.vec[i], coins.vec);
}
for(i=0;i<KYBER_K;i++) {
aes256ctr_squeezeblocks(coins.coeffs, NOISE_NBLOCKS, &state);
- state.n = _mm_loadl_epi64((__m128i *)&nonce);
+ aes256ctr_init_iv_u64(&state, nonce);
nonce += 1;
poly_cbd_eta1(&e.vec[i], coins.vec);
}
+ aes256_ctx_release(&state);
#else
#if KYBER_K == 2
poly_getnoise_eta1_4x(skpv.vec+0, skpv.vec+1, e.vec+0, e.vec+1, noiseseed, 0, 1, 2, 3);
@@ -554,20 +565,22 @@ void indcpa_enc(uint8_t c[KYBER_INDCPA_BYTES],
uint64_t nonce = 0;
ALIGNED_UINT8(NOISE_NBLOCKS*AES256CTR_BLOCKBYTES+32) buf; /* +32 bytes as required by poly_cbd_eta1 */
aes256ctr_ctx state;
- aes256ctr_init(&state, coins, nonce++);
+ aes256ctr_init_u64(&state, coins, nonce++);
for(i=0;i<KYBER_K;i++) {
aes256ctr_squeezeblocks(buf.coeffs, NOISE_NBLOCKS, &state);
- state.n = _mm_loadl_epi64((__m128i *)&nonce);
+ aes256ctr_init_iv_u64(&state, nonce);
nonce += 1;
poly_cbd_eta1(&sp.vec[i], buf.vec);
}
for(i=0;i<KYBER_K;i++) {
aes256ctr_squeezeblocks(buf.coeffs, CIPHERTEXTNOISE_NBLOCKS, &state);
- state.n = _mm_loadl_epi64((__m128i *)&nonce);
+ aes256ctr_init_iv_u64(&state, nonce);
nonce += 1;
poly_cbd_eta2(&ep.vec[i], buf.vec);
}
aes256ctr_squeezeblocks(buf.coeffs, CIPHERTEXTNOISE_NBLOCKS, &state);
+ aes256_ctx_release(&state);
+
poly_cbd_eta2(&epp, buf.vec);
#else
#if KYBER_K == 2
diff --git a/avx2/poly.c b/avx2/poly.c
index ab148a2..96bad86 100644
--- a/avx2/poly.c
+++ b/avx2/poly.c
@@ -2,6 +2,7 @@
#include <immintrin.h>
#include <string.h>
#include "align.h"
+#include "fips202x4.h"
#include "params.h"
#include "poly.h"
#include "ntt.h"
@@ -360,6 +361,7 @@ void poly_tomsg(uint8_t msg[KYBER_INDCPA_MSGBYTES], const poly * restrict a)
}
}
+#ifndef KYBER_90S
/*************************************************
* Name: poly_getnoise_eta1
*
@@ -397,6 +399,7 @@ void poly_getnoise_eta2(poly *r, const uint8_t seed[KYBER_SYMBYTES], uint8_t non
prf(buf.coeffs, KYBER_ETA2*KYBER_N/4, seed, nonce);
poly_cbd_eta2(r, buf.vec);
}
+#endif
#ifndef KYBER_90S
#define NOISE_NBLOCKS ((KYBER_ETA1*KYBER_N/4+SHAKE256_RATE-1)/SHAKE256_RATE)
@@ -412,7 +415,7 @@ void poly_getnoise_eta1_4x(poly *r0,
{
ALIGNED_UINT8(NOISE_NBLOCKS*SHAKE256_RATE) buf[4];
__m256i f;
- keccakx4_state state;
+ shake256x4incctx state;
f = _mm256_loadu_si256((__m256i *)seed);
_mm256_store_si256(buf[0].vec, f);
@@ -425,8 +428,10 @@ void poly_getnoise_eta1_4x(poly *r0,
buf[2].coeffs[32] = nonce2;
buf[3].coeffs[32] = nonce3;
+ shake256x4_inc_init(&state);
shake256x4_absorb_once(&state, buf[0].coeffs, buf[1].coeffs, buf[2].coeffs, buf[3].coeffs, 33);
shake256x4_squeezeblocks(buf[0].coeffs, buf[1].coeffs, buf[2].coeffs, buf[3].coeffs, NOISE_NBLOCKS, &state);
+ shake256x4_inc_ctx_release(&state);
poly_cbd_eta1(r0, buf[0].vec);
poly_cbd_eta1(r1, buf[1].vec);
@@ -447,7 +452,7 @@ void poly_getnoise_eta1122_4x(poly *r0,
{
ALIGNED_UINT8(NOISE_NBLOCKS*SHAKE256_RATE) buf[4];
__m256i f;
- keccakx4_state state;
+ shake256x4incctx state;
f = _mm256_loadu_si256((__m256i *)seed);
_mm256_store_si256(buf[0].vec, f);
@@ -460,8 +465,10 @@ void poly_getnoise_eta1122_4x(poly *r0,
buf[2].coeffs[32] = nonce2;
buf[3].coeffs[32] = nonce3;
+ shake256x4_inc_init(&state);
shake256x4_absorb_once(&state, buf[0].coeffs, buf[1].coeffs, buf[2].coeffs, buf[3].coeffs, 33);
shake256x4_squeezeblocks(buf[0].coeffs, buf[1].coeffs, buf[2].coeffs, buf[3].coeffs, NOISE_NBLOCKS, &state);
+ shake256x4_inc_ctx_release(&state);
poly_cbd_eta1(r0, buf[0].vec);
poly_cbd_eta1(r1, buf[1].vec);
diff --git a/avx2/symmetric.h b/avx2/symmetric.h
index b99fe91..483eabc 100644
--- a/avx2/symmetric.h
+++ b/avx2/symmetric.h
@@ -33,10 +33,10 @@ typedef aes256ctr_ctx xof_state;
#include "fips202.h"
#include "fips202x4.h"
-typedef keccak_state xof_state;
+typedef shake128incctx xof_state;
#define kyber_shake128_absorb KYBER_NAMESPACE(kyber_shake128_absorb)
-void kyber_shake128_absorb(keccak_state *s,
+void kyber_shake128_absorb(shake128incctx *s,
const uint8_t seed[KYBER_SYMBYTES],
uint8_t x,
uint8_t y);