aesni: Partially use separate code paths for different key sizes in CCM

Due to the serial nature of the CBC-MAC computation, this brings only a marginal speedup.
This commit is contained in:
Martin Willi 2015-03-26 19:07:21 +01:00
parent 1ae46dfba2
commit d48642b7eb

View File

@ -322,14 +322,15 @@ static __m128i decrypt_ccm_rem(aesni_key_t *key, u_int rem, __m128i state,
}
/**
* Generic CCM encryption/ICV generation
* AES-128 CCM encryption/ICV generation
*/
static void encrypt_ccm(private_aesni_ccm_t *this,
size_t len, u_char *in, u_char *out, u_char *iv,
size_t alen, u_char *assoc, u_char *icv)
static void encrypt_ccm128(private_aesni_ccm_t *this,
size_t len, u_char *in, u_char *out, u_char *iv,
size_t alen, u_char *assoc, u_char *icv)
{
__m128i k0, k1, k2, k3, k4, k5, k6, k7, k8, k9, k10;
__m128i d, t, c, b, state, *bi, *bo;
u_int round, blocks, rem, i;
u_int blocks, rem, i;
c = icv_header(this, len, iv, alen, assoc);
build_ctr(this, 1, iv, &b);
@ -339,20 +340,47 @@ static void encrypt_ccm(private_aesni_ccm_t *this,
bi = (__m128i*)in;
bo = (__m128i*)out;
k0 = this->key->schedule[0];
k1 = this->key->schedule[1];
k2 = this->key->schedule[2];
k3 = this->key->schedule[3];
k4 = this->key->schedule[4];
k5 = this->key->schedule[5];
k6 = this->key->schedule[6];
k7 = this->key->schedule[7];
k8 = this->key->schedule[8];
k9 = this->key->schedule[9];
k10 = this->key->schedule[10];
for (i = 0; i < blocks; i++)
{
d = _mm_loadu_si128(bi + i);
c = _mm_xor_si128(d, c);
c = _mm_xor_si128(c, this->key->schedule[0]);
t = _mm_xor_si128(state, this->key->schedule[0]);
for (round = 1; round < this->key->rounds; round++)
{
c = _mm_aesenc_si128(c, this->key->schedule[round]);
t = _mm_aesenc_si128(t, this->key->schedule[round]);
}
c = _mm_aesenclast_si128(c, this->key->schedule[this->key->rounds]);
t = _mm_aesenclast_si128(t, this->key->schedule[this->key->rounds]);
c = _mm_xor_si128(c, k0);
t = _mm_xor_si128(state, k0);
c = _mm_aesenc_si128(c, k1);
t = _mm_aesenc_si128(t, k1);
c = _mm_aesenc_si128(c, k2);
t = _mm_aesenc_si128(t, k2);
c = _mm_aesenc_si128(c, k3);
t = _mm_aesenc_si128(t, k3);
c = _mm_aesenc_si128(c, k4);
t = _mm_aesenc_si128(t, k4);
c = _mm_aesenc_si128(c, k5);
t = _mm_aesenc_si128(t, k5);
c = _mm_aesenc_si128(c, k6);
t = _mm_aesenc_si128(t, k6);
c = _mm_aesenc_si128(c, k7);
t = _mm_aesenc_si128(t, k7);
c = _mm_aesenc_si128(c, k8);
t = _mm_aesenc_si128(t, k8);
c = _mm_aesenc_si128(c, k9);
t = _mm_aesenc_si128(t, k9);
c = _mm_aesenclast_si128(c, k10);
t = _mm_aesenclast_si128(t, k10);
t = _mm_xor_si128(t, d);
_mm_storeu_si128(bo + i, t);
@ -368,14 +396,15 @@ static void encrypt_ccm(private_aesni_ccm_t *this,
}
/**
* Generic CCM decryption/ICV generation
* AES-128 CCM decryption/ICV generation
*/
static void decrypt_ccm(private_aesni_ccm_t *this,
size_t len, u_char *in, u_char *out, u_char *iv,
size_t alen, u_char *assoc, u_char *icv)
static void decrypt_ccm128(private_aesni_ccm_t *this,
size_t len, u_char *in, u_char *out, u_char *iv,
size_t alen, u_char *assoc, u_char *icv)
{
__m128i k0, k1, k2, k3, k4, k5, k6, k7, k8, k9, k10;
__m128i d, t, c, b, state, *bi, *bo;
u_int round, blocks, rem, i;
u_int blocks, rem, i;
c = icv_header(this, len, iv, alen, assoc);
build_ctr(this, 1, iv, &b);
@ -385,26 +414,388 @@ static void decrypt_ccm(private_aesni_ccm_t *this,
bi = (__m128i*)in;
bo = (__m128i*)out;
k0 = this->key->schedule[0];
k1 = this->key->schedule[1];
k2 = this->key->schedule[2];
k3 = this->key->schedule[3];
k4 = this->key->schedule[4];
k5 = this->key->schedule[5];
k6 = this->key->schedule[6];
k7 = this->key->schedule[7];
k8 = this->key->schedule[8];
k9 = this->key->schedule[9];
k10 = this->key->schedule[10];
for (i = 0; i < blocks; i++)
{
d = _mm_loadu_si128(bi + i);
t = _mm_xor_si128(state, this->key->schedule[0]);
for (round = 1; round < this->key->rounds; round++)
{
t = _mm_aesenc_si128(t, this->key->schedule[round]);
}
t = _mm_aesenclast_si128(t, this->key->schedule[this->key->rounds]);
t = _mm_xor_si128(state, k0);
t = _mm_aesenc_si128(t, k1);
t = _mm_aesenc_si128(t, k2);
t = _mm_aesenc_si128(t, k3);
t = _mm_aesenc_si128(t, k4);
t = _mm_aesenc_si128(t, k5);
t = _mm_aesenc_si128(t, k6);
t = _mm_aesenc_si128(t, k7);
t = _mm_aesenc_si128(t, k8);
t = _mm_aesenc_si128(t, k9);
t = _mm_aesenclast_si128(t, k10);
t = _mm_xor_si128(t, d);
_mm_storeu_si128(bo + i, t);
c = _mm_xor_si128(t, c);
c = _mm_xor_si128(c, this->key->schedule[0]);
for (round = 1; round < this->key->rounds; round++)
{
c = _mm_aesenc_si128(c, this->key->schedule[round]);
}
c = _mm_aesenclast_si128(c, this->key->schedule[this->key->rounds]);
c = _mm_xor_si128(c, k0);
c = _mm_aesenc_si128(c, k1);
c = _mm_aesenc_si128(c, k2);
c = _mm_aesenc_si128(c, k3);
c = _mm_aesenc_si128(c, k4);
c = _mm_aesenc_si128(c, k5);
c = _mm_aesenc_si128(c, k6);
c = _mm_aesenc_si128(c, k7);
c = _mm_aesenc_si128(c, k8);
c = _mm_aesenc_si128(c, k9);
c = _mm_aesenclast_si128(c, k10);
state = increment_be(state);
}
if (rem)
{
c = decrypt_ccm_rem(this->key, rem, state, bi + blocks, bo + blocks, c);
}
crypt_icv(this, iv, c, icv);
}
/**
 * AES-192 CCM encryption/ICV generation
 *
 * Processes every full 16-byte block of the input with a fully unrolled
 * round sequence: AddRoundKey with schedule[0], eleven AESENC rounds with
 * schedule[1..11] and a final AESENCLAST with schedule[12]. Per block, two
 * independent values run through that sequence side by side:
 *  - the MAC chain: previous MAC value XOR plaintext block
 *  - the counter block, whose encryption is XORed onto the plaintext
 *
 * A trailing partial block is delegated to encrypt_ccm_rem(), and the
 * resulting MAC value is handed to crypt_icv() together with the IV.
 *
 * @param this		CCM instance; its key schedule entries 0..12 are used
 * @param len		length of the input in bytes
 * @param in		input data, read with unaligned loads
 * @param out		output buffer, written with unaligned stores
 * @param iv		IV, forwarded to icv_header(), build_ctr() and crypt_icv()
 * @param alen		length of assoc, forwarded to icv_header()
 * @param assoc		associated data, forwarded to icv_header()
 * @param icv		ICV location, forwarded to crypt_icv()
 */
static void encrypt_ccm192(private_aesni_ccm_t *this,
		size_t len, u_char *in, u_char *out, u_char *iv,
		size_t alen, u_char *assoc, u_char *icv)
{
	__m128i rk0, rk1, rk2, rk3, rk4, rk5, rk6;
	__m128i rk7, rk8, rk9, rk10, rk11, rk12;
	__m128i plain, stream, mac, ctr_block, ctr;
	__m128i *src, *dst;
	u_int full, tail, left;

	/* initial MAC value first, then the counter block for counter 1 */
	mac = icv_header(this, len, iv, alen, assoc);
	build_ctr(this, 1, iv, &ctr_block);
	ctr = _mm_load_si128(&ctr_block);

	full = len / AES_BLOCK_SIZE;
	tail = len % AES_BLOCK_SIZE;
	src = (__m128i*)in;
	dst = (__m128i*)out;

	/* keep all round keys in locals for the duration of the loop */
	rk0 = this->key->schedule[0];
	rk1 = this->key->schedule[1];
	rk2 = this->key->schedule[2];
	rk3 = this->key->schedule[3];
	rk4 = this->key->schedule[4];
	rk5 = this->key->schedule[5];
	rk6 = this->key->schedule[6];
	rk7 = this->key->schedule[7];
	rk8 = this->key->schedule[8];
	rk9 = this->key->schedule[9];
	rk10 = this->key->schedule[10];
	rk11 = this->key->schedule[11];
	rk12 = this->key->schedule[12];

	for (left = full; left > 0; left--, src++, dst++)
	{
		/* load the plaintext before anything is stored, in == out is fine */
		plain = _mm_loadu_si128(src);

		/* round 0: chain plaintext into the MAC, whiten both values */
		stream = _mm_xor_si128(ctr, rk0);
		mac = _mm_xor_si128(_mm_xor_si128(plain, mac), rk0);

		/* rounds 1..11, counter and MAC chain interleaved */
		stream = _mm_aesenc_si128(stream, rk1);
		mac = _mm_aesenc_si128(mac, rk1);
		stream = _mm_aesenc_si128(stream, rk2);
		mac = _mm_aesenc_si128(mac, rk2);
		stream = _mm_aesenc_si128(stream, rk3);
		mac = _mm_aesenc_si128(mac, rk3);
		stream = _mm_aesenc_si128(stream, rk4);
		mac = _mm_aesenc_si128(mac, rk4);
		stream = _mm_aesenc_si128(stream, rk5);
		mac = _mm_aesenc_si128(mac, rk5);
		stream = _mm_aesenc_si128(stream, rk6);
		mac = _mm_aesenc_si128(mac, rk6);
		stream = _mm_aesenc_si128(stream, rk7);
		mac = _mm_aesenc_si128(mac, rk7);
		stream = _mm_aesenc_si128(stream, rk8);
		mac = _mm_aesenc_si128(mac, rk8);
		stream = _mm_aesenc_si128(stream, rk9);
		mac = _mm_aesenc_si128(mac, rk9);
		stream = _mm_aesenc_si128(stream, rk10);
		mac = _mm_aesenc_si128(mac, rk10);
		stream = _mm_aesenc_si128(stream, rk11);
		mac = _mm_aesenc_si128(mac, rk11);

		/* final round */
		stream = _mm_aesenclast_si128(stream, rk12);
		mac = _mm_aesenclast_si128(mac, rk12);

		/* ciphertext = encrypted counter XOR plaintext */
		_mm_storeu_si128(dst, _mm_xor_si128(stream, plain));

		ctr = increment_be(ctr);
	}

	/* src/dst now point just past the last full block */
	if (tail)
	{
		mac = encrypt_ccm_rem(this->key, tail, ctr, src, dst, mac);
	}
	crypt_icv(this, iv, mac, icv);
}
/**
 * AES-192 CCM decryption/ICV generation
 *
 * Processes every full 16-byte block of the input with a fully unrolled
 * round sequence: AddRoundKey with schedule[0], eleven AESENC rounds with
 * schedule[1..11] and a final AESENCLAST with schedule[12]. For each block
 * the counter block is encrypted first and XORed onto the input; the
 * recovered block is stored and then chained into the MAC value, which
 * passes through the same round sequence afterwards.
 *
 * A trailing partial block is delegated to decrypt_ccm_rem(), and the
 * resulting MAC value is handed to crypt_icv() together with the IV.
 *
 * @param this		CCM instance; its key schedule entries 0..12 are used
 * @param len		length of the input in bytes
 * @param in		input data, read with unaligned loads
 * @param out		output buffer, written with unaligned stores
 * @param iv		IV, forwarded to icv_header(), build_ctr() and crypt_icv()
 * @param alen		length of assoc, forwarded to icv_header()
 * @param assoc		associated data, forwarded to icv_header()
 * @param icv		ICV location, forwarded to crypt_icv()
 */
static void decrypt_ccm192(private_aesni_ccm_t *this,
		size_t len, u_char *in, u_char *out, u_char *iv,
		size_t alen, u_char *assoc, u_char *icv)
{
	__m128i rk0, rk1, rk2, rk3, rk4, rk5, rk6;
	__m128i rk7, rk8, rk9, rk10, rk11, rk12;
	__m128i cipher, plain, stream, mac, ctr_block, ctr;
	__m128i *src, *dst;
	u_int full, tail, left;

	/* initial MAC value first, then the counter block for counter 1 */
	mac = icv_header(this, len, iv, alen, assoc);
	build_ctr(this, 1, iv, &ctr_block);
	ctr = _mm_load_si128(&ctr_block);

	full = len / AES_BLOCK_SIZE;
	tail = len % AES_BLOCK_SIZE;
	src = (__m128i*)in;
	dst = (__m128i*)out;

	/* keep all round keys in locals for the duration of the loop */
	rk0 = this->key->schedule[0];
	rk1 = this->key->schedule[1];
	rk2 = this->key->schedule[2];
	rk3 = this->key->schedule[3];
	rk4 = this->key->schedule[4];
	rk5 = this->key->schedule[5];
	rk6 = this->key->schedule[6];
	rk7 = this->key->schedule[7];
	rk8 = this->key->schedule[8];
	rk9 = this->key->schedule[9];
	rk10 = this->key->schedule[10];
	rk11 = this->key->schedule[11];
	rk12 = this->key->schedule[12];

	for (left = full; left > 0; left--, src++, dst++)
	{
		/* load the input block before anything is stored */
		cipher = _mm_loadu_si128(src);

		/* encrypt the counter block */
		stream = _mm_xor_si128(ctr, rk0);
		stream = _mm_aesenc_si128(stream, rk1);
		stream = _mm_aesenc_si128(stream, rk2);
		stream = _mm_aesenc_si128(stream, rk3);
		stream = _mm_aesenc_si128(stream, rk4);
		stream = _mm_aesenc_si128(stream, rk5);
		stream = _mm_aesenc_si128(stream, rk6);
		stream = _mm_aesenc_si128(stream, rk7);
		stream = _mm_aesenc_si128(stream, rk8);
		stream = _mm_aesenc_si128(stream, rk9);
		stream = _mm_aesenc_si128(stream, rk10);
		stream = _mm_aesenc_si128(stream, rk11);
		stream = _mm_aesenclast_si128(stream, rk12);

		/* recover the block and write it out */
		plain = _mm_xor_si128(stream, cipher);
		_mm_storeu_si128(dst, plain);

		/* chain the recovered block into the MAC and encrypt the result */
		mac = _mm_xor_si128(_mm_xor_si128(plain, mac), rk0);
		mac = _mm_aesenc_si128(mac, rk1);
		mac = _mm_aesenc_si128(mac, rk2);
		mac = _mm_aesenc_si128(mac, rk3);
		mac = _mm_aesenc_si128(mac, rk4);
		mac = _mm_aesenc_si128(mac, rk5);
		mac = _mm_aesenc_si128(mac, rk6);
		mac = _mm_aesenc_si128(mac, rk7);
		mac = _mm_aesenc_si128(mac, rk8);
		mac = _mm_aesenc_si128(mac, rk9);
		mac = _mm_aesenc_si128(mac, rk10);
		mac = _mm_aesenc_si128(mac, rk11);
		mac = _mm_aesenclast_si128(mac, rk12);

		ctr = increment_be(ctr);
	}

	/* src/dst now point just past the last full block */
	if (tail)
	{
		mac = decrypt_ccm_rem(this->key, tail, ctr, src, dst, mac);
	}
	crypt_icv(this, iv, mac, icv);
}
/**
 * AES-256 CCM encryption/ICV generation
 *
 * Processes every full 16-byte block of the input with a fully unrolled
 * round sequence: AddRoundKey with schedule[0], thirteen AESENC rounds with
 * schedule[1..13] and a final AESENCLAST with schedule[14]. Per block, two
 * independent values run through that sequence side by side:
 *  - the MAC chain: previous MAC value XOR plaintext block
 *  - the counter block, whose encryption is XORed onto the plaintext
 *
 * A trailing partial block is delegated to encrypt_ccm_rem(), and the
 * resulting MAC value is handed to crypt_icv() together with the IV.
 *
 * @param this		CCM instance; its key schedule entries 0..14 are used
 * @param len		length of the input in bytes
 * @param in		input data, read with unaligned loads
 * @param out		output buffer, written with unaligned stores
 * @param iv		IV, forwarded to icv_header(), build_ctr() and crypt_icv()
 * @param alen		length of assoc, forwarded to icv_header()
 * @param assoc		associated data, forwarded to icv_header()
 * @param icv		ICV location, forwarded to crypt_icv()
 */
static void encrypt_ccm256(private_aesni_ccm_t *this,
		size_t len, u_char *in, u_char *out, u_char *iv,
		size_t alen, u_char *assoc, u_char *icv)
{
	__m128i rk0, rk1, rk2, rk3, rk4, rk5, rk6, rk7;
	__m128i rk8, rk9, rk10, rk11, rk12, rk13, rk14;
	__m128i plain, stream, mac, ctr_block, ctr;
	__m128i *src, *dst;
	u_int full, tail, left;

	/* initial MAC value first, then the counter block for counter 1 */
	mac = icv_header(this, len, iv, alen, assoc);
	build_ctr(this, 1, iv, &ctr_block);
	ctr = _mm_load_si128(&ctr_block);

	full = len / AES_BLOCK_SIZE;
	tail = len % AES_BLOCK_SIZE;
	src = (__m128i*)in;
	dst = (__m128i*)out;

	/* keep all round keys in locals for the duration of the loop */
	rk0 = this->key->schedule[0];
	rk1 = this->key->schedule[1];
	rk2 = this->key->schedule[2];
	rk3 = this->key->schedule[3];
	rk4 = this->key->schedule[4];
	rk5 = this->key->schedule[5];
	rk6 = this->key->schedule[6];
	rk7 = this->key->schedule[7];
	rk8 = this->key->schedule[8];
	rk9 = this->key->schedule[9];
	rk10 = this->key->schedule[10];
	rk11 = this->key->schedule[11];
	rk12 = this->key->schedule[12];
	rk13 = this->key->schedule[13];
	rk14 = this->key->schedule[14];

	for (left = full; left > 0; left--, src++, dst++)
	{
		/* load the plaintext before anything is stored, in == out is fine */
		plain = _mm_loadu_si128(src);

		/* round 0: chain plaintext into the MAC, whiten both values */
		stream = _mm_xor_si128(ctr, rk0);
		mac = _mm_xor_si128(_mm_xor_si128(plain, mac), rk0);

		/* rounds 1..13, counter and MAC chain interleaved */
		stream = _mm_aesenc_si128(stream, rk1);
		mac = _mm_aesenc_si128(mac, rk1);
		stream = _mm_aesenc_si128(stream, rk2);
		mac = _mm_aesenc_si128(mac, rk2);
		stream = _mm_aesenc_si128(stream, rk3);
		mac = _mm_aesenc_si128(mac, rk3);
		stream = _mm_aesenc_si128(stream, rk4);
		mac = _mm_aesenc_si128(mac, rk4);
		stream = _mm_aesenc_si128(stream, rk5);
		mac = _mm_aesenc_si128(mac, rk5);
		stream = _mm_aesenc_si128(stream, rk6);
		mac = _mm_aesenc_si128(mac, rk6);
		stream = _mm_aesenc_si128(stream, rk7);
		mac = _mm_aesenc_si128(mac, rk7);
		stream = _mm_aesenc_si128(stream, rk8);
		mac = _mm_aesenc_si128(mac, rk8);
		stream = _mm_aesenc_si128(stream, rk9);
		mac = _mm_aesenc_si128(mac, rk9);
		stream = _mm_aesenc_si128(stream, rk10);
		mac = _mm_aesenc_si128(mac, rk10);
		stream = _mm_aesenc_si128(stream, rk11);
		mac = _mm_aesenc_si128(mac, rk11);
		stream = _mm_aesenc_si128(stream, rk12);
		mac = _mm_aesenc_si128(mac, rk12);
		stream = _mm_aesenc_si128(stream, rk13);
		mac = _mm_aesenc_si128(mac, rk13);

		/* final round */
		stream = _mm_aesenclast_si128(stream, rk14);
		mac = _mm_aesenclast_si128(mac, rk14);

		/* ciphertext = encrypted counter XOR plaintext */
		_mm_storeu_si128(dst, _mm_xor_si128(stream, plain));

		ctr = increment_be(ctr);
	}

	/* src/dst now point just past the last full block */
	if (tail)
	{
		mac = encrypt_ccm_rem(this->key, tail, ctr, src, dst, mac);
	}
	crypt_icv(this, iv, mac, icv);
}
/**
* AES-256 CCM decryption/ICV generation
*/
static void decrypt_ccm256(private_aesni_ccm_t *this,
size_t len, u_char *in, u_char *out, u_char *iv,
size_t alen, u_char *assoc, u_char *icv)
{
__m128i k0, k1, k2, k3, k4, k5, k6, k7, k8, k9, k10, k11, k12, k13, k14;
__m128i d, t, c, b, state, *bi, *bo;
u_int blocks, rem, i;
c = icv_header(this, len, iv, alen, assoc);
build_ctr(this, 1, iv, &b);
state = _mm_load_si128(&b);
blocks = len / AES_BLOCK_SIZE;
rem = len % AES_BLOCK_SIZE;
bi = (__m128i*)in;
bo = (__m128i*)out;
k0 = this->key->schedule[0];
k1 = this->key->schedule[1];
k2 = this->key->schedule[2];
k3 = this->key->schedule[3];
k4 = this->key->schedule[4];
k5 = this->key->schedule[5];
k6 = this->key->schedule[6];
k7 = this->key->schedule[7];
k8 = this->key->schedule[8];
k9 = this->key->schedule[9];
k10 = this->key->schedule[10];
k11 = this->key->schedule[11];
k12 = this->key->schedule[12];
k13 = this->key->schedule[13];
k14 = this->key->schedule[14];
for (i = 0; i < blocks; i++)
{
d = _mm_loadu_si128(bi + i);
t = _mm_xor_si128(state, k0);
t = _mm_aesenc_si128(t, k1);
t = _mm_aesenc_si128(t, k2);
t = _mm_aesenc_si128(t, k3);
t = _mm_aesenc_si128(t, k4);
t = _mm_aesenc_si128(t, k5);
t = _mm_aesenc_si128(t, k6);
t = _mm_aesenc_si128(t, k7);
t = _mm_aesenc_si128(t, k8);
t = _mm_aesenc_si128(t, k9);
t = _mm_aesenc_si128(t, k10);
t = _mm_aesenc_si128(t, k11);
t = _mm_aesenc_si128(t, k12);
t = _mm_aesenc_si128(t, k13);
t = _mm_aesenclast_si128(t, k14);
t = _mm_xor_si128(t, d);
_mm_storeu_si128(bo + i, t);
c = _mm_xor_si128(t, c);
c = _mm_xor_si128(c, k0);
c = _mm_aesenc_si128(c, k1);
c = _mm_aesenc_si128(c, k2);
c = _mm_aesenc_si128(c, k3);
c = _mm_aesenc_si128(c, k4);
c = _mm_aesenc_si128(c, k5);
c = _mm_aesenc_si128(c, k6);
c = _mm_aesenc_si128(c, k7);
c = _mm_aesenc_si128(c, k8);
c = _mm_aesenc_si128(c, k9);
c = _mm_aesenc_si128(c, k10);
c = _mm_aesenc_si128(c, k11);
c = _mm_aesenc_si128(c, k12);
c = _mm_aesenc_si128(c, k13);
c = _mm_aesenclast_si128(c, k14);
state = increment_be(state);
}
@ -575,9 +966,23 @@ aesni_ccm_t *aesni_ccm_create(encryption_algorithm_t algo,
.key_size = key_size,
.iv_gen = iv_gen_seq_create(),
.icv_size = icv_size,
.encrypt = encrypt_ccm,
.decrypt = decrypt_ccm,
);
switch (key_size)
{
case 16:
this->encrypt = encrypt_ccm128;
this->decrypt = decrypt_ccm128;
break;
case 24:
this->encrypt = encrypt_ccm192;
this->decrypt = decrypt_ccm192;
break;
case 32:
this->encrypt = encrypt_ccm256;
this->decrypt = decrypt_ccm256;
break;
}
return &this->public;
}