Updated picnic to v2.2. (#746)

* Updated picnic to v2.2.

* Re-enabled optimizations with clang-9/10

* Integrated commit 9917e3 from Picnic, fixing a bug with 128-bit word loading.

* Removed hardcoded alignment macro for picnic.

* Removed references to the now-unused USE_OPTIMIZATIONS.
Christian Paquin 2020-05-07 15:47:34 -04:00 committed by GitHub
parent 17c03a1bd2
commit 216cb1a930
46 changed files with 562 additions and 348117 deletions

View File

@ -31,7 +31,7 @@ Implementation
--------------
- **Source of implementation:** https://github.com/IAIK/Picnic
- **Implementation version:** https://github.com/IAIK/Picnic/tree/v2.1.2
- **Implementation version:** https://github.com/IAIK/Picnic/commit/9917e33194d0b540c09706c68fb707c4912edeed (v2.2 + bug fix)
- **License:** MIT License
- **Language:** C
- **Constant-time:** Yes

View File

@ -21,11 +21,8 @@ set(SRCS sig_picnic.c
external/io.c
external/lowmc.c
external/lowmc_128_128_20.c
external/lowmc_128_128_182.c
external/lowmc_192_192_284.c
external/lowmc_192_192_30.c
external/lowmc_256_256_38.c
external/lowmc_256_256_363.c
external/mpc_lowmc.c
external/mzd_additional.c
external/picnic.c
@ -39,18 +36,9 @@ set(SRCS sig_picnic.c
external/sha3/KeccakHash.c
external/sha3/KeccakSpongeWidth1600.c)
# TODO: The optimized Picnic code, when
# compiled with clang-9 and clang-10, results
# in signing and verification failures.
if(CMAKE_C_COMPILER_ID MATCHES "Clang" OR OQS_PORTABLE_BUILD)
set(USE_OPTIMIZATIONS OFF)
else()
set(USE_OPTIMIZATIONS ON)
endif()
if(CMAKE_HOST_SYSTEM_NAME STREQUAL "Linux" AND
OQS_USE_AVX2_INSTRUCTIONS AND
OQS_USE_BMI2_INSTRUCTIONS AND
USE_OPTIMIZATIONS)
OQS_USE_BMI2_INSTRUCTIONS)
set(USE_AVX2 ON)
endif()
@ -81,11 +69,8 @@ target_compile_definitions(picnic PRIVATE PICNIC_STATIC
WITH_LOWMC_192_192_30
WITH_LOWMC_256_256_38
WITH_OPT)
if(NOT WIN32)
target_compile_definitions(picnic PRIVATE HAVE_POSIX_MEMALIGN)
endif()
if(OQS_USE_SSE2_INSTRUCTIONS AND USE_OPTIMIZATIONS)
if(OQS_USE_SSE2_INSTRUCTIONS)
target_compile_definitions(picnic PRIVATE WITH_SSE2)
add_compile_options(-msse2)
endif()

View File

@ -1,7 +1,15 @@
Version 2.2 -- 2020-04-08
---------------------------
* Fix Picnic2 implementation on big endian systems
* Add support for SHA3/SHAKE instructions on IBM z.
* Various small improvements and bug fixes.
* Remove LowMC instances with m=1.
Version 2.1.2 -- 2019-10-03
---------------------------
* Enable to build with ZKB++- or KKW-based instances only.
* Add options to build with ZKB++- or KKW-based instances only.
* Fix ARM NEON optimizations.
* Slightly reduce heap usage.
* Remove more unused code.

View File

@ -34,7 +34,7 @@ The cmake based build system supports the following flags:
* ``WITH_MARCH_NATIVE``: Build with -march=native -mtune=native (if supported).
* ``WITH_LTO``: Enable link-time optimization (if supported).
* ``WITH_LOWMC_OPT={OFF,ORKC,OLLE}``: Enable optimized round key computation (ORKC) or optimized linear layer evaluation (OLLE).
* ``WITH_LOWMC_M1``: Enable LowMC instances with 1 Sbox minimizing the signature sizes (only useful if built with ``WITH_ZKBPP`` on).
* ``WITH_SHA3_IMPL={opt64,avx2,armv8a-neon,s390-cpacf}``: Select the SHA3 implementation: opt64 (the default, from the Keccak code package), avx2 (for AVX2-capable x86-64 systems, from the Keccak code package), armv8a-neon (for NEON-capable ARM systems, from the Keccak code package), or s390-cpacf (for IBM z14 and newer systems supporting SHAKE).
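For example, an AVX2-capable build using OLLE and the avx2 SHA3 implementation could be configured roughly as follows (an illustrative invocation, not a tested recipe):

mkdir build && cd build
cmake -DWITH_MARCH_NATIVE=ON -DWITH_LOWMC_OPT=OLLE -DWITH_SHA3_IMPL=avx2 ..
make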
Building on Windows
-------------------

View File

@ -10,9 +10,7 @@
#ifdef HAVE_CONFIG_H
#include <config.h>
#else
/* If cmake checks were not run, define some known values. */
#if !defined(HAVE_SYS_AUXV_H) && defined(__linux__)
#define HAVE_SYS_AUXV_H
#endif
@ -24,7 +22,7 @@
#include "cpu.h"
#if !defined(BUILTIN_CPU_SUPPORTED)
#if !defined(BUILTIN_CPU_SUPPORTED) || defined(BUILTIN_CPU_SUPPORTED_BROKEN_BMI2)
#if defined(__arm__) && defined(HAVE_SYS_AUXV_H) && defined(HAVE_ASM_HWCAP_H)
#include <asm/hwcap.h>
#include <sys/auxv.h>
@ -67,39 +65,18 @@ static unsigned init_caps(void) {
if (max >= 7) {
__cpuidex(regs.data, 7, 0);
if (regs.ebx & ((1 << 5) | (1 << 8))) {
if (regs.ebx & (1 << 5)) {
caps |= CPU_CAP_AVX2;
}
if (regs.ebx & (1 << 8)) {
caps |= CPU_CAP_BMI2;
}
}
return caps;
}
#else
#if defined(SUPERCOP)
// SUPERCOP places a cpuid.h on the include search path before the system
// provided cpuid.h. We hack around that by assuming that cpuid always exists
// and defining __get_cpuid on our own.
static int __get_cpuid(unsigned int leaf, unsigned int* reax, unsigned int* rebx,
unsigned int* recx, unsigned int* redx) {
unsigned int eax, ebx, ecx, edx;
__asm__("cpuid\n" : "=a"(eax), "=b"(ebx), "=c"(ecx), "=d"(edx) : "0"(leaf & 0x80000000));
if (eax == 0 || eax < leaf) {
return 0;
}
__asm__("cpuid\n" : "=a"(eax), "=b"(ebx), "=c"(ecx), "=d"(edx) : "0"(leaf));
*reax = eax;
*rebx = ebx;
*recx = ecx;
*redx = edx;
return 1;
}
#else
#include <cpuid.h>
#endif
static unsigned init_caps(void) {
unsigned int caps = 0;
@ -115,9 +92,12 @@ static unsigned init_caps(void) {
}
if (__get_cpuid(7, &eax, &ebx, &ecx, &edx)) {
if (ebx & ((1 << 5) | (1 << 8))) {
if (ebx & (1 << 5)) {
caps |= CPU_CAP_AVX2;
}
if (ebx & (1 << 8)) {
caps |= CPU_CAP_BMI2;
}
}
return caps;
@ -141,9 +121,6 @@ bool cpu_supports(unsigned int caps) {
cpu_caps = init_caps();
}
return cpu_caps & caps;
return (cpu_caps & caps) == caps;
}
#endif
// OQS note: add a dummy definition to avoid empty translation unit (which might occur with -Werror=pedantic)
typedef int avoid_empty_translation_unit;
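The change from `cpu_caps & caps` to `(cpu_caps & caps) == caps` matters now that AVX2 and BMI2 are tracked as separate bits: a caller asking for both capabilities must only get true when all requested bits are present. A minimal hedged sketch of the difference, with illustrative values not taken from the diff:

#include <stdbool.h>

#define CPU_CAP_AVX2 0x00000004u /* values match cpu.h below */
#define CPU_CAP_BMI2 0x00000010u

/* Hypothetical cached capability word: AVX2 present, BMI2 absent. */
static const unsigned int cpu_caps = CPU_CAP_AVX2;

static bool supports_any(unsigned int caps) { return cpu_caps & caps; }           /* old, buggy */
static bool supports_all(unsigned int caps) { return (cpu_caps & caps) == caps; } /* fixed */

/* supports_any(CPU_CAP_AVX2 | CPU_CAP_BMI2) -> true:  would wrongly select AVX2+BMI2 code paths.
   supports_all(CPU_CAP_AVX2 | CPU_CAP_BMI2) -> false: correctly falls back. */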

View File

@ -10,21 +10,29 @@
#ifndef CPU_H
#define CPU_H
#include "macros.h"
#if defined(__GNUC__) && !(defined(__APPLE__) && (__clang_major__ <= 8)) && \
!defined(__MINGW32__) && !defined(__MINGW64__)
!defined(__MINGW32__) && !defined(__MINGW64__)
#define BUILTIN_CPU_SUPPORTED
#endif
#if !defined(BUILTIN_CPU_SUPPORTED)
#if defined(BUILTIN_CPU_SUPPORTED) && GNUC_CHECK(4, 9) && !GNUC_CHECK(5, 0)
/* gcc 4.9's __builtin_cpu_supports does not support "bmi2" */
#define BUILTIN_CPU_SUPPORTED_BROKEN_BMI2
#endif
#if !defined(BUILTIN_CPU_SUPPORTED) || defined(BUILTIN_CPU_SUPPORTED_BROKEN_BMI2)
#include <stdbool.h>
#include "oqs_picnic_macros.h"
/* CPU supports SSE2 */
#define CPU_CAP_SSE2 0x00000001
/* CPU supports popcnt */
#define CPU_CAP_POPCNT 0x00000002
/* CPU supports AVX2 + BMI2 */
/* CPU supports AVX2 */
#define CPU_CAP_AVX2 0x00000004
/* CPU supports BMI2 */
#define CPU_CAP_BMI2 0x00000010
/* CPU supports NEON */
#define CPU_CAP_NEON 0x00000008

View File

@ -12,6 +12,13 @@
#include <stdint.h>
#include "macros.h"
#include "endian_compat.h"
#if defined(WITH_SHAKE_S390_CPACF)
/* use the KIMD/KLMD instructions from CPACF for SHAKE support on S390 */
#include "sha3/s390_cpacf.h"
#else
#if !defined(KeccakP200_excluded)
#define KeccakP200_excluded
#endif
@ -25,11 +32,14 @@
#endif
#if !defined(SUPERCOP)
/* use SHAKE implementation in sha3/ */
#include "sha3/KeccakHash.h"
#if defined(WITH_KECCAK_X4)
/* use the Keccakx4 implementation */
#include "sha3/KeccakHashtimes4.h"
#endif
#else
/* use SUPERCOP implementation */
#include <libkeccak.a.headers/KeccakHash.h>
#if defined(WITH_KECCAK_X4)
/* Keccakx4 is not fully supported by SUPERCOP, so we need to ship it ourselves. */
@ -37,9 +47,6 @@
#endif
#endif
#include "macros.h"
#include "endian_compat.h"
typedef Keccak_HashInstance hash_context ATTR_ALIGNED(32);
/**
@ -58,6 +65,15 @@ static inline void hash_update(hash_context* ctx, const uint8_t* data, size_t si
Keccak_HashUpdate(ctx, data, size << 3);
}
static inline void hash_final(hash_context* ctx) {
Keccak_HashFinal(ctx, NULL);
}
static inline void hash_squeeze(hash_context* ctx, uint8_t* buffer, size_t buflen) {
Keccak_HashSqueeze(ctx, buffer, buflen << 3);
}
#endif
static inline void hash_update_uint16_le(hash_context* ctx, uint16_t data) {
const uint16_t data_le = htole16(data);
hash_update(ctx, (const uint8_t*)&data_le, sizeof(data_le));
@ -69,14 +85,6 @@ static inline void hash_init_prefix(hash_context* ctx, size_t digest_size,
hash_update(ctx, &prefix, sizeof(prefix));
}
static inline void hash_final(hash_context* ctx) {
Keccak_HashFinal(ctx, NULL);
}
static inline void hash_squeeze(hash_context* ctx, uint8_t* buffer, size_t buflen) {
Keccak_HashSqueeze(ctx, buffer, buflen << 3);
}
typedef hash_context kdf_shake_t;
#define kdf_shake_init(ctx, digest_size) hash_init((ctx), (digest_size))
@ -182,4 +190,5 @@ typedef hash_context_x4 kdf_shake_x4_t;
#define kdf_shake_x4_finalize_key(ctx) hash_final_x4((ctx))
#define kdf_shake_x4_get_randomness(ctx, dst, count) hash_squeeze_x4((ctx), (dst), (count))
#define kdf_shake_x4_clear(ctx)
#endif
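The wrapper API above boils down to absorb/finalize/squeeze. A hedged usage sketch (assuming this header is included and that kdf_shake_finalize_key and kdf_shake_get_randomness exist as the non-x4 counterparts of the x4 macros above):

#include <stdint.h>

static void derive_example(uint8_t out[64]) {
  kdf_shake_t ctx;
  kdf_shake_init(&ctx, 32);       /* 32-byte digest size selects SHAKE256 (assumption) */
  hash_update_uint16_le(&ctx, 7); /* absorb a little-endian-encoded counter */
  kdf_shake_finalize_key(&ctx);   /* switch from absorbing to squeezing */
  kdf_shake_get_randomness(&ctx, out, 64);
}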

View File

@ -13,7 +13,6 @@
#include "io.h"
#include "lowmc.h"
#include "mzd_additional.h"
#if defined(WITH_KKW)
#include "picnic2_impl.h"
@ -50,31 +49,6 @@ static void sbox_layer_10_uint64(uint64_t* d) {
*d = sbox_layer_10_bitsliced_uint64(*d);
}
#if defined(WITH_LOWMC_M1)
static uint64_t sbox_layer_1_bitsliced_uint64(uint64_t in) {
// a, b, c
const uint64_t x0s = (in & MASK_X0I_1) << 2;
const uint64_t x1s = (in & MASK_X1I_1) << 1;
const uint64_t x2m = in & MASK_X2I_1;
// (b & c) ^ a
const uint64_t t0 = (x1s & x2m) ^ x0s;
// (c & a) ^ a ^ b
const uint64_t t1 = (x0s & x2m) ^ x0s ^ x1s;
// (a & b) ^ a ^ b ^c
const uint64_t t2 = (x0s & x1s) ^ x0s ^ x1s ^ x2m;
return (in & MASK_MASK_1) ^ (t0 >> 2) ^ (t1 >> 1) ^ t2;
}
/**
* S-box for m = 1
*/
static void sbox_layer_1_uint64(uint64_t* d) {
*d = sbox_layer_1_bitsliced_uint64(*d);
}
#endif
#if defined(WITH_LOWMC_128_128_20)
#include "lowmc_128_128_20.h"
#endif
@ -84,16 +58,8 @@ static void sbox_layer_1_uint64(uint64_t* d) {
#if defined(WITH_LOWMC_256_256_38)
#include "lowmc_256_256_38.h"
#endif
#if defined(WITH_LOWMC_128_128_182)
#include "lowmc_128_128_182.h"
#endif
#if defined(WITH_LOWMC_192_192_284)
#include "lowmc_192_192_284.h"
#endif
#if defined(WITH_LOWMC_256_256_363)
#include "lowmc_256_256_363.h"
#endif
#if !defined(NO_UINT64_FALLBACK)
// uint64 based implementation
#include "lowmc_fns_uint64_L1.h"
#define LOWMC lowmc_uint64_128
@ -108,6 +74,7 @@ static void sbox_layer_1_uint64(uint64_t* d) {
#undef LOWMC
#define LOWMC lowmc_uint64_256
#include "lowmc.c.i"
#endif
#if defined(WITH_OPT)
#if defined(WITH_SSE2) || defined(WITH_NEON)
@ -163,11 +130,7 @@ static void sbox_layer_1_uint64(uint64_t* d) {
#endif
lowmc_implementation_f lowmc_get_implementation(const lowmc_t* lowmc) {
#if defined(WITH_LOWMC_M1)
ASSUME(lowmc->m == 10 || lowmc->m == 1);
#else
ASSUME(lowmc->m == 10);
#endif
ASSUME(lowmc->n == 128 || lowmc->n == 192 || lowmc->n == 256);
#if defined(WITH_OPT)
@ -189,24 +152,6 @@ lowmc_implementation_f lowmc_get_implementation(const lowmc_t* lowmc) {
#endif
}
}
#if defined(WITH_LOWMC_M1)
if (lowmc->m == 1) {
switch (lowmc->n) {
#if defined(WITH_LOWMC_128_128_182)
case 128:
return lowmc_s256_128_1;
#endif
#if defined(WITH_LOWMC_192_192_284)
case 192:
return lowmc_s256_192_1;
#endif
#if defined(WITH_LOWMC_256_256_363)
case 256:
return lowmc_s256_256_1;
#endif
}
}
#endif
}
#endif
#if defined(WITH_SSE2) || defined(WITH_NEON)
@ -227,28 +172,11 @@ lowmc_implementation_f lowmc_get_implementation(const lowmc_t* lowmc) {
#endif
}
}
#if defined(WITH_LOWMC_M1)
if (lowmc->m == 1) {
switch (lowmc->n) {
#if defined(WITH_LOWMC_128_128_182)
case 128:
return lowmc_s128_128_1;
#endif
#if defined(WITH_LOWMC_192_192_284)
case 192:
return lowmc_s128_192_1;
#endif
#if defined(WITH_LOWMC_256_256_363)
case 256:
return lowmc_s128_256_1;
#endif
}
}
#endif
}
#endif
#endif
#if !defined(NO_UINT64_FALLBACK)
if (lowmc->m == 10) {
switch (lowmc->n) {
#if defined(WITH_LOWMC_128_128_20)
@ -266,23 +194,6 @@ lowmc_implementation_f lowmc_get_implementation(const lowmc_t* lowmc) {
}
}
#if defined(WITH_LOWMC_M1)
if (lowmc->m == 1) {
switch (lowmc->n) {
#if defined(WITH_LOWMC_128_128_182)
case 128:
return lowmc_uint64_128_1;
#endif
#if defined(WITH_LOWMC_192_192_284)
case 192:
return lowmc_uint64_192_1;
#endif
#if defined(WITH_LOWMC_256_256_363)
case 256:
return lowmc_uint64_256_1;
#endif
}
}
#endif
return NULL;
@ -290,11 +201,7 @@ lowmc_implementation_f lowmc_get_implementation(const lowmc_t* lowmc) {
#if defined(WITH_ZKBPP)
lowmc_store_implementation_f lowmc_store_get_implementation(const lowmc_t* lowmc) {
#if defined(WITH_LOWMC_M1)
ASSUME(lowmc->m == 10 || lowmc->m == 1);
#else
ASSUME(lowmc->m == 10);
#endif
ASSUME(lowmc->n == 128 || lowmc->n == 192 || lowmc->n == 256);
#if defined(WITH_OPT)
@ -316,24 +223,6 @@ lowmc_store_implementation_f lowmc_store_get_implementation(const lowmc_t* lowmc
#endif
}
}
#if defined(WITH_LOWMC_M1)
if (lowmc->m == 1) {
switch (lowmc->n) {
#if defined(WITH_LOWMC_128_128_182)
case 128:
return lowmc_s256_128_store_1;
#endif
#if defined(WITH_LOWMC_192_192_284)
case 192:
return lowmc_s256_192_store_1;
#endif
#if defined(WITH_LOWMC_256_256_363)
case 256:
return lowmc_s256_256_store_1;
#endif
}
}
#endif
}
#endif
#if defined(WITH_SSE2) || defined(WITH_NEON)
@ -354,28 +243,11 @@ lowmc_store_implementation_f lowmc_store_get_implementation(const lowmc_t* lowmc
#endif
}
}
#if defined(WITH_LOWMC_M1)
if (lowmc->m == 1) {
switch (lowmc->n) {
#if defined(WITH_LOWMC_128_128_182)
case 128:
return lowmc_s128_128_store_1;
#endif
#if defined(WITH_LOWMC_192_192_284)
case 192:
return lowmc_s128_192_store_1;
#endif
#if defined(WITH_LOWMC_256_256_363)
case 256:
return lowmc_s128_256_store_1;
#endif
}
}
#endif
}
#endif
#endif
#if !defined(NO_UINT64_FALLBACK)
if (lowmc->m == 10) {
switch (lowmc->n) {
#if defined(WITH_LOWMC_128_128_20)
@ -393,23 +265,6 @@ lowmc_store_implementation_f lowmc_store_get_implementation(const lowmc_t* lowmc
}
}
#if defined(WITH_LOWMC_M1)
if (lowmc->m == 1) {
switch (lowmc->n) {
#if defined(WITH_LOWMC_128_128_182)
case 128:
return lowmc_uint64_128_store_1;
#endif
#if defined(WITH_LOWMC_192_192_284)
case 192:
return lowmc_uint64_192_store_1;
#endif
#if defined(WITH_LOWMC_256_256_363)
case 256:
return lowmc_uint64_256_store_1;
#endif
}
}
#endif
return NULL;
@ -418,11 +273,7 @@ lowmc_store_implementation_f lowmc_store_get_implementation(const lowmc_t* lowmc
#if defined(WITH_KKW)
lowmc_compute_aux_implementation_f lowmc_compute_aux_get_implementation(const lowmc_t* lowmc) {
#if defined(WITH_LOWMC_M1)
ASSUME(lowmc->m == 10 || lowmc->m == 1);
#else
ASSUME(lowmc->m == 10);
#endif
ASSUME(lowmc->n == 128 || lowmc->n == 192 || lowmc->n == 256);
#if defined(WITH_OPT)
@ -468,6 +319,7 @@ lowmc_compute_aux_implementation_f lowmc_compute_aux_get_implementation(const lo
#endif
#endif
#if !defined(NO_UINT64_FALLBACK)
if (lowmc->m == 10) {
switch (lowmc->n) {
#if defined(WITH_LOWMC_128_128_20)
@ -484,6 +336,7 @@ lowmc_compute_aux_implementation_f lowmc_compute_aux_get_implementation(const lo
#endif
}
}
#endif
return NULL;
}

View File

@ -7,18 +7,10 @@
* SPDX-License-Identifier: MIT
*/
#if defined(LOWMC_INSTANCE_10)
#define LOWMC_INSTANCE LOWMC_INSTANCE_10
#if defined(LOWMC_INSTANCE)
#define LOWMC_M 10
#define LOWMC_R LOWMC_R_10
#define MUL_MC MUL_MC_10
#define ADDMUL_R ADDMUL_R_10
#define MUL_Z MUL_Z_10
#define MZD_SHUFFLE CONCAT(SHUFFLE, 30)
#define M_FIXED_10
#define N_LOWMC CONCAT(LOWMC, 10)
#define SBOX(x) sbox_layer_10_uint64(&BLOCK(x, 0)->w64[(LOWMC_N / (sizeof(word) * 8)) - 1])
#define XOR_MC XOR_MC_10
#include "lowmc_impl.c.i"
#if defined(WITH_ZKBPP)
@ -39,55 +31,11 @@
#include "lowmc_impl.c.i"
#endif
#undef LOWMC_INSTANCE
#undef LOWMC_M
#undef LOWMC_R
#undef MUL_MC
#undef ADDMUL_R
#undef MUL_Z
#undef MZD_SHUFFLE
#undef M_FIXED_10
#undef N_LOWMC
#undef RECORD_STATE
#undef PICNIC2_AUX_COMPUTATION
#undef SBOX
#undef XOR_MC
#endif
#if defined(WITH_LOWMC_M1) && defined(LOWMC_INSTANCE_1)
#define LOWMC_INSTANCE LOWMC_INSTANCE_1
#define LOWMC_M 1
#define LOWMC_R LOWMC_R_1
#define MUL_MC MUL_MC_1
#define ADDMUL_R ADDMUL_R_1
#define MUL_Z MUL_Z_1
#define MZD_SHUFFLE CONCAT(SHUFFLE, 3)
#define M_FIXED_1
#define N_LOWMC CONCAT(LOWMC, 1)
#define SBOX(x) sbox_layer_1_uint64(&BLOCK(x, 0)->w64[(LOWMC_N / (sizeof(word) * 8)) - 1])
#define XOR_MC XOR_MC_1
#include "lowmc_impl.c.i"
#if defined(WITH_ZKBPP)
#undef N_LOWMC
#define N_LOWMC CONCAT(LOWMC, store_1)
#define RECORD_STATE
#include "lowmc_impl.c.i"
#endif
#undef LOWMC_INSTANCE
#undef LOWMC_M
#undef LOWMC_R
#undef MUL_MC
#undef ADDMUL_R
#undef MUL_Z
#undef MZD_SHUFFLE
#undef M_FIXED_1
#undef N_LOWMC
#undef RECORD_STATE
#undef PICNIC2_AUX_COMPUTATION
#undef SBOX
#undef XOR_MC
#endif
// vim: ft=c

File diff suppressed because it is too large.

View File

@ -1,8 +0,0 @@
#ifndef LOWMC_128_128_182_H
#define LOWMC_128_128_182_H
#include "lowmc_pars.h"
extern const lowmc_t lowmc_128_128_182;
#endif

File diff suppressed because it is too large.

View File

@ -1,8 +0,0 @@
#ifndef LOWMC_192_192_284_H
#define LOWMC_192_192_284_H
#include "lowmc_pars.h"
extern const lowmc_t lowmc_192_192_284;
#endif

File diff suppressed because it is too large.

View File

@ -1,8 +0,0 @@
#ifndef LOWMC_256_256_363_H
#define LOWMC_256_256_363_H
#include "lowmc_pars.h"
extern const lowmc_t lowmc_256_256_363;
#endif

View File

@ -11,25 +11,17 @@
#define ADDMUL mzd_addmul_v_s128_128
#define MUL mzd_mul_v_s128_128
#define SHUFFLE mzd_shuffle_128
#define SHUFFLE mzd_shuffle_128_30
#define XOR mzd_xor_s128_128
#define COPY mzd_copy_s128_128
#define MUL_MC_1 mzd_mul_v_s128_128_640
#define MUL_MC_10 mzd_mul_v_s128_128_640
#define ADDMUL_R_1 mzd_addmul_v_s128_3_128
#define ADDMUL_R_10 mzd_addmul_v_s128_30_128
#define MUL_Z_1 mzd_mul_v_parity_uint64_128_3
#define MUL_Z_10 mzd_mul_v_parity_uint64_128_30
#define XOR_MC_1 mzd_xor_s128_640
#define XOR_MC_10 mzd_xor_s128_640
#define MUL_MC mzd_mul_v_s128_128_640
#define ADDMUL_R mzd_addmul_v_s128_30_128
#define MUL_Z mzd_mul_v_parity_uint64_128_30
#define XOR_MC mzd_xor_s128_640
#if defined(WITH_LOWMC_128_128_20)
#define LOWMC_INSTANCE_10 lowmc_128_128_20
#endif
#if defined(WITH_LOWMC_128_128_182)
#define LOWMC_INSTANCE_1 lowmc_128_128_182
#define LOWMC_INSTANCE lowmc_128_128_20
#endif
#define LOWMC_N LOWMC_L1_N
#define LOWMC_R_10 LOWMC_L1_R
#define LOWMC_R_1 LOWMC_L1_1_R
#define LOWMC_R LOWMC_L1_R

View File

@ -11,25 +11,17 @@
#define ADDMUL mzd_addmul_v_s128_192
#define MUL mzd_mul_v_s128_192
#define SHUFFLE mzd_shuffle_192
#define SHUFFLE mzd_shuffle_192_30
#define XOR mzd_xor_s128_256
#define COPY mzd_copy_s128_256
#define MUL_MC_1 mzd_mul_v_s128_192_896
#define MUL_MC_10 mzd_mul_v_s128_192_1024
#define ADDMUL_R_1 mzd_addmul_v_s128_3_192
#define ADDMUL_R_10 mzd_addmul_v_s128_30_192
#define MUL_Z_1 mzd_mul_v_parity_uint64_192_3
#define MUL_Z_10 mzd_mul_v_parity_uint64_192_30
#define XOR_MC_1 mzd_xor_s128_896
#define XOR_MC_10 mzd_xor_s128_1024
#define MUL_MC mzd_mul_v_s128_192_1024
#define ADDMUL_R mzd_addmul_v_s128_30_192
#define MUL_Z mzd_mul_v_parity_uint64_192_30
#define XOR_MC mzd_xor_s128_1024
#if defined(WITH_LOWMC_192_192_30)
#define LOWMC_INSTANCE_10 lowmc_192_192_30
#endif
#if defined(WITH_LOWMC_192_192_284)
#define LOWMC_INSTANCE_1 lowmc_192_192_284
#define LOWMC_INSTANCE lowmc_192_192_30
#endif
#define LOWMC_N LOWMC_L3_N
#define LOWMC_R_10 LOWMC_L3_R
#define LOWMC_R_1 LOWMC_L3_1_R
#define LOWMC_R LOWMC_L3_R

View File

@ -11,25 +11,17 @@
#define ADDMUL mzd_addmul_v_s128_256
#define MUL mzd_mul_v_s128_256
#define SHUFFLE mzd_shuffle_256
#define SHUFFLE mzd_shuffle_256_30
#define XOR mzd_xor_s128_256
#define COPY mzd_copy_s128_256
#define MUL_MC_1 mzd_mul_v_s128_256_1152
#define MUL_MC_10 mzd_mul_v_s128_256_1280
#define ADDMUL_R_1 mzd_addmul_v_s128_3_256
#define ADDMUL_R_10 mzd_addmul_v_s128_30_256
#define MUL_Z_1 mzd_mul_v_parity_uint64_256_3
#define MUL_Z_10 mzd_mul_v_parity_uint64_256_30
#define XOR_MC_1 mzd_xor_s128_1152
#define XOR_MC_10 mzd_xor_s128_1280
#define MUL_MC mzd_mul_v_s128_256_1280
#define ADDMUL_R mzd_addmul_v_s128_30_256
#define MUL_Z mzd_mul_v_parity_uint64_256_30
#define XOR_MC mzd_xor_s128_1280
#if defined(WITH_LOWMC_256_256_38)
#define LOWMC_INSTANCE_10 lowmc_256_256_38
#endif
#if defined(WITH_LOWMC_256_256_363)
#define LOWMC_INSTANCE_1 lowmc_256_256_363
#define LOWMC_INSTANCE lowmc_256_256_38
#endif
#define LOWMC_N LOWMC_L5_N
#define LOWMC_R_10 LOWMC_L5_R
#define LOWMC_R_1 LOWMC_L5_1_R
#define LOWMC_R LOWMC_L5_R

View File

@ -11,25 +11,17 @@
#define ADDMUL mzd_addmul_v_s256_128
#define MUL mzd_mul_v_s256_128
#define SHUFFLE mzd_shuffle_pext_128
#define SHUFFLE mzd_shuffle_pext_128_30
#define XOR mzd_xor_s256_128
#define COPY mzd_copy_s256_128
#define MUL_MC_1 mzd_mul_v_s256_128_768
#define MUL_MC_10 mzd_mul_v_s256_128_768
#define ADDMUL_R_1 mzd_addmul_v_s256_3_128
#define ADDMUL_R_10 mzd_addmul_v_s256_30_128
#define MUL_Z_1 mzd_mul_v_parity_uint64_128_3
#define MUL_Z_10 mzd_mul_v_parity_uint64_128_30
#define XOR_MC_1 mzd_xor_s256_768
#define XOR_MC_10 mzd_xor_s256_768
#define MUL_MC mzd_mul_v_s256_128_768
#define ADDMUL_R mzd_addmul_v_s256_30_128
#define MUL_Z mzd_mul_v_parity_uint64_128_30
#define XOR_MC mzd_xor_s256_768
#if defined(WITH_LOWMC_128_128_20)
#define LOWMC_INSTANCE_10 lowmc_128_128_20
#endif
#if defined(WITH_LOWMC_128_128_182)
#define LOWMC_INSTANCE_1 lowmc_128_128_182
#define LOWMC_INSTANCE lowmc_128_128_20
#endif
#define LOWMC_N LOWMC_L1_N
#define LOWMC_R_10 LOWMC_L1_R
#define LOWMC_R_1 LOWMC_L1_1_R
#define LOWMC_R LOWMC_L1_R

View File

@ -11,25 +11,17 @@
#define ADDMUL mzd_addmul_v_s256_192
#define MUL mzd_mul_v_s256_192
#define SHUFFLE mzd_shuffle_pext_192
#define SHUFFLE mzd_shuffle_pext_192_30
#define XOR mzd_xor_s256_256
#define COPY mzd_copy_s256_256
#define MUL_MC_1 mzd_mul_v_s256_192_1024
#define MUL_MC_10 mzd_mul_v_s256_192_1024
#define ADDMUL_R_1 mzd_addmul_v_s256_3_192
#define ADDMUL_R_10 mzd_addmul_v_s256_30_192
#define MUL_Z_1 mzd_mul_v_parity_uint64_192_3
#define MUL_Z_10 mzd_mul_v_parity_uint64_192_30
#define XOR_MC_1 mzd_xor_s256_1024
#define XOR_MC_10 mzd_xor_s256_1024
#define MUL_MC mzd_mul_v_s256_192_1024
#define ADDMUL_R mzd_addmul_v_s256_30_192
#define MUL_Z mzd_mul_v_parity_uint64_192_30
#define XOR_MC mzd_xor_s256_1024
#if defined(WITH_LOWMC_192_192_30)
#define LOWMC_INSTANCE_10 lowmc_192_192_30
#endif
#if defined(WITH_LOWMC_192_192_284)
#define LOWMC_INSTANCE_1 lowmc_192_192_284
#define LOWMC_INSTANCE lowmc_192_192_30
#endif
#define LOWMC_N LOWMC_L3_N
#define LOWMC_R_10 LOWMC_L3_R
#define LOWMC_R_1 LOWMC_L3_1_R
#define LOWMC_R LOWMC_L3_R

View File

@ -11,25 +11,17 @@
#define ADDMUL mzd_addmul_v_s256_256
#define MUL mzd_mul_v_s256_256
#define SHUFFLE mzd_shuffle_pext_256
#define SHUFFLE mzd_shuffle_pext_256_30
#define XOR mzd_xor_s256_256
#define COPY mzd_copy_s256_256
#define MUL_MC_1 mzd_mul_v_s256_256_1280
#define MUL_MC_10 mzd_mul_v_s256_256_1280
#define ADDMUL_R_1 mzd_addmul_v_s256_3_256
#define ADDMUL_R_10 mzd_addmul_v_s256_30_256
#define MUL_Z_1 mzd_mul_v_parity_uint64_256_3
#define MUL_Z_10 mzd_mul_v_parity_uint64_256_30
#define XOR_MC_1 mzd_xor_s256_1280
#define XOR_MC_10 mzd_xor_s256_1280
#define MUL_MC mzd_mul_v_s256_256_1280
#define ADDMUL_R mzd_addmul_v_s256_30_256
#define MUL_Z mzd_mul_v_parity_uint64_256_30
#define XOR_MC mzd_xor_s256_1280
#if defined(WITH_LOWMC_256_256_38)
#define LOWMC_INSTANCE_10 lowmc_256_256_38
#endif
#if defined(WITH_LOWMC_256_256_363)
#define LOWMC_INSTANCE_1 lowmc_256_256_363
#define LOWMC_INSTANCE lowmc_256_256_38
#endif
#define LOWMC_N LOWMC_L5_N
#define LOWMC_R_10 LOWMC_L5_R
#define LOWMC_R_1 LOWMC_L5_1_R
#define LOWMC_R LOWMC_L5_R

View File

@ -12,24 +12,16 @@
#define ADDMUL mzd_addmul_v_uint64_128
#define MUL mzd_mul_v_uint64_128
#define XOR mzd_xor_uint64_128
#define SHUFFLE mzd_shuffle_128
#define SHUFFLE mzd_shuffle_128_30
#define COPY mzd_copy_uint64_128
#define MUL_MC_1 mzd_mul_v_uint64_128_576
#define MUL_MC_10 mzd_mul_v_uint64_128_640
#define ADDMUL_R_1 mzd_addmul_v_uint64_3_128
#define ADDMUL_R_10 mzd_addmul_v_uint64_30_128
#define MUL_Z_1 mzd_mul_v_parity_uint64_128_3
#define MUL_Z_10 mzd_mul_v_parity_uint64_128_30
#define XOR_MC_1 mzd_xor_uint64_576
#define XOR_MC_10 mzd_xor_uint64_640
#define MUL_MC mzd_mul_v_uint64_128_640
#define ADDMUL_R mzd_addmul_v_uint64_30_128
#define MUL_Z mzd_mul_v_parity_uint64_128_30
#define XOR_MC mzd_xor_uint64_640
#define LOWMC_N LOWMC_L1_N
#define LOWMC_R_10 LOWMC_L1_R
#define LOWMC_R_1 LOWMC_L1_1_R
#if defined(WITH_LOWMC_128_128_20)
#define LOWMC_INSTANCE_10 lowmc_128_128_20
#endif
#if defined(WITH_LOWMC_128_128_182)
#define LOWMC_INSTANCE_1 lowmc_128_128_182
#define LOWMC_INSTANCE lowmc_128_128_20
#endif
#define LOWMC_N LOWMC_L1_N
#define LOWMC_R LOWMC_L1_R

View File

@ -11,25 +11,17 @@
#define ADDMUL mzd_addmul_v_uint64_192
#define MUL mzd_mul_v_uint64_192
#define SHUFFLE mzd_shuffle_192
#define SHUFFLE mzd_shuffle_192_30
#define XOR mzd_xor_uint64_192
#define COPY mzd_copy_uint64_192
#define MUL_MC_1 mzd_mul_v_uint64_192_896
#define MUL_MC_10 mzd_mul_v_uint64_192_960
#define ADDMUL_R_1 mzd_addmul_v_uint64_3_192
#define ADDMUL_R_10 mzd_addmul_v_uint64_30_192
#define MUL_Z_1 mzd_mul_v_parity_uint64_192_3
#define MUL_Z_10 mzd_mul_v_parity_uint64_192_30
#define XOR_MC_1 mzd_xor_uint64_896
#define XOR_MC_10 mzd_xor_uint64_960
#define MUL_MC mzd_mul_v_uint64_192_960
#define ADDMUL_R mzd_addmul_v_uint64_30_192
#define MUL_Z mzd_mul_v_parity_uint64_192_30
#define XOR_MC mzd_xor_uint64_960
#define LOWMC_N LOWMC_L3_N
#define LOWMC_R_10 LOWMC_L3_R
#define LOWMC_R_1 LOWMC_L3_1_R
#if defined(WITH_LOWMC_192_192_30)
#define LOWMC_INSTANCE_10 lowmc_192_192_30
#endif
#if defined(WITH_LOWMC_192_192_284)
#define LOWMC_INSTANCE_1 lowmc_192_192_284
#define LOWMC_INSTANCE lowmc_192_192_30
#endif
#define LOWMC_N LOWMC_L3_N
#define LOWMC_R LOWMC_L3_R

View File

@ -11,25 +11,17 @@
#define ADDMUL mzd_addmul_v_uint64_256
#define MUL mzd_mul_v_uint64_256
#define SHUFFLE mzd_shuffle_256
#define SHUFFLE mzd_shuffle_256_30
#define XOR mzd_xor_uint64_256
#define COPY mzd_copy_uint64_256
#define MUL_MC_1 mzd_mul_v_uint64_256_1152
#define MUL_MC_10 mzd_mul_v_uint64_256_1216
#define ADDMUL_R_1 mzd_addmul_v_uint64_3_256
#define ADDMUL_R_10 mzd_addmul_v_uint64_30_256
#define MUL_Z_1 mzd_mul_v_parity_uint64_256_3
#define MUL_Z_10 mzd_mul_v_parity_uint64_256_30
#define XOR_MC_1 mzd_xor_uint64_1152
#define XOR_MC_10 mzd_xor_uint64_1216
#define MUL_MC mzd_mul_v_uint64_256_1216
#define ADDMUL_R mzd_addmul_v_uint64_30_256
#define MUL_Z mzd_mul_v_parity_uint64_256_30
#define XOR_MC mzd_xor_uint64_1216
#define LOWMC_N LOWMC_L5_N
#define LOWMC_R_10 LOWMC_L5_R
#define LOWMC_R_1 LOWMC_L5_1_R
#if defined(WITH_LOWMC_256_256_38)
#define LOWMC_INSTANCE_10 lowmc_256_256_38
#endif
#if defined(WITH_LOWMC_256_256_363)
#define LOWMC_INSTANCE_1 lowmc_256_256_363
#define LOWMC_INSTANCE lowmc_256_256_38
#endif
#define LOWMC_N LOWMC_L5_N
#define LOWMC_R LOWMC_L5_R

View File

@ -9,19 +9,13 @@
#undef ADDMUL
#undef COPY
#undef LOWMC_INSTANCE_1
#undef LOWMC_INSTANCE_10
#undef LOWMC_INSTANCE
#undef LOWMC_N
#undef LOWMC_R_1
#undef LOWMC_R_10
#undef LOWMC_R
#undef MUL
#undef MUL_MC_1
#undef MUL_MC_10
#undef ADDMUL_R_1
#undef ADDMUL_R_10
#undef MUL_Z_1
#undef MUL_Z_10
#undef MUL_MC
#undef ADDMUL_R
#undef MUL_Z
#undef SHUFFLE
#undef XOR
#undef XOR_MC_1
#undef XOR_MC_10
#undef XOR_MC

View File

@ -7,9 +7,6 @@
* SPDX-License-Identifier: MIT
*/
#if defined(OPTIMIZED_LINEAR_LAYER_EVALUATION) && !defined(M_FIXED_1) && !defined(M_FIXED_10)
#error "OLLE is only implemented for 1 or 10 Sboxes"
#endif
#if defined(FN_ATTR)
FN_ATTR
@ -26,11 +23,7 @@ static void N_LOWMC(lowmc_key_t const* lowmc_key, mzd_local_t const* p, mzd_loca
mzd_local_t x[((LOWMC_N) + 255) / 256];
mzd_local_t y[((LOWMC_N) + 255) / 256];
#if defined(REDUCED_ROUND_KEY_COMPUTATION)
#if defined(M_FIXED_10)
mzd_local_t nl_part[(LOWMC_R * 32 + 255) / 256];
#elif defined(M_FIXED_1)
mzd_local_t nl_part[(((LOWMC_R + 20) / 21) * 64 + 255) / 256];
#endif
mzd_local_t nl_part[(LOWMC_R * 32 + 255) / 256];
#if defined(OPTIMIZED_LINEAR_LAYER_EVALUATION) // LOWMC_OPT=OLLE
#if defined(PICNIC2_AUX_COMPUTATION)
@ -56,27 +49,16 @@ static void N_LOWMC(lowmc_key_t const* lowmc_key, mzd_local_t const* p, mzd_loca
SBOX(x);
#endif
#if defined(M_FIXED_10)
const word nl = CONST_BLOCK(nl_part, i >> 3)->w64[(i & 0x7) >> 1];
BLOCK(x, 0)->w64[(LOWMC_N) / (sizeof(word) * 8) - 1] ^=
(nl << (1 - (i & 1)) * 32) & WORD_C(0xFFFFFFFF00000000);
#elif defined(M_FIXED_1)
const word nl = CONST_BLOCK(nl_part, i / (4 * 21))->w64[(i % (4 * 21)) / 21];
BLOCK(x, 0)->w64[(LOWMC_N) / (sizeof(word) * 8) - 1] ^=
(nl << ((20 - (i % 21)) * 3)) & WORD_C(0xE000000000000000);
#endif
MUL_Z(y, x, round->z_matrix);
MZD_SHUFFLE(x, round->r_mask);
SHUFFLE(x, round->r_mask);
ADDMUL_R(y, x, round->r_matrix);
#if defined(M_FIXED_10)
BLOCK(x, 0)->w64[(LOWMC_N) / (sizeof(word) * 8) - 1] &=
WORD_C(0x00000003FFFFFFFF); // clear nl part
#elif defined(M_FIXED_1)
BLOCK(x, 0)->w64[(LOWMC_N) / (sizeof(word) * 8) - 1] &=
WORD_C(0x1FFFFFFFFFFFFFFF); // clear nl part
#endif
XOR(x, y, x);
}
#if defined(RECORD_STATE)
@ -88,15 +70,9 @@ static void N_LOWMC(lowmc_key_t const* lowmc_key, mzd_local_t const* p, mzd_loca
SBOX(x);
unsigned int i = (LOWMC_R - 1);
#if defined(M_FIXED_10)
const word nl = CONST_BLOCK(nl_part, i >> 3)->w64[(i & 0x7) >> 1];
BLOCK(x, 0)->w64[(LOWMC_N) / (sizeof(word) * 8) - 1] ^=
(nl << (1 - (i & 1)) * 32) & WORD_C(0xFFFFFFFF00000000);
#elif defined(M_FIXED_1)
const word nl = CONST_BLOCK(nl_part, i / (4 * 21))->w64[(i % (4 * 21)) / 21];
BLOCK(x, 0)->w64[(LOWMC_N) / (sizeof(word) * 8) - 1] ^=
(nl << ((20 - (i % 21)) * 3)) & WORD_C(0xE000000000000000);
#endif
MUL(y, x, LOWMC_INSTANCE.zr_matrix);
COPY(x, y);
#endif
@ -122,15 +98,9 @@ static void N_LOWMC(lowmc_key_t const* lowmc_key, mzd_local_t const* p, mzd_loca
SBOX(x);
#endif
#if defined(M_FIXED_10)
const word nl = CONST_BLOCK(nl_part, i >> 3)->w64[(i & 0x7) >> 1];
BLOCK(x, 0)->w64[(LOWMC_N) / (sizeof(word) * 8) - 1] ^=
(i & 1) ? (nl & WORD_C(0xFFFFFFFF00000000)) : (nl << 32);
#elif defined(M_FIXED_1)
const word nl = CONST_BLOCK(nl_part, i / (4 * 21))->w64[(i % (4 * 21)) / 21];
BLOCK(x, 0)->w64[(LOWMC_N) / (sizeof(word) * 8) - 1] ^=
(nl << ((20 - (i % 21)) * 3)) & WORD_C(0xE000000000000000);
#endif
MUL(y, x, round->l_matrix);
COPY(x, y);
}

View File

@ -20,6 +20,8 @@ typedef mzd_local_t lowmc_key_t;
#define MAX_LOWMC_BLOCK_SIZE_BITS (MAX_LOWMC_BLOCK_SIZE * 8)
#define MAX_LOWMC_KEY_SIZE MAX_LOWMC_BLOCK_SIZE
#define MAX_LOWMC_KEY_SIZE_BITS (MAX_LOWMC_KEY_SIZE * 8)
#define MAX_LOWMC_ROUNDS 38
#define MAX_LOWMC_SBOXES 10
/**
* Masks for 10 S-boxes.
@ -29,14 +31,6 @@ typedef mzd_local_t lowmc_key_t;
#define MASK_X2I UINT64_C(0x9249249000000000)
#define MASK_MASK UINT64_C(0x00000003ffffffff)
/**
* Masks for 1 S-box.
*/
#define MASK_X0I_1 UINT64_C(0x2000000000000000)
#define MASK_X1I_1 UINT64_C(0x4000000000000000)
#define MASK_X2I_1 UINT64_C(0x8000000000000000)
#define MASK_MASK_1 UINT64_C(0x1fffffffffffffff)
/**
* LowMC instances
*/
@ -55,21 +49,6 @@ typedef mzd_local_t lowmc_key_t;
#define LOWMC_L5_K LOWMC_L5_N
#define LOWMC_L5_R 38
#define LOWMC_L1_1_N 128
#define LOWMC_L1_1_M 1
#define LOWMC_L1_1_K LOWMC_L1_1_N
#define LOWMC_L1_1_R 182
#define LOWMC_L3_1_N 192
#define LOWMC_L3_1_M 1
#define LOWMC_L3_1_K LOWMC_L3_1_N
#define LOWMC_L3_1_R 284
#define LOWMC_L5_1_N 256
#define LOWMC_L5_1_M 1
#define LOWMC_L5_1_K LOWMC_L5_1_N
#define LOWMC_L5_1_R 363
typedef struct {
#if !defined(REDUCED_ROUND_KEY_COMPUTATION)
const mzd_local_t* k_matrix;

View File

@ -21,11 +21,11 @@
#endif
/* compatibility with clang and other compilers */
#ifndef __has_attribute
#if !defined(__has_attribute)
#define __has_attribute(a) 0
#endif
#ifndef __has_builtin
#if !defined(__has_builtin)
#define __has_builtin(b) 0
#endif
@ -83,8 +83,10 @@
/* note that C11's alignas will only do the job once DR 444 is implemented */
#if GNUC_CHECK(4, 9) || __has_attribute(aligned)
#define ATTR_ALIGNED(i) __attribute__((aligned((i))))
#define HAVE_USEFUL_ATTR_ALIGNED
/* #elif defined(_MSC_VER)
#define ATTR_ALIGNED(i) __declspec(align((i))) */
#define ATTR_ALIGNED(i) __declspec(align((i)))
#define HAVE_USEFUL_ATTR_ALIGNED */
#else
#define ATTR_ALIGNED(i)
#endif
@ -103,7 +105,7 @@
/* assume aligned builtin */
#if GNUC_CHECK(4, 9) || __has_builtin(__builtin_assume_aligned)
#define ASSUME_ALIGNED(p, a) __builtin_assume_aligned((p), (a))
#elif defined(UNREACHABLE)
#elif defined(UNREACHABLE) && defined(HAVE_USEFUL_ATTR_ALIGNED)
#define ASSUME_ALIGNED(p, a) (((((uintptr_t)(p)) % (a)) == 0) ? (p) : (UNREACHABLE, (p)))
#else
#define ASSUME_ALIGNED(p, a) (p)
@ -249,4 +251,10 @@ static inline uint32_t ceil_log2(uint32_t x) {
return 32 - clz(x - 1);
}
#if defined(__WIN32__)
#define SIZET_FMT "%Iu"
#else
#define SIZET_FMT "%zu"
#endif
#endif
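A brief hedged illustration of the additions visible above (the helper function is hypothetical):

#include <stdio.h>
#include <stddef.h>

static void print_alloc(size_t alloc_size) {
  /* SIZET_FMT expands to "%Iu" on __WIN32__ and "%zu" elsewhere. */
  printf("allocated " SIZET_FMT " bytes\n", alloc_size);
}
/* For the earlier ceil_log2: ceil_log2(5) == 32 - clz(4) == 32 - 29 == 3. */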

View File

@ -141,45 +141,6 @@ static void mpc_and_verify_uint64(uint64_t* res, uint64_t const* first, uint64_t
} \
} while (0)
#define bitsliced_step_1_uint64_1(sc) \
uint64_t r0m[sc]; \
uint64_t r0s[sc]; \
uint64_t r1m[sc]; \
uint64_t r1s[sc]; \
uint64_t r2m[sc]; \
uint64_t x0s[sc]; \
uint64_t x1s[sc]; \
uint64_t x2m[sc]; \
do { \
for (unsigned int m = 0; m < (sc); ++m) { \
const uint64_t inm = in[m]; \
const uint64_t rvecm = rvec[m]; \
\
x0s[m] = (inm & MASK_X0I_1) << 2; \
x1s[m] = (inm & MASK_X1I_1) << 1; \
x2m[m] = inm & MASK_X2I_1; \
\
r0m[m] = rvecm & MASK_X0I_1; \
r1m[m] = rvecm & MASK_X1I_1; \
r2m[m] = rvecm & MASK_X2I_1; \
\
r0s[m] = r0m[m] << 2; \
r1s[m] = r1m[m] << 1; \
} \
} while (0)
#define bitsliced_step_2_uint64_1(sc) \
do { \
for (unsigned int m = 0; m < (sc); ++m) { \
const uint64_t tmp1 = r2m[m] ^ x0s[m]; \
const uint64_t tmp2 = x0s[m] ^ x1s[m]; \
const uint64_t tmp3 = tmp2 ^ r1m[m]; \
const uint64_t tmp4 = tmp2 ^ r0m[m] ^ x2m[m]; \
\
in[m] = (in[m] & MASK_MASK_1) ^ (tmp4) ^ (tmp1 >> 2) ^ (tmp3 >> 1); \
} \
} while (0)
static void mpc_sbox_layer_bitsliced_uint64_10(uint64_t* in, view_t* view, uint64_t const* rvec) {
bitsliced_step_1_uint64_10(SC_PROOF);
@ -201,29 +162,6 @@ static void mpc_sbox_layer_bitsliced_verify_uint64_10(uint64_t* in, view_t* view
bitsliced_step_2_uint64_10(SC_VERIFY);
}
#if defined(WITH_LOWMC_M1)
static void mpc_sbox_layer_bitsliced_uint64_1(uint64_t* in, view_t* view, uint64_t const* rvec) {
bitsliced_step_1_uint64_1(SC_PROOF);
mpc_and_uint64(r0m, x0s, x1s, r2m, view, 0);
mpc_and_uint64(r2m, x1s, x2m, r1s, view, 1);
mpc_and_uint64(r1m, x0s, x2m, r0s, view, 2);
bitsliced_step_2_uint64_1(SC_PROOF - 1);
}
static void mpc_sbox_layer_bitsliced_verify_uint64_1(uint64_t* in, view_t* view,
uint64_t const* rvec) {
bitsliced_step_1_uint64_1(SC_VERIFY);
mpc_and_verify_uint64(r0m, x0s, x1s, r2m, view, MASK_X2I_1, 0);
mpc_and_verify_uint64(r2m, x1s, x2m, r1s, view, MASK_X2I_1, 1);
mpc_and_verify_uint64(r1m, x0s, x2m, r0s, view, MASK_X2I_1, 2);
bitsliced_step_2_uint64_1(SC_VERIFY);
}
#endif
#if defined(WITH_LOWMC_128_128_20)
#include "lowmc_128_128_20.h"
#endif
@ -233,15 +171,6 @@ static void mpc_sbox_layer_bitsliced_verify_uint64_1(uint64_t* in, view_t* view,
#if defined(WITH_LOWMC_256_256_38)
#include "lowmc_256_256_38.h"
#endif
#if defined(WITH_LOWMC_128_128_182)
#include "lowmc_128_128_182.h"
#endif
#if defined(WITH_LOWMC_192_192_284)
#include "lowmc_192_192_284.h"
#endif
#if defined(WITH_LOWMC_256_256_363)
#include "lowmc_256_256_363.h"
#endif
#define SBOX_uint64(sbox, y, x, views, r, n, shares, shares2) \
do { \
@ -259,6 +188,7 @@ static void mpc_sbox_layer_bitsliced_verify_uint64_1(uint64_t* in, view_t* view,
#define R_uint64 const uint64_t* r = rvec[i].t
#if !defined(NO_UINT64_FALLBACK)
// uint64 based implementation
#include "lowmc_fns_uint64_L1.h"
#define SIGN mpc_lowmc_call_uint64_128
@ -274,6 +204,7 @@ static void mpc_sbox_layer_bitsliced_verify_uint64_1(uint64_t* in, view_t* view,
#define SIGN mpc_lowmc_call_uint64_256
#define VERIFY mpc_lowmc_call_verify_uint64_256
#include "mpc_lowmc.c.i"
#endif
#if defined(WITH_OPT)
#if defined(WITH_SSE2) || defined(WITH_NEON)
@ -329,11 +260,7 @@ static void mpc_sbox_layer_bitsliced_verify_uint64_1(uint64_t* in, view_t* view,
#endif
zkbpp_lowmc_implementation_f get_zkbpp_lowmc_implementation(const lowmc_t* lowmc) {
#if defined(WITH_LOWMC_M1)
ASSUME(lowmc->m == 10 || lowmc->m == 1);
#else
ASSUME(lowmc->m == 10);
#endif
ASSUME(lowmc->n == 128 || lowmc->n == 192 || lowmc->n == 256);
#if defined(WITH_OPT)
@ -355,24 +282,6 @@ zkbpp_lowmc_implementation_f get_zkbpp_lowmc_implementation(const lowmc_t* lowmc
#endif
}
}
#if defined(WITH_LOWMC_M1)
if (lowmc->m == 1) {
switch (lowmc->n) {
#if defined(WITH_LOWMC_128_128_182)
case 128:
return mpc_lowmc_call_s256_128_1;
#endif
#if defined(WITH_LOWMC_192_192_284)
case 192:
return mpc_lowmc_call_s256_192_1;
#endif
#if defined(WITH_LOWMC_256_256_363)
case 256:
return mpc_lowmc_call_s256_256_1;
#endif
}
}
#endif
}
#endif
#if defined(WITH_SSE2) || defined(WITH_NEON)
@ -393,28 +302,11 @@ zkbpp_lowmc_implementation_f get_zkbpp_lowmc_implementation(const lowmc_t* lowmc
#endif
}
}
#if defined(WITH_LOWMC_M1)
if (lowmc->m == 1) {
switch (lowmc->n) {
#if defined(WITH_LOWMC_128_128_182)
case 128:
return mpc_lowmc_call_s128_128_1;
#endif
#if defined(WITH_LOWMC_192_192_284)
case 192:
return mpc_lowmc_call_s128_192_1;
#endif
#if defined(WITH_LOWMC_256_256_363)
case 256:
return mpc_lowmc_call_s128_256_1;
#endif
}
}
#endif
}
#endif
#endif
#if !defined(NO_UINT64_FALLBACK)
if (lowmc->m == 10) {
switch (lowmc->n) {
#if defined(WITH_LOWMC_128_128_20)
@ -432,34 +324,13 @@ zkbpp_lowmc_implementation_f get_zkbpp_lowmc_implementation(const lowmc_t* lowmc
}
}
#if defined(WITH_LOWMC_M1)
if (lowmc->m == 1) {
switch (lowmc->n) {
#if defined(WITH_LOWMC_128_128_182)
case 128:
return mpc_lowmc_call_uint64_128_1;
#endif
#if defined(WITH_LOWMC_192_192_284)
case 192:
return mpc_lowmc_call_uint64_192_1;
#endif
#if defined(WITH_LOWMC_256_256_363)
case 256:
return mpc_lowmc_call_uint64_256_1;
#endif
}
}
#endif
return NULL;
}
zkbpp_lowmc_verify_implementation_f get_zkbpp_lowmc_verify_implementation(const lowmc_t* lowmc) {
#if defined(WITH_LOWMC_M1)
ASSUME(lowmc->m == 10 || lowmc->m == 1);
#else
ASSUME(lowmc->m == 10);
#endif
ASSUME(lowmc->n == 128 || lowmc->n == 192 || lowmc->n == 256);
#if defined(WITH_OPT)
@ -481,24 +352,6 @@ zkbpp_lowmc_verify_implementation_f get_zkbpp_lowmc_verify_implementation(const
#endif
}
}
#if defined(WITH_LOWMC_M1)
if (lowmc->m == 1) {
switch (lowmc->n) {
#if defined(WITH_LOWMC_128_128_182)
case 128:
return mpc_lowmc_call_verify_s256_128_1;
#endif
#if defined(WITH_LOWMC_192_192_284)
case 192:
return mpc_lowmc_call_verify_s256_192_1;
#endif
#if defined(WITH_LOWMC_256_256_363)
case 256:
return mpc_lowmc_call_verify_s256_256_1;
#endif
}
}
#endif
}
#endif
#if defined(WITH_SSE2) || defined(WITH_NEON)
@ -519,24 +372,6 @@ zkbpp_lowmc_verify_implementation_f get_zkbpp_lowmc_verify_implementation(const
#endif
}
}
#if defined(WITH_LOWMC_M1)
if (lowmc->m == 1) {
switch (lowmc->n) {
#if defined(WITH_LOWMC_128_128_182)
case 128:
return mpc_lowmc_call_verify_s128_128_1;
#endif
#if defined(WITH_LOWMC_192_192_284)
case 192:
return mpc_lowmc_call_verify_s128_192_1;
#endif
#if defined(WITH_LOWMC_256_256_363)
case 256:
return mpc_lowmc_call_verify_s128_256_1;
#endif
}
}
#endif
}
#endif
#if defined(WITH_NEON)
@ -557,28 +392,11 @@ zkbpp_lowmc_verify_implementation_f get_zkbpp_lowmc_verify_implementation(const
#endif
}
}
#if defined(WITH_LOWMC_M1)
if (lowmc->m == 1) {
switch (lowmc->n) {
#if defined(WITH_LOWMC_128_128_182)
case 128:
return mpc_lowmc_call_verify_s128_128_1;
#endif
#if defined(WITH_LOWMC_192_192_284)
case 192:
return mpc_lowmc_call_verify_s128_192_1;
#endif
#if defined(WITH_LOWMC_256_256_363)
case 256:
return mpc_lowmc_call_verify_s128_256_1;
#endif
}
}
#endif
}
#endif
#endif
#if !defined(NO_UINT64_FALLBACK)
if (lowmc->m == 10) {
switch (lowmc->n) {
#if defined(WITH_LOWMC_128_128_20)
@ -595,29 +413,12 @@ zkbpp_lowmc_verify_implementation_f get_zkbpp_lowmc_verify_implementation(const
#endif
}
}
#if defined(WITH_LOWMC_M1)
if (lowmc->m == 1) {
switch (lowmc->n) {
#if defined(WITH_LOWMC_128_128_182)
case 128:
return mpc_lowmc_call_verify_uint64_128_1;
#endif
#if defined(WITH_LOWMC_192_192_284)
case 192:
return mpc_lowmc_call_verify_uint64_192_1;
#endif
#if defined(WITH_LOWMC_256_256_363)
case 256:
return mpc_lowmc_call_verify_uint64_256_1;
#endif
}
}
#endif
return NULL;
}
#if !defined(NO_UINT64_FALLBACK)
static void mzd_share_uint64_128(mzd_local_t* r, const mzd_local_t* v1, const mzd_local_t* v2,
const mzd_local_t* v3) {
mzd_xor_uint64_128(r, v1, v2);
@ -635,6 +436,7 @@ static void mzd_share_uint64_256(mzd_local_t* r, const mzd_local_t* v1, const mz
mzd_xor_uint64_256(r, v1, v2);
mzd_xor_uint64_256(r, r, v3);
}
#endif
#if defined(WITH_OPT)
#if defined(WITH_SSE2) || defined(WITH_NEON)
@ -702,6 +504,7 @@ zkbpp_share_implementation_f get_zkbpp_share_implentation(const lowmc_t* lowmc)
#endif
#endif
#if !defined(NO_UINT64_FALLBACK)
switch (lowmc->n) {
case 128:
return mzd_share_uint64_128;
@ -710,4 +513,5 @@ zkbpp_share_implementation_f get_zkbpp_share_implentation(const lowmc_t* lowmc)
default:
return mzd_share_uint64_256;
}
#endif
}

View File

@ -7,50 +7,14 @@
* SPDX-License-Identifier: MIT
*/
#if defined(LOWMC_INSTANCE_10)
#define M_FIXED_10
#if defined(LOWMC_INSTANCE)
#define N_SIGN CONCAT(SIGN, 10)
#define N_VERIFY CONCAT(VERIFY, 10)
#define MZD_SHUFFLE CONCAT(SHUFFLE, 30)
#define ADDMUL_R ADDMUL_R_10
#define MUL_Z MUL_Z_10
#define XOR_MC XOR_MC_10
#define MUL_MC MUL_MC_10
#define LOWMC_R LOWMC_R_10
#define LOWMC_INSTANCE LOWMC_INSTANCE_10
#include "mpc_lowmc_impl.c.i"
#undef ADDMUL_R
#undef MUL_Z
#undef LOWMC_R
#undef LOWMC_INSTANCE
#undef M_FIXED_10
#undef MZD_SHUFFLE
#undef XOR_MC
#undef MUL_MC
#endif
#if defined(WITH_LOWMC_M1) && defined(LOWMC_INSTANCE_1)
#define M_FIXED_1
#define N_SIGN CONCAT(SIGN, 1)
#define N_VERIFY CONCAT(VERIFY, 1)
#define MZD_SHUFFLE CONCAT(SHUFFLE, 3)
#define ADDMUL_R ADDMUL_R_1
#define MUL_Z MUL_Z_1
#define XOR_MC XOR_MC_1
#define MUL_MC MUL_MC_1
#define LOWMC_R LOWMC_R_1
#define LOWMC_INSTANCE LOWMC_INSTANCE_1
#include "mpc_lowmc_impl.c.i"
#undef ADDMUL_R
#undef MUL_Z
#undef LOWMC_R
#undef LOWMC_INSTANCE
#undef M_FIXED_1
#undef MZD_SHUFFLE
#undef XOR_MC
#undef MUL_MC
#endif
#undef N_SIGN
#undef N_VERIFY
#undef SIGN
#undef VERIFY

View File

@ -10,27 +10,17 @@
#define RANDTAPE R_uint64
#define SBOX SBOX_uint64
#if defined(M_FIXED_10)
#define LOWMC_M 10
#undef SBOX_SIGN
#undef SBOX_VERIFY
#define LOWMC_M 10
#define SBOX_SIGN mpc_sbox_layer_bitsliced_uint64_10
#define SBOX_VERIFY mpc_sbox_layer_bitsliced_verify_uint64_10
#elif defined(M_FIXED_1)
#define LOWMC_M 1
#undef SBOX_SIGN
#undef SBOX_VERIFY
#define SBOX_SIGN mpc_sbox_layer_bitsliced_uint64_1
#define SBOX_VERIFY mpc_sbox_layer_bitsliced_verify_uint64_1
#endif
#if defined(FN_ATTR)
FN_ATTR
#endif
static void N_SIGN(mzd_local_t const* p, view_t* views, in_out_shares_t* in_out_shares,
rvec_t* rvec, recorded_state_t* recorded_state) {
rvec_t* rvec, recorded_state_t* recorded_state) {
#define reduced_shares (SC_PROOF - 1)
#define MPC_LOOP_CONST_C(function, result, first, second, sc, c) \
MPC_LOOP_CONST_C_0(function, result, first, second, sc)

View File

@ -7,17 +7,9 @@
* SPDX-License-Identifier: MIT
*/
#if defined(OPTIMIZED_LINEAR_LAYER_EVALUATION) && !defined(M_FIXED_1) && !defined(M_FIXED_10)
#error "OLLE is only implemented for 1 or 10 Sboxes"
#endif
lowmc_round_t const* round = LOWMC_INSTANCE.rounds;
#if defined(REDUCED_ROUND_KEY_COMPUTATION)
#if defined(M_FIXED_10)
mzd_local_t nl_part[reduced_shares][(LOWMC_R * 32 + 255) / 256];
#elif defined(M_FIXED_1)
mzd_local_t nl_part[reduced_shares][(((LOWMC_R + 20) / 21) * 64 + 255) / 256];
#endif
#if defined(OPTIMIZED_LINEAR_LAYER_EVALUATION)
MPC_LOOP_CONST_C(XOR, x, x, LOWMC_INSTANCE.precomputed_constant_linear, reduced_shares, ch);
MPC_LOOP_CONST(MUL_MC, nl_part, lowmc_key,
@ -30,28 +22,19 @@ lowmc_round_t const* round = LOWMC_INSTANCE.rounds;
#endif
SBOX(sbox, y, x, views, r, LOWMC_N, shares, reduced_shares);
for (unsigned int k = 0; k < reduced_shares; ++k) {
#if defined(M_FIXED_10)
const word nl = CONST_BLOCK(nl_part[k], i >> 3)->w64[(i & 0x7) >> 1];
BLOCK(y[k], 0)->w64[(LOWMC_N) / (sizeof(word) * 8) - 1] ^=
(i & 1) ? (nl & WORD_C(0xFFFFFFFF00000000)) : (nl << 32);
#elif defined(M_FIXED_1)
const word nl = CONST_BLOCK(nl_part[k], i / (4 * 21))->w64[(i % (4 * 21)) / 21];
BLOCK(y[k], 0)->w64[(LOWMC_N) / (sizeof(word) * 8) - 1] ^= (nl << ((20-(i%21))*3)) & WORD_C(0xE000000000000000);
#endif
}
MPC_LOOP_CONST(MUL_Z, x, y, round->z_matrix, reduced_shares);
for(unsigned int k = 0; k < reduced_shares; ++k) {
MZD_SHUFFLE(y[k], round->r_mask);
SHUFFLE(y[k], round->r_mask);
}
MPC_LOOP_CONST(ADDMUL_R, x, y, round->r_matrix, reduced_shares);
for(unsigned int k = 0; k < reduced_shares; ++k) {
#if defined(M_FIXED_10)
BLOCK(y[k], 0)->w64[(LOWMC_N) / (sizeof(word) * 8) - 1] &= WORD_C(0x00000003FFFFFFFF); //clear nl part
#elif defined(M_FIXED_1)
BLOCK(y[k], 0)->w64[(LOWMC_N) / (sizeof(word) * 8) - 1] &= WORD_C(0x1FFFFFFFFFFFFFFF); //clear nl part
#endif
}
MPC_LOOP_SHARED(XOR, x, x, y, reduced_shares);
}
@ -63,14 +46,9 @@ lowmc_round_t const* round = LOWMC_INSTANCE.rounds;
SBOX(sbox, y, x, views, r, LOWMC_N, shares, reduced_shares);
for (unsigned int k = 0; k < reduced_shares; ++k) {
#if defined(M_FIXED_10)
const word nl = CONST_BLOCK(nl_part[k], i >> 3)->w64[(i & 0x7) >> 1];
BLOCK(y[k], 0)->w64[(LOWMC_N) / (sizeof(word) * 8) - 1] ^=
(i & 1) ? (nl & WORD_C(0xFFFFFFFF00000000)) : (nl << 32);
#elif defined(M_FIXED_1)
const word nl = CONST_BLOCK(nl_part[k], i / (4 * 21))->w64[(i % (4 * 21)) / 21];
BLOCK(y[k], 0)->w64[(LOWMC_N) / (sizeof(word) * 8) - 1] ^= (nl << ((20-(i%21))*3)) & WORD_C(0xE000000000000000);
#endif
}
MPC_LOOP_CONST(MUL, x, y, LOWMC_INSTANCE.zr_matrix, reduced_shares);
#else
@ -85,14 +63,9 @@ lowmc_round_t const* round = LOWMC_INSTANCE.rounds;
#endif
SBOX(sbox, y, x, views, r, LOWMC_N, shares, reduced_shares);
for (unsigned int k = 0; k < reduced_shares; ++k) {
#if defined(M_FIXED_10)
const word nl = CONST_BLOCK(nl_part[k], i >> 3)->w64[(i & 0x7) >> 1];
BLOCK(y[k], 0)->w64[(LOWMC_N) / (sizeof(word) * 8) - 1] ^=
(i & 1) ? (nl & WORD_C(0xFFFFFFFF00000000)) : (nl << 32);
#elif defined(M_FIXED_1)
const word nl = CONST_BLOCK(nl_part[k], i / (4 * 21))->w64[(i % (4 * 21)) / 21];
BLOCK(y[k], 0)->w64[(LOWMC_N) / (sizeof(word) * 8) - 1] ^= (nl << ((20-(i%21))*3)) & WORD_C(0xE000000000000000);
#endif
}
MPC_LOOP_CONST(MUL, x, y, round->l_matrix, reduced_shares);
}

View File

@ -39,7 +39,7 @@ static_assert(((sizeof(mzd_local_t) + 0x1f) & ~0x1f) == 32, "sizeof mzd_local_t
#endif
static const unsigned int align_bound = 128 / (8 * sizeof(word));
static uint32_t calculate_rowstride(uint32_t width) {
static size_t calculate_rowstride(size_t width) {
// As soon as we hit the AVX bound, use 32 byte alignment. Otherwise use 16
// byte alignment for SSE2 and 128 bit vectors.
if (width > align_bound) {
@ -49,7 +49,7 @@ static uint32_t calculate_rowstride(uint32_t width) {
}
}
static uint32_t calculate_width(uint32_t c) {
static size_t calculate_width(size_t c) {
return (c + sizeof(word) * 8 - 1) / (sizeof(word) * 8);
}
@ -62,8 +62,7 @@ static uint32_t calculate_width(uint32_t c) {
// memory block.
mzd_local_t* mzd_local_init_ex(uint32_t r, uint32_t c, bool clear) {
const uint32_t width = calculate_width(c);
const uint32_t rowstride = calculate_rowstride(width);
const size_t rowstride = calculate_rowstride(calculate_width(c));
const size_t buffer_size = r * rowstride * sizeof(word);
const size_t alloc_size = (buffer_size + 31) & ~31;
@ -84,8 +83,7 @@ void mzd_local_free(mzd_local_t* v) {
}
void mzd_local_init_multiple_ex(mzd_local_t** dst, size_t n, uint32_t r, uint32_t c, bool clear) {
const uint32_t width = calculate_width(c);
const uint32_t rowstride = calculate_rowstride(width);
const size_t rowstride = calculate_rowstride(calculate_width(c));
const size_t buffer_size = r * rowstride * sizeof(word);
const size_t size_per_elem = (buffer_size + 31) & ~31;
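The width/rowstride switch from uint32_t to size_t above leaves the padding math unchanged; a hedged model of what it computes (assuming the sub-AVX branch rounds up to a multiple of 2 words, as in the unchanged else branch):

#include <stddef.h>

static size_t rowstride_model(size_t width) {
  /* Mirrors calculate_rowstride: > 2 words -> 32-byte multiples, else 16-byte. */
  return (width > 2) ? ((width + 3) & ~(size_t)3) : ((width + 1) & ~(size_t)1);
}
/* rowstride_model(2) == 2 (128-bit state), rowstride_model(3) == 4 (192-bit state),
   rowstride_model(4) == 4 (256-bit state); buffer_size is then rounded up to a
   32-byte multiple via (buffer_size + 31) & ~31. */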
@ -193,25 +191,11 @@ void mzd_xor_s128_640(mzd_local_t* res, mzd_local_t const* first, mzd_local_t co
mm128_xor(CONST_BLOCK(first, 2)->w128[0], CONST_BLOCK(second, 2)->w128[0]);
}
ATTR_TARGET_S128
void mzd_xor_s128_896(mzd_local_t* res, mzd_local_t const* first, mzd_local_t const* second) {
mzd_xor_s128_blocks(BLOCK(res, 0), CONST_BLOCK(first, 0), CONST_BLOCK(second, 0), 3);
BLOCK(res, 3)->w128[0] =
mm128_xor(CONST_BLOCK(first, 3)->w128[0], CONST_BLOCK(second, 3)->w128[0]);
}
ATTR_TARGET_S128
void mzd_xor_s128_1024(mzd_local_t* res, mzd_local_t const* first, mzd_local_t const* second) {
mzd_xor_s128_blocks(BLOCK(res, 0), CONST_BLOCK(first, 0), CONST_BLOCK(second, 0), 4);
}
ATTR_TARGET_S128
void mzd_xor_s128_1152(mzd_local_t* res, mzd_local_t const* first, mzd_local_t const* second) {
mzd_xor_s128_blocks(BLOCK(res, 0), CONST_BLOCK(first, 0), CONST_BLOCK(second, 0), 4);
BLOCK(res, 4)->w128[0] =
mm128_xor(CONST_BLOCK(first, 4)->w128[0], CONST_BLOCK(second, 4)->w128[0]);
}
ATTR_TARGET_S128
void mzd_xor_s128_1280(mzd_local_t* res, mzd_local_t const* first, mzd_local_t const* second) {
mzd_xor_s128_blocks(BLOCK(res, 0), CONST_BLOCK(first, 0), CONST_BLOCK(second, 0), 5);
@ -282,31 +266,16 @@ void mzd_xor_uint64_256(mzd_local_t* res, mzd_local_t const* first, mzd_local_t
mzd_xor_uint64_block(BLOCK(res, 0), CONST_BLOCK(first, 0), CONST_BLOCK(second, 0), 4);
}
void mzd_xor_uint64_576(mzd_local_t* res, mzd_local_t const* first, mzd_local_t const* second) {
mzd_xor_uint64_blocks(BLOCK(res, 0), CONST_BLOCK(first, 0), CONST_BLOCK(second, 0), 2);
mzd_xor_uint64_block(BLOCK(res, 2), CONST_BLOCK(first, 2), CONST_BLOCK(second, 2), 1);
}
void mzd_xor_uint64_640(mzd_local_t* res, mzd_local_t const* first, mzd_local_t const* second) {
mzd_xor_uint64_blocks(BLOCK(res, 0), CONST_BLOCK(first, 0), CONST_BLOCK(second, 0), 2);
mzd_xor_uint64_block(BLOCK(res, 2), CONST_BLOCK(first, 2), CONST_BLOCK(second, 2), 2);
}
void mzd_xor_uint64_896(mzd_local_t* res, mzd_local_t const* first, mzd_local_t const* second) {
mzd_xor_uint64_blocks(BLOCK(res, 0), CONST_BLOCK(first, 0), CONST_BLOCK(second, 0), 3);
mzd_xor_uint64_block(BLOCK(res, 3), CONST_BLOCK(first, 3), CONST_BLOCK(second, 3), 2);
}
void mzd_xor_uint64_960(mzd_local_t* res, mzd_local_t const* first, mzd_local_t const* second) {
mzd_xor_uint64_blocks(BLOCK(res, 0), CONST_BLOCK(first, 0), CONST_BLOCK(second, 0), 3);
mzd_xor_uint64_block(BLOCK(res, 3), CONST_BLOCK(first, 3), CONST_BLOCK(second, 3), 3);
}
void mzd_xor_uint64_1152(mzd_local_t* res, mzd_local_t const* first, mzd_local_t const* second) {
mzd_xor_uint64_blocks(BLOCK(res, 0), CONST_BLOCK(first, 0), CONST_BLOCK(second, 0), 4);
mzd_xor_uint64_block(BLOCK(res, 4), CONST_BLOCK(first, 4), CONST_BLOCK(second, 4), 2);
}
void mzd_xor_uint64_1216(mzd_local_t* res, mzd_local_t const* first, mzd_local_t const* second) {
mzd_xor_uint64_blocks(BLOCK(res, 0), CONST_BLOCK(first, 0), CONST_BLOCK(second, 0), 4);
mzd_xor_uint64_block(BLOCK(res, 4), CONST_BLOCK(first, 4), CONST_BLOCK(second, 4), 3);
@ -368,63 +337,6 @@ void mzd_mul_v_parity_uint64_256_30(mzd_local_t* c, mzd_local_t const* v, mzd_lo
cblock->w64[3] = res;
}
void mzd_mul_v_parity_uint64_128_3(mzd_local_t* c, mzd_local_t const* v, mzd_local_t const* At) {
block_t* cblock = BLOCK(c, 0);
const block_t* vblock = CONST_BLOCK(v, 0);
cblock->w64[0] = 0;
const block_t* Ablock1 = CONST_BLOCK(At, 0);
const block_t* Ablock2 = CONST_BLOCK(At, 1);
const word parity1 =
parity64_uint64((vblock->w64[0] & Ablock1->w64[0]) ^ (vblock->w64[1] & Ablock1->w64[1]));
const word parity2 =
parity64_uint64((vblock->w64[0] & Ablock1->w64[2]) ^ (vblock->w64[1] & Ablock1->w64[3]));
const word parity3 =
parity64_uint64((vblock->w64[0] & Ablock2->w64[0]) ^ (vblock->w64[1] & Ablock2->w64[1]));
cblock->w64[1] = (parity1 | (parity2 << 1) | (parity3 << 2)) << 61;
}
void mzd_mul_v_parity_uint64_192_3(mzd_local_t* c, mzd_local_t const* v, mzd_local_t const* At) {
block_t* cblock = BLOCK(c, 0);
const block_t* vblock = CONST_BLOCK(v, 0);
for (unsigned int j = 0; j < 3; j++) {
cblock->w64[j] = 0;
}
word res = 0;
for (unsigned int i = 3; i; --i) {
const block_t* Ablock = CONST_BLOCK(At, 3 - i);
const word parity =
parity64_uint64((vblock->w64[0] & Ablock->w64[0]) ^ (vblock->w64[1] & Ablock->w64[1]) ^
(vblock->w64[2] & Ablock->w64[2]));
res |= parity << (64 - i);
}
cblock->w64[2] = res;
}
void mzd_mul_v_parity_uint64_256_3(mzd_local_t* c, mzd_local_t const* v, mzd_local_t const* At) {
block_t* cblock = BLOCK(c, 0);
const block_t* vblock = CONST_BLOCK(v, 0);
for (unsigned int j = 0; j < 3; j++) {
cblock->w64[j] = 0;
}
word res = 0;
for (unsigned int i = 3; i; --i) {
const block_t* Ablock = CONST_BLOCK(At, 3 - i);
const word parity =
parity64_uint64((vblock->w64[0] & Ablock->w64[0]) ^ (vblock->w64[1] & Ablock->w64[1]) ^
(vblock->w64[2] & Ablock->w64[2]) ^ (vblock->w64[3] & Ablock->w64[3]));
res |= parity << (64 - i);
}
cblock->w64[3] = res;
}
#if defined(WITH_OPT)
#if defined(WITH_SSE2) || defined(WITH_NEON)
@ -573,37 +485,6 @@ void mzd_mul_v_s128_128_640(mzd_local_t* c, mzd_local_t const* v, mzd_local_t co
cblock3->w128[0] = cval[4];
}
ATTR_TARGET_S128
void mzd_mul_v_s128_192_896(mzd_local_t* c, mzd_local_t const* v, mzd_local_t const* A) {
const word* vptr = CONST_BLOCK(v, 0)->w64;
const block_t* Ablock = CONST_BLOCK(A, 0);
word128 cval[7] ATTR_ALIGNED(alignof(word128)) = {mm128_zero, mm128_zero, mm128_zero, mm128_zero,
mm128_zero, mm128_zero, mm128_zero};
for (unsigned int w = 3; w; --w, ++vptr) {
word idx = *vptr;
for (unsigned int i = sizeof(word) * 8; i; i -= 1, idx >>= 1, Ablock += 4) {
const word128 mask = mm128_compute_mask(idx, 0);
mm128_xor_mask_region(&cval[0], Ablock[0].w128, mask, 2);
mm128_xor_mask_region(&cval[2], Ablock[1].w128, mask, 2);
mm128_xor_mask_region(&cval[4], Ablock[2].w128, mask, 2);
cval[6] = mm128_xor_mask(cval[6], Ablock[3].w128[0], mask);
}
}
block_t* cblock1 = BLOCK(c, 0);
block_t* cblock2 = BLOCK(c, 1);
block_t* cblock3 = BLOCK(c, 2);
block_t* cblock4 = BLOCK(c, 3);
cblock1->w128[0] = cval[0];
cblock1->w128[1] = cval[1];
cblock2->w128[0] = cval[2];
cblock2->w128[1] = cval[3];
cblock3->w128[0] = cval[4];
cblock3->w128[1] = cval[5];
cblock4->w128[0] = cval[6];
}
ATTR_TARGET_S128
void mzd_mul_v_s128_192_1024(mzd_local_t* c, mzd_local_t const* v, mzd_local_t const* A) {
const word* vptr = CONST_BLOCK(v, 0)->w64;
@ -636,42 +517,6 @@ void mzd_mul_v_s128_192_1024(mzd_local_t* c, mzd_local_t const* v, mzd_local_t c
cblock4->w128[1] = cval[7];
}
ATTR_TARGET_S128
void mzd_mul_v_s128_256_1152(mzd_local_t* c, mzd_local_t const* v, mzd_local_t const* A) {
const word* vptr = CONST_BLOCK(v, 0)->w64;
const block_t* Ablock = CONST_BLOCK(A, 0);
word128 cval[9] ATTR_ALIGNED(alignof(word128)) = {mm128_zero, mm128_zero, mm128_zero,
mm128_zero, mm128_zero, mm128_zero,
mm128_zero, mm128_zero, mm128_zero};
for (unsigned int w = 4; w; --w, ++vptr) {
word idx = *vptr;
for (unsigned int i = sizeof(word) * 8; i; i -= 1, idx >>= 1, Ablock += 5) {
const word128 mask = mm128_compute_mask(idx, 0);
mm128_xor_mask_region(&cval[0], Ablock[0].w128, mask, 2);
mm128_xor_mask_region(&cval[2], Ablock[1].w128, mask, 2);
mm128_xor_mask_region(&cval[4], Ablock[2].w128, mask, 2);
mm128_xor_mask_region(&cval[6], Ablock[3].w128, mask, 2);
cval[8] = mm128_xor_mask(cval[8], Ablock[4].w128[0], mask);
}
}
block_t* cblock1 = BLOCK(c, 0);
block_t* cblock2 = BLOCK(c, 1);
block_t* cblock3 = BLOCK(c, 2);
block_t* cblock4 = BLOCK(c, 3);
block_t* cblock5 = BLOCK(c, 4);
cblock1->w128[0] = cval[0];
cblock1->w128[1] = cval[1];
cblock2->w128[0] = cval[2];
cblock2->w128[1] = cval[3];
cblock3->w128[0] = cval[4];
cblock3->w128[1] = cval[5];
cblock4->w128[0] = cval[6];
cblock4->w128[1] = cval[7];
cblock5->w128[0] = cval[8];
}
ATTR_TARGET_S128
void mzd_mul_v_s128_256_1280(mzd_local_t* c, mzd_local_t const* v, mzd_local_t const* A) {
const word* vptr = CONST_BLOCK(v, 0)->w64;
@ -729,7 +574,7 @@ void mzd_addmul_v_s256_128(mzd_local_t* c, mzd_local_t const* v, mzd_local_t con
const word* vptr = CONST_BLOCK(v, 0)->w64;
const block_t* Ablock = CONST_BLOCK(A, 0);
word256 cval[2] ATTR_ALIGNED(alignof(word256)) = {_mm256_castsi128_si256(cblock->w128[0]),
word256 cval[2] ATTR_ALIGNED(alignof(word256)) = {_mm256_setr_m128i(cblock->w128[0], mm128_zero),
mm256_zero};
for (unsigned int w = 2; w; --w, ++vptr) {
word idx = *vptr;
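This one-line change appears to be the 128-bit word-loading fix referenced in the commit message: _mm256_castsi128_si256 leaves the upper 128 bits of the destination undefined, so the accumulator could start with garbage in its high lane, while _mm256_setr_m128i pins it to zero. A hedged sketch of the distinction:

#include <immintrin.h>

/* Illustrative contrast only; not code from the diff. */
static __m256i load_low_cast(__m128i lo) {
  return _mm256_castsi128_si256(lo); /* bits 255..128 undefined per Intel docs */
}
static __m256i load_low_zeroed(__m128i lo) {
  return _mm256_setr_m128i(lo, _mm_setzero_si128()); /* bits 255..128 forced to 0 */
}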
@ -987,25 +832,6 @@ void mzd_mul_v_uint64_256(mzd_local_t* c, mzd_local_t const* v, mzd_local_t cons
mzd_addmul_v_uint64_256(c, v, A);
}
void mzd_mul_v_uint64_128_576(mzd_local_t* c, mzd_local_t const* v, mzd_local_t const* A) {
const word* vptr = CONST_BLOCK(v, 0)->w64;
const block_t* Ablock = CONST_BLOCK(A, 0);
clear_uint64_blocks(BLOCK(c, 0), 2);
clear_uint64_block(BLOCK(c, 2), 1);
for (unsigned int w = 2; w; --w, ++vptr) {
word idx = *vptr;
for (unsigned int i = sizeof(word) * 8; i; --i, idx >>= 1, ++Ablock) {
const uint64_t mask = -(idx & 1);
for (unsigned int j = 0; j < 2; ++j, ++Ablock) {
mzd_xor_mask_uint64_block(BLOCK(c, j), Ablock, mask, 4);
}
mzd_xor_mask_uint64_block(BLOCK(c, 2), Ablock, mask, 1);
}
}
}
void mzd_mul_v_uint64_128_640(mzd_local_t* c, mzd_local_t const* v, mzd_local_t const* A) {
const word* vptr = CONST_BLOCK(v, 0)->w64;
const block_t* Ablock = CONST_BLOCK(A, 0);
@ -1025,25 +851,6 @@ void mzd_mul_v_uint64_128_640(mzd_local_t* c, mzd_local_t const* v, mzd_local_t
}
}
void mzd_mul_v_uint64_192_896(mzd_local_t* c, mzd_local_t const* v, mzd_local_t const* A) {
const word* vptr = CONST_BLOCK(v, 0)->w64;
const block_t* Ablock = CONST_BLOCK(A, 0);
clear_uint64_blocks(BLOCK(c, 0), 3);
clear_uint64_block(BLOCK(c, 3), 2);
for (unsigned int w = 3; w; --w, ++vptr) {
word idx = *vptr;
for (unsigned int i = sizeof(word) * 8; i; --i, idx >>= 1, ++Ablock) {
const uint64_t mask = -(idx & 1);
for (unsigned int j = 0; j < 3; ++j, ++Ablock) {
mzd_xor_mask_uint64_block(BLOCK(c, j), Ablock, mask, 4);
}
mzd_xor_mask_uint64_block(BLOCK(c, 3), Ablock, mask, 2);
}
}
}
void mzd_mul_v_uint64_192_960(mzd_local_t* c, mzd_local_t const* v, mzd_local_t const* A) {
const word* vptr = CONST_BLOCK(v, 0)->w64;
const block_t* Ablock = CONST_BLOCK(A, 0);
@ -1063,25 +870,6 @@ void mzd_mul_v_uint64_192_960(mzd_local_t* c, mzd_local_t const* v, mzd_local_t
}
}
void mzd_mul_v_uint64_256_1152(mzd_local_t* c, mzd_local_t const* v, mzd_local_t const* A) {
const word* vptr = CONST_BLOCK(v, 0)->w64;
const block_t* Ablock = CONST_BLOCK(A, 0);
clear_uint64_blocks(BLOCK(c, 0), 4);
clear_uint64_block(BLOCK(c, 4), 2);
for (unsigned int w = 4; w; --w, ++vptr) {
word idx = *vptr;
for (unsigned int i = sizeof(word) * 8; i; --i, idx >>= 1, ++Ablock) {
const uint64_t mask = -(idx & 1);
for (unsigned int j = 0; j < 4; ++j, ++Ablock) {
mzd_xor_mask_uint64_block(BLOCK(c, j), Ablock, mask, 4);
}
mzd_xor_mask_uint64_block(BLOCK(c, 4), Ablock, mask, 2);
}
}
}
void mzd_mul_v_uint64_256_1216(mzd_local_t* c, mzd_local_t const* v, mzd_local_t const* A) {
const word* vptr = CONST_BLOCK(v, 0)->w64;
const block_t* Ablock = CONST_BLOCK(A, 0);
@ -1130,24 +918,6 @@ void mzd_shuffle_256_30(mzd_local_t* x, const word mask) {
mzd_shuffle_30_idx(x, mask, 3);
}
static inline void mzd_shuffle_3_idx(mzd_local_t* x, const word mask, unsigned int idx) {
const word w = CONST_BLOCK(x, 0)->w64[idx];
const word a = extract_bits(w, mask) << 61;
BLOCK(x, 0)->w64[idx] = a | extract_bits(w, ~mask);
}
void mzd_shuffle_128_3(mzd_local_t* x, const word mask) {
mzd_shuffle_3_idx(x, mask, 1);
}
void mzd_shuffle_192_3(mzd_local_t* x, const word mask) {
mzd_shuffle_3_idx(x, mask, 2);
}
void mzd_shuffle_256_3(mzd_local_t* x, const word mask) {
mzd_shuffle_3_idx(x, mask, 3);
}
// no SIMD
void mzd_addmul_v_uint64_30_128(mzd_local_t* c, mzd_local_t const* v, mzd_local_t const* A) {
block_t* cblock = BLOCK(c, 0);
@ -1185,44 +955,6 @@ void mzd_addmul_v_uint64_30_256(mzd_local_t* c, mzd_local_t const* v, mzd_local_
}
}
void mzd_addmul_v_uint64_3_128(mzd_local_t* c, mzd_local_t const* v, mzd_local_t const* A) {
block_t* cblock = BLOCK(c, 0);
const block_t* Ablock1 = CONST_BLOCK(A, 0);
const block_t* Ablock2 = CONST_BLOCK(A, 1);
const word idx = CONST_BLOCK(v, 0)->w64[1] >> 61;
const uint64_t mask1 = -(idx & 1);
const uint64_t mask2 = -((idx >> 1) & 1);
const uint64_t mask3 = -((idx >> 2) & 1);
for (unsigned int j = 0; j < 2; ++j) {
cblock->w64[j] ^=
(Ablock1->w64[j] & mask1) ^ (Ablock1->w64[j + 2] & mask2) ^ (Ablock2->w64[j] & mask3);
}
}
void mzd_addmul_v_uint64_3_192(mzd_local_t* c, mzd_local_t const* v, mzd_local_t const* A) {
block_t* cblock = BLOCK(c, 0);
const block_t* Ablock = CONST_BLOCK(A, 0);
word idx = CONST_BLOCK(v, 0)->w64[2] >> 61;
for (unsigned int i = 3; i; --i, idx >>= 1, ++Ablock) {
const uint64_t mask = -(idx & 1);
mzd_xor_mask_uint64_block(cblock, Ablock, mask, 3);
}
}
void mzd_addmul_v_uint64_3_256(mzd_local_t* c, mzd_local_t const* v, mzd_local_t const* A) {
block_t* cblock = BLOCK(c, 0);
const block_t* Ablock = CONST_BLOCK(A, 0);
word idx = CONST_BLOCK(v, 0)->w64[3] >> 61;
for (unsigned int i = 3; i; --i, idx >>= 1, ++Ablock) {
const uint64_t mask = -(idx & 1);
mzd_xor_mask_uint64_block(cblock, Ablock, mask, 4);
}
}
#if defined(WITH_SSE2) || defined(WITH_NEON)
ATTR_TARGET_S128
void mzd_addmul_v_s128_30_128(mzd_local_t* c, mzd_local_t const* v, mzd_local_t const* A) {
@ -1270,43 +1002,6 @@ void mzd_addmul_v_s128_30_256(mzd_local_t* c, mzd_local_t const* v, mzd_local_t
mzd_addmul_v_s128_30_256_idx(c, A, CONST_BLOCK(v, 0)->w64[3] >> 34);
}
ATTR_TARGET_S128
void mzd_addmul_v_s128_3_128(mzd_local_t* c, mzd_local_t const* v, mzd_local_t const* A) {
block_t* cblock = BLOCK(c, 0);
const block_t* Ablock = CONST_BLOCK(A, 0);
const word idx = CONST_BLOCK(v, 0)->w64[1] >> 61;
word128 cval[2] ATTR_ALIGNED(alignof(word128));
cval[0] = mm128_xor_mask(cblock->w128[0], Ablock[0].w128[0], mm128_compute_mask(idx, 0));
cval[1] = mm128_and(Ablock[0].w128[1], mm128_compute_mask(idx, 1));
cval[0] = mm128_xor_mask(cval[0], Ablock[1].w128[0], mm128_compute_mask(idx, 2));
cblock->w128[0] = mm128_xor(cval[0], cval[1]);
}
ATTR_TARGET_S128
static void mzd_addmul_v_s128_3_256_idx(mzd_local_t* c, mzd_local_t const* A, const word idx) {
block_t* cblock = BLOCK(c, 0);
const block_t* Ablock = CONST_BLOCK(A, 0);
word128 cval[4] ATTR_ALIGNED(alignof(word128)) = {cblock->w128[0], cblock->w128[1], mm128_zero,
mm128_zero};
mm128_xor_mask_region(&cval[0], Ablock[0].w128, mm128_compute_mask(idx, 0), 2);
mm128_xor_mask_region(&cval[2], Ablock[1].w128, mm128_compute_mask(idx, 1), 2);
mm128_xor_mask_region(&cval[0], Ablock[2].w128, mm128_compute_mask(idx, 2), 2);
cblock->w128[0] = mm128_xor(cval[0], cval[2]);
cblock->w128[1] = mm128_xor(cval[1], cval[3]);
}
ATTR_TARGET_S128
void mzd_addmul_v_s128_3_192(mzd_local_t* c, mzd_local_t const* v, mzd_local_t const* A) {
mzd_addmul_v_s128_3_256_idx(c, A, CONST_BLOCK(v, 0)->w64[2] >> 61);
}
ATTR_TARGET_S128
void mzd_addmul_v_s128_3_256(mzd_local_t* c, mzd_local_t const* v, mzd_local_t const* A) {
mzd_addmul_v_s128_3_256_idx(c, A, CONST_BLOCK(v, 0)->w64[3] >> 61);
}
#endif
#if defined(WITH_AVX2)
@ -1318,7 +1013,7 @@ void mzd_addmul_v_s256_30_128(mzd_local_t* c, mzd_local_t const* v, mzd_local_t
word idx = vblock->w64[1] >> 34;
word256 cval[2] ATTR_ALIGNED(alignof(word256));
cval[0] = mm256_xor_mask(_mm256_castsi128_si256(cblock->w128[0]), Ablock[0].w256,
cval[0] = mm256_xor_mask(_mm256_setr_m128i(cblock->w128[0], mm128_zero), Ablock[0].w256,
mm256_compute_mask_2(idx, 0));
cval[1] = mm256_and(Ablock[1].w256, mm256_compute_mask_2(idx, 2));
idx >>= 4;
@ -1367,41 +1062,6 @@ void mzd_addmul_v_s256_30_256(mzd_local_t* c, mzd_local_t const* v, mzd_local_t
mzd_addmul_v_s256_30_256_idx(c, A, CONST_BLOCK(v, 0)->w64[3] >> 34);
}
ATTR_TARGET_AVX2
void mzd_addmul_v_s256_3_128(mzd_local_t* c, mzd_local_t const* v, mzd_local_t const* A) {
block_t* cblock = BLOCK(c, 0);
const block_t* Ablock = CONST_BLOCK(A, 0);
const word idx = CONST_BLOCK(v, 0)->w64[1] >> 61;
word128 cval[2] ATTR_ALIGNED(alignof(word128));
cval[0] = mm128_xor_mask(cblock->w128[0], Ablock[0].w128[0], mm128_compute_mask(idx, 0));
cval[1] = mm128_and(Ablock[0].w128[1], mm128_compute_mask(idx, 1));
cval[0] = mm128_xor_mask(cval[0], Ablock[1].w128[0], mm128_compute_mask(idx, 2));
cblock->w128[0] = mm128_xor(cval[0], cval[1]);
}
ATTR_TARGET_AVX2
static inline void mzd_addmul_v_s256_3_256_idx(mzd_local_t* c, mzd_local_t const* A, const word idx) {
block_t* cblock = BLOCK(c, 0);
const block_t* Ablock = CONST_BLOCK(A, 0);
word256 cval[2] ATTR_ALIGNED(alignof(word256));
cval[0] = mm256_xor_mask(cblock->w256, Ablock[0].w256, mm256_compute_mask(idx, 0));
cval[1] = mm256_and(Ablock[1].w256, mm256_compute_mask(idx, 1));
cval[0] = mm256_xor_mask(cval[0], Ablock[2].w256, mm256_compute_mask(idx, 2));
cblock->w256 = mm256_xor(cval[0], cval[1]);
}
ATTR_TARGET_AVX2
void mzd_addmul_v_s256_3_192(mzd_local_t* c, mzd_local_t const* v, mzd_local_t const* A) {
mzd_addmul_v_s256_3_256_idx(c, A, CONST_BLOCK(v, 0)->w64[2] >> 61);
}
ATTR_TARGET_AVX2
void mzd_addmul_v_s256_3_256(mzd_local_t* c, mzd_local_t const* v, mzd_local_t const* A) {
mzd_addmul_v_s256_3_256_idx(c, A, CONST_BLOCK(v, 0)->w64[3] >> 61);
}
#if !defined(__x86_64__) && !defined(_M_X64)
ATTR_TARGET_AVX2 ATTR_CONST static uint8_t popcount_32(uint32_t value) {
uint64_t result =
@ -1441,27 +1101,5 @@ ATTR_TARGET_AVX2
void mzd_shuffle_pext_256_30(mzd_local_t* x, const word mask) {
mzd_shuffle_pext_30_idx(x, mask, 3);
}
ATTR_TARGET_AVX2
static inline void mzd_shuffle_pext_3_idx(mzd_local_t* x, const word mask, unsigned int idx) {
const word w = CONST_BLOCK(x, 0)->w64[idx];
const word a = _pext_u64(w, mask) << 61;
BLOCK(x, 0)->w64[idx] = a | _pext_u64(w, ~mask);
}
ATTR_TARGET_AVX2
void mzd_shuffle_pext_128_3(mzd_local_t* x, const word mask) {
mzd_shuffle_pext_3_idx(x, mask, 1);
}
ATTR_TARGET_AVX2
void mzd_shuffle_pext_192_3(mzd_local_t* x, const word mask) {
mzd_shuffle_pext_3_idx(x, mask, 2);
}
ATTR_TARGET_AVX2
void mzd_shuffle_pext_256_3(mzd_local_t* x, const word mask) {
mzd_shuffle_pext_3_idx(x, mask, 3);
}
#endif
#endif

View File

@ -77,16 +77,10 @@ void mzd_xor_uint64_192(mzd_local_t* res, mzd_local_t const* first,
mzd_local_t const* second) ATTR_NONNULL;
void mzd_xor_uint64_256(mzd_local_t* res, mzd_local_t const* first,
mzd_local_t const* second) ATTR_NONNULL;
void mzd_xor_uint64_576(mzd_local_t* res, mzd_local_t const* first,
mzd_local_t const* second) ATTR_NONNULL;
void mzd_xor_uint64_640(mzd_local_t* res, mzd_local_t const* first,
mzd_local_t const* second) ATTR_NONNULL;
void mzd_xor_uint64_896(mzd_local_t* res, mzd_local_t const* first,
mzd_local_t const* second) ATTR_NONNULL;
void mzd_xor_uint64_960(mzd_local_t* res, mzd_local_t const* first,
mzd_local_t const* second) ATTR_NONNULL;
void mzd_xor_uint64_1152(mzd_local_t* res, mzd_local_t const* first,
mzd_local_t const* second) ATTR_NONNULL;
void mzd_xor_uint64_1216(mzd_local_t* res, mzd_local_t const* first,
mzd_local_t const* second) ATTR_NONNULL;
void mzd_xor_s128_128(mzd_local_t* res, mzd_local_t const* first,
@ -95,12 +89,8 @@ void mzd_xor_s128_256(mzd_local_t* res, mzd_local_t const* first,
mzd_local_t const* second) ATTR_NONNULL;
void mzd_xor_s128_640(mzd_local_t* res, mzd_local_t const* first,
mzd_local_t const* second) ATTR_NONNULL;
void mzd_xor_s128_896(mzd_local_t* res, mzd_local_t const* first,
mzd_local_t const* second) ATTR_NONNULL;
void mzd_xor_s128_1024(mzd_local_t* res, mzd_local_t const* first,
mzd_local_t const* second) ATTR_NONNULL;
void mzd_xor_s128_1152(mzd_local_t* res, mzd_local_t const* first,
mzd_local_t const* second) ATTR_NONNULL;
void mzd_xor_s128_1280(mzd_local_t* res, mzd_local_t const* first,
mzd_local_t const* second) ATTR_NONNULL;
void mzd_xor_s256_128(mzd_local_t* res, mzd_local_t const* first,
@ -120,16 +110,10 @@ void mzd_xor_s256_1280(mzd_local_t* res, mzd_local_t const* first,
void mzd_mul_v_uint64_128(mzd_local_t* c, mzd_local_t const* v, mzd_local_t const* At) ATTR_NONNULL;
void mzd_mul_v_uint64_192(mzd_local_t* c, mzd_local_t const* v, mzd_local_t const* At) ATTR_NONNULL;
void mzd_mul_v_uint64_256(mzd_local_t* c, mzd_local_t const* v, mzd_local_t const* At) ATTR_NONNULL;
void mzd_mul_v_uint64_128_576(mzd_local_t* c, mzd_local_t const* v,
mzd_local_t const* At) ATTR_NONNULL;
void mzd_mul_v_uint64_128_640(mzd_local_t* c, mzd_local_t const* v,
mzd_local_t const* At) ATTR_NONNULL;
void mzd_mul_v_uint64_192_896(mzd_local_t* c, mzd_local_t const* v,
mzd_local_t const* At) ATTR_NONNULL;
void mzd_mul_v_uint64_192_960(mzd_local_t* c, mzd_local_t const* v,
mzd_local_t const* At) ATTR_NONNULL;
void mzd_mul_v_uint64_256_1152(mzd_local_t* c, mzd_local_t const* v,
mzd_local_t const* At) ATTR_NONNULL;
void mzd_mul_v_uint64_256_1216(mzd_local_t* c, mzd_local_t const* v,
mzd_local_t const* At) ATTR_NONNULL;
void mzd_mul_v_s128_128(mzd_local_t* c, mzd_local_t const* v, mzd_local_t const* A) ATTR_NONNULL;
@ -137,12 +121,8 @@ void mzd_mul_v_s128_192(mzd_local_t* c, mzd_local_t const* v, mzd_local_t const*
void mzd_mul_v_s128_256(mzd_local_t* c, mzd_local_t const* v, mzd_local_t const* A) ATTR_NONNULL;
void mzd_mul_v_s128_128_640(mzd_local_t* c, mzd_local_t const* v,
mzd_local_t const* A) ATTR_NONNULL;
void mzd_mul_v_s128_192_896(mzd_local_t* c, mzd_local_t const* v,
mzd_local_t const* A) ATTR_NONNULL;
void mzd_mul_v_s128_192_1024(mzd_local_t* c, mzd_local_t const* v,
mzd_local_t const* A) ATTR_NONNULL;
void mzd_mul_v_s128_256_1152(mzd_local_t* c, mzd_local_t const* v,
mzd_local_t const* A) ATTR_NONNULL;
void mzd_mul_v_s128_256_1280(mzd_local_t* c, mzd_local_t const* v,
mzd_local_t const* A) ATTR_NONNULL;
void mzd_mul_v_s256_128(mzd_local_t* c, mzd_local_t const* v, mzd_local_t const* A) ATTR_NONNULL;
@ -165,12 +145,6 @@ void mzd_addmul_v_uint64_30_192(mzd_local_t* c, mzd_local_t const* v,
mzd_local_t const* A) ATTR_NONNULL;
void mzd_addmul_v_uint64_30_256(mzd_local_t* c, mzd_local_t const* v,
mzd_local_t const* A) ATTR_NONNULL;
void mzd_addmul_v_uint64_3_128(mzd_local_t* c, mzd_local_t const* v,
mzd_local_t const* A) ATTR_NONNULL;
void mzd_addmul_v_uint64_3_192(mzd_local_t* c, mzd_local_t const* v,
mzd_local_t const* A) ATTR_NONNULL;
void mzd_addmul_v_uint64_3_256(mzd_local_t* c, mzd_local_t const* v,
mzd_local_t const* A) ATTR_NONNULL;
/**
* Use SSE2 or NEON
@ -181,12 +155,6 @@ void mzd_addmul_v_s128_30_192(mzd_local_t* c, mzd_local_t const* v,
mzd_local_t const* A) ATTR_NONNULL;
void mzd_addmul_v_s128_30_256(mzd_local_t* c, mzd_local_t const* v,
mzd_local_t const* A) ATTR_NONNULL;
void mzd_addmul_v_s128_3_128(mzd_local_t* c, mzd_local_t const* v,
mzd_local_t const* A) ATTR_NONNULL;
void mzd_addmul_v_s128_3_192(mzd_local_t* c, mzd_local_t const* v,
mzd_local_t const* A) ATTR_NONNULL;
void mzd_addmul_v_s128_3_256(mzd_local_t* c, mzd_local_t const* v,
mzd_local_t const* A) ATTR_NONNULL;
/**
* Use AVX2
@ -197,12 +165,6 @@ void mzd_addmul_v_s256_30_192(mzd_local_t* c, mzd_local_t const* v,
mzd_local_t const* A) ATTR_NONNULL;
void mzd_addmul_v_s256_30_256(mzd_local_t* c, mzd_local_t const* v,
mzd_local_t const* A) ATTR_NONNULL;
void mzd_addmul_v_s256_3_128(mzd_local_t* c, mzd_local_t const* v,
mzd_local_t const* A) ATTR_NONNULL;
void mzd_addmul_v_s256_3_192(mzd_local_t* c, mzd_local_t const* v,
mzd_local_t const* A) ATTR_NONNULL;
void mzd_addmul_v_s256_3_256(mzd_local_t* c, mzd_local_t const* v,
mzd_local_t const* A) ATTR_NONNULL;
/**
* Compute using parity-based algorithm
@ -213,12 +175,6 @@ void mzd_mul_v_parity_uint64_192_30(mzd_local_t* c, mzd_local_t const* v,
mzd_local_t const* A) ATTR_NONNULL;
void mzd_mul_v_parity_uint64_256_30(mzd_local_t* c, mzd_local_t const* v,
mzd_local_t const* A) ATTR_NONNULL;
void mzd_mul_v_parity_uint64_128_3(mzd_local_t* c, mzd_local_t const* v,
mzd_local_t const* A) ATTR_NONNULL;
void mzd_mul_v_parity_uint64_192_3(mzd_local_t* c, mzd_local_t const* v,
mzd_local_t const* A) ATTR_NONNULL;
void mzd_mul_v_parity_uint64_256_3(mzd_local_t* c, mzd_local_t const* v,
mzd_local_t const* A) ATTR_NONNULL;
/**
* Compute c + v * A optimized for c and v being vectors.
@ -240,17 +196,11 @@ void mzd_addmul_v_s256_256(mzd_local_t* c, mzd_local_t const* v, mzd_local_t con
* Shuffle vector x according to the info in mask. Needed for OLLE optimizations.
*/
void mzd_shuffle_128_30(mzd_local_t* x, const word mask) ATTR_NONNULL;
void mzd_shuffle_128_3(mzd_local_t* x, const word mask) ATTR_NONNULL;
void mzd_shuffle_192_30(mzd_local_t* x, const word mask) ATTR_NONNULL;
void mzd_shuffle_192_3(mzd_local_t* x, const word mask) ATTR_NONNULL;
void mzd_shuffle_256_30(mzd_local_t* x, const word mask) ATTR_NONNULL;
void mzd_shuffle_256_3(mzd_local_t* x, const word mask) ATTR_NONNULL;
void mzd_shuffle_pext_128_30(mzd_local_t* x, const word mask) ATTR_NONNULL;
void mzd_shuffle_pext_128_3(mzd_local_t* x, const word mask) ATTR_NONNULL;
void mzd_shuffle_pext_192_30(mzd_local_t* x, const word mask) ATTR_NONNULL;
void mzd_shuffle_pext_192_3(mzd_local_t* x, const word mask) ATTR_NONNULL;
void mzd_shuffle_pext_256_30(mzd_local_t* x, const word mask) ATTR_NONNULL;
void mzd_shuffle_pext_256_3(mzd_local_t* x, const word mask) ATTR_NONNULL;
#define BLOCK(v, b) ((block_t*)ASSUME_ALIGNED(&(v)[(b)], 32))
#define CONST_BLOCK(v, b) ((const block_t*)ASSUME_ALIGNED(&(v)[(b)], 32))

View File

@ -261,18 +261,6 @@ int PICNIC_CALLING_CONVENTION picnic_verify(const picnic_publickey_t* pk, const
const char* PICNIC_CALLING_CONVENTION picnic_get_param_name(picnic_params_t parameters) {
switch (parameters) {
case Picnic_L1_1_FS:
return "Picnic_L1_1_FS";
case Picnic_L1_1_UR:
return "Picnic_L1_1_UR";
case Picnic_L3_1_FS:
return "Picnic_L3_1_FS";
case Picnic_L3_1_UR:
return "Picnic_L3_1_UR";
case Picnic_L5_1_FS:
return "Picnic_L5_1_FS";
case Picnic_L5_1_UR:
return "Picnic_L5_1_UR";
case Picnic_L1_FS:
return "Picnic_L1_FS";
case Picnic_L1_UR:

View File

@ -88,13 +88,6 @@ typedef enum {
Picnic2_L1_FS, // 7
Picnic2_L3_FS, // 8
Picnic2_L5_FS, // 9
/* Instances with LowMC m=1 */
Picnic_L1_1_FS, // 10
Picnic_L1_1_UR, // 11
Picnic_L3_1_FS, // 12
Picnic_L3_1_UR, // 13
Picnic_L5_1_FS, // 14
Picnic_L5_1_UR, // 15
PARAMETER_SET_MAX_INDEX
} picnic_params_t;

View File

@ -34,12 +34,6 @@
#define LOWMC_MAX_AND_GATES (3 * 38 * 10 + 4) /* Rounded to nearest byte */
#define MAX_AUX_BYTES ((LOWMC_MAX_AND_GATES + LOWMC_MAX_KEY_BITS) / 8 + 1)
#if defined(__WIN32__)
#define SIZET_FMT "%Iu"
#else
#define SIZET_FMT "%zu"
#endif
/* Helper functions */
ATTR_CONST
@ -133,11 +127,11 @@ void sbox_layer_10_uint64_aux(uint64_t* d, randomTape_t* tapes) {
aux_mpc_AND_bitsliced(x0s, x1s, x2m, &ab, &bc, &ca, tapes);
// (b & c) ^ a
const uint64_t t0 = (bc) ^ x0s;
const uint64_t t0 = bc ^ x0s;
// (c & a) ^ a ^ b
const uint64_t t1 = (ca) ^ x0s ^ x1s;
// (a & b) ^ a ^ b ^c
const uint64_t t2 = (ab) ^ x0s ^ x1s ^ x2m;
const uint64_t t1 = ca ^ x0s ^ x1s;
// (a & b) ^ a ^ b ^ c
const uint64_t t2 = ab ^ x0s ^ x1s ^ x2m;
*d = (in & MASK_MASK) ^ (t0 >> 2) ^ (t1 >> 1) ^ t2;
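  // Merge the three bitsliced S-box output bits back into their slots; the bits
  // untouched by the S-boxes are carried through unchanged via MASK_MASK.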
}
@ -150,9 +144,7 @@ void sbox_layer_10_uint64_aux(uint64_t* d, randomTape_t* tapes) {
static void computeAuxTape(randomTape_t* tapes, const picnic_instance_t* params) {
mzd_local_t* lowmc_key = mzd_local_init_ex(params->lowmc->n, 1, true);
uint8_t temp[32] = {
0,
};
uint8_t temp[32] = {0};
// combine into key shares and calculate lowmc evaluation in plain
for (size_t i = 0; i < params->num_MPC_parties; i++) {
@ -182,8 +174,7 @@ static void commit(uint8_t* digest, const uint8_t* seed, const uint8_t* aux, con
hash_init(&ctx, params->digest_size);
hash_update(&ctx, seed, params->seed_size);
if (aux != NULL) {
size_t tapeLenBytes = params->view_size;
hash_update(&ctx, aux, tapeLenBytes);
hash_update(&ctx, aux, params->view_size);
}
hash_update(&ctx, salt, SALT_SIZE);
hash_update_uint16_le(&ctx, t);
@ -202,7 +193,7 @@ static void commit_x4(uint8_t** digest, const uint8_t** seed, const uint8_t* sal
const uint8_t* salt_ptr[4] = {salt, salt, salt, salt};
hash_update_x4(&ctx, salt_ptr, SALT_SIZE);
hash_update_x4_uint16_le(&ctx, t);
const uint16_t j_arr[4] = {j + 0, j + 1, j + 2, j + 3};
const uint16_t j_arr[4] = {j + 0, j + 1, j + 2, j + 3};
hash_update_x4_uint16s_le(&ctx, j_arr);
hash_final_x4(&ctx);
hash_squeeze_x4(&ctx, digest, params->digest_size);
@ -315,7 +306,6 @@ static size_t bitsToChunks(size_t chunkLenBits, const uint8_t* input, size_t inp
chunks[i] += getBit(input, i * chunkLenBits + j) << j;
assert(chunks[i] < (1 << chunkLenBits));
}
chunks[i] = le16toh(chunks[i]);
}
return chunkCount;

View File

@ -93,7 +93,7 @@ static uint8_t mpc_AND(uint8_t a, uint8_t b, uint64_t mask_a, uint64_t mask_b, r
static void mpc_sbox(mzd_local_t* statein, shares_t* state_masks, randomTape_t* tapes, msgs_t* msgs,
uint8_t* unopenened_msg, const picnic_instance_t* params) {
uint8_t state[32];
uint8_t state[MAX_LOWMC_BLOCK_SIZE];
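  // MAX_LOWMC_BLOCK_SIZE replaces the magic 32 here -- presumably 32 bytes,
  // matching the largest (256-bit) LowMC state.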
mzd_to_char_array(state, statein, params->lowmc->n / 8);
for (size_t i = 0; i < params->lowmc->m * 3; i += 3) {
uint8_t a = getBit((uint8_t*)state, i + 2);
@ -173,6 +173,7 @@ static void mpc_xor_masks(shares_t* out, const shares_t* a, const shares_t* b) {
}
#endif
#if !defined(NO_UINT64_FALLBACK)
/* PICNIC2_L1_FS */
#define XOR mzd_xor_uint64_128
#define MPC_MUL mpc_matrix_mul_uint64_128
@ -247,6 +248,7 @@ static void mpc_xor_masks(shares_t* out, const shares_t* a, const shares_t* b) {
#undef LOWMC_R
#undef LOWMC_INSTANCE
#undef SIM_ONLINE
#endif
#if defined(WITH_OPT)
#if defined(WITH_SSE2) || defined(WITH_NEON)
@ -413,11 +415,7 @@ static void mpc_xor_masks(shares_t* out, const shares_t* a, const shares_t* b) {
#endif // WITH_OPT
lowmc_simulate_online_f lowmc_simulate_online_get_implementation(const lowmc_t* lowmc) {
#if defined(WITH_LOWMC_M1)
ASSUME(lowmc->m == 10 || lowmc->m == 1);
#else
ASSUME(lowmc->m == 10);
#endif
ASSUME(lowmc->n == 128 || lowmc->n == 192 || lowmc->n == 256);
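  // Only the ten-S-box (m == 10) instances survive the removal of the m = 1 parameter sets,
  // so the WITH_LOWMC_M1 branch of the assumption is gone.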
#if defined(WITH_OPT)
@ -441,6 +439,7 @@ lowmc_simulate_online_f lowmc_simulate_online_get_implementation(const lowmc_t*
}
}
#endif
#if defined(WITH_SSE2) || defined(WITH_NEON)
if (CPU_SUPPORTS_SSE2 || CPU_SUPPORTS_NEON) {
if (lowmc->m == 10) {
@ -462,6 +461,8 @@ lowmc_simulate_online_f lowmc_simulate_online_get_implementation(const lowmc_t*
}
#endif
#endif
#if !defined(NO_UINT64_FALLBACK)
if (lowmc->m == 10) {
switch (lowmc->n) {
#if defined(WITH_LOWMC_128_128_20)
@ -478,6 +479,7 @@ lowmc_simulate_online_f lowmc_simulate_online_get_implementation(const lowmc_t*
#endif
}
}
#endif
return NULL;
}

View File

@ -13,10 +13,10 @@
#if defined(FN_ATTR)
FN_ATTR
#endif
static int SIM_ONLINE(mzd_local_t* maskedKey, shares_t* mask_shares, randomTape_t* tapes, msgs_t* msgs,
const mzd_local_t* plaintext, const uint32_t* pubKey,
static int SIM_ONLINE(mzd_local_t* maskedKey, shares_t* mask_shares, randomTape_t* tapes,
msgs_t* msgs, const mzd_local_t* plaintext, const uint32_t* pubKey,
const picnic_instance_t* params) {
int ret = 0;
int ret = 0;
mzd_local_t state[((LOWMC_N) + 255) / 256];
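  // Ceiling division: enough 256-bit blocks to hold an LOWMC_N-bit state.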
shares_t* key_masks = allocateShares(LOWMC_N); // Make a copy to use when computing each round key
shares_t* mask2_shares = allocateShares(LOWMC_N);
@ -34,7 +34,7 @@ static int SIM_ONLINE(mzd_local_t* maskedKey, shares_t* mask_shares, randomTape_
shares_t* nl_part_masks = allocateShares(LOWMC_R * 32);
MPC_MUL(state, maskedKey, LOWMC_INSTANCE.k0_matrix,
mask_shares); // roundKey = maskedKey * KMatrix[0]
mask_shares); // roundKey = maskedKey * KMatrix[0]
XOR(state, state, plaintext);
XOR(state, state, LOWMC_INSTANCE.precomputed_constant_linear);
@ -44,7 +44,7 @@ static int SIM_ONLINE(mzd_local_t* maskedKey, shares_t* mask_shares, randomTape_
mzd_local_t state2[((LOWMC_N) + 255) / 256];
for (uint32_t r = 0; r < LOWMC_R - 1; r++) {
mpc_sbox(state, mask_shares, tapes, msgs, unopened_msgs, params);
mpc_xor_masks_nl(mask_shares, mask_shares, nl_part_masks, r*32 + 2, 30);
mpc_xor_masks_nl(mask_shares, mask_shares, nl_part_masks, r * 32 + 2, 30);
const word nl = CONST_BLOCK(nl_part, r >> 3)->w64[(r & 0x7) >> 1];
BLOCK(state, 0)->w64[(LOWMC_N) / (sizeof(word) * 8) - 1] ^=
(nl << (1 - (r & 1)) * 32) & WORD_C(0xFFFFFFFF00000000);
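    // nl_part packs eight 32-bit round outputs per 256-bit block: r >> 3 selects the
    // block, (r & 7) >> 1 the 64-bit word, and the shift picks the low half on even
    // rounds and the high half on odd ones, XORed into the top 32 bits of the last
    // state word.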
@ -61,16 +61,16 @@ static int SIM_ONLINE(mzd_local_t* maskedKey, shares_t* mask_shares, randomTape_
mpc_xor_masks(mask_shares, mask_shares, mask2_shares);
}
mpc_sbox(state, mask_shares, tapes, msgs, unopened_msgs, params);
mpc_xor_masks_nl(mask_shares, mask_shares, nl_part_masks, (LOWMC_R-1)*32 + 2, 30);
const word nl = CONST_BLOCK(nl_part, (LOWMC_R-1) >> 3)->w64[((LOWMC_R-1) & 0x7) >> 1];
mpc_xor_masks_nl(mask_shares, mask_shares, nl_part_masks, (LOWMC_R - 1) * 32 + 2, 30);
const word nl = CONST_BLOCK(nl_part, (LOWMC_R - 1) >> 3)->w64[((LOWMC_R - 1) & 0x7) >> 1];
BLOCK(state, 0)->w64[(LOWMC_N) / (sizeof(word) * 8) - 1] ^=
(nl << (1 - ((LOWMC_R-1) & 1)) * 32) & WORD_C(0xFFFFFFFF00000000);
(nl << (1 - ((LOWMC_R - 1) & 1)) * 32) & WORD_C(0xFFFFFFFF00000000);
MPC_MUL(state, state, LOWMC_INSTANCE.zr_matrix,
mask_shares); // state = state * LMatrix (r-1)
#else
for (uint32_t r = 0; r < LOWMC_R; r++) {
mpc_sbox(state, mask_shares, tapes, msgs, unopened_msgs, params);
mpc_xor_masks_nl(mask_shares, mask_shares, nl_part_masks, r*32 + 2, 30);
mpc_xor_masks_nl(mask_shares, mask_shares, nl_part_masks, r * 32 + 2, 30);
const word nl = CONST_BLOCK(nl_part, r >> 3)->w64[(r & 0x7) >> 1];
BLOCK(state, 0)->w64[(LOWMC_N) / (sizeof(word) * 8) - 1] ^=
(nl << (1 - (r & 1)) * 32) & WORD_C(0xFFFFFFFF00000000);
@ -82,7 +82,7 @@ static int SIM_ONLINE(mzd_local_t* maskedKey, shares_t* mask_shares, randomTape_
#else
mzd_local_t roundKey[((LOWMC_N) + 255) / 256];
MPC_MUL(roundKey, maskedKey, LOWMC_INSTANCE.k0_matrix,
mask_shares); // roundKey = maskedKey * KMatrix[0]
mask_shares); // roundKey = maskedKey * KMatrix[0]
XOR(state, roundKey, plaintext);
shares_t* round_key_masks = allocateShares(mask_shares->numWords);
@ -111,7 +111,7 @@ static int SIM_ONLINE(mzd_local_t* maskedKey, shares_t* mask_shares, randomTape_
}
uint32_t output[LOWMC_N / 32];
uint32_t outstate[LOWMC_N / 32];
mzd_to_char_array((uint8_t*)outstate, state, LOWMC_N/8);
mzd_to_char_array((uint8_t*)outstate, state, LOWMC_N / 8);
reconstructShares(output, mask_shares);
xor_word_array(output, output, outstate, (LOWMC_N / 32));

View File

@ -271,7 +271,7 @@ static void transpose_64_64_uint64(const uint64_t* in, uint64_t* out) {
// copy in to out and transpose in-place
for (uint32_t i = 0; i < 64; i++) {
out[i] = bswap64(in[i]);
out[i] = htobe64(in[i]);
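    // htobe64 (a byte swap on little-endian hosts, a no-op on big-endian ones)
    // replaces the unconditional bswap64, so the transpose sees the same bit
    // layout on every platform.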
}
for (uint32_t i = 0; i < logn; i++) {
@ -294,7 +294,7 @@ static void transpose_64_64_uint64(const uint64_t* in, uint64_t* out) {
width /= 2;
}
for (uint32_t i = 0; i < 64; i++) {
out[i] = bswap64(out[i]);
out[i] = be64toh(out[i]);
}
}
@ -329,7 +329,7 @@ static void transpose_64_64_s128(const uint64_t* in, uint64_t* out) {
const uint32_t logn = 6;
// copy in to out and transpose in-place
word128* out128 = (word128*)out;
word128* out128 = (word128*)out;
const word128* in128 = (const word128*)in;
memcpy_bswap64_64_s128(out128, in128);
@ -399,7 +399,7 @@ static void transpose_64_64_s256(const uint64_t* in, uint64_t* out) {
static const uint32_t logn = 6;
const word256* in256 = (const word256*)in;
word256* out256 = (word256*)out;
word256* out256 = (word256*)out;
// copy in to out and swap bytes
memcpy_bswap64_64_s256(out256, in256);
@ -424,8 +424,8 @@ static void transpose_64_64_s256(const uint64_t* in, uint64_t* out) {
width /= 2;
}
{
word128* out128 = (word128*)out;
const word128 mask = mm128_broadcast_u64(TRANSPOSE_MASKS64[4]);
word128* out128 = (word128*)out;
const word128 mask = mm128_broadcast_u64(TRANSPOSE_MASKS64[4]);
for (uint32_t j = 0; j < nswaps; j++) {
for (uint32_t k = 0; k < width; k += 2) {
@ -541,16 +541,16 @@ void copyShares(shares_t* dst, shares_t* src) {
memcpy(dst->shares, src->shares, dst->numWords * sizeof(dst->shares[0]));
}
void mpc_matrix_mul_uint64_128(mzd_local_t* output, const mzd_local_t* vec, const mzd_local_t* matrix,
shares_t* mask_shares) {
const uint32_t rowstride = (128) / 8;
void mpc_matrix_mul_uint64_128(mzd_local_t* output, const mzd_local_t* vec,
const mzd_local_t* matrix, shares_t* mask_shares) {
const uint32_t rowstride = (128) / (sizeof(word) * 8);
shares_t* tmp_mask = allocateShares(mask_shares->numWords);
for (size_t i = 0; i < 128; i++) {
const uint64_t mask_share = mask_shares->shares[128 - 1 - i];
for (uint32_t j = 0; j < 128; j += 8) {
uint8_t matrix_byte = ((const uint8_t*)matrix->w64)[(i * rowstride) + (128 - 1 - j) / 8];
uint8_t matrix_byte = matrix->w64[(i * rowstride) + (128 - 1 - j) / 64] >> (56 - (j % 64));
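      // Extracting the byte by shifting the 64-bit word (instead of casting the
      // array to uint8_t*) is endianness-independent; 56 - (j % 64) walks the
      // bytes of each word from most significant to least.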
const block_t* mask1 = &block_masks[(matrix_byte >> 4) & 0xF];
const block_t* mask2 = &block_masks[(matrix_byte >> 0) & 0xF];
@ -572,16 +572,16 @@ void mpc_matrix_mul_uint64_128(mzd_local_t* output, const mzd_local_t* vec, cons
freeShares(tmp_mask);
}
void mpc_matrix_mul_uint64_192(mzd_local_t* output, const mzd_local_t* vec, const mzd_local_t* matrix,
shares_t* mask_shares) {
const uint32_t rowstride = (256) / 8;
void mpc_matrix_mul_uint64_192(mzd_local_t* output, const mzd_local_t* vec,
const mzd_local_t* matrix, shares_t* mask_shares) {
const uint32_t rowstride = (256) / (8 * sizeof(word));
shares_t* tmp_mask = allocateShares(mask_shares->numWords);
for (size_t i = 0; i < 192; i++) {
const uint64_t mask_share = mask_shares->shares[192 - 1 - i];
for (uint32_t j = 0; j < 192; j += 8) {
uint8_t matrix_byte = ((const uint8_t*)matrix->w64)[(i * rowstride) + (192 - 1 - j) / 8];
uint8_t matrix_byte = matrix->w64[(i * rowstride) + (192 - 1 - j) / 64] >> (56 - (j % 64));
const block_t* mask1 = &block_masks[(matrix_byte >> 4) & 0xF];
const block_t* mask2 = &block_masks[(matrix_byte >> 0) & 0xF];
@ -603,16 +603,16 @@ void mpc_matrix_mul_uint64_192(mzd_local_t* output, const mzd_local_t* vec, cons
copyShares(mask_shares, tmp_mask);
freeShares(tmp_mask);
}
void mpc_matrix_mul_uint64_256(mzd_local_t* output, const mzd_local_t* vec, const mzd_local_t* matrix,
shares_t* mask_shares) {
const uint32_t rowstride = (256) / 8;
void mpc_matrix_mul_uint64_256(mzd_local_t* output, const mzd_local_t* vec,
const mzd_local_t* matrix, shares_t* mask_shares) {
const uint32_t rowstride = (256) / (8 * sizeof(word));
shares_t* tmp_mask = allocateShares(mask_shares->numWords);
for (size_t i = 0; i < 256; i++) {
const uint64_t mask_share = mask_shares->shares[256 - 1 - i];
for (uint32_t j = 0; j < 256; j += 8) {
uint8_t matrix_byte = ((const uint8_t*)matrix->w64)[(i * rowstride) + (256 - 1 - j) / 8];
uint8_t matrix_byte = matrix->w64[(i * rowstride) + (256 - 1 - j) / 64] >> (56 - (j % 64));
const block_t* mask1 = &block_masks[(matrix_byte >> 4) & 0xF];
const block_t* mask2 = &block_masks[(matrix_byte >> 0) & 0xF];
@ -636,87 +636,91 @@ void mpc_matrix_mul_uint64_256(mzd_local_t* output, const mzd_local_t* vec, cons
}
#if defined(OPTIMIZED_LINEAR_LAYER_EVALUATION)
void mpc_matrix_mul_z_uint64_128(mzd_local_t* state2, const mzd_local_t* state, shares_t* mask2_shares,
const shares_t* mask_shares, const mzd_local_t* matrix) {
const uint32_t rowstride = (128) / 8;
void mpc_matrix_mul_z_uint64_128(mzd_local_t* state2, const mzd_local_t* state,
shares_t* mask2_shares, const shares_t* mask_shares,
const mzd_local_t* matrix) {
const uint32_t rowstride = (128) / (8 * sizeof(word));
memset(mask2_shares->shares, 0, sizeof(uint64_t) * 128);
for (size_t i = 0; i < 30; i++) {
uint64_t new_mask_i = 0;
for (uint32_t j = 0; j < 128 / 8; j++) {
uint8_t matrix_byte = ((const uint8_t*)matrix->w64)[i * rowstride + (128 / 8) - 1 - j];
for (uint32_t j = 0; j < 128; j += 8) {
uint8_t matrix_byte = matrix->w64[i * rowstride + (128 - 1 - j) / 64] >> (56 - (j % 64));
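      // j now advances in bit positions (steps of 8) rather than byte indices,
      // so the share offsets below use j directly instead of j * 8.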
const block_t* mask1 = &block_masks[(matrix_byte >> 4) & 0xF];
const block_t* mask2 = &block_masks[(matrix_byte >> 0) & 0xF];
new_mask_i ^= mask_shares->shares[j * 8 + 0] & mask1->w64[0];
new_mask_i ^= mask_shares->shares[j * 8 + 1] & mask1->w64[1];
new_mask_i ^= mask_shares->shares[j * 8 + 2] & mask1->w64[2];
new_mask_i ^= mask_shares->shares[j * 8 + 3] & mask1->w64[3];
new_mask_i ^= mask_shares->shares[j * 8 + 4] & mask2->w64[0];
new_mask_i ^= mask_shares->shares[j * 8 + 5] & mask2->w64[1];
new_mask_i ^= mask_shares->shares[j * 8 + 6] & mask2->w64[2];
new_mask_i ^= mask_shares->shares[j * 8 + 7] & mask2->w64[3];
new_mask_i ^= mask_shares->shares[j + 0] & mask1->w64[0];
new_mask_i ^= mask_shares->shares[j + 1] & mask1->w64[1];
new_mask_i ^= mask_shares->shares[j + 2] & mask1->w64[2];
new_mask_i ^= mask_shares->shares[j + 3] & mask1->w64[3];
new_mask_i ^= mask_shares->shares[j + 4] & mask2->w64[0];
new_mask_i ^= mask_shares->shares[j + 5] & mask2->w64[1];
new_mask_i ^= mask_shares->shares[j + 6] & mask2->w64[2];
new_mask_i ^= mask_shares->shares[j + 7] & mask2->w64[3];
}
mask2_shares->shares[30 - 1 - i] = new_mask_i;
}
mzd_mul_v_parity_uint64_128_30(state2, state, matrix);
}
void mpc_matrix_mul_z_uint64_192(mzd_local_t* state2, const mzd_local_t* state, shares_t* mask2_shares,
const shares_t* mask_shares, const mzd_local_t* matrix) {
const uint32_t rowstride = (256) / 8;
void mpc_matrix_mul_z_uint64_192(mzd_local_t* state2, const mzd_local_t* state,
shares_t* mask2_shares, const shares_t* mask_shares,
const mzd_local_t* matrix) {
const uint32_t rowstride = (256) / (8 * sizeof(word));
memset(mask2_shares->shares, 0, sizeof(uint64_t) * 192);
for (size_t i = 0; i < 30; i++) {
uint64_t new_mask_i = 0;
for (uint32_t j = 0; j < 192 / 8; j++) {
uint8_t matrix_byte = ((const uint8_t*)matrix->w64)[i * rowstride + (192 / 8) - 1 - j];
for (uint32_t j = 0; j < 192; j += 8) {
uint8_t matrix_byte = matrix->w64[i * rowstride + (192 - 1 - j) / 64] >> (56 - (j % 64));
const block_t* mask1 = &block_masks[(matrix_byte >> 4) & 0xF];
const block_t* mask2 = &block_masks[(matrix_byte >> 0) & 0xF];
new_mask_i ^= mask_shares->shares[j * 8 + 0] & mask1->w64[0];
new_mask_i ^= mask_shares->shares[j * 8 + 1] & mask1->w64[1];
new_mask_i ^= mask_shares->shares[j * 8 + 2] & mask1->w64[2];
new_mask_i ^= mask_shares->shares[j * 8 + 3] & mask1->w64[3];
new_mask_i ^= mask_shares->shares[j * 8 + 4] & mask2->w64[0];
new_mask_i ^= mask_shares->shares[j * 8 + 5] & mask2->w64[1];
new_mask_i ^= mask_shares->shares[j * 8 + 6] & mask2->w64[2];
new_mask_i ^= mask_shares->shares[j * 8 + 7] & mask2->w64[3];
new_mask_i ^= mask_shares->shares[j + 0] & mask1->w64[0];
new_mask_i ^= mask_shares->shares[j + 1] & mask1->w64[1];
new_mask_i ^= mask_shares->shares[j + 2] & mask1->w64[2];
new_mask_i ^= mask_shares->shares[j + 3] & mask1->w64[3];
new_mask_i ^= mask_shares->shares[j + 4] & mask2->w64[0];
new_mask_i ^= mask_shares->shares[j + 5] & mask2->w64[1];
new_mask_i ^= mask_shares->shares[j + 6] & mask2->w64[2];
new_mask_i ^= mask_shares->shares[j + 7] & mask2->w64[3];
}
mask2_shares->shares[30 - 1 - i] = new_mask_i;
}
mzd_mul_v_parity_uint64_192_30(state2, state, matrix);
}
void mpc_matrix_mul_z_uint64_256(mzd_local_t* state2, const mzd_local_t* state, shares_t* mask2_shares,
const shares_t* mask_shares, const mzd_local_t* matrix) {
const uint32_t rowstride = (256) / 8;
void mpc_matrix_mul_z_uint64_256(mzd_local_t* state2, const mzd_local_t* state,
shares_t* mask2_shares, const shares_t* mask_shares,
const mzd_local_t* matrix) {
const uint32_t rowstride = (256) / (8 * sizeof(word));
memset(mask2_shares->shares, 0, sizeof(uint64_t) * 256);
for (size_t i = 0; i < 30; i++) {
uint64_t new_mask_i = 0;
for (uint32_t j = 0; j < 256 / 8; j++) {
uint8_t matrix_byte = ((const uint8_t*)matrix->w64)[i * rowstride + (256 / 8) - 1 - j];
for (uint32_t j = 0; j < 256; j += 8) {
uint8_t matrix_byte = matrix->w64[i * rowstride + (256 - 1 - j) / 64] >> (56 - (j % 64));
const block_t* mask1 = &block_masks[(matrix_byte >> 4) & 0xF];
const block_t* mask2 = &block_masks[(matrix_byte >> 0) & 0xF];
new_mask_i ^= mask_shares->shares[j * 8 + 0] & mask1->w64[0];
new_mask_i ^= mask_shares->shares[j * 8 + 1] & mask1->w64[1];
new_mask_i ^= mask_shares->shares[j * 8 + 2] & mask1->w64[2];
new_mask_i ^= mask_shares->shares[j * 8 + 3] & mask1->w64[3];
new_mask_i ^= mask_shares->shares[j * 8 + 4] & mask2->w64[0];
new_mask_i ^= mask_shares->shares[j * 8 + 5] & mask2->w64[1];
new_mask_i ^= mask_shares->shares[j * 8 + 6] & mask2->w64[2];
new_mask_i ^= mask_shares->shares[j * 8 + 7] & mask2->w64[3];
new_mask_i ^= mask_shares->shares[j + 0] & mask1->w64[0];
new_mask_i ^= mask_shares->shares[j + 1] & mask1->w64[1];
new_mask_i ^= mask_shares->shares[j + 2] & mask1->w64[2];
new_mask_i ^= mask_shares->shares[j + 3] & mask1->w64[3];
new_mask_i ^= mask_shares->shares[j + 4] & mask2->w64[0];
new_mask_i ^= mask_shares->shares[j + 5] & mask2->w64[1];
new_mask_i ^= mask_shares->shares[j + 6] & mask2->w64[2];
new_mask_i ^= mask_shares->shares[j + 7] & mask2->w64[3];
}
mask2_shares->shares[30 - 1 - i] = new_mask_i;
}
mzd_mul_v_parity_uint64_256_30(state2, state, matrix);
}
void mpc_matrix_addmul_r_uint64_128(mzd_local_t* state2, const mzd_local_t* state, shares_t* mask2_shares,
shares_t* mask_shares, const mzd_local_t* matrix) {
const uint32_t rowstride = (128) / 8;
void mpc_matrix_addmul_r_uint64_128(mzd_local_t* state2, const mzd_local_t* state,
shares_t* mask2_shares, shares_t* mask_shares,
const mzd_local_t* matrix) {
const uint32_t rowstride = (128) / (8 * sizeof(word));
shares_t* tmp_mask = allocateShares(mask_shares->numWords);
copyShares(tmp_mask, mask2_shares);
@ -724,7 +728,7 @@ void mpc_matrix_addmul_r_uint64_128(mzd_local_t* state2, const mzd_local_t* stat
const uint64_t mask_share = mask_shares->shares[30 - 1 - i];
for (uint32_t j = 0; j < 128; j += 8) {
uint8_t matrix_byte = ((const uint8_t*)matrix->w64)[(i * rowstride) + (128 - 1 - j) / 8];
uint8_t matrix_byte = matrix->w64[(i * rowstride) + (128 - 1 - j) / 64] >> (56 - (j % 64));
const block_t* mask1 = &block_masks[(matrix_byte >> 4) & 0xF];
const block_t* mask2 = &block_masks[(matrix_byte >> 0) & 0xF];
@ -745,9 +749,10 @@ void mpc_matrix_addmul_r_uint64_128(mzd_local_t* state2, const mzd_local_t* stat
freeShares(tmp_mask);
}
void mpc_matrix_addmul_r_uint64_192(mzd_local_t* state2, const mzd_local_t* state, shares_t* mask2_shares,
shares_t* mask_shares, const mzd_local_t* matrix) {
const uint32_t rowstride = 256 / 8;
void mpc_matrix_addmul_r_uint64_192(mzd_local_t* state2, const mzd_local_t* state,
shares_t* mask2_shares, shares_t* mask_shares,
const mzd_local_t* matrix) {
const uint32_t rowstride = 256 / (8 * sizeof(word));
shares_t* tmp_mask = allocateShares(mask_shares->numWords);
copyShares(tmp_mask, mask2_shares);
@ -755,7 +760,7 @@ void mpc_matrix_addmul_r_uint64_192(mzd_local_t* state2, const mzd_local_t* stat
const uint64_t mask_share = mask_shares->shares[30 - 1 - i];
for (uint32_t j = 0; j < 192; j += 8) {
uint8_t matrix_byte = ((const uint8_t*)matrix->w64)[(i * rowstride) + (192 - 1 - j) / 8];
uint8_t matrix_byte = matrix->w64[(i * rowstride) + (192 - 1 - j) / 64] >> (56 - (j % 64));
const block_t* mask1 = &block_masks[(matrix_byte >> 4) & 0xF];
const block_t* mask2 = &block_masks[(matrix_byte >> 0) & 0xF];
@ -776,9 +781,10 @@ void mpc_matrix_addmul_r_uint64_192(mzd_local_t* state2, const mzd_local_t* stat
freeShares(tmp_mask);
}
void mpc_matrix_addmul_r_uint64_256(mzd_local_t* state2, const mzd_local_t* state, shares_t* mask2_shares,
shares_t* mask_shares, const mzd_local_t* matrix) {
const uint32_t rowstride = (256) / 8;
void mpc_matrix_addmul_r_uint64_256(mzd_local_t* state2, const mzd_local_t* state,
shares_t* mask2_shares, shares_t* mask_shares,
const mzd_local_t* matrix) {
const uint32_t rowstride = (256) / (8 * sizeof(word));
shares_t* tmp_mask = allocateShares(mask_shares->numWords);
copyShares(tmp_mask, mask2_shares);
@ -786,7 +792,7 @@ void mpc_matrix_addmul_r_uint64_256(mzd_local_t* state2, const mzd_local_t* stat
const uint64_t mask_share = mask_shares->shares[30 - 1 - i];
for (uint32_t j = 0; j < 256; j += 8) {
uint8_t matrix_byte = ((const uint8_t*)matrix->w64)[(i * rowstride) + (256 - 1 - j) / 8];
uint8_t matrix_byte = matrix->w64[(i * rowstride) + (256 - 1 - j) / 64] >> (56 - (j % 64));
const block_t* mask1 = &block_masks[(matrix_byte >> 4) & 0xF];
const block_t* mask2 = &block_masks[(matrix_byte >> 0) & 0xF];
@ -814,12 +820,12 @@ void mpc_matrix_mul_nl_part_uint64_128(mzd_local_t* nl_part, const mzd_local_t*
const mzd_local_t* precomputed_constant_nl,
shares_t* nl_part_masks, const shares_t* key_masks) {
const uint32_t rowstride = ((20 * 32 + 255) / 256 * 256) / 8;
const uint32_t rowstride = ((20 * 32 + 255) / 256 * 256) / (8 * sizeof(word));
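  // Rows of the 20 * 32-bit non-linear matrix are padded up to a multiple of
  // 256 bits; rowstride is now counted in 64-bit words rather than bytes.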
for (size_t i = 0; i < 128; i++) {
const uint64_t key_mask = key_masks->shares[128 - 1 - i];
for (uint32_t j = 0; j < 20 * 32; j += 8) {
uint8_t matrix_byte = ((const uint8_t*)precomputed_nl_matrix->w64)[i * rowstride + j / 8];
uint8_t matrix_byte = precomputed_nl_matrix->w64[(i * rowstride) + j / 64] >> (j % 64);
const block_t* mask1 = &nl_part_block_masks[(matrix_byte >> 0) & 0xF];
const block_t* mask2 = &nl_part_block_masks[(matrix_byte >> 4) & 0xF];
@ -843,12 +849,12 @@ void mpc_matrix_mul_nl_part_uint64_192(mzd_local_t* nl_part, const mzd_local_t*
const mzd_local_t* precomputed_constant_nl,
shares_t* nl_part_masks, const shares_t* key_masks) {
const uint32_t rowstride = ((30 * 32 + 255) / 256 * 256) / 8;
const uint32_t rowstride = ((30 * 32 + 255) / 256 * 256) / (8 * sizeof(word));
for (size_t i = 0; i < 192; i++) {
const uint64_t key_mask = key_masks->shares[192 - 1 - i];
for (uint32_t j = 0; j < 30 * 32; j += 8) {
uint8_t matrix_byte = ((const uint8_t*)precomputed_nl_matrix->w64)[i * rowstride + j / 8];
uint8_t matrix_byte = precomputed_nl_matrix->w64[(i * rowstride) + j / 64] >> (j % 64);
const block_t* mask1 = &nl_part_block_masks[(matrix_byte >> 0) & 0xF];
const block_t* mask2 = &nl_part_block_masks[(matrix_byte >> 4) & 0xF];
@ -872,12 +878,12 @@ void mpc_matrix_mul_nl_part_uint64_256(mzd_local_t* nl_part, const mzd_local_t*
const mzd_local_t* precomputed_constant_nl,
shares_t* nl_part_masks, const shares_t* key_masks) {
const uint32_t rowstride = ((38 * 32 + 255) / 256 * 256) / 8;
const uint32_t rowstride = ((38 * 32 + 255) / 256 * 256) / (8 * sizeof(word));
for (size_t i = 0; i < 256; i++) {
const uint64_t key_mask = key_masks->shares[256 - 1 - i];
for (uint32_t j = 0; j < 38 * 32; j += 8) {
uint8_t matrix_byte = ((uint8_t*)precomputed_nl_matrix->w64)[i * rowstride + j / 8];
uint8_t matrix_byte = precomputed_nl_matrix->w64[(i * rowstride) + j / 64] >> (j % 64);
const block_t* mask1 = &nl_part_block_masks[(matrix_byte >> 0) & 0xF];
const block_t* mask2 = &nl_part_block_masks[(matrix_byte >> 4) & 0xF];
@ -902,7 +908,7 @@ void mpc_matrix_mul_nl_part_uint64_256(mzd_local_t* nl_part, const mzd_local_t*
ATTR_TARGET_S128
void mpc_matrix_mul_s128_128(mzd_local_t* output, const mzd_local_t* vec, const mzd_local_t* matrix,
shares_t* mask_shares) {
const uint32_t rowstride = (128) / 8;
const uint32_t rowstride = (128) / (sizeof(word) * 8);
shares_t* tmp_mask = allocateShares(mask_shares->numWords);
for (size_t i = 0; i < 128; i++) {
@ -913,7 +919,7 @@ void mpc_matrix_mul_s128_128(mzd_local_t* output, const mzd_local_t* vec, const
word128* tmp_mask_block = (word128*)tmp_mask->shares;
for (uint32_t j = 0; j < 128; j += 8, tmp_mask_block += 4) {
uint8_t matrix_byte = ((const uint8_t*)matrix->w64)[(i * rowstride) + (128 - 1 - j) / 8];
uint8_t matrix_byte = matrix->w64[(i * rowstride) + (128 - 1 - j) / 64] >> (56 - (j % 64));
mask1 = block_masks[(matrix_byte >> 4) & 0xf].w128[0];
mask2 = block_masks[(matrix_byte >> 4) & 0xf].w128[1];
@ -935,7 +941,7 @@ void mpc_matrix_mul_s128_128(mzd_local_t* output, const mzd_local_t* vec, const
ATTR_TARGET_S128
void mpc_matrix_mul_s128_192(mzd_local_t* output, const mzd_local_t* vec, const mzd_local_t* matrix,
shares_t* mask_shares) {
const uint32_t rowstride = (256) / 8;
const uint32_t rowstride = (256) / (8 * sizeof(word));
shares_t* tmp_mask = allocateShares(mask_shares->numWords);
for (size_t i = 0; i < 192; i++) {
@ -946,7 +952,7 @@ void mpc_matrix_mul_s128_192(mzd_local_t* output, const mzd_local_t* vec, const
word128* tmp_mask_block = (word128*)tmp_mask->shares;
for (uint32_t j = 0; j < 192; j += 8, tmp_mask_block += 4) {
uint8_t matrix_byte = ((const uint8_t*)matrix->w64)[(i * rowstride) + (192 - 1 - j) / 8];
uint8_t matrix_byte = matrix->w64[(i * rowstride) + (192 - 1 - j) / 64] >> (56 - (j % 64));
mask1 = block_masks[(matrix_byte >> 4) & 0xf].w128[0];
mask2 = block_masks[(matrix_byte >> 4) & 0xf].w128[1];
@ -968,7 +974,7 @@ void mpc_matrix_mul_s128_192(mzd_local_t* output, const mzd_local_t* vec, const
ATTR_TARGET_S128
void mpc_matrix_mul_s128_256(mzd_local_t* output, const mzd_local_t* vec, const mzd_local_t* matrix,
shares_t* mask_shares) {
const uint32_t rowstride = (256) / 8;
const uint32_t rowstride = (256) / (8 * sizeof(word));
shares_t* tmp_mask = allocateShares(mask_shares->numWords);
for (size_t i = 0; i < 256; i++) {
@ -979,7 +985,7 @@ void mpc_matrix_mul_s128_256(mzd_local_t* output, const mzd_local_t* vec, const
word128* tmp_mask_block = (word128*)tmp_mask->shares;
for (uint32_t j = 0; j < 256; j += 8, tmp_mask_block += 4) {
uint8_t matrix_byte = ((const uint8_t*)matrix->w64)[(i * rowstride) + (256 - 1 - j) / 8];
uint8_t matrix_byte = matrix->w64[(i * rowstride) + (256 - 1 - j) / 64] >> (56 - (j % 64));
mask1 = block_masks[(matrix_byte >> 4) & 0xf].w128[0];
mask2 = block_masks[(matrix_byte >> 4) & 0xf].w128[1];
@ -1000,15 +1006,16 @@ void mpc_matrix_mul_s128_256(mzd_local_t* output, const mzd_local_t* vec, const
#if defined(OPTIMIZED_LINEAR_LAYER_EVALUATION)
ATTR_TARGET_S128
void mpc_matrix_mul_z_s128_128(mzd_local_t* state2, const mzd_local_t* state, shares_t* mask2_shares,
const shares_t* mask_shares, const mzd_local_t* matrix) {
const uint32_t rowstride = (128) / 8;
void mpc_matrix_mul_z_s128_128(mzd_local_t* state2, const mzd_local_t* state,
shares_t* mask2_shares, const shares_t* mask_shares,
const mzd_local_t* matrix) {
const uint32_t rowstride = (128) / (8 * sizeof(word));
memset(mask2_shares->shares, 0, sizeof(uint64_t) * 128);
for (size_t i = 0; i < 30; i++) {
block_t new_mask_i = {{0, 0, 0, 0}};
word128* tmp_mask_block = (word128*)mask_shares->shares;
for (uint32_t j = 0; j < 128 / 8; j++, tmp_mask_block += 4) {
uint8_t matrix_byte = ((const uint8_t*)matrix->w64)[i * rowstride + (128 / 8) - 1 - j];
for (uint32_t j = 0; j < 128; j += 8, tmp_mask_block += 4) {
uint8_t matrix_byte = matrix->w64[i * rowstride + (128 - 1 - j) / 64] >> (56 - (j % 64));
const block_t* mask1 = &block_masks[(matrix_byte >> 4) & 0xF];
const block_t* mask2 = &block_masks[(matrix_byte >> 0) & 0xF];
@ -1025,15 +1032,16 @@ void mpc_matrix_mul_z_s128_128(mzd_local_t* state2, const mzd_local_t* state, sh
}
ATTR_TARGET_S128
void mpc_matrix_mul_z_s128_192(mzd_local_t* state2, const mzd_local_t* state, shares_t* mask2_shares,
const shares_t* mask_shares, const mzd_local_t* matrix) {
const uint32_t rowstride = (256) / 8;
void mpc_matrix_mul_z_s128_192(mzd_local_t* state2, const mzd_local_t* state,
shares_t* mask2_shares, const shares_t* mask_shares,
const mzd_local_t* matrix) {
const uint32_t rowstride = (256) / (8 * sizeof(word));
memset(mask2_shares->shares, 0, sizeof(uint64_t) * 192);
for (size_t i = 0; i < 30; i++) {
block_t new_mask_i = {{0, 0, 0, 0}};
word128* tmp_mask_block = (word128*)mask_shares->shares;
for (uint32_t j = 0; j < 192 / 8; j++, tmp_mask_block += 4) {
uint8_t matrix_byte = ((const uint8_t*)matrix->w64)[i * rowstride + (192 / 8) - 1 - j];
for (uint32_t j = 0; j < 192; j += 8, tmp_mask_block += 4) {
uint8_t matrix_byte = matrix->w64[i * rowstride + (192 - 1 - j) / 64] >> (56 - (j % 64));
const block_t* mask1 = &block_masks[(matrix_byte >> 4) & 0xF];
const block_t* mask2 = &block_masks[(matrix_byte >> 0) & 0xF];
@ -1050,15 +1058,16 @@ void mpc_matrix_mul_z_s128_192(mzd_local_t* state2, const mzd_local_t* state, sh
}
ATTR_TARGET_S128
void mpc_matrix_mul_z_s128_256(mzd_local_t* state2, const mzd_local_t* state, shares_t* mask2_shares,
const shares_t* mask_shares, const mzd_local_t* matrix) {
const uint32_t rowstride = (256) / 8;
void mpc_matrix_mul_z_s128_256(mzd_local_t* state2, const mzd_local_t* state,
shares_t* mask2_shares, const shares_t* mask_shares,
const mzd_local_t* matrix) {
const uint32_t rowstride = (256) / (8 * sizeof(word));
memset(mask2_shares->shares, 0, sizeof(uint64_t) * 256);
for (size_t i = 0; i < 30; i++) {
block_t new_mask_i = {{0, 0, 0, 0}};
word128* tmp_mask_block = (word128*)mask_shares->shares;
for (uint32_t j = 0; j < 256 / 8; j++, tmp_mask_block += 4) {
uint8_t matrix_byte = ((const uint8_t*)matrix->w64)[i * rowstride + (256 / 8) - 1 - j];
for (uint32_t j = 0; j < 256; j += 8, tmp_mask_block += 4) {
uint8_t matrix_byte = matrix->w64[i * rowstride + (256 - 1 - j) / 64] >> (56 - (j % 64));
const block_t* mask1 = &block_masks[(matrix_byte >> 4) & 0xF];
const block_t* mask2 = &block_masks[(matrix_byte >> 0) & 0xF];
@ -1075,9 +1084,10 @@ void mpc_matrix_mul_z_s128_256(mzd_local_t* state2, const mzd_local_t* state, sh
}
ATTR_TARGET_S128
void mpc_matrix_addmul_r_s128_128(mzd_local_t* state2, const mzd_local_t* state, shares_t* mask2_shares,
shares_t* mask_shares, const mzd_local_t* matrix) {
const uint32_t rowstride = (128) / 8;
void mpc_matrix_addmul_r_s128_128(mzd_local_t* state2, const mzd_local_t* state,
shares_t* mask2_shares, shares_t* mask_shares,
const mzd_local_t* matrix) {
const uint32_t rowstride = (128) / (8 * sizeof(word));
shares_t* tmp_mask = allocateShares(mask_shares->numWords);
copyShares(tmp_mask, mask2_shares);
@ -1088,7 +1098,7 @@ void mpc_matrix_addmul_r_s128_128(mzd_local_t* state2, const mzd_local_t* state,
word128* tmp_mask_block = (word128*)tmp_mask->shares;
for (uint32_t j = 0; j < 128; j += 8, tmp_mask_block += 4) {
uint8_t matrix_byte = ((const uint8_t*)matrix->w64)[(i * rowstride) + (128 - 1 - j) / 8];
uint8_t matrix_byte = matrix->w64[(i * rowstride) + (128 - 1 - j) / 64] >> (56 - (j % 64));
mask1 = block_masks[(matrix_byte >> 4) & 0xf].w128[0];
mask2 = block_masks[(matrix_byte >> 4) & 0xf].w128[1];
@ -1108,9 +1118,10 @@ void mpc_matrix_addmul_r_s128_128(mzd_local_t* state2, const mzd_local_t* state,
}
ATTR_TARGET_S128
void mpc_matrix_addmul_r_s128_192(mzd_local_t* state2, const mzd_local_t* state, shares_t* mask2_shares,
shares_t* mask_shares, const mzd_local_t* matrix) {
const uint32_t rowstride = (256) / 8;
void mpc_matrix_addmul_r_s128_192(mzd_local_t* state2, const mzd_local_t* state,
shares_t* mask2_shares, shares_t* mask_shares,
const mzd_local_t* matrix) {
const uint32_t rowstride = 256 / (8 * sizeof(word));
shares_t* tmp_mask = allocateShares(mask_shares->numWords);
copyShares(tmp_mask, mask2_shares);
@ -1121,7 +1132,7 @@ void mpc_matrix_addmul_r_s128_192(mzd_local_t* state2, const mzd_local_t* state,
word128* tmp_mask_block = (word128*)tmp_mask->shares;
for (uint32_t j = 0; j < 192; j += 8, tmp_mask_block += 4) {
uint8_t matrix_byte = ((const uint8_t*)matrix)[(i * rowstride) + (192 - 1 - j) / 8];
uint8_t matrix_byte = matrix->w64[(i * rowstride) + (192 - 1 - j) / 64] >> (56 - (j % 64));
mask1 = block_masks[(matrix_byte >> 4) & 0xf].w128[0];
mask2 = block_masks[(matrix_byte >> 4) & 0xf].w128[1];
@ -1141,9 +1152,10 @@ void mpc_matrix_addmul_r_s128_192(mzd_local_t* state2, const mzd_local_t* state,
}
ATTR_TARGET_S128
void mpc_matrix_addmul_r_s128_256(mzd_local_t* state2, const mzd_local_t* state, shares_t* mask2_shares,
shares_t* mask_shares, const mzd_local_t* matrix) {
const uint32_t rowstride = (256) / 8;
void mpc_matrix_addmul_r_s128_256(mzd_local_t* state2, const mzd_local_t* state,
shares_t* mask2_shares, shares_t* mask_shares,
const mzd_local_t* matrix) {
const uint32_t rowstride = (256) / (8 * sizeof(word));
shares_t* tmp_mask = allocateShares(mask_shares->numWords);
copyShares(tmp_mask, mask2_shares);
@ -1154,7 +1166,7 @@ void mpc_matrix_addmul_r_s128_256(mzd_local_t* state2, const mzd_local_t* state,
word128* tmp_mask_block = (word128*)tmp_mask->shares;
for (uint32_t j = 0; j < 256; j += 8, tmp_mask_block += 4) {
uint8_t matrix_byte = ((const uint8_t*)matrix)[(i * rowstride) + (256 - 1 - j) / 8];
uint8_t matrix_byte = matrix->w64[(i * rowstride) + (256 - 1 - j) / 64] >> (56 - (j % 64));
mask1 = block_masks[(matrix_byte >> 4) & 0xf].w128[0];
mask2 = block_masks[(matrix_byte >> 4) & 0xf].w128[1];
@ -1181,7 +1193,7 @@ void mpc_matrix_mul_nl_part_s128_128(mzd_local_t* nl_part, const mzd_local_t* ke
const mzd_local_t* precomputed_constant_nl,
shares_t* nl_part_masks, const shares_t* key_masks) {
const uint32_t rowstride = ((20 * 32 + 255) / 256 * 256) / 8;
const uint32_t rowstride = ((20 * 32 + 255) / 256 * 256) / (8 * sizeof(word));
for (size_t i = 0; i < 128; i++) {
const uint64_t key_mask = key_masks->shares[128 - 1 - i];
const block_t mask_share2 = {{key_mask, key_mask, key_mask, key_mask}};
@ -1190,7 +1202,7 @@ void mpc_matrix_mul_nl_part_s128_128(mzd_local_t* nl_part, const mzd_local_t* ke
word128* tmp_mask_block = (word128*)nl_part_masks->shares;
for (uint32_t j = 0; j < 20 * 32; j += 8, tmp_mask_block += 4) {
uint8_t matrix_byte = ((uint8_t*)precomputed_nl_matrix->w64)[i * rowstride + j / 8];
uint8_t matrix_byte = precomputed_nl_matrix->w64[(i * rowstride) + j / 64] >> (j % 64);
mask1 = nl_part_block_masks[(matrix_byte >> 0) & 0xf].w128[0];
mask2 = nl_part_block_masks[(matrix_byte >> 0) & 0xf].w128[1];
@ -1213,7 +1225,7 @@ void mpc_matrix_mul_nl_part_s128_192(mzd_local_t* nl_part, const mzd_local_t* ke
const mzd_local_t* precomputed_constant_nl,
shares_t* nl_part_masks, const shares_t* key_masks) {
const uint32_t rowstride = ((30 * 32 + 255) / 256 * 256) / 8;
const uint32_t rowstride = ((30 * 32 + 255) / 256 * 256) / (8 * sizeof(word));
for (size_t i = 0; i < 192; i++) {
const uint64_t key_mask = key_masks->shares[192 - 1 - i];
const block_t mask_share2 = {{key_mask, key_mask, key_mask, key_mask}};
@ -1222,7 +1234,7 @@ void mpc_matrix_mul_nl_part_s128_192(mzd_local_t* nl_part, const mzd_local_t* ke
word128* tmp_mask_block = (word128*)nl_part_masks->shares;
for (uint32_t j = 0; j < 30 * 32; j += 8, tmp_mask_block += 4) {
uint8_t matrix_byte = ((uint8_t*)precomputed_nl_matrix->w64)[i * rowstride + j / 8];
uint8_t matrix_byte = precomputed_nl_matrix->w64[(i * rowstride) + j / 64] >> (j % 64);
mask1 = nl_part_block_masks[(matrix_byte >> 0) & 0xf].w128[0];
mask2 = nl_part_block_masks[(matrix_byte >> 0) & 0xf].w128[1];
@ -1245,7 +1257,7 @@ void mpc_matrix_mul_nl_part_s128_256(mzd_local_t* nl_part, const mzd_local_t* ke
const mzd_local_t* precomputed_constant_nl,
shares_t* nl_part_masks, const shares_t* key_masks) {
const uint32_t rowstride = ((38 * 32 + 255) / 256 * 256) / 8;
const uint32_t rowstride = ((38 * 32 + 255) / 256 * 256) / (8 * sizeof(word));
for (size_t i = 0; i < 256; i++) {
const uint64_t key_mask = key_masks->shares[256 - 1 - i];
const block_t mask_share2 = {{key_mask, key_mask, key_mask, key_mask}};
@ -1254,7 +1266,7 @@ void mpc_matrix_mul_nl_part_s128_256(mzd_local_t* nl_part, const mzd_local_t* ke
word128* tmp_mask_block = (word128*)nl_part_masks->shares;
for (uint32_t j = 0; j < 38 * 32; j += 8, tmp_mask_block += 4) {
uint8_t matrix_byte = ((uint8_t*)precomputed_nl_matrix->w64)[i * rowstride + j / 8];
uint8_t matrix_byte = precomputed_nl_matrix->w64[(i * rowstride) + j / 64] >> (j % 64);
mask1 = nl_part_block_masks[(matrix_byte >> 0) & 0xf].w128[0];
mask2 = nl_part_block_masks[(matrix_byte >> 0) & 0xf].w128[1];
@ -1277,7 +1289,7 @@ void mpc_matrix_mul_nl_part_s128_256(mzd_local_t* nl_part, const mzd_local_t* ke
ATTR_TARGET_AVX2
void mpc_matrix_mul_s256_128(mzd_local_t* output, const mzd_local_t* vec, const mzd_local_t* matrix,
shares_t* mask_shares) {
const uint32_t rowstride = (128) / 8;
const uint32_t rowstride = (128) / (sizeof(word) * 8);
shares_t* tmp_mask = allocateShares(mask_shares->numWords);
for (size_t i = 0; i < 128; i++) {
@ -1288,9 +1300,9 @@ void mpc_matrix_mul_s256_128(mzd_local_t* output, const mzd_local_t* vec, const
word256* tmp_mask_block = (word256*)tmp_mask->shares;
for (uint32_t j = 0; j < 128; j += 8, tmp_mask_block += 2) {
uint8_t matrix_byte = ((const uint8_t*)matrix)[(i * rowstride) + (128 - 1 - j) / 8];
mask1 = block_masks[(matrix_byte >> 4) & 0xf].w256;
mask2 = block_masks[(matrix_byte >> 0) & 0xf].w256;
uint8_t matrix_byte = matrix->w64[(i * rowstride) + (128 - 1 - j) / 64] >> (56 - (j % 64));
mask1 = block_masks[(matrix_byte >> 4) & 0xf].w256;
mask2 = block_masks[(matrix_byte >> 0) & 0xf].w256;
tmp_mask_block[0] = mm256_xor_mask(tmp_mask_block[0], mask_share2, mask1);
tmp_mask_block[1] = mm256_xor_mask(tmp_mask_block[1], mask_share2, mask2);
@ -1305,7 +1317,7 @@ void mpc_matrix_mul_s256_128(mzd_local_t* output, const mzd_local_t* vec, const
ATTR_TARGET_AVX2
void mpc_matrix_mul_s256_192(mzd_local_t* output, const mzd_local_t* vec, const mzd_local_t* matrix,
shares_t* mask_shares) {
const uint32_t rowstride = (256) / 8;
const uint32_t rowstride = (256) / (8 * sizeof(word));
shares_t* tmp_mask = allocateShares(mask_shares->numWords);
for (size_t i = 0; i < 192; i++) {
@ -1316,9 +1328,9 @@ void mpc_matrix_mul_s256_192(mzd_local_t* output, const mzd_local_t* vec, const
word256* tmp_mask_block = (word256*)tmp_mask->shares;
for (uint32_t j = 0; j < 192; j += 8, tmp_mask_block += 2) {
uint8_t matrix_byte = ((const uint8_t*)matrix->w64)[(i * rowstride) + (192 - 1 - j) / 8];
mask1 = block_masks[(matrix_byte >> 4) & 0xf].w256;
mask2 = block_masks[(matrix_byte >> 0) & 0xf].w256;
uint8_t matrix_byte = matrix->w64[(i * rowstride) + (192 - 1 - j) / 64] >> (56 - (j % 64));
mask1 = block_masks[(matrix_byte >> 4) & 0xf].w256;
mask2 = block_masks[(matrix_byte >> 0) & 0xf].w256;
tmp_mask_block[0] = mm256_xor_mask(tmp_mask_block[0], mask_share2, mask1);
tmp_mask_block[1] = mm256_xor_mask(tmp_mask_block[1], mask_share2, mask2);
@ -1333,7 +1345,7 @@ void mpc_matrix_mul_s256_192(mzd_local_t* output, const mzd_local_t* vec, const
ATTR_TARGET_AVX2
void mpc_matrix_mul_s256_256(mzd_local_t* output, const mzd_local_t* vec, const mzd_local_t* matrix,
shares_t* mask_shares) {
const uint32_t rowstride = (256) / 8;
const uint32_t rowstride = (256) / (8 * sizeof(word));
shares_t* tmp_mask = allocateShares(mask_shares->numWords);
for (size_t i = 0; i < 256; i++) {
@ -1344,9 +1356,9 @@ void mpc_matrix_mul_s256_256(mzd_local_t* output, const mzd_local_t* vec, const
word256* tmp_mask_block = (word256*)tmp_mask->shares;
for (uint32_t j = 0; j < 256; j += 8, tmp_mask_block += 2) {
uint8_t matrix_byte = ((const uint8_t*)matrix->w64)[(i * rowstride) + (256 - 1 - j) / 8];
mask1 = block_masks[(matrix_byte >> 4) & 0xf].w256;
mask2 = block_masks[(matrix_byte >> 0) & 0xf].w256;
uint8_t matrix_byte = matrix->w64[(i * rowstride) + (256 - 1 - j) / 64] >> (56 - (j % 64));
mask1 = block_masks[(matrix_byte >> 4) & 0xf].w256;
mask2 = block_masks[(matrix_byte >> 0) & 0xf].w256;
tmp_mask_block[0] = mm256_xor_mask(tmp_mask_block[0], mask_share2, mask1);
tmp_mask_block[1] = mm256_xor_mask(tmp_mask_block[1], mask_share2, mask2);
@ -1360,15 +1372,16 @@ void mpc_matrix_mul_s256_256(mzd_local_t* output, const mzd_local_t* vec, const
#if defined(OPTIMIZED_LINEAR_LAYER_EVALUATION)
ATTR_TARGET_AVX2
void mpc_matrix_mul_z_s256_128(mzd_local_t* state2, const mzd_local_t* state, shares_t* mask2_shares,
const shares_t* mask_shares, const mzd_local_t* matrix) {
const uint32_t rowstride = (128) / 8;
void mpc_matrix_mul_z_s256_128(mzd_local_t* state2, const mzd_local_t* state,
shares_t* mask2_shares, const shares_t* mask_shares,
const mzd_local_t* matrix) {
const uint32_t rowstride = (128) / (8 * sizeof(word));
memset(mask2_shares->shares, 0, sizeof(uint64_t) * 128);
for (size_t i = 0; i < 30; i++) {
block_t new_mask_i = {{0, 0, 0, 0}};
word256* tmp_mask_block = (word256*)mask_shares->shares;
for (uint32_t j = 0; j < 128 / 8; j++, tmp_mask_block += 2) {
uint8_t matrix_byte = ((const uint8_t*)matrix->w64)[i * rowstride + (128 / 8) - 1 - j];
for (uint32_t j = 0; j < 128; j += 8, tmp_mask_block += 2) {
uint8_t matrix_byte = matrix->w64[i * rowstride + (128 - 1 - j) / 64] >> (56 - (j % 64));
const block_t* mask1 = &block_masks[(matrix_byte >> 4) & 0xF];
const block_t* mask2 = &block_masks[(matrix_byte >> 0) & 0xF];
@ -1383,15 +1396,16 @@ void mpc_matrix_mul_z_s256_128(mzd_local_t* state2, const mzd_local_t* state, sh
}
ATTR_TARGET_AVX2
void mpc_matrix_mul_z_s256_192(mzd_local_t* state2, const mzd_local_t* state, shares_t* mask2_shares,
const shares_t* mask_shares, const mzd_local_t* matrix) {
const uint32_t rowstride = (256) / 8;
void mpc_matrix_mul_z_s256_192(mzd_local_t* state2, const mzd_local_t* state,
shares_t* mask2_shares, const shares_t* mask_shares,
const mzd_local_t* matrix) {
const uint32_t rowstride = (256) / (8 * sizeof(word));
memset(mask2_shares->shares, 0, sizeof(uint64_t) * 192);
for (size_t i = 0; i < 30; i++) {
block_t new_mask_i = {{0, 0, 0, 0}};
word256* tmp_mask_block = (word256*)mask_shares->shares;
for (uint32_t j = 0; j < 192 / 8; j++, tmp_mask_block += 2) {
uint8_t matrix_byte = ((const uint8_t*)matrix->w64)[i * rowstride + (192 / 8) - 1 - j];
for (uint32_t j = 0; j < 192; j += 8, tmp_mask_block += 2) {
uint8_t matrix_byte = matrix->w64[i * rowstride + (192 - 1 - j) / 64] >> (56 - (j % 64));
const block_t* mask1 = &block_masks[(matrix_byte >> 4) & 0xF];
const block_t* mask2 = &block_masks[(matrix_byte >> 0) & 0xF];
@ -1406,15 +1420,16 @@ void mpc_matrix_mul_z_s256_192(mzd_local_t* state2, const mzd_local_t* state, sh
}
ATTR_TARGET_AVX2
void mpc_matrix_mul_z_s256_256(mzd_local_t* state2, const mzd_local_t* state, shares_t* mask2_shares,
const shares_t* mask_shares, const mzd_local_t* matrix) {
const uint32_t rowstride = (256) / 8;
void mpc_matrix_mul_z_s256_256(mzd_local_t* state2, const mzd_local_t* state,
shares_t* mask2_shares, const shares_t* mask_shares,
const mzd_local_t* matrix) {
const uint32_t rowstride = (256) / (8 * sizeof(word));
memset(mask2_shares->shares, 0, sizeof(uint64_t) * 256);
for (size_t i = 0; i < 30; i++) {
block_t new_mask_i = {{0, 0, 0, 0}};
word256* tmp_mask_block = (word256*)mask_shares->shares;
for (uint32_t j = 0; j < 256 / 8; j++, tmp_mask_block += 2) {
uint8_t matrix_byte = ((const uint8_t*)matrix->w64)[i * rowstride + (256 / 8) - 1 - j];
for (uint32_t j = 0; j < 256; j += 8, tmp_mask_block += 2) {
uint8_t matrix_byte = matrix->w64[i * rowstride + (256 - 1 - j) / 64] >> (56 - (j % 64));
const block_t* mask1 = &block_masks[(matrix_byte >> 4) & 0xF];
const block_t* mask2 = &block_masks[(matrix_byte >> 0) & 0xF];
@ -1429,10 +1444,11 @@ void mpc_matrix_mul_z_s256_256(mzd_local_t* state2, const mzd_local_t* state, sh
}
ATTR_TARGET_AVX2
void mpc_matrix_addmul_r_s256_128(mzd_local_t* state2, const mzd_local_t* state, shares_t* mask2_shares,
shares_t* mask_shares, const mzd_local_t* matrix) {
void mpc_matrix_addmul_r_s256_128(mzd_local_t* state2, const mzd_local_t* state,
shares_t* mask2_shares, shares_t* mask_shares,
const mzd_local_t* matrix) {
const uint32_t rowstride = (128) / 8;
const uint32_t rowstride = (128) / (8 * sizeof(word));
shares_t* tmp_mask = allocateShares(mask_shares->numWords);
copyShares(tmp_mask, mask2_shares);
@ -1443,7 +1459,7 @@ void mpc_matrix_addmul_r_s256_128(mzd_local_t* state2, const mzd_local_t* state,
word256* tmp_mask_block = (word256*)tmp_mask->shares;
for (uint32_t j = 0; j < 128; j += 8, tmp_mask_block += 2) {
uint8_t matrix_byte = ((const uint8_t*)matrix->w64)[(i * rowstride) + (128 - 1 - j) / 8];
uint8_t matrix_byte = matrix->w64[(i * rowstride) + (128 - 1 - j) / 64] >> (56 - (j % 64));
mask1 = block_masks[(matrix_byte >> 4) & 0xf].w256;
mask2 = block_masks[(matrix_byte >> 0) & 0xf].w256;
@ -1458,9 +1474,10 @@ void mpc_matrix_addmul_r_s256_128(mzd_local_t* state2, const mzd_local_t* state,
}
ATTR_TARGET_AVX2
void mpc_matrix_addmul_r_s256_192(mzd_local_t* state2, const mzd_local_t* state, shares_t* mask2_shares,
shares_t* mask_shares, const mzd_local_t* matrix) {
const uint32_t rowstride = (256) / 8;
void mpc_matrix_addmul_r_s256_192(mzd_local_t* state2, const mzd_local_t* state,
shares_t* mask2_shares, shares_t* mask_shares,
const mzd_local_t* matrix) {
const uint32_t rowstride = 256 / (8 * sizeof(word));
shares_t* tmp_mask = allocateShares(mask_shares->numWords);
copyShares(tmp_mask, mask2_shares);
@ -1471,7 +1488,7 @@ void mpc_matrix_addmul_r_s256_192(mzd_local_t* state2, const mzd_local_t* state,
word256* tmp_mask_block = (word256*)tmp_mask->shares;
for (uint32_t j = 0; j < 192; j += 8, tmp_mask_block += 2) {
uint8_t matrix_byte = ((const uint8_t*)matrix->w64)[(i * rowstride) + (192 - 1 - j) / 8];
uint8_t matrix_byte = matrix->w64[(i * rowstride) + (192 - 1 - j) / 64] >> (56 - (j % 64));
mask1 = block_masks[(matrix_byte >> 4) & 0xf].w256;
mask2 = block_masks[(matrix_byte >> 0) & 0xf].w256;
@ -1487,9 +1504,10 @@ void mpc_matrix_addmul_r_s256_192(mzd_local_t* state2, const mzd_local_t* state,
}
ATTR_TARGET_AVX2
void mpc_matrix_addmul_r_s256_256(mzd_local_t* state2, const mzd_local_t* state, shares_t* mask2_shares,
shares_t* mask_shares, const mzd_local_t* matrix) {
const uint32_t rowstride = (256) / 8;
void mpc_matrix_addmul_r_s256_256(mzd_local_t* state2, const mzd_local_t* state,
shares_t* mask2_shares, shares_t* mask_shares,
const mzd_local_t* matrix) {
const uint32_t rowstride = (256) / (8 * sizeof(word));
shares_t* tmp_mask = allocateShares(mask_shares->numWords);
copyShares(tmp_mask, mask2_shares);
@ -1500,7 +1518,7 @@ void mpc_matrix_addmul_r_s256_256(mzd_local_t* state2, const mzd_local_t* state,
word256* tmp_mask_block = (word256*)tmp_mask->shares;
for (uint32_t j = 0; j < 256; j += 8, tmp_mask_block += 2) {
uint8_t matrix_byte = ((const uint8_t*)matrix->w64)[(i * rowstride) + (256 - 1 - j) / 8];
uint8_t matrix_byte = matrix->w64[(i * rowstride) + (256 - 1 - j) / 64] >> (56 - (j % 64));
mask1 = block_masks[(matrix_byte >> 4) & 0xf].w256;
mask2 = block_masks[(matrix_byte >> 0) & 0xf].w256;
@ -1523,7 +1541,7 @@ void mpc_matrix_mul_nl_part_s256_128(mzd_local_t* nl_part, const mzd_local_t* ke
const mzd_local_t* precomputed_constant_nl,
shares_t* nl_part_masks, const shares_t* key_masks) {
const uint32_t rowstride = ((20 * 32 + 255) / 256 * 256) / 8;
const uint32_t rowstride = ((20 * 32 + 255) / 256 * 256) / (8 * sizeof(word));
for (size_t i = 0; i < 128; i++) {
const uint64_t key_mask = key_masks->shares[128 - 1 - i];
const word256 mask_share2 = _mm256_set1_epi64x(key_mask);
@ -1532,7 +1550,7 @@ void mpc_matrix_mul_nl_part_s256_128(mzd_local_t* nl_part, const mzd_local_t* ke
word256* tmp_mask_block = (word256*)nl_part_masks->shares;
for (uint32_t j = 0; j < 20 * 32; j += 8, tmp_mask_block += 2) {
uint8_t matrix_byte = ((uint8_t*)precomputed_nl_matrix->w64)[i * rowstride + j / 8];
uint8_t matrix_byte = precomputed_nl_matrix->w64[(i * rowstride) + j / 64] >> (j % 64);
mask1 = nl_part_block_masks[(matrix_byte >> 0) & 0xf].w256;
mask2 = nl_part_block_masks[(matrix_byte >> 4) & 0xf].w256;
@ -1551,7 +1569,7 @@ void mpc_matrix_mul_nl_part_s256_192(mzd_local_t* nl_part, const mzd_local_t* ke
const mzd_local_t* precomputed_constant_nl,
shares_t* nl_part_masks, const shares_t* key_masks) {
const uint32_t rowstride = ((30 * 32 + 255) / 256 * 256) / 8;
const uint32_t rowstride = ((30 * 32 + 255) / 256 * 256) / (8 * sizeof(word));
for (size_t i = 0; i < 192; i++) {
const uint64_t key_mask = key_masks->shares[192 - 1 - i];
const word256 mask_share2 = _mm256_set1_epi64x(key_mask);
@ -1560,7 +1578,7 @@ void mpc_matrix_mul_nl_part_s256_192(mzd_local_t* nl_part, const mzd_local_t* ke
word256* tmp_mask_block = (word256*)nl_part_masks->shares;
for (uint32_t j = 0; j < 30 * 32; j += 8, tmp_mask_block += 2) {
uint8_t matrix_byte = ((uint8_t*)precomputed_nl_matrix->w64)[i * rowstride + j / 8];
uint8_t matrix_byte = precomputed_nl_matrix->w64[(i * rowstride) + j / 64] >> (j % 64);
mask1 = nl_part_block_masks[(matrix_byte >> 0) & 0xf].w256;
mask2 = nl_part_block_masks[(matrix_byte >> 4) & 0xf].w256;
@ -1578,7 +1596,7 @@ void mpc_matrix_mul_nl_part_s256_256(mzd_local_t* nl_part, const mzd_local_t* ke
const mzd_local_t* precomputed_nl_matrix,
const mzd_local_t* precomputed_constant_nl,
shares_t* nl_part_masks, const shares_t* key_masks) {
const uint32_t rowstride = ((38 * 32 + 255) / 256 * 256) / 8;
const uint32_t rowstride = ((38 * 32 + 255) / 256 * 256) / (8 * sizeof(word));
for (size_t i = 0; i < 256; i++) {
const uint64_t key_mask = key_masks->shares[256 - 1 - i];
const word256 mask_share2 = _mm256_set1_epi64x(key_mask);
@ -1587,7 +1605,7 @@ void mpc_matrix_mul_nl_part_s256_256(mzd_local_t* nl_part, const mzd_local_t* ke
word256* tmp_mask_block = (word256*)nl_part_masks->shares;
for (uint32_t j = 0; j < 38 * 32; j += 8, tmp_mask_block += 2) {
uint8_t matrix_byte = ((uint8_t*)precomputed_nl_matrix->w64)[i * rowstride + j / 8];
uint8_t matrix_byte = precomputed_nl_matrix->w64[(i * rowstride) + j / 64] >> (j % 64);
mask1 = nl_part_block_masks[(matrix_byte >> 0) & 0xf].w256;
mask2 = nl_part_block_masks[(matrix_byte >> 4) & 0xf].w256;
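Reviewer note: every hunk in this file makes the same two changes — matrix bytes are now read by shifting 64-bit words (w64[... / 64] >> ...) instead of casting to uint8_t*, which makes the byte order explicit on big-endian targets, and each byte is still split into two nibbles indexing a table of precomputed XOR masks. A minimal scalar sketch of that technique (illustrative only; masks, acc1/acc2, and plain uint64_t stand in for the block_masks table and the 256-bit share blocks):
#include <stdint.h>
/* Extract the byte at bit offset j (a multiple of 8, big-endian bit order)
 * from an array of 64-bit words, matching the new lines above. */
static uint8_t matrix_byte_at(const uint64_t* w64, uint32_t j) {
  return (uint8_t)(w64[j / 64] >> (56 - (j % 64)));
}
/* Two 4-bit table lookups replace eight conditional XORs per matrix byte. */
static void accumulate_masks(uint64_t* acc1, uint64_t* acc2, uint8_t byte,
                             const uint64_t masks[16]) {
  *acc1 ^= masks[(byte >> 4) & 0xf];
  *acc2 ^= masks[byte & 0xf];
}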

View File

@ -293,14 +293,6 @@ static uint64_t uint64_from_bitstream_10(bitstream_t* bs) {
return ((uint64_t)bitstream_get_bits_32(bs, 30)) << (64 - 30);
}
static void uint64_to_bitstream_1(bitstream_t* bs, const uint64_t v) {
bitstream_put_bits_8(bs, v >> (64 - 3), 3);
}
static uint64_t uint64_from_bitstream_1(bitstream_t* bs) {
return ((uint64_t)bitstream_get_bits_8(bs, 3)) << (64 - 3);
}
static void compress_view(uint8_t* dst, const picnic_instance_t* pp, const view_t* views,
const unsigned int idx) {
const size_t num_views = pp->lowmc->r;
@ -310,14 +302,8 @@ static void compress_view(uint8_t* dst, const picnic_instance_t* pp, const view_
bs.position = 0;
const view_t* v = &views[0];
if (pp->lowmc->m == 10) {
for (size_t i = 0; i < num_views; ++i, ++v) {
uint64_to_bitstream_10(&bs, v->t[idx]);
}
} else if (pp->lowmc->m == 1) {
for (size_t i = 0; i < num_views; ++i, ++v) {
uint64_to_bitstream_1(&bs, v->t[idx]);
}
for (size_t i = 0; i < num_views; ++i, ++v) {
uint64_to_bitstream_10(&bs, v->t[idx]);
}
}
@ -330,14 +316,8 @@ static void decompress_view(view_t* views, const picnic_instance_t* pp, const ui
bs.position = 0;
view_t* v = &views[0];
if (pp->lowmc->m == 10) {
for (size_t i = 0; i < num_views; ++i, ++v) {
v->t[idx] = uint64_from_bitstream_10(&bs);
}
} else if (pp->lowmc->m == 1) {
for (size_t i = 0; i < num_views; ++i, ++v) {
v->t[idx] = uint64_from_bitstream_1(&bs);
}
for (size_t i = 0; i < num_views; ++i, ++v) {
v->t[idx] = uint64_from_bitstream_10(&bs);
}
}
@ -350,15 +330,8 @@ static void decompress_random_tape(rvec_t* rvec, const picnic_instance_t* pp, co
bs.position = 0;
rvec_t* rv = &rvec[0];
if (pp->lowmc->m == 10) {
for (size_t i = 0; i < num_views; ++i, ++rv) {
rv->t[idx] = uint64_from_bitstream_10(&bs);
}
} else if (pp->lowmc->m == 1) {
for (size_t i = 0; i < num_views; ++i, ++rv) {
rv->t[idx] = uint64_from_bitstream_1(&bs);
}
for (size_t i = 0; i < num_views; ++i, ++rv) {
rv->t[idx] = uint64_from_bitstream_10(&bs);
}
}
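Reviewer note: with the m=1 paths deleted, every (de)compression loop packs a fixed 30 bits per round (3 bits per Sbox × 10 Sboxes), taken from the top of each 64-bit view word — the removed variants packed only 3. A hypothetical standalone sketch of the retained packing, where put_bits is an illustrative stand-in for bitstream_put_bits_32:
#include <stddef.h>
#include <stdint.h>
/* Hypothetical MSB-first bit writer into a zeroed byte buffer. */
typedef struct {
  uint8_t* buf;
  size_t position; /* in bits */
} bitwriter_t;
static void put_bits(bitwriter_t* bw, uint32_t v, unsigned count) {
  for (unsigned i = count; i-- > 0; bw->position++) {
    bw->buf[bw->position / 8] |=
        (uint8_t)(((v >> i) & 1) << (7 - (bw->position % 8)));
  }
}
/* Only the top 30 bits of each view word carry Sbox data, mirroring
 * uint64_to_bitstream_10 in the retained code path. */
static void pack_views_m10(bitwriter_t* bw, const uint64_t* views, size_t rounds) {
  for (size_t i = 0; i < rounds; i++) {
    put_bits(bw, (uint32_t)(views[i] >> (64 - 30)), 30);
  }
}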

View File

@ -43,26 +43,6 @@ const uint8_t HASH_PREFIX_5 = 5;
#define LOWMC_L5_OR_NULL NULL
#endif
// L1, L3, and L5 lowmc instances with 1 SBOX
#if defined(WITH_LOWMC_128_128_182)
#include "lowmc_128_128_182.h"
#define LOWMC_L1_1_OR_NULL &lowmc_128_128_182
#else
#define LOWMC_L1_1_OR_NULL NULL
#endif
#if defined(WITH_LOWMC_192_192_284)
#include "lowmc_192_192_284.h"
#define LOWMC_L3_1_OR_NULL &lowmc_192_192_284
#else
#define LOWMC_L3_1_OR_NULL NULL
#endif
#if defined(WITH_LOWMC_256_256_363)
#include "lowmc_256_256_363.h"
#define LOWMC_L5_1_OR_NULL &lowmc_256_256_363
#else
#define LOWMC_L5_1_OR_NULL NULL
#endif
#if defined(WITH_ZKBPP)
#define ENABLE_ZKBPP(x) x
#else
@ -81,9 +61,11 @@ const uint8_t HASH_PREFIX_5 = 5;
#elif defined(WITH_ZKBPP)
#define NULL_FNS \
{ NULL, NULL, NULL, NULL, NULL }
#else
#elif defined(WITH_KKW)
#define NULL_FNS \
{ NULL }
{ NULL, NULL, NULL }
#else
#error "At least one of WITH_ZKBPP and WITH_KKW have to be defined!"
#endif
static picnic_instance_t instances[PARAMETER_SET_MAX_INDEX] = {
@ -107,19 +89,7 @@ static picnic_instance_t instances[PARAMETER_SET_MAX_INDEX] = {
PICNIC_SIGNATURE_SIZE_Picnic2_L3_FS, Picnic2_L3_FS, TRANSFORM_FS, NULL_FNS},
{ENABLE_KKW(LOWMC_L5_OR_NULL), 64, 32, 803, 50, 64, 32, 32, 143, 30, 110, 0, 0,
PICNIC_SIGNATURE_SIZE_Picnic2_L5_FS, Picnic2_L5_FS, TRANSFORM_FS, NULL_FNS},
// Picnic with LowMC with m=1
{ENABLE_ZKBPP(LOWMC_L1_1_OR_NULL), 32, 16, 219, 219, 3, 16, 16, 69, 3, 55, 0, 0,
PICNIC_SIGNATURE_SIZE_Picnic_L1_1_FS, Picnic_L1_1_FS, TRANSFORM_FS, NULL_FNS},
{ENABLE_ZKBPP(LOWMC_L1_1_OR_NULL), 32, 16, 219, 219, 3, 16, 16, 69, 3, 55, 87, 103,
PICNIC_SIGNATURE_SIZE_Picnic_L1_1_UR, Picnic_L1_1_UR, TRANSFORM_UR, NULL_FNS},
{ENABLE_ZKBPP(LOWMC_L3_1_OR_NULL), 48, 24, 329, 329, 3, 24, 24, 107, 3, 83, 0, 0,
PICNIC_SIGNATURE_SIZE_Picnic_L3_1_FS, Picnic_L3_1_FS, TRANSFORM_FS, NULL_FNS},
{ENABLE_ZKBPP(LOWMC_L3_1_OR_NULL), 48, 24, 329, 329, 3, 24, 24, 107, 3, 83, 131, 155,
PICNIC_SIGNATURE_SIZE_Picnic_L3_1_UR, Picnic_L3_1_UR, TRANSFORM_UR, NULL_FNS},
{ENABLE_ZKBPP(LOWMC_L5_1_OR_NULL), 64, 32, 438, 438, 3, 32, 32, 137, 3, 110, 0, 0,
PICNIC_SIGNATURE_SIZE_Picnic_L5_1_FS, Picnic_L5_1_FS, TRANSFORM_FS, NULL_FNS},
{ENABLE_ZKBPP(LOWMC_L5_1_OR_NULL), 64, 32, 438, 438, 3, 32, 32, 137, 3, 110, 169, 201,
PICNIC_SIGNATURE_SIZE_Picnic_L5_1_UR, Picnic_L5_1_UR, TRANSFORM_UR, NULL_FNS}};
};
static bool instance_initialized[PARAMETER_SET_MAX_INDEX];
static bool create_instance(picnic_instance_t* pp) {

View File

@ -22,8 +22,6 @@
#include "picnic.h"
#define SALT_SIZE 32
#define MAX_LOWMC_ROUNDS 38
#define MAX_LOWMC_SBOXES 10
#define MAX_DIGEST_SIZE 64
#define MAX_NUM_ROUNDS 438
#define MAX_VIEW_SIZE 143

View File

@ -0,0 +1,159 @@
/*
* This file is part of the optimized implementation of the Picnic signature scheme.
* See the accompanying documentation for complete details.
*
* The code is provided under the MIT license, see LICENSE for
* more details.
* SPDX-License-Identifier: MIT
*/
#ifndef SHA3_S390_CPACF_H
#define SHA3_S390_CPACF_H
#include <string.h>
#include <stdint.h>
#include "macros.h"
typedef struct {
uint8_t ctx[200]; /* param block */
uint8_t data_block[168]; /* buffer for input and output data */
uint8_t func; /* function: SHAKE128 or SHAKE256 */
uint8_t data_block_size; /* block size */
uint8_t pos; /* current position in data_block */
} hash_context ATTR_ALIGNED(32);
static inline void hash_init(hash_context* ctx, size_t digest_size) {
memset(ctx->ctx, 0, sizeof(ctx->ctx));
memset(ctx->data_block, 0, sizeof(ctx->data_block));
if (digest_size == 32) {
/* SHAKE128 */
ctx->func = 0x24;
ctx->data_block_size = 168;
} else {
/* SHAKE256 */
ctx->func = 0x25;
ctx->data_block_size = 136;
}
ctx->pos = 0;
}
/**
* Perform KIMD instruction (hash multiple blocks of 168 (SHAKE128) or 136 (SHAKE256) bytes)
*/
static inline void hash_update_kimd(hash_context* ctx, const uint8_t* data, size_t size) {
/* function code in GR 0 */
register long func asm("0") = ctx->func;
/* param block in GR 1 */
register uint8_t* param asm("1") = ctx->ctx;
/* input data in an even numbered GR (goes into R2) */
register const uint8_t* src asm("2") = data;
/* size of input data (goes into R2+1); needs to be a multiple of the data block size */
register size_t src_size asm("3") = size;
asm volatile("0: .insn rre,0xb93e0000,0,%[src]\n\t" /* KIMD opcode */
" brc 1,0b\n\t" /* handle partial completion */
: [src] "+a"(src), "+d"(src_size)
: "d"(func), "a"(param)
: "cc", "memory");
}
/**
* Perform KLMD instruction (hash and pad the last block of < 168 (SHAKE128) or 136 (SHAKE256) bytes
* and produce XOF output of a block size)
*/
static inline void hash_update_klmd(hash_context* ctx, uint8_t* buffer, size_t buffer_size, const uint8_t* data, size_t size) {
/* function code in GR 0 */
register long func asm("0") = ctx->func;
/* param block in GR 1 */
register uint8_t* param asm("1") = ctx->ctx;
/* input data in an even numbered GR (goes into R2) */
register const uint8_t* src asm("2") = data;
/* size of input data (goes into R2+1) */
register size_t src_size asm("3") = size;
/* output buffer in an even numbered GR (goes into R1) */
register unsigned char* dst asm("4") = buffer;
/* size of output buffer (goes into R1+1); needs to be a multiple of the data block size */
register long dst_size asm("5") = buffer_size;
asm volatile("0: .insn rre,0xb93f0000,%[dst],%[src]\n\t" /* KLMD opcode */
" brc 1,0b\n\t" /* handle partial completion */
: [src] "+a"(src), "+d"(src_size), [dst] "+a"(dst), "+d"(dst_size)
: "d"(func), "a"(param)
: "cc", "memory");
}
/**
* Perform KLMD instruction (produce XOF output of a block size)
*/
static inline void hash_squeeze_klmd(hash_context* ctx, uint8_t* buffer, size_t size) {
hash_update_klmd(ctx, buffer, size, NULL, 0);
}
static inline void hash_update(hash_context* ctx, const uint8_t* data, size_t size) {
/* process buffered data */
if (ctx->pos) {
const size_t gap = ctx->data_block_size - ctx->pos;
const size_t copy_size = MIN(gap, size);
memcpy(ctx->data_block + ctx->pos, data, copy_size);
ctx->pos += copy_size;
data += copy_size;
size -= copy_size;
if (ctx->pos == ctx->data_block_size) {
hash_update_kimd(ctx, ctx->data_block, ctx->data_block_size);
ctx->pos = 0;
}
}
/* process as many full blocks as possible */
if (size > ctx->data_block_size) {
const size_t copy_size = size - (size % ctx->data_block_size);
hash_update_kimd(ctx, data, copy_size);
data += copy_size;
size -= copy_size;
}
/* buffer remaining data */
if (size) {
memcpy(ctx->data_block, data, size);
ctx->pos = size;
}
}
static inline void hash_final(hash_context* ctx) {
/* process remaining input (if available) */
hash_update_klmd(ctx, ctx->data_block, ctx->data_block_size, ctx->data_block, ctx->pos);
ctx->pos = 0;
}
static inline void hash_squeeze(hash_context* ctx, uint8_t* buffer, size_t buflen) {
/* process buffered output */
if (ctx->pos < ctx->data_block_size) {
const size_t gap = ctx->data_block_size - ctx->pos;
const size_t copy_size = MIN(gap, buflen);
memcpy(buffer, ctx->data_block + ctx->pos, copy_size);
ctx->pos += copy_size;
buffer += copy_size;
buflen -= copy_size;
}
/* either ctx->pos == ctx->data_block_size or buflen == 0 */
/* process as many full blocks as possible */
if (buflen > ctx->data_block_size) {
const size_t copy_size = buflen - (buflen % ctx->data_block_size);
hash_squeeze_klmd(ctx, buffer, copy_size);
buffer += copy_size;
buflen -= copy_size;
}
if (buflen) {
hash_squeeze_klmd(ctx, ctx->data_block, ctx->data_block_size);
memcpy(buffer, ctx->data_block, buflen);
ctx->pos = buflen;
}
}
#endif
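Reviewer note: the four inlines above compose into the usual init/update/final/squeeze XOF flow. A minimal usage sketch (hypothetical caller; s390x-only, since the CPACF asm has no fallback, and it assumes this header is included; per hash_init, a 32-byte digest size selects SHAKE128):
#include <stddef.h>
#include <stdint.h>
static void shake128_cpacf(uint8_t* out, size_t outlen,
                           const uint8_t* msg, size_t msglen) {
  hash_context ctx;
  hash_init(&ctx, 32);             /* digest_size 32 => SHAKE128 (func 0x24) */
  hash_update(&ctx, msg, msglen);  /* KIMD full blocks, buffer the tail */
  hash_final(&ctx);                /* KLMD pads and absorbs the remainder */
  hash_squeeze(&ctx, out, outlen); /* stream arbitrary-length output */
}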

View File

@ -28,10 +28,14 @@
#if defined(__x86_64__) || defined(_M_X64) || defined(__i386__) || defined(_M_IX86)
#if defined(BUILTIN_CPU_SUPPORTED)
#if !defined(BUILTIN_CPU_SUPPORTED_BROKEN_BMI2)
#define CPU_SUPPORTS_AVX2 (__builtin_cpu_supports("avx2") && __builtin_cpu_supports("bmi2"))
#else
#define CPU_SUPPORTS_AVX2 (__builtin_cpu_supports("avx2") && cpu_supports(CPU_CAP_BMI2))
#endif
#define CPU_SUPPORTS_POPCNT __builtin_cpu_supports("popcnt")
#else
#define CPU_SUPPORTS_AVX2 cpu_supports(CPU_CAP_AVX2)
#define CPU_SUPPORTS_AVX2 cpu_supports(CPU_CAP_AVX2 | CPU_CAP_BMI2)
#define CPU_SUPPORTS_POPCNT cpu_supports(CPU_CAP_POPCNT)
#endif
#endif
@ -39,6 +43,9 @@
#if defined(__x86_64__) || defined(_M_X64)
// X86-64 CPUs always support SSE2
#define CPU_SUPPORTS_SSE2 1
#if defined(WITH_SSE2) || defined(WITH_AVX2)
#define NO_UINT64_FALLBACK
#endif
#elif defined(__i386__) || defined(_M_IX86)
#if defined(BUILTIN_CPU_SUPPORTED)
#define CPU_SUPPORTS_SSE2 __builtin_cpu_supports("sse2")
@ -51,6 +58,9 @@
#if defined(__aarch64__)
#define CPU_SUPPORTS_NEON 1
#if defined(WITH_NEON)
#define NO_UINT64_FALLBACK
#endif
#elif defined(__arm__)
#define CPU_SUPPORTS_NEON cpu_supports(CPU_CAP_NEON)
#else
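Reviewer note: these macros feed runtime dispatch between the vectorized and uint64 implementations, and NO_UINT64_FALLBACK additionally compiles the portable path out where the target guarantees the vector instructions. A hedged sketch of that dispatch shape, with impl_avx2/impl_sse2/impl_uint64 as hypothetical stand-ins:
typedef void (*impl_fn)(void);
static void impl_avx2(void);
static void impl_sse2(void);
static void impl_uint64(void);
static impl_fn select_impl(void) {
#if defined(WITH_AVX2)
  if (CPU_SUPPORTS_AVX2) { /* now requires BMI2 as well, per the change above */
    return impl_avx2;
  }
#endif
#if defined(WITH_SSE2)
  if (CPU_SUPPORTS_SSE2) {
    return impl_sse2;
  }
#endif
#if !defined(NO_UINT64_FALLBACK)
  return impl_uint64;
#else
  /* unreachable: NO_UINT64_FALLBACK is only defined when a vector
   * path above is guaranteed to be taken */
  return 0;
#endif
}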

View File

@ -114,7 +114,7 @@ OQS_SIG *OQS_SIG_picnic_L1_FS_new() {
return NULL;
}
sig->method_name = OQS_SIG_alg_picnic_L1_FS;
sig->alg_version = "https://github.com/IAIK/Picnic/tree/v2.1.2";
sig->alg_version = "https://github.com/IAIK/Picnic/commit/9917e33194d0b540c09706c68fb707c4912edeed";
sig->claimed_nist_level = 1;
sig->euf_cma = true;
@ -153,7 +153,7 @@ OQS_SIG *OQS_SIG_picnic_L1_UR_new() {
return NULL;
}
sig->method_name = OQS_SIG_alg_picnic_L1_UR;
sig->alg_version = "https://github.com/IAIK/Picnic/tree/v2.1.2";
sig->alg_version = "https://github.com/IAIK/Picnic/commit/9917e33194d0b540c09706c68fb707c4912edeed";
sig->claimed_nist_level = 1;
sig->euf_cma = true;
@ -192,7 +192,7 @@ OQS_SIG *OQS_SIG_picnic_L3_FS_new() {
return NULL;
}
sig->method_name = OQS_SIG_alg_picnic_L3_FS;
sig->alg_version = "https://github.com/IAIK/Picnic/tree/v2.1.2";
sig->alg_version = "https://github.com/IAIK/Picnic/commit/9917e33194d0b540c09706c68fb707c4912edeed";
sig->claimed_nist_level = 3;
sig->euf_cma = true;
@ -231,7 +231,7 @@ OQS_SIG *OQS_SIG_picnic_L3_UR_new() {
return NULL;
}
sig->method_name = OQS_SIG_alg_picnic_L3_UR;
sig->alg_version = "https://github.com/IAIK/Picnic/tree/v2.1.2";
sig->alg_version = "https://github.com/IAIK/Picnic/commit/9917e33194d0b540c09706c68fb707c4912edeed";
sig->claimed_nist_level = 3;
sig->euf_cma = true;
@ -270,7 +270,7 @@ OQS_SIG *OQS_SIG_picnic_L5_FS_new() {
return NULL;
}
sig->method_name = OQS_SIG_alg_picnic_L5_FS;
sig->alg_version = "https://github.com/IAIK/Picnic/tree/v2.1.2";
sig->alg_version = "https://github.com/IAIK/Picnic/commit/9917e33194d0b540c09706c68fb707c4912edeed";
sig->claimed_nist_level = 5;
sig->euf_cma = true;
@ -310,7 +310,7 @@ OQS_SIG *OQS_SIG_picnic_L5_UR_new() {
}
sig->method_name = OQS_SIG_alg_picnic_L5_UR;
sig->alg_version = "https://github.com/IAIK/Picnic/tree/v2.1.2";
sig->alg_version = "https://github.com/IAIK/Picnic/commit/9917e33194d0b540c09706c68fb707c4912edeed";
sig->claimed_nist_level = 5;
sig->euf_cma = true;
@ -347,7 +347,7 @@ OQS_SIG *OQS_SIG_picnic2_L1_FS_new() {
return NULL;
}
sig->method_name = OQS_SIG_alg_picnic2_L1_FS;
sig->alg_version = "https://github.com/IAIK/Picnic/tree/v2.1.2";
sig->alg_version = "https://github.com/IAIK/Picnic/commit/9917e33194d0b540c09706c68fb707c4912edeed";
sig->claimed_nist_level = 1;
sig->euf_cma = true;
@ -385,7 +385,7 @@ OQS_SIG *OQS_SIG_picnic2_L3_FS_new() {
return NULL;
}
sig->method_name = OQS_SIG_alg_picnic2_L3_FS;
sig->alg_version = "https://github.com/IAIK/Picnic/tree/v2.1.2";
sig->alg_version = "https://github.com/IAIK/Picnic/commit/9917e33194d0b540c09706c68fb707c4912edeed";
sig->claimed_nist_level = 3;
sig->euf_cma = true;
@ -423,7 +423,7 @@ OQS_SIG *OQS_SIG_picnic2_L5_FS_new() {
return NULL;
}
sig->method_name = OQS_SIG_alg_picnic2_L5_FS;
sig->alg_version = "https://github.com/IAIK/Picnic/tree/v2.1.2";
sig->alg_version = "https://github.com/IAIK/Picnic/commit/9917e33194d0b540c09706c68fb707c4912edeed";
sig->claimed_nist_level = 5;
sig->euf_cma = true;