Updated picnic to v2.2. (#746)

* Updated picnic to v2.2.

* Re-enabled optimizations with clang-9/10

* Integrated commit 9917e3 from Picnic, fixing a bug with 128-bit word loading.

* Removed hardcoded alignment macro for picnic.

* Removed references to the now-unused USE_OPTIMIZATIONS.
Christian Paquin 2020-05-07 15:47:34 -04:00 committed by GitHub
parent 17c03a1bd2
commit 216cb1a930
46 changed files with 562 additions and 348117 deletions

View File

@ -31,7 +31,7 @@ Implementation
--------------
- **Source of implementation:** https://github.com/IAIK/Picnic
- **Implementation version:** https://github.com/IAIK/Picnic/tree/v2.1.2
- **Implementation version:** https://github.com/IAIK/Picnic/commit/9917e33194d0b540c09706c68fb707c4912edeed (v2.2 + bug fix)
- **License:** MIT License
- **Language:** C
- **Constant-time:** Yes

View File

@ -21,11 +21,8 @@ set(SRCS sig_picnic.c
external/io.c
external/lowmc.c
external/lowmc_128_128_20.c
external/lowmc_128_128_182.c
external/lowmc_192_192_284.c
external/lowmc_192_192_30.c
external/lowmc_256_256_38.c
external/lowmc_256_256_363.c
external/mpc_lowmc.c
external/mzd_additional.c
external/picnic.c
@ -39,18 +36,9 @@ set(SRCS sig_picnic.c
external/sha3/KeccakHash.c
external/sha3/KeccakSpongeWidth1600.c)
# TODO: The optimized Picnic code, when
# compiled with clang-9 and clang-10, results
# in signing and verification failures.
if(CMAKE_C_COMPILER_ID MATCHES "Clang" OR OQS_PORTABLE_BUILD)
set(USE_OPTIMIZATIONS OFF)
else()
set(USE_OPTIMIZATIONS ON)
endif()
if(CMAKE_HOST_SYSTEM_NAME STREQUAL "Linux" AND
OQS_USE_AVX2_INSTRUCTIONS AND
OQS_USE_BMI2_INSTRUCTIONS AND
USE_OPTIMIZATIONS)
OQS_USE_BMI2_INSTRUCTIONS)
set(USE_AVX2 ON)
endif()
@ -81,11 +69,8 @@ target_compile_definitions(picnic PRIVATE PICNIC_STATIC
WITH_LOWMC_192_192_30
WITH_LOWMC_256_256_38
WITH_OPT)
if(NOT WIN32)
target_compile_definitions(picnic PRIVATE HAVE_POSIX_MEMALIGN)
endif()
if(OQS_USE_SSE2_INSTRUCTIONS AND USE_OPTIMIZATIONS)
if(OQS_USE_SSE2_INSTRUCTIONS)
target_compile_definitions(picnic PRIVATE WITH_SSE2)
add_compile_options(-msse2)
endif()

View File

@ -1,7 +1,15 @@
Version 2.2 -- 2020-04-08
---------------------------
* Fix Picnic2 implementation on big endian systems
* Add support for SHA3/SHAKE instructions on IBM z.
* Various small improvements and bug fixes.
* Remove LowMC instances with m=1.
Version 2.1.2 -- 2019-10-03
---------------------------
* Enable to build with ZKB++- or KKW-based instances only.
* Add options to build with ZKB++- or KKW-based instances only.
* Fix ARM NEON optimizations.
* Slightly reduce heap usage.
* Remove more unused code.

View File

@ -34,7 +34,7 @@ The cmake based build system supports the following flags:
* ``WITH_MARCH_NATIVE``: Build with -march=native -mtune=native (if supported).
* ``WITH_LTO``: Enable link-time optimization (if supported).
* ``WITH_LOWMC_OPT={OFF,ORKC,OLLE}``: Enable optimized round key computation (ORKC) or optimized linear layer evaluation (OLLE).
* ``WITH_LOWMC_M1``: Enable LowMC instances with 1 Sbox minimizing the signature sizes (only useful if built with ``WITH_ZKBPP`` on).
* ``WITH_SHA3_IMPL={opt64,avx2,armv8a-neon,s390-cpacf}``: Select the SHA3 implementation: opt64 (the default, from the Keccak code package), avx2 (for AVX2-capable x86-64 systems, from the Keccak code package), armv8a-neon (for NEON-capable ARM systems, from the Keccak code package), or s390-cpacf (for IBM z14 and newer systems supporting SHAKE).
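For example, an AVX2-capable build using OLLE and the avx2 SHA3 implementation could be configured roughly as follows (an illustrative invocation, not a tested recipe):

mkdir build && cd build
cmake -DWITH_MARCH_NATIVE=ON -DWITH_LOWMC_OPT=OLLE -DWITH_SHA3_IMPL=avx2 ..
make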
Building on Windows
-------------------

View File

@ -10,9 +10,7 @@
#ifdef HAVE_CONFIG_H
#include <config.h>
#else
/* If cmake checks were not run, define some known values. */
#if !defined(HAVE_SYS_AUXV_H) && defined(__linux__)
#define HAVE_SYS_AUXV_H
#endif
@ -24,7 +22,7 @@
#include "cpu.h"
#if !defined(BUILTIN_CPU_SUPPORTED)
#if !defined(BUILTIN_CPU_SUPPORTED) || defined(BUILTIN_CPU_SUPPORTED_BROKEN_BMI2)
#if defined(__arm__) && defined(HAVE_SYS_AUXV_H) && defined(HAVE_ASM_HWCAP_H)
#include <asm/hwcap.h>
#include <sys/auxv.h>
@ -67,39 +65,18 @@ static unsigned init_caps(void) {
if (max >= 7) {
__cpuidex(regs.data, 7, 0);
if (regs.ebx & ((1 << 5) | (1 << 8))) {
if (regs.ebx & (1 << 5)) {
caps |= CPU_CAP_AVX2;
}
if (regs.ebx & (1 << 8)) {
caps |= CPU_CAP_BMI2;
}
}
return caps;
}
#else
#if defined(SUPERCOP)
// SUPERCOP places a cpuid.h on the include search path before the system
// provided cpuid.h. We hack around that by assuming that cpuid always exists
// and defining __get_cpuid on our own.
static int __get_cpuid(unsigned int leaf, unsigned int* reax, unsigned int* rebx,
unsigned int* recx, unsigned int* redx) {
unsigned int eax, ebx, ecx, edx;
__asm__("cpuid\n" : "=a"(eax), "=b"(ebx), "=c"(ecx), "=d"(edx) : "0"(leaf & 0x80000000));
if (eax == 0 || eax < leaf) {
return 0;
}
__asm__("cpuid\n" : "=a"(eax), "=b"(ebx), "=c"(ecx), "=d"(edx) : "0"(leaf));
*reax = eax;
*rebx = ebx;
*recx = ecx;
*redx = edx;
return 1;
}
#else
#include <cpuid.h>
#endif
static unsigned init_caps(void) {
unsigned int caps = 0;
@ -115,9 +92,12 @@ static unsigned init_caps(void) {
}
if (__get_cpuid(7, &eax, &ebx, &ecx, &edx)) {
if (ebx & ((1 << 5) | (1 << 8))) {
if (ebx & (1 << 5)) {
caps |= CPU_CAP_AVX2;
}
if (ebx & (1 << 8)) {
caps |= CPU_CAP_BMI2;
}
}
return caps;
@ -141,9 +121,6 @@ bool cpu_supports(unsigned int caps) {
cpu_caps = init_caps();
}
return cpu_caps & caps;
return (cpu_caps & caps) == caps;
}
#endif
// OQS note: add a dummy definition to avoid empty translation unit (which might occur with -Werror=pedantic)
typedef int avoid_empty_translation_unit;
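The change from `cpu_caps & caps` to `(cpu_caps & caps) == caps` matters now that AVX2 and BMI2 are tracked as separate bits: a caller asking for both capabilities must only get true when all requested bits are present. A minimal hedged sketch of the difference, with illustrative values not taken from the diff:

#include <stdbool.h>

#define CPU_CAP_AVX2 0x00000004u /* values match cpu.h below */
#define CPU_CAP_BMI2 0x00000010u

/* Hypothetical cached capability word: AVX2 present, BMI2 absent. */
static const unsigned int cpu_caps = CPU_CAP_AVX2;

static bool supports_any(unsigned int caps) { return cpu_caps & caps; }           /* old, buggy */
static bool supports_all(unsigned int caps) { return (cpu_caps & caps) == caps; } /* fixed */

/* supports_any(CPU_CAP_AVX2 | CPU_CAP_BMI2) -> true:  would wrongly select AVX2+BMI2 code paths.
   supports_all(CPU_CAP_AVX2 | CPU_CAP_BMI2) -> false: correctly falls back. */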

View File

@ -10,21 +10,29 @@
#ifndef CPU_H
#define CPU_H
#include "macros.h"
#if defined(__GNUC__) && !(defined(__APPLE__) && (__clang_major__ <= 8)) && \
!defined(__MINGW32__) && !defined(__MINGW64__)
!defined(__MINGW32__) && !defined(__MINGW64__)
#define BUILTIN_CPU_SUPPORTED
#endif
#if !defined(BUILTIN_CPU_SUPPORTED)
#if defined(BUILTIN_CPU_SUPPORTED) && GNUC_CHECK(4, 9) && !GNUC_CHECK(5, 0)
/* gcc 4.9's __builtin_cpu_supports does not support "bmi2" */
#define BUILTIN_CPU_SUPPORTED_BROKEN_BMI2
#endif
#if !defined(BUILTIN_CPU_SUPPORTED) || defined(BUILTIN_CPU_SUPPORTED_BROKEN_BMI2)
#include <stdbool.h>
#include "oqs_picnic_macros.h"
/* CPU supports SSE2 */
#define CPU_CAP_SSE2 0x00000001
/* CPU supports popcnt */
#define CPU_CAP_POPCNT 0x00000002
/* CPU supports AVX2 + BMI2 */
/* CPU supports AVX2 */
#define CPU_CAP_AVX2 0x00000004
/* CPU supports BMI2 */
#define CPU_CAP_BMI2 0x00000010
/* CPU supports NEON */
#define CPU_CAP_NEON 0x00000008

View File

@ -12,6 +12,13 @@
#include <stdint.h>
#include "macros.h"
#include "endian_compat.h"
#if defined(WITH_SHAKE_S390_CPACF)
/* use the KIMD/KLMD instructions from CPACF for SHAKE support on S390 */
#include "sha3/s390_cpacf.h"
#else
#if !defined(KeccakP200_excluded)
#define KeccakP200_excluded
#endif
@ -25,11 +32,14 @@
#endif
#if !defined(SUPERCOP)
/* use SHAKE implementation in sha3/ */
#include "sha3/KeccakHash.h"
#if defined(WITH_KECCAK_X4)
/* use the Keccakx4 implementation */
#include "sha3/KeccakHashtimes4.h"
#endif
#else
/* use SUPERCOP implementation */
#include <libkeccak.a.headers/KeccakHash.h>
#if defined(WITH_KECCAK_X4)
/* Keccakx4 is not fully supported by SUPERCOP, so we need to ship it ourselves. */
@ -37,9 +47,6 @@
#endif
#endif
#include "macros.h"
#include "endian_compat.h"
typedef Keccak_HashInstance hash_context ATTR_ALIGNED(32);
/**
@ -58,6 +65,15 @@ static inline void hash_update(hash_context* ctx, const uint8_t* data, size_t si
Keccak_HashUpdate(ctx, data, size << 3);
}
static inline void hash_final(hash_context* ctx) {
Keccak_HashFinal(ctx, NULL);
}
static inline void hash_squeeze(hash_context* ctx, uint8_t* buffer, size_t buflen) {
Keccak_HashSqueeze(ctx, buffer, buflen << 3);
}
#endif
static inline void hash_update_uint16_le(hash_context* ctx, uint16_t data) {
const uint16_t data_le = htole16(data);
hash_update(ctx, (const uint8_t*)&data_le, sizeof(data_le));
@ -69,14 +85,6 @@ static inline void hash_init_prefix(hash_context* ctx, size_t digest_size,
hash_update(ctx, &prefix, sizeof(prefix));
}
static inline void hash_final(hash_context* ctx) {
Keccak_HashFinal(ctx, NULL);
}
static inline void hash_squeeze(hash_context* ctx, uint8_t* buffer, size_t buflen) {
Keccak_HashSqueeze(ctx, buffer, buflen << 3);
}
typedef hash_context kdf_shake_t;
#define kdf_shake_init(ctx, digest_size) hash_init((ctx), (digest_size))
@ -182,4 +190,5 @@ typedef hash_context_x4 kdf_shake_x4_t;
#define kdf_shake_x4_finalize_key(ctx) hash_final_x4((ctx))
#define kdf_shake_x4_get_randomness(ctx, dst, count) hash_squeeze_x4((ctx), (dst), (count))
#define kdf_shake_x4_clear(ctx)
#endif
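The wrapper API above boils down to absorb/finalize/squeeze. A hedged usage sketch (assuming this header is included and that kdf_shake_finalize_key and kdf_shake_get_randomness exist as the non-x4 counterparts of the x4 macros above):

#include <stdint.h>

static void derive_example(uint8_t out[64]) {
  kdf_shake_t ctx;
  kdf_shake_init(&ctx, 32);       /* 32-byte digest size selects SHAKE256 (assumption) */
  hash_update_uint16_le(&ctx, 7); /* absorb a little-endian-encoded counter */
  kdf_shake_finalize_key(&ctx);   /* switch from absorbing to squeezing */
  kdf_shake_get_randomness(&ctx, out, 64);
}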

View File

@ -13,7 +13,6 @@
#include "io.h"
#include "lowmc.h"
#include "mzd_additional.h"
#if defined(WITH_KKW)
#include "picnic2_impl.h"
@ -50,31 +49,6 @@ static void sbox_layer_10_uint64(uint64_t* d) {
*d = sbox_layer_10_bitsliced_uint64(*d);
}
#if defined(WITH_LOWMC_M1)
static uint64_t sbox_layer_1_bitsliced_uint64(uint64_t in) {
// a, b, c
const uint64_t x0s = (in & MASK_X0I_1) << 2;
const uint64_t x1s = (in & MASK_X1I_1) << 1;
const uint64_t x2m = in & MASK_X2I_1;
// (b & c) ^ a
const uint64_t t0 = (x1s & x2m) ^ x0s;
// (c & a) ^ a ^ b
const uint64_t t1 = (x0s & x2m) ^ x0s ^ x1s;
// (a & b) ^ a ^ b ^c
const uint64_t t2 = (x0s & x1s) ^ x0s ^ x1s ^ x2m;
return (in & MASK_MASK_1) ^ (t0 >> 2) ^ (t1 >> 1) ^ t2;
}
/**
* S-box for m = 1
*/
static void sbox_layer_1_uint64(uint64_t* d) {
*d = sbox_layer_1_bitsliced_uint64(*d);
}
#endif
#if defined(WITH_LOWMC_128_128_20)
#include "lowmc_128_128_20.h"
#endif
@ -84,16 +58,8 @@ static void sbox_layer_1_uint64(uint64_t* d) {
#if defined(WITH_LOWMC_256_256_38)
#include "lowmc_256_256_38.h"
#endif
#if defined(WITH_LOWMC_128_128_182)
#include "lowmc_128_128_182.h"
#endif
#if defined(WITH_LOWMC_192_192_284)
#include "lowmc_192_192_284.h"
#endif
#if defined(WITH_LOWMC_256_256_363)
#include "lowmc_256_256_363.h"
#endif
#if !defined(NO_UINT64_FALLBACK)
// uint64 based implementation
#include "lowmc_fns_uint64_L1.h"
#define LOWMC lowmc_uint64_128
@ -108,6 +74,7 @@ static void sbox_layer_1_uint64(uint64_t* d) {
#undef LOWMC
#define LOWMC lowmc_uint64_256
#include "lowmc.c.i"
#endif
#if defined(WITH_OPT)
#if defined(WITH_SSE2) || defined(WITH_NEON)
@ -163,11 +130,7 @@ static void sbox_layer_1_uint64(uint64_t* d) {
#endif
lowmc_implementation_f lowmc_get_implementation(const lowmc_t* lowmc) {
#if defined(WITH_LOWMC_M1)
ASSUME(lowmc->m == 10 || lowmc->m == 1);
#else
ASSUME(lowmc->m == 10);
#endif
ASSUME(lowmc->n == 128 || lowmc->n == 192 || lowmc->n == 256);
#if defined(WITH_OPT)
@ -189,24 +152,6 @@ lowmc_implementation_f lowmc_get_implementation(const lowmc_t* lowmc) {
#endif
}
}
#if defined(WITH_LOWMC_M1)
if (lowmc->m == 1) {
switch (lowmc->n) {
#if defined(WITH_LOWMC_128_128_182)
case 128:
return lowmc_s256_128_1;
#endif
#if defined(WITH_LOWMC_192_192_284)
case 192:
return lowmc_s256_192_1;
#endif
#if defined(WITH_LOWMC_256_256_363)
case 256:
return lowmc_s256_256_1;
#endif
}
}
#endif
}
#endif
#if defined(WITH_SSE2) || defined(WITH_NEON)
@ -227,28 +172,11 @@ lowmc_implementation_f lowmc_get_implementation(const lowmc_t* lowmc) {
#endif
}
}
#if defined(WITH_LOWMC_M1)
if (lowmc->m == 1) {
switch (lowmc->n) {
#if defined(WITH_LOWMC_128_128_182)
case 128:
return lowmc_s128_128_1;
#endif
#if defined(WITH_LOWMC_192_192_284)
case 192:
return lowmc_s128_192_1;
#endif
#if defined(WITH_LOWMC_256_256_363)
case 256:
return lowmc_s128_256_1;
#endif
}
}
#endif
}
#endif
#endif
#if !defined(NO_UINT64_FALLBACK)
if (lowmc->m == 10) {
switch (lowmc->n) {
#if defined(WITH_LOWMC_128_128_20)
@ -266,23 +194,6 @@ lowmc_implementation_f lowmc_get_implementation(const lowmc_t* lowmc) {
}
}
#if defined(WITH_LOWMC_M1)
if (lowmc->m == 1) {
switch (lowmc->n) {
#if defined(WITH_LOWMC_128_128_182)
case 128:
return lowmc_uint64_128_1;
#endif
#if defined(WITH_LOWMC_192_192_284)
case 192:
return lowmc_uint64_192_1;
#endif
#if defined(WITH_LOWMC_256_256_363)
case 256:
return lowmc_uint64_256_1;
#endif
}
}
#endif
return NULL;
@ -290,11 +201,7 @@ lowmc_implementation_f lowmc_get_implementation(const lowmc_t* lowmc) {
#if defined(WITH_ZKBPP)
lowmc_store_implementation_f lowmc_store_get_implementation(const lowmc_t* lowmc) {
#if defined(WITH_LOWMC_M1)
ASSUME(lowmc->m == 10 || lowmc->m == 1);
#else
ASSUME(lowmc->m == 10);
#endif
ASSUME(lowmc->n == 128 || lowmc->n == 192 || lowmc->n == 256);
#if defined(WITH_OPT)
@ -316,24 +223,6 @@ lowmc_store_implementation_f lowmc_store_get_implementation(const lowmc_t* lowmc
#endif
}
}
#if defined(WITH_LOWMC_M1)
if (lowmc->m == 1) {
switch (lowmc->n) {
#if defined(WITH_LOWMC_128_128_182)
case 128:
return lowmc_s256_128_store_1;
#endif
#if defined(WITH_LOWMC_192_192_284)
case 192:
return lowmc_s256_192_store_1;
#endif
#if defined(WITH_LOWMC_256_256_363)
case 256:
return lowmc_s256_256_store_1;
#endif
}
}
#endif
}
#endif
#if defined(WITH_SSE2) || defined(WITH_NEON)
@ -354,28 +243,11 @@ lowmc_store_implementation_f lowmc_store_get_implementation(const lowmc_t* lowmc
#endif
}
}
#if defined(WITH_LOWMC_M1)
if (lowmc->m == 1) {
switch (lowmc->n) {
#if defined(WITH_LOWMC_128_128_182)
case 128:
return lowmc_s128_128_store_1;
#endif
#if defined(WITH_LOWMC_192_192_284)
case 192:
return lowmc_s128_192_store_1;
#endif
#if defined(WITH_LOWMC_256_256_363)
case 256:
return lowmc_s128_256_store_1;
#endif
}
}
#endif
}
#endif
#endif
#if !defined(NO_UINT64_FALLBACK)
if (lowmc->m == 10) {
switch (lowmc->n) {
#if defined(WITH_LOWMC_128_128_20)
@ -393,23 +265,6 @@ lowmc_store_implementation_f lowmc_store_get_implementation(const lowmc_t* lowmc
}
}
#if defined(WITH_LOWMC_M1)
if (lowmc->m == 1) {
switch (lowmc->n) {
#if defined(WITH_LOWMC_128_128_182)
case 128:
return lowmc_uint64_128_store_1;
#endif
#if defined(WITH_LOWMC_192_192_284)
case 192:
return lowmc_uint64_192_store_1;
#endif
#if defined(WITH_LOWMC_256_256_363)
case 256:
return lowmc_uint64_256_store_1;
#endif
}
}
#endif
return NULL;
@ -418,11 +273,7 @@ lowmc_store_implementation_f lowmc_store_get_implementation(const lowmc_t* lowmc
#if defined(WITH_KKW)
lowmc_compute_aux_implementation_f lowmc_compute_aux_get_implementation(const lowmc_t* lowmc) {
#if defined(WITH_LOWMC_M1)
ASSUME(lowmc->m == 10 || lowmc->m == 1);
#else
ASSUME(lowmc->m == 10);
#endif
ASSUME(lowmc->n == 128 || lowmc->n == 192 || lowmc->n == 256);
#if defined(WITH_OPT)
@ -468,6 +319,7 @@ lowmc_compute_aux_implementation_f lowmc_compute_aux_get_implementation(const lo
#endif
#endif
#if !defined(NO_UINT64_FALLBACK)
if (lowmc->m == 10) {
switch (lowmc->n) {
#if defined(WITH_LOWMC_128_128_20)
@ -484,6 +336,7 @@ lowmc_compute_aux_implementation_f lowmc_compute_aux_get_implementation(const lo
#endif
}
}
#endif
return NULL;
}

View File

@ -7,18 +7,10 @@
* SPDX-License-Identifier: MIT
*/
#if defined(LOWMC_INSTANCE_10)
#define LOWMC_INSTANCE LOWMC_INSTANCE_10
#if defined(LOWMC_INSTANCE)
#define LOWMC_M 10
#define LOWMC_R LOWMC_R_10
#define MUL_MC MUL_MC_10
#define ADDMUL_R ADDMUL_R_10
#define MUL_Z MUL_Z_10
#define MZD_SHUFFLE CONCAT(SHUFFLE, 30)
#define M_FIXED_10
#define N_LOWMC CONCAT(LOWMC, 10)
#define SBOX(x) sbox_layer_10_uint64(&BLOCK(x, 0)->w64[(LOWMC_N / (sizeof(word) * 8)) - 1])
#define XOR_MC XOR_MC_10
#include "lowmc_impl.c.i"
#if defined(WITH_ZKBPP)
@ -39,55 +31,11 @@
#include "lowmc_impl.c.i"
#endif
#undef LOWMC_INSTANCE
#undef LOWMC_M
#undef LOWMC_R
#undef MUL_MC
#undef ADDMUL_R
#undef MUL_Z
#undef MZD_SHUFFLE
#undef M_FIXED_10
#undef N_LOWMC
#undef RECORD_STATE
#undef PICNIC2_AUX_COMPUTATION
#undef SBOX
#undef XOR_MC
#endif
#if defined(WITH_LOWMC_M1) && defined(LOWMC_INSTANCE_1)
#define LOWMC_INSTANCE LOWMC_INSTANCE_1
#define LOWMC_M 1
#define LOWMC_R LOWMC_R_1
#define MUL_MC MUL_MC_1
#define ADDMUL_R ADDMUL_R_1
#define MUL_Z MUL_Z_1
#define MZD_SHUFFLE CONCAT(SHUFFLE, 3)
#define M_FIXED_1
#define N_LOWMC CONCAT(LOWMC, 1)
#define SBOX(x) sbox_layer_1_uint64(&BLOCK(x, 0)->w64[(LOWMC_N / (sizeof(word) * 8)) - 1])
#define XOR_MC XOR_MC_1
#include "lowmc_impl.c.i"
#if defined(WITH_ZKBPP)
#undef N_LOWMC
#define N_LOWMC CONCAT(LOWMC, store_1)
#define RECORD_STATE
#include "lowmc_impl.c.i"
#endif
#undef LOWMC_INSTANCE
#undef LOWMC_M
#undef LOWMC_R
#undef MUL_MC
#undef ADDMUL_R
#undef MUL_Z
#undef MZD_SHUFFLE
#undef M_FIXED_1
#undef N_LOWMC
#undef RECORD_STATE
#undef PICNIC2_AUX_COMPUTATION
#undef SBOX
#undef XOR_MC
#endif
// vim: ft=c

File diff suppressed because it is too large.

View File

@ -1,8 +0,0 @@
#ifndef LOWMC_128_128_182_H
#define LOWMC_128_128_182_H
#include "lowmc_pars.h"
extern const lowmc_t lowmc_128_128_182;
#endif

File diff suppressed because it is too large.

View File

@ -1,8 +0,0 @@
#ifndef LOWMC_192_192_284_H
#define LOWMC_192_192_284_H
#include "lowmc_pars.h"
extern const lowmc_t lowmc_192_192_284;
#endif

File diff suppressed because it is too large.

View File

@ -1,8 +0,0 @@
#ifndef LOWMC_256_256_363_H
#define LOWMC_256_256_363_H
#include "lowmc_pars.h"
extern const lowmc_t lowmc_256_256_363;
#endif

View File

@ -11,25 +11,17 @@
#define ADDMUL mzd_addmul_v_s128_128
#define MUL mzd_mul_v_s128_128
#define SHUFFLE mzd_shuffle_128
#define SHUFFLE mzd_shuffle_128_30
#define XOR mzd_xor_s128_128
#define COPY mzd_copy_s128_128
#define MUL_MC_1 mzd_mul_v_s128_128_640
#define MUL_MC_10 mzd_mul_v_s128_128_640
#define ADDMUL_R_1 mzd_addmul_v_s128_3_128
#define ADDMUL_R_10 mzd_addmul_v_s128_30_128
#define MUL_Z_1 mzd_mul_v_parity_uint64_128_3
#define MUL_Z_10 mzd_mul_v_parity_uint64_128_30
#define XOR_MC_1 mzd_xor_s128_640
#define XOR_MC_10 mzd_xor_s128_640
#define MUL_MC mzd_mul_v_s128_128_640
#define ADDMUL_R mzd_addmul_v_s128_30_128
#define MUL_Z mzd_mul_v_parity_uint64_128_30
#define XOR_MC mzd_xor_s128_640
#if defined(WITH_LOWMC_128_128_20)
#define LOWMC_INSTANCE_10 lowmc_128_128_20
#endif
#if defined(WITH_LOWMC_128_128_182)
#define LOWMC_INSTANCE_1 lowmc_128_128_182
#define LOWMC_INSTANCE lowmc_128_128_20
#endif
#define LOWMC_N LOWMC_L1_N
#define LOWMC_R_10 LOWMC_L1_R
#define LOWMC_R_1 LOWMC_L1_1_R
#define LOWMC_R LOWMC_L1_R

View File

@ -11,25 +11,17 @@
#define ADDMUL mzd_addmul_v_s128_192
#define MUL mzd_mul_v_s128_192
#define SHUFFLE mzd_shuffle_192
#define SHUFFLE mzd_shuffle_192_30
#define XOR mzd_xor_s128_256
#define COPY mzd_copy_s128_256
#define MUL_MC_1 mzd_mul_v_s128_192_896
#define MUL_MC_10 mzd_mul_v_s128_192_1024
#define ADDMUL_R_1 mzd_addmul_v_s128_3_192
#define ADDMUL_R_10 mzd_addmul_v_s128_30_192
#define MUL_Z_1 mzd_mul_v_parity_uint64_192_3
#define MUL_Z_10 mzd_mul_v_parity_uint64_192_30
#define XOR_MC_1 mzd_xor_s128_896
#define XOR_MC_10 mzd_xor_s128_1024
#define MUL_MC mzd_mul_v_s128_192_1024
#define ADDMUL_R mzd_addmul_v_s128_30_192
#define MUL_Z mzd_mul_v_parity_uint64_192_30
#define XOR_MC mzd_xor_s128_1024
#if defined(WITH_LOWMC_192_192_30)
#define LOWMC_INSTANCE_10 lowmc_192_192_30
#endif
#if defined(WITH_LOWMC_192_192_284)
#define LOWMC_INSTANCE_1 lowmc_192_192_284
#define LOWMC_INSTANCE lowmc_192_192_30
#endif
#define LOWMC_N LOWMC_L3_N
#define LOWMC_R_10 LOWMC_L3_R
#define LOWMC_R_1 LOWMC_L3_1_R
#define LOWMC_R LOWMC_L3_R

View File

@ -11,25 +11,17 @@
#define ADDMUL mzd_addmul_v_s128_256
#define MUL mzd_mul_v_s128_256
#define SHUFFLE mzd_shuffle_256
#define SHUFFLE mzd_shuffle_256_30
#define XOR mzd_xor_s128_256
#define COPY mzd_copy_s128_256
#define MUL_MC_1 mzd_mul_v_s128_256_1152
#define MUL_MC_10 mzd_mul_v_s128_256_1280
#define ADDMUL_R_1 mzd_addmul_v_s128_3_256
#define ADDMUL_R_10 mzd_addmul_v_s128_30_256
#define MUL_Z_1 mzd_mul_v_parity_uint64_256_3
#define MUL_Z_10 mzd_mul_v_parity_uint64_256_30
#define XOR_MC_1 mzd_xor_s128_1152
#define XOR_MC_10 mzd_xor_s128_1280
#define MUL_MC mzd_mul_v_s128_256_1280
#define ADDMUL_R mzd_addmul_v_s128_30_256
#define MUL_Z mzd_mul_v_parity_uint64_256_30
#define XOR_MC mzd_xor_s128_1280
#if defined(WITH_LOWMC_256_256_38)
#define LOWMC_INSTANCE_10 lowmc_256_256_38
#endif
#if defined(WITH_LOWMC_256_256_363)
#define LOWMC_INSTANCE_1 lowmc_256_256_363
#define LOWMC_INSTANCE lowmc_256_256_38
#endif
#define LOWMC_N LOWMC_L5_N
#define LOWMC_R_10 LOWMC_L5_R
#define LOWMC_R_1 LOWMC_L5_1_R
#define LOWMC_R LOWMC_L5_R

View File

@ -11,25 +11,17 @@
#define ADDMUL mzd_addmul_v_s256_128
#define MUL mzd_mul_v_s256_128
#define SHUFFLE mzd_shuffle_pext_128
#define SHUFFLE mzd_shuffle_pext_128_30
#define XOR mzd_xor_s256_128
#define COPY mzd_copy_s256_128
#define MUL_MC_1 mzd_mul_v_s256_128_768
#define MUL_MC_10 mzd_mul_v_s256_128_768
#define ADDMUL_R_1 mzd_addmul_v_s256_3_128
#define ADDMUL_R_10 mzd_addmul_v_s256_30_128
#define MUL_Z_1 mzd_mul_v_parity_uint64_128_3
#define MUL_Z_10 mzd_mul_v_parity_uint64_128_30
#define XOR_MC_1 mzd_xor_s256_768
#define XOR_MC_10 mzd_xor_s256_768
#define MUL_MC mzd_mul_v_s256_128_768
#define ADDMUL_R mzd_addmul_v_s256_30_128
#define MUL_Z mzd_mul_v_parity_uint64_128_30
#define XOR_MC mzd_xor_s256_768
#if defined(WITH_LOWMC_128_128_20)
#define LOWMC_INSTANCE_10 lowmc_128_128_20
#endif
#if defined(WITH_LOWMC_128_128_182)
#define LOWMC_INSTANCE_1 lowmc_128_128_182
#define LOWMC_INSTANCE lowmc_128_128_20
#endif
#define LOWMC_N LOWMC_L1_N
#define LOWMC_R_10 LOWMC_L1_R
#define LOWMC_R_1 LOWMC_L1_1_R
#define LOWMC_R LOWMC_L1_R

View File

@ -11,25 +11,17 @@
#define ADDMUL mzd_addmul_v_s256_192
#define MUL mzd_mul_v_s256_192
#define SHUFFLE mzd_shuffle_pext_192
#define SHUFFLE mzd_shuffle_pext_192_30
#define XOR mzd_xor_s256_256
#define COPY mzd_copy_s256_256
#define MUL_MC_1 mzd_mul_v_s256_192_1024
#define MUL_MC_10 mzd_mul_v_s256_192_1024
#define ADDMUL_R_1 mzd_addmul_v_s256_3_192
#define ADDMUL_R_10 mzd_addmul_v_s256_30_192
#define MUL_Z_1 mzd_mul_v_parity_uint64_192_3
#define MUL_Z_10 mzd_mul_v_parity_uint64_192_30
#define XOR_MC_1 mzd_xor_s256_1024
#define XOR_MC_10 mzd_xor_s256_1024
#define MUL_MC mzd_mul_v_s256_192_1024
#define ADDMUL_R mzd_addmul_v_s256_30_192
#define MUL_Z mzd_mul_v_parity_uint64_192_30
#define XOR_MC mzd_xor_s256_1024
#if defined(WITH_LOWMC_192_192_30)
#define LOWMC_INSTANCE_10 lowmc_192_192_30
#endif
#if defined(WITH_LOWMC_192_192_284)
#define LOWMC_INSTANCE_1 lowmc_192_192_284
#define LOWMC_INSTANCE lowmc_192_192_30
#endif
#define LOWMC_N LOWMC_L3_N
#define LOWMC_R_10 LOWMC_L3_R
#define LOWMC_R_1 LOWMC_L3_1_R
#define LOWMC_R LOWMC_L3_R

View File

@ -11,25 +11,17 @@
#define ADDMUL mzd_addmul_v_s256_256
#define MUL mzd_mul_v_s256_256
#define SHUFFLE mzd_shuffle_pext_256
#define SHUFFLE mzd_shuffle_pext_256_30
#define XOR mzd_xor_s256_256
#define COPY mzd_copy_s256_256
#define MUL_MC_1 mzd_mul_v_s256_256_1280
#define MUL_MC_10 mzd_mul_v_s256_256_1280
#define ADDMUL_R_1 mzd_addmul_v_s256_3_256
#define ADDMUL_R_10 mzd_addmul_v_s256_30_256
#define MUL_Z_1 mzd_mul_v_parity_uint64_256_3
#define MUL_Z_10 mzd_mul_v_parity_uint64_256_30
#define XOR_MC_1 mzd_xor_s256_1280
#define XOR_MC_10 mzd_xor_s256_1280
#define MUL_MC mzd_mul_v_s256_256_1280
#define ADDMUL_R mzd_addmul_v_s256_30_256
#define MUL_Z mzd_mul_v_parity_uint64_256_30
#define XOR_MC mzd_xor_s256_1280
#if defined(WITH_LOWMC_256_256_38)
#define LOWMC_INSTANCE_10 lowmc_256_256_38
#endif
#if defined(WITH_LOWMC_256_256_363)
#define LOWMC_INSTANCE_1 lowmc_256_256_363
#define LOWMC_INSTANCE lowmc_256_256_38
#endif
#define LOWMC_N LOWMC_L5_N
#define LOWMC_R_10 LOWMC_L5_R
#define LOWMC_R_1 LOWMC_L5_1_R
#define LOWMC_R LOWMC_L5_R

View File

@ -12,24 +12,16 @@
#define ADDMUL mzd_addmul_v_uint64_128
#define MUL mzd_mul_v_uint64_128
#define XOR mzd_xor_uint64_128
#define SHUFFLE mzd_shuffle_128
#define SHUFFLE mzd_shuffle_128_30
#define COPY mzd_copy_uint64_128
#define MUL_MC_1 mzd_mul_v_uint64_128_576
#define MUL_MC_10 mzd_mul_v_uint64_128_640
#define ADDMUL_R_1 mzd_addmul_v_uint64_3_128
#define ADDMUL_R_10 mzd_addmul_v_uint64_30_128
#define MUL_Z_1 mzd_mul_v_parity_uint64_128_3
#define MUL_Z_10 mzd_mul_v_parity_uint64_128_30
#define XOR_MC_1 mzd_xor_uint64_576
#define XOR_MC_10 mzd_xor_uint64_640
#define MUL_MC mzd_mul_v_uint64_128_640
#define ADDMUL_R mzd_addmul_v_uint64_30_128
#define MUL_Z mzd_mul_v_parity_uint64_128_30
#define XOR_MC mzd_xor_uint64_640
#define LOWMC_N LOWMC_L1_N
#define LOWMC_R_10 LOWMC_L1_R
#define LOWMC_R_1 LOWMC_L1_1_R
#if defined(WITH_LOWMC_128_128_20)
#define LOWMC_INSTANCE_10 lowmc_128_128_20
#endif
#if defined(WITH_LOWMC_128_128_182)
#define LOWMC_INSTANCE_1 lowmc_128_128_182
#define LOWMC_INSTANCE lowmc_128_128_20
#endif
#define LOWMC_N LOWMC_L1_N
#define LOWMC_R LOWMC_L1_R

View File

@ -11,25 +11,17 @@
#define ADDMUL mzd_addmul_v_uint64_192
#define MUL mzd_mul_v_uint64_192
#define SHUFFLE mzd_shuffle_192
#define SHUFFLE mzd_shuffle_192_30
#define XOR mzd_xor_uint64_192
#define COPY mzd_copy_uint64_192
#define MUL_MC_1 mzd_mul_v_uint64_192_896
#define MUL_MC_10 mzd_mul_v_uint64_192_960
#define ADDMUL_R_1 mzd_addmul_v_uint64_3_192
#define ADDMUL_R_10 mzd_addmul_v_uint64_30_192
#define MUL_Z_1 mzd_mul_v_parity_uint64_192_3
#define MUL_Z_10 mzd_mul_v_parity_uint64_192_30
#define XOR_MC_1 mzd_xor_uint64_896
#define XOR_MC_10 mzd_xor_uint64_960
#define MUL_MC mzd_mul_v_uint64_192_960
#define ADDMUL_R mzd_addmul_v_uint64_30_192
#define MUL_Z mzd_mul_v_parity_uint64_192_30
#define XOR_MC mzd_xor_uint64_960
#define LOWMC_N LOWMC_L3_N
#define LOWMC_R_10 LOWMC_L3_R
#define LOWMC_R_1 LOWMC_L3_1_R
#if defined(WITH_LOWMC_192_192_30)
#define LOWMC_INSTANCE_10 lowmc_192_192_30
#endif
#if defined(WITH_LOWMC_192_192_284)
#define LOWMC_INSTANCE_1 lowmc_192_192_284
#define LOWMC_INSTANCE lowmc_192_192_30
#endif
#define LOWMC_N LOWMC_L3_N
#define LOWMC_R LOWMC_L3_R

View File

@ -11,25 +11,17 @@
#define ADDMUL mzd_addmul_v_uint64_256
#define MUL mzd_mul_v_uint64_256
#define SHUFFLE mzd_shuffle_256
#define SHUFFLE mzd_shuffle_256_30
#define XOR mzd_xor_uint64_256
#define COPY mzd_copy_uint64_256
#define MUL_MC_1 mzd_mul_v_uint64_256_1152
#define MUL_MC_10 mzd_mul_v_uint64_256_1216
#define ADDMUL_R_1 mzd_addmul_v_uint64_3_256
#define ADDMUL_R_10 mzd_addmul_v_uint64_30_256
#define MUL_Z_1 mzd_mul_v_parity_uint64_256_3
#define MUL_Z_10 mzd_mul_v_parity_uint64_256_30
#define XOR_MC_1 mzd_xor_uint64_1152
#define XOR_MC_10 mzd_xor_uint64_1216
#define MUL_MC mzd_mul_v_uint64_256_1216
#define ADDMUL_R mzd_addmul_v_uint64_30_256
#define MUL_Z mzd_mul_v_parity_uint64_256_30
#define XOR_MC mzd_xor_uint64_1216
#define LOWMC_N LOWMC_L5_N
#define LOWMC_R_10 LOWMC_L5_R
#define LOWMC_R_1 LOWMC_L5_1_R
#if defined(WITH_LOWMC_256_256_38)
#define LOWMC_INSTANCE_10 lowmc_256_256_38
#endif
#if defined(WITH_LOWMC_256_256_363)
#define LOWMC_INSTANCE_1 lowmc_256_256_363
#define LOWMC_INSTANCE lowmc_256_256_38
#endif
#define LOWMC_N LOWMC_L5_N
#define LOWMC_R LOWMC_L5_R

View File

@ -9,19 +9,13 @@
#undef ADDMUL
#undef COPY
#undef LOWMC_INSTANCE_1
#undef LOWMC_INSTANCE_10
#undef LOWMC_INSTANCE
#undef LOWMC_N
#undef LOWMC_R_1
#undef LOWMC_R_10
#undef LOWMC_R
#undef MUL
#undef MUL_MC_1
#undef MUL_MC_10
#undef ADDMUL_R_1
#undef ADDMUL_R_10
#undef MUL_Z_1
#undef MUL_Z_10
#undef MUL_MC
#undef ADDMUL_R
#undef MUL_Z
#undef SHUFFLE
#undef XOR
#undef XOR_MC_1
#undef XOR_MC_10
#undef XOR_MC

View File

@ -7,9 +7,6 @@
* SPDX-License-Identifier: MIT
*/
#if defined(OPTIMIZED_LINEAR_LAYER_EVALUATION) && !defined(M_FIXED_1) && !defined(M_FIXED_10)
#error "OLLE is only implemented for 1 or 10 Sboxes"
#endif
#if defined(FN_ATTR)
FN_ATTR
@ -26,11 +23,7 @@ static void N_LOWMC(lowmc_key_t const* lowmc_key, mzd_local_t const* p, mzd_loca
mzd_local_t x[((LOWMC_N) + 255) / 256];
mzd_local_t y[((LOWMC_N) + 255) / 256];
#if defined(REDUCED_ROUND_KEY_COMPUTATION)
#if defined(M_FIXED_10)
mzd_local_t nl_part[(LOWMC_R * 32 + 255) / 256];
#elif defined(M_FIXED_1)
mzd_local_t nl_part[(((LOWMC_R + 20) / 21) * 64 + 255) / 256];
#endif
mzd_local_t nl_part[(LOWMC_R * 32 + 255) / 256];
#if defined(OPTIMIZED_LINEAR_LAYER_EVALUATION) // LOWMC_OPT=OLLE
#if defined(PICNIC2_AUX_COMPUTATION)
@ -56,27 +49,16 @@ static void N_LOWMC(lowmc_key_t const* lowmc_key, mzd_local_t const* p, mzd_loca
SBOX(x);
#endif
#if defined(M_FIXED_10)
const word nl = CONST_BLOCK(nl_part, i >> 3)->w64[(i & 0x7) >> 1];
BLOCK(x, 0)->w64[(LOWMC_N) / (sizeof(word) * 8) - 1] ^=
(nl << (1 - (i & 1)) * 32) & WORD_C(0xFFFFFFFF00000000);
#elif defined(M_FIXED_1)
const word nl = CONST_BLOCK(nl_part, i / (4 * 21))->w64[(i % (4 * 21)) / 21];
BLOCK(x, 0)->w64[(LOWMC_N) / (sizeof(word) * 8) - 1] ^=
(nl << ((20 - (i % 21)) * 3)) & WORD_C(0xE000000000000000);
#endif
MUL_Z(y, x, round->z_matrix);
MZD_SHUFFLE(x, round->r_mask);
SHUFFLE(x, round->r_mask);
ADDMUL_R(y, x, round->r_matrix);
#if defined(M_FIXED_10)
BLOCK(x, 0)->w64[(LOWMC_N) / (sizeof(word) * 8) - 1] &=
WORD_C(0x00000003FFFFFFFF); // clear nl part
#elif defined(M_FIXED_1)
BLOCK(x, 0)->w64[(LOWMC_N) / (sizeof(word) * 8) - 1] &=
WORD_C(0x1FFFFFFFFFFFFFFF); // clear nl part
#endif
XOR(x, y, x);
}
#if defined(RECORD_STATE)
@ -88,15 +70,9 @@ static void N_LOWMC(lowmc_key_t const* lowmc_key, mzd_local_t const* p, mzd_loca
SBOX(x);
unsigned int i = (LOWMC_R - 1);
#if defined(M_FIXED_10)
const word nl = CONST_BLOCK(nl_part, i >> 3)->w64[(i & 0x7) >> 1];
BLOCK(x, 0)->w64[(LOWMC_N) / (sizeof(word) * 8) - 1] ^=
(nl << (1 - (i & 1)) * 32) & WORD_C(0xFFFFFFFF00000000);
#elif defined(M_FIXED_1)
const word nl = CONST_BLOCK(nl_part, i / (4 * 21))->w64[(i % (4 * 21)) / 21];
BLOCK(x, 0)->w64[(LOWMC_N) / (sizeof(word) * 8) - 1] ^=
(nl << ((20 - (i % 21)) * 3)) & WORD_C(0xE000000000000000);
#endif
MUL(y, x, LOWMC_INSTANCE.zr_matrix);
COPY(x, y);
#endif
@ -122,15 +98,9 @@ static void N_LOWMC(lowmc_key_t const* lowmc_key, mzd_local_t const* p, mzd_loca
SBOX(x);
#endif
#if defined(M_FIXED_10)
const word nl = CONST_BLOCK(nl_part, i >> 3)->w64[(i & 0x7) >> 1];
BLOCK(x, 0)->w64[(LOWMC_N) / (sizeof(word) * 8) - 1] ^=
(i & 1) ? (nl & WORD_C(0xFFFFFFFF00000000)) : (nl << 32);
#elif defined(M_FIXED_1)
const word nl = CONST_BLOCK(nl_part, i / (4 * 21))->w64[(i % (4 * 21)) / 21];
BLOCK(x, 0)->w64[(LOWMC_N) / (sizeof(word) * 8) - 1] ^=
(nl << ((20 - (i % 21)) * 3)) & WORD_C(0xE000000000000000);
#endif
MUL(y, x, round->l_matrix);
COPY(x, y);
}

View File

@ -20,6 +20,8 @@ typedef mzd_local_t lowmc_key_t;
#define MAX_LOWMC_BLOCK_SIZE_BITS (MAX_LOWMC_BLOCK_SIZE * 8)
#define MAX_LOWMC_KEY_SIZE MAX_LOWMC_BLOCK_SIZE
#define MAX_LOWMC_KEY_SIZE_BITS (MAX_LOWMC_KEY_SIZE * 8)
#define MAX_LOWMC_ROUNDS 38
#define MAX_LOWMC_SBOXES 10
/**
* Masks for 10 S-boxes.
@ -29,14 +31,6 @@ typedef mzd_local_t lowmc_key_t;
#define MASK_X2I UINT64_C(0x9249249000000000)
#define MASK_MASK UINT64_C(0x00000003ffffffff)
/**
* Masks for 1 S-box.
*/
#define MASK_X0I_1 UINT64_C(0x2000000000000000)
#define MASK_X1I_1 UINT64_C(0x4000000000000000)
#define MASK_X2I_1 UINT64_C(0x8000000000000000)
#define MASK_MASK_1 UINT64_C(0x1fffffffffffffff)
/**
* LowMC instances
*/
@ -55,21 +49,6 @@ typedef mzd_local_t lowmc_key_t;
#define LOWMC_L5_K LOWMC_L5_N
#define LOWMC_L5_R 38
#define LOWMC_L1_1_N 128
#define LOWMC_L1_1_M 1
#define LOWMC_L1_1_K LOWMC_L1_1_N
#define LOWMC_L1_1_R 182
#define LOWMC_L3_1_N 192
#define LOWMC_L3_1_M 1
#define LOWMC_L3_1_K LOWMC_L3_1_N
#define LOWMC_L3_1_R 284
#define LOWMC_L5_1_N 256
#define LOWMC_L5_1_M 1
#define LOWMC_L5_1_K LOWMC_L5_1_N
#define LOWMC_L5_1_R 363
typedef struct {
#if !defined(REDUCED_ROUND_KEY_COMPUTATION)
const mzd_local_t* k_matrix;

View File

@ -21,11 +21,11 @@
#endif
/* compatibility with clang and other compilers */
#ifndef __has_attribute
#if !defined(__has_attribute)
#define __has_attribute(a) 0
#endif
#ifndef __has_builtin
#if !defined(__has_builtin)
#define __has_builtin(b) 0
#endif
@ -83,8 +83,10 @@
/* note that C11's alignas will only do the job once DR 444 is implemented */
#if GNUC_CHECK(4, 9) || __has_attribute(aligned)
#define ATTR_ALIGNED(i) __attribute__((aligned((i))))
#define HAVE_USEFUL_ATTR_ALIGNED
/* #elif defined(_MSC_VER)
#define ATTR_ALIGNED(i) __declspec(align((i))) */
#define ATTR_ALIGNED(i) __declspec(align((i)))
#define HAVE_USEFUL_ATTR_ALIGNED */
#else
#define ATTR_ALIGNED(i)
#endif
@ -103,7 +105,7 @@
/* assume aligned builtin */
#if GNUC_CHECK(4, 9) || __has_builtin(__builtin_assume_aligned)
#define ASSUME_ALIGNED(p, a) __builtin_assume_aligned((p), (a))
#elif defined(UNREACHABLE)
#elif defined(UNREACHABLE) && defined(HAVE_USEFUL_ATTR_ALIGNED)
#define ASSUME_ALIGNED(p, a) (((((uintptr_t)(p)) % (a)) == 0) ? (p) : (UNREACHABLE, (p)))
#else
#define ASSUME_ALIGNED(p, a) (p)
@ -249,4 +251,10 @@ static inline uint32_t ceil_log2(uint32_t x) {
return 32 - clz(x - 1);
}
#if defined(__WIN32__)
#define SIZET_FMT "%Iu"
#else
#define SIZET_FMT "%zu"
#endif
#endif
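A brief hedged illustration of the additions visible above (the helper function is hypothetical):

#include <stdio.h>
#include <stddef.h>

static void print_alloc(size_t alloc_size) {
  /* SIZET_FMT expands to "%Iu" on __WIN32__ and "%zu" elsewhere. */
  printf("allocated " SIZET_FMT " bytes\n", alloc_size);
}
/* For the earlier ceil_log2: ceil_log2(5) == 32 - clz(4) == 32 - 29 == 3. */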

View File

@ -141,45 +141,6 @@ static void mpc_and_verify_uint64(uint64_t* res, uint64_t const* first, uint64_t
} \
} while (0)
#define bitsliced_step_1_uint64_1(sc) \
uint64_t r0m[sc]; \
uint64_t r0s[sc]; \
uint64_t r1m[sc]; \
uint64_t r1s[sc]; \
uint64_t r2m[sc]; \
uint64_t x0s[sc]; \
uint64_t x1s[sc]; \
uint64_t x2m[sc]; \
do { \
for (unsigned int m = 0; m < (sc); ++m) { \
const uint64_t inm = in[m]; \
const uint64_t rvecm = rvec[m]; \
\
x0s[m] = (inm & MASK_X0I_1) << 2; \
x1s[m] = (inm & MASK_X1I_1) << 1; \
x2m[m] = inm & MASK_X2I_1; \
\
r0m[m] = rvecm & MASK_X0I_1; \
r1m[m] = rvecm & MASK_X1I_1; \
r2m[m] = rvecm & MASK_X2I_1; \
\
r0s[m] = r0m[m] << 2; \
r1s[m] = r1m[m] << 1; \
} \
} while (0)
#define bitsliced_step_2_uint64_1(sc) \
do { \
for (unsigned int m = 0; m < (sc); ++m) { \
const uint64_t tmp1 = r2m[m] ^ x0s[m]; \
const uint64_t tmp2 = x0s[m] ^ x1s[m]; \
const uint64_t tmp3 = tmp2 ^ r1m[m]; \
const uint64_t tmp4 = tmp2 ^ r0m[m] ^ x2m[m]; \
\
in[m] = (in[m] & MASK_MASK_1) ^ (tmp4) ^ (tmp1 >> 2) ^ (tmp3 >> 1); \
} \
} while (0)
static void mpc_sbox_layer_bitsliced_uint64_10(uint64_t* in, view_t* view, uint64_t const* rvec) {
bitsliced_step_1_uint64_10(SC_PROOF);
@ -201,29 +162,6 @@ static void mpc_sbox_layer_bitsliced_verify_uint64_10(uint64_t* in, view_t* view
bitsliced_step_2_uint64_10(SC_VERIFY);
}
#if defined(WITH_LOWMC_M1)
static void mpc_sbox_layer_bitsliced_uint64_1(uint64_t* in, view_t* view, uint64_t const* rvec) {
bitsliced_step_1_uint64_1(SC_PROOF);
mpc_and_uint64(r0m, x0s, x1s, r2m, view, 0);
mpc_and_uint64(r2m, x1s, x2m, r1s, view, 1);
mpc_and_uint64(r1m, x0s, x2m, r0s, view, 2);
bitsliced_step_2_uint64_1(SC_PROOF - 1);
}
static void mpc_sbox_layer_bitsliced_verify_uint64_1(uint64_t* in, view_t* view,
uint64_t const* rvec) {
bitsliced_step_1_uint64_1(SC_VERIFY);
mpc_and_verify_uint64(r0m, x0s, x1s, r2m, view, MASK_X2I_1, 0);
mpc_and_verify_uint64(r2m, x1s, x2m, r1s, view, MASK_X2I_1, 1);
mpc_and_verify_uint64(r1m, x0s, x2m, r0s, view, MASK_X2I_1, 2);
bitsliced_step_2_uint64_1(SC_VERIFY);
}
#endif
#if defined(WITH_LOWMC_128_128_20)
#include "lowmc_128_128_20.h"
#endif
@ -233,15 +171,6 @@ static void mpc_sbox_layer_bitsliced_verify_uint64_1(uint64_t* in, view_t* view,
#if defined(WITH_LOWMC_256_256_38)
#include "lowmc_256_256_38.h"
#endif
#if defined(WITH_LOWMC_128_128_182)
#include "lowmc_128_128_182.h"
#endif
#if defined(WITH_LOWMC_192_192_284)
#include "lowmc_192_192_284.h"
#endif
#if defined(WITH_LOWMC_256_256_363)
#include "lowmc_256_256_363.h"
#endif
#define SBOX_uint64(sbox, y, x, views, r, n, shares, shares2) \
do { \
@ -259,6 +188,7 @@ static void mpc_sbox_layer_bitsliced_verify_uint64_1(uint64_t* in, view_t* view,
#define R_uint64 const uint64_t* r = rvec[i].t
#if !defined(NO_UINT64_FALLBACK)
// uint64 based implementation
#include "lowmc_fns_uint64_L1.h"
#define SIGN mpc_lowmc_call_uint64_128
@ -274,6 +204,7 @@ static void mpc_sbox_layer_bitsliced_verify_uint64_1(uint64_t* in, view_t* view,
#define SIGN mpc_lowmc_call_uint64_256
#define VERIFY mpc_lowmc_call_verify_uint64_256
#include "mpc_lowmc.c.i"
#endif
#if defined(WITH_OPT)
#if defined(WITH_SSE2) || defined(WITH_NEON)
@ -329,11 +260,7 @@ static void mpc_sbox_layer_bitsliced_verify_uint64_1(uint64_t* in, view_t* view,
#endif
zkbpp_lowmc_implementation_f get_zkbpp_lowmc_implementation(const lowmc_t* lowmc) {
#if defined(WITH_LOWMC_M1)
ASSUME(lowmc->m == 10 || lowmc->m == 1);
#else
ASSUME(lowmc->m == 10);
#endif
ASSUME(lowmc->n == 128 || lowmc->n == 192 || lowmc->n == 256);
#if defined(WITH_OPT)
@ -355,24 +282,6 @@ zkbpp_lowmc_implementation_f get_zkbpp_lowmc_implementation(const lowmc_t* lowmc
#endif
}
}
#if defined(WITH_LOWMC_M1)
if (lowmc->m == 1) {
switch (lowmc->n) {
#if defined(WITH_LOWMC_128_128_182)
case 128:
return mpc_lowmc_call_s256_128_1;
#endif
#if defined(WITH_LOWMC_192_192_284)
case 192:
return mpc_lowmc_call_s256_192_1;
#endif
#if defined(WITH_LOWMC_256_256_363)
case 256:
return mpc_lowmc_call_s256_256_1;
#endif
}
}
#endif
}
#endif
#if defined(WITH_SSE2) || defined(WITH_NEON)
@ -393,28 +302,11 @@ zkbpp_lowmc_implementation_f get_zkbpp_lowmc_implementation(const lowmc_t* lowmc
#endif
}
}
#if defined(WITH_LOWMC_M1)
if (lowmc->m == 1) {
switch (lowmc->n) {
#if defined(WITH_LOWMC_128_128_182)
case 128:
return mpc_lowmc_call_s128_128_1;
#endif
#if defined(WITH_LOWMC_192_192_284)
case 192:
return mpc_lowmc_call_s128_192_1;
#endif
#if defined(WITH_LOWMC_256_256_363)
case 256:
return mpc_lowmc_call_s128_256_1;
#endif
}
}
#endif
}
#endif
#endif
#if !defined(NO_UINT64_FALLBACK)
if (lowmc->m == 10) {
switch (lowmc->n) {
#if defined(WITH_LOWMC_128_128_20)
@ -432,34 +324,13 @@ zkbpp_lowmc_implementation_f get_zkbpp_lowmc_implementation(const lowmc_t* lowmc
}
}
#if defined(WITH_LOWMC_M1)
if (lowmc->m == 1) {
switch (lowmc->n) {
#if defined(WITH_LOWMC_128_128_182)
case 128:
return mpc_lowmc_call_uint64_128_1;
#endif
#if defined(WITH_LOWMC_192_192_284)
case 192:
return mpc_lowmc_call_uint64_192_1;
#endif
#if defined(WITH_LOWMC_256_256_363)
case 256:
return mpc_lowmc_call_uint64_256_1;
#endif
}
}
#endif
return NULL;
}
zkbpp_lowmc_verify_implementation_f get_zkbpp_lowmc_verify_implementation(const lowmc_t* lowmc) {
#if defined(WITH_LOWMC_M1)
ASSUME(lowmc->m == 10 || lowmc->m == 1);
#else
ASSUME(lowmc->m == 10);
#endif
ASSUME(lowmc->n == 128 || lowmc->n == 192 || lowmc->n == 256);
#if defined(WITH_OPT)
@ -481,24 +352,6 @@ zkbpp_lowmc_verify_implementation_f get_zkbpp_lowmc_verify_implementation(const
#endif
}
}
#if defined(WITH_LOWMC_M1)
if (lowmc->m == 1) {
switch (lowmc->n) {
#if defined(WITH_LOWMC_128_128_182)
case 128:
return mpc_lowmc_call_verify_s256_128_1;
#endif
#if defined(WITH_LOWMC_192_192_284)
case 192:
return mpc_lowmc_call_verify_s256_192_1;
#endif
#if defined(WITH_LOWMC_256_256_363)
case 256:
return mpc_lowmc_call_verify_s256_256_1;
#endif
}
}
#endif
}
#endif
#if defined(WITH_SSE2) || defined(WITH_NEON)
@ -519,24 +372,6 @@ zkbpp_lowmc_verify_implementation_f get_zkbpp_lowmc_verify_implementation(const
#endif
}
}
#if defined(WITH_LOWMC_M1)
if (lowmc->m == 1) {
switch (lowmc->n) {
#if defined(WITH_LOWMC_128_128_182)
case 128:
return mpc_lowmc_call_verify_s128_128_1;
#endif
#if defined(WITH_LOWMC_192_192_284)
case 192:
return mpc_lowmc_call_verify_s128_192_1;
#endif
#if defined(WITH_LOWMC_256_256_363)
case 256:
return mpc_lowmc_call_verify_s128_256_1;
#endif
}
}
#endif
}
#endif
#if defined(WITH_NEON)
@ -557,28 +392,11 @@ zkbpp_lowmc_verify_implementation_f get_zkbpp_lowmc_verify_implementation(const
#endif
}
}
#if defined(WITH_LOWMC_M1)
if (lowmc->m == 1) {
switch (lowmc->n) {
#if defined(WITH_LOWMC_128_128_182)
case 128:
return mpc_lowmc_call_verify_s128_128_1;
#endif
#if defined(WITH_LOWMC_192_192_284)
case 192:
return mpc_lowmc_call_verify_s128_192_1;
#endif
#if defined(WITH_LOWMC_256_256_363)
case 256:
return mpc_lowmc_call_verify_s128_256_1;
#endif
}
}
#endif
}
#endif
#endif
#if !defined(NO_UINT64_FALLBACK)
if (lowmc->m == 10) {
switch (lowmc->n) {
#if defined(WITH_LOWMC_128_128_20)
@ -595,29 +413,12 @@ zkbpp_lowmc_verify_implementation_f get_zkbpp_lowmc_verify_implementation(const
#endif
}
}
#if defined(WITH_LOWMC_M1)
if (lowmc->m == 1) {
switch (lowmc->n) {
#if defined(WITH_LOWMC_128_128_182)
case 128:
return mpc_lowmc_call_verify_uint64_128_1;
#endif
#if defined(WITH_LOWMC_192_192_284)
case 192:
return mpc_lowmc_call_verify_uint64_192_1;
#endif
#if defined(WITH_LOWMC_256_256_363)
case 256:
return mpc_lowmc_call_verify_uint64_256_1;
#endif
}
}
#endif
return NULL;
}
#if !defined(NO_UINT64_FALLBACK)
static void mzd_share_uint64_128(mzd_local_t* r, const mzd_local_t* v1, const mzd_local_t* v2,
const mzd_local_t* v3) {
mzd_xor_uint64_128(r, v1, v2);
@ -635,6 +436,7 @@ static void mzd_share_uint64_256(mzd_local_t* r, const mzd_local_t* v1, const mz
mzd_xor_uint64_256(r, v1, v2);
mzd_xor_uint64_256(r, r, v3);
}
#endif
#if defined(WITH_OPT)
#if defined(WITH_SSE2) || defined(WITH_NEON)
@ -702,6 +504,7 @@ zkbpp_share_implementation_f get_zkbpp_share_implentation(const lowmc_t* lowmc)
#endif
#endif
#if !defined(NO_UINT64_FALLBACK)
switch (lowmc->n) {
case 128:
return mzd_share_uint64_128;
@ -710,4 +513,5 @@ zkbpp_share_implementation_f get_zkbpp_share_implentation(const lowmc_t* lowmc)
default:
return mzd_share_uint64_256;
}
#endif
}

View File

@ -7,50 +7,14 @@
* SPDX-License-Identifier: MIT
*/
#if defined(LOWMC_INSTANCE_10)
#define M_FIXED_10
#if defined(LOWMC_INSTANCE)
#define N_SIGN CONCAT(SIGN, 10)
#define N_VERIFY CONCAT(VERIFY, 10)
#define MZD_SHUFFLE CONCAT(SHUFFLE, 30)
#define ADDMUL_R ADDMUL_R_10
#define MUL_Z MUL_Z_10
#define XOR_MC XOR_MC_10
#define MUL_MC MUL_MC_10
#define LOWMC_R LOWMC_R_10
#define LOWMC_INSTANCE LOWMC_INSTANCE_10
#include "mpc_lowmc_impl.c.i"
#undef ADDMUL_R
#undef MUL_Z
#undef LOWMC_R
#undef LOWMC_INSTANCE
#undef M_FIXED_10
#undef MZD_SHUFFLE
#undef XOR_MC
#undef MUL_MC
#endif
#if defined(WITH_LOWMC_M1) && defined(LOWMC_INSTANCE_1)
#define M_FIXED_1
#define N_SIGN CONCAT(SIGN, 1)
#define N_VERIFY CONCAT(VERIFY, 1)
#define MZD_SHUFFLE CONCAT(SHUFFLE, 3)
#define ADDMUL_R ADDMUL_R_1
#define MUL_Z MUL_Z_1
#define XOR_MC XOR_MC_1
#define MUL_MC MUL_MC_1
#define LOWMC_R LOWMC_R_1
#define LOWMC_INSTANCE LOWMC_INSTANCE_1
#include "mpc_lowmc_impl.c.i"
#undef ADDMUL_R
#undef MUL_Z
#undef LOWMC_R
#undef LOWMC_INSTANCE
#undef M_FIXED_1
#undef MZD_SHUFFLE
#undef XOR_MC
#undef MUL_MC
#endif
#undef N_SIGN
#undef N_VERIFY
#undef SIGN
#undef VERIFY

View File

@ -10,27 +10,17 @@
#define RANDTAPE R_uint64
#define SBOX SBOX_uint64
#if defined(M_FIXED_10)
#define LOWMC_M 10
#undef SBOX_SIGN
#undef SBOX_VERIFY
#define LOWMC_M 10
#define SBOX_SIGN mpc_sbox_layer_bitsliced_uint64_10
#define SBOX_VERIFY mpc_sbox_layer_bitsliced_verify_uint64_10
#elif defined(M_FIXED_1)
#define LOWMC_M 1
#undef SBOX_SIGN
#undef SBOX_VERIFY
#define SBOX_SIGN mpc_sbox_layer_bitsliced_uint64_1
#define SBOX_VERIFY mpc_sbox_layer_bitsliced_verify_uint64_1
#endif
#if defined(FN_ATTR)
FN_ATTR
#endif
static void N_SIGN(mzd_local_t const* p, view_t* views, in_out_shares_t* in_out_shares,
rvec_t* rvec, recorded_state_t* recorded_state) {
rvec_t* rvec, recorded_state_t* recorded_state) {
#define reduced_shares (SC_PROOF - 1)
#define MPC_LOOP_CONST_C(function, result, first, second, sc, c) \
MPC_LOOP_CONST_C_0(function, result, first, second, sc)

View File

@ -7,17 +7,9 @@
* SPDX-License-Identifier: MIT
*/
#if defined(OPTIMIZED_LINEAR_LAYER_EVALUATION) && !defined(M_FIXED_1) && !defined(M_FIXED_10)
#error "OLLE is only implemented for 1 or 10 Sboxes"
#endif
lowmc_round_t const* round = LOWMC_INSTANCE.rounds;
#if defined(REDUCED_ROUND_KEY_COMPUTATION)
#if defined(M_FIXED_10)
mzd_local_t nl_part[reduced_shares][(LOWMC_R * 32 + 255) / 256];
#elif defined(M_FIXED_1)
mzd_local_t nl_part[reduced_shares][(((LOWMC_R + 20) / 21) * 64 + 255) / 256];
#endif
#if defined(OPTIMIZED_LINEAR_LAYER_EVALUATION)
MPC_LOOP_CONST_C(XOR, x, x, LOWMC_INSTANCE.precomputed_constant_linear, reduced_shares, ch);
MPC_LOOP_CONST(MUL_MC, nl_part, lowmc_key,
@ -30,28 +22,19 @@ lowmc_round_t const* round = LOWMC_INSTANCE.rounds;
#endif
SBOX(sbox, y, x, views, r, LOWMC_N, shares, reduced_shares);
for (unsigned int k = 0; k < reduced_shares; ++k) {
#if defined(M_FIXED_10)
const word nl = CONST_BLOCK(nl_part[k], i >> 3)->w64[(i & 0x7) >> 1];
BLOCK(y[k], 0)->w64[(LOWMC_N) / (sizeof(word) * 8) - 1] ^=
(i & 1) ? (nl & WORD_C(0xFFFFFFFF00000000)) : (nl << 32);
#elif defined(M_FIXED_1)
const word nl = CONST_BLOCK(nl_part[k], i / (4 * 21))->w64[(i % (4 * 21)) / 21];
BLOCK(y[k], 0)->w64[(LOWMC_N) / (sizeof(word) * 8) - 1] ^= (nl << ((20-(i%21))*3)) & WORD_C(0xE000000000000000);
#endif
}
MPC_LOOP_CONST(MUL_Z, x, y, round->z_matrix, reduced_shares);
for(unsigned int k = 0; k < reduced_shares; ++k) {
MZD_SHUFFLE(y[k], round->r_mask);
SHUFFLE(y[k], round->r_mask);
}
MPC_LOOP_CONST(ADDMUL_R, x, y, round->r_matrix, reduced_shares);
for(unsigned int k = 0; k < reduced_shares; ++k) {
#if defined(M_FIXED_10)
BLOCK(y[k], 0)->w64[(LOWMC_N) / (sizeof(word) * 8) - 1] &= WORD_C(0x00000003FFFFFFFF); //clear nl part
#elif defined(M_FIXED_1)
BLOCK(y[k], 0)->w64[(LOWMC_N) / (sizeof(word) * 8) - 1] &= WORD_C(0x1FFFFFFFFFFFFFFF); //clear nl part
#endif
}
MPC_LOOP_SHARED(XOR, x, x, y, reduced_shares);
}
@ -63,14 +46,9 @@ lowmc_round_t const* round = LOWMC_INSTANCE.rounds;
SBOX(sbox, y, x, views, r, LOWMC_N, shares, reduced_shares);
for (unsigned int k = 0; k < reduced_shares; ++k) {
#if defined(M_FIXED_10)
const word nl = CONST_BLOCK(nl_part[k], i >> 3)->w64[(i & 0x7) >> 1];
BLOCK(y[k], 0)->w64[(LOWMC_N) / (sizeof(word) * 8) - 1] ^=
(i & 1) ? (nl & WORD_C(0xFFFFFFFF00000000)) : (nl << 32);
#elif defined(M_FIXED_1)
const word nl = CONST_BLOCK(nl_part[k], i / (4 * 21))->w64[(i % (4 * 21)) / 21];
BLOCK(y[k], 0)->w64[(LOWMC_N) / (sizeof(word) * 8) - 1] ^= (nl << ((20-(i%21))*3)) & WORD_C(0xE000000000000000);
#endif
}
MPC_LOOP_CONST(MUL, x, y, LOWMC_INSTANCE.zr_matrix, reduced_shares);
#else
@ -85,14 +63,9 @@ lowmc_round_t const* round = LOWMC_INSTANCE.rounds;
#endif
SBOX(sbox, y, x, views, r, LOWMC_N, shares, reduced_shares);
for (unsigned int k = 0; k < reduced_shares; ++k) {
#if defined(M_FIXED_10)
const word nl = CONST_BLOCK(nl_part[k], i >> 3)->w64[(i & 0x7) >> 1];
BLOCK(y[k], 0)->w64[(LOWMC_N) / (sizeof(word) * 8) - 1] ^=
(i & 1) ? (nl & WORD_C(0xFFFFFFFF00000000)) : (nl << 32);
#elif defined(M_FIXED_1)
const word nl = CONST_BLOCK(nl_part[k], i / (4 * 21))->w64[(i % (4 * 21)) / 21];
BLOCK(y[k], 0)->w64[(LOWMC_N) / (sizeof(word) * 8) - 1] ^= (nl << ((20-(i%21))*3)) & WORD_C(0xE000000000000000);
#endif
}
MPC_LOOP_CONST(MUL, x, y, round->l_matrix, reduced_shares);
}

View File

@ -39,7 +39,7 @@ static_assert(((sizeof(mzd_local_t) + 0x1f) & ~0x1f) == 32, "sizeof mzd_local_t
#endif
static const unsigned int align_bound = 128 / (8 * sizeof(word));
static uint32_t calculate_rowstride(uint32_t width) {
static size_t calculate_rowstride(size_t width) {
// As soon as we hit the AVX bound, use 32 byte alignment. Otherwise use 16
// byte alignment for SSE2 and 128 bit vectors.
if (width > align_bound) {
@ -49,7 +49,7 @@ static uint32_t calculate_rowstride(uint32_t width) {
}
}
static uint32_t calculate_width(uint32_t c) {
static size_t calculate_width(size_t c) {
return (c + sizeof(word) * 8 - 1) / (sizeof(word) * 8);
}
@ -62,8 +62,7 @@ static uint32_t calculate_width(uint32_t c) {
// memory block.
mzd_local_t* mzd_local_init_ex(uint32_t r, uint32_t c, bool clear) {
const uint32_t width = calculate_width(c);
const uint32_t rowstride = calculate_rowstride(width);
const size_t rowstride = calculate_rowstride(calculate_width(c));
const size_t buffer_size = r * rowstride * sizeof(word);
const size_t alloc_size = (buffer_size + 31) & ~31;
@ -84,8 +83,7 @@ void mzd_local_free(mzd_local_t* v) {
}
void mzd_local_init_multiple_ex(mzd_local_t** dst, size_t n, uint32_t r, uint32_t c, bool clear) {
const uint32_t width = calculate_width(c);
const uint32_t rowstride = calculate_rowstride(width);
const size_t rowstride = calculate_rowstride(calculate_width(c));
const size_t buffer_size = r * rowstride * sizeof(word);
const size_t size_per_elem = (buffer_size + 31) & ~31;
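The width/rowstride switch from uint32_t to size_t above leaves the padding math unchanged; a hedged model of what it computes (assuming the sub-AVX branch rounds up to a multiple of 2 words, as in the unchanged else branch):

#include <stddef.h>

static size_t rowstride_model(size_t width) {
  /* Mirrors calculate_rowstride: > 2 words -> 32-byte multiples, else 16-byte. */
  return (width > 2) ? ((width + 3) & ~(size_t)3) : ((width + 1) & ~(size_t)1);
}
/* rowstride_model(2) == 2 (128-bit state), rowstride_model(3) == 4 (192-bit state),
   rowstride_model(4) == 4 (256-bit state); buffer_size is then rounded up to a
   32-byte multiple via (buffer_size + 31) & ~31. */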
@ -193,25 +191,11 @@ void mzd_xor_s128_640(mzd_local_t* res, mzd_local_t const* first, mzd_local_t co
mm128_xor(CONST_BLOCK(first, 2)->w128[0], CONST_BLOCK(second, 2)->w128[0]);
}
ATTR_TARGET_S128
void mzd_xor_s128_896(mzd_local_t* res, mzd_local_t const* first, mzd_local_t const* second) {
mzd_xor_s128_blocks(BLOCK(res, 0), CONST_BLOCK(first, 0), CONST_BLOCK(second, 0), 3);
BLOCK(res, 3)->w128[0] =
mm128_xor(CONST_BLOCK(first, 3)->w128[0], CONST_BLOCK(second, 3)->w128[0]);
}
ATTR_TARGET_S128
void mzd_xor_s128_1024(mzd_local_t* res, mzd_local_t const* first, mzd_local_t const* second) {
mzd_xor_s128_blocks(BLOCK(res, 0), CONST_BLOCK(first, 0), CONST_BLOCK(second, 0), 4);
}
ATTR_TARGET_S128
void mzd_xor_s128_1152(mzd_local_t* res, mzd_local_t const* first, mzd_local_t const* second) {
mzd_xor_s128_blocks(BLOCK(res, 0), CONST_BLOCK(first, 0), CONST_BLOCK(second, 0), 4);
BLOCK(res, 4)->w128[0] =
mm128_xor(CONST_BLOCK(first, 4)->w128[0], CONST_BLOCK(second, 4)->w128[0]);
}
ATTR_TARGET_S128
void mzd_xor_s128_1280(mzd_local_t* res, mzd_local_t const* first, mzd_local_t const* second) {
mzd_xor_s128_blocks(BLOCK(res, 0), CONST_BLOCK(first, 0), CONST_BLOCK(second, 0), 5);
@ -282,31 +266,16 @@ void mzd_xor_uint64_256(mzd_local_t* res, mzd_local_t const* first, mzd_local_t
mzd_xor_uint64_block(BLOCK(res, 0), CONST_BLOCK(first, 0), CONST_BLOCK(second, 0), 4);
}
void mzd_xor_uint64_576(mzd_local_t* res, mzd_local_t const* first, mzd_local_t const* second) {
mzd_xor_uint64_blocks(BLOCK(res, 0), CONST_BLOCK(first, 0), CONST_BLOCK(second, 0), 2);
mzd_xor_uint64_block(BLOCK(res, 2), CONST_BLOCK(first, 2), CONST_BLOCK(second, 2), 1);
}
void mzd_xor_uint64_640(mzd_local_t* res, mzd_local_t const* first, mzd_local_t const* second) {
mzd_xor_uint64_blocks(BLOCK(res, 0), CONST_BLOCK(first, 0), CONST_BLOCK(second, 0), 2);
mzd_xor_uint64_block(BLOCK(res, 2), CONST_BLOCK(first, 2), CONST_BLOCK(second, 2), 2);
}
void mzd_xor_uint64_896(mzd_local_t* res, mzd_local_t const* first, mzd_local_t const* second) {
mzd_xor_uint64_blocks(BLOCK(res, 0), CONST_BLOCK(first, 0), CONST_BLOCK(second, 0), 3);
mzd_xor_uint64_block(BLOCK(res, 3), CONST_BLOCK(first, 3), CONST_BLOCK(second, 3), 2);
}
void mzd_xor_uint64_960(mzd_local_t* res, mzd_local_t const* first, mzd_local_t const* second) {
mzd_xor_uint64_blocks(BLOCK(res, 0), CONST_BLOCK(first, 0), CONST_BLOCK(second, 0), 3);
mzd_xor_uint64_block(BLOCK(res, 3), CONST_BLOCK(first, 3), CONST_BLOCK(second, 3), 3);
}
void mzd_xor_uint64_1152(mzd_local_t* res, mzd_local_t const* first, mzd_local_t const* second) {
mzd_xor_uint64_blocks(BLOCK(res, 0), CONST_BLOCK(first, 0), CONST_BLOCK(second, 0), 4);
mzd_xor_uint64_block(BLOCK(res, 4), CONST_BLOCK(first, 4), CONST_BLOCK(second, 4), 2);
}
void mzd_xor_uint64_1216(mzd_local_t* res, mzd_local_t const* first, mzd_local_t const* second) {
mzd_xor_uint64_blocks(BLOCK(res, 0), CONST_BLOCK(first, 0), CONST_BLOCK(second, 0), 4);
mzd_xor_uint64_block(BLOCK(res, 4), CONST_BLOCK(first, 4), CONST_BLOCK(second, 4), 3);
@ -368,63 +337,6 @@ void mzd_mul_v_parity_uint64_256_30(mzd_local_t* c, mzd_local_t const* v, mzd_lo
cblock->w64[3] = res;
}
void mzd_mul_v_parity_uint64_128_3(mzd_local_t* c, mzd_local_t const* v, mzd_local_t const* At) {
block_t* cblock = BLOCK(c, 0);
const block_t* vblock = CONST_BLOCK(v, 0);
cblock->w64[0] = 0;
const block_t* Ablock1 = CONST_BLOCK(At, 0);
const block_t* Ablock2 = CONST_BLOCK(At, 1);
const word parity1 =
parity64_uint64((vblock->w64[0] & Ablock1->w64[0]) ^ (vblock->w64[1] & Ablock1->w64[1]));
const word parity2 =
parity64_uint64((vblock->w64[0] & Ablock1->w64[2]) ^ (vblock->w64[1] & Ablock1->w64[3]));
const word parity3 =
parity64_uint64((vblock->w64[0] & Ablock2->w64[0]) ^ (vblock->w64[1] & Ablock2->w64[1]));
cblock->w64[1] = (parity1 | (parity2 << 1) | (parity3 << 2)) << 61;
}
void mzd_mul_v_parity_uint64_192_3(mzd_local_t* c, mzd_local_t const* v, mzd_local_t const* At) {
block_t* cblock = BLOCK(c, 0);
const block_t* vblock = CONST_BLOCK(v, 0);
for (unsigned int j = 0; j < 3; j++) {
cblock->w64[j] = 0;
}
word res = 0;
for (unsigned int i = 3; i; --i) {
const block_t* Ablock = CONST_BLOCK(At, 3 - i);
const word parity =
parity64_uint64((vblock->w64[0] & Ablock->w64[0]) ^ (vblock->w64[1] & Ablock->w64[1]) ^
(vblock->w64[2] & Ablock->w64[2]));
res |= parity << (64 - i);
}
cblock->w64[2] = res;
}
void mzd_mul_v_parity_uint64_256_3(mzd_local_t* c, mzd_local_t const* v, mzd_local_t const* At) {
block_t* cblock = BLOCK(c, 0);
const block_t* vblock = CONST_BLOCK(v, 0);
for (unsigned int j = 0; j < 3; j++) {
cblock->w64[j] = 0;
}
word res = 0;
for (unsigned int i = 3; i; --i) {
const block_t* Ablock = CONST_BLOCK(At, 3 - i);
const word parity =
parity64_uint64((vblock->w64[0] & Ablock->w64[0]) ^ (vblock->w64[1] & Ablock->w64[1]) ^
(vblock->w64[2] & Ablock->w64[2]) ^ (vblock->w64[3] & Ablock->w64[3]));
res |= parity << (64 - i);
}
cblock->w64[3] = res;
}
#if defined(WITH_OPT)
#if defined(WITH_SSE2) || defined(WITH_NEON)
@ -573,37 +485,6 @@ void mzd_mul_v_s128_128_640(mzd_local_t* c, mzd_local_t const* v, mzd_local_t co
cblock3->w128[0] = cval[4];
}
ATTR_TARGET_S128
void mzd_mul_v_s128_192_896(mzd_local_t* c, mzd_local_t const* v, mzd_local_t const* A) {
const word* vptr = CONST_BLOCK(v, 0)->w64;
const block_t* Ablock = CONST_BLOCK(A, 0);
word128 cval[7] ATTR_ALIGNED(alignof(word128)) = {mm128_zero, mm128_zero, mm128_zero, mm128_zero,
mm128_zero, mm128_zero, mm128_zero};
for (unsigned int w = 3; w; --w, ++vptr) {
word idx = *vptr;
for (unsigned int i = sizeof(word) * 8; i; i -= 1, idx >>= 1, Ablock += 4) {
const word128 mask = mm128_compute_mask(idx, 0);
mm128_xor_mask_region(&cval[0], Ablock[0].w128, mask, 2);
mm128_xor_mask_region(&cval[2], Ablock[1].w128, mask, 2);
mm128_xor_mask_region(&cval[4], Ablock[2].w128, mask, 2);
cval[6] = mm128_xor_mask(cval[6], Ablock[3].w128[0], mask);
}
}
block_t* cblock1 = BLOCK(c, 0);
block_t* cblock2 = BLOCK(c, 1);
block_t* cblock3 = BLOCK(c, 2);
block_t* cblock4 = BLOCK(c, 3);
cblock1->w128[0] = cval[0];
cblock1->w128[1] = cval[1];
cblock2->w128[0] = cval[2];
cblock2->w128[1] = cval[3];
cblock3->w128[0] = cval[4];
cblock3->w128[1] = cval[5];
cblock4->w128[0] = cval[6];
}
ATTR_TARGET_S128
void mzd_mul_v_s128_192_1024(mzd_local_t* c, mzd_local_t const* v, mzd_local_t const* A) {
const word* vptr = CONST_BLOCK(v, 0)->w64;
@ -636,42 +517,6 @@ void mzd_mul_v_s128_192_1024(mzd_local_t* c, mzd_local_t const* v, mzd_local_t c
cblock4->w128[1] = cval[7];
}
ATTR_TARGET_S128
void mzd_mul_v_s128_256_1152(mzd_local_t* c, mzd_local_t const* v, mzd_local_t const* A) {
const word* vptr = CONST_BLOCK(v, 0)->w64;
const block_t* Ablock = CONST_BLOCK(A, 0);
word128 cval[9] ATTR_ALIGNED(alignof(word128)) = {mm128_zero, mm128_zero, mm128_zero,
mm128_zero, mm128_zero, mm128_zero,
mm128_zero, mm128_zero, mm128_zero};
for (unsigned int w = 4; w; --w, ++vptr) {
word idx = *vptr;
for (unsigned int i = sizeof(word) * 8; i; i -= 1, idx >>= 1, Ablock += 5) {
const word128 mask = mm128_compute_mask(idx, 0);
mm128_xor_mask_region(&cval[0], Ablock[0].w128, mask, 2);
mm128_xor_mask_region(&cval[2], Ablock[1].w128, mask, 2);
mm128_xor_mask_region(&cval[4], Ablock[2].w128, mask, 2);
mm128_xor_mask_region(&cval[6], Ablock[3].w128, mask, 2);
cval[8] = mm128_xor_mask(cval[8], Ablock[4].w128[0], mask);
}
}
block_t* cblock1 = BLOCK(c, 0);
block_t* cblock2 = BLOCK(c, 1);
block_t* cblock3 = BLOCK(c, 2);
block_t* cblock4 = BLOCK(c, 3);
block_t* cblock5 = BLOCK(c, 4);
cblock1->w128[0] = cval[0];
cblock1->w128[1] = cval[1];
cblock2->w128[0] = cval[2];
cblock2->w128[1] = cval[3];
cblock3->w128[0] = cval[4];
cblock3->w128[1] = cval[5];
cblock4->w128[0] = cval[6];
cblock4->w128[1] = cval[7];
cblock5->w128[0] = cval[8];
}
ATTR_TARGET_S128
void mzd_mul_v_s128_256_1280(mzd_local_t* c, mzd_local_t const* v, mzd_local_t const* A) {
const word* vptr = CONST_BLOCK(v, 0)->w64;
@ -729,7 +574,7 @@ void mzd_addmul_v_s256_128(mzd_local_t* c, mzd_local_t const* v, mzd_local_t con
const word* vptr = CONST_BLOCK(v, 0)->w64;
const block_t* Ablock = CONST_BLOCK(A, 0);
word256 cval[2] ATTR_ALIGNED(alignof(word256)) = {_mm256_castsi128_si256(cblock->w128[0]),
word256 cval[2] ATTR_ALIGNED(alignof(word256)) = {_mm256_setr_m128i(cblock->w128[0], mm128_zero),
mm256_zero};
for (unsigned int w = 2; w; --w, ++vptr) {
word idx = *vptr;
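This one-line change appears to be the 128-bit word-loading fix referenced in the commit message: _mm256_castsi128_si256 leaves the upper 128 bits of the destination undefined, so the accumulator could start with garbage in its high lane, while _mm256_setr_m128i pins it to zero. A hedged sketch of the distinction:

#include <immintrin.h>

/* Illustrative contrast only; not code from the diff. */
static __m256i load_low_cast(__m128i lo) {
  return _mm256_castsi128_si256(lo); /* bits 255..128 undefined per Intel docs */
}
static __m256i load_low_zeroed(__m128i lo) {
  return _mm256_setr_m128i(lo, _mm_setzero_si128()); /* bits 255..128 forced to 0 */
}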
@ -987,25 +832,6 @@ void mzd_mul_v_uint64_256(mzd_local_t* c, mzd_local_t const* v, mzd_local_t cons
mzd_addmul_v_uint64_256(c, v, A);
}
void mzd_mul_v_uint64_128_576(mzd_local_t* c, mzd_local_t const* v, mzd_local_t const* A) {
const word* vptr = CONST_BLOCK(v, 0)->w64;
const block_t* Ablock = CONST_BLOCK(A, 0);
clear_uint64_blocks(BLOCK(c, 0), 2);
clear_uint64_block(BLOCK(c, 2), 1);
for (unsigned int w = 2; w; --w, ++vptr) {
word idx = *vptr;
for (unsigned int i = sizeof(word) * 8; i; --i, idx >>= 1, ++Ablock) {
const uint64_t mask = -(idx & 1);
for (unsigned int j = 0; j < 2; ++j, ++Ablock) {
mzd_xor_mask_uint64_block(BLOCK(c, j), Ablock, mask, 4);
}
mzd_xor_mask_uint64_block(BLOCK(c, 2), Ablock, mask, 1);
}
}
}
void mzd_mul_v_uint64_128_640(mzd_local_t* c, mzd_local_t const* v, mzd_local_t const* A) {
const word* vptr = CONST_BLOCK(v, 0)->w64;
const block_t* Ablock = CONST_BLOCK(A, 0);
@ -1025,25 +851,6 @@ void mzd_mul_v_uint64_128_640(mzd_local_t* c, mzd_local_t const* v, mzd_local_t
}
}
void mzd_mul_v_uint64_192_896(mzd_local_t* c, mzd_local_t const* v, mzd_local_t const* A) {
const word* vptr = CONST_BLOCK(v, 0)->w64;
const block_t* Ablock = CONST_BLOCK(A, 0);
clear_uint64_blocks(BLOCK(c, 0), 3);
clear_uint64_block(BLOCK(c, 3), 2);
for (unsigned int w = 3; w; --w, ++vptr) {
word idx = *vptr;
for (unsigned int i = sizeof(word) * 8; i; --i, idx >>= 1, ++Ablock) {
const uint64_t mask = -(idx & 1);
for (unsigned int j = 0; j < 3; ++j, ++Ablock) {
mzd_xor_mask_uint64_block(BLOCK(c, j), Ablock, mask, 4);
}
mzd_xor_mask_uint64_block(BLOCK(c, 3), Ablock, mask, 2);
}
}
}
void mzd_mul_v_uint64_192_960(mzd_local_t* c, mzd_local_t const* v, mzd_local_t const* A) {
const word* vptr = CONST_BLOCK(v, 0)->w64;
const block_t* Ablock = CONST_BLOCK(A, 0);
@ -1063,25 +870,6 @@ void mzd_mul_v_uint64_192_960(mzd_local_t* c, mzd_local_t const* v, mzd_local_t
}
}
void mzd_mul_v_uint64_256_1152(mzd_local_t* c, mzd_local_t const* v, mzd_local_t const* A) {
const word* vptr = CONST_BLOCK(v, 0)->w64;
const block_t* Ablock = CONST_BLOCK(A, 0);
clear_uint64_blocks(BLOCK(c, 0), 4);
clear_uint64_block(BLOCK(c, 4), 2);
for (unsigned int w = 4; w; --w, ++vptr) {
word idx = *vptr;
for (unsigned int i = sizeof(word) * 8; i; --i, idx >>= 1, ++Ablock) {
const uint64_t mask = -(idx & 1);
for (unsigned int j = 0; j < 4; ++j, ++Ablock) {
mzd_xor_mask_uint64_block(BLOCK(c, j), Ablock, mask, 4);
}
mzd_xor_mask_uint64_block(BLOCK(c, 4), Ablock, mask, 2);
}
}
}
void mzd_mul_v_uint64_256_1216(mzd_local_t* c, mzd_local_t const* v, mzd_local_t const* A) {
const word* vptr = CONST_BLOCK(v, 0)->w64;
const block_t* Ablock = CONST_BLOCK(A, 0);
@ -1130,24 +918,6 @@ void mzd_shuffle_256_30(mzd_local_t* x, const word mask) {
mzd_shuffle_30_idx(x, mask, 3);
}
static inline void mzd_shuffle_3_idx(mzd_local_t* x, const word mask, unsigned int idx) {
const word w = CONST_BLOCK(x, 0)->w64[idx];
const word a = extract_bits(w, mask) << 61;
BLOCK(x, 0)->w64[idx] = a | extract_bits(w, ~mask);
}
void mzd_shuffle_128_3(mzd_local_t* x, const word mask) {
mzd_shuffle_3_idx(x, mask, 1);
}
void mzd_shuffle_192_3(mzd_local_t* x, const word mask) {
mzd_shuffle_3_idx(x, mask, 2);
}
void mzd_shuffle_256_3(mzd_local_t* x, const word mask) {
mzd_shuffle_3_idx(x, mask, 3);
}
// no SIMD
void mzd_addmul_v_uint64_30_128(mzd_local_t* c, mzd_local_t const* v, mzd_local_t const* A) {
block_t* cblock = BLOCK(c, 0);
@ -1185,44 +955,6 @@ void mzd_addmul_v_uint64_30_256(mzd_local_t* c, mzd_local_t const* v, mzd_local_
}
}
void mzd_addmul_v_uint64_3_128(mzd_local_t* c, mzd_local_t const* v, mzd_local_t const* A) {
block_t* cblock = BLOCK(c, 0);
const block_t* Ablock1 = CONST_BLOCK(A, 0);
const block_t* Ablock2 = CONST_BLOCK(A, 1);
const word idx = CONST_BLOCK(v, 0)->w64[1] >> 61;
const uint64_t mask1 = -(idx & 1);
const uint64_t mask2 = -((idx >> 1) & 1);
const uint64_t mask3 = -((idx >> 2) & 1);
for (unsigned int j = 0; j < 2; ++j) {
cblock->w64[j] ^=
(Ablock1->w64[j] & mask1) ^ (Ablock1->w64[j + 2] & mask2) ^ (Ablock2->w64[j] & mask3);
}
}
void mzd_addmul_v_uint64_3_192(mzd_local_t* c, mzd_local_t const* v, mzd_local_t const* A) {
block_t* cblock = BLOCK(c, 0);
const block_t* Ablock = CONST_BLOCK(A, 0);
word idx = CONST_BLOCK(v, 0)->w64[2] >> 61;
for (unsigned int i = 3; i; --i, idx >>= 1, ++Ablock) {
const uint64_t mask = -(idx & 1);
mzd_xor_mask_uint64_block(cblock, Ablock, mask, 3);
}
}
void mzd_addmul_v_uint64_3_256(mzd_local_t* c, mzd_local_t const* v, mzd_local_t const* A) {
block_t* cblock = BLOCK(c, 0);
const block_t* Ablock = CONST_BLOCK(A, 0);
word idx = CONST_BLOCK(v, 0)->w64[3] >> 61;
for (unsigned int i = 3; i; --i, idx >>= 1, ++Ablock) {
const uint64_t mask = -(idx & 1);
mzd_xor_mask_uint64_block(cblock, Ablock, mask, 4);
}
}
#if defined(WITH_SSE2) || defined(WITH_NEON)
ATTR_TARGET_S128
void mzd_addmul_v_s128_30_128(mzd_local_t* c, mzd_local_t const* v, mzd_local_t const* A) {
@ -1270,43 +1002,6 @@ void mzd_addmul_v_s128_30_256(mzd_local_t* c, mzd_local_t const* v, mzd_local_t
mzd_addmul_v_s128_30_256_idx(c, A, CONST_BLOCK(v, 0)->w64[3] >> 34);
}
ATTR_TARGET_S128
void mzd_addmul_v_s128_3_128(mzd_local_t* c, mzd_local_t const* v, mzd_local_t const* A) {
block_t* cblock = BLOCK(c, 0);
const block_t* Ablock = CONST_BLOCK(A, 0);
const word idx = CONST_BLOCK(v, 0)->w64[1] >> 61;
word128 cval[2] ATTR_ALIGNED(alignof(word128));
cval[0] = mm128_xor_mask(cblock->w128[0], Ablock[0].w128[0], mm128_compute_mask(idx, 0));
cval[1] = mm128_and(Ablock[0].w128[1], mm128_compute_mask(idx, 1));
cval[0] = mm128_xor_mask(cval[0], Ablock[1].w128[0], mm128_compute_mask(idx, 2));
cblock->w128[0] = mm128_xor(cval[0], cval[1]);
}
ATTR_TARGET_S128
static void mzd_addmul_v_s128_3_256_idx(mzd_local_t* c, mzd_local_t const* A, const word idx) {
block_t* cblock = BLOCK(c, 0);
const block_t* Ablock = CONST_BLOCK(A, 0);
word128 cval[4] ATTR_ALIGNED(alignof(word128)) = {cblock->w128[0], cblock->w128[1], mm128_zero,
mm128_zero};
mm128_xor_mask_region(&cval[0], Ablock[0].w128, mm128_compute_mask(idx, 0), 2);
mm128_xor_mask_region(&cval[2], Ablock[1].w128, mm128_compute_mask(idx, 1), 2);
mm128_xor_mask_region(&cval[0], Ablock[2].w128, mm128_compute_mask(idx, 2), 2);
cblock->w128[0] = mm128_xor(cval[0], cval[2]);
cblock->w128[1] = mm128_xor(cval[1], cval[3]);
}
ATTR_TARGET_S128
void mzd_addmul_v_s128_3_192(mzd_local_t* c, mzd_local_t const* v, mzd_local_t const* A) {
mzd_addmul_v_s128_3_256_idx(c, A, CONST_BLOCK(v, 0)->w64[2] >> 61);
}
ATTR_TARGET_S128
void mzd_addmul_v_s128_3_256(mzd_local_t* c, mzd_local_t const* v, mzd_local_t const* A) {
mzd_addmul_v_s128_3_256_idx(c, A, CONST_BLOCK(v, 0)->w64[3] >> 61);
}
#endif
#if defined(WITH_AVX2)
@ -1318,7 +1013,7 @@ void mzd_addmul_v_s256_30_128(mzd_local_t* c, mzd_local_t const* v, mzd_local_t
word idx = vblock->w64[1] >> 34;
word256 cval[2] ATTR_ALIGNED(alignof(word256));
cval[0] = mm256_xor_mask(_mm256_castsi128_si256(cblock->w128[0]), Ablock[0].w256,
cval[0] = mm256_xor_mask(_mm256_setr_m128i(cblock->w128[0], mm128_zero), Ablock[0].w256,
mm256_compute_mask_2(idx, 0));
cval[1] = mm256_and(Ablock[1].w256, mm256_compute_mask_2(idx, 2));
idx >>= 4;
@ -1367,41 +1062,6 @@ void mzd_addmul_v_s256_30_256(mzd_local_t* c, mzd_local_t const* v, mzd_local_t
mzd_addmul_v_s256_30_256_idx(c, A, CONST_BLOCK(v, 0)->w64[3] >> 34);
}
ATTR_TARGET_AVX2
void mzd_addmul_v_s256_3_128(mzd_local_t* c, mzd_local_t const* v, mzd_local_t const* A) {
block_t* cblock = BLOCK(c, 0);
const block_t* Ablock = CONST_BLOCK(A, 0);
const word idx = CONST_BLOCK(v, 0)->w64[1] >> 61;
word128 cval[2] ATTR_ALIGNED(alignof(word128));
cval[0] = mm128_xor_mask(cblock->w128[0], Ablock[0].w128[0], mm128_compute_mask(idx, 0));
cval[1] = mm128_and(Ablock[0].w128[1], mm128_compute_mask(idx, 1));
cval[0] = mm128_xor_mask(cval[0], Ablock[1].w128[0], mm128_compute_mask(idx, 2));
cblock->w128[0] = mm128_xor(cval[0], cval[1]);
}
ATTR_TARGET_AVX2
static inline void mzd_addmul_v_s256_3_256_idx(mzd_local_t* c, mzd_local_t const* A, const word idx) {
block_t* cblock = BLOCK(c, 0);
const block_t* Ablock = CONST_BLOCK(A, 0);
word256 cval[2] ATTR_ALIGNED(alignof(word256));
cval[0] = mm256_xor_mask(cblock->w256, Ablock[0].w256, mm256_compute_mask(idx, 0));
cval[1] = mm256_and(Ablock[1].w256, mm256_compute_mask(idx, 1));
cval[0] = mm256_xor_mask(cval[0], Ablock[2].w256, mm256_compute_mask(idx, 2));
cblock->w256 = mm256_xor(cval[0], cval[1]);
}
ATTR_TARGET_AVX2
void mzd_addmul_v_s256_3_192(mzd_local_t* c, mzd_local_t const* v, mzd_local_t const* A) {
mzd_addmul_v_s256_3_256_idx(c, A, CONST_BLOCK(v, 0)->w64[2] >> 61);
}
ATTR_TARGET_AVX2
void mzd_addmul_v_s256_3_256(mzd_local_t* c, mzd_local_t const* v, mzd_local_t const* A) {
mzd_addmul_v_s256_3_256_idx(c, A, CONST_BLOCK(v, 0)->w64[3] >> 61);
}
#if !defined(__x86_64__) && !defined(_M_X64)
ATTR_TARGET_AVX2 ATTR_CONST static uint8_t popcount_32(uint32_t value) {
uint64_t result =
@ -1441,27 +1101,5 @@ ATTR_TARGET_AVX2
void mzd_shuffle_pext_256_30(mzd_local_t* x, const word mask) {
mzd_shuffle_pext_30_idx(x, mask, 3);
}
ATTR_TARGET_AVX2
static inline void mzd_shuffle_pext_3_idx(mzd_local_t* x, const word mask, unsigned int idx) {
const word w = CONST_BLOCK(x, 0)->w64[idx];
const word a = _pext_u64(w, mask) << 61;
BLOCK(x, 0)->w64[idx] = a | _pext_u64(w, ~mask);
}
ATTR_TARGET_AVX2
void mzd_shuffle_pext_128_3(mzd_local_t* x, const word mask) {
mzd_shuffle_pext_3_idx(x, mask, 1);
}
ATTR_TARGET_AVX2
void mzd_shuffle_pext_192_3(mzd_local_t* x, const word mask) {
mzd_shuffle_pext_3_idx(x, mask, 2);
}
ATTR_TARGET_AVX2
void mzd_shuffle_pext_256_3(mzd_local_t* x, const word mask) {
mzd_shuffle_pext_3_idx(x, mask, 3);
}
#endif
#endif

View File

@ -77,16 +77,10 @@ void mzd_xor_uint64_192(mzd_local_t* res, mzd_local_t const* first,
mzd_local_t const* second) ATTR_NONNULL;
void mzd_xor_uint64_256(mzd_local_t* res, mzd_local_t const* first,
mzd_local_t const* second) ATTR_NONNULL;
void mzd_xor_uint64_576(mzd_local_t* res, mzd_local_t const* first,
mzd_local_t const* second) ATTR_NONNULL;
void mzd_xor_uint64_640(mzd_local_t* res, mzd_local_t const* first,
mzd_local_t const* second) ATTR_NONNULL;
void mzd_xor_uint64_896(mzd_local_t* res, mzd_local_t const* first,
mzd_local_t const* second) ATTR_NONNULL;
void mzd_xor_uint64_960(mzd_local_t* res, mzd_local_t const* first,
mzd_local_t const* second) ATTR_NONNULL;
void mzd_xor_uint64_1152(mzd_local_t* res, mzd_local_t const* first,
mzd_local_t const* second) ATTR_NONNULL;
void mzd_xor_uint64_1216(mzd_local_t* res, mzd_local_t const* first,
mzd_local_t const* second) ATTR_NONNULL;
void mzd_xor_s128_128(mzd_local_t* res, mzd_local_t const* first,
@ -95,12 +89,8 @@ void mzd_xor_s128_256(mzd_local_t* res, mzd_local_t const* first,
mzd_local_t const* second) ATTR_NONNULL;
void mzd_xor_s128_640(mzd_local_t* res, mzd_local_t const* first,
mzd_local_t const* second) ATTR_NONNULL;
void mzd_xor_s128_896(mzd_local_t* res, mzd_local_t const* first,
mzd_local_t const* second) ATTR_NONNULL;
void mzd_xor_s128_1024(mzd_local_t* res, mzd_local_t const* first,
mzd_local_t const* second) ATTR_NONNULL;
void mzd_xor_s128_1152(mzd_local_t* res, mzd_local_t const* first,
mzd_local_t const* second) ATTR_NONNULL;
void mzd_xor_s128_1280(mzd_local_t* res, mzd_local_t const* first,
mzd_local_t const* second) ATTR_NONNULL;
void mzd_xor_s256_128(mzd_local_t* res, mzd_local_t const* first,
@ -120,16 +110,10 @@ void mzd_xor_s256_1280(mzd_local_t* res, mzd_local_t const* first,
void mzd_mul_v_uint64_128(mzd_local_t* c, mzd_local_t const* v, mzd_local_t const* At) ATTR_NONNULL;
void mzd_mul_v_uint64_192(mzd_local_t* c, mzd_local_t const* v, mzd_local_t const* At) ATTR_NONNULL;
void mzd_mul_v_uint64_256(mzd_local_t* c, mzd_local_t const* v, mzd_local_t const* At) ATTR_NONNULL;
void mzd_mul_v_uint64_128_576(mzd_local_t* c, mzd_local_t const* v,
mzd_local_t const* At) ATTR_NONNULL;
void mzd_mul_v_uint64_128_640(mzd_local_t* c, mzd_local_t const* v,
mzd_local_t const* At) ATTR_NONNULL;
void mzd_mul_v_uint64_192_896(mzd_local_t* c, mzd_local_t const* v,
mzd_local_t const* At) ATTR_NONNULL;
void mzd_mul_v_uint64_192_960(mzd_local_t* c, mzd_local_t const* v,
mzd_local_t const* At) ATTR_NONNULL;
void mzd_mul_v_uint64_256_1152(mzd_local_t* c, mzd_local_t const* v,
mzd_local_t const* At) ATTR_NONNULL;
void mzd_mul_v_uint64_256_1216(mzd_local_t* c, mzd_local_t const* v,
mzd_local_t const* At) ATTR_NONNULL;
void mzd_mul_v_s128_128(mzd_local_t* c, mzd_local_t const* v, mzd_local_t const* A) ATTR_NONNULL;
@ -137,12 +121,8 @@ void mzd_mul_v_s128_192(mzd_local_t* c, mzd_local_t const* v, mzd_local_t const*
void mzd_mul_v_s128_256(mzd_local_t* c, mzd_local_t const* v, mzd_local_t const* A) ATTR_NONNULL;
void mzd_mul_v_s128_128_640(mzd_local_t* c, mzd_local_t const* v,
mzd_local_t const* A) ATTR_NONNULL;
void mzd_mul_v_s128_192_896(mzd_local_t* c, mzd_local_t const* v,
mzd_local_t const* A) ATTR_NONNULL;
void mzd_mul_v_s128_192_1024(mzd_local_t* c, mzd_local_t const* v,
mzd_local_t const* A) ATTR_NONNULL;
void mzd_mul_v_s128_256_1152(mzd_local_t* c, mzd_local_t const* v,
mzd_local_t const* A) ATTR_NONNULL;
void mzd_mul_v_s128_256_1280(mzd_local_t* c, mzd_local_t const* v,
mzd_local_t const* A) ATTR_NONNULL;
void mzd_mul_v_s256_128(mzd_local_t* c, mzd_local_t const* v, mzd_local_t const* A) ATTR_NONNULL;
@ -165,12 +145,6 @@ void mzd_addmul_v_uint64_30_192(mzd_local_t* c, mzd_local_t const* v,
mzd_local_t const* A) ATTR_NONNULL;
void mzd_addmul_v_uint64_30_256(mzd_local_t* c, mzd_local_t const* v,
mzd_local_t const* A) ATTR_NONNULL;
void mzd_addmul_v_uint64_3_128(mzd_local_t* c, mzd_local_t const* v,
mzd_local_t const* A) ATTR_NONNULL;
void mzd_addmul_v_uint64_3_192(mzd_local_t* c, mzd_local_t const* v,
mzd_local_t const* A) ATTR_NONNULL;
void mzd_addmul_v_uint64_3_256(mzd_local_t* c, mzd_local_t const* v,
mzd_local_t const* A) ATTR_NONNULL;
/**
* Use SSE2 or NEON
@ -181,12 +155,6 @@ void mzd_addmul_v_s128_30_192(mzd_local_t* c, mzd_local_t const* v,
mzd_local_t const* A) ATTR_NONNULL;
void mzd_addmul_v_s128_30_256(mzd_local_t* c, mzd_local_t const* v,
mzd_local_t const* A) ATTR_NONNULL;
void mzd_addmul_v_s128_3_128(mzd_local_t* c, mzd_local_t const* v,
mzd_local_t const* A) ATTR_NONNULL;
void mzd_addmul_v_s128_3_192(mzd_local_t* c, mzd_local_t const* v,
mzd_local_t const* A) ATTR_NONNULL;
void mzd_addmul_v_s128_3_256(mzd_local_t* c, mzd_local_t const* v,
mzd_local_t const* A) ATTR_NONNULL;
/**
* Use AVX2
@ -197,12 +165,6 @@ void mzd_addmul_v_s256_30_192(mzd_local_t* c, mzd_local_t const* v,
mzd_local_t const* A) ATTR_NONNULL;
void mzd_addmul_v_s256_30_256(mzd_local_t* c, mzd_local_t const* v,
mzd_local_t const* A) ATTR_NONNULL;
void mzd_addmul_v_s256_3_128(mzd_local_t* c, mzd_local_t const* v,
mzd_local_t const* A) ATTR_NONNULL;
void mzd_addmul_v_s256_3_192(mzd_local_t* c, mzd_local_t const* v,
mzd_local_t const* A) ATTR_NONNULL;
void mzd_addmul_v_s256_3_256(mzd_local_t* c, mzd_local_t const* v,
mzd_local_t const* A) ATTR_NONNULL;
/**
* Compute using parity-based algorithm
@ -213,12 +175,6 @@ void mzd_mul_v_parity_uint64_192_30(mzd_local_t* c, mzd_local_t const* v,
mzd_local_t const* A) ATTR_NONNULL;
void mzd_mul_v_parity_uint64_256_30(mzd_local_t* c, mzd_local_t const* v,
mzd_local_t const* A) ATTR_NONNULL;
void mzd_mul_v_parity_uint64_128_3(mzd_local_t* c, mzd_local_t const* v,
mzd_local_t const* A) ATTR_NONNULL;
void mzd_mul_v_parity_uint64_192_3(mzd_local_t* c, mzd_local_t const* v,
mzd_local_t const* A) ATTR_NONNULL;
void mzd_mul_v_parity_uint64_256_3(mzd_local_t* c, mzd_local_t const* v,
mzd_local_t const* A) ATTR_NONNULL;
/**
* Compute c + v * A optimized for c and v being vectors.
@ -240,17 +196,11 @@ void mzd_addmul_v_s256_256(mzd_local_t* c, mzd_local_t const* v, mzd_local_t con
* Shuffle vector x according to the info in mask. Needed for OLLE optimizations.
*/
void mzd_shuffle_128_30(mzd_local_t* x, const word mask) ATTR_NONNULL;
void mzd_shuffle_128_3(mzd_local_t* x, const word mask) ATTR_NONNULL;
void mzd_shuffle_192_30(mzd_local_t* x, const word mask) ATTR_NONNULL;
void mzd_shuffle_192_3(mzd_local_t* x, const word mask) ATTR_NONNULL;
void mzd_shuffle_256_30(mzd_local_t* x, const word mask) ATTR_NONNULL;
void mzd_shuffle_256_3(mzd_local_t* x, const word mask) ATTR_NONNULL;
void mzd_shuffle_pext_128_30(mzd_local_t* x, const word mask) ATTR_NONNULL;
void mzd_shuffle_pext_128_3(mzd_local_t* x, const word mask) ATTR_NONNULL;
void mzd_shuffle_pext_192_30(mzd_local_t* x, const word mask) ATTR_NONNULL;
void mzd_shuffle_pext_192_3(mzd_local_t* x, const word mask) ATTR_NONNULL;
void mzd_shuffle_pext_256_30(mzd_local_t* x, const word mask) ATTR_NONNULL;
void mzd_shuffle_pext_256_3(mzd_local_t* x, const word mask) ATTR_NONNULL;
#define BLOCK(v, b) ((block_t*)ASSUME_ALIGNED(&(v)[(b)], 32))
#define CONST_BLOCK(v, b) ((const block_t*)ASSUME_ALIGNED(&(v)[(b)], 32))

View File

@ -261,18 +261,6 @@ int PICNIC_CALLING_CONVENTION picnic_verify(const picnic_publickey_t* pk, const
const char* PICNIC_CALLING_CONVENTION picnic_get_param_name(picnic_params_t parameters) {
switch (parameters) {
case Picnic_L1_1_FS:
return "Picnic_L1_1_FS";
case Picnic_L1_1_UR:
return "Picnic_L1_1_UR";
case Picnic_L3_1_FS:
return "Picnic_L3_1_FS";
case Picnic_L3_1_UR:
return "Picnic_L3_1_UR";
case Picnic_L5_1_FS:
return "Picnic_L5_1_FS";
case Picnic_L5_1_UR:
return "Picnic_L5_1_UR";
case Picnic_L1_FS:
return "Picnic_L1_FS";
case Picnic_L1_UR:

View File

@ -88,13 +88,6 @@ typedef enum {
Picnic2_L1_FS, // 7
Picnic2_L3_FS, // 8
Picnic2_L5_FS, // 9
/* Instances with LowMC m=1 */
Picnic_L1_1_FS, // 10
Picnic_L1_1_UR, // 11
Picnic_L3_1_FS, // 12
Picnic_L3_1_UR, // 13
Picnic_L5_1_FS, // 14
Picnic_L5_1_UR, // 15
PARAMETER_SET_MAX_INDEX
} picnic_params_t;

View File

@ -34,12 +34,6 @@
#define LOWMC_MAX_AND_GATES (3 * 38 * 10 + 4) /* Rounded to nearest byte */
#define MAX_AUX_BYTES ((LOWMC_MAX_AND_GATES + LOWMC_MAX_KEY_BITS) / 8 + 1)
#if defined(__WIN32__)
#define SIZET_FMT "%Iu"
#else
#define SIZET_FMT "%zu"
#endif
/* Helper functions */
ATTR_CONST
@ -133,11 +127,11 @@ void sbox_layer_10_uint64_aux(uint64_t* d, randomTape_t* tapes) {
aux_mpc_AND_bitsliced(x0s, x1s, x2m, &ab, &bc, &ca, tapes);
// (b & c) ^ a
const uint64_t t0 = (bc) ^ x0s;
const uint64_t t0 = bc ^ x0s;
// (c & a) ^ a ^ b
const uint64_t t1 = (ca) ^ x0s ^ x1s;
// (a & b) ^ a ^ b ^c
const uint64_t t2 = (ab) ^ x0s ^ x1s ^ x2m;
const uint64_t t1 = ca ^ x0s ^ x1s;
// (a & b) ^ a ^ b ^ c
const uint64_t t2 = ab ^ x0s ^ x1s ^ x2m;
*d = (in & MASK_MASK) ^ (t0 >> 2) ^ (t1 >> 1) ^ t2;
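  // Merge the three bitsliced S-box output bits back into their slots; the bits
  // untouched by the S-boxes are carried through unchanged via MASK_MASK.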
}
@ -150,9 +144,7 @@ void sbox_layer_10_uint64_aux(uint64_t* d, randomTape_t* tapes) {
static void computeAuxTape(randomTape_t* tapes, const picnic_instance_t* params) {
mzd_local_t* lowmc_key = mzd_local_init_ex(params->lowmc->n, 1, true);
uint8_t temp[32] = {
0,
};
uint8_t temp[32] = {0};
// combine into key shares and calculate lowmc evaluation in plain
for (size_t i = 0; i < params->num_MPC_parties; i++) {
@ -182,8 +174,7 @@ static void commit(uint8_t* digest, const uint8_t* seed, const uint8_t* aux, con
hash_init(&ctx, params->digest_size);
hash_update(&ctx, seed, params->seed_size);
if (aux != NULL) {
size_t tapeLenBytes = params->view_size;
hash_update(&ctx, aux, tapeLenBytes);
hash_update(&ctx, aux, params->view_size);
}
hash_update(&ctx, salt, SALT_SIZE);
hash_update_uint16_le(&ctx, t);
@ -202,7 +193,7 @@ static void commit_x4(uint8_t** digest, const uint8_t** seed, const uint8_t* sal
const uint8_t* salt_ptr[4] = {salt, salt, salt, salt};
hash_update_x4(&ctx, salt_ptr, SALT_SIZE);
hash_update_x4_uint16_le(&ctx, t);
const uint16_t j_arr[4] = {j + 0, j + 1, j + 2, j + 3};
const uint16_t j_arr[4] = {j + 0, j + 1, j + 2, j + 3};
hash_update_x4_uint16s_le(&ctx, j_arr);
hash_final_x4(&ctx);
hash_squeeze_x4(&ctx, digest, params->digest_size);
@ -315,7 +306,6 @@ static size_t bitsToChunks(size_t chunkLenBits, const uint8_t* input, size_t inp
chunks[i] += getBit(input, i * chunkLenBits + j) << j;
assert(chunks[i] < (1 << chunkLenBits));
}
chunks[i] = le16toh(chunks[i]);
}
return chunkCount;

View File

@ -93,7 +93,7 @@ static uint8_t mpc_AND(uint8_t a, uint8_t b, uint64_t mask_a, uint64_t mask_b, r
static void mpc_sbox(mzd_local_t* statein, shares_t* state_masks, randomTape_t* tapes, msgs_t* msgs,
uint8_t* unopenened_msg, const picnic_instance_t* params) {
uint8_t state[32];
uint8_t state[MAX_LOWMC_BLOCK_SIZE];
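  // MAX_LOWMC_BLOCK_SIZE replaces the magic 32 here -- presumably 32 bytes,
  // matching the largest (256-bit) LowMC state.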
mzd_to_char_array(state, statein, params->lowmc->n / 8);
for (size_t i = 0; i < params->lowmc->m * 3; i += 3) {
uint8_t a = getBit((uint8_t*)state, i + 2);
@ -173,6 +173,7 @@ static void mpc_xor_masks(shares_t* out, const shares_t* a, const shares_t* b) {
}
#endif
#if !defined(NO_UINT64_FALLBACK)
/* PICNIC2_L1_FS */
#define XOR mzd_xor_uint64_128
#define MPC_MUL mpc_matrix_mul_uint64_128
@ -247,6 +248,7 @@ static void mpc_xor_masks(shares_t* out, const shares_t* a, const shares_t* b) {
#undef LOWMC_R
#undef LOWMC_INSTANCE
#undef SIM_ONLINE
#endif
#if defined(WITH_OPT)
#if defined(WITH_SSE2) || defined(WITH_NEON)
@ -413,11 +415,7 @@ static void mpc_xor_masks(shares_t* out, const shares_t* a, const shares_t* b) {
#endif // WITH_OPT
lowmc_simulate_online_f lowmc_simulate_online_get_implementation(const lowmc_t* lowmc) {
#if defined(WITH_LOWMC_M1)
ASSUME(lowmc->m == 10 || lowmc->m == 1);
#else
ASSUME(lowmc->m == 10);
#endif
ASSUME(lowmc->n == 128 || lowmc->n == 192 || lowmc->n == 256);
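  // Only the ten-S-box (m == 10) instances survive the removal of the m = 1 parameter sets,
  // so the WITH_LOWMC_M1 branch of the assumption is gone.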
#if defined(WITH_OPT)
@ -441,6 +439,7 @@ lowmc_simulate_online_f lowmc_simulate_online_get_implementation(const lowmc_t*
}
}
#endif
#if defined(WITH_SSE2) || defined(WITH_NEON)
if (CPU_SUPPORTS_SSE2 || CPU_SUPPORTS_NEON) {
if (lowmc->m == 10) {
@ -462,6 +461,8 @@ lowmc_simulate_online_f lowmc_simulate_online_get_implementation(const lowmc_t*
}
#endif
#endif
#if !defined(NO_UINT64_FALLBACK)
if (lowmc->m == 10) {
switch (lowmc->n) {
#if defined(WITH_LOWMC_128_128_20)
@ -478,6 +479,7 @@ lowmc_simulate_online_f lowmc_simulate_online_get_implementation(const lowmc_t*
#endif
}
}
#endif
return NULL;
}

View File

@ -13,10 +13,10 @@
#if defined(FN_ATTR)
FN_ATTR
#endif
static int SIM_ONLINE(mzd_local_t* maskedKey, shares_t* mask_shares, randomTape_t* tapes, msgs_t* msgs,
const mzd_local_t* plaintext, const uint32_t* pubKey,
static int SIM_ONLINE(mzd_local_t* maskedKey, shares_t* mask_shares, randomTape_t* tapes,
msgs_t* msgs, const mzd_local_t* plaintext, const uint32_t* pubKey,
const picnic_instance_t* params) {
int ret = 0;
int ret = 0;
mzd_local_t state[((LOWMC_N) + 255) / 256];
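  // Ceiling division: enough 256-bit blocks to hold an LOWMC_N-bit state.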
shares_t* key_masks = allocateShares(LOWMC_N); // Make a copy to use when computing each round key
shares_t* mask2_shares = allocateShares(LOWMC_N);
@ -34,7 +34,7 @@ static int SIM_ONLINE(mzd_local_t* maskedKey, shares_t* mask_shares, randomTape_
shares_t* nl_part_masks = allocateShares(LOWMC_R * 32);
MPC_MUL(state, maskedKey, LOWMC_INSTANCE.k0_matrix,
mask_shares); // roundKey = maskedKey * KMatrix[0]
mask_shares); // roundKey = maskedKey * KMatrix[0]
XOR(state, state, plaintext);
XOR(state, state, LOWMC_INSTANCE.precomputed_constant_linear);
@ -44,7 +44,7 @@ static int SIM_ONLINE(mzd_local_t* maskedKey, shares_t* mask_shares, randomTape_
mzd_local_t state2[((LOWMC_N) + 255) / 256];
for (uint32_t r = 0; r < LOWMC_R - 1; r++) {
mpc_sbox(state, mask_shares, tapes, msgs, unopened_msgs, params);
mpc_xor_masks_nl(mask_shares, mask_shares, nl_part_masks, r*32 + 2, 30);
mpc_xor_masks_nl(mask_shares, mask_shares, nl_part_masks, r * 32 + 2, 30);
const word nl = CONST_BLOCK(nl_part, r >> 3)->w64[(r & 0x7) >> 1];
BLOCK(state, 0)->w64[(LOWMC_N) / (sizeof(word) * 8) - 1] ^=
(nl << (1 - (r & 1)) * 32) & WORD_C(0xFFFFFFFF00000000);
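    // nl_part packs eight 32-bit round outputs per 256-bit block: r >> 3 selects the
    // block, (r & 7) >> 1 the 64-bit word, and the shift picks the low half on even
    // rounds and the high half on odd ones, XORed into the top 32 bits of the last
    // state word.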
@ -61,16 +61,16 @@ static int SIM_ONLINE(mzd_local_t* maskedKey, shares_t* mask_shares, randomTape_
mpc_xor_masks(mask_shares, mask_shares, mask2_shares);
}
mpc_sbox(state, mask_shares, tapes, msgs, unopened_msgs, params);
mpc_xor_masks_nl(mask_shares, mask_shares, nl_part_masks, (LOWMC_R-1)*32 + 2, 30);
const word nl = CONST_BLOCK(nl_part, (LOWMC_R-1) >> 3)->w64[((LOWMC_R-1) & 0x7) >> 1];
mpc_xor_masks_nl(mask_shares, mask_shares, nl_part_masks, (LOWMC_R - 1) * 32 + 2, 30);
const word nl = CONST_BLOCK(nl_part, (LOWMC_R - 1) >> 3)->w64[((LOWMC_R - 1) & 0x7) >> 1];
BLOCK(state, 0)->w64[(LOWMC_N) / (sizeof(word) * 8) - 1] ^=
(nl << (1 - ((LOWMC_R-1) & 1)) * 32) & WORD_C(0xFFFFFFFF00000000);
(nl << (1 - ((LOWMC_R - 1) & 1)) * 32) & WORD_C(0xFFFFFFFF00000000);
MPC_MUL(state, state, LOWMC_INSTANCE.zr_matrix,
mask_shares); // state = state * LMatrix (r-1)
#else
for (uint32_t r = 0; r < LOWMC_R; r++) {
mpc_sbox(state, mask_shares, tapes, msgs, unopened_msgs, params);
mpc_xor_masks_nl(mask_shares, mask_shares, nl_part_masks, r*32 + 2, 30);
mpc_xor_masks_nl(mask_shares, mask_shares, nl_part_masks, r * 32 + 2, 30);
const word nl = CONST_BLOCK(nl_part, r >> 3)->w64[(r & 0x7) >> 1];
BLOCK(state, 0)->w64[(LOWMC_N) / (sizeof(word) * 8) - 1] ^=
(nl << (1 - (r & 1)) * 32) & WORD_C(0xFFFFFFFF00000000);
@ -82,7 +82,7 @@ static int SIM_ONLINE(mzd_local_t* maskedKey, shares_t* mask_shares, randomTape_
#else
mzd_local_t roundKey[((LOWMC_N) + 255) / 256];
MPC_MUL(roundKey, maskedKey, LOWMC_INSTANCE.k0_matrix,
mask_shares); // roundKey = maskedKey * KMatrix[0]
mask_shares); // roundKey = maskedKey * KMatrix[0]
XOR(state, roundKey, plaintext);
shares_t* round_key_masks = allocateShares(mask_shares->numWords);
@ -111,7 +111,7 @@ static int SIM_ONLINE(mzd_local_t* maskedKey, shares_t* mask_shares, randomTape_
}
uint32_t output[LOWMC_N / 32];
uint32_t outstate[LOWMC_N / 32];
mzd_to_char_array((uint8_t*)outstate, state, LOWMC_N/8);
mzd_to_char_array((uint8_t*)outstate, state, LOWMC_N / 8);
reconstructShares(output, mask_shares);
xor_word_array(output, output, outstate, (LOWMC_N / 32));

View File

@ -271,7 +271,7 @@ static void transpose_64_64_uint64(const uint64_t* in, uint64_t* out) {
// copy in to out and transpose in-place
for (uint32_t i = 0; i < 64; i++) {
out[i] = bswap64(in[i]);
out[i] = htobe64(in[i]);
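    // htobe64 (a byte swap on little-endian hosts, a no-op on big-endian ones)
    // replaces the unconditional bswap64, so the transpose sees the same bit
    // layout on every platform.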
}
for (uint32_t i = 0; i < logn; i++) {
@ -294,7 +294,7 @@ static void transpose_64_64_uint64(const uint64_t* in, uint64_t* out) {
width /= 2;
}
for (uint32_t i = 0; i < 64; i++) {
out[i] = bswap64(out[i]);
out[i] = be64toh(out[i]);
}
}
@ -329,7 +329,7 @@ static void transpose_64_64_s128(const uint64_t* in, uint64_t* out) {
const uint32_t logn = 6;
// copy in to out and transpose in-place
word128* out128 = (word128*)out;
word128* out128 = (word128*)out;
const word128* in128 = (const word128*)in;
memcpy_bswap64_64_s128(out128, in128);
@ -399,7 +399,7 @@ static void transpose_64_64_s256(const uint64_t* in, uint64_t* out) {
static const uint32_t logn = 6;
const word256* in256 = (const word256*)in;
word256* out256 = (word256*)out;
word256* out256 = (word256*)out;
// copy in to out and swap bytes
memcpy_bswap64_64_s256(out256, in256);
@ -424,8 +424,8 @@ static void transpose_64_64_s256(const uint64_t* in, uint64_t* out) {
width /= 2;
}
{
word128* out128 = (word128*)out;
const word128 mask = mm128_broadcast_u64(TRANSPOSE_MASKS64[4]);
word128* out128 = (word128*)out;
const word128 mask = mm128_broadcast_u64(TRANSPOSE_MASKS64[4]);
for (uint32_t j = 0; j < nswaps; j++) {
for (uint32_t k = 0; k < width; k += 2) {
@ -541,16 +541,16 @@ void copyShares(shares_t* dst, shares_t* src) {
memcpy(dst->shares, src->shares, dst->numWords * sizeof(dst->shares[0]));
}
void mpc_matrix_mul_uint64_128(mzd_local_t* output, const mzd_local_t* vec, const mzd_local_t* matrix,
shares_t* mask_shares) {
const uint32_t rowstride = (128) / 8;
void mpc_matrix_mul_uint64_128(mzd_local_t* output, const mzd_local_t* vec,
const mzd_local_t* matrix, shares_t* mask_shares) {
const uint32_t rowstride = (128) / (sizeof(word) * 8);
shares_t* tmp_mask = allocateShares(mask_shares->numWords);
for (size_t i = 0; i < 128; i++) {
const uint64_t mask_share = mask_shares->shares[128 - 1 - i];
for (uint32_t j = 0; j < 128; j += 8) {
uint8_t matrix_byte = ((const uint8_t*)matrix->w64)[(i * rowstride) + (128 - 1 - j) / 8];
uint8_t matrix_byte = matrix->w64[(i * rowstride) + (128 - 1 - j) / 64] >> (56 - (j % 64));
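      // Extracting the byte by shifting the 64-bit word (instead of casting the
      // array to uint8_t*) is endianness-independent; 56 - (j % 64) walks the
      // bytes of each word from most significant to least.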
const block_t* mask1 = &block_masks[(matrix_byte >> 4) & 0xF];
const block_t* mask2 = &block_masks[(matrix_byte >> 0) & 0xF];
@ -572,16 +572,16 @@ void mpc_matrix_mul_uint64_128(mzd_local_t* output, const mzd_local_t* vec, cons
freeShares(tmp_mask);
}
void mpc_matrix_mul_uint64_192(mzd_local_t* output, const mzd_local_t* vec, const mzd_local_t* matrix,
shares_t* mask_shares) {
const uint32_t rowstride = (256) / 8;
void mpc_matrix_mul_uint64_192(mzd_local_t* output, const mzd_local_t* vec,
const mzd_local_t* matrix, shares_t* mask_shares) {
const uint32_t rowstride = (256) / (8 * sizeof(word));
shares_t* tmp_mask = allocateShares(mask_shares->numWords);
for (size_t i = 0; i < 192; i++) {
const uint64_t mask_share = mask_shares->shares[192 - 1 - i];
for (uint32_t j = 0; j < 192; j += 8) {
uint8_t matrix_byte = ((const uint8_t*)matrix->w64)[(i * rowstride) + (192 - 1 - j) / 8];
uint8_t matrix_byte = matrix->w64[(i * rowstride) + (192 - 1 - j) / 64] >> (56 - (j % 64));
const block_t* mask1 = &block_masks[(matrix_byte >> 4) & 0xF];
const block_t* mask2 = &block_masks[(matrix_byte >> 0) & 0xF];
@ -603,16 +603,16 @@ void mpc_matrix_mul_uint64_192(mzd_local_t* output, const mzd_local_t* vec, cons
copyShares(mask_shares, tmp_mask);
freeShares(tmp_mask);
}
void mpc_matrix_mul_uint64_256(mzd_local_t* output, const mzd_local_t* vec, const mzd_local_t* matrix,
shares_t* mask_shares) {
const uint32_t rowstride = (256) / 8;
void mpc_matrix_mul_uint64_256(mzd_local_t* output, const mzd_local_t* vec,
const mzd_local_t* matrix, shares_t* mask_shares) {
const uint32_t rowstride = (256) / (8 * sizeof(word));
shares_t* tmp_mask = allocateShares(mask_shares->numWords);
for (size_t i = 0; i < 256; i++) {
const uint64_t mask_share = mask_shares->shares[256 - 1 - i];
for (uint32_t j = 0; j < 256; j += 8) {
uint8_t matrix_byte = ((const uint8_t*)matrix->w64)[(i * rowstride) + (256 - 1 - j) / 8];
uint8_t matrix_byte = matrix->w64[(i * rowstride) + (256 - 1 - j) / 64] >> (56 - (j % 64));
const block_t* mask1 = &block_masks[(matrix_byte >> 4) & 0xF];
const block_t* mask2 = &block_masks[(matrix_byte >> 0) & 0xF];
@ -636,87 +636,91 @@ void mpc_matrix_mul_uint64_256(mzd_local_t* output, const mzd_local_t* vec, cons
}
#if defined(OPTIMIZED_LINEAR_LAYER_EVALUATION)
void mpc_matrix_mul_z_uint64_128(mzd_local_t* state2, const mzd_local_t* state, shares_t* mask2_shares,
const shares_t* mask_shares, const mzd_local_t* matrix) {
const uint32_t rowstride = (128) / 8;
void mpc_matrix_mul_z_uint64_128(mzd_local_t* state2, const mzd_local_t* state,
shares_t* mask2_shares, const shares_t* mask_shares,
const mzd_local_t* matrix) {
const uint32_t rowstride = (128) / (8 * sizeof(word));
memset(mask2_shares->shares, 0, sizeof(uint64_t) * 128);
for (size_t i = 0; i < 30; i++) {
uint64_t new_mask_i = 0;
for (uint32_t j = 0; j < 128 / 8; j++) {
uint8_t matrix_byte = ((const uint8_t*)matrix->w64)[i * rowstride + (128 / 8) - 1 - j];
for (uint32_t j = 0; j < 128; j += 8) {
uint8_t matrix_byte = matrix->w64[i * rowstride + (128 - 1 - j) / 64] >> (56 - (j % 64));
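      // j now advances in bit positions (steps of 8) rather than byte indices,
      // so the share offsets below use j directly instead of j * 8.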
const block_t* mask1 = &block_masks[(matrix_byte >> 4) & 0xF];
const block_t* mask2 = &block_masks[(matrix_byte >> 0) & 0xF];
new_mask_i ^= mask_shares->shares[j * 8 + 0] & mask1->w64[0];
new_mask_i ^= mask_shares->shares[j * 8 + 1] & mask1->w64[1];
new_mask_i ^= mask_shares->shares[j * 8 + 2] & mask1->w64[2];
new_mask_i ^= mask_shares->shares[j * 8 + 3] & mask1->w64[3];
new_mask_i ^= mask_shares->shares[j * 8 + 4] & mask2->w64[0];
new_mask_i ^= mask_shares->shares[j * 8 + 5] & mask2->w64[1];
new_mask_i ^= mask_shares->shares[j * 8 + 6] & mask2->w64[2];
new_mask_i ^= mask_shares->shares[j * 8 + 7] & mask2->w64[3];
new_mask_i ^= mask_shares->shares[j + 0] & mask1->w64[0];
new_mask_i ^= mask_shares->shares[j + 1] & mask1->w64[1];
new_mask_i ^= mask_shares->shares[j + 2] & mask1->w64[2];
new_mask_i ^= mask_shares->shares[j + 3] & mask1->w64[3];
new_mask_i ^= mask_shares->shares[j + 4] & mask2->w64[0];
new_mask_i ^= mask_shares->shares[j + 5] & mask2->w64[1];
new_mask_i ^= mask_shares->shares[j + 6] & mask2->w64[2];
new_mask_i ^= mask_shares->shares[j + 7] & mask2->w64[3];
}
mask2_shares->shares[30 - 1 - i] = new_mask_i;
}
mzd_mul_v_parity_uint64_128_30(state2, state, matrix);
}
void mpc_matrix_mul_z_uint64_192(mzd_local_t* state2, const mzd_local_t* state, shares_t* mask2_shares,
const shares_t* mask_shares, const mzd_local_t* matrix) {
const uint32_t rowstride = (256) / 8;
void mpc_matrix_mul_z_uint64_192(mzd_local_t* state2, const mzd_local_t* state,
shares_t* mask2_shares, const shares_t* mask_shares,
const mzd_local_t* matrix) {
const uint32_t rowstride = (256) / (8 * sizeof(word));
memset(mask2_shares->shares, 0, sizeof(uint64_t) * 192);
for (size_t i = 0; i < 30; i++) {
uint64_t new_mask_i = 0;
for (uint32_t j = 0; j < 192 / 8; j++) {
uint8_t matrix_byte = ((const uint8_t*)matrix->w64)[i * rowstride + (192 / 8) - 1 - j];
for (uint32_t j = 0; j < 192; j += 8) {
uint8_t matrix_byte = matrix->w64[i * rowstride + (192 - 1 - j) / 64] >> (56 - (j % 64));
const block_t* mask1 = &block_masks[(matrix_byte >> 4) & 0xF];
const block_t* mask2 = &block_masks[(matrix_byte >> 0) & 0xF];
new_mask_i ^= mask_shares->shares[j * 8 + 0] & mask1->w64[0];
new_mask_i ^= mask_shares->shares[j * 8 + 1] & mask1->w64[1];
new_mask_i ^= mask_shares->shares[j * 8 + 2] & mask1->w64[2];
new_mask_i ^= mask_shares->shares[j * 8 + 3] & mask1->w64[3];
new_mask_i ^= mask_shares->shares[j * 8 + 4] & mask2->w64[0];
new_mask_i ^= mask_shares->shares[j * 8 + 5] & mask2->w64[1];
new_mask_i ^= mask_shares->shares[j * 8 + 6] & mask2->w64[2];
new_mask_i ^= mask_shares->shares[j * 8 + 7] & mask2->w64[3];
new_mask_i ^= mask_shares->shares[j + 0] & mask1->w64[0];
new_mask_i ^= mask_shares->shares[j + 1] & mask1->w64[1];
new_mask_i ^= mask_shares->shares[j + 2] & mask1->w64[2];
new_mask_i ^= mask_shares->shares[j + 3] & mask1->w64[3];
new_mask_i ^= mask_shares->shares[j + 4] & mask2->w64[0];
new_mask_i ^= mask_shares->shares[j + 5] & mask2->w64[1];
new_mask_i ^= mask_shares->shares[j + 6] & mask2->w64[2];
new_mask_i ^= mask_shares->shares[j + 7] & mask2->w64[3];
}
mask2_shares->shares[30 - 1 - i] = new_mask_i;
}
mzd_mul_v_parity_uint64_192_30(state2, state, matrix);
}
void mpc_matrix_mul_z_uint64_256(mzd_local_t* state2, const mzd_local_t* state, shares_t* mask2_shares,
const shares_t* mask_shares, const mzd_local_t* matrix) {
const uint32_t rowstride = (256) / 8;
void mpc_matrix_mul_z_uint64_256(mzd_local_t* state2, const mzd_local_t* state,
shares_t* mask2_shares, const shares_t* mask_shares,
const mzd_local_t* matrix) {
const uint32_t rowstride = (256) / (8 * sizeof(word));
memset(mask2_shares->shares, 0, sizeof(uint64_t) * 256);
for (size_t i = 0; i < 30; i++) {
uint64_t new_mask_i = 0;
for (uint32_t j = 0; j < 256 / 8; j++) {
uint8_t matrix_byte = ((const uint8_t*)matrix->w64)[i * rowstride + (256 / 8) - 1 - j];
for (uint32_t j = 0; j < 256; j += 8) {
uint8_t matrix_byte = matrix->w64[i * rowstride + (256 - 1 - j) / 64] >> (56 - (j % 64));
const block_t* mask1 = &block_masks[(matrix_byte >> 4) & 0xF];
const block_t* mask2 = &block_masks[(matrix_byte >> 0) & 0xF];
new_mask_i ^= mask_shares->shares[j * 8 + 0] & mask1->w64[0];
new_mask_i ^= mask_shares->shares[j * 8 + 1] & mask1->w64[1];
new_mask_i ^= mask_shares->shares[j * 8 + 2] & mask1->w64[2];
new_mask_i ^= mask_shares->shares[j * 8 + 3] & mask1->w64[3];
new_mask_i ^= mask_shares->shares[j * 8 + 4] & mask2->w64[0];
new_mask_i ^= mask_shares->shares[j * 8 + 5] & mask2->w64[1];
new_mask_i ^= mask_shares->shares[j * 8 + 6] & mask2->w64[2];
new_mask_i ^= mask_shares->shares[j * 8 + 7] & mask2->w64[3];
new_mask_i ^= mask_shares->shares[j + 0] & mask1->w64[0];
new_mask_i ^= mask_shares->shares[j + 1] & mask1->w64[1];
new_mask_i ^= mask_shares->shares[j + 2] & mask1->w64[2];
new_mask_i ^= mask_shares->shares[j + 3] & mask1->w64[3];
new_mask_i ^= mask_shares->shares[j + 4] & mask2->w64[0];
new_mask_i ^= mask_shares->shares[j + 5] & mask2->w64[1];
new_mask_i ^= mask_shares->shares[j + 6] & mask2->w64[2];
new_mask_i ^= mask_shares->shares[j + 7] & mask2->w64[3];
}
mask2_shares->shares[30 - 1 - i] = new_mask_i;
}
mzd_mul_v_parity_uint64_256_30(state2, state, matrix);
}
void mpc_matrix_addmul_r_uint64_128(mzd_local_t* state2, const mzd_local_t* state, shares_t* mask2_shares,
shares_t* mask_shares, const mzd_local_t* matrix) {
const uint32_t rowstride = (128) / 8;
void mpc_matrix_addmul_r_uint64_128(mzd_local_t* state2, const mzd_local_t* state,
shares_t* mask2_shares, shares_t* mask_shares,
const mzd_local_t* matrix) {
const uint32_t rowstride = (128) / (8 * sizeof(word));
shares_t* tmp_mask = allocateShares(mask_shares->numWords);
copyShares(tmp_mask, mask2_shares);
@ -724,7 +728,7 @@ void mpc_matrix_addmul_r_uint64_128(mzd_local_t* state2, const mzd_local_t* stat
const uint64_t mask_share = mask_shares->shares[30 - 1 - i];
for (uint32_t j = 0; j < 128; j += 8) {
uint8_t matrix_byte = ((const uint8_t*)matrix->w64)[(i * rowstride) + (128 - 1 - j) / 8];
uint8_t matrix_byte = matrix->w64[(i * rowstride) + (128 - 1 - j) / 64] >> (56 - (j % 64));
const block_t* mask1 = &block_masks[(matrix_byte >> 4) & 0xF];
const block_t* mask2 = &block_masks[(matrix_byte >> 0) & 0xF];
@ -745,9 +749,10 @@ void mpc_matrix_addmul_r_uint64_128(mzd_local_t* state2, const mzd_local_t* stat
freeShares(tmp_mask);
}
void mpc_matrix_addmul_r_uint64_192(mzd_local_t* state2, const mzd_local_t* state, shares_t* mask2_shares,
shares_t* mask_shares, const mzd_local_t* matrix) {
const uint32_t rowstride = 256 / 8;
void mpc_matrix_addmul_r_uint64_192(mzd_local_t* state2, const mzd_local_t* state,
shares_t* mask2_shares, shares_t* mask_shares,
const mzd_local_t* matrix) {
const uint32_t rowstride = 256 / (8 * sizeof(word));
shares_t* tmp_mask = allocateShares(mask_shares->numWords);
copyShares(tmp_mask, mask2_shares);
@ -755,7 +760,7 @@ void mpc_matrix_addmul_r_uint64_192(mzd_local_t* state2, const mzd_local_t* stat
const uint64_t mask_share = mask_shares->shares[30 - 1 - i];
for (uint32_t j = 0; j < 192; j += 8) {
uint8_t matrix_byte = ((const uint8_t*)matrix->w64)[(i * rowstride) + (192 - 1 - j) / 8];
uint8_t matrix_byte = matrix->w64[(i * rowstride) + (192 - 1 - j) / 64] >> (56 - (j % 64));
const block_t* mask1 = &block_masks[(matrix_byte >> 4) & 0xF];
const block_t* mask2 = &block_masks[(matrix_byte >> 0) & 0xF];
@ -776,9 +781,10 @@ void mpc_matrix_addmul_r_uint64_192(mzd_local_t* state2, const mzd_local_t* stat
freeShares(tmp_mask);
}
void mpc_matrix_addmul_r_uint64_256(mzd_local_t* state2, const mzd_local_t* state, shares_t* mask2_shares,
shares_t* mask_shares, const mzd_local_t* matrix) {
const uint32_t rowstride = (256) / 8;
void mpc_matrix_addmul_r_uint64_256(mzd_local_t* state2, const mzd_local_t* state,
shares_t* mask2_shares, shares_t* mask_shares,
const mzd_local_t* matrix) {
const uint32_t rowstride = (256) / (8 * sizeof(word));
shares_t* tmp_mask = allocateShares(mask_shares->numWords);
copyShares(tmp_mask, mask2_shares);
@ -786,7 +792,7 @@ void mpc_matrix_addmul_r_uint64_256(mzd_local_t* state2, const mzd_local_t* stat
const uint64_t mask_share = mask_shares->shares[30 - 1 - i];
for (uint32_t j = 0; j < 256; j += 8) {
uint8_t matrix_byte = ((const uint8_t*)matrix->w64)[(i * rowstride) + (256 - 1 - j) / 8];
uint8_t matrix_byte = matrix->w64[(i * rowstride) + (256 - 1 - j) / 64] >> (56 - (j % 64));
const block_t* mask1 = &block_masks[(matrix_byte >> 4) & 0xF];
const block_t* mask2 = &block_masks[(matrix_byte >> 0) & 0xF];
@ -814,12 +820,12 @@ void mpc_matrix_mul_nl_part_uint64_128(mzd_local_t* nl_part, const mzd_local_t*
const mzd_local_t* precomputed_constant_nl,
shares_t* nl_part_masks, const shares_t* key_masks) {
const uint32_t rowstride = ((20 * 32 + 255) / 256 * 256) / 8;
const uint32_t rowstride = ((20 * 32 + 255) / 256 * 256) / (8 * sizeof(word));
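  // Rows of the 20 * 32-bit non-linear matrix are padded up to a multiple of
  // 256 bits; rowstride is now counted in 64-bit words rather than bytes.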
for (size_t i = 0; i < 128; i++) {
const uint64_t key_mask = key_masks->shares[128 - 1 - i];
for (uint32_t j = 0; j < 20 * 32; j += 8) {
uint8_t matrix_byte = ((const uint8_t*)precomputed_nl_matrix->w64)[i * rowstride + j / 8];
uint8_t matrix_byte = precomputed_nl_matrix->w64[(i * rowstride) + j / 64] >> (j % 64);
const block_t* mask1 = &nl_part_block_masks[(matrix_byte >> 0) & 0xF];
const block_t* mask2 = &nl_part_block_masks[(matrix_byte >> 4) & 0xF];
@ -843,12 +849,12 @@ void mpc_matrix_mul_nl_part_uint64_192(mzd_local_t* nl_part, const mzd_local_t*
const mzd_local_t* precomputed_constant_nl,
shares_t* nl_part_masks, const shares_t* key_masks) {
const uint32_t rowstride = ((30 * 32 + 255) / 256 * 256) / 8;
const uint32_t rowstride = ((30 * 32 + 255) / 256 * 256) / (8 * sizeof(word));
for (size_t i = 0; i < 192; i++) {
const uint64_t key_mask = key_masks->shares[192 - 1 - i];
for (uint32_t j = 0; j < 30 * 32; j += 8) {
uint8_t matrix_byte = ((const uint8_t*)precomputed_nl_matrix->w64)[i * rowstride + j / 8];
uint8_t matrix_byte = precomputed_nl_matrix->w64[(i * rowstride) + j / 64] >> (j % 64);
const block_t* mask1 = &nl_part_block_masks[(matrix_byte >> 0) & 0xF];
const block_t* mask2 = &nl_part_block_masks[(matrix_byte >> 4) & 0xF];
@ -872,12 +878,12 @@ void mpc_matrix_mul_nl_part_uint64_256(mzd_local_t* nl_part, const mzd_local_t*
const mzd_local_t* precomputed_constant_nl,
shares_t* nl_part_masks, const shares_t* key_masks) {
const uint32_t rowstride = ((38 * 32 + 255) / 256 * 256) / 8;
const uint32_t rowstride = ((38 * 32 + 255) / 256 * 256) / (8 * sizeof(word));
for (size_t i = 0; i < 256; i++) {
const uint64_t key_mask = key_masks->shares[256 - 1 - i];
for (uint32_t j = 0; j < 38 * 32; j += 8) {
uint8_t matrix_byte = ((uint8_t*)precomputed_nl_matrix->w64)[i * rowstride + j / 8];
uint8_t matrix_byte = precomputed_nl_matrix->w64[(i * rowstride) + j / 64] >> (j % 64);
const block_t* mask1 = &nl_part_block_masks[(matrix_byte >> 0) & 0xF];
const block_t* mask2 = &nl_part_block_masks[(matrix_byte >> 4) & 0xF];
@ -902,7 +908,7 @@ void mpc_matrix_mul_nl_part_uint64_256(mzd_local_t* nl_part, const mzd_local_t*
ATTR_TARGET_S128
void mpc_matrix_mul_s128_128(mzd_local_t* output, const mzd_local_t* vec, const mzd_local_t* matrix,
shares_t* mask_shares) {
const uint32_t rowstride = (128) / 8;
const uint32_t rowstride = (128) / (sizeof(word) * 8);
shares_t* tmp_mask = allocateShares(mask_shares->numWords);
for (size_t i = 0; i < 128; i++) {
@ -913,7 +919,7 @@ void mpc_matrix_mul_s128_128(mzd_local_t* output, const mzd_local_t* vec, const
word128* tmp_mask_block = (word128*)tmp_mask->shares;
for (uint32_t j = 0; j < 128; j += 8, tmp_mask_block += 4) {
uint8_t matrix_byte = ((const uint8_t*)matrix->w64)[(i * rowstride) + (128 - 1 - j) / 8];
uint8_t matrix_byte = matrix->w64[(i * rowstride) + (128 - 1 - j) / 64] >> (56 - (j % 64));
mask1 = block_masks[(matrix_byte >> 4) & 0xf].w128[0];
mask2 = block_masks[(matrix_byte >> 4) & 0xf].w128[1];
@ -935,7 +941,7 @@ void mpc_matrix_mul_s128_128(mzd_local_t* output, const mzd_local_t* vec, const
ATTR_TARGET_S128
void mpc_matrix_mul_s128_192(mzd_local_t* output, const mzd_local_t* vec, const mzd_local_t* matrix,
shares_t* mask_shares) {
const uint32_t rowstride = (256) / 8;
const uint32_t rowstride = (256) / (8 * sizeof(word));
shares_t* tmp_mask = allocateShares(mask_shares->numWords);
for (size_t i = 0; i < 192; i++) {
@ -946,7 +952,7 @@ void mpc_matrix_mul_s128_192(mzd_local_t* output, const mzd_local_t* vec, const
word128* tmp_mask_block = (word128*)tmp_mask->shares;
for (uint32_t j = 0; j < 192; j += 8, tmp_mask_block += 4) {
uint8_t matrix_byte = ((const uint8_t*)matrix->w64)[(i * rowstride) + (192 - 1 - j) / 8];
uint8_t matrix_byte = matrix->w64[(i * rowstride) + (192 - 1 - j) / 64] >> (56 - (j % 64));
mask1 = block_masks[(matrix_byte >> 4) & 0xf].w128[0];
mask2 = block_masks[(matrix_byte >> 4) & 0xf].w128[1];
@ -968,7 +974,7 @@ void mpc_matrix_mul_s128_192(mzd_local_t* output, const mzd_local_t* vec, const
ATTR_TARGET_S128
void mpc_matrix_mul_s128_256(mzd_local_t* output, const mzd_local_t* vec, const mzd_local_t* matrix,
shares_t* mask_shares) {
const uint32_t rowstride = (256) / 8;
const uint32_t rowstride = (256) / (8 * sizeof(word));
shares_t* tmp_mask = allocateShares(mask_shares->numWords);
for (size_t i = 0; i < 256; i++) {
@ -979,7 +985,7 @@ void mpc_matrix_mul_s128_256(mzd_local_t* output, const mzd_local_t* vec, const
word128* tmp_mask_block = (word128*)tmp_mask->shares;
for (uint32_t j = 0; j < 256; j += 8, tmp_mask_block += 4) {
uint8_t matrix_byte = ((const uint8_t*)matrix->w64)[(i * rowstride) + (256 - 1 - j) / 8];
uint8_t matrix_byte = matrix->w64[(i * rowstride) + (256 - 1 - j) / 64] >> (56 - (j % 64));
mask1 = block_masks[(matrix_byte >> 4) & 0xf].w128[0];
mask2 = block_masks[(matrix_byte >> 4) & 0xf].w128[1];
@ -1000,15 +1006,16 @@ void mpc_matrix_mul_s128_256(mzd_local_t* output, const mzd_local_t* vec, const
#if defined(OPTIMIZED_LINEAR_LAYER_EVALUATION)
ATTR_TARGET_S128
void mpc_matrix_mul_z_s128_128(mzd_local_t* state2, const mzd_local_t* state, shares_t* mask2_shares,
const shares_t* mask_shares, const mzd_local_t* matrix) {
const uint32_t rowstride = (128) / 8;
void mpc_matrix_mul_z_s128_128(mzd_local_t* state2, const mzd_local_t* state,
shares_t* mask2_shares, const shares_t* mask_shares,
const mzd_local_t* matrix) {
const uint32_t rowstride = (128) / (8 * sizeof(word));
memset(mask2_shares->shares, 0, sizeof(uint64_t) * 128);
for (size_t i = 0; i < 30; i++) {
block_t new_mask_i = {{0, 0, 0, 0}};
word128* tmp_mask_block = (word128*)mask_shares->shares;
for (uint32_t j = 0; j < 128 / 8; j++, tmp_mask_block += 4) {
uint8_t matrix_byte = ((const uint8_t*)matrix->w64)[i * rowstride + (128 / 8) - 1 - j];
for (uint32_t j = 0; j < 128; j += 8, tmp_mask_block += 4) {
uint8_t matrix_byte = matrix->w64[i * rowstride + (128 - 1 - j) / 64] >> (56 - (j % 64));
const block_t* mask1 = &block_masks[(matrix_byte >> 4) & 0xF];
const block_t* mask2 = &block_masks[(matrix_byte >> 0) & 0xF];
@ -1025,15 +1032,16 @@ void mpc_matrix_mul_z_s128_128(mzd_local_t* state2, const mzd_local_t* state, sh
}
ATTR_TARGET_S128
void mpc_matrix_mul_z_s128_192(mzd_local_t* state2, const mzd_local_t* state, shares_t* mask2_shares,
const shares_t* mask_shares, const mzd_local_t* matrix) {
const uint32_t rowstride = (256) / 8;
void mpc_matrix_mul_z_s128_192(mzd_local_t* state2, const mzd_local_t* state,
shares_t* mask2_shares, const shares_t* mask_shares,
const mzd_local_t* matrix) {
const uint32_t rowstride = (256) / (8 * sizeof(word));
memset(mask2_shares->shares, 0, sizeof(uint64_t) * 192);
for (size_t i = 0; i < 30; i++) {
block_t new_mask_i = {{0, 0, 0, 0}};
word128* tmp_mask_block = (word128*)mask_shares->shares;
for (uint32_t j = 0; j < 192 / 8; j++, tmp_mask_block += 4) {
uint8_t matrix_byte = ((const uint8_t*)matrix->w64)[i * rowstride + (192 / 8) - 1 - j];
for (uint32_t j = 0; j < 192; j += 8, tmp_mask_block += 4) {
uint8_t matrix_byte = matrix->w64[i * rowstride + (192 - 1 - j) / 64] >> (56 - (j % 64));
const block_t* mask1 = &block_masks[(matrix_byte >> 4) & 0xF];
const block_t* mask2 = &block_masks[(matrix_byte >> 0) & 0xF];
@ -1050,15 +1058,16 @@ void mpc_matrix_mul_z_s128_192(mzd_local_t* state2, const mzd_local_t* state, sh
}
ATTR_TARGET_S128
void mpc_matrix_mul_z_s128_256(mzd_local_t* state2, const mzd_local_t* state, shares_t* mask2_shares,
const shares_t* mask_shares, const mzd_local_t* matrix) {
const uint32_t rowstride = (256) / 8;
void mpc_matrix_mul_z_s128_256(mzd_local_t* state2, const mzd_local_t* state,
shares_t* mask2_shares, const shares_t* mask_shares,
const mzd_local_t* matrix) {
const uint32_t rowstride = (256) / (8 * sizeof(word));
memset(mask2_shares->shares, 0, sizeof(uint64_t) * 256);
for (size_t i = 0; i < 30; i++) {
block_t new_mask_i = {{0, 0, 0, 0}};
word128* tmp_mask_block = (word128*)mask_shares->shares;
for (uint32_t j = 0; j < 256 / 8; j++, tmp_mask_block += 4) {
uint8_t matrix_byte = ((const uint8_t*)matrix->w64)[i * rowstride + (256 / 8) - 1 - j];
for (uint32_t j = 0; j < 256; j += 8, tmp_mask_block += 4) {
uint8_t matrix_byte = matrix->w64[i * rowstride + (256 - 1 - j) / 64] >> (56 - (j % 64));
const block_t* mask1 = &block_masks[(matrix_byte >> 4) & 0xF];
const block_t* mask2 = &block_masks[(matrix_byte >> 0) & 0xF];
@ -1075,9 +1084,10 @@ void mpc_matrix_mul_z_s128_256(mzd_local_t* state2, const mzd_local_t* state, sh
}
ATTR_TARGET_S128
void mpc_matrix_addmul_r_s128_128(mzd_local_t* state2, const mzd_local_t* state, shares_t* mask2_shares,
shares_t* mask_shares, const mzd_local_t* matrix) {
const uint32_t rowstride = (128) / 8;
void mpc_matrix_addmul_r_s128_128(mzd_local_t* state2, const mzd_local_t* state,
shares_t* mask2_shares, shares_t* mask_shares,
const mzd_local_t* matrix) {
const uint32_t rowstride = (128) / (8 * sizeof(word));
shares_t* tmp_mask = allocateShares(mask_shares->numWords);
copyShares(tmp_mask, mask2_shares);
@ -1088,7 +1098,7 @@ void mpc_matrix_addmul_r_s128_128(mzd_local_t* state2, const mzd_local_t* state,
word128* tmp_mask_block = (word128*)tmp_mask->shares;
for (uint32_t j = 0; j < 128; j += 8, tmp_mask_block += 4) {
uint8_t matrix_byte = ((const uint8_t*)matrix->w64)[(i * rowstride) + (128 - 1 - j) / 8];
uint8_t matrix_byte = matrix->w64[(i * rowstride) + (128 - 1 - j) / 64] >> (56 - (j % 64));
mask1 = block_masks[(matrix_byte >> 4) & 0xf].w128[0];
mask2 = block_masks[(matrix_byte >> 4) & 0xf].w128[1];
@ -1108,9 +1118,10 @@ void mpc_matrix_addmul_r_s128_128(mzd_local_t* state2, const mzd_local_t* state,
}
ATTR_TARGET_S128
void mpc_matrix_addmul_r_s128_192(mzd_local_t* state2, const mzd_local_t* state, shares_t* mask2_shares,
shares_t* mask_shares, const mzd_local_t* matrix) {
const uint32_t rowstride = (256) / 8;
void mpc_matrix_addmul_r_s128_192(mzd_local_t* state2, const mzd_local_t* state,
shares_t* mask2_shares, shares_t* mask_shares,
const mzd_local_t* matrix) {
const uint32_t rowstride = 256 / (8 * sizeof(word));
shares_t* tmp_mask = allocateShares(mask_shares->numWords);
copyShares(tmp_mask, mask2_shares);
@ -1121,7 +1132,7 @@ void mpc_matrix_addmul_r_s128_192(mzd_local_t* state2, const mzd_local_t* state,
word128* tmp_mask_block = (word128*)tmp_mask->shares;
for (uint32_t j = 0; j < 192; j += 8, tmp_mask_block += 4) {
uint8_t matrix_byte = ((const uint8_t*)matrix)[(i * rowstride) + (192 - 1 - j) / 8];
uint8_t matrix_byte = matrix->w64[(i * rowstride) + (192 - 1 - j) / 64] >> (56 - (j % 64));
mask1 = block_masks[(matrix_byte >> 4) & 0xf].w128[0];
mask2 = block_masks[(matrix_byte >> 4) & 0xf].w128[1];
@ -1141,9 +1152,10 @@ void mpc_matrix_addmul_r_s128_192(mzd_local_t* state2, const mzd_local_t* state,
}
ATTR_TARGET_S128
void mpc_matrix_addmul_r_s128_256(mzd_local_t* state2, const mzd_local_t* state, shares_t* mask2_shares,
shares_t* mask_shares, const mzd_local_t* matrix) {
const uint32_t rowstride = (256) / 8;
void mpc_matrix_addmul_r_s128_256(mzd_local_t* state2, const mzd_local_t* state,
shares_t* mask2_shares, shares_t* mask_shares,
const mzd_local_t* matrix) {
const uint32_t rowstride = (256) / (8 * sizeof(word));
shares_t* tmp_mask = allocateShares(mask_shares->numWords);
copyShares(tmp_mask, mask2_shares);
@ -1154,7 +1166,7 @@ void mpc_matrix_addmul_r_s128_256(mzd_local_t* state2, const mzd_local_t* state,
word128* tmp_mask_block = (word128*)tmp_mask->shares;
for (uint32_t j = 0; j < 256; j += 8, tmp_mask_block += 4) {
uint8_t matrix_byte = ((const uint8_t*)matrix)[(i * rowstride) + (256 - 1 - j) / 8];
uint8_t matrix_byte = matrix->w64[(i * rowstride) + (256 - 1 - j) / 64] >> (56 - (j % 64));
mask1 = block_masks[(matrix_byte >> 4) & 0xf].w128[0];
mask2 = block_masks[(matrix_byte >> 4) & 0xf].w128[1];
@ -1181,7 +1193,7 @@ void mpc_matrix_mul_nl_part_s128_128(mzd_local_t* nl_part, const mzd_local_t* ke
const mzd_local_t* precomputed_constant_nl,
shares_t* nl_part_masks, const shares_t* key_masks) {
const uint32_t rowstride = ((20 * 32 + 255) / 256 * 256) / 8;
const uint32_t rowstride = ((20 * 32 + 255) / 256 * 256) / (8 * sizeof(word));
for (size_t i = 0; i < 128; i++) {
const uint64_t key_mask = key_masks->shares[128 - 1 - i];
const block_t mask_share2 = {{key_mask, key_mask, key_mask, key_mask}};
@ -1190,7 +1202,7 @@ void mpc_matrix_mul_nl_part_s128_128(mzd_local_t* nl_part, const mzd_local_t* ke
word128* tmp_mask_block = (word128*)nl_part_masks->shares;
for (uint32_t j = 0; j < 20 * 32; j += 8, tmp_mask_block += 4) {
uint8_t matrix_byte = ((uint8_t*)precomputed_nl_matrix->w64)[i * rowstride + j / 8];
uint8_t matrix_byte = precomputed_nl_matrix->w64[(i * rowstride) + j / 64] >> (j % 64);
mask1 = nl_part_block_masks[(matrix_byte >> 0) & 0xf].w128[0];
mask2 = nl_part_block_masks[(matrix_byte >> 0) & 0xf].w128[1];
@ -1213,7 +1225,7 @@ void mpc_matrix_mul_nl_part_s128_192(mzd_local_t* nl_part, const mzd_local_t* ke
const mzd_local_t* precomputed_constant_nl,
shares_t* nl_part_masks, const shares_t* key_masks) {
const uint32_t rowstride = ((30 * 32 + 255) / 256 * 256) / 8;
const uint32_t rowstride = ((30 * 32 + 255) / 256 * 256) / (8 * sizeof(word));
for (size_t i = 0; i < 192; i++) {
const uint64_t key_mask = key_masks->shares[192 - 1 - i];
const block_t mask_share2 = {{key_mask, key_mask, key_mask, key_mask}};
@ -1222,7 +1234,7 @@ void mpc_matrix_mul_nl_part_s128_192(mzd_local_t* nl_part, const mzd_local_t* ke
word128* tmp_mask_block = (word128*)nl_part_masks->shares;
for (uint32_t j = 0; j < 30 * 32; j += 8, tmp_mask_block += 4) {
uint8_t matrix_byte = ((uint8_t*)precomputed_nl_matrix->w64)[i * rowstride + j / 8];
uint8_t matrix_byte = precomputed_nl_matrix->w64[(i * rowstride) + j / 64] >> (j % 64);
mask1 = nl_part_block_masks[(matrix_byte >> 0) & 0xf].w128[0];
mask2 = nl_part_block_masks[(matrix_byte >> 0) & 0xf].w128[1];
@ -1245,7 +1257,7 @@ void mpc_matrix_mul_nl_part_s128_256(mzd_local_t* nl_part, const mzd_local_t* ke
const mzd_local_t* precomputed_constant_nl,
shares_t* nl_part_masks, const shares_t* key_masks) {
const uint32_t rowstride = ((38 * 32 + 255) / 256 * 256) / 8;
const uint32_t rowstride = ((38 * 32 + 255) / 256 * 256) / (8 * sizeof(word));
for (size_t i = 0; i < 256; i++) {
const uint64_t key_mask = key_masks->shares[256 - 1 - i];
const block_t mask_share2 = {{key_mask, key_mask, key_mask, key_mask}};
@ -1254,7 +1266,7 @@ void mpc_matrix_mul_nl_part_s128_256(mzd_local_t* nl_part, const mzd_local_t* ke
word128* tmp_mask_block = (word128*)nl_part_masks->shares;
for (uint32_t j = 0; j < 38 * 32; j += 8, tmp_mask_block += 4) {
uint8_t matrix_byte = ((uint8_t*)precomputed_nl_matrix->w64)[i * rowstride + j / 8];
uint8_t matrix_byte = precomputed_nl_matrix->w64[(i * rowstride) + j / 64] >> (j % 64);
mask1 = nl_part_block_masks[(matrix_byte >> 0) & 0xf].w128[0];
mask2 = nl_part_block_masks[(matrix_byte >> 0) & 0xf].w128[1];
@ -1277,7 +1289,7 @@ void mpc_matrix_mul_nl_part_s128_256(mzd_local_t* nl_part, const mzd_local_t* ke
ATTR_TARGET_AVX2
void mpc_matrix_mul_s256_128(mzd_local_t* output, const mzd_local_t* vec, const mzd_local_t* matrix,
shares_t* mask_shares) {
const uint32_t rowstride = (128) / 8;
const uint32_t rowstride = (128) / (sizeof(word) * 8);
shares_t* tmp_mask = allocateShares(mask_shares->numWords);
for (size_t i = 0; i < 128; i++) {
@ -1288,9 +1300,9 @@ void mpc_matrix_mul_s256_128(mzd_local_t* output, const mzd_local_t* vec, const
word256* tmp_mask_block = (word256*)tmp_mask->shares;
for (uint32_t j = 0; j < 128; j += 8, tmp_mask_block += 2) {
uint8_t matrix_byte = ((const uint8_t*)matrix)[(i * rowstride) + (128 - 1 - j) / 8];
mask1 = block_masks[(matrix_byte >> 4) & 0xf].w256;
mask2 = block_masks[(matrix_byte >> 0) & 0xf].w256;
uint8_t matrix_byte = matrix->w64[(i * rowstride) + (128 - 1 - j) / 64] >> (56 - (j % 64));
mask1 = block_masks[(matrix_byte >> 4) & 0xf].w256;
mask2 = block_masks[(matrix_byte >> 0) & 0xf].w256;
tmp_mask_block[0] = mm256_xor_mask(tmp_mask_block[0], mask_share2, mask1);
tmp_mask_block[1] = mm256_xor_mask(tmp_mask_block[1], mask_share2, mask2);
@ -1305,7 +1317,7 @@ void mpc_matrix_mul_s256_128(mzd_local_t* output, const mzd_local_t* vec, const
ATTR_TARGET_AVX2
void mpc_matrix_mul_s256_192(mzd_local_t* output, const mzd_local_t* vec, const mzd_local_t* matrix,
shares_t* mask_shares) {
const uint32_t rowstride = (256) / 8;
const uint32_t rowstride = (256) / (8 * sizeof(word));
shares_t* tmp_mask = allocateShares(mask_shares->numWords);
for (size_t i = 0; i < 192; i++) {
@ -1316,9 +1328,9 @@ void mpc_matrix_mul_s256_192(mzd_local_t* output, const mzd_local_t* vec, const
word256* tmp_mask_block = (word256*)tmp_mask->shares;
for (uint32_t j = 0; j < 192; j += 8, tmp_mask_block += 2) {
uint8_t matrix_byte = ((const uint8_t*)matrix->w64)[(i * rowstride) + (192 - 1 - j) / 8];
mask1 = block_masks[(matrix_byte >> 4) & 0xf].w256;
mask2 = block_masks[(matrix_byte >> 0) & 0xf].w256;
uint8_t matrix_byte = matrix->w64[(i * rowstride) + (192 - 1 - j) / 64] >> (56 - (j % 64));
mask1 = block_masks[(matrix_byte >> 4) & 0xf].w256;
mask2 = block_masks[(matrix_byte >> 0) & 0xf].w256;
tmp_mask_block[0] = mm256_xor_mask(tmp_mask_block[0], mask_share2, mask1);
tmp_mask_block[1] = mm256_xor_mask(tmp_mask_block[1], mask_share2, mask2);
@ -1333,7 +1345,7 @@ void mpc_matrix_mul_s256_192(mzd_local_t* output, const mzd_local_t* vec, const
ATTR_TARGET_AVX2
void mpc_matrix_mul_s256_256(mzd_local_t* output, const mzd_local_t* vec, const mzd_local_t* matrix,
shares_t* mask_shares) {
const uint32_t rowstride = (256) / 8;
const uint32_t rowstride = (256) / (8 * sizeof(word));
shares_t* tmp_mask = allocateShares(mask_shares->numWords);
for (size_t i = 0; i < 256; i++) {
@ -1344,9 +1356,9 @@ void mpc_matrix_mul_s256_256(mzd_local_t* output, const mzd_local_t* vec, const
word256* tmp_mask_block = (word256*)tmp_mask->shares;
for (uint32_t j = 0; j < 256; j += 8, tmp_mask_block += 2) {
uint8_t matrix_byte = ((const uint8_t*)matrix->w64)[(i * rowstride) + (256 - 1 - j) / 8];
mask1 = block_masks[(matrix_byte >> 4) & 0xf].w256;
mask2 = block_masks[(matrix_byte >> 0) & 0xf].w256;
uint8_t matrix_byte = matrix->w64[(i * rowstride) + (256 - 1 - j) / 64] >> (56 - (j % 64));
mask1 = block_masks[(matrix_byte >> 4) & 0xf].w256;
mask2 = block_masks[(matrix_byte >> 0) & 0xf].w256;
tmp_mask_block[0] = mm256_xor_mask(tmp_mask_block[0], mask_share2, mask1);
tmp_mask_block[1] = mm256_xor_mask(tmp_mask_block[1], mask_share2, mask2);
@ -1360,15 +1372,16 @@ void mpc_matrix_mul_s256_256(mzd_local_t* output, const mzd_local_t* vec, const
#if defined(OPTIMIZED_LINEAR_LAYER_EVALUATION)
ATTR_TARGET_AVX2
void mpc_matrix_mul_z_s256_128(mzd_local_t* state2, const mzd_local_t* state, shares_t* mask2_shares,
const shares_t* mask_shares, const mzd_local_t* matrix) {
const uint32_t rowstride = (128) / 8;
void mpc_matrix_mul_z_s256_128(mzd_local_t* state2, const mzd_local_t* state,
shares_t* mask2_shares, const shares_t* mask_shares,
const mzd_local_t* matrix) {
const uint32_t rowstride = (128) / (8 * sizeof(word));
memset(mask2_shares->shares, 0, sizeof(uint64_t) * 128);
for (size_t i = 0; i < 30; i++) {
block_t new_mask_i = {{0, 0, 0, 0}};
word256* tmp_mask_block = (word256*)mask_shares->shares;
for (uint32_t j = 0; j < 128 / 8; j++, tmp_mask_block += 2) {
uint8_t matrix_byte = ((const uint8_t*)matrix->w64)[i * rowstride + (128 / 8) - 1 - j];
for (uint32_t j = 0; j < 128; j += 8, tmp_mask_block += 2) {
uint8_t matrix_byte = matrix->w64[i * rowstride + (128 - 1 - j) / 64] >> (56 - (j % 64));
const block_t* mask1 = &block_masks[(matrix_byte >> 4) & 0xF];
const block_t* mask2 = &block_masks[(matrix_byte >> 0) & 0xF];
@ -1383,15 +1396,16 @@ void mpc_matrix_mul_z_s256_128(mzd_local_t* state2, const mzd_local_t* state, sh
}
ATTR_TARGET_AVX2
void mpc_matrix_mul_z_s256_192(mzd_local_t* state2, const mzd_local_t* state, shares_t* mask2_shares,
const shares_t* mask_shares, const mzd_local_t* matrix) {
const uint32_t rowstride = (256) / 8;
void mpc_matrix_mul_z_s256_192(mzd_local_t* state2, const mzd_local_t* state,
shares_t* mask2_shares, const shares_t* mask_shares,
const mzd_local_t* matrix) {
const uint32_t rowstride = (256) / (8 * sizeof(word));
memset(mask2_shares->shares, 0, sizeof(uint64_t) * 192);
for (size_t i = 0; i < 30; i++) {
block_t new_mask_i = {{0, 0, 0, 0}};
word256* tmp_mask_block = (word256*)mask_shares->shares;
for (uint32_t j = 0; j < 192 / 8; j++, tmp_mask_block += 2) {
uint8_t matrix_byte = ((const uint8_t*)matrix->w64)[i * rowstride + (192 / 8) - 1 - j];
for (uint32_t j = 0; j < 192; j += 8, tmp_mask_block += 2) {
uint8_t matrix_byte = matrix->w64[i * rowstride + (192 - 1 - j) / 64] >> (56 - (j % 64));
const block_t* mask1 = &block_masks[(matrix_byte >> 4) & 0xF];
const block_t* mask2 = &block_masks[(matrix_byte >> 0) & 0xF];
@ -1406,15 +1420,16 @@ void mpc_matrix_mul_z_s256_192(mzd_local_t* state2, const mzd_local_t* state, sh
}
ATTR_TARGET_AVX2
void mpc_matrix_mul_z_s256_256(mzd_local_t* state2, const mzd_local_t* state, shares_t* mask2_shares,
const shares_t* mask_shares, const mzd_local_t* matrix) {
const uint32_t rowstride = (256) / 8;
void mpc_matrix_mul_z_s256_256(mzd_local_t* state2, const mzd_local_t* state,
shares_t* mask2_shares, const shares_t* mask_shares,
const mzd_local_t* matrix) {
const uint32_t rowstride = (256) / (8 * sizeof(word));
memset(mask2_shares->shares, 0, sizeof(uint64_t) * 256);
for (size_t i = 0; i < 30; i++) {
block_t new_mask_i = {{0, 0, 0, 0}};
word256* tmp_mask_block = (word256*)mask_shares->shares;
for (uint32_t j = 0; j < 256 / 8; j++, tmp_mask_block += 2) {
uint8_t matrix_byte = ((const uint8_t*)matrix->w64)[i * rowstride + (256 / 8) - 1 - j];
for (uint32_t j = 0; j < 256; j += 8, tmp_mask_block += 2) {
uint8_t matrix_byte = matrix->w64[i * rowstride + (256 - 1 - j) / 64] >> (56 - (j % 64));
const block_t* mask1 = &block_masks[(matrix_byte >> 4) & 0xF];
const block_t* mask2 = &block_masks[(matrix_byte >> 0) & 0xF];
@ -1429,10 +1444,11 @@ void mpc_matrix_mul_z_s256_256(mzd_local_t* state2, const mzd_local_t* state, sh
}
ATTR_TARGET_AVX2
void mpc_matrix_addmul_r_s256_128(mzd_local_t* state2, const mzd_local_t* state, shares_t* mask2_shares,
shares_t* mask_shares, const mzd_local_t* matrix) {
void mpc_matrix_addmul_r_s256_128(mzd_local_t* state2, const mzd_local_t* state,
shares_t* mask2_shares, shares_t* mask_shares,
const mzd_local_t* matrix) {
const uint32_t rowstride = (128) / 8;
const uint32_t rowstride = (128) / (8 * sizeof(word));
shares_t* tmp_mask = allocateShares(mask_shares->numWords);
copyShares(tmp_mask, mask2_shares);
@ -1443,7 +1459,7 @@ void mpc_matrix_addmul_r_s256_128(mzd_local_t* state2, const mzd_local_t* state,
word256* tmp_mask_block = (word256*)tmp_mask->shares;
for (uint32_t j = 0; j < 128; j += 8, tmp_mask_block += 2) {
uint8_t matrix_byte = ((const uint8_t*)matrix->w64)[(i * rowstride) + (128 - 1 - j) / 8];
uint8_t matrix_byte = matrix->w64[(i * rowstride) + (128 - 1 - j) / 64] >> (56 - (j % 64));
mask1 = block_masks[(matrix_byte >> 4) & 0xf].w256;
mask2 = block_masks[(matrix_byte >> 0) & 0xf].w256;
@ -1458,9 +1474,10 @@ void mpc_matrix_addmul_r_s256_128(mzd_local_t* state2, const mzd_local_t* state,
}
ATTR_TARGET_AVX2
void mpc_matrix_addmul_r_s256_192(mzd_local_t* state2, const mzd_local_t* state, shares_t* mask2_shares,
shares_t* mask_shares, const mzd_local_t* matrix) {
const uint32_t rowstride = (256) / 8;
void mpc_matrix_addmul_r_s256_192(mzd_local_t* state2, const mzd_local_t* state,
shares_t* mask2_shares, shares_t* mask_shares,
const mzd_local_t* matrix) {
const uint32_t rowstride = 256 / (8 * sizeof(word));
shares_t* tmp_mask = allocateShares(mask_shares->numWords);
copyShares(tmp_mask, mask2_shares);
@ -1471,7 +1488,7 @@ void mpc_matrix_addmul_r_s256_192(mzd_local_t* state2, const mzd_local_t* state,
word256* tmp_mask_block = (word256*)tmp_mask->shares;
for (uint32_t j = 0; j < 192; j += 8, tmp_mask_block += 2) {
uint8_t matrix_byte = ((const uint8_t*)matrix->w64)[(i * rowstride) + (192 - 1 - j) / 8];
uint8_t matrix_byte = matrix->w64[(i * rowstride) + (192 - 1 - j) / 64] >> (56 - (j % 64));
mask1 = block_masks[(matrix_byte >> 4) & 0xf].w256;
mask2 = block_masks[(matrix_byte >> 0) & 0xf].w256;
@ -1487,9 +1504,10 @@ void mpc_matrix_addmul_r_s256_192(mzd_local_t* state2, const mzd_local_t* state,
}
ATTR_TARGET_AVX2
void mpc_matrix_addmul_r_s256_256(mzd_local_t* state2, const mzd_local_t* state, shares_t* mask2_shares,
shares_t* mask_shares, const mzd_local_t* matrix) {
const uint32_t rowstride = (256) / 8;
void mpc_matrix_addmul_r_s256_256(mzd_local_t* state2, const mzd_local_t* state,
shares_t* mask2_shares, shares_t* mask_shares,
const mzd_local_t* matrix) {
const uint32_t rowstride = (256) / (8 * sizeof(word));
shares_t* tmp_mask = allocateShares(mask_shares->numWords);
copyShares(tmp_mask, mask2_shares);
@ -1500,7 +1518,7 @@ void mpc_matrix_addmul_r_s256_256(mzd_local_t* state2, const mzd_local_t* state,
word256* tmp_mask_block = (word256*)tmp_mask->shares;
for (uint32_t j = 0; j < 256; j += 8, tmp_mask_block += 2) {
uint8_t matrix_byte = ((const uint8_t*)matrix->w64)[(i * rowstride) + (256 - 1 - j) / 8];
uint8_t matrix_byte = matrix->w64[(i * rowstride) + (256 - 1 - j) / 64] >> (56 - (j % 64));
mask1 = block_masks[(matrix_byte >> 4) & 0xf].w256;
mask2 = block_masks[(matrix_byte >> 0) & 0xf].w256;
@ -1523,7 +1541,7 @@ void mpc_matrix_mul_nl_part_s256_128(mzd_local_t* nl_part, const mzd_local_t* ke
const mzd_local_t* precomputed_constant_nl,
shares_t* nl_part_masks, const shares_t* key_masks) {
const uint32_t rowstride = ((20 * 32 + 255) / 256 * 256) / 8;
const uint32_t rowstride = ((20 * 32 + 255) / 256 * 256) / (8 * sizeof(word));
for (size_t i = 0; i < 128; i++) {
const uint64_t key_mask = key_masks->shares[128 - 1 - i];
const word256 mask_share2 = _mm256_set1_epi64x(key_mask);
@ -1532,7 +1550,7 @@ void mpc_matrix_mul_nl_part_s256_128(mzd_local_t* nl_part, const mzd_local_t* ke
word256* tmp_mask_block = (word256*)nl_part_masks->shares;
for (uint32_t j = 0; j < 20 * 32; j += 8, tmp_mask_block += 2) {
uint8_t matrix_byte = ((uint8_t*)precomputed_nl_matrix->w64)[i * rowstride + j / 8];
uint8_t matrix_byte = precomputed_nl_matrix->w64[(i * rowstride) + j / 64] >> (j % 64);
mask1 = nl_part_block_masks[(matrix_byte >> 0) & 0xf].w256;
mask2 = nl_part_block_masks[(matrix_byte >> 4) & 0xf].w256;
@ -1551,7 +1569,7 @@ void mpc_matrix_mul_nl_part_s256_192(mzd_local_t* nl_part, const mzd_local_t* ke
const mzd_local_t* precomputed_constant_nl,
shares_t* nl_part_masks, const shares_t* key_masks) {
const uint32_t rowstride = ((30 * 32 + 255) / 256 * 256) / 8;
const uint32_t rowstride = ((30 * 32 + 255) / 256 * 256) / (8 * sizeof(word));
for (size_t i = 0; i < 192; i++) {
const uint64_t key_mask = key_masks->shares[192 - 1 - i];
const word256 mask_share2 = _mm256_set1_epi64x(key_mask);
@ -1560,7 +1578,7 @@ void mpc_matrix_mul_nl_part_s256_192(mzd_local_t* nl_part, const mzd_local_t* ke
word256* tmp_mask_block = (word256*)nl_part_masks->shares;
for (uint32_t j = 0; j < 30 * 32; j += 8, tmp_mask_block += 2) {
uint8_t matrix_byte = ((uint8_t*)precomputed_nl_matrix->w64)[i * rowstride + j / 8];
uint8_t matrix_byte = precomputed_nl_matrix->w64[(i * rowstride) + j / 64] >> (j % 64);
mask1 = nl_part_block_masks[(matrix_byte >> 0) & 0xf].w256;
mask2 = nl_part_block_masks[(matrix_byte >> 4) & 0xf].w256;
@ -1578,7 +1596,7 @@ void mpc_matrix_mul_nl_part_s256_256(mzd_local_t* nl_part, const mzd_local_t* ke
const mzd_local_t* precomputed_nl_matrix,
const mzd_local_t* precomputed_constant_nl,
shares_t* nl_part_masks, const shares_t* key_masks) {
const uint32_t rowstride = ((38 * 32 + 255) / 256 * 256) / 8;
const uint32_t rowstride = ((38 * 32 + 255) / 256 * 256) / (8 * sizeof(word));
for (size_t i = 0; i < 256; i++) {
const uint64_t key_mask = key_masks->shares[256 - 1 - i];
const word256 mask_share2 = _mm256_set1_epi64x(key_mask);
@ -1587,7 +1605,7 @@ void mpc_matrix_mul_nl_part_s256_256(mzd_local_t* nl_part, const mzd_local_t* ke
word256* tmp_mask_block = (word256*)nl_part_masks->shares;
for (uint32_t j = 0; j < 38 * 32; j += 8, tmp_mask_block += 2) {
uint8_t matrix_byte = ((uint8_t*)precomputed_nl_matrix->w64)[i * rowstride + j / 8];
uint8_t matrix_byte = precomputed_nl_matrix->w64[(i * rowstride) + j / 64] >> (j % 64);
mask1 = nl_part_block_masks[(matrix_byte >> 0) & 0xf].w256;
mask2 = nl_part_block_masks[(matrix_byte >> 4) & 0xf].w256;
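Reviewer note: every hunk in this file makes the same two changes — matrix bytes are now read by shifting 64-bit words (w64[... / 64] >> ...) instead of casting to uint8_t*, which makes the byte order explicit on big-endian targets, and each byte is still split into two nibbles indexing a table of precomputed XOR masks. A minimal scalar sketch of that technique (illustrative only; masks, acc1/acc2, and plain uint64_t stand in for the block_masks table and the 256-bit share blocks):
#include <stdint.h>
/* Extract the byte at bit offset j (a multiple of 8, big-endian bit order)
 * from an array of 64-bit words, matching the new lines above. */
static uint8_t matrix_byte_at(const uint64_t* w64, uint32_t j) {
  return (uint8_t)(w64[j / 64] >> (56 - (j % 64)));
}
/* Two 4-bit table lookups replace eight conditional XORs per matrix byte. */
static void accumulate_masks(uint64_t* acc1, uint64_t* acc2, uint8_t byte,
                             const uint64_t masks[16]) {
  *acc1 ^= masks[(byte >> 4) & 0xf];
  *acc2 ^= masks[byte & 0xf];
}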

View File

@ -293,14 +293,6 @@ static uint64_t uint64_from_bitstream_10(bitstream_t* bs) {
return ((uint64_t)bitstream_get_bits_32(bs, 30)) << (64 - 30);
}
static void uint64_to_bitstream_1(bitstream_t* bs, const uint64_t v) {
bitstream_put_bits_8(bs, v >> (64 - 3), 3);
}
static uint64_t uint64_from_bitstream_1(bitstream_t* bs) {
return ((uint64_t)bitstream_get_bits_8(bs, 3)) << (64 - 3);
}
static void compress_view(uint8_t* dst, const picnic_instance_t* pp, const view_t* views,
const unsigned int idx) {
const size_t num_views = pp->lowmc->r;
@ -310,14 +302,8 @@ static void compress_view(uint8_t* dst, const picnic_instance_t* pp, const view_
bs.position = 0;
const view_t* v = &views[0];
if (pp->lowmc->m == 10) {
for (size_t i = 0; i < num_views; ++i, ++v) {
uint64_to_bitstream_10(&bs, v->t[idx]);
}
} else if (pp->lowmc->m == 1) {
for (size_t i = 0; i < num_views; ++i, ++v) {
uint64_to_bitstream_1(&bs, v->t[idx]);
}
for (size_t i = 0; i < num_views; ++i, ++v) {
uint64_to_bitstream_10(&bs, v->t[idx]);
}
}
@ -330,14 +316,8 @@ static void decompress_view(view_t* views, const picnic_instance_t* pp, const ui
bs.position = 0;
view_t* v = &views[0];
if (pp->lowmc->m == 10) {
for (size_t i = 0; i < num_views; ++i, ++v) {
v->t[idx] = uint64_from_bitstream_10(&bs);
}
} else if (pp->lowmc->m == 1) {
for (size_t i = 0; i < num_views; ++i, ++v) {
v->t[idx] = uint64_from_bitstream_1(&bs);
}
for (size_t i = 0; i < num_views; ++i, ++v) {
v->t[idx] = uint64_from_bitstream_10(&bs);
}
}
@ -350,15 +330,8 @@ static void decompress_random_tape(rvec_t* rvec, const picnic_instance_t* pp, co
bs.position = 0;
rvec_t* rv = &rvec[0];
if (pp->lowmc->m == 10) {
for (size_t i = 0; i < num_views; ++i, ++rv) {
rv->t[idx] = uint64_from_bitstream_10(&bs);
}
} else if (pp->lowmc->m == 1) {
for (size_t i = 0; i < num_views; ++i, ++rv) {
rv->t[idx] = uint64_from_bitstream_1(&bs);
}
for (size_t i = 0; i < num_views; ++i, ++rv) {
rv->t[idx] = uint64_from_bitstream_10(&bs);
}
}
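Reviewer note: with the m=1 paths deleted, every (de)compression loop packs a fixed 30 bits per round (3 bits per Sbox × 10 Sboxes), taken from the top of each 64-bit view word — the removed variants packed only 3. A hypothetical standalone sketch of the retained packing, where put_bits is an illustrative stand-in for bitstream_put_bits_32:
#include <stddef.h>
#include <stdint.h>
/* Hypothetical MSB-first bit writer into a zeroed byte buffer. */
typedef struct {
  uint8_t* buf;
  size_t position; /* in bits */
} bitwriter_t;
static void put_bits(bitwriter_t* bw, uint32_t v, unsigned count) {
  for (unsigned i = count; i-- > 0; bw->position++) {
    bw->buf[bw->position / 8] |=
        (uint8_t)(((v >> i) & 1) << (7 - (bw->position % 8)));
  }
}
/* Only the top 30 bits of each view word carry Sbox data, mirroring
 * uint64_to_bitstream_10 in the retained code path. */
static void pack_views_m10(bitwriter_t* bw, const uint64_t* views, size_t rounds) {
  for (size_t i = 0; i < rounds; i++) {
    put_bits(bw, (uint32_t)(views[i] >> (64 - 30)), 30);
  }
}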

View File

@ -43,26 +43,6 @@ const uint8_t HASH_PREFIX_5 = 5;
#define LOWMC_L5_OR_NULL NULL
#endif
// L1, L3, and L5 lowmc instances with 1 SBOX
#if defined(WITH_LOWMC_128_128_182)
#include "lowmc_128_128_182.h"
#define LOWMC_L1_1_OR_NULL &lowmc_128_128_182
#else
#define LOWMC_L1_1_OR_NULL NULL
#endif
#if defined(WITH_LOWMC_192_192_284)
#include "lowmc_192_192_284.h"
#define LOWMC_L3_1_OR_NULL &lowmc_192_192_284
#else
#define LOWMC_L3_1_OR_NULL NULL
#endif
#if defined(WITH_LOWMC_256_256_363)
#include "lowmc_256_256_363.h"
#define LOWMC_L5_1_OR_NULL &lowmc_256_256_363
#else
#define LOWMC_L5_1_OR_NULL NULL
#endif
#if defined(WITH_ZKBPP)
#define ENABLE_ZKBPP(x) x
#else
@ -81,9 +61,11 @@ const uint8_t HASH_PREFIX_5 = 5;
#elif defined(WITH_ZKBPP)
#define NULL_FNS \
{ NULL, NULL, NULL, NULL, NULL }
#else
#elif defined(WITH_KKW)
#define NULL_FNS \
{ NULL }
{ NULL, NULL, NULL }
#else
#error "At least one of WITH_ZKBPP and WITH_KKW have to be defined!"
#endif
static picnic_instance_t instances[PARAMETER_SET_MAX_INDEX] = {
@ -107,19 +89,7 @@ static picnic_instance_t instances[PARAMETER_SET_MAX_INDEX] = {
PICNIC_SIGNATURE_SIZE_Picnic2_L3_FS, Picnic2_L3_FS, TRANSFORM_FS, NULL_FNS},
{ENABLE_KKW(LOWMC_L5_OR_NULL), 64, 32, 803, 50, 64, 32, 32, 143, 30, 110, 0, 0,
PICNIC_SIGNATURE_SIZE_Picnic2_L5_FS, Picnic2_L5_FS, TRANSFORM_FS, NULL_FNS},
// Picnic with LowMC with m=1
{ENABLE_ZKBPP(LOWMC_L1_1_OR_NULL), 32, 16, 219, 219, 3, 16, 16, 69, 3, 55, 0, 0,
PICNIC_SIGNATURE_SIZE_Picnic_L1_1_FS, Picnic_L1_1_FS, TRANSFORM_FS, NULL_FNS},
{ENABLE_ZKBPP(LOWMC_L1_1_OR_NULL), 32, 16, 219, 219, 3, 16, 16, 69, 3, 55, 87, 103,
PICNIC_SIGNATURE_SIZE_Picnic_L1_1_UR, Picnic_L1_1_UR, TRANSFORM_UR, NULL_FNS},
{ENABLE_ZKBPP(LOWMC_L3_1_OR_NULL), 48, 24, 329, 329, 3, 24, 24, 107, 3, 83, 0, 0,
PICNIC_SIGNATURE_SIZE_Picnic_L3_1_FS, Picnic_L3_1_FS, TRANSFORM_FS, NULL_FNS},
{ENABLE_ZKBPP(LOWMC_L3_1_OR_NULL), 48, 24, 329, 329, 3, 24, 24, 107, 3, 83, 131, 155,
PICNIC_SIGNATURE_SIZE_Picnic_L3_1_UR, Picnic_L3_1_UR, TRANSFORM_UR, NULL_FNS},
{ENABLE_ZKBPP(LOWMC_L5_1_OR_NULL), 64, 32, 438, 438, 3, 32, 32, 137, 3, 110, 0, 0,
PICNIC_SIGNATURE_SIZE_Picnic_L5_1_FS, Picnic_L5_1_FS, TRANSFORM_FS, NULL_FNS},
{ENABLE_ZKBPP(LOWMC_L5_1_OR_NULL), 64, 32, 438, 438, 3, 32, 32, 137, 3, 110, 169, 201,
PICNIC_SIGNATURE_SIZE_Picnic_L5_1_UR, Picnic_L5_1_UR, TRANSFORM_UR, NULL_FNS}};
};
static bool instance_initialized[PARAMETER_SET_MAX_INDEX];
static bool create_instance(picnic_instance_t* pp) {

View File

@ -22,8 +22,6 @@
#include "picnic.h"
#define SALT_SIZE 32
#define MAX_LOWMC_ROUNDS 38
#define MAX_LOWMC_SBOXES 10
#define MAX_DIGEST_SIZE 64
#define MAX_NUM_ROUNDS 438
#define MAX_VIEW_SIZE 143

View File

@ -0,0 +1,159 @@
/*
* This file is part of the optimized implementation of the Picnic signature scheme.
* See the accompanying documentation for complete details.
*
* The code is provided under the MIT license, see LICENSE for
* more details.
* SPDX-License-Identifier: MIT
*/
#ifndef SHA3_S390_CPACF_H
#define SHA3_S390_CPACF_H
#include <string.h>
#include <stdint.h>
#include "macros.h"
typedef struct {
uint8_t ctx[200]; /* param block */
uint8_t data_block[168]; /* buffer for input and output data */
uint8_t func; /* function: SHAKE128 or SHAKE256 */
uint8_t data_block_size; /* block size */
uint8_t pos; /* current position in data_block */
} hash_context ATTR_ALIGNED(32);
static inline void hash_init(hash_context* ctx, size_t digest_size) {
memset(ctx->ctx, 0, sizeof(ctx->ctx));
memset(ctx->data_block, 0, sizeof(ctx->data_block));
if (digest_size == 32) {
/* SHAKE128 */
ctx->func = 0x24;
ctx->data_block_size = 168;
} else {
/* SHAKE256 */
ctx->func = 0x25;
ctx->data_block_size = 136;
}
ctx->pos = 0;
}
/**
* Perform KIMD instruction (hash multiple blocks of 168 (SHAKE128) or 136 (SHAKE256) bytes)
*/
static inline void hash_update_kimd(hash_context* ctx, const uint8_t* data, size_t size) {
/* function code in GR 0 */
register long func asm("0") = ctx->func;
/* param block in GR 1 */
register uint8_t* param asm("1") = ctx->ctx;
/* input data in an even numbered GR (goes into R2) */
register const uint8_t* src asm("2") = data;
/* size of input data (goes into R2+1); needs to be a multiple of the data block size */
register size_t src_size asm("3") = size;
asm volatile("0: .insn rre,0xb93e0000,0,%[src]\n\t" /* KIMD opcode */
" brc 1,0b\n\t" /* handle partial completion */
: [src] "+a"(src), "+d"(src_size)
: "d"(func), "a"(param)
: "cc", "memory");
}
/**
* Perform KLMD instruction (hash and pad the last block of < 168 (SHAKE128) or 136 (SHAKE256) bytes
* and produce XOF output of a block size)
*/
static inline void hash_update_klmd(hash_context* ctx, uint8_t* buffer, size_t buffer_size, const uint8_t* data, size_t size) {
/* function code in GR 0 */
register long func asm("0") = ctx->func;
/* param block in GR 1 */
register uint8_t* param asm("1") = ctx->ctx;
/* input data in an even numbered GR (goes into R2) */
register const uint8_t* src asm("2") = data;
/* size of input data (goes into R2+1) */
register size_t src_size asm("3") = size;
/* output buffer in an even numbered GR (goes into R1) */
register unsigned char* dst asm("4") = buffer;
/* size of output buffer (goes into R1+1); needs to be a multiple of the data block size */
register long dst_size asm("5") = buffer_size;
asm volatile("0: .insn rre,0xb93f0000,%[dst],%[src]\n\t" /* KLMD opcode */
" brc 1,0b\n\t" /* handle partial completion */
: [src] "+a"(src), "+d"(src_size), [dst] "+a"(dst), "+d"(dst_size)
: "d"(func), "a"(param)
: "cc", "memory");
}
/**
* Perform KLMD instruction (produce XOF output of a block size)
*/
static inline void hash_squeeze_klmd(hash_context* ctx, uint8_t* buffer, size_t size) {
hash_update_klmd(ctx, buffer, size, NULL, 0);
}
static inline void hash_update(hash_context* ctx, const uint8_t* data, size_t size) {
/* process buffered data */
if (ctx->pos) {
const size_t gap = ctx->data_block_size - ctx->pos;
const size_t copy_size = MIN(gap, size);
memcpy(ctx->data_block + ctx->pos, data, copy_size);
ctx->pos += copy_size;
data += copy_size;
size -= copy_size;
if (ctx->pos == ctx->data_block_size) {
hash_update_kimd(ctx, ctx->data_block, ctx->data_block_size);
ctx->pos = 0;
}
}
/* process as many full blocks as possible */
if (size > ctx->data_block_size) {
const size_t copy_size = size - (size % ctx->data_block_size);
hash_update_kimd(ctx, data, copy_size);
data += copy_size;
size -= copy_size;
}
/* buffer remaining data */
if (size) {
memcpy(ctx->data_block, data, size);
ctx->pos = size;
}
}
static inline void hash_final(hash_context* ctx) {
/* process remaining input (if available) */
hash_update_klmd(ctx, ctx->data_block, ctx->data_block_size, ctx->data_block, ctx->pos);
ctx->pos = 0;
}
static inline void hash_squeeze(hash_context* ctx, uint8_t* buffer, size_t buflen) {
/* process buffered output */
if (ctx->pos < ctx->data_block_size) {
const size_t gap = ctx->data_block_size - ctx->pos;
const size_t copy_size = MIN(gap, buflen);
memcpy(buffer, ctx->data_block + ctx->pos, copy_size);
ctx->pos += copy_size;
buffer += copy_size;
buflen -= copy_size;
}
/* either ctx->pos == ctx->data_block_size or buflen == 0 */
/* process as many full blocks as possible */
if (buflen > ctx->data_block_size) {
const size_t copy_size = buflen - (buflen % ctx->data_block_size);
hash_squeeze_klmd(ctx, buffer, copy_size);
buffer += copy_size;
buflen -= copy_size;
}
if (buflen) {
hash_squeeze_klmd(ctx, ctx->data_block, ctx->data_block_size);
memcpy(buffer, ctx->data_block, buflen);
ctx->pos = buflen;
}
}
#endif
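Reviewer note: the four inlines above compose into the usual init/update/final/squeeze XOF flow. A minimal usage sketch (hypothetical caller; s390x-only, since the CPACF asm has no fallback, and it assumes this header is included; per hash_init, a 32-byte digest size selects SHAKE128):
#include <stddef.h>
#include <stdint.h>
static void shake128_cpacf(uint8_t* out, size_t outlen,
                           const uint8_t* msg, size_t msglen) {
  hash_context ctx;
  hash_init(&ctx, 32);             /* digest_size 32 => SHAKE128 (func 0x24) */
  hash_update(&ctx, msg, msglen);  /* KIMD full blocks, buffer the tail */
  hash_final(&ctx);                /* KLMD pads and absorbs the remainder */
  hash_squeeze(&ctx, out, outlen); /* stream arbitrary-length output */
}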

View File

@ -28,10 +28,14 @@
#if defined(__x86_64__) || defined(_M_X64) || defined(__i386__) || defined(_M_IX86)
#if defined(BUILTIN_CPU_SUPPORTED)
#if !defined(BUILTIN_CPU_SUPPORTED_BROKEN_BMI2)
#define CPU_SUPPORTS_AVX2 (__builtin_cpu_supports("avx2") && __builtin_cpu_supports("bmi2"))
#else
#define CPU_SUPPORTS_AVX2 (__builtin_cpu_supports("avx2") && cpu_supports(CPU_CAP_BMI2))
#endif
#define CPU_SUPPORTS_POPCNT __builtin_cpu_supports("popcnt")
#else
#define CPU_SUPPORTS_AVX2 cpu_supports(CPU_CAP_AVX2)
#define CPU_SUPPORTS_AVX2 cpu_supports(CPU_CAP_AVX2 | CPU_CAP_BMI2)
#define CPU_SUPPORTS_POPCNT cpu_supports(CPU_CAP_POPCNT)
#endif
#endif
@ -39,6 +43,9 @@
#if defined(__x86_64__) || defined(_M_X64)
// X86-64 CPUs always support SSE2
#define CPU_SUPPORTS_SSE2 1
#if defined(WITH_SSE2) || defined(WITH_AVX2)
#define NO_UINT64_FALLBACK
#endif
#elif defined(__i386__) || defined(_M_IX86)
#if defined(BUILTIN_CPU_SUPPORTED)
#define CPU_SUPPORTS_SSE2 __builtin_cpu_supports("sse2")
@ -51,6 +58,9 @@
#if defined(__aarch64__)
#define CPU_SUPPORTS_NEON 1
#if defined(WITH_NEON)
#define NO_UINT64_FALLBACK
#endif
#elif defined(__arm__)
#define CPU_SUPPORTS_NEON cpu_supports(CPU_CAP_NEON)
#else
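Reviewer note: these macros feed runtime dispatch between the vectorized and uint64 implementations, and NO_UINT64_FALLBACK additionally compiles the portable path out where the target guarantees the vector instructions. A hedged sketch of that dispatch shape, with impl_avx2/impl_sse2/impl_uint64 as hypothetical stand-ins:
typedef void (*impl_fn)(void);
static void impl_avx2(void);
static void impl_sse2(void);
static void impl_uint64(void);
static impl_fn select_impl(void) {
#if defined(WITH_AVX2)
  if (CPU_SUPPORTS_AVX2) { /* now requires BMI2 as well, per the change above */
    return impl_avx2;
  }
#endif
#if defined(WITH_SSE2)
  if (CPU_SUPPORTS_SSE2) {
    return impl_sse2;
  }
#endif
#if !defined(NO_UINT64_FALLBACK)
  return impl_uint64;
#else
  /* unreachable: NO_UINT64_FALLBACK is only defined when a vector
   * path above is guaranteed to be taken */
  return 0;
#endif
}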

View File

@ -114,7 +114,7 @@ OQS_SIG *OQS_SIG_picnic_L1_FS_new() {
return NULL;
}
sig->method_name = OQS_SIG_alg_picnic_L1_FS;
sig->alg_version = "https://github.com/IAIK/Picnic/tree/v2.1.2";
sig->alg_version = "https://github.com/IAIK/Picnic/commit/9917e33194d0b540c09706c68fb707c4912edeed";
sig->claimed_nist_level = 1;
sig->euf_cma = true;
@ -153,7 +153,7 @@ OQS_SIG *OQS_SIG_picnic_L1_UR_new() {
return NULL;
}
sig->method_name = OQS_SIG_alg_picnic_L1_UR;
sig->alg_version = "https://github.com/IAIK/Picnic/tree/v2.1.2";
sig->alg_version = "https://github.com/IAIK/Picnic/commit/9917e33194d0b540c09706c68fb707c4912edeed";
sig->claimed_nist_level = 1;
sig->euf_cma = true;
@ -192,7 +192,7 @@ OQS_SIG *OQS_SIG_picnic_L3_FS_new() {
return NULL;
}
sig->method_name = OQS_SIG_alg_picnic_L3_FS;
sig->alg_version = "https://github.com/IAIK/Picnic/tree/v2.1.2";
sig->alg_version = "https://github.com/IAIK/Picnic/commit/9917e33194d0b540c09706c68fb707c4912edeed";
sig->claimed_nist_level = 3;
sig->euf_cma = true;
@ -231,7 +231,7 @@ OQS_SIG *OQS_SIG_picnic_L3_UR_new() {
return NULL;
}
sig->method_name = OQS_SIG_alg_picnic_L3_UR;
sig->alg_version = "https://github.com/IAIK/Picnic/tree/v2.1.2";
sig->alg_version = "https://github.com/IAIK/Picnic/commit/9917e33194d0b540c09706c68fb707c4912edeed";
sig->claimed_nist_level = 3;
sig->euf_cma = true;
@ -270,7 +270,7 @@ OQS_SIG *OQS_SIG_picnic_L5_FS_new() {
return NULL;
}
sig->method_name = OQS_SIG_alg_picnic_L5_FS;
sig->alg_version = "https://github.com/IAIK/Picnic/tree/v2.1.2";
sig->alg_version = "https://github.com/IAIK/Picnic/commit/9917e33194d0b540c09706c68fb707c4912edeed";
sig->claimed_nist_level = 5;
sig->euf_cma = true;
@ -310,7 +310,7 @@ OQS_SIG *OQS_SIG_picnic_L5_UR_new() {
}
sig->method_name = OQS_SIG_alg_picnic_L5_UR;
sig->alg_version = "https://github.com/IAIK/Picnic/tree/v2.1.2";
sig->alg_version = "https://github.com/IAIK/Picnic/commit/9917e33194d0b540c09706c68fb707c4912edeed";
sig->claimed_nist_level = 5;
sig->euf_cma = true;
@ -347,7 +347,7 @@ OQS_SIG *OQS_SIG_picnic2_L1_FS_new() {
return NULL;
}
sig->method_name = OQS_SIG_alg_picnic2_L1_FS;
sig->alg_version = "https://github.com/IAIK/Picnic/tree/v2.1.2";
sig->alg_version = "https://github.com/IAIK/Picnic/commit/9917e33194d0b540c09706c68fb707c4912edeed";
sig->claimed_nist_level = 1;
sig->euf_cma = true;
@ -385,7 +385,7 @@ OQS_SIG *OQS_SIG_picnic2_L3_FS_new() {
return NULL;
}
sig->method_name = OQS_SIG_alg_picnic2_L3_FS;
sig->alg_version = "https://github.com/IAIK/Picnic/tree/v2.1.2";
sig->alg_version = "https://github.com/IAIK/Picnic/commit/9917e33194d0b540c09706c68fb707c4912edeed";
sig->claimed_nist_level = 3;
sig->euf_cma = true;
@ -423,7 +423,7 @@ OQS_SIG *OQS_SIG_picnic2_L5_FS_new() {
return NULL;
}
sig->method_name = OQS_SIG_alg_picnic2_L5_FS;
sig->alg_version = "https://github.com/IAIK/Picnic/tree/v2.1.2";
sig->alg_version = "https://github.com/IAIK/Picnic/commit/9917e33194d0b540c09706c68fb707c4912edeed";
sig->claimed_nist_level = 5;
sig->euf_cma = true;