From 30fbfd826fd26ff4cdda378a3a79d4e3dfbfd63a Mon Sep 17 00:00:00 2001
From: Douglas Stebila <dstebila@users.noreply.github.com>
Date: Sun, 1 Aug 2021 17:02:22 -0400
Subject: [PATCH] Sync with PQClean (#1061)

* Sync with PQClean 6c1ea921ee4a06a6b50c742ca540bb9b5e51aee2

* Update documentation YML from PQClean

* Update documentation from PQClean
---
 docs/algorithms/kem/classic_mceliece.md       |   2 +-
 docs/algorithms/kem/classic_mceliece.yml      |   3 +-
 docs/algorithms/kem/hqc.md                    |   2 +-
 docs/algorithms/kem/hqc.yml                   |   2 +-
 docs/algorithms/kem/ntru.md                   |   2 +-
 docs/algorithms/kem/ntru.yml                  |   2 +-
 docs/algorithms/kem/ntruprime.md              |   2 +-
 docs/algorithms/kem/ntruprime.yml             |   2 +-
 docs/algorithms/kem/saber.md                  |   2 +-
 docs/algorithms/kem/saber.yml                 |   2 +-
 docs/algorithms/sig/falcon.md                 |   2 +-
 docs/algorithms/sig/falcon.yml                |   2 +-
 docs/algorithms/sig/rainbow.md                |   2 +-
 docs/algorithms/sig/rainbow.yml               |   2 +-
 docs/algorithms/sig/sphincs.md                |   2 +-
 docs/algorithms/sig/sphincs.yml               |   2 +-
 .../copy_from_upstream/copy_from_upstream.yml |   2 +-
 src/sig/falcon/CMakeLists.txt                 |   4 +-
 .../falcon/pqclean_falcon-1024_avx2/sign.c    |   4 +-
 .../falcon/pqclean_falcon-1024_clean/fpr.c    | 260 +++++++++++++++-
 .../falcon/pqclean_falcon-1024_clean/fpr.h    | 280 ++----------------
 .../falcon/pqclean_falcon-1024_clean/inner.c  |  70 +++++
 .../falcon/pqclean_falcon-1024_clean/inner.h  |  52 +---
 .../falcon/pqclean_falcon-1024_clean/sign.c   |   4 +-
 src/sig/falcon/pqclean_falcon-512_avx2/sign.c |   4 +-
 src/sig/falcon/pqclean_falcon-512_clean/fpr.c | 260 +++++++++++++++-
 src/sig/falcon/pqclean_falcon-512_clean/fpr.h | 280 ++----------------
 .../falcon/pqclean_falcon-512_clean/inner.c   |  70 +++++
 .../falcon/pqclean_falcon-512_clean/inner.h   |  49 +--
 .../falcon/pqclean_falcon-512_clean/sign.c    |   4 +-
 30 files changed, 756 insertions(+), 620 deletions(-)
 create mode 100755 src/sig/falcon/pqclean_falcon-1024_clean/inner.c
 create mode 100755 src/sig/falcon/pqclean_falcon-512_clean/inner.c

diff --git a/docs/algorithms/kem/classic_mceliece.md b/docs/algorithms/kem/classic_mceliece.md
index eda912836..2f7c0842f 100644
--- a/docs/algorithms/kem/classic_mceliece.md
+++ b/docs/algorithms/kem/classic_mceliece.md
@@ -5,7 +5,7 @@
 - **Principal submitters**: Daniel J. Bernstein, Tung Chou, Tanja Lange, Ingo von Maurich, Rafael Misoczki, Ruben Niederhagen, Edoardo Persichetti, Christiane Peters, Peter Schwabe, Nicolas Sendrier, Jakub Szefer, Wen Wang.
 - **Authors' website**: https://classic.mceliece.org
 - **Specification version**: SUPERCOP-20191221.
-- **Implementation source**: https://github.com/PQClean/PQClean/commit/89d34613364deca88659f6c2dd38708279c6bd24, which takes it from:
+- **Implementation source**: https://github.com/PQClean/PQClean/commit/6c1ea921ee4a06a6b50c742ca540bb9b5e51aee2, which takes it from:
   - SUPERCOP-20191221 "vec" and "avx" implementations
 - **Implementation license (SPDX-Identifier)**: Public domain.
 
diff --git a/docs/algorithms/kem/classic_mceliece.yml b/docs/algorithms/kem/classic_mceliece.yml
index e03e353d9..9efe580f7 100644
--- a/docs/algorithms/kem/classic_mceliece.yml
+++ b/docs/algorithms/kem/classic_mceliece.yml
@@ -19,7 +19,7 @@ website: https://classic.mceliece.org
 nist-round: 3
 spec-version: SUPERCOP-20191221
 spdx-license-identifier: Public domain
-upstream: https://github.com/PQClean/PQClean/commit/89d34613364deca88659f6c2dd38708279c6bd24
+upstream: https://github.com/PQClean/PQClean/commit/6c1ea921ee4a06a6b50c742ca540bb9b5e51aee2
 upstream-ancestors:
 - SUPERCOP-20191221 "vec" and "avx" implementations
 parameter-sets:
@@ -348,3 +348,4 @@ parameter-sets:
     no-secret-dependent-branching-claimed: false
     no-secret-dependent-branching-checked-by-valgrind: true
     large-stack-usage: true
+auxiliary-submitters: []
diff --git a/docs/algorithms/kem/hqc.md b/docs/algorithms/kem/hqc.md
index 861ec9dc6..e1ef17a05 100644
--- a/docs/algorithms/kem/hqc.md
+++ b/docs/algorithms/kem/hqc.md
@@ -5,7 +5,7 @@
 - **Principal submitters**: Carlos Aguilar Melchor, Nicolas Aragon, Slim Bettaieb, Olivier Blazy, Jurjen Bos, Jean-Christophe Deneuville, Philippe Gaborit, Edoardo Persichetti, Jean-Marc Robert, Pascal Véron, Gilles Zémor, Loïc Bidoux.
 - **Authors' website**: https://pqc-hqc.org/
 - **Specification version**: NIST Round 3 submission.
-- **Implementation source**: https://github.com/PQClean/PQClean/commit/89d34613364deca88659f6c2dd38708279c6bd24, which takes it from:
+- **Implementation source**: https://github.com/PQClean/PQClean/commit/6c1ea921ee4a06a6b50c742ca540bb9b5e51aee2, which takes it from:
   - https://github.com/jschanck/package-pqclean/tree/29f79e72/hqc, which takes it from:
   - submission 2020-10-01 at https://pqc-hqc.org/implementation.html
 - **Implementation license (SPDX-Identifier)**: Public domain.
diff --git a/docs/algorithms/kem/hqc.yml b/docs/algorithms/kem/hqc.yml
index 5a78f15d9..f87df5a5c 100644
--- a/docs/algorithms/kem/hqc.yml
+++ b/docs/algorithms/kem/hqc.yml
@@ -18,7 +18,7 @@ website: https://pqc-hqc.org/
 nist-round: 3
 spec-version: NIST Round 3 submission
 spdx-license-identifier: Public domain
-upstream: https://github.com/PQClean/PQClean/commit/89d34613364deca88659f6c2dd38708279c6bd24
+upstream: https://github.com/PQClean/PQClean/commit/6c1ea921ee4a06a6b50c742ca540bb9b5e51aee2
 upstream-ancestors:
 - https://github.com/jschanck/package-pqclean/tree/29f79e72/hqc
 - submission 2020-10-01 at https://pqc-hqc.org/implementation.html
diff --git a/docs/algorithms/kem/ntru.md b/docs/algorithms/kem/ntru.md
index a5d7d53dc..42eefff3b 100644
--- a/docs/algorithms/kem/ntru.md
+++ b/docs/algorithms/kem/ntru.md
@@ -6,7 +6,7 @@
 - **Auxiliary submitters**: Cong Chen, Oussama Danba, Jeffrey Hoffstein, Andreas Hülsing, Joost Rijneveld, Tsunekazu Saito, Peter Schwabe, William Whyte, Keita Xagawa, Takashi Yamakawa, Zhenfei Zhang.
 - **Authors' website**: https://ntru.org/
 - **Specification version**: NIST Round 3 submission.
-- **Implementation source**: https://github.com/PQClean/PQClean/commit/89d34613364deca88659f6c2dd38708279c6bd24, which takes it from:
+- **Implementation source**: https://github.com/PQClean/PQClean/commit/6c1ea921ee4a06a6b50c742ca540bb9b5e51aee2, which takes it from:
   - https://github.com/jschanck/ntru/tree/a43a4457
 - **Implementation license (SPDX-Identifier)**: CC0-1.0.
 
diff --git a/docs/algorithms/kem/ntru.yml b/docs/algorithms/kem/ntru.yml
index 19578a4da..c4ac02067 100644
--- a/docs/algorithms/kem/ntru.yml
+++ b/docs/algorithms/kem/ntru.yml
@@ -19,7 +19,7 @@ website: https://ntru.org/
 nist-round: 3
 spec-version: NIST Round 3 submission
 spdx-license-identifier: CC0-1.0
-upstream: https://github.com/PQClean/PQClean/commit/89d34613364deca88659f6c2dd38708279c6bd24
+upstream: https://github.com/PQClean/PQClean/commit/6c1ea921ee4a06a6b50c742ca540bb9b5e51aee2
 upstream-ancestors:
 - https://github.com/jschanck/ntru/tree/a43a4457
 parameter-sets:
diff --git a/docs/algorithms/kem/ntruprime.md b/docs/algorithms/kem/ntruprime.md
index 1c1d5aa98..a25add8fb 100644
--- a/docs/algorithms/kem/ntruprime.md
+++ b/docs/algorithms/kem/ntruprime.md
@@ -5,7 +5,7 @@
 - **Principal submitters**: Daniel J. Bernstein, Chitchanok Chuengsatiansup, Tanja Lange, Christine van Vredendaal.
 - **Authors' website**: https://ntruprime.cr.yp.to
 - **Specification version**: supercop-20200826.
-- **Implementation source**: https://github.com/PQClean/PQClean/commit/89d34613364deca88659f6c2dd38708279c6bd24, which takes it from:
+- **Implementation source**: https://github.com/PQClean/PQClean/commit/6c1ea921ee4a06a6b50c742ca540bb9b5e51aee2, which takes it from:
   - https://github.com/jschanck/package-pqclean/tree/4d9f08c3/ntruprime, which takes it from:
   - supercop-20210604
 - **Implementation license (SPDX-Identifier)**: Public domain.
diff --git a/docs/algorithms/kem/ntruprime.yml b/docs/algorithms/kem/ntruprime.yml
index 6a26cbc71..e702eed76 100644
--- a/docs/algorithms/kem/ntruprime.yml
+++ b/docs/algorithms/kem/ntruprime.yml
@@ -10,7 +10,7 @@ website: https://ntruprime.cr.yp.to
 nist-round: 3
 spec-version: supercop-20200826
 spdx-license-identifier: Public domain
-upstream: https://github.com/PQClean/PQClean/commit/89d34613364deca88659f6c2dd38708279c6bd24
+upstream: https://github.com/PQClean/PQClean/commit/6c1ea921ee4a06a6b50c742ca540bb9b5e51aee2
 upstream-ancestors:
 - https://github.com/jschanck/package-pqclean/tree/4d9f08c3/ntruprime
 - supercop-20210604
diff --git a/docs/algorithms/kem/saber.md b/docs/algorithms/kem/saber.md
index 329178b92..fb4560470 100644
--- a/docs/algorithms/kem/saber.md
+++ b/docs/algorithms/kem/saber.md
@@ -5,7 +5,7 @@
 - **Principal submitters**: Jan-Pieter D'Anvers, Angshuman Karmakar, Sujoy Sinha Roy, Frederik Vercauteren.
 - **Authors' website**: https://www.esat.kuleuven.be/cosic/pqcrypto/saber/
 - **Specification version**: NIST Round 3 submission.
-- **Implementation source**: https://github.com/PQClean/PQClean/commit/89d34613364deca88659f6c2dd38708279c6bd24, which takes it from:
+- **Implementation source**: https://github.com/PQClean/PQClean/commit/6c1ea921ee4a06a6b50c742ca540bb9b5e51aee2, which takes it from:
   - https://github.com/jschanck/package-pqclean/tree/1ae84c3c/saber, which takes it from:
   - https://github.com/KULeuven-COSIC/SABER/tree/509cc5ec3a7e12a751ccdd2ef5bd6e54e00bd350
 - **Implementation license (SPDX-Identifier)**: Public domain.
diff --git a/docs/algorithms/kem/saber.yml b/docs/algorithms/kem/saber.yml
index 96437f771..50f4877fe 100644
--- a/docs/algorithms/kem/saber.yml
+++ b/docs/algorithms/kem/saber.yml
@@ -10,7 +10,7 @@ website: https://www.esat.kuleuven.be/cosic/pqcrypto/saber/
 nist-round: 3
 spec-version: NIST Round 3 submission
 spdx-license-identifier: Public domain
-upstream: https://github.com/PQClean/PQClean/commit/89d34613364deca88659f6c2dd38708279c6bd24
+upstream: https://github.com/PQClean/PQClean/commit/6c1ea921ee4a06a6b50c742ca540bb9b5e51aee2
 upstream-ancestors:
 - https://github.com/jschanck/package-pqclean/tree/1ae84c3c/saber
 - https://github.com/KULeuven-COSIC/SABER/tree/509cc5ec3a7e12a751ccdd2ef5bd6e54e00bd350
diff --git a/docs/algorithms/sig/falcon.md b/docs/algorithms/sig/falcon.md
index 064beab3e..7823734c9 100644
--- a/docs/algorithms/sig/falcon.md
+++ b/docs/algorithms/sig/falcon.md
@@ -6,7 +6,7 @@
 - **Auxiliary submitters**: Pierre-Alain Fouque, Jeffrey Hoffstein, Paul Kirchner, Vadim Lyubashevsky, Thomas Pornin, Thomas Ricosset, Gregor Seiler, William Whyte, Zhenfei Zhang.
 - **Authors' website**: https://falcon-sign.info
 - **Specification version**: v1.2.
-- **Implementation source**: https://github.com/PQClean/PQClean/commit/89d34613364deca88659f6c2dd38708279c6bd24, which takes it from:
+- **Implementation source**: https://github.com/PQClean/PQClean/commit/6c1ea921ee4a06a6b50c742ca540bb9b5e51aee2, which takes it from:
   - https://github.com/jschanck/package-pqclean/tree/cea1fa5a/falcon, which takes it from:
   - supercop-20201018
 - **Implementation license (SPDX-Identifier)**: CC0-1.0.
diff --git a/docs/algorithms/sig/falcon.yml b/docs/algorithms/sig/falcon.yml
index 3414de2fd..5dfa1fdbe 100644
--- a/docs/algorithms/sig/falcon.yml
+++ b/docs/algorithms/sig/falcon.yml
@@ -17,7 +17,7 @@ website: https://falcon-sign.info
 nist-round: 3
 spec-version: v1.2
 spdx-license-identifier: CC0-1.0
-upstream: https://github.com/PQClean/PQClean/commit/89d34613364deca88659f6c2dd38708279c6bd24
+upstream: https://github.com/PQClean/PQClean/commit/6c1ea921ee4a06a6b50c742ca540bb9b5e51aee2
 upstream-ancestors:
 - https://github.com/jschanck/package-pqclean/tree/cea1fa5a/falcon
 - supercop-20201018
diff --git a/docs/algorithms/sig/rainbow.md b/docs/algorithms/sig/rainbow.md
index 204ce8d80..a720a6308 100644
--- a/docs/algorithms/sig/rainbow.md
+++ b/docs/algorithms/sig/rainbow.md
@@ -6,7 +6,7 @@
 - **Auxiliary submitters**: Ming-Shing Chen, Matthias Kannwischer, Jacques Patarin, Albrecht Petzoldt, Dieter Schmidt, Bo-Yin Yang.
 - **Authors' website**: https://www.pqcrainbow.org/
 - **Specification version**: NIST Round 3 submission.
-- **Implementation source**: https://github.com/PQClean/PQClean/commit/89d34613364deca88659f6c2dd38708279c6bd24, which takes it from:
+- **Implementation source**: https://github.com/PQClean/PQClean/commit/6c1ea921ee4a06a6b50c742ca540bb9b5e51aee2, which takes it from:
   - https://github.com/fast-crypto-lab/rainbow-submission-round2/commit/173ada0e077e1b9dbd8e4a78994f87acc0c92263
 - **Implementation license (SPDX-Identifier)**: CC0-1.0.
 
diff --git a/docs/algorithms/sig/rainbow.yml b/docs/algorithms/sig/rainbow.yml
index da9ec8dfc..6a8ab8fcd 100644
--- a/docs/algorithms/sig/rainbow.yml
+++ b/docs/algorithms/sig/rainbow.yml
@@ -14,7 +14,7 @@ website: https://www.pqcrainbow.org/
 nist-round: 3
 spec-version: NIST Round 3 submission
 spdx-license-identifier: CC0-1.0
-upstream: https://github.com/PQClean/PQClean/commit/89d34613364deca88659f6c2dd38708279c6bd24
+upstream: https://github.com/PQClean/PQClean/commit/6c1ea921ee4a06a6b50c742ca540bb9b5e51aee2
 upstream-ancestors:
 - https://github.com/fast-crypto-lab/rainbow-submission-round2/commit/173ada0e077e1b9dbd8e4a78994f87acc0c92263
 parameter-sets:
diff --git a/docs/algorithms/sig/sphincs.md b/docs/algorithms/sig/sphincs.md
index 3002461c9..d949dd467 100644
--- a/docs/algorithms/sig/sphincs.md
+++ b/docs/algorithms/sig/sphincs.md
@@ -6,7 +6,7 @@
 - **Auxiliary submitters**: Jean-Philippe Aumasson, Daniel J. Bernstein,, Christoph Dobraunig, Maria Eichlseder, Scott Fluhrer, Stefan-Lukas Gazdag, Panos Kampanakis, Stefan Kölbl, Tanja Lange, Martin M. Lauridsen, Florian Mendel, Ruben Niederhagen, Christian Rechberger, Joost Rijneveld, Peter Schwabe.
 - **Authors' website**: https://sphincs.org/
 - **Specification version**: NIST Round 3 submission.
-- **Implementation source**: https://github.com/PQClean/PQClean/commit/89d34613364deca88659f6c2dd38708279c6bd24, which takes it from:
+- **Implementation source**: https://github.com/PQClean/PQClean/commit/6c1ea921ee4a06a6b50c742ca540bb9b5e51aee2, which takes it from:
   - https://github.com/sphincs/sphincsplus
 - **Implementation license (SPDX-Identifier)**: CC0-1.0.
 
diff --git a/docs/algorithms/sig/sphincs.yml b/docs/algorithms/sig/sphincs.yml
index 011ac2031..563975ae5 100644
--- a/docs/algorithms/sig/sphincs.yml
+++ b/docs/algorithms/sig/sphincs.yml
@@ -23,7 +23,7 @@ website: https://sphincs.org/
 nist-round: 3
 spec-version: NIST Round 3 submission
 spdx-license-identifier: CC0-1.0
-upstream: https://github.com/PQClean/PQClean/commit/89d34613364deca88659f6c2dd38708279c6bd24
+upstream: https://github.com/PQClean/PQClean/commit/6c1ea921ee4a06a6b50c742ca540bb9b5e51aee2
 upstream-ancestors:
 - https://github.com/sphincs/sphincsplus
 parameter-sets:
diff --git a/scripts/copy_from_upstream/copy_from_upstream.yml b/scripts/copy_from_upstream/copy_from_upstream.yml
index a46be50e2..f9edbca8d 100644
--- a/scripts/copy_from_upstream/copy_from_upstream.yml
+++ b/scripts/copy_from_upstream/copy_from_upstream.yml
@@ -3,7 +3,7 @@ upstreams:
     name: pqclean
     git_url: https://github.com/PQClean/PQClean.git
     git_branch: master
-    git_commit: 89d34613364deca88659f6c2dd38708279c6bd24
+    git_commit: 6c1ea921ee4a06a6b50c742ca540bb9b5e51aee2
     kem_meta_path: 'crypto_kem/{pqclean_scheme}/META.yml'
     sig_meta_path: 'crypto_sign/{pqclean_scheme}/META.yml'
     kem_scheme_path: 'crypto_kem/{pqclean_scheme}'
diff --git a/src/sig/falcon/CMakeLists.txt b/src/sig/falcon/CMakeLists.txt
index 9479bcb64..170bdf8c6 100644
--- a/src/sig/falcon/CMakeLists.txt
+++ b/src/sig/falcon/CMakeLists.txt
@@ -6,7 +6,7 @@
 set(_FALCON_OBJS "")
 
 if(OQS_ENABLE_SIG_falcon_512)
-    add_library(falcon_512_clean OBJECT sig_falcon_512.c pqclean_falcon-512_clean/codec.c pqclean_falcon-512_clean/common.c pqclean_falcon-512_clean/fft.c pqclean_falcon-512_clean/fpr.c pqclean_falcon-512_clean/keygen.c pqclean_falcon-512_clean/pqclean.c pqclean_falcon-512_clean/rng.c pqclean_falcon-512_clean/sign.c pqclean_falcon-512_clean/vrfy.c)
+    add_library(falcon_512_clean OBJECT sig_falcon_512.c pqclean_falcon-512_clean/codec.c pqclean_falcon-512_clean/common.c pqclean_falcon-512_clean/fft.c pqclean_falcon-512_clean/fpr.c pqclean_falcon-512_clean/inner.c pqclean_falcon-512_clean/keygen.c pqclean_falcon-512_clean/pqclean.c pqclean_falcon-512_clean/rng.c pqclean_falcon-512_clean/sign.c pqclean_falcon-512_clean/vrfy.c)
     target_include_directories(falcon_512_clean PRIVATE ${CMAKE_CURRENT_LIST_DIR}/pqclean_falcon-512_clean)
     target_include_directories(falcon_512_clean PRIVATE ${PROJECT_SOURCE_DIR}/src/common/pqclean_shims)
     set(_FALCON_OBJS ${_FALCON_OBJS} $<TARGET_OBJECTS:falcon_512_clean>)
@@ -21,7 +21,7 @@ if(OQS_ENABLE_SIG_falcon_512_avx2)
 endif()
 
 if(OQS_ENABLE_SIG_falcon_1024)
-    add_library(falcon_1024_clean OBJECT sig_falcon_1024.c pqclean_falcon-1024_clean/codec.c pqclean_falcon-1024_clean/common.c pqclean_falcon-1024_clean/fft.c pqclean_falcon-1024_clean/fpr.c pqclean_falcon-1024_clean/keygen.c pqclean_falcon-1024_clean/pqclean.c pqclean_falcon-1024_clean/rng.c pqclean_falcon-1024_clean/sign.c pqclean_falcon-1024_clean/vrfy.c)
+    add_library(falcon_1024_clean OBJECT sig_falcon_1024.c pqclean_falcon-1024_clean/codec.c pqclean_falcon-1024_clean/common.c pqclean_falcon-1024_clean/fft.c pqclean_falcon-1024_clean/fpr.c pqclean_falcon-1024_clean/inner.c pqclean_falcon-1024_clean/keygen.c pqclean_falcon-1024_clean/pqclean.c pqclean_falcon-1024_clean/rng.c pqclean_falcon-1024_clean/sign.c pqclean_falcon-1024_clean/vrfy.c)
     target_include_directories(falcon_1024_clean PRIVATE ${CMAKE_CURRENT_LIST_DIR}/pqclean_falcon-1024_clean)
     target_include_directories(falcon_1024_clean PRIVATE ${PROJECT_SOURCE_DIR}/src/common/pqclean_shims)
     set(_FALCON_OBJS ${_FALCON_OBJS} $<TARGET_OBJECTS:falcon_1024_clean>)
diff --git a/src/sig/falcon/pqclean_falcon-1024_avx2/sign.c b/src/sig/falcon/pqclean_falcon-1024_avx2/sign.c
index 8ef93bf83..6888fe00a 100644
--- a/src/sig/falcon/pqclean_falcon-1024_avx2/sign.c
+++ b/src/sig/falcon/pqclean_falcon-1024_avx2/sign.c
@@ -267,7 +267,7 @@ PQCLEAN_FALCON1024_AVX2_expand_privkey(fpr *expanded_key,
     PQCLEAN_FALCON1024_AVX2_poly_neg(rF, logn);
 
     /*
-     * The Gram matrix is G = B·B*. Formulas are:
+     * The Gram matrix is G = B x B*. Formulas are:
      *   g00 = b00*adj(b00) + b01*adj(b01)
      *   g01 = b00*adj(b10) + b01*adj(b11)
      *   g10 = b10*adj(b00) + b11*adj(b01)
@@ -781,7 +781,7 @@ do_sign_dyn(samplerZ samp, void *samp_ctx, int16_t *s2,
     PQCLEAN_FALCON1024_AVX2_poly_neg(b11, logn);
 
     /*
-     * Compute the Gram matrix G = B·B*. Formulas are:
+     * Compute the Gram matrix G = B x B*. Formulas are:
      *   g00 = b00*adj(b00) + b01*adj(b01)
      *   g01 = b00*adj(b10) + b01*adj(b11)
      *   g10 = b10*adj(b00) + b11*adj(b01)
diff --git a/src/sig/falcon/pqclean_falcon-1024_clean/fpr.c b/src/sig/falcon/pqclean_falcon-1024_clean/fpr.c
index 091462a71..669c825ee 100644
--- a/src/sig/falcon/pqclean_falcon-1024_clean/fpr.c
+++ b/src/sig/falcon/pqclean_falcon-1024_clean/fpr.c
@@ -78,6 +78,66 @@
         (e) += (int)(nt); \
     } while (0)
 
+uint64_t
+fpr_ursh(uint64_t x, int n) {
+    x ^= (x ^ (x >> 32)) & -(uint64_t)(n >> 5);
+    return x >> (n & 31);
+}
+
+int64_t
+fpr_irsh(int64_t x, int n) {
+    x ^= (x ^ (x >> 32)) & -(int64_t)(n >> 5);
+    return x >> (n & 31);
+}
+
+uint64_t
+fpr_ulsh(uint64_t x, int n) {
+    x ^= (x ^ (x << 32)) & -(uint64_t)(n >> 5);
+    return x << (n & 31);
+}
+
+fpr
+FPR(int s, int e, uint64_t m) {
+    fpr x;
+    uint32_t t;
+    unsigned f;
+
+    /*
+     * If e >= -1076, then the value is "normal"; otherwise, it
+     * should be a subnormal, which we clamp down to zero.
+     */
+    e += 1076;
+    t = (uint32_t)e >> 31;
+    m &= (uint64_t)t - 1;
+
+    /*
+     * If m = 0 then we want a zero; make e = 0 too, but conserve
+     * the sign.
+     */
+    t = (uint32_t)(m >> 54);
+    e &= -(int)t;
+
+    /*
+     * The 52 mantissa bits come from m. Value m has its top bit set
+     * (unless it is a zero); we leave it "as is": the top bit will
+     * increment the exponent by 1, except when m = 0, which is
+     * exactly what we want.
+     */
+    x = (((uint64_t)s << 63) | (m >> 2)) + ((uint64_t)(uint32_t)e << 52);
+
+    /*
+     * Rounding: if the low three bits of m are 011, 110 or 111,
+     * then the value should be incremented to get the next
+     * representable value. This implements the usual
+     * round-to-nearest rule (with preference to even values in case
+     * of a tie). Note that the increment may make a carry spill
+     * into the exponent field, which is again exactly what we want
+     * in that case.
+     */
+    f = (unsigned)m & 7U;
+    x += (0xC8U >> f) & 1;
+    return x;
+}
 
 fpr
 fpr_scaled(int64_t i, int sc) {
@@ -134,7 +194,131 @@ fpr_scaled(int64_t i, int sc) {
     return FPR(s, e, m);
 }
 
+fpr
+fpr_of(int64_t i) {
+    return fpr_scaled(i, 0);
+}
 
+int64_t
+fpr_rint(fpr x) {
+    uint64_t m, d;
+    int e;
+    uint32_t s, dd, f;
+
+    /*
+     * We assume that the value fits in -(2^63-1)..+(2^63-1). We can
+     * thus extract the mantissa as a 63-bit integer, then right-shift
+     * it as needed.
+     */
+    m = ((x << 10) | ((uint64_t)1 << 62)) & (((uint64_t)1 << 63) - 1);
+    e = 1085 - ((int)(x >> 52) & 0x7FF);
+
+    /*
+     * If a shift of more than 63 bits is needed, then simply set m
+     * to zero. This also covers the case of an input operand equal
+     * to zero.
+     */
+    m &= -(uint64_t)((uint32_t)(e - 64) >> 31);
+    e &= 63;
+
+    /*
+     * Right-shift m as needed. Shift count is e. Proper rounding
+     * mandates that:
+     *   - If the highest dropped bit is zero, then round low.
+     *   - If the highest dropped bit is one, and at least one of the
+     *     other dropped bits is one, then round up.
+     *   - If the highest dropped bit is one, and all other dropped
+     *     bits are zero, then round up if the lowest kept bit is 1,
+     *     or low otherwise (i.e. ties are broken by "rounding to even").
+     *
+     * We thus first extract a word consisting of all the dropped bits
+     * AND the lowest kept bit; then we shrink it down to three bits,
+     * the lowest being "sticky".
+     */
+    d = fpr_ulsh(m, 63 - e);
+    dd = (uint32_t)d | ((uint32_t)(d >> 32) & 0x1FFFFFFF);
+    f = (uint32_t)(d >> 61) | ((dd | -dd) >> 31);
+    m = fpr_ursh(m, e) + (uint64_t)((0xC8U >> f) & 1U);
+
+    /*
+     * Apply the sign bit.
+     */
+    s = (uint32_t)(x >> 63);
+    return ((int64_t)m ^ -(int64_t)s) + (int64_t)s;
+}
+
+int64_t
+fpr_floor(fpr x) {
+    uint64_t t;
+    int64_t xi;
+    int e, cc;
+
+    /*
+     * We extract the integer as a _signed_ 64-bit integer with
+     * a scaling factor. Since we assume that the value fits
+     * in the -(2^63-1)..+(2^63-1) range, we can left-shift the
+     * absolute value to make it in the 2^62..2^63-1 range: we
+     * will only need a right-shift afterwards.
+     */
+    e = (int)(x >> 52) & 0x7FF;
+    t = x >> 63;
+    xi = (int64_t)(((x << 10) | ((uint64_t)1 << 62))
+                   & (((uint64_t)1 << 63) - 1));
+    xi = (xi ^ -(int64_t)t) + (int64_t)t;
+    cc = 1085 - e;
+
+    /*
+     * We perform an arithmetic right-shift on the value. This
+     * applies floor() semantics on both positive and negative values
+     * (rounding toward minus infinity).
+     */
+    xi = fpr_irsh(xi, cc & 63);
+
+    /*
+     * If the true shift count was 64 or more, then we should instead
+     * replace xi with 0 (if nonnegative) or -1 (if negative). Edge
+     * case: -0 will be floored to -1, not 0 (whether this is correct
+     * is debatable; in any case, the other functions normalize zero
+     * to +0).
+     *
+     * For an input of zero, the non-shifted xi was incorrect (we used
+     * a top implicit bit of value 1, not 0), but this does not matter
+     * since this operation will clamp it down.
+     */
+    xi ^= (xi ^ -(int64_t)t) & -(int64_t)((uint32_t)(63 - cc) >> 31);
+    return xi;
+}
+
+int64_t
+fpr_trunc(fpr x) {
+    uint64_t t, xu;
+    int e, cc;
+
+    /*
+     * Extract the absolute value. Since we assume that the value
+     * fits in the -(2^63-1)..+(2^63-1) range, we can left-shift
+     * the absolute value into the 2^62..2^63-1 range, and then
+     * do a right shift afterwards.
+     */
+    e = (int)(x >> 52) & 0x7FF;
+    xu = ((x << 10) | ((uint64_t)1 << 62)) & (((uint64_t)1 << 63) - 1);
+    cc = 1085 - e;
+    xu = fpr_ursh(xu, cc & 63);
+
+    /*
+     * If the exponent is too low (cc > 63), then the shift was wrong
+     * and we must clamp the value to 0. This also covers the case
+     * of an input equal to zero.
+     */
+    xu &= -(uint64_t)((uint32_t)(cc - 64) >> 31);
+
+    /*
+     * Apply back the sign, if the source value is negative.
+     */
+    t = x >> 63;
+    xu = (xu ^ -t) + t;
+    return *(int64_t *)&xu;
+}
 
 fpr
 fpr_add(fpr x, fpr y) {
@@ -243,7 +427,42 @@ fpr_add(fpr x, fpr y) {
     return FPR(sx, ex, xu);
 }
 
+fpr
+fpr_sub(fpr x, fpr y) {
+    y ^= (uint64_t)1 << 63;
+    return fpr_add(x, y);
+}
 
+fpr
+fpr_neg(fpr x) {
+    x ^= (uint64_t)1 << 63;
+    return x;
+}
+
+fpr
+fpr_half(fpr x) {
+    /*
+     * To divide a value by 2, we just have to subtract 1 from its
+     * exponent, but we have to take care of zero.
+     */
+    uint32_t t;
+
+    x -= (uint64_t)1 << 52;
+    t = (((uint32_t)(x >> 52) & 0x7FF) + 1) >> 11;
+    x &= (uint64_t)t - 1;
+    return x;
+}
+
+fpr
+fpr_double(fpr x) {
+    /*
+     * To double a value, we just increment by one the exponent. We
+     * don't care about infinities or NaNs; however, 0 is a
+     * special case.
+     */
+    x += (uint64_t)((((unsigned)(x >> 52) & 0x7FFU) + 0x7FFU) >> 11) << 52;
+    return x;
+}
 
 fpr
 fpr_mul(fpr x, fpr y) {
@@ -340,7 +559,10 @@ fpr_mul(fpr x, fpr y) {
     return FPR(s, e, zu);
 }
 
-
+fpr
+fpr_sqr(fpr x) {
+    return fpr_mul(x, x);
+}
 
 fpr
 fpr_div(fpr x, fpr y) {
@@ -428,7 +650,10 @@ fpr_div(fpr x, fpr y) {
     return FPR(s, e, q);
 }
 
-
+fpr
+fpr_inv(fpr x) {
+    return fpr_div(4607182418800017408u, x);
+}
 
 fpr
 fpr_sqrt(fpr x) {
@@ -506,6 +731,37 @@ fpr_sqrt(fpr x) {
     return FPR(0, e, q);
 }
 
+int
+fpr_lt(fpr x, fpr y) {
+    /*
+     * If both x and y are positive, then a signed comparison yields
+     * the proper result:
+     *   - For positive values, the order is preserved.
+     *   - The sign bit is at the same place as in integers, so
+     *     sign is preserved.
+     * Moreover, we can compute [x < y] as sgn(x-y) and the computation
+     * of x-y will not overflow.
+     *
+     * If the signs differ, then sgn(x) gives the proper result.
+     *
+     * If both x and y are negative, then the order is reversed.
+     * Hence [x < y] = sgn(y-x). We must compute this separately from
+     * sgn(x-y); simply inverting sgn(x-y) would not handle the edge
+     * case x = y properly.
+     */
+    int cc0, cc1;
+    int64_t sx;
+    int64_t sy;
+
+    sx = *(int64_t *)&x;
+    sy = *(int64_t *)&y;
+    sy &= ~((sx ^ sy) >> 63); /* set sy=0 if signs differ */
+
+    cc0 = (int)((sx - sy) >> 63) & 1; /* Neither subtraction overflows when */
+    cc1 = (int)((sy - sx) >> 63) & 1; /* the signs are the same. */
+
+    return cc0 ^ ((cc0 ^ cc1) & (int)((x & y) >> 63));
+}
 
 uint64_t
 fpr_expm_p63(fpr x, fpr ccs) {
diff --git a/src/sig/falcon/pqclean_falcon-1024_clean/fpr.h b/src/sig/falcon/pqclean_falcon-1024_clean/fpr.h
index dd7e15c22..a1c122275 100644
--- a/src/sig/falcon/pqclean_falcon-1024_clean/fpr.h
+++ b/src/sig/falcon/pqclean_falcon-1024_clean/fpr.h
@@ -126,11 +126,8 @@ typedef uint64_t fpr;
  *
  * Shift count n MUST be in the 0..63 range.
  */
-static inline uint64_t
-fpr_ursh(uint64_t x, int n) {
-    x ^= (x ^ (x >> 32)) & -(uint64_t)(n >> 5);
-    return x >> (n & 31);
-}
+#define fpr_ursh   PQCLEAN_FALCON1024_CLEAN_fpr_ursh
+uint64_t fpr_ursh(uint64_t x, int n);
 
 /*
  * Right-shift a 64-bit signed value by a possibly secret shift count
@@ -138,11 +135,8 @@ fpr_ursh(uint64_t x, int n) {
  *
  * Shift count n MUST be in the 0..63 range.
  */
-static inline int64_t
-fpr_irsh(int64_t x, int n) {
-    x ^= (x ^ (x >> 32)) & -(int64_t)(n >> 5);
-    return x >> (n & 31);
-}
+#define fpr_irsh   PQCLEAN_FALCON1024_CLEAN_fpr_irsh
+int64_t fpr_irsh(int64_t x, int n);
 
 /*
  * Left-shift a 64-bit unsigned value by a possibly secret shift count
@@ -150,11 +144,8 @@ fpr_irsh(int64_t x, int n) {
  *
  * Shift count n MUST be in the 0..63 range.
  */
-static inline uint64_t
-fpr_ulsh(uint64_t x, int n) {
-    x ^= (x ^ (x << 32)) & -(uint64_t)(n >> 5);
-    return x << (n & 31);
-}
+#define fpr_ulsh   PQCLEAN_FALCON1024_CLEAN_fpr_ulsh
+uint64_t fpr_ulsh(uint64_t x, int n);
 
 /*
  * Expectations:
@@ -171,56 +162,15 @@ fpr_ulsh(uint64_t x, int n) {
  * If e >= -1076 and e != 0, m must be within the expected range
  * (2^54 to 2^55-1).
  */
-static inline fpr
-FPR(int s, int e, uint64_t m) {
-    fpr x;
-    uint32_t t;
-    unsigned f;
+#define FPR   PQCLEAN_FALCON1024_CLEAN_FPR
+fpr FPR(int s, int e, uint64_t m);
 
-    /*
-     * If e >= -1076, then the value is "normal"; otherwise, it
-     * should be a subnormal, which we clamp down to zero.
-     */
-    e += 1076;
-    t = (uint32_t)e >> 31;
-    m &= (uint64_t)t - 1;
-
-    /*
-     * If m = 0 then we want a zero; make e = 0 too, but conserve
-     * the sign.
-     */
-    t = (uint32_t)(m >> 54);
-    e &= -(int)t;
-
-    /*
-     * The 52 mantissa bits come from m. Value m has its top bit set
-     * (unless it is a zero); we leave it "as is": the top bit will
-     * increment the exponent by 1, except when m = 0, which is
-     * exactly what we want.
-     */
-    x = (((uint64_t)s << 63) | (m >> 2)) + ((uint64_t)(uint32_t)e << 52);
-
-    /*
-     * Rounding: if the low three bits of m are 011, 110 or 111,
-     * then the value should be incremented to get the next
-     * representable value. This implements the usual
-     * round-to-nearest rule (with preference to even values in case
-     * of a tie). Note that the increment may make a carry spill
-     * into the exponent field, which is again exactly what we want
-     * in that case.
-     */
-    f = (unsigned)m & 7U;
-    x += (0xC8U >> f) & 1;
-    return x;
-}
 
 #define fpr_scaled   PQCLEAN_FALCON1024_CLEAN_fpr_scaled
 fpr fpr_scaled(int64_t i, int sc);
 
-static inline fpr
-fpr_of(int64_t i) {
-    return fpr_scaled(i, 0);
-}
+#define fpr_of   PQCLEAN_FALCON1024_CLEAN_fpr_of
+fpr fpr_of(int64_t i);
 
 static const fpr fpr_q = 4667981563525332992;
 static const fpr fpr_inverse_of_q = 4545632735260551042;
@@ -244,217 +194,47 @@ static const fpr fpr_ptwo63m1 = 4890909195324358656;
 static const fpr fpr_mtwo63m1 = 14114281232179134464U;
 static const fpr fpr_ptwo63 = 4890909195324358656;
 
-static inline int64_t
-fpr_rint(fpr x) {
-    uint64_t m, d;
-    int e;
-    uint32_t s, dd, f;
+#define fpr_rint   PQCLEAN_FALCON1024_CLEAN_fpr_rint
+int64_t fpr_rint(fpr x);
 
-    /*
-     * We assume that the value fits in -(2^63-1)..+(2^63-1). We can
-     * thus extract the mantissa as a 63-bit integer, then right-shift
-     * it as needed.
-     */
-    m = ((x << 10) | ((uint64_t)1 << 62)) & (((uint64_t)1 << 63) - 1);
-    e = 1085 - ((int)(x >> 52) & 0x7FF);
+#define fpr_floor   PQCLEAN_FALCON1024_CLEAN_fpr_floor
+int64_t fpr_floor(fpr x);
 
-    /*
-     * If a shift of more than 63 bits is needed, then simply set m
-     * to zero. This also covers the case of an input operand equal
-     * to zero.
-     */
-    m &= -(uint64_t)((uint32_t)(e - 64) >> 31);
-    e &= 63;
-
-    /*
-     * Right-shift m as needed. Shift count is e. Proper rounding
-     * mandates that:
-     *   - If the highest dropped bit is zero, then round low.
-     *   - If the highest dropped bit is one, and at least one of the
-     *     other dropped bits is one, then round up.
-     *   - If the highest dropped bit is one, and all other dropped
-     *     bits are zero, then round up if the lowest kept bit is 1,
-     *     or low otherwise (i.e. ties are broken by "rounding to even").
-     *
-     * We thus first extract a word consisting of all the dropped bit
-     * AND the lowest kept bit; then we shrink it down to three bits,
-     * the lowest being "sticky".
-     */
-    d = fpr_ulsh(m, 63 - e);
-    dd = (uint32_t)d | ((uint32_t)(d >> 32) & 0x1FFFFFFF);
-    f = (uint32_t)(d >> 61) | ((dd | -dd) >> 31);
-    m = fpr_ursh(m, e) + (uint64_t)((0xC8U >> f) & 1U);
-
-    /*
-     * Apply the sign bit.
-     */
-    s = (uint32_t)(x >> 63);
-    return ((int64_t)m ^ -(int64_t)s) + (int64_t)s;
-}
-
-static inline int64_t
-fpr_floor(fpr x) {
-    uint64_t t;
-    int64_t xi;
-    int e, cc;
-
-    /*
-     * We extract the integer as a _signed_ 64-bit integer with
-     * a scaling factor. Since we assume that the value fits
-     * in the -(2^63-1)..+(2^63-1) range, we can left-shift the
-     * absolute value to make it in the 2^62..2^63-1 range: we
-     * will only need a right-shift afterwards.
-     */
-    e = (int)(x >> 52) & 0x7FF;
-    t = x >> 63;
-    xi = (int64_t)(((x << 10) | ((uint64_t)1 << 62))
-                   & (((uint64_t)1 << 63) - 1));
-    xi = (xi ^ -(int64_t)t) + (int64_t)t;
-    cc = 1085 - e;
-
-    /*
-     * We perform an arithmetic right-shift on the value. This
-     * applies floor() semantics on both positive and negative values
-     * (rounding toward minus infinity).
-     */
-    xi = fpr_irsh(xi, cc & 63);
-
-    /*
-     * If the true shift count was 64 or more, then we should instead
-     * replace xi with 0 (if nonnegative) or -1 (if negative). Edge
-     * case: -0 will be floored to -1, not 0 (whether this is correct
-     * is debatable; in any case, the other functions normalize zero
-     * to +0).
-     *
-     * For an input of zero, the non-shifted xi was incorrect (we used
-     * a top implicit bit of value 1, not 0), but this does not matter
-     * since this operation will clamp it down.
-     */
-    xi ^= (xi ^ -(int64_t)t) & -(int64_t)((uint32_t)(63 - cc) >> 31);
-    return xi;
-}
-
-static inline int64_t
-fpr_trunc(fpr x) {
-    uint64_t t, xu;
-    int e, cc;
-
-    /*
-     * Extract the absolute value. Since we assume that the value
-     * fits in the -(2^63-1)..+(2^63-1) range, we can left-shift
-     * the absolute value into the 2^62..2^63-1 range, and then
-     * do a right shift afterwards.
-     */
-    e = (int)(x >> 52) & 0x7FF;
-    xu = ((x << 10) | ((uint64_t)1 << 62)) & (((uint64_t)1 << 63) - 1);
-    cc = 1085 - e;
-    xu = fpr_ursh(xu, cc & 63);
-
-    /*
-     * If the exponent is too low (cc > 63), then the shift was wrong
-     * and we must clamp the value to 0. This also covers the case
-     * of an input equal to zero.
-     */
-    xu &= -(uint64_t)((uint32_t)(cc - 64) >> 31);
-
-    /*
-     * Apply back the sign, if the source value is negative.
-     */
-    t = x >> 63;
-    xu = (xu ^ -t) + t;
-    return *(int64_t *)&xu;
-}
+#define fpr_trunc   PQCLEAN_FALCON1024_CLEAN_fpr_trunc
+int64_t fpr_trunc(fpr x);
 
 #define fpr_add   PQCLEAN_FALCON1024_CLEAN_fpr_add
 fpr fpr_add(fpr x, fpr y);
 
-static inline fpr
-fpr_sub(fpr x, fpr y) {
-    y ^= (uint64_t)1 << 63;
-    return fpr_add(x, y);
-}
+#define fpr_sub   PQCLEAN_FALCON1024_CLEAN_fpr_sub
+fpr fpr_sub(fpr x, fpr y);
 
-static inline fpr
-fpr_neg(fpr x) {
-    x ^= (uint64_t)1 << 63;
-    return x;
-}
+#define fpr_neg   PQCLEAN_FALCON1024_CLEAN_fpr_neg
+fpr fpr_neg(fpr x);
 
-static inline fpr
-fpr_half(fpr x) {
-    /*
-     * To divide a value by 2, we just have to subtract 1 from its
-     * exponent, but we have to take care of zero.
-     */
-    uint32_t t;
+#define fpr_half   PQCLEAN_FALCON1024_CLEAN_fpr_half
+fpr fpr_half(fpr x);
 
-    x -= (uint64_t)1 << 52;
-    t = (((uint32_t)(x >> 52) & 0x7FF) + 1) >> 11;
-    x &= (uint64_t)t - 1;
-    return x;
-}
-
-static inline fpr
-fpr_double(fpr x) {
-    /*
-     * To double a value, we just increment by one the exponent. We
-     * don't care about infinites or NaNs; however, 0 is a
-     * special case.
-     */
-    x += (uint64_t)((((unsigned)(x >> 52) & 0x7FFU) + 0x7FFU) >> 11) << 52;
-    return x;
-}
+#define fpr_double   PQCLEAN_FALCON1024_CLEAN_fpr_double
+fpr fpr_double(fpr x);
 
 #define fpr_mul   PQCLEAN_FALCON1024_CLEAN_fpr_mul
 fpr fpr_mul(fpr x, fpr y);
 
-static inline fpr
-fpr_sqr(fpr x) {
-    return fpr_mul(x, x);
-}
+#define fpr_sqr   PQCLEAN_FALCON1024_CLEAN_fpr_sqr
+fpr fpr_sqr(fpr x);
 
 #define fpr_div   PQCLEAN_FALCON1024_CLEAN_fpr_div
 fpr fpr_div(fpr x, fpr y);
 
-static inline fpr
-fpr_inv(fpr x) {
-    return fpr_div(4607182418800017408u, x);
-}
+#define fpr_inv   PQCLEAN_FALCON1024_CLEAN_fpr_inv
+fpr fpr_inv(fpr x);
 
 #define fpr_sqrt   PQCLEAN_FALCON1024_CLEAN_fpr_sqrt
 fpr fpr_sqrt(fpr x);
 
-static inline int
-fpr_lt(fpr x, fpr y) {
-    /*
-     * If both x and y are positive, then a signed comparison yields
-     * the proper result:
-     *   - For positive values, the order is preserved.
-     *   - The sign bit is at the same place as in integers, so
-     *     sign is preserved.
-     * Moreover, we can compute [x < y] as sgn(x-y) and the computation
-     * of x-y will not overflow.
-     *
-     * If the signs differ, then sgn(x) gives the proper result.
-     *
-     * If both x and y are negative, then the order is reversed.
-     * Hence [x < y] = sgn(y-x). We must compute this separately from
-     * sgn(x-y); simply inverting sgn(x-y) would not handle the edge
-     * case x = y properly.
-     */
-    int cc0, cc1;
-    int64_t sx;
-    int64_t sy;
-
-    sx = *(int64_t *)&x;
-    sy = *(int64_t *)&y;
-    sy &= ~((sx ^ sy) >> 63); /* set sy=0 if signs differ */
-
-    cc0 = (int)((sx - sy) >> 63) & 1; /* Neither subtraction overflows when */
-    cc1 = (int)((sy - sx) >> 63) & 1; /* the signs are the same. */
-
-    return cc0 ^ ((cc0 ^ cc1) & (int)((x & y) >> 63));
-}
+#define fpr_lt   PQCLEAN_FALCON1024_CLEAN_fpr_lt
+int fpr_lt(fpr x, fpr y);
 
 /*
  * Compute exp(x) for x such that |x| <= ln 2. We want a precision of 50
diff --git a/src/sig/falcon/pqclean_falcon-1024_clean/inner.c b/src/sig/falcon/pqclean_falcon-1024_clean/inner.c
new file mode 100755
index 000000000..f5c269eda
--- /dev/null
+++ b/src/sig/falcon/pqclean_falcon-1024_clean/inner.c
@@ -0,0 +1,70 @@
+#include "inner.h"
+
+/*
+ * ==========================(LICENSE BEGIN)============================
+ *
+ * Copyright (c) 2017-2019  Falcon Project
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * ===========================(LICENSE END)=============================
+ */
+
+unsigned set_fpu_cw(unsigned x) {
+    return x;
+}
+
+
+uint64_t prng_get_u64(prng *p) {
+    size_t u;
+
+    /*
+     * If there are at most 9 bytes left in the buffer, we refill it.
+     * This means that we may drop the last few bytes, but this allows
+     * for faster extraction code. Also, it means that we never leave
+     * an empty buffer.
+     */
+    u = p->ptr;
+    if (u >= (sizeof p->buf.d) - 9) {
+        PQCLEAN_FALCON1024_CLEAN_prng_refill(p);
+        u = 0;
+    }
+    p->ptr = u + 8;
+
+    return (uint64_t)p->buf.d[u + 0]
+           | ((uint64_t)p->buf.d[u + 1] << 8)
+           | ((uint64_t)p->buf.d[u + 2] << 16)
+           | ((uint64_t)p->buf.d[u + 3] << 24)
+           | ((uint64_t)p->buf.d[u + 4] << 32)
+           | ((uint64_t)p->buf.d[u + 5] << 40)
+           | ((uint64_t)p->buf.d[u + 6] << 48)
+           | ((uint64_t)p->buf.d[u + 7] << 56);
+}
+
+
+unsigned prng_get_u8(prng *p) {
+    unsigned v;
+
+    v = p->buf.d[p->ptr ++];
+    if (p->ptr == sizeof p->buf.d) {
+        PQCLEAN_FALCON1024_CLEAN_prng_refill(p);
+    }
+    return v;
+}
diff --git a/src/sig/falcon/pqclean_falcon-1024_clean/inner.h b/src/sig/falcon/pqclean_falcon-1024_clean/inner.h
index 5b0477ac1..886f51a67 100644
--- a/src/sig/falcon/pqclean_falcon-1024_clean/inner.h
+++ b/src/sig/falcon/pqclean_falcon-1024_clean/inner.h
@@ -99,12 +99,8 @@
  * targets other than 32-bit x86, or when the native 'double' type is
  * not used, the set_fpu_cw() function does nothing at all.
  */
-static inline unsigned
-set_fpu_cw(unsigned x) {
-    return x;
-}
-
-
+#define set_fpu_cw PQCLEAN_FALCON1024_CLEAN_set_fpu_cw
+unsigned set_fpu_cw(unsigned x);
 
 
 /* ==================================================================== */
@@ -496,50 +492,14 @@ void PQCLEAN_FALCON1024_CLEAN_prng_get_bytes(prng *p, void *dst, size_t len);
 /*
  * Get a 64-bit random value from a PRNG.
  */
-static inline uint64_t
-prng_get_u64(prng *p) {
-    size_t u;
-
-    /*
-     * If there are less than 9 bytes in the buffer, we refill it.
-     * This means that we may drop the last few bytes, but this allows
-     * for faster extraction code. Also, it means that we never leave
-     * an empty buffer.
-     */
-    u = p->ptr;
-    if (u >= (sizeof p->buf.d) - 9) {
-        PQCLEAN_FALCON1024_CLEAN_prng_refill(p);
-        u = 0;
-    }
-    p->ptr = u + 8;
-
-    /*
-     * On systems that use little-endian encoding and allow
-     * unaligned accesses, we can simply read the data where it is.
-     */
-    return (uint64_t)p->buf.d[u + 0]
-           | ((uint64_t)p->buf.d[u + 1] << 8)
-           | ((uint64_t)p->buf.d[u + 2] << 16)
-           | ((uint64_t)p->buf.d[u + 3] << 24)
-           | ((uint64_t)p->buf.d[u + 4] << 32)
-           | ((uint64_t)p->buf.d[u + 5] << 40)
-           | ((uint64_t)p->buf.d[u + 6] << 48)
-           | ((uint64_t)p->buf.d[u + 7] << 56);
-}
+#define prng_get_u64 PQCLEAN_FALCON1024_CLEAN_prng_get_u64
+uint64_t prng_get_u64(prng *p);
 
 /*
  * Get an 8-bit random value from a PRNG.
  */
-static inline unsigned
-prng_get_u8(prng *p) {
-    unsigned v;
-
-    v = p->buf.d[p->ptr ++];
-    if (p->ptr == sizeof p->buf.d) {
-        PQCLEAN_FALCON1024_CLEAN_prng_refill(p);
-    }
-    return v;
-}
+#define prng_get_u8 PQCLEAN_FALCON1024_CLEAN_prng_get_u8
+unsigned prng_get_u8(prng *p);
 
 /* ==================================================================== */
 /*
diff --git a/src/sig/falcon/pqclean_falcon-1024_clean/sign.c b/src/sig/falcon/pqclean_falcon-1024_clean/sign.c
index fb05cdad0..0baa9148e 100644
--- a/src/sig/falcon/pqclean_falcon-1024_clean/sign.c
+++ b/src/sig/falcon/pqclean_falcon-1024_clean/sign.c
@@ -267,7 +267,7 @@ PQCLEAN_FALCON1024_CLEAN_expand_privkey(fpr *expanded_key,
     PQCLEAN_FALCON1024_CLEAN_poly_neg(rF, logn);
 
     /*
-     * The Gram matrix is G = B·B*. Formulas are:
+     * The Gram matrix is G = B x B*. Formulas are:
      *   g00 = b00*adj(b00) + b01*adj(b01)
      *   g01 = b00*adj(b10) + b01*adj(b11)
      *   g10 = b10*adj(b00) + b11*adj(b01)
@@ -788,7 +788,7 @@ do_sign_dyn(samplerZ samp, void *samp_ctx, int16_t *s2,
     PQCLEAN_FALCON1024_CLEAN_poly_neg(b11, logn);
 
     /*
-     * Compute the Gram matrix G = B·B*. Formulas are:
+     * Compute the Gram matrix G = B x B*. Formulas are:
      *   g00 = b00*adj(b00) + b01*adj(b01)
      *   g01 = b00*adj(b10) + b01*adj(b11)
      *   g10 = b10*adj(b00) + b11*adj(b01)
diff --git a/src/sig/falcon/pqclean_falcon-512_avx2/sign.c b/src/sig/falcon/pqclean_falcon-512_avx2/sign.c
index 1b6cad3f2..623f618ec 100644
--- a/src/sig/falcon/pqclean_falcon-512_avx2/sign.c
+++ b/src/sig/falcon/pqclean_falcon-512_avx2/sign.c
@@ -267,7 +267,7 @@ PQCLEAN_FALCON512_AVX2_expand_privkey(fpr *expanded_key,
     PQCLEAN_FALCON512_AVX2_poly_neg(rF, logn);
 
     /*
-     * The Gram matrix is G = B·B*. Formulas are:
+     * The Gram matrix is G = B x B*. Formulas are:
      *   g00 = b00*adj(b00) + b01*adj(b01)
      *   g01 = b00*adj(b10) + b01*adj(b11)
      *   g10 = b10*adj(b00) + b11*adj(b01)
@@ -781,7 +781,7 @@ do_sign_dyn(samplerZ samp, void *samp_ctx, int16_t *s2,
     PQCLEAN_FALCON512_AVX2_poly_neg(b11, logn);
 
     /*
-     * Compute the Gram matrix G = B·B*. Formulas are:
+     * Compute the Gram matrix G = B x B*. Formulas are:
      *   g00 = b00*adj(b00) + b01*adj(b01)
      *   g01 = b00*adj(b10) + b01*adj(b11)
      *   g10 = b10*adj(b00) + b11*adj(b01)
diff --git a/src/sig/falcon/pqclean_falcon-512_clean/fpr.c b/src/sig/falcon/pqclean_falcon-512_clean/fpr.c
index 091462a71..669c825ee 100644
--- a/src/sig/falcon/pqclean_falcon-512_clean/fpr.c
+++ b/src/sig/falcon/pqclean_falcon-512_clean/fpr.c
@@ -78,6 +78,66 @@
         (e) += (int)(nt); \
     } while (0)
 
+uint64_t
+fpr_ursh(uint64_t x, int n) {
+    x ^= (x ^ (x >> 32)) & -(uint64_t)(n >> 5);
+    return x >> (n & 31);
+}
+
+int64_t
+fpr_irsh(int64_t x, int n) {
+    x ^= (x ^ (x >> 32)) & -(int64_t)(n >> 5);
+    return x >> (n & 31);
+}
+
+uint64_t
+fpr_ulsh(uint64_t x, int n) {
+    x ^= (x ^ (x << 32)) & -(uint64_t)(n >> 5);
+    return x << (n & 31);
+}
+
+fpr
+FPR(int s, int e, uint64_t m) {
+    fpr x;
+    uint32_t t;
+    unsigned f;
+
+    /*
+     * If e >= -1076, then the value is "normal"; otherwise, it
+     * should be a subnormal, which we clamp down to zero.
+     */
+    e += 1076;
+    t = (uint32_t)e >> 31;
+    m &= (uint64_t)t - 1;
+
+    /*
+     * If m = 0 then we want a zero; make e = 0 too, but conserve
+     * the sign.
+     */
+    t = (uint32_t)(m >> 54);
+    e &= -(int)t;
+
+    /*
+     * The 52 mantissa bits come from m. Value m has its top bit set
+     * (unless it is a zero); we leave it "as is": the top bit will
+     * increment the exponent by 1, except when m = 0, which is
+     * exactly what we want.
+     */
+    x = (((uint64_t)s << 63) | (m >> 2)) + ((uint64_t)(uint32_t)e << 52);
+
+    /*
+     * Rounding: if the low three bits of m are 011, 110 or 111,
+     * then the value should be incremented to get the next
+     * representable value. This implements the usual
+     * round-to-nearest rule (with preference to even values in case
+     * of a tie). Note that the increment may make a carry spill
+     * into the exponent field, which is again exactly what we want
+     * in that case.
+     */
+    f = (unsigned)m & 7U;
+    x += (0xC8U >> f) & 1;
+    return x;
+}
 
 fpr
 fpr_scaled(int64_t i, int sc) {
@@ -134,7 +194,131 @@ fpr_scaled(int64_t i, int sc) {
     return FPR(s, e, m);
 }
 
+fpr
+fpr_of(int64_t i) {
+    return fpr_scaled(i, 0);
+}
 
+int64_t
+fpr_rint(fpr x) {
+    uint64_t m, d;
+    int e;
+    uint32_t s, dd, f;
+
+    /*
+     * We assume that the value fits in -(2^63-1)..+(2^63-1). We can
+     * thus extract the mantissa as a 63-bit integer, then right-shift
+     * it as needed.
+     */
+    m = ((x << 10) | ((uint64_t)1 << 62)) & (((uint64_t)1 << 63) - 1);
+    e = 1085 - ((int)(x >> 52) & 0x7FF);
+
+    /*
+     * If a shift of more than 63 bits is needed, then simply set m
+     * to zero. This also covers the case of an input operand equal
+     * to zero.
+     */
+    m &= -(uint64_t)((uint32_t)(e - 64) >> 31);
+    e &= 63;
+
+    /*
+     * Right-shift m as needed. Shift count is e. Proper rounding
+     * mandates that:
+     *   - If the highest dropped bit is zero, then round low.
+     *   - If the highest dropped bit is one, and at least one of the
+     *     other dropped bits is one, then round up.
+     *   - If the highest dropped bit is one, and all other dropped
+     *     bits are zero, then round up if the lowest kept bit is 1,
+     *     or low otherwise (i.e. ties are broken by "rounding to even").
+     *
+     * We thus first extract a word consisting of all the dropped bits
+     * AND the lowest kept bit; then we shrink it down to three bits,
+     * the lowest being "sticky".
+     */
+    d = fpr_ulsh(m, 63 - e);
+    dd = (uint32_t)d | ((uint32_t)(d >> 32) & 0x1FFFFFFF);
+    f = (uint32_t)(d >> 61) | ((dd | -dd) >> 31);
+    m = fpr_ursh(m, e) + (uint64_t)((0xC8U >> f) & 1U);
+
+    /*
+     * Apply the sign bit.
+     */
+    s = (uint32_t)(x >> 63);
+    return ((int64_t)m ^ -(int64_t)s) + (int64_t)s;
+}
+
+int64_t
+fpr_floor(fpr x) {
+    uint64_t t;
+    int64_t xi;
+    int e, cc;
+
+    /*
+     * We extract the integer as a _signed_ 64-bit integer with
+     * a scaling factor. Since we assume that the value fits
+     * in the -(2^63-1)..+(2^63-1) range, we can left-shift the
+     * absolute value to make it in the 2^62..2^63-1 range: we
+     * will only need a right-shift afterwards.
+     */
+    e = (int)(x >> 52) & 0x7FF;
+    t = x >> 63;
+    xi = (int64_t)(((x << 10) | ((uint64_t)1 << 62))
+                   & (((uint64_t)1 << 63) - 1));
+    xi = (xi ^ -(int64_t)t) + (int64_t)t;
+    cc = 1085 - e;
+
+    /*
+     * We perform an arithmetic right-shift on the value. This
+     * applies floor() semantics on both positive and negative values
+     * (rounding toward minus infinity).
+     */
+    xi = fpr_irsh(xi, cc & 63);
+
+    /*
+     * If the true shift count was 64 or more, then we should instead
+     * replace xi with 0 (if nonnegative) or -1 (if negative). Edge
+     * case: -0 will be floored to -1, not 0 (whether this is correct
+     * is debatable; in any case, the other functions normalize zero
+     * to +0).
+     *
+     * For an input of zero, the non-shifted xi was incorrect (we used
+     * a top implicit bit of value 1, not 0), but this does not matter
+     * since this operation will clamp it down.
+     */
+    xi ^= (xi ^ -(int64_t)t) & -(int64_t)((uint32_t)(63 - cc) >> 31);
+    return xi;
+}
+
+int64_t
+fpr_trunc(fpr x) {
+    uint64_t t, xu;
+    int e, cc;
+
+    /*
+     * Extract the absolute value. Since we assume that the value
+     * fits in the -(2^63-1)..+(2^63-1) range, we can left-shift
+     * the absolute value into the 2^62..2^63-1 range, and then
+     * do a right shift afterwards.
+     */
+    e = (int)(x >> 52) & 0x7FF;
+    xu = ((x << 10) | ((uint64_t)1 << 62)) & (((uint64_t)1 << 63) - 1);
+    cc = 1085 - e;
+    xu = fpr_ursh(xu, cc & 63);
+
+    /*
+     * If the exponent is too low (cc > 63), then the shift was wrong
+     * and we must clamp the value to 0. This also covers the case
+     * of an input equal to zero.
+     */
+    xu &= -(uint64_t)((uint32_t)(cc - 64) >> 31);
+
+    /*
+     * Apply back the sign, if the source value is negative.
+     */
+    t = x >> 63;
+    xu = (xu ^ -t) + t;
+    return *(int64_t *)&xu;
+}
 
 fpr
 fpr_add(fpr x, fpr y) {
@@ -243,7 +427,42 @@ fpr_add(fpr x, fpr y) {
     return FPR(sx, ex, xu);
 }
 
+fpr
+fpr_sub(fpr x, fpr y) {
+    y ^= (uint64_t)1 << 63;
+    return fpr_add(x, y);
+}
 
+fpr
+fpr_neg(fpr x) {
+    x ^= (uint64_t)1 << 63;
+    return x;
+}
+
+fpr
+fpr_half(fpr x) {
+    /*
+     * To divide a value by 2, we just have to subtract 1 from its
+     * exponent, but we have to take care of zero.
+     */
+    uint32_t t;
+
+    x -= (uint64_t)1 << 52;
+    t = (((uint32_t)(x >> 52) & 0x7FF) + 1) >> 11;
+    x &= (uint64_t)t - 1;
+    return x;
+}
+
+fpr
+fpr_double(fpr x) {
+    /*
+     * To double a value, we just increment the exponent by one. We
+     * don't care about infinities or NaNs; however, 0 is a
+     * special case.
+     */
+    x += (uint64_t)((((unsigned)(x >> 52) & 0x7FFU) + 0x7FFU) >> 11) << 52;
+    return x;
+}
 
 fpr
 fpr_mul(fpr x, fpr y) {
@@ -340,7 +559,10 @@ fpr_mul(fpr x, fpr y) {
     return FPR(s, e, zu);
 }
 
-
+fpr
+fpr_sqr(fpr x) {
+    return fpr_mul(x, x);
+}
 
 fpr
 fpr_div(fpr x, fpr y) {
@@ -428,7 +650,10 @@ fpr_div(fpr x, fpr y) {
     return FPR(s, e, q);
 }
 
-
+fpr
+fpr_inv(fpr x) {
+    return fpr_div(4607182418800017408u, x);
+}
 
 fpr
 fpr_sqrt(fpr x) {
@@ -506,6 +731,37 @@ fpr_sqrt(fpr x) {
     return FPR(0, e, q);
 }
 
+int
+fpr_lt(fpr x, fpr y) {
+    /*
+     * If both x and y are positive, then a signed comparison yields
+     * the proper result:
+     *   - For positive values, the order is preserved.
+     *   - The sign bit is at the same place as in integers, so
+     *     sign is preserved.
+     * Moreover, we can compute [x < y] as sgn(x-y) and the computation
+     * of x-y will not overflow.
+     *
+     * If the signs differ, then sgn(x) gives the proper result.
+     *
+     * If both x and y are negative, then the order is reversed.
+     * Hence [x < y] = sgn(y-x). We must compute this separately from
+     * sgn(x-y); simply inverting sgn(x-y) would not handle the edge
+     * case x = y properly.
+     */
+    int cc0, cc1;
+    int64_t sx;
+    int64_t sy;
+
+    sx = *(int64_t *)&x;
+    sy = *(int64_t *)&y;
+    sy &= ~((sx ^ sy) >> 63); /* set sy=0 if signs differ */
+
+    cc0 = (int)((sx - sy) >> 63) & 1; /* Neither subtraction overflows when */
+    cc1 = (int)((sy - sx) >> 63) & 1; /* the signs are the same. */
+
+    return cc0 ^ ((cc0 ^ cc1) & (int)((x & y) >> 63));
+}
 
 uint64_t
 fpr_expm_p63(fpr x, fpr ccs) {
diff --git a/src/sig/falcon/pqclean_falcon-512_clean/fpr.h b/src/sig/falcon/pqclean_falcon-512_clean/fpr.h
index f88595e2c..fb6830e71 100644
--- a/src/sig/falcon/pqclean_falcon-512_clean/fpr.h
+++ b/src/sig/falcon/pqclean_falcon-512_clean/fpr.h
@@ -126,11 +126,8 @@ typedef uint64_t fpr;
  *
  * Shift count n MUST be in the 0..63 range.
  */
-static inline uint64_t
-fpr_ursh(uint64_t x, int n) {
-    x ^= (x ^ (x >> 32)) & -(uint64_t)(n >> 5);
-    return x >> (n & 31);
-}
+#define fpr_ursh   PQCLEAN_FALCON512_CLEAN_fpr_ursh
+uint64_t fpr_ursh(uint64_t x, int n);
 
 /*
  * Right-shift a 64-bit signed value by a possibly secret shift count
@@ -138,11 +135,8 @@ fpr_ursh(uint64_t x, int n) {
  *
  * Shift count n MUST be in the 0..63 range.
  */
-static inline int64_t
-fpr_irsh(int64_t x, int n) {
-    x ^= (x ^ (x >> 32)) & -(int64_t)(n >> 5);
-    return x >> (n & 31);
-}
+#define fpr_irsh   PQCLEAN_FALCON512_CLEAN_fpr_irsh
+int64_t fpr_irsh(int64_t x, int n);
 
 /*
  * Left-shift a 64-bit unsigned value by a possibly secret shift count
@@ -150,11 +144,8 @@ fpr_irsh(int64_t x, int n) {
  *
  * Shift count n MUST be in the 0..63 range.
  */
-static inline uint64_t
-fpr_ulsh(uint64_t x, int n) {
-    x ^= (x ^ (x << 32)) & -(uint64_t)(n >> 5);
-    return x << (n & 31);
-}
+#define fpr_ulsh   PQCLEAN_FALCON512_CLEAN_fpr_ulsh
+uint64_t fpr_ulsh(uint64_t x, int n);
 
 /*
  * Expectations:
@@ -171,56 +162,15 @@ fpr_ulsh(uint64_t x, int n) {
  * If e >= -1076 and e != 0, m must be within the expected range
  * (2^54 to 2^55-1).
  */
-static inline fpr
-FPR(int s, int e, uint64_t m) {
-    fpr x;
-    uint32_t t;
-    unsigned f;
+#define FPR   PQCLEAN_FALCON512_CLEAN_FPR
+fpr FPR(int s, int e, uint64_t m);
 
-    /*
-     * If e >= -1076, then the value is "normal"; otherwise, it
-     * should be a subnormal, which we clamp down to zero.
-     */
-    e += 1076;
-    t = (uint32_t)e >> 31;
-    m &= (uint64_t)t - 1;
-
-    /*
-     * If m = 0 then we want a zero; make e = 0 too, but conserve
-     * the sign.
-     */
-    t = (uint32_t)(m >> 54);
-    e &= -(int)t;
-
-    /*
-     * The 52 mantissa bits come from m. Value m has its top bit set
-     * (unless it is a zero); we leave it "as is": the top bit will
-     * increment the exponent by 1, except when m = 0, which is
-     * exactly what we want.
-     */
-    x = (((uint64_t)s << 63) | (m >> 2)) + ((uint64_t)(uint32_t)e << 52);
-
-    /*
-     * Rounding: if the low three bits of m are 011, 110 or 111,
-     * then the value should be incremented to get the next
-     * representable value. This implements the usual
-     * round-to-nearest rule (with preference to even values in case
-     * of a tie). Note that the increment may make a carry spill
-     * into the exponent field, which is again exactly what we want
-     * in that case.
-     */
-    f = (unsigned)m & 7U;
-    x += (0xC8U >> f) & 1;
-    return x;
-}
 
 #define fpr_scaled   PQCLEAN_FALCON512_CLEAN_fpr_scaled
 fpr fpr_scaled(int64_t i, int sc);
 
-static inline fpr
-fpr_of(int64_t i) {
-    return fpr_scaled(i, 0);
-}
+#define fpr_of   PQCLEAN_FALCON512_CLEAN_fpr_of
+fpr fpr_of(int64_t i);
 
 static const fpr fpr_q = 4667981563525332992;
 static const fpr fpr_inverse_of_q = 4545632735260551042;
@@ -244,217 +194,47 @@ static const fpr fpr_ptwo63m1 = 4890909195324358656;
 static const fpr fpr_mtwo63m1 = 14114281232179134464U;
 static const fpr fpr_ptwo63 = 4890909195324358656;
 
-static inline int64_t
-fpr_rint(fpr x) {
-    uint64_t m, d;
-    int e;
-    uint32_t s, dd, f;
+#define fpr_rint   PQCLEAN_FALCON512_CLEAN_fpr_rint
+int64_t fpr_rint(fpr x);
 
-    /*
-     * We assume that the value fits in -(2^63-1)..+(2^63-1). We can
-     * thus extract the mantissa as a 63-bit integer, then right-shift
-     * it as needed.
-     */
-    m = ((x << 10) | ((uint64_t)1 << 62)) & (((uint64_t)1 << 63) - 1);
-    e = 1085 - ((int)(x >> 52) & 0x7FF);
+#define fpr_floor   PQCLEAN_FALCON512_CLEAN_fpr_floor
+int64_t fpr_floor(fpr x);
 
-    /*
-     * If a shift of more than 63 bits is needed, then simply set m
-     * to zero. This also covers the case of an input operand equal
-     * to zero.
-     */
-    m &= -(uint64_t)((uint32_t)(e - 64) >> 31);
-    e &= 63;
-
-    /*
-     * Right-shift m as needed. Shift count is e. Proper rounding
-     * mandates that:
-     *   - If the highest dropped bit is zero, then round low.
-     *   - If the highest dropped bit is one, and at least one of the
-     *     other dropped bits is one, then round up.
-     *   - If the highest dropped bit is one, and all other dropped
-     *     bits are zero, then round up if the lowest kept bit is 1,
-     *     or low otherwise (i.e. ties are broken by "rounding to even").
-     *
-     * We thus first extract a word consisting of all the dropped bit
-     * AND the lowest kept bit; then we shrink it down to three bits,
-     * the lowest being "sticky".
-     */
-    d = fpr_ulsh(m, 63 - e);
-    dd = (uint32_t)d | ((uint32_t)(d >> 32) & 0x1FFFFFFF);
-    f = (uint32_t)(d >> 61) | ((dd | -dd) >> 31);
-    m = fpr_ursh(m, e) + (uint64_t)((0xC8U >> f) & 1U);
-
-    /*
-     * Apply the sign bit.
-     */
-    s = (uint32_t)(x >> 63);
-    return ((int64_t)m ^ -(int64_t)s) + (int64_t)s;
-}
-
-static inline int64_t
-fpr_floor(fpr x) {
-    uint64_t t;
-    int64_t xi;
-    int e, cc;
-
-    /*
-     * We extract the integer as a _signed_ 64-bit integer with
-     * a scaling factor. Since we assume that the value fits
-     * in the -(2^63-1)..+(2^63-1) range, we can left-shift the
-     * absolute value to make it in the 2^62..2^63-1 range: we
-     * will only need a right-shift afterwards.
-     */
-    e = (int)(x >> 52) & 0x7FF;
-    t = x >> 63;
-    xi = (int64_t)(((x << 10) | ((uint64_t)1 << 62))
-                   & (((uint64_t)1 << 63) - 1));
-    xi = (xi ^ -(int64_t)t) + (int64_t)t;
-    cc = 1085 - e;
-
-    /*
-     * We perform an arithmetic right-shift on the value. This
-     * applies floor() semantics on both positive and negative values
-     * (rounding toward minus infinity).
-     */
-    xi = fpr_irsh(xi, cc & 63);
-
-    /*
-     * If the true shift count was 64 or more, then we should instead
-     * replace xi with 0 (if nonnegative) or -1 (if negative). Edge
-     * case: -0 will be floored to -1, not 0 (whether this is correct
-     * is debatable; in any case, the other functions normalize zero
-     * to +0).
-     *
-     * For an input of zero, the non-shifted xi was incorrect (we used
-     * a top implicit bit of value 1, not 0), but this does not matter
-     * since this operation will clamp it down.
-     */
-    xi ^= (xi ^ -(int64_t)t) & -(int64_t)((uint32_t)(63 - cc) >> 31);
-    return xi;
-}
-
-static inline int64_t
-fpr_trunc(fpr x) {
-    uint64_t t, xu;
-    int e, cc;
-
-    /*
-     * Extract the absolute value. Since we assume that the value
-     * fits in the -(2^63-1)..+(2^63-1) range, we can left-shift
-     * the absolute value into the 2^62..2^63-1 range, and then
-     * do a right shift afterwards.
-     */
-    e = (int)(x >> 52) & 0x7FF;
-    xu = ((x << 10) | ((uint64_t)1 << 62)) & (((uint64_t)1 << 63) - 1);
-    cc = 1085 - e;
-    xu = fpr_ursh(xu, cc & 63);
-
-    /*
-     * If the exponent is too low (cc > 63), then the shift was wrong
-     * and we must clamp the value to 0. This also covers the case
-     * of an input equal to zero.
-     */
-    xu &= -(uint64_t)((uint32_t)(cc - 64) >> 31);
-
-    /*
-     * Apply back the sign, if the source value is negative.
-     */
-    t = x >> 63;
-    xu = (xu ^ -t) + t;
-    return *(int64_t *)&xu;
-}
+#define fpr_trunc   PQCLEAN_FALCON512_CLEAN_fpr_trunc
+int64_t fpr_trunc(fpr x);
 
 #define fpr_add   PQCLEAN_FALCON512_CLEAN_fpr_add
 fpr fpr_add(fpr x, fpr y);
 
-static inline fpr
-fpr_sub(fpr x, fpr y) {
-    y ^= (uint64_t)1 << 63;
-    return fpr_add(x, y);
-}
+#define fpr_sub   PQCLEAN_FALCON512_CLEAN_fpr_sub
+fpr fpr_sub(fpr x, fpr y);
 
-static inline fpr
-fpr_neg(fpr x) {
-    x ^= (uint64_t)1 << 63;
-    return x;
-}
+#define fpr_neg   PQCLEAN_FALCON512_CLEAN_fpr_neg
+fpr fpr_neg(fpr x);
 
-static inline fpr
-fpr_half(fpr x) {
-    /*
-     * To divide a value by 2, we just have to subtract 1 from its
-     * exponent, but we have to take care of zero.
-     */
-    uint32_t t;
+#define fpr_half   PQCLEAN_FALCON512_CLEAN_fpr_half
+fpr fpr_half(fpr x);
 
-    x -= (uint64_t)1 << 52;
-    t = (((uint32_t)(x >> 52) & 0x7FF) + 1) >> 11;
-    x &= (uint64_t)t - 1;
-    return x;
-}
-
-static inline fpr
-fpr_double(fpr x) {
-    /*
-     * To double a value, we just increment by one the exponent. We
-     * don't care about infinites or NaNs; however, 0 is a
-     * special case.
-     */
-    x += (uint64_t)((((unsigned)(x >> 52) & 0x7FFU) + 0x7FFU) >> 11) << 52;
-    return x;
-}
+#define fpr_double   PQCLEAN_FALCON512_CLEAN_fpr_double
+fpr fpr_double(fpr x);
 
 #define fpr_mul   PQCLEAN_FALCON512_CLEAN_fpr_mul
 fpr fpr_mul(fpr x, fpr y);
 
-static inline fpr
-fpr_sqr(fpr x) {
-    return fpr_mul(x, x);
-}
+#define fpr_sqr   PQCLEAN_FALCON512_CLEAN_fpr_sqr
+fpr fpr_sqr(fpr x);
 
 #define fpr_div   PQCLEAN_FALCON512_CLEAN_fpr_div
 fpr fpr_div(fpr x, fpr y);
 
-static inline fpr
-fpr_inv(fpr x) {
-    return fpr_div(4607182418800017408u, x);
-}
+#define fpr_inv   PQCLEAN_FALCON512_CLEAN_fpr_inv
+fpr fpr_inv(fpr x);
 
 #define fpr_sqrt   PQCLEAN_FALCON512_CLEAN_fpr_sqrt
 fpr fpr_sqrt(fpr x);
 
-static inline int
-fpr_lt(fpr x, fpr y) {
-    /*
-     * If both x and y are positive, then a signed comparison yields
-     * the proper result:
-     *   - For positive values, the order is preserved.
-     *   - The sign bit is at the same place as in integers, so
-     *     sign is preserved.
-     * Moreover, we can compute [x < y] as sgn(x-y) and the computation
-     * of x-y will not overflow.
-     *
-     * If the signs differ, then sgn(x) gives the proper result.
-     *
-     * If both x and y are negative, then the order is reversed.
-     * Hence [x < y] = sgn(y-x). We must compute this separately from
-     * sgn(x-y); simply inverting sgn(x-y) would not handle the edge
-     * case x = y properly.
-     */
-    int cc0, cc1;
-    int64_t sx;
-    int64_t sy;
-
-    sx = *(int64_t *)&x;
-    sy = *(int64_t *)&y;
-    sy &= ~((sx ^ sy) >> 63); /* set sy=0 if signs differ */
-
-    cc0 = (int)((sx - sy) >> 63) & 1; /* Neither subtraction overflows when */
-    cc1 = (int)((sy - sx) >> 63) & 1; /* the signs are the same. */
-
-    return cc0 ^ ((cc0 ^ cc1) & (int)((x & y) >> 63));
-}
+#define fpr_lt   PQCLEAN_FALCON512_CLEAN_fpr_lt
+int fpr_lt(fpr x, fpr y);
 
 /*
  * Compute exp(x) for x such that |x| <= ln 2. We want a precision of 50
diff --git a/src/sig/falcon/pqclean_falcon-512_clean/inner.c b/src/sig/falcon/pqclean_falcon-512_clean/inner.c
new file mode 100755
index 000000000..dd90bd57e
--- /dev/null
+++ b/src/sig/falcon/pqclean_falcon-512_clean/inner.c
@@ -0,0 +1,70 @@
+#include "inner.h"
+
+/*
+ * ==========================(LICENSE BEGIN)============================
+ *
+ * Copyright (c) 2017-2019  Falcon Project
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * ===========================(LICENSE END)=============================
+ */
+
+unsigned set_fpu_cw(unsigned x) {
+    return x; /* no-op variant: the argument is handed back unchanged */
+}
+
+
+uint64_t prng_get_u64(prng *p) {
+    size_t u;
+
+    /*
+     * Return the next 8 buffered bytes as one 64-bit value. If 9 or
+     * fewer bytes remain (ptr >= size - 9), we refill first. This
+     * means that we may drop the last few bytes, but this allows for
+     * faster extraction code, and we never leave an empty buffer.
+     */
+    u = p->ptr;
+    if (u >= (sizeof p->buf.d) - 9) {
+        PQCLEAN_FALCON512_CLEAN_prng_refill(p);
+        u = 0; /* after the refill, read from offset 0 */
+    }
+    p->ptr = u + 8; /* the 8 bytes assembled below are now consumed */
+
+    return (uint64_t)p->buf.d[u + 0] /* little-endian: lowest offset is the low byte */
+           | ((uint64_t)p->buf.d[u + 1] << 8)
+           | ((uint64_t)p->buf.d[u + 2] << 16)
+           | ((uint64_t)p->buf.d[u + 3] << 24)
+           | ((uint64_t)p->buf.d[u + 4] << 32)
+           | ((uint64_t)p->buf.d[u + 5] << 40)
+           | ((uint64_t)p->buf.d[u + 6] << 48)
+           | ((uint64_t)p->buf.d[u + 7] << 56);
+}
+
+
+unsigned prng_get_u8(prng *p) {
+    unsigned v;
+
+    v = p->buf.d[p->ptr ++]; /* take one buffered byte and advance the read offset */
+    if (p->ptr == sizeof p->buf.d) { /* that was the last byte in the buffer */
+        PQCLEAN_FALCON512_CLEAN_prng_refill(p);
+    }
+    return v;
+}
diff --git a/src/sig/falcon/pqclean_falcon-512_clean/inner.h b/src/sig/falcon/pqclean_falcon-512_clean/inner.h
index b81197f1c..d469c9237 100644
--- a/src/sig/falcon/pqclean_falcon-512_clean/inner.h
+++ b/src/sig/falcon/pqclean_falcon-512_clean/inner.h
@@ -99,13 +99,8 @@
  * targets other than 32-bit x86, or when the native 'double' type is
  * not used, the set_fpu_cw() function does nothing at all.
  */
-static inline unsigned
-set_fpu_cw(unsigned x) {
-    return x;
-}
-
-
-
+#define set_fpu_cw PQCLEAN_FALCON512_CLEAN_set_fpu_cw
+unsigned set_fpu_cw(unsigned x);
 
 /* ==================================================================== */
 /*
@@ -496,46 +491,14 @@ void PQCLEAN_FALCON512_CLEAN_prng_get_bytes(prng *p, void *dst, size_t len);
 /*
  * Get a 64-bit random value from a PRNG.
  */
-static inline uint64_t
-prng_get_u64(prng *p) {
-    size_t u;
-
-    /*
-     * If there are less than 9 bytes in the buffer, we refill it.
-     * This means that we may drop the last few bytes, but this allows
-     * for faster extraction code. Also, it means that we never leave
-     * an empty buffer.
-     */
-    u = p->ptr;
-    if (u >= (sizeof p->buf.d) - 9) {
-        PQCLEAN_FALCON512_CLEAN_prng_refill(p);
-        u = 0;
-    }
-    p->ptr = u + 8;
-
-    return (uint64_t)p->buf.d[u + 0]
-           | ((uint64_t)p->buf.d[u + 1] << 8)
-           | ((uint64_t)p->buf.d[u + 2] << 16)
-           | ((uint64_t)p->buf.d[u + 3] << 24)
-           | ((uint64_t)p->buf.d[u + 4] << 32)
-           | ((uint64_t)p->buf.d[u + 5] << 40)
-           | ((uint64_t)p->buf.d[u + 6] << 48)
-           | ((uint64_t)p->buf.d[u + 7] << 56);
-}
+#define prng_get_u64 PQCLEAN_FALCON512_CLEAN_prng_get_u64
+uint64_t prng_get_u64(prng *p);
 
 /*
  * Get an 8-bit random value from a PRNG.
  */
-static inline unsigned
-prng_get_u8(prng *p) {
-    unsigned v;
-
-    v = p->buf.d[p->ptr ++];
-    if (p->ptr == sizeof p->buf.d) {
-        PQCLEAN_FALCON512_CLEAN_prng_refill(p);
-    }
-    return v;
-}
+#define prng_get_u8 PQCLEAN_FALCON512_CLEAN_prng_get_u8
+unsigned prng_get_u8(prng *p);
 
 /* ==================================================================== */
 /*
diff --git a/src/sig/falcon/pqclean_falcon-512_clean/sign.c b/src/sig/falcon/pqclean_falcon-512_clean/sign.c
index 87566d985..469ae3b42 100644
--- a/src/sig/falcon/pqclean_falcon-512_clean/sign.c
+++ b/src/sig/falcon/pqclean_falcon-512_clean/sign.c
@@ -267,7 +267,7 @@ PQCLEAN_FALCON512_CLEAN_expand_privkey(fpr *expanded_key,
     PQCLEAN_FALCON512_CLEAN_poly_neg(rF, logn);
 
     /*
-     * The Gram matrix is G = B·B*. Formulas are:
+     * The Gram matrix is G = B x B*. Formulas are:
      *   g00 = b00*adj(b00) + b01*adj(b01)
      *   g01 = b00*adj(b10) + b01*adj(b11)
      *   g10 = b10*adj(b00) + b11*adj(b01)
@@ -788,7 +788,7 @@ do_sign_dyn(samplerZ samp, void *samp_ctx, int16_t *s2,
     PQCLEAN_FALCON512_CLEAN_poly_neg(b11, logn);
 
     /*
-     * Compute the Gram matrix G = B·B*. Formulas are:
+     * Compute the Gram matrix G = B x B*. Formulas are:
      *   g00 = b00*adj(b00) + b01*adj(b01)
      *   g01 = b00*adj(b10) + b01*adj(b11)
      *   g10 = b10*adj(b00) + b11*adj(b01)