From 7fcf29d37f5870a06aba18c72c090af96f483a82 Mon Sep 17 00:00:00 2001 From: Christian Paquin Date: Wed, 23 Nov 2016 11:06:54 -0500 Subject: [PATCH] Integrates MSR's RLWE Latticecrypto library into OQS. (#58) --- Makefile | 9 +- README.md | 8 +- VisualStudio/oqs/oqs.vcxproj | 10 + VisualStudio/oqs/oqs.vcxproj.filters | 240 +++--- src/kex/kex.c | 3 + src/kex/kex.h | 1 + src/kex/test_kex.c | 1 + src/kex_rlwe_msrln16/AMD64/consts.c | 40 + src/kex_rlwe_msrln16/AMD64/error_asm.S | 436 ++++++++++ src/kex_rlwe_msrln16/AMD64/ntt_x64.c | 65 ++ src/kex_rlwe_msrln16/AMD64/ntt_x64_asm.S | 979 ++++++++++++++++++++++ src/kex_rlwe_msrln16/LatticeCrypto.h | 94 +++ src/kex_rlwe_msrln16/LatticeCrypto_kex.c | 452 ++++++++++ src/kex_rlwe_msrln16/LatticeCrypto_priv.h | 122 +++ src/kex_rlwe_msrln16/License.txt | 25 + src/kex_rlwe_msrln16/README.txt | 42 + src/kex_rlwe_msrln16/external/shake128.c | 332 ++++++++ src/kex_rlwe_msrln16/external/shake128.h | 24 + src/kex_rlwe_msrln16/generic/ntt.c | 182 ++++ src/kex_rlwe_msrln16/kex_rlwe_msrln16.c | 165 ++++ src/kex_rlwe_msrln16/kex_rlwe_msrln16.h | 24 + src/kex_rlwe_msrln16/ntt_constants.c | 145 ++++ 22 files changed, 3270 insertions(+), 129 deletions(-) create mode 100644 src/kex_rlwe_msrln16/AMD64/consts.c create mode 100644 src/kex_rlwe_msrln16/AMD64/error_asm.S create mode 100644 src/kex_rlwe_msrln16/AMD64/ntt_x64.c create mode 100644 src/kex_rlwe_msrln16/AMD64/ntt_x64_asm.S create mode 100644 src/kex_rlwe_msrln16/LatticeCrypto.h create mode 100644 src/kex_rlwe_msrln16/LatticeCrypto_kex.c create mode 100644 src/kex_rlwe_msrln16/LatticeCrypto_priv.h create mode 100644 src/kex_rlwe_msrln16/License.txt create mode 100644 src/kex_rlwe_msrln16/README.txt create mode 100644 src/kex_rlwe_msrln16/external/shake128.c create mode 100644 src/kex_rlwe_msrln16/external/shake128.h create mode 100644 src/kex_rlwe_msrln16/generic/ntt.c create mode 100644 src/kex_rlwe_msrln16/kex_rlwe_msrln16.c create mode 100644 src/kex_rlwe_msrln16/kex_rlwe_msrln16.h 
create mode 100644 src/kex_rlwe_msrln16/ntt_constants.c diff --git a/Makefile b/Makefile index 6305275c2..950bf8683 100644 --- a/Makefile +++ b/Makefile @@ -58,6 +58,7 @@ links: $(LN) ../../src/kex/kex.h include/oqs $(LN) ../../src/kex_rlwe_bcns15/kex_rlwe_bcns15.h include/oqs $(LN) ../../src/kex_rlwe_newhope/kex_rlwe_newhope.h include/oqs + $(LN) ../../src/kex_rlwe_msrln16/kex_rlwe_msrln16.h include/oqs $(LN) ../../src/kex_lwe_frodo/kex_lwe_frodo.h include/oqs $(LN) ../../src/rand/rand.h include/oqs $(LN) ../../src/rand_urandom_chacha20/rand_urandom_chacha20.h include/oqs @@ -85,6 +86,11 @@ KEX_RLWE_NEWHOPE_OBJS := $(addprefix objs/kex_rlwe_newhope/, kex_rlwe_newhope.o) KEX_RLWE_NEWHOPE_HEADERS := $(addprefix src/kex_rlwe_newhope/, kex_rlwe_newhope.h fips202.c newhope.c params.h poly.c precomp.c) $(KEX_RLWE_NEWHOPE_OBJS): $(KEX_RLWE_NEWHOPE_HEADERS) +# KEX_RLWE_MSRLN16 +KEX_RLWE_MSRLN16_OBJS := $(addprefix objs/kex_rlwe_msrln16/, kex_rlwe_msrln16.o LatticeCrypto_kex.o ntt_constants.o) +KEX_RLWE_MSRLN16_HEADERS := $(addprefix src/kex_rlwe_msrln16/, LatticeCrypto.h LatticeCrypto_priv.h kex_rlwe_msrln16.h ) +$(KEX_RLWE_MSRLN16_OBJS): $(KEX_RLWE_MSRLN16_HEADERS) + # KEX_LWE_FRODO KEX_LWE_FRODO_OBJS := $(addprefix objs/kex_lwe_frodo/, lwe.o kex_lwe_frodo.o lwe_noise.o) KEX_LWE_FRODO_HEADERS := $(addprefix src/kex_lwe_frodo/, kex_lwe_frodo.h local.h) @@ -106,9 +112,10 @@ objs/kex/kex.o: src/kex/kex.h # LIB + RAND_OBJS := $(RAND_URANDOM_AESCTR_OBJS) $(RAND_URANDOM_CHACHA_OBJS) -lib: $(RAND_OBJS) $(KEX_RLWE_BCNS15_OBJS) $(KEX_RLWE_NEWHOPE_OBJS) $(KEX_LWE_FRODO_OBJS) objs/rand/rand.o objs/kex/kex.o $(AES_OBJS) $(COMMON_OBJS) +lib: $(RAND_OBJS) $(KEX_RLWE_BCNS15_OBJS) $(KEX_RLWE_NEWHOPE_OBJS) $(KEX_LWE_FRODO_OBJS) $(KEX_RLWE_MSRLN16_OBJS) objs/rand/rand.o objs/kex/kex.o $(AES_OBJS) $(COMMON_OBJS) rm -f liboqs.a $(AR) liboqs.a $^ $(RANLIB) liboqs.a diff --git a/README.md b/README.md index 5aeca9be5..13fce501c 100644 --- a/README.md +++ b/README.md @@ -26,6 +26,7 @@ liboqs 
currently contains: - `rand_urandom_chacha20`: pseudorandom number generator seeded from /dev/urandom and expanded using the ChaCha20 stream cipher - `kex_rlwe_bcns15`: key exchange from the ring learning with errors problem (Bos, Costello, Naehrig, Stebila, *IEEE Symposium on Security & Privacy 2015*, [https://eprint.iacr.org/2014/599](https://eprint.iacr.org/2014/599)) - `kex_rlwe_newhope`: "NewHope": key exchange from the ring learning with errors problem (Alkim, Ducas, Pöppelmann, Schwabe, *USENIX Security 2016*, [https://eprint.iacr.org/2015/1092](https://eprint.iacr.org/2015/1092)) (using the reference C implementation of NewHope from [https://github.com/tpoeppelmann/newhope](https://github.com/tpoeppelmann/newhope)) +- `kex_rlwe_msrln16`: MSR implementation of Peikert's RLWE key exchange, based on the implementation of Alkim, Ducas, Pöppelmann, and Schwabe, with improvements from Longa and Naehrig [https://www.microsoft.com/en-us/research/project/lattice-cryptography-library/](https://www.microsoft.com/en-us/research/project/lattice-cryptography-library/) - `kex_lwe_frodo`: key exchange from the learning with errors problem (Bos, Costello, Ducas, Mironov, Naehrig, Nikolaenko, Raghunathan, Stebila, *ACM Conference on Computer and Communications Security 2016*, [http://eprint.iacr.org/2016/659](http://eprint.iacr.org/2016/659)) Building and Running @@ -114,11 +115,12 @@ In the long term, we are also interested in including post-quantum signature sch License ------- -liboqs is licensed under the MIT License; see [https://github.com/open-quantum-safe/liboqs/blob/master/LICENSE.txt](LICENSE.txt) for details. liboqs includes some third party libraries or modules that are licensed differently; the corresponding subfolder contains the license that applies in that case. In particular: +liboqs is licensed under the MIT License; see [LICENSE.txt](https://github.com/open-quantum-safe/liboqs/blob/master/LICENSE.txt) for details. 
liboqs includes some third party libraries or modules that are licensed differently; the corresponding subfolder contains the license that applies in that case. In particular: -- `src/kex_rlwe_bcns15`: public domain ([http://unlicense.org](http://unlicense.org)) -- `src/rand_urandom_chacha20/external`: public domain +- `src/kex_rlwe_bcns15`: public domain ([Unlicense](http://unlicense.org)) +- `src/kex_rlwe_msrln16/external`: public domain ([CC0](http://creativecommons.org/publicdomain/zero/1.0/)) - `src/kex_rlwe_newhope`: public domain +- `src/rand_urandom_chacha20/external`: public domain Team ---- diff --git a/VisualStudio/oqs/oqs.vcxproj b/VisualStudio/oqs/oqs.vcxproj index d686f2ed7..c881e5206 100644 --- a/VisualStudio/oqs/oqs.vcxproj +++ b/VisualStudio/oqs/oqs.vcxproj @@ -29,6 +29,9 @@ + + + @@ -47,6 +50,9 @@ + + + @@ -134,6 +140,7 @@ copy "$(SolutionDir)..\src\kex\kex.h" "$(SolutionDir)include\oqs\" copy "$(SolutionDir)..\src\aes\aes.h" "$(SolutionDir)include\oqs\" copy "$(SolutionDir)..\src\kex_rlwe_bcns15\kex_rlwe_bcns15.h" "$(SolutionDir)include\oqs\" copy "$(SolutionDir)..\src\kex_rlwe_newhope\kex_rlwe_newhope.h" "$(SolutionDir)include\oqs\" +copy "$(SolutionDir)..\src\kex_rlwe_msrln16\kex_rlwe_msrln16.h" "$(SolutionDir)include\oqs\" copy "$(SolutionDir)..\src\kex_lwe_frodo\kex_lwe_frodo.h" "$(SolutionDir)include\oqs\" @@ -161,6 +168,7 @@ copy "$(SolutionDir)..\src\kex\kex.h" "$(SolutionDir)include\oqs\" copy "$(SolutionDir)..\src\aes\aes.h" "$(SolutionDir)include\oqs\" copy "$(SolutionDir)..\src\kex_rlwe_bcns15\kex_rlwe_bcns15.h" "$(SolutionDir)include\oqs\" copy "$(SolutionDir)..\src\kex_rlwe_newhope\kex_rlwe_newhope.h" "$(SolutionDir)include\oqs\" +copy "$(SolutionDir)..\src\kex_rlwe_msrln16\kex_rlwe_msrln16.h" "$(SolutionDir)include\oqs\" copy "$(SolutionDir)..\src\kex_lwe_frodo\kex_lwe_frodo.h" "$(SolutionDir)include\oqs\" @@ -192,6 +200,7 @@ copy "$(SolutionDir)..\src\kex\kex.h" "$(SolutionDir)include\oqs\" copy "$(SolutionDir)..\src\aes\aes.h" 
"$(SolutionDir)include\oqs\" copy "$(SolutionDir)..\src\kex_rlwe_bcns15\kex_rlwe_bcns15.h" "$(SolutionDir)include\oqs\" copy "$(SolutionDir)..\src\kex_rlwe_newhope\kex_rlwe_newhope.h" "$(SolutionDir)include\oqs\" +copy "$(SolutionDir)..\src\kex_rlwe_msrln16\kex_rlwe_msrln16.h" "$(SolutionDir)include\oqs\" copy "$(SolutionDir)..\src\kex_lwe_frodo\kex_lwe_frodo.h" "$(SolutionDir)include\oqs\" @@ -223,6 +232,7 @@ copy "$(SolutionDir)..\src\kex\kex.h" "$(SolutionDir)include\oqs\" copy "$(SolutionDir)..\src\aes\aes.h" "$(SolutionDir)include\oqs\" copy "$(SolutionDir)..\src\kex_rlwe_bcns15\kex_rlwe_bcns15.h" "$(SolutionDir)include\oqs\" copy "$(SolutionDir)..\src\kex_rlwe_newhope\kex_rlwe_newhope.h" "$(SolutionDir)include\oqs\" +copy "$(SolutionDir)..\src\kex_rlwe_msrln16\kex_rlwe_msrln16.h" "$(SolutionDir)include\oqs\" copy "$(SolutionDir)..\src\kex_lwe_frodo\kex_lwe_frodo.h" "$(SolutionDir)include\oqs\" diff --git a/VisualStudio/oqs/oqs.vcxproj.filters b/VisualStudio/oqs/oqs.vcxproj.filters index a8b6055c0..d8bdc128b 100644 --- a/VisualStudio/oqs/oqs.vcxproj.filters +++ b/VisualStudio/oqs/oqs.vcxproj.filters @@ -1,138 +1,128 @@  - - {4FC737F1-C7A5-4376-A066-2A32D752A2FF} - cpp;c;cc;cxx;def;odl;idl;hpj;bat;asm;asmx - - - {93995380-89BD-4b04-88EB-625FBE52EBFB} - h;hh;hpp;hxx;hm;inl;inc;xsd - - - {67DA6AB6-F800-4c08-8B7A-83BB121AAD01} - rc;ico;cur;bmp;dlg;rc2;rct;bin;rgs;gif;jpg;jpeg;jpe;resx;tiff;tif;png;wav;mfcribbon-ms - - - {178cf83e-52eb-4ead-a1ca-33558ffd988e} - - - {15505124-034e-4100-a6cb-a32309738b25} - - - {768db348-be19-4ce5-a6bc-22d81c2c59d3} - - - {65721ee8-a809-4e67-88a8-ad33d6a6a800} - - - {d26b84bb-a12a-4229-b619-8ddfdaca9928} - - - {9445c3c6-bd2e-4448-8cba-a986ef929d45} - - - {9e41843f-5671-4bbf-916a-2688ac3ceaff} - - - {fb1ab057-5f38-445f-9bfa-2486aa8200e5} - - - {a6bb776b-ac51-4243-aec8-396a52ed9560} - - - {379cec65-7e0e-4eb2-9be1-414b154a911f} - - - - - Header Files - - - Header Files\BCNS15 - - - Header Files\rand - - - Header Files\rand - - - Header 
Files\BCNS15 - - - Header Files\BCNS15 - - - Header Files\BCNS15 - - - Header Files\Newhope - - - Header Files\Newhope - - - Header Files\Frodo - - - Header Files\Frodo - - - Header Files\AES - - - Header Files\rand - - - - - Source Files - - - Source Files\BCNS15 - - - Source Files\BCNS15 - - - Source Files\rand - - - Source Files\rand - - - Source Files\BCNS15 - - - Source Files\BCNS15 - - - Source Files\Newhope - - - Source Files\Newhope - - - Source Files\Frodo - - - Source Files\Frodo - - - Source Files\Frodo - + - Source Files\AES + AES - Source Files\AES + AES - Source Files\AES + AES + + + BCNS15 + + + Frodo + + + BCNS15 + + + MSR LN16 + + + NewHope + + + MSR LN16 + + + Frodo + + + Frodo + + + NewHope + + + MSR LN16 + + + Rand - Source Files\rand + Rand + + + Rand + + + BCNS15 + + + BCNS15 + + + + AES + + + Frodo + + + BCNS15 + + + MSR LN16 + + + NewHope + + + MSR LN16 + + + MSR LN16 + + + BCNS15 + + + Frodo + + + NewHope + + + Rand + + + Rand + + + Rand + + + BCNS15 + + + BCNS15 + + + + + {71c917ec-9181-4b88-bdfc-9611ee1abe9a} + + + {6bfff158-3e78-402f-ba16-e8d315089de8} + + + {d0291785-4232-4264-b1bd-08b7e3f8df5e} + + + {ab581356-2a96-4211-99e3-f5cecd92eda3} + + + {fd44eb34-2f81-411e-a55f-f279c4b101de} + + + {9f5ed87f-ed1e-47b4-b7e7-1d6648cb88fd} + + \ No newline at end of file diff --git a/src/kex/kex.c b/src/kex/kex.c index 459ef8b38..aa3a717c9 100644 --- a/src/kex/kex.c +++ b/src/kex/kex.c @@ -3,6 +3,7 @@ #include #include #include +#include #include OQS_KEX *OQS_KEX_new(OQS_RAND *rand, enum OQS_KEX_alg_name alg_name, const uint8_t *seed, const size_t seed_len, const char *named_parameters) { @@ -11,6 +12,8 @@ OQS_KEX *OQS_KEX_new(OQS_RAND *rand, enum OQS_KEX_alg_name alg_name, const uint8 return OQS_KEX_rlwe_bcns15_new(rand); case OQS_KEX_alg_rlwe_bcns15: return OQS_KEX_rlwe_bcns15_new(rand); + case OQS_KEX_alg_rlwe_msrln16: + return OQS_KEX_rlwe_msrln16_new(rand); case OQS_KEX_alg_rlwe_newhope: return OQS_KEX_rlwe_newhope_new(rand); case 
OQS_KEX_alg_lwe_frodo: diff --git a/src/kex/kex.h b/src/kex/kex.h index 06befd197..9dcaf0291 100644 --- a/src/kex/kex.h +++ b/src/kex/kex.h @@ -15,6 +15,7 @@ enum OQS_KEX_alg_name { OQS_KEX_alg_default, OQS_KEX_alg_rlwe_bcns15, OQS_KEX_alg_rlwe_newhope, + OQS_KEX_alg_rlwe_msrln16, OQS_KEX_alg_lwe_frodo, }; diff --git a/src/kex/test_kex.c b/src/kex/test_kex.c index 9cd523a36..136dfa783 100644 --- a/src/kex/test_kex.c +++ b/src/kex/test_kex.c @@ -21,6 +21,7 @@ struct kex_testcase { struct kex_testcase kex_testcases[] = { { OQS_KEX_alg_rlwe_bcns15, NULL, 0, NULL, "rlwe_bcns15", 0 }, { OQS_KEX_alg_rlwe_newhope, NULL, 0, NULL, "rlwe_newhope", 0 }, + { OQS_KEX_alg_rlwe_msrln16, NULL, 0, NULL, "rlwe_msrln16", 0 }, { OQS_KEX_alg_lwe_frodo, (unsigned char *) "01234567890123456", 16, "recommended", "lwe_frodo_recommended", 0 }, }; diff --git a/src/kex_rlwe_msrln16/AMD64/consts.c b/src/kex_rlwe_msrln16/AMD64/consts.c new file mode 100644 index 000000000..2f1bc97de --- /dev/null +++ b/src/kex_rlwe_msrln16/AMD64/consts.c @@ -0,0 +1,40 @@ +/**************************************************************************************** +* LatticeCrypto: an efficient post-quantum Ring-Learning With Errors cryptography library +* +* Copyright (c) Microsoft Corporation. All rights reserved. 
+* +* +* Abstract: constants for the x64 assembly implementation +* +*****************************************************************************************/ + +#include "../LatticeCrypto_priv.h" +#include + + +uint32_t PRIME8x[8] = {OQS_RLWE_MSRLN16_PARAMETER_Q, OQS_RLWE_MSRLN16_PARAMETER_Q, OQS_RLWE_MSRLN16_PARAMETER_Q, OQS_RLWE_MSRLN16_PARAMETER_Q, OQS_RLWE_MSRLN16_PARAMETER_Q, OQS_RLWE_MSRLN16_PARAMETER_Q, OQS_RLWE_MSRLN16_PARAMETER_Q, OQS_RLWE_MSRLN16_PARAMETER_Q}; +uint8_t ONE32x[32] = {1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1}; +uint32_t MASK12x8[8] = {0xfff,0xfff,0xfff,0xfff,0xfff,0xfff,0xfff,0xfff}; +uint32_t PERM0246[4] = {0,2,4,6}; +uint32_t PERM00224466[8] = {0,0,2,2,4,4,6,6}; +uint32_t PERM02134657[8] = {0,2,1,3,4,6,5,7}; +uint64_t PERM0145[4] = {0,1,4,5}; +uint64_t PERM2367[4] = {2,3,6,7}; +uint64_t MASK32[4] = {0xffffffff,0,0xffffffff,0}; +uint64_t MASK42[4] = {0x3fff0000000,0,0x3fff0000000,0}; + +uint64_t MASK14_1[4] = {0x3fff,0,0x3fff,0}; +uint64_t MASK14_2[4] = {0xFFFC000,0,0xFFFC000,0}; +uint64_t MASK14_3[4] = {0x3FFF0000000,0,0x3FFF0000000,0}; +uint64_t MASK14_4[4] = {0xFFFC0000000000,0,0xFFFC0000000000,0}; + +uint32_t ONE8x[8] = {1,1,1,1,1,1,1,1}; +uint32_t THREE8x[8] = {3,3,3,3,3,3,3,3}; +uint32_t FOUR8x[8] = {4,4,4,4,4,4,4,4}; +uint32_t PARAM_Q4x8[8] = {3073,3073,3073,3073,3073,3073,3073,3073}; +uint32_t PARAM_3Q4x8[8] = {9217,9217,9217,9217,9217,9217,9217,9217}; +uint32_t PARAM_5Q4x8[8] = {15362,15362,15362,15362,15362,15362,15362,15362}; +uint32_t PARAM_7Q4x8[8] = {21506,21506,21506,21506,21506,21506,21506,21506}; +uint32_t PARAM_Q2x8[8] = {6145,6145,6145,6145,6145,6145,6145,6145}; +uint32_t PARAM_3Q2x8[8] = {18434,18434,18434,18434,18434,18434,18434,18434}; + diff --git a/src/kex_rlwe_msrln16/AMD64/error_asm.S b/src/kex_rlwe_msrln16/AMD64/error_asm.S new file mode 100644 index 000000000..83baf4158 --- /dev/null +++ b/src/kex_rlwe_msrln16/AMD64/error_asm.S @@ -0,0 +1,436 @@ 
+//**************************************************************************************** +// LatticeCrypto: an efficient post-quantum Ring-Learning With Errors cryptography library +// +// Copyright (c) Microsoft Corporation. All rights reserved. +// +// +// Abstract: functions for error sampling and reconciliation in x64 assembly using AVX2 +// vector instructions for Linux +// +//**************************************************************************************** + +.intel_syntax noprefix + +// Registers that are used for parameter passing: +#define reg_p1 rdi +#define reg_p2 rsi +#define reg_p3 rdx +#define reg_p4 rcx +#define reg_p5 r8 + + +.text +//*********************************************************************** +// Error sampling from psi_12 +// Operation: c [reg_p2] <- sampling(a) [reg_p1] +//*********************************************************************** +.global oqs_rlwe_msrln16_error_sampling_asm +oqs_rlwe_msrln16_error_sampling_asm: + vmovdqu ymm7, ONE32x + movq r11, 384 + movq r10, 32 + movq r8, 24 + xor rax, rax + xor rcx, rcx +loop1: + vmovdqu ymm0, YMMWORD PTR [reg_p1+4*rax] // sample + vmovdqu ymm2, YMMWORD PTR [reg_p1+4*rax+32] // sample + vmovdqu ymm4, YMMWORD PTR [reg_p1+4*rax+64] // sample + movq r9, 2 + +loop1b: + vpand ymm1, ymm0, ymm7 // Collecting 8 bits for first sample + vpsrlw ymm0, ymm0, 1 + vpand ymm3, ymm0, ymm7 + vpaddb ymm1, ymm1, ymm3 + vpsrlw ymm0, ymm0, 1 + vpand ymm3, ymm0, ymm7 + vpaddb ymm1, ymm1, ymm3 + vpsrlw ymm0, ymm0, 1 + vpand ymm3, ymm0, ymm7 + vpaddb ymm1, ymm1, ymm3 + vpsrlw ymm0, ymm0, 1 + vpand ymm3, ymm0, ymm7 + vpaddb ymm1, ymm1, ymm3 + vpsrlw ymm0, ymm0, 1 + vpand ymm3, ymm0, ymm7 + vpaddb ymm1, ymm1, ymm3 + vpsrlw ymm0, ymm0, 1 + vpand ymm3, ymm0, ymm7 + vpaddb ymm1, ymm1, ymm3 + vpsrlw ymm0, ymm0, 1 + vpand ymm3, ymm0, ymm7 + vpaddb ymm1, ymm1, ymm3 + + vpand ymm3, ymm2, ymm7 // Adding next 4 bits + vpaddb ymm1, ymm1, ymm3 + vpsrlw ymm2, ymm2, 1 + vpand ymm3, ymm2, ymm7 + vpaddb ymm1, ymm1, 
ymm3 + vpsrlw ymm2, ymm2, 1 + vpand ymm3, ymm2, ymm7 + vpaddb ymm1, ymm1, ymm3 + vpsrlw ymm2, ymm2, 1 + vpand ymm3, ymm2, ymm7 + vpaddb ymm1, ymm1, ymm3 + + vpsrlw ymm2, ymm2, 1 // Collecting 4-bits for second sample + vpand ymm5, ymm2, ymm7 + vpsrlw ymm2, ymm2, 1 + vpand ymm3, ymm2, ymm7 + vpaddb ymm5, ymm5, ymm3 + vpsrlw ymm2, ymm2, 1 + vpand ymm3, ymm2, ymm7 + vpaddb ymm5, ymm5, ymm3 + vpsrlw ymm2, ymm2, 1 + vpand ymm3, ymm2, ymm7 + vpaddb ymm5, ymm5, ymm3 + + vpand ymm3, ymm4, ymm7 // Adding next 8 bits + vpaddb ymm5, ymm5, ymm3 + vpsrlw ymm4, ymm4, 1 + vpand ymm3, ymm4, ymm7 + vpaddb ymm5, ymm5, ymm3 + vpsrlw ymm4, ymm4, 1 + vpand ymm3, ymm4, ymm7 + vpaddb ymm5, ymm5, ymm3 + vpsrlw ymm4, ymm4, 1 + vpand ymm3, ymm4, ymm7 + vpaddb ymm5, ymm5, ymm3 + vpsrlw ymm4, ymm4, 1 + vpand ymm3, ymm4, ymm7 + vpaddb ymm5, ymm5, ymm3 + vpsrlw ymm4, ymm4, 1 + vpand ymm3, ymm4, ymm7 + vpaddb ymm5, ymm5, ymm3 + vpsrlw ymm4, ymm4, 1 + vpand ymm3, ymm4, ymm7 + vpaddb ymm5, ymm5, ymm3 + vpsrlw ymm4, ymm4, 1 + vpand ymm3, ymm4, ymm7 + vpaddb ymm5, ymm5, ymm3 + + vpsubb ymm5, ymm1, ymm5 + vpermq ymm3, ymm5, 0x0e + vpmovsxbd ymm6, xmm5 + vpsrldq ymm5, ymm5, 8 + vpmovsxbd ymm7, xmm5 + vpmovsxbd ymm8, xmm3 + vpsrldq ymm3, ymm3, 8 + vpmovsxbd ymm9, xmm3 + vmovdqu YMMWORD PTR [reg_p2+4*rcx], ymm6 + vmovdqu YMMWORD PTR [reg_p2+4*rcx+32], ymm7 + vmovdqu YMMWORD PTR [reg_p2+4*rcx+64], ymm8 + vmovdqu YMMWORD PTR [reg_p2+4*rcx+96], ymm9 + + add rcx, r10 // i+32 + vpsrlw ymm0, ymm0, 1 + vpsrlw ymm2, ymm2, 1 + vpsrlw ymm4, ymm4, 1 + dec r9 + jnz loop1b + + add rax, r8 // j+24 + cmp rax, r11 + jl loop1 + ret + + +//*********************************************************************** +// Reconciliation helper function +// Operation: c [reg_p2] <- function(a) [reg_p1] +// [reg_p3] points to random bits +//*********************************************************************** +.global oqs_rlwe_msrln16_helprec_asm +oqs_rlwe_msrln16_helprec_asm: + vmovdqu ymm8, ONE8x + movq r11, 256 + movq r10, 8 
+ xor rax, rax + vmovdqu ymm4, YMMWORD PTR [reg_p3] // rbits +loop2: + vmovdqu ymm0, YMMWORD PTR [reg_p1+4*rax] // x + vmovdqu ymm1, YMMWORD PTR [reg_p1+4*rax+4*256] // x+256 + vmovdqu ymm2, YMMWORD PTR [reg_p1+4*rax+4*512] // x+512 + vmovdqu ymm3, YMMWORD PTR [reg_p1+4*rax+4*768] // x+768 + + vpand ymm5, ymm4, ymm8 // Collecting 8 random bits + vpslld ymm0, ymm0, 1 // 2*x - rbits + vpslld ymm1, ymm1, 1 + vpslld ymm2, ymm2, 1 + vpslld ymm3, ymm3, 1 + vpsubd ymm0, ymm0, ymm5 + vpsubd ymm1, ymm1, ymm5 + vpsubd ymm2, ymm2, ymm5 + vpsubd ymm3, ymm3, ymm5 + + vmovdqu ymm15, PARAM_Q4x8 + vmovdqu ymm7, FOUR8x + vmovdqu ymm8, ymm7 + vmovdqu ymm9, ymm7 + vmovdqu ymm10, ymm7 + vpsubd ymm6, ymm0, ymm15 + vpsrld ymm6, ymm6, 31 + vpsubd ymm7, ymm7, ymm6 + vpsubd ymm6, ymm1, ymm15 + vpsrld ymm6, ymm6, 31 + vpsubd ymm8, ymm8, ymm6 + vpsubd ymm6, ymm2, ymm15 + vpsrld ymm6, ymm6, 31 + vpsubd ymm9, ymm9, ymm6 + vpsubd ymm6, ymm3, ymm15 + vpsrld ymm6, ymm6, 31 + vpsubd ymm10, ymm10, ymm6 + vmovdqu ymm15, PARAM_3Q4x8 + vpsubd ymm6, ymm0, ymm15 + vpsrld ymm6, ymm6, 31 + vpsubd ymm7, ymm7, ymm6 + vpsubd ymm6, ymm1, ymm15 + vpsrld ymm6, ymm6, 31 + vpsubd ymm8, ymm8, ymm6 + vpsubd ymm6, ymm2, ymm15 + vpsrld ymm6, ymm6, 31 + vpsubd ymm9, ymm9, ymm6 + vpsubd ymm6, ymm3, ymm15 + vpsrld ymm6, ymm6, 31 + vpsubd ymm10, ymm10, ymm6 + vmovdqu ymm15, PARAM_5Q4x8 + vpsubd ymm6, ymm0, ymm15 + vpsrld ymm6, ymm6, 31 + vpsubd ymm7, ymm7, ymm6 + vpsubd ymm6, ymm1, ymm15 + vpsrld ymm6, ymm6, 31 + vpsubd ymm8, ymm8, ymm6 + vpsubd ymm6, ymm2, ymm15 + vpsrld ymm6, ymm6, 31 + vpsubd ymm9, ymm9, ymm6 + vpsubd ymm6, ymm3, ymm15 + vpsrld ymm6, ymm6, 31 + vpsubd ymm10, ymm10, ymm6 + vmovdqu ymm15, PARAM_7Q4x8 + vpsubd ymm6, ymm0, ymm15 + vpsrld ymm6, ymm6, 31 + vpsubd ymm7, ymm7, ymm6 // v0[0] + vpsubd ymm6, ymm1, ymm15 + vpsrld ymm6, ymm6, 31 + vpsubd ymm8, ymm8, ymm6 // v0[1] + vpsubd ymm6, ymm2, ymm15 + vpsrld ymm6, ymm6, 31 + vpsubd ymm9, ymm9, ymm6 // v0[2] + vpsubd ymm6, ymm3, ymm15 + vpsrld ymm6, ymm6, 31 
+ vpsubd ymm10, ymm10, ymm6 // v0[3] + + vmovdqu ymm15, PARAM_Q2x8 + vmovdqu ymm11, THREE8x + vmovdqu ymm12, ymm11 + vmovdqu ymm13, ymm11 + vmovdqu ymm14, ymm11 + vpsubd ymm6, ymm0, ymm15 + vpsrld ymm6, ymm6, 31 + vpsubd ymm11, ymm11, ymm6 + vpsubd ymm6, ymm1, ymm15 + vpsrld ymm6, ymm6, 31 + vpsubd ymm12, ymm12, ymm6 + vpsubd ymm6, ymm2, ymm15 + vpsrld ymm6, ymm6, 31 + vpsubd ymm13, ymm13, ymm6 + vpsubd ymm6, ymm3, ymm15 + vpsrld ymm6, ymm6, 31 + vpsubd ymm14, ymm14, ymm6 + vmovdqu ymm15, PARAM_3Q2x8 + vpsubd ymm6, ymm0, ymm15 + vpsrld ymm6, ymm6, 31 + vpsubd ymm11, ymm11, ymm6 + vpsubd ymm6, ymm1, ymm15 + vpsrld ymm6, ymm6, 31 + vpsubd ymm12, ymm12, ymm6 + vpsubd ymm6, ymm2, ymm15 + vpsrld ymm6, ymm6, 31 + vpsubd ymm13, ymm13, ymm6 + vpsubd ymm6, ymm3, ymm15 + vpsrld ymm6, ymm6, 31 + vpsubd ymm14, ymm14, ymm6 + vmovdqu ymm15, PRIME8x + vpsubd ymm6, ymm0, ymm15 + vpsrld ymm6, ymm6, 31 + vpsubd ymm11, ymm11, ymm6 // v1[0] + vpsubd ymm6, ymm1, ymm15 + vpsrld ymm6, ymm6, 31 + vpsubd ymm12, ymm12, ymm6 // v1[1] + vpsubd ymm6, ymm2, ymm15 + vpsrld ymm6, ymm6, 31 + vpsubd ymm13, ymm13, ymm6 // v1[2] + vpsubd ymm6, ymm3, ymm15 + vpsrld ymm6, ymm6, 31 + vpsubd ymm14, ymm14, ymm6 // v1[3] + + vpmulld ymm6, ymm7, ymm15 + vpslld ymm0, ymm0, 1 + vpsubd ymm0, ymm0, ymm6 + vpabsd ymm0, ymm0 + vpmulld ymm6, ymm8, ymm15 + vpslld ymm1, ymm1, 1 + vpsubd ymm1, ymm1, ymm6 + vpabsd ymm1, ymm1 + vpaddd ymm0, ymm0, ymm1 + vpmulld ymm6, ymm9, ymm15 + vpslld ymm2, ymm2, 1 + vpsubd ymm2, ymm2, ymm6 + vpabsd ymm2, ymm2 + vpaddd ymm0, ymm0, ymm2 + vpmulld ymm6, ymm10, ymm15 + vpslld ymm3, ymm3, 1 + vpsubd ymm3, ymm3, ymm6 + vpabsd ymm3, ymm3 + vpaddd ymm0, ymm0, ymm3 // norm + vpsubd ymm0, ymm0, ymm15 + vpsrad ymm0, ymm0, 31 // If norm < q then norm = 0xff...ff, else norm = 0 + + vpxor ymm7, ymm7, ymm11 // v0[i] = (norm & (v0[i] ^ v1[i])) ^ v1[i] + vpand ymm7, ymm7, ymm0 + vpxor ymm7, ymm7, ymm11 + vpxor ymm8, ymm8, ymm12 + vpand ymm8, ymm8, ymm0 + vpxor ymm8, ymm8, ymm12 + vpxor ymm9, ymm9, 
ymm13 + vpand ymm9, ymm9, ymm0 + vpxor ymm9, ymm9, ymm13 + vpxor ymm10, ymm10, ymm14 + vpand ymm10, ymm10, ymm0 + vpxor ymm10, ymm10, ymm14 + + vmovdqu ymm15, THREE8x + vmovdqu ymm14, ONE8x + vpsubd ymm7, ymm7, ymm10 + vpand ymm7, ymm7, ymm15 + vpsubd ymm8, ymm8, ymm10 + vpand ymm8, ymm8, ymm15 + vpsubd ymm9, ymm9, ymm10 + vpand ymm9, ymm9, ymm15 + vpslld ymm10, ymm10, 1 + vpxor ymm0, ymm0, ymm14 + vpand ymm0, ymm0, ymm14 + vpaddd ymm10, ymm0, ymm10 + vpand ymm10, ymm10, ymm15 + + vpsrld ymm4, ymm4, 1 + vmovdqu YMMWORD PTR [reg_p2+4*rax], ymm7 + vmovdqu YMMWORD PTR [reg_p2+4*rax+4*256], ymm8 + vmovdqu YMMWORD PTR [reg_p2+4*rax+4*512], ymm9 + vmovdqu YMMWORD PTR [reg_p2+4*rax+4*768], ymm10 + + add rax, r10 // j+8 + add rcx, r9 + cmp rax, r11 + jl loop2 + ret + + +//*********************************************************************** +// Reconciliation function +// Operation: c [reg_p3] <- function(a [reg_p1], b [reg_p2]) +//*********************************************************************** +.global oqs_rlwe_msrln16_rec_asm +oqs_rlwe_msrln16_rec_asm: + vpxor ymm12, ymm12, ymm12 + vmovdqu ymm15, PRIME8x + vpslld ymm14, ymm15, 2 // 4*Q + vpslld ymm13, ymm15, 3 // 8*Q + vpsubd ymm12, ymm12, ymm13 // -8*Q + vpxor ymm11, ymm12, ymm13 // 8*Q ^ -8*Q + vmovdqu ymm10, ONE8x + movq r11, 256 + movq r10, 8 + xor rax, rax + xor rcx, rcx +loop3: + vmovdqu ymm0, YMMWORD PTR [reg_p1+4*rax] // x + vmovdqu ymm1, YMMWORD PTR [reg_p1+4*rax+4*256] // x+256 + vmovdqu ymm2, YMMWORD PTR [reg_p1+4*rax+4*512] // x+512 + vmovdqu ymm3, YMMWORD PTR [reg_p1+4*rax+4*768] // x+768 + vmovdqu ymm4, YMMWORD PTR [reg_p2+4*rax] // rvec + vmovdqu ymm5, YMMWORD PTR [reg_p2+4*rax+4*256] // rvec+256 + vmovdqu ymm6, YMMWORD PTR [reg_p2+4*rax+4*512] // rvec+512 + vmovdqu ymm7, YMMWORD PTR [reg_p2+4*rax+4*768] // rvec+768 + + vpslld ymm8, ymm4, 1 // 2*rvec + rvec + vpaddd ymm4, ymm7, ymm8 + vpslld ymm8, ymm5, 1 + vpaddd ymm5, ymm7, ymm8 + vpslld ymm8, ymm6, 1 + vpaddd ymm6, ymm7, ymm8 + vpmulld ymm4, 
ymm4, ymm15 + vpmulld ymm5, ymm5, ymm15 + vpmulld ymm6, ymm6, ymm15 + vpmulld ymm7, ymm7, ymm15 + vpslld ymm0, ymm0, 3 // 8*x + vpslld ymm1, ymm1, 3 + vpslld ymm2, ymm2, 3 + vpslld ymm3, ymm3, 3 + vpsubd ymm0, ymm0, ymm4 // t[i] + vpsubd ymm1, ymm1, ymm5 + vpsubd ymm2, ymm2, ymm6 + vpsubd ymm3, ymm3, ymm7 + + vpsrad ymm8, ymm0, 31 // mask1 + vpabsd ymm4, ymm0 + vpsubd ymm4, ymm14, ymm4 + vpsrad ymm4, ymm4, 31 // mask2 + vpand ymm8, ymm8, ymm11 // (mask1 & (8*PARAMETER_Q ^ -8*PARAMETER_Q)) ^ -8*PARAMETER_Q + vpxor ymm8, ymm8, ymm12 + vpand ymm4, ymm4, ymm8 + vpaddd ymm0, ymm0, ymm4 + vpabsd ymm0, ymm0 + vpsrad ymm8, ymm1, 31 // mask1 + vpabsd ymm4, ymm1 + vpsubd ymm4, ymm14, ymm4 + vpsrad ymm4, ymm4, 31 // mask2 + vpand ymm8, ymm8, ymm11 // (mask1 & (8*PARAMETER_Q ^ -8*PARAMETER_Q)) ^ -8*PARAMETER_Q + vpxor ymm8, ymm8, ymm12 + vpand ymm4, ymm4, ymm8 + vpaddd ymm1, ymm1, ymm4 + vpabsd ymm1, ymm1 + vpaddd ymm0, ymm0, ymm1 + vpsrad ymm8, ymm2, 31 // mask1 + vpabsd ymm4, ymm2 + vpsubd ymm4, ymm14, ymm4 + vpsrad ymm4, ymm4, 31 // mask2 + vpand ymm8, ymm8, ymm11 // (mask1 & (8*PARAMETER_Q ^ -8*PARAMETER_Q)) ^ -8*PARAMETER_Q + vpxor ymm8, ymm8, ymm12 + vpand ymm4, ymm4, ymm8 + vpaddd ymm2, ymm2, ymm4 + vpabsd ymm2, ymm2 + vpaddd ymm0, ymm0, ymm2 + vpsrad ymm8, ymm3, 31 // mask1 + vpabsd ymm4, ymm3 + vpsubd ymm4, ymm14, ymm4 + vpsrad ymm4, ymm4, 31 // mask2 + vpand ymm8, ymm8, ymm11 // (mask1 & (8*PARAMETER_Q ^ -8*PARAMETER_Q)) ^ -8*PARAMETER_Q + vpxor ymm8, ymm8, ymm12 + vpand ymm4, ymm4, ymm8 + vpaddd ymm3, ymm3, ymm4 + vpabsd ymm3, ymm3 + vpaddd ymm0, ymm0, ymm3 // norm + + vpsubd ymm0, ymm13, ymm0 // If norm < PARAMETER_Q then result = 1, else result = 0 + vpsrld ymm0, ymm0, 31 + vpxor ymm0, ymm0, ymm10 + + vpsrlq ymm1, ymm0, 31 + vpor ymm1, ymm0, ymm1 + vpsllq ymm2, ymm1, 2 + vpsrldq ymm2, ymm2, 8 + vpor ymm1, ymm2, ymm1 + vpsllq ymm2, ymm1, 4 + vpermq ymm2, ymm2, 0x56 + vpor ymm0, ymm1, ymm2 + vmovq r9, xmm0 + + mov BYTE PTR [reg_p3+rcx], r9b + + add rax, r10 // j+8 + 
inc rcx + cmp rax, r11 + jl loop3 + ret diff --git a/src/kex_rlwe_msrln16/AMD64/ntt_x64.c b/src/kex_rlwe_msrln16/AMD64/ntt_x64.c new file mode 100644 index 000000000..8322e81ed --- /dev/null +++ b/src/kex_rlwe_msrln16/AMD64/ntt_x64.c @@ -0,0 +1,65 @@ +/**************************************************************************************** +* LatticeCrypto: an efficient post-quantum Ring-Learning With Errors cryptography library +* +* Copyright (c) Microsoft Corporation. All rights reserved. +* +* +* Abstract: NTT functions and other low-level operations +* +*****************************************************************************************/ + +#include "../LatticeCrypto_priv.h" + + +void oqs_rlwe_msrln16_NTT_CT_std2rev_12289(int32_t* a, const int32_t* psi_rev, unsigned int N) +{ + oqs_rlwe_msrln16_NTT_CT_std2rev_12289_asm(a, psi_rev, N); +} + + +void oqs_rlwe_msrln16_INTT_GS_rev2std_12289(int32_t* a, const int32_t* omegainv_rev, const int32_t omegainv1N_rev, const int32_t Ninv, unsigned int N) +{ + oqs_rlwe_msrln16_INTT_GS_rev2std_12289_asm(a, omegainv_rev, omegainv1N_rev, Ninv, N); +} + + +void oqs_rlwe_msrln16_two_reduce12289(int32_t* a, unsigned int N) +{ + oqs_rlwe_msrln16_two_reduce12289_asm(a, N); +} + + +void oqs_rlwe_msrln16_pmul(int32_t* a, int32_t* b, int32_t* c, unsigned int N) +{ + oqs_rlwe_msrln16_pmul_asm(a, b, c, N); +} + + +void oqs_rlwe_msrln16_pmuladd(int32_t* a, int32_t* b, int32_t* c, int32_t* d, unsigned int N) +{ + oqs_rlwe_msrln16_pmuladd_asm(a, b, c, d, N); +} + + +void oqs_rlwe_msrln16_smul(int32_t* a, int32_t scalar, unsigned int N) +{ + unsigned int i; + + for (i = 0; i < N; i++) { + a[i] = a[i]*scalar; + } +} + + +void oqs_rlwe_msrln16_correction(int32_t* a, int32_t p, unsigned int N) +{ /* Branch-free correction of the N coefficients of a, in place: maps each a[i] from [-p, 2p) into [0, p). */ + unsigned int i; + int32_t mask; + + for (i = 0; i < N; i++) { + mask = a[i] >> (8*sizeof(int32_t) - 1); /* Sign mask: shift by the sign-bit index (31), so mask is -1 iff a[i] < 0, else 0. Assumes arithmetic right shift of negative values. */ + a[i] += (p & mask) - p; /* Subtract p only when a[i] was non-negative. */ + mask = a[i] >> (8*sizeof(int32_t) - 1); /* Sign mask of the intermediate value. */ + a[i] += (p & mask); /* Add p back if the subtraction went negative. */ + } +} diff --git 
a/src/kex_rlwe_msrln16/AMD64/ntt_x64_asm.S b/src/kex_rlwe_msrln16/AMD64/ntt_x64_asm.S new file mode 100644 index 000000000..b6e58f483 --- /dev/null +++ b/src/kex_rlwe_msrln16/AMD64/ntt_x64_asm.S @@ -0,0 +1,979 @@ +//**************************************************************************************** +// LatticeCrypto: an efficient post-quantum Ring-Learning With Errors cryptography library +// +// Copyright (c) Microsoft Corporation. All rights reserved. +// +// +// Abstract: NTT functions in x64 assembly using AVX2 vector instructions for Linux +// +//**************************************************************************************** + +.intel_syntax noprefix + +// Registers that are used for parameter passing: +#define reg_p1 rdi +#define reg_p2 rsi +#define reg_p3 rdx +#define reg_p4 rcx +#define reg_p5 r8 + + +.text +//*********************************************************************** +// Forward NTT +// Operation: a [reg_p1] <- NTT(a) [reg_p1], +// [reg_p2] points to table and +// reg_p3 contains parameter n +//*********************************************************************** +.global oqs_rlwe_msrln16_NTT_CT_std2rev_12289_asm +oqs_rlwe_msrln16_NTT_CT_std2rev_12289_asm: + push r12 + push r13 + push r14 + +// Stages m=1 -> m=32 + mov r9, 1 // m = 1 + mov rax, reg_p3 + mov r12, reg_p3 + shr r12, 4 // n/16 + vmovdqu ymm14, MASK12x8 + vmovdqu ymm12, PERM0246 + mov r14, 16 + mov rcx, 11 +loop1: + shr rax, 1 // k = k/2 + dec rcx + xor rdx, rdx // i = 0 +loop2: + mov r10, rdx + mov r11, rax + dec r11 + shl r10, cl // j1 + add r11, r10 // j2 + mov r13, r9 + add r13, rdx // m+i + vbroadcastss ymm11, DWORD PTR [reg_p2+4*r13] // S + +loop3: + mov r13, r10 + add r13, rax // j+k + vpmovsxdq ymm1, XMMWORD PTR [reg_p1+4*r13] // a[j+k] + vpmovsxdq ymm3, XMMWORD PTR [reg_p1+4*r13+16] // a[j+k] + vpmovsxdq ymm5, XMMWORD PTR [reg_p1+4*r13+32] // a[j+k] + vpmovsxdq ymm7, XMMWORD PTR [reg_p1+4*r13+48] // a[j+k] + + vpmuldq ymm1, ymm1, ymm11 // a[j+k].S + vpmuldq 
ymm3, ymm3, ymm11 + vpmuldq ymm5, ymm5, ymm11 + vpmuldq ymm7, ymm7, ymm11 + vpmovsxdq ymm0, XMMWORD PTR [reg_p1+4*r10] // U = a[j] + + vmovdqu ymm13, ymm1 + vpand ymm1, ymm14, ymm1 // c0 + vpsrlq ymm13, ymm13, 12 // c1 + vpslld ymm15, ymm1, 1 // 2*c0 + vpsubd ymm13, ymm1, ymm13 // c0-c1 + vpaddd ymm13, ymm13, ymm15 // V = 3*c0-c1 + vpsubd ymm1, ymm0, ymm13 // a[j+k] = U - V + vpaddd ymm0, ymm0, ymm13 // a[j] = U + V + vpermd ymm1, ymm12, ymm1 + vpermd ymm0, ymm12, ymm0 + vpmovsxdq ymm2, XMMWORD PTR [reg_p1+4*r10+16] // U = a[j] + + vmovdqu ymm13, ymm3 + vpand ymm3, ymm14, ymm3 // c0 + vpsrlq ymm13, ymm13, 12 // c1 + vpslld ymm15, ymm3, 1 // 2*c0 + vpsubd ymm13, ymm3, ymm13 // c0-c1 + vpaddd ymm13, ymm13, ymm15 // V = 3*c0-c1 + vpsubd ymm3, ymm2, ymm13 // a[j+k] = U - V + vpaddd ymm2, ymm2, ymm13 // a[j] = U + V + vmovdqu XMMWORD PTR [reg_p1+4*r10], xmm0 + vmovdqu XMMWORD PTR [reg_p1+4*r13], xmm1 + vpermd ymm3, ymm12, ymm3 + vpermd ymm2, ymm12, ymm2 + vpmovsxdq ymm4, XMMWORD PTR [reg_p1+4*r10+32] // U = a[j] + + vmovdqu ymm13, ymm5 + vpand ymm5, ymm14, ymm5 // c0 + vpsrlq ymm13, ymm13, 12 // c1 + vpslld ymm15, ymm5, 1 // 2*c0 + vpsubd ymm13, ymm5, ymm13 // c0-c1 + vpaddd ymm13, ymm13, ymm15 // V = 3*c0-c1 + vpsubd ymm5, ymm4, ymm13 // a[j+k] = U - V + vpaddd ymm4, ymm4, ymm13 // a[j] = U + V + vmovdqu XMMWORD PTR [reg_p1+4*r10+16], xmm2 + vmovdqu XMMWORD PTR [reg_p1+4*r13+16], xmm3 + vpermd ymm5, ymm12, ymm5 + vpermd ymm4, ymm12, ymm4 + vpmovsxdq ymm6, XMMWORD PTR [reg_p1+4*r10+48] // U = a[j] + + vmovdqu ymm13, ymm7 + vpand ymm7, ymm14, ymm7 // c0 + vpsrlq ymm13, ymm13, 12 // c1 + vpslld ymm15, ymm7, 1 // 2*c0 + vpsubd ymm13, ymm7, ymm13 // c0-c1 + vpaddd ymm13, ymm13, ymm15 // V = 3*c0-c1 + vpsubd ymm7, ymm6, ymm13 // a[j+k] = U - V + vpaddd ymm6, ymm6, ymm13 // a[j] = U + V + vmovdqu XMMWORD PTR [reg_p1+4*r10+32], xmm4 + vmovdqu XMMWORD PTR [reg_p1+4*r13+32], xmm5 + vpermd ymm6, ymm12, ymm6 + vpermd ymm7, ymm12, ymm7 + vmovdqu XMMWORD PTR [reg_p1+4*r13+48], xmm7 
+ vmovdqu XMMWORD PTR [reg_p1+4*r10+48], xmm6 + + add r10, r14 + cmp r10, r11 + jl loop3 + inc rdx + cmp rdx, r9 + jl loop2 + shl r9, 1 + cmp r9, r12 + jl loop1 + +// Stage m=64 + xor rdx, rdx // i = 0 + xor r10, r10 // j1 = 0 +loop4: + vbroadcastss ymm11, DWORD PTR [reg_p2+4*rdx+4*64] // S + vpmovsxdq ymm1, XMMWORD PTR [reg_p1+4*r10+32] // a[j+k] + vpmovsxdq ymm3, XMMWORD PTR [reg_p1+4*r10+48] // a[j+k] + vpmovsxdq ymm0, XMMWORD PTR [reg_p1+4*r10] // U = a[j] + vpmovsxdq ymm2, XMMWORD PTR [reg_p1+4*r10+16] // U = a[j] + vpmuldq ymm1, ymm1, ymm11 // a[j+k].S + vpmuldq ymm3, ymm3, ymm11 // a[j+k].S + + vmovdqu ymm13, ymm1 + vpand ymm1, ymm14, ymm1 // c0 + vpsrlq ymm13, ymm13, 12 // c1 + vpslld ymm15, ymm1, 1 // 2*c0 + vpsubd ymm13, ymm1, ymm13 // c0-c1 + vpaddd ymm13, ymm13, ymm15 // V = 3*c0-c1 + + vmovdqu ymm10, ymm3 + vpand ymm3, ymm14, ymm3 // c0 + vpsrlq ymm10, ymm10, 12 // c1 + vpslld ymm15, ymm3, 1 // 2*c0 + vpsubd ymm10, ymm3, ymm10 // c0-c1 + vpaddd ymm10, ymm10, ymm15 // V = 3*c0-c1 + + vpsubd ymm1, ymm0, ymm13 // a[j+k] = U - V + vpaddd ymm0, ymm0, ymm13 // a[j] = U + V + vpsubd ymm3, ymm2, ymm10 // a[j+k] = U - V + vpaddd ymm2, ymm2, ymm10 // a[j] = U + V + + vpermd ymm0, ymm12, ymm0 + vpermd ymm1, ymm12, ymm1 + vpermd ymm2, ymm12, ymm2 + vpermd ymm3, ymm12, ymm3 + vmovdqu XMMWORD PTR [reg_p1+4*r10], xmm0 + vmovdqu XMMWORD PTR [reg_p1+4*r10+32], xmm1 + vmovdqu XMMWORD PTR [reg_p1+4*r10+16], xmm2 + vmovdqu XMMWORD PTR [reg_p1+4*r10+48], xmm3 + + add r10, r14 // j+16 + inc rdx // i+1 + cmp rdx, r9 + jl loop4 + +// Stage m=128 + shl r9, 1 + xor rdx, rdx // i = 0 + xor r10, r10 // j1 = 0 + mov r13, 8 +loop6: + vbroadcastss ymm2, DWORD PTR [reg_p2+4*rdx+4*128] // S + vpmovsxdq ymm1, XMMWORD PTR [reg_p1+4*r10+16] // a[j+k] + vpmovsxdq ymm0, XMMWORD PTR [reg_p1+4*r10] // U = a[j] + vpmuldq ymm1, ymm1, ymm2 // a[j+k].S + + vmovdqu ymm3, ymm0 + vpand ymm0, ymm14, ymm0 // c0 + vpsrad ymm3, ymm3, 12 // c1 + vpslld ymm4, ymm0, 1 // 2*c0 + vpsubd ymm3, ymm0, ymm3 // 
c0-c1 + vpaddd ymm0, ymm3, ymm4 // U = 3*c0-c1 + + vmovdqu ymm3, ymm1 + vpand ymm1, ymm14, ymm1 // c0 + vpsrlq ymm4, ymm3, 24 // c2 + vpsrad ymm3, ymm3, 12 // xc1 + vpand ymm3, ymm14, ymm3 // c1 + vpslld ymm5, ymm1, 3 // 8*c0 + vpaddd ymm4, ymm1, ymm4 // c0+c2 + vpaddd ymm4, ymm4, ymm5 // 9*c0+c2 + vpslld ymm5, ymm3, 1 // 2*c1 + vpaddd ymm1, ymm0, ymm3 // U+c1 + vpsubd ymm0, ymm0, ymm3 // U-c1 + vpsubd ymm4, ymm4, ymm5 // 9*c0-2*c1+c2 + vpaddd ymm0, ymm0, ymm4 // U+(9*c0-3*c1+c2) + vpsubd ymm1, ymm1, ymm4 // U-(9*c0-3*c1+c2) + vpermd ymm0, ymm12, ymm0 + vpermd ymm1, ymm12, ymm1 + vmovdqu XMMWORD PTR [reg_p1+4*r10], xmm0 + vmovdqu XMMWORD PTR [reg_p1+4*r10+16], xmm1 + + add r10, r13 // j+8 + inc rdx // i+1 + cmp rdx, r9 + jl loop6 + +// Stage m=256 + vmovdqu ymm9, PERM02134657 + shl r9, 1 + xor rdx, rdx // i = 0 + xor r10, r10 // j1 = 0 + mov r14, 32 +loop7: + vpmovsxdq ymm2, XMMWORD PTR [reg_p2+4*rdx+4*256] // S = psi[m+i]->psi[m+i+3] + vpermq ymm8, ymm2, 0x50 + vpmovsxdq ymm0, XMMWORD PTR [reg_p1+4*r10] // U = a[j]->a[j+3] + vpmovsxdq ymm1, XMMWORD PTR [reg_p1+4*r10+16] // a[j+k]->a[j+k+3] + vpermq ymm3, ymm0, 0x4e + vinserti128 ymm0, ymm0, xmm1, 1 // U + vpblendd ymm1, ymm1, ymm3, 15 + vpmuldq ymm3, ymm1, ymm8 // a[j+k].S + vmovdqu ymm4, ymm3 + vpand ymm3, ymm14, ymm3 // c0 + vpsrlq ymm4, ymm4, 12 // c1 + vpslld ymm5, ymm3, 1 // 2*c0 + vpsubd ymm4, ymm3, ymm4 // c0-c1 + vpaddd ymm4, ymm4, ymm5 // V = 3*c0-c1 + vpsubd ymm1, ymm0, ymm4 // a[j+k] = U - V + vpaddd ymm0, ymm0, ymm4 // a[j] = U + V + vpslldq ymm1, ymm1, 4 + vpblendd ymm0, ymm0, ymm1, 0xaa + vpermd ymm0, ymm9, ymm0 + vmovdqu YMMWORD PTR [reg_p1+4*r10], ymm0 + + vpermq ymm8, ymm2, 0xfa + vpmovsxdq ymm0, XMMWORD PTR [reg_p1+4*r10+32] // U = a[j]->a[j+3] + vpmovsxdq ymm1, XMMWORD PTR [reg_p1+4*r10+48] // a[j+k]->a[j+k+3] + vpermq ymm3, ymm0, 0x4e + vinserti128 ymm0, ymm0, xmm1, 1 // U + vpblendd ymm1, ymm1, ymm3, 15 + vpmuldq ymm3, ymm1, ymm8 // a[j+k].S + vmovdqu ymm4, ymm3 + vpand ymm3, ymm14, ymm3 // c0 
+ vpsrlq ymm4, ymm4, 12 // c1 + vpslld ymm5, ymm3, 1 // 2*c0 + vpsubd ymm4, ymm3, ymm4 // c0-c1 + vpaddd ymm4, ymm4, ymm5 // V = 3*c0-c1 + vpsubd ymm1, ymm0, ymm4 // a[j+k] = U - V + vpaddd ymm0, ymm0, ymm4 // a[j] = U + V + vpslldq ymm1, ymm1, 4 + vpblendd ymm0, ymm0, ymm1, 0xaa + vpermd ymm0, ymm9, ymm0 + vmovdqu YMMWORD PTR [reg_p1+4*r10+32], ymm0 + + vpmovsxdq ymm2, XMMWORD PTR [reg_p2+4*rdx+4*256+16] // S = psi[m+i]->psi[m+i+3] + vpermq ymm8, ymm2, 0x50 + vpmovsxdq ymm0, XMMWORD PTR [reg_p1+4*r10+64] // U = a[j]->a[j+3] + vpmovsxdq ymm1, XMMWORD PTR [reg_p1+4*r10+80] // a[j+k]->a[j+k+3] + vpermq ymm3, ymm0, 0x4e + vinserti128 ymm0, ymm0, xmm1, 1 // U + vpblendd ymm1, ymm1, ymm3, 15 + vpmuldq ymm3, ymm1, ymm8 // a[j+k].S + vmovdqu ymm4, ymm3 + vpand ymm3, ymm14, ymm3 // c0 + vpsrlq ymm4, ymm4, 12 // c1 + vpslld ymm5, ymm3, 1 // 2*c0 + vpsubd ymm4, ymm3, ymm4 // c0-c1 + vpaddd ymm4, ymm4, ymm5 // V = 3*c0-c1 + vpsubd ymm1, ymm0, ymm4 // a[j+k] = U - V + vpaddd ymm0, ymm0, ymm4 // a[j] = U + V + vpslldq ymm1, ymm1, 4 + vpblendd ymm0, ymm0, ymm1, 0xaa + vpermd ymm0, ymm9, ymm0 + vmovdqu YMMWORD PTR [reg_p1+4*r10+64], ymm0 + + vpermq ymm8, ymm2, 0xfa + vpmovsxdq ymm0, XMMWORD PTR [reg_p1+4*r10+96] // U = a[j]->a[j+3] + vpmovsxdq ymm1, XMMWORD PTR [reg_p1+4*r10+112] // a[j+k]->a[j+k+3] + vpermq ymm3, ymm0, 0x4e + vinserti128 ymm0, ymm0, xmm1, 1 // U + vpblendd ymm1, ymm1, ymm3, 15 + vpmuldq ymm3, ymm1, ymm8 // a[j+k].S + vmovdqu ymm4, ymm3 + vpand ymm3, ymm14, ymm3 // c0 + vpsrlq ymm4, ymm4, 12 // c1 + vpslld ymm5, ymm3, 1 // 2*c0 + vpsubd ymm4, ymm3, ymm4 // c0-c1 + vpaddd ymm4, ymm4, ymm5 // V = 3*c0-c1 + vpsubd ymm1, ymm0, ymm4 // a[j+k] = U - V + vpaddd ymm0, ymm0, ymm4 // a[j] = U + V + vpslldq ymm1, ymm1, 4 + vpblendd ymm0, ymm0, ymm1, 0xaa + vpermd ymm0, ymm9, ymm0 + vmovdqu YMMWORD PTR [reg_p1+4*r10+96], ymm0 + + add r10, r14 // j+32 + add rdx, r13 // i+8 + cmp rdx, r9 + jl loop7 + +// Stage m=512 + vmovdqu ymm9, PERM00224466 + shl r9, 1 // m = n/2 + xor 
rdx, rdx // i = 0 + xor r10, r10 // j1 = 0 + mov r14, 4 +loop8: + vpmovsxdq ymm2, XMMWORD PTR [reg_p2+4*rdx+4*512] // S + vmovdqu ymm0, YMMWORD PTR [reg_p1+4*r10] // U = a[j] + vmovdqu ymm1, YMMWORD PTR [reg_p1+4*r10+4] // a[j+k] + vpmuldq ymm3, ymm1, ymm2 // a[j+k].S + vmovdqu ymm4, ymm3 + vpand ymm3, ymm14, ymm3 // c0 + vpsrlq ymm4, ymm4, 12 // c1 + vpslld ymm5, ymm3, 1 // 2*c0 + vpsubd ymm4, ymm3, ymm4 // c0-c1 + vpaddd ymm4, ymm4, ymm5 // V = 3*c0-c1 + vpsubd ymm1, ymm0, ymm4 // a[j+k] = U - V + vpaddd ymm0, ymm0, ymm4 // a[j] = U + V + vpermd ymm1, ymm9, ymm1 + vpblendd ymm0, ymm0, ymm1, 0xaa + vmovdqu YMMWORD PTR [reg_p1+4*r10], ymm0 + + add r10, r13 // j+8 + add rdx, r14 // i+4 + cmp rdx, r9 + jl loop8 + + pop r14 + pop r13 + pop r12 + ret + + +//*********************************************************************** +// Inverse NTT +// Operation: a [reg_p1] <- INTT(a) [reg_p1], +// [reg_p2] points to table +// reg_p3 and reg_p4 point to constants for scaling and +// reg_p5 contains parameter n +//*********************************************************************** +.global oqs_rlwe_msrln16_INTT_GS_rev2std_12289_asm +oqs_rlwe_msrln16_INTT_GS_rev2std_12289_asm: + push r12 + push r13 + push r14 + push r15 + push rbx + +// Stage m=1024 + vmovdqu ymm9, PERM00224466 + vmovdqu ymm14, MASK12x8 + mov r12, reg_p5 + shr r12, 1 // n/2 = 512 + xor r15, r15 // i = 0 + xor r10, r10 // j1 = 0 + mov r13, 8 + mov r14, 4 +loop1b: + vmovdqu ymm1, YMMWORD PTR [reg_p1+4*r10+4] // V = a[j+k] + vmovdqu ymm0, YMMWORD PTR [reg_p1+4*r10] // U = a[j] + vpmovsxdq ymm2, XMMWORD PTR [reg_p2+4*r15+4*512] // S + vpsubd ymm3, ymm0, ymm1 // U - V + vpaddd ymm0, ymm0, ymm1 // U + V + vpmuldq ymm3, ymm3, ymm2 // (U - V).S + vmovdqu ymm4, ymm3 + vpand ymm3, ymm14, ymm3 // c0 + vpsrlq ymm4, ymm4, 12 // c1 + vpslld ymm5, ymm3, 1 // 2*c0 + vpsubd ymm4, ymm3, ymm4 // c0-c1 + vpaddd ymm1, ymm4, ymm5 // 3*c0-c1 + vpermd ymm1, ymm9, ymm1 + vpblendd ymm0, ymm0, ymm1, 0xaa + vmovdqu YMMWORD PTR 
[reg_p1+4*r10], ymm0 + + add r10, r13 // j+8 + add r15, r14 // i+4 + cmp r15, r12 + jl loop1b + +// Stage m=512 + vmovdqu ymm9, PERM02134657 + vmovdqu ymm13, PERM0145 + vmovdqu ymm15, PERM2367 + shr r12, 1 // n/4 = 256 + xor r15, r15 // i = 0 + xor r10, r10 // j1 = 0 + mov r14, 32 +loop2b: + vpmovsxdq ymm2, XMMWORD PTR [reg_p2+4*r15+4*256] // S = psi[m+i]->psi[m+i+3] + vpermq ymm8, ymm2, 0x50 + vmovdqu ymm0, YMMWORD PTR [reg_p1+4*r10] // U = a[j]->a[j+7] + vpermd ymm1, ymm15, ymm0 + vpermd ymm0, ymm13, ymm0 + vpsubd ymm3, ymm0, ymm1 // U - V + vpaddd ymm0, ymm0, ymm1 // U + V + vpmuldq ymm3, ymm3, ymm8 // (U - V).S + vmovdqu ymm4, ymm3 + vpand ymm3, ymm14, ymm3 // c0 + vpsrlq ymm4, ymm4, 12 // c1 + vpslld ymm5, ymm3, 1 // 2*c0 + vpsubd ymm4, ymm3, ymm4 // c0-c1 + vpaddd ymm1, ymm4, ymm5 // 3*c0-c1 + vpslldq ymm1, ymm1, 4 + vpblendd ymm0, ymm0, ymm1, 0xaa + vpermd ymm0, ymm9, ymm0 + vmovdqu YMMWORD PTR [reg_p1+4*r10], ymm0 + + vpermq ymm8, ymm2, 0xfa + vmovdqu ymm0, YMMWORD PTR [reg_p1+4*r10+32] // U = a[j]->a[j+7] + vpermd ymm1, ymm15, ymm0 + vpermd ymm0, ymm13, ymm0 + vpsubd ymm3, ymm0, ymm1 // U - V + vpaddd ymm0, ymm0, ymm1 // U + V + vpmuldq ymm3, ymm3, ymm8 // (U - V).S + vmovdqu ymm4, ymm3 + vpand ymm3, ymm14, ymm3 // c0 + vpsrlq ymm4, ymm4, 12 // c1 + vpslld ymm5, ymm3, 1 // 2*c0 + vpsubd ymm4, ymm3, ymm4 // c0-c1 + vpaddd ymm1, ymm4, ymm5 // 3*c0-c1 + vpslldq ymm1, ymm1, 4 + vpblendd ymm0, ymm0, ymm1, 0xaa + vpermd ymm0, ymm9, ymm0 + vmovdqu YMMWORD PTR [reg_p1+4*r10+32], ymm0 + + vpmovsxdq ymm2, XMMWORD PTR [reg_p2+4*r15+4*256+16]// S = psi[m+i]->psi[m+i+3] + vpermq ymm8, ymm2, 0x50 + vmovdqu ymm0, YMMWORD PTR [reg_p1+4*r10+64] // U = a[j]->a[j+7] + vpermd ymm1, ymm15, ymm0 + vpermd ymm0, ymm13, ymm0 + vpsubd ymm3, ymm0, ymm1 // U - V + vpaddd ymm0, ymm0, ymm1 // U + V + vpmuldq ymm3, ymm3, ymm8 // (U - V).S + vmovdqu ymm4, ymm3 + vpand ymm3, ymm14, ymm3 // c0 + vpsrlq ymm4, ymm4, 12 // c1 + vpslld ymm5, ymm3, 1 // 2*c0 + vpsubd ymm4, ymm3, ymm4 // c0-c1 + 
vpaddd ymm1, ymm4, ymm5 // 3*c0-c1 + vpslldq ymm1, ymm1, 4 + vpblendd ymm0, ymm0, ymm1, 0xaa + vpermd ymm0, ymm9, ymm0 + vmovdqu YMMWORD PTR [reg_p1+4*r10+64], ymm0 + + vpermq ymm8, ymm2, 0xfa + vmovdqu ymm0, YMMWORD PTR [reg_p1+4*r10+96] // U = a[j]->a[j+7] + vpermd ymm1, ymm15, ymm0 + vpermd ymm0, ymm13, ymm0 + vpsubd ymm3, ymm0, ymm1 // U - V + vpaddd ymm0, ymm0, ymm1 // U + V + vpmuldq ymm3, ymm3, ymm8 // (U - V).S + vmovdqu ymm4, ymm3 + vpand ymm3, ymm14, ymm3 // c0 + vpsrlq ymm4, ymm4, 12 // c1 + vpslld ymm5, ymm3, 1 // 2*c0 + vpsubd ymm4, ymm3, ymm4 // c0-c1 + vpaddd ymm1, ymm4, ymm5 // 3*c0-c1 + vpslldq ymm1, ymm1, 4 + vpblendd ymm0, ymm0, ymm1, 0xaa + vpermd ymm0, ymm9, ymm0 + vmovdqu YMMWORD PTR [reg_p1+4*r10+96], ymm0 + + add r10, r14 // j+32 + add r15, r13 // i+8 + cmp r15, r12 + jl loop2b + +// Stage m=256 + vmovdqu ymm12, PERM0246 + shr r12, 1 // n/8 = 128 + xor r15, r15 // i = 0 + xor r10, r10 // j1 = 0 +loop3b: + vbroadcastss ymm2, DWORD PTR [reg_p2+4*r15+4*128] // S + vpmovsxdq ymm1, XMMWORD PTR [reg_p1+4*r10+16] // V = a[j+k] + vpmovsxdq ymm0, XMMWORD PTR [reg_p1+4*r10] // U = a[j] + vpsubd ymm3, ymm0, ymm1 // U - V + vpaddd ymm0, ymm0, ymm1 // U + V + vpmuldq ymm3, ymm3, ymm2 // (U - V).S + vmovdqu ymm4, ymm3 + vpand ymm3, ymm14, ymm3 // c0 + vpsrlq ymm4, ymm4, 12 // c1 + vpslld ymm5, ymm3, 1 // 2*c0 + vpsubd ymm4, ymm3, ymm4 // c0-c1 + vpaddd ymm1, ymm4, ymm5 // 3*c0-c1 + vpermd ymm0, ymm12, ymm0 + vpermd ymm1, ymm12, ymm1 + vmovdqu XMMWORD PTR [reg_p1+4*r10], xmm0 + vmovdqu XMMWORD PTR [reg_p1+4*r10+16], xmm1 + + add r10, r13 // j+8 + inc r15 // i+1 + cmp r15, r12 + jl loop3b + +// Stage m=128 + shr r12, 1 // n/16 = 64 + xor r15, r15 // i = 0 + xor r10, r10 // j1 = 0 + mov r14, 16 +loop4b: + vbroadcastss ymm11, DWORD PTR [reg_p2+4*r15+4*64] // S + vpmovsxdq ymm13, XMMWORD PTR [reg_p1+4*r10+32] // V = a[j+k] + vpmovsxdq ymm15, XMMWORD PTR [reg_p1+4*r10+48] // V = a[j+k] + vpmovsxdq ymm0, XMMWORD PTR [reg_p1+4*r10] // U = a[j] + vpmovsxdq ymm2, 
XMMWORD PTR [reg_p1+4*r10+16] // U = a[j] + vpsubd ymm1, ymm0, ymm13 // U - V + vpaddd ymm0, ymm0, ymm13 // U + V + vpsubd ymm3, ymm2, ymm15 // U - V + vpaddd ymm2, ymm2, ymm15 // U + V + vpmuldq ymm1, ymm1, ymm11 // (U - V).S + vpmuldq ymm3, ymm3, ymm11 // (U - V).S + + vmovdqu ymm13, ymm1 + vpand ymm1, ymm14, ymm1 // c0 + vpsrlq ymm13, ymm13, 12 // c1 + vpslld ymm15, ymm1, 1 // 2*c0 + vpsubd ymm13, ymm1, ymm13 // c0-c1 + vpaddd ymm1, ymm13, ymm15 // 3*c0-c1 + + vmovdqu ymm13, ymm3 + vpand ymm3, ymm14, ymm3 // c0 + vpsrlq ymm13, ymm13, 12 // c1 + vpslld ymm15, ymm3, 1 // 2*c0 + vpsubd ymm13, ymm3, ymm13 // c0-c1 + vpaddd ymm3, ymm13, ymm15 // 3*c0-c1 + + vpermd ymm0, ymm12, ymm0 + vpermd ymm1, ymm12, ymm1 + vpermd ymm2, ymm12, ymm2 + vpermd ymm3, ymm12, ymm3 + vmovdqu XMMWORD PTR [reg_p1+4*r10], xmm0 + vmovdqu XMMWORD PTR [reg_p1+4*r10+32], xmm1 + vmovdqu XMMWORD PTR [reg_p1+4*r10+16], xmm2 + vmovdqu XMMWORD PTR [reg_p1+4*r10+48], xmm3 + + add r10, r14 // j+16 + inc r15 // i+1 + cmp r15, r12 + jl loop4b + +// Stages m=64 -> m=4 + mov r9, 5 // 5 iterations + mov rax, 8 +loop5b: + shl rax, 1 // k = 2*k + shr r12, 1 // m/2 + xor r15, r15 // i = 0 + xor r8, r8 +loop6b: + mov r10, r8 // Load j1 + mov r11, rax + dec r11 + add r11, r10 // j2 + mov r13, r12 + add r13, r15 // m/2+i + vbroadcastss ymm9, DWORD PTR [reg_p2+4*r13] // S + mov rbx, 4 + +loop7b: + mov r13, r10 + add r13, rax // j+k + vpmovsxdq ymm10, XMMWORD PTR [reg_p1+4*r13] // V = a[j+k] + vpmovsxdq ymm11, XMMWORD PTR [reg_p1+4*r13+16] // V = a[j+k] + vpmovsxdq ymm13, XMMWORD PTR [reg_p1+4*r13+32] // V = a[j+k] + vpmovsxdq ymm15, XMMWORD PTR [reg_p1+4*r13+48] // V = a[j+k] + vpmovsxdq ymm0, XMMWORD PTR [reg_p1+4*r10] // U = a[j] + vpmovsxdq ymm2, XMMWORD PTR [reg_p1+4*r10+16] // U = a[j] + vpmovsxdq ymm4, XMMWORD PTR [reg_p1+4*r10+32] // U = a[j] + vpmovsxdq ymm6, XMMWORD PTR [reg_p1+4*r10+48] // U = a[j] + + vpsubd ymm1, ymm0, ymm10 // U - V + vpaddd ymm0, ymm0, ymm10 // U + V + vpsubd ymm3, ymm2, ymm11 // U 
- V + vpaddd ymm2, ymm2, ymm11 // U + V + vpsubd ymm5, ymm4, ymm13 // U - V + vpaddd ymm4, ymm4, ymm13 // U + V + vpsubd ymm7, ymm6, ymm15 // U - V + vpaddd ymm6, ymm6, ymm15 // U + V + + vpmuldq ymm1, ymm1, ymm9 // (U - V).S + vpmuldq ymm3, ymm3, ymm9 + vpmuldq ymm5, ymm5, ymm9 + vpmuldq ymm7, ymm7, ymm9 + + vmovdqu ymm13, ymm1 + vpand ymm1, ymm14, ymm1 // c0 + vpsrlq ymm13, ymm13, 12 // c1 + vpslld ymm15, ymm1, 1 // 2*c0 + vpsubd ymm13, ymm1, ymm13 // c0-c1 + vpaddd ymm1, ymm13, ymm15 // 3*c0-c1 + + cmp r9, rbx + jne skip1 + vmovdqu ymm13, ymm0 + vpand ymm0, ymm14, ymm0 // c0 + vpsrad ymm13, ymm13, 12 // c1 + vpslld ymm15, ymm0, 1 // 2*c0 + vpsubd ymm13, ymm0, ymm13 // c0-c1 + vpaddd ymm0, ymm13, ymm15 // 3*c0-c1 + + vmovdqu ymm13, ymm1 + vpand ymm1, ymm14, ymm1 // c0 + vpsrad ymm13, ymm13, 12 // c1 + vpslld ymm15, ymm1, 1 // 2*c0 + vpsubd ymm13, ymm1, ymm13 // c0-c1 + vpaddd ymm1, ymm13, ymm15 // 3*c0-c1 +skip1: + vpermd ymm1, ymm12, ymm1 + vpermd ymm0, ymm12, ymm0 + + vmovdqu ymm13, ymm3 + vpand ymm3, ymm14, ymm3 // c0 + vpsrlq ymm13, ymm13, 12 // c1 + vpslld ymm15, ymm3, 1 // 2*c0 + vpsubd ymm13, ymm3, ymm13 // c0-c1 + vpaddd ymm3, ymm13, ymm15 // 3*c0-c1 + vmovdqu XMMWORD PTR [reg_p1+4*r10], xmm0 + vmovdqu XMMWORD PTR [reg_p1+4*r13], xmm1 + + cmp r9, rbx + jne skip2 + vmovdqu ymm13, ymm2 + vpand ymm2, ymm14, ymm2 // c0 + vpsrad ymm13, ymm13, 12 // c1 + vpslld ymm15, ymm2, 1 // 2*c0 + vpsubd ymm13, ymm2, ymm13 // c0-c1 + vpaddd ymm2, ymm13, ymm15 // 3*c0-c1 + + vmovdqu ymm13, ymm3 + vpand ymm3, ymm14, ymm3 // c0 + vpsrad ymm13, ymm13, 12 // c1 + vpslld ymm15, ymm3, 1 // 2*c0 + vpsubd ymm13, ymm3, ymm13 // c0-c1 + vpaddd ymm3, ymm13, ymm15 // 3*c0-c1 +skip2: + vpermd ymm3, ymm12, ymm3 + vpermd ymm2, ymm12, ymm2 + + vmovdqu ymm13, ymm5 + vpand ymm5, ymm14, ymm5 // c0 + vpsrlq ymm13, ymm13, 12 // c1 + vpslld ymm15, ymm5, 1 // 2*c0 + vpsubd ymm13, ymm5, ymm13 // c0-c1 + vpaddd ymm5, ymm13, ymm15 // 3*c0-c1 + vmovdqu XMMWORD PTR [reg_p1+4*r10+16], xmm2 + vmovdqu 
XMMWORD PTR [reg_p1+4*r13+16], xmm3 + + cmp r9, rbx + jne skip3 + vmovdqu ymm13, ymm4 + vpand ymm4, ymm14, ymm4 // c0 + vpsrad ymm13, ymm13, 12 // c1 + vpslld ymm15, ymm4, 1 // 2*c0 + vpsubd ymm13, ymm4, ymm13 // c0-c1 + vpaddd ymm4, ymm13, ymm15 // 3*c0-c1 + + vmovdqu ymm13, ymm5 + vpand ymm5, ymm14, ymm5 // c0 + vpsrad ymm13, ymm13, 12 // c1 + vpslld ymm15, ymm5, 1 // 2*c0 + vpsubd ymm13, ymm5, ymm13 // c0-c1 + vpaddd ymm5, ymm13, ymm15 // 3*c0-c1 +skip3: + vpermd ymm5, ymm12, ymm5 + vpermd ymm4, ymm12, ymm4 + + vmovdqu ymm13, ymm7 + vpand ymm7, ymm14, ymm7 // c0 + vpsrlq ymm13, ymm13, 12 // c1 + vpslld ymm15, ymm7, 1 // 2*c0 + vpsubd ymm13, ymm7, ymm13 // c0-c1 + vpaddd ymm7, ymm13, ymm15 // 3*c0-c1 + vmovdqu XMMWORD PTR [reg_p1+4*r10+32], xmm4 + vmovdqu XMMWORD PTR [reg_p1+4*r13+32], xmm5 + + cmp r9, rbx + jne skip4 + vmovdqu ymm13, ymm6 + vpand ymm6, ymm14, ymm6 // c0 + vpsrad ymm13, ymm13, 12 // c1 + vpslld ymm15, ymm6, 1 // 2*c0 + vpsubd ymm13, ymm6, ymm13 // c0-c1 + vpaddd ymm6, ymm13, ymm15 // 3*c0-c1 + + vmovdqu ymm13, ymm7 + vpand ymm7, ymm14, ymm7 // c0 + vpsrad ymm13, ymm13, 12 // c1 + vpslld ymm15, ymm7, 1 // 2*c0 + vpsubd ymm13, ymm7, ymm13 // c0-c1 + vpaddd ymm7, ymm13, ymm15 // 3*c0-c1 +skip4: + vpermd ymm7, ymm12, ymm7 + vpermd ymm6, ymm12, ymm6 + vmovdqu XMMWORD PTR [reg_p1+4*r13+48], xmm7 + vmovdqu XMMWORD PTR [reg_p1+4*r10+48], xmm6 + + add r10, r14 + cmp r10, r11 + jl loop7b + mov rbx, rax + shl rbx, 1 // 2*k + add r8, rbx // j1+2*k + inc r15 + cmp r15, r12 + jl loop6b + dec r9 + jnz loop5b + +// Scaling step + shl rax, 1 // k = 2*k = 512 + xor r10, r10 // j = 0 + mov r14, 4 + movq xmm0, reg_p3 + vbroadcastsd ymm10, xmm0 // S = omegainv1N_rev + movq xmm0, reg_p4 + vbroadcastsd ymm11, xmm0 // T = Ninv +loop8b: + vpmovsxdq ymm13, XMMWORD PTR [reg_p1+4*r10+4*512] // V = a[j+k] + vpmovsxdq ymm0, XMMWORD PTR [reg_p1+4*r10] // U = a[j] + vpsubd ymm1, ymm0, ymm13 // U - V + vpaddd ymm0, ymm0, ymm13 // U + V + vpmuldq ymm1, ymm1, ymm10 // (U - V).S + 
vpmuldq ymm0, ymm0, ymm11 // (U + V).T + + vmovdqu ymm13, ymm0 + vpand ymm0, ymm14, ymm0 // c0 + vpsrlq ymm13, ymm13, 12 // c1 + vpslld ymm15, ymm0, 1 // 2*c0 + vpsubd ymm13, ymm0, ymm13 // c0-c1 + vpaddd ymm0, ymm13, ymm15 // 3*c0-c1 + + vmovdqu ymm13, ymm1 + vpand ymm1, ymm14, ymm1 // c0 + vpsrlq ymm13, ymm13, 12 // c1 + vpslld ymm15, ymm1, 1 // 2*c0 + vpsubd ymm13, ymm1, ymm13 // c0-c1 + vpaddd ymm1, ymm13, ymm15 // 3*c0-c1 + + vpermd ymm0, ymm12, ymm0 + vpermd ymm1, ymm12, ymm1 + vmovdqu XMMWORD PTR [reg_p1+4*r10], xmm0 + vmovdqu XMMWORD PTR [reg_p1+4*r10+4*512], xmm1 + + add r10, r14 // j+4 + cmp r10, rax + jl loop8b +loop9b: + pop rbx + pop r15 + pop r14 + pop r13 + pop r12 + ret + + +//*********************************************************************** +// Component-wise multiplication and addition +// Operation: d [reg_p4] <- a [reg_p1] * b [reg_p2] + c [reg_p3] +// reg_p5 contains parameter n +//*********************************************************************** +.global oqs_rlwe_msrln16_pmuladd_asm +oqs_rlwe_msrln16_pmuladd_asm: + vmovdqu ymm5, PERM0246 + vmovdqu ymm6, MASK12x8 + xor rax, rax + movq r11, 4 +lazo2: + vpmovsxdq ymm0, XMMWORD PTR [reg_p1+4*rax] // a + vpmovsxdq ymm1, XMMWORD PTR [reg_p2+4*rax] // b + vpmovsxdq ymm2, XMMWORD PTR [reg_p3+4*rax] // c + vpmuldq ymm0, ymm1, ymm0 + vpaddq ymm0, ymm2, ymm0 + + vmovdqu ymm3, ymm0 + vpand ymm0, ymm6, ymm0 // c0 + vpsrlq ymm3, ymm3, 12 // c1 + vpslld ymm4, ymm0, 1 // 2*c0 + vpsubd ymm3, ymm0, ymm3 // c0-c1 + vpaddd ymm0, ymm3, ymm4 // 3*c0-c1 + + vmovdqu ymm3, ymm0 + vpand ymm0, ymm6, ymm0 // c0 + vpsrad ymm3, ymm3, 12 // c1 + vpslld ymm4, ymm0, 1 // 2*c0 + vpsubd ymm3, ymm0, ymm3 // c0-c1 + vpaddd ymm0, ymm3, ymm4 // 3*c0-c1 + + vpermd ymm0, ymm5, ymm0 + vmovdqu XMMWORD PTR [reg_p4+4*rax], xmm0 + + add rax, r11 // j+4 + cmp rax, reg_p5 + jl lazo2 + ret + + +//*********************************************************************** +// Component-wise multiplication +// Operation: c [reg_p3] <- 
a [reg_p1] * b [reg_p2] +// reg_p4 contains parameter n +//*********************************************************************** +.global oqs_rlwe_msrln16_pmul_asm +oqs_rlwe_msrln16_pmul_asm: + vmovdqu ymm5, PERM0246 + vmovdqu ymm6, MASK12x8 + xor rax, rax + movq r11, 4 +lazo3: + vpmovsxdq ymm0, XMMWORD PTR [reg_p1+4*rax] // a + vpmovsxdq ymm1, XMMWORD PTR [reg_p2+4*rax] // b + vpmuldq ymm0, ymm1, ymm0 + + vmovdqu ymm3, ymm0 + vpand ymm0, ymm6, ymm0 // c0 + vpsrlq ymm3, ymm3, 12 // c1 + vpslld ymm4, ymm0, 1 // 2*c0 + vpsubd ymm3, ymm0, ymm3 // c0-c1 + vpaddd ymm0, ymm3, ymm4 // 3*c0-c1 + + vmovdqu ymm3, ymm0 + vpand ymm0, ymm6, ymm0 // c0 + vpsrad ymm3, ymm3, 12 // c1 + vpslld ymm4, ymm0, 1 // 2*c0 + vpsubd ymm3, ymm0, ymm3 // c0-c1 + vpaddd ymm0, ymm3, ymm4 // 3*c0-c1 + + vpermd ymm0, ymm5, ymm0 + vmovdqu XMMWORD PTR [reg_p3+4*rax], xmm0 + + add rax, r11 // j+4 + cmp rax, reg_p4 + jl lazo3 + ret + + +//*********************************************************************** +// Two consecutive reductions +// Operation: c [reg_p1] <- a [reg_p1] +// reg_p2 contains parameter n +//*********************************************************************** +.global oqs_rlwe_msrln16_two_reduce12289_asm +oqs_rlwe_msrln16_two_reduce12289_asm: + vmovdqu ymm6, MASK12x8 + vmovdqu ymm7, PRIME8x + xor rax, rax + movq r11, 8 +lazo4: + vmovdqu ymm0, YMMWORD PTR [reg_p1+4*rax] // a + + vmovdqu ymm3, ymm0 + vpand ymm0, ymm6, ymm0 // c0 + vpsrad ymm3, ymm3, 12 // c1 + vpslld ymm4, ymm0, 1 // 2*c0 + vpsubd ymm3, ymm0, ymm3 // c0-c1 + vpaddd ymm0, ymm3, ymm4 // 3*c0-c1 + + vmovdqu ymm3, ymm0 + vpand ymm0, ymm6, ymm0 // c0 + vpsrad ymm3, ymm3, 12 // c1 + vpslld ymm4, ymm0, 1 // 2*c0 + vpsubd ymm3, ymm0, ymm3 // c0-c1 + vpaddd ymm0, ymm3, ymm4 // 3*c0-c1 + + vpsrad ymm2, ymm0, 31 + vpand ymm2, ymm7, ymm2 + vpaddd ymm2, ymm0, ymm2 + vpsubd ymm0, ymm2, ymm7 + + vpsrad ymm2, ymm0, 31 + vpand ymm2, ymm7, ymm2 + vpaddd ymm0, ymm0, ymm2 + + vmovdqu YMMWORD PTR [reg_p1+4*rax], ymm0 + + add rax, 
r11 // j+8 + cmp rax, reg_p2 + jl lazo4 + ret + + +//*********************************************************************** +// Encoding +// Operation: c [reg_p2] <- a [reg_p1] +//*********************************************************************** +.global oqs_rlwe_msrln16_encode_asm +oqs_rlwe_msrln16_encode_asm: + vmovdqu ymm6, MASK32 + vmovdqu ymm7, MASK42 + mov r9, 1024 + xor rax, rax + xor r10, r10 + mov r11, 14 + mov rcx, 8 +lazo5: + vmovdqu ymm0, YMMWORD PTR [reg_p1+4*rax] // a + + vpsrlq ymm1, ymm0, 18 + vpsllq ymm2, ymm0, 4 + vpand ymm0, ymm0, ymm6 + vpsrldq ymm2, ymm2, 5 + vpsrlq ymm3, ymm1, 4 + vpand ymm1, ymm1, ymm6 + vpand ymm2, ymm2, ymm7 + vpsrldq ymm3, ymm3, 4 + vpor ymm0, ymm0, ymm1 + vpor ymm0, ymm0, ymm2 + vpor ymm0, ymm0, ymm3 + vpermq ymm1, ymm0, 0x0e + + vmovdqu XMMWORD PTR [reg_p2+r10], xmm0 + vmovdqu XMMWORD PTR [reg_p2+r10+7], xmm1 + + add r10, r11 + add rax, rcx // j+8 + cmp rax, r9 + jl lazo5 + ret + + +//*********************************************************************** +// Decoding +// Operation: c [reg_p2] <- a [reg_p1] +//*********************************************************************** +.global oqs_rlwe_msrln16_decode_asm +oqs_rlwe_msrln16_decode_asm: + vmovdqu ymm6, MASK14_1 + vmovdqu ymm7, MASK14_2 + vmovdqu ymm8, MASK14_3 + vmovdqu ymm9, MASK14_4 + mov r9, 1024 + xor rax, rax + xor r10, r10 + mov r11, 14 + mov rcx, 8 +lazo6: + vmovdqu xmm0, XMMWORD PTR [reg_p1+r10] + vmovdqu xmm1, XMMWORD PTR [reg_p1+r10+7] + vinserti128 ymm0, ymm0, xmm1, 1 + + vpand ymm1, ymm0, ymm6 + vpand ymm2, ymm0, ymm7 + vpand ymm3, ymm0, ymm8 + vpand ymm4, ymm0, ymm9 + + vpsllq ymm2, ymm2, 18 + vpsllq ymm3, ymm3, 4 + vpslldq ymm3, ymm3, 4 + vpsrlq ymm4, ymm4, 2 + vpslldq ymm4, ymm4, 7 + + vpor ymm1, ymm1, ymm2 + vpor ymm1, ymm1, ymm3 + vpor ymm1, ymm1, ymm4 + + vmovdqu YMMWORD PTR [reg_p2+4*rax], ymm1 + + add r10, r11 + add rax, rcx // j+8 + cmp rax, r9 + jl lazo6 + ret diff --git a/src/kex_rlwe_msrln16/LatticeCrypto.h 
b/src/kex_rlwe_msrln16/LatticeCrypto.h new file mode 100644 index 000000000..db83a43b1 --- /dev/null +++ b/src/kex_rlwe_msrln16/LatticeCrypto.h @@ -0,0 +1,94 @@ +/*************************************************************************************** +* LatticeCrypt: an efficient post-quantum Ring-Learning With Errors cryptography library +* +* Copyright (c) Microsoft Corporation. All rights reserved. +* +* +* Abstract: main header file +* +****************************************************************************************/ + +#ifndef __LatticeCrypt_H__ +#define __LatticeCrypt_H__ + + +// For C++ +#ifdef __cplusplus +extern "C" { +#endif + + +#include +#include +#include +#include + +// NOTE: probably a better way to do this. +#if (defined(__x86_64__) || defined(__x86_64) || defined(__arch64__) || defined(_M_AMD64) || defined(_M_X64) || defined(_WIN64) || !defined(__LP64__)) +#define RADIX 64 +typedef uint64_t digit_t; // Unsigned 64-bit digit +typedef int64_t sdigit_t; // Signed 64-bit digit +#else +#define RADIX 32 +typedef uint32_t digit_t; // Unsigned 32-bit digit +typedef int32_t sdigit_t; // Signed 32-bit digit + +#endif + +// Definitions of the error-handling type and error codes + +typedef enum { + CRYPTO_SUCCESS, // 0x00 + CRYPTO_ERROR, // 0x01 + CRYPTO_ERROR_DURING_TEST, // 0x02 + CRYPTO_ERROR_UNKNOWN, // 0x03 + CRYPTO_ERROR_NOT_IMPLEMENTED, // 0x04 + CRYPTO_ERROR_NO_MEMORY, // 0x05 + CRYPTO_ERROR_INVALID_PARAMETER, // 0x06 + CRYPTO_ERROR_SHARED_KEY, // 0x07 + CRYPTO_ERROR_TOO_MANY_ITERATIONS, // 0x08 + CRYPTO_ERROR_END_OF_LIST +} CRYPTO_STATUS; + +#define CRYPTO_STATUS_TYPE_SIZE (CRYPTO_ERROR_END_OF_LIST) + +// Basic key-exchange constants +#define OQS_RLWE_MSRLN16_PKA_BYTES 1824 // Alice's public key size +#define OQS_RLWE_MSRLN16_PKB_BYTES 2048 // Bob's public key size +#define OQS_RLWE_MSRLN16_SHAREDKEY_BYTES 32 // Shared key size + +/******************** Function prototypes *******************/ + +// Clear digits from memory. 
"nwords" indicates the number of digits to be zeroed. +extern void oqs_rlwe_msrln16_clear_words(void *mem, digit_t nwords); + +/*********************** Key exchange API ***********************/ + +// Alice's key generation +// It produces a private key SecretKeyA and computes the public key PublicKeyA. +// Outputs: the private key SecretKeyA that consists of a 32-bit signed 1024-element array (4096 bytes in total) +// the public key PublicKeyA that occupies 1824 bytes +CRYPTO_STATUS oqs_rlwe_msrln16_KeyGeneration_A(int32_t *SecretKeyA, unsigned char *PublicKeyA, OQS_RAND *rand); + +// Bob's key generation and shared secret computation +// It produces a private key and computes the public key PublicKeyB. In combination with Alice's public key PublicKeyA, it computes +// the shared secret SharedSecretB. +// Input: Alice's public key PublicKeyA that consists of 1824 bytes +// Outputs: the public key PublicKeyB that occupies 2048 bytes. +// the 256-bit shared secret SharedSecretB. +CRYPTO_STATUS oqs_rlwe_msrln16_SecretAgreement_B(unsigned char *PublicKeyA, unsigned char *SharedSecretB, unsigned char *PublicKeyB, OQS_RAND *rand); + +// Alice's shared secret computation +// It computes the shared secret SharedSecretA using Bob's public key PublicKeyB and Alice's private key SecretKeyA. +// Inputs: Bob's public key PublicKeyB that consists of 2048 bytes +// the private key SecretKeyA that consists of a 32-bit signed 1024-element array (4096 bytes in total) +// Output: the 256-bit shared secret SharedSecretA. 
+CRYPTO_STATUS oqs_rlwe_msrln16_SecretAgreement_A(unsigned char *PublicKeyB, int32_t *SecretKeyA, unsigned char *SharedSecretA); + + +#ifdef __cplusplus +} +#endif + + +#endif diff --git a/src/kex_rlwe_msrln16/LatticeCrypto_kex.c b/src/kex_rlwe_msrln16/LatticeCrypto_kex.c new file mode 100644 index 000000000..b05c010ca --- /dev/null +++ b/src/kex_rlwe_msrln16/LatticeCrypto_kex.c @@ -0,0 +1,452 @@ +/**************************************************************************************** + * LatticeCrypto: an efficient post-quantum Ring-Learning With Errors cryptography library + * + * Copyright (c) Microsoft Corporation. All rights reserved. + * + * + * Abstract: Ring-LWE key exchange + * The implementation is based on the instantiation of Peikert's key exchange [1] + * due to Alkim, Ducas, Poppelmann and Schwabe [2]. + * + * [1] C. Peikert, "Lattice cryptography for the internet", in Post-Quantum Cryptography - + * 6th International Workshop (PQCrypto 2014), LNCS 8772, pp. 197-219. Springer, 2014. + * [2] E. Alkim, L. Ducas, T. Poppelmann and P. Schwabe, "Post-quantum key exchange - a new + * hope", IACR Cryptology ePrint Archive, Report 2015/1092, 2015. + * + ******************************************************************************************/ + +#include "LatticeCrypto_priv.h" +#include "oqs/rand.h" +#include "external/shake128.h" + +extern const int32_t psi_rev_ntt1024_12289[1024]; +extern const int32_t omegainv_rev_ntt1024_12289[1024]; +extern const int32_t omegainv10N_rev_ntt1024_12289; +extern const int32_t Ninv11_ntt1024_12289; + +// import external code +#include "external/shake128.c" +#ifdef RLWE_ASM_AVX2 +#include "AMD64/consts.c" +#include "AMD64/ntt_x64.c" +#else +#include "generic/ntt.c" +#endif + + +__inline void oqs_rlwe_msrln16_clear_words(void *mem, digit_t nwords) { + // Clear digits from memory. "nwords" indicates the number of digits to be zeroed. 
+ // This function uses the volatile type qualifier to inform the compiler not to optimize out the memory clearing. + unsigned int i; + volatile digit_t *v = mem; + + for (i = 0; i < nwords; i++) { + v[i] = 0; + } +} + +void oqs_rlwe_msrln16_encode_A(const uint32_t *pk, const unsigned char *seed, unsigned char *m) { + // Alice's message encoding + unsigned int i = 0, j; +#if defined(RLWE_ASM_AVX2) + oqs_rlwe_msrln16_encode_asm(pk, m); + i = 1792; +#else + for (j = 0; j < 1024; j += 4) { + m[i] = (unsigned char)(pk[j] & 0xFF); + m[i + 1] = (unsigned char)((pk[j] >> 8) | ((pk[j + 1] & 0x03) << 6)); + m[i + 2] = (unsigned char)((pk[j + 1] >> 2) & 0xFF); + m[i + 3] = (unsigned char)((pk[j + 1] >> 10) | ((pk[j + 2] & 0x0F) << 4)); + m[i + 4] = (unsigned char)((pk[j + 2] >> 4) & 0xFF); + m[i + 5] = (unsigned char)((pk[j + 2] >> 12) | ((pk[j + 3] & 0x3F) << 2)); + m[i + 6] = (unsigned char)(pk[j + 3] >> 6); + i += 7; + } +#endif + + for (j = 0; j < 32; j++) { + m[i + j] = seed[j]; + } +} + + +void oqs_rlwe_msrln16_decode_A(const unsigned char *m, uint32_t *pk, unsigned char *seed) { + // Alice's message decoding + unsigned int i = 0, j; + +#if defined(RLWE_ASM_AVX2) + oqs_rlwe_msrln16_decode_asm(m, pk); + i = 1792; +#else + for (j = 0; j < 1024; j += 4) { + pk[j] = ((uint32_t)m[i] | (((uint32_t)m[i + 1] & 0x3F) << 8)); + pk[j + 1] = (((uint32_t)m[i + 1] >> 6) | ((uint32_t)m[i + 2] << 2) | (((uint32_t)m[i + 3] & 0x0F) << 10)); + pk[j + 2] = (((uint32_t)m[i + 3] >> 4) | ((uint32_t)m[i + 4] << 4) | (((uint32_t)m[i + 5] & 0x03) << 12)); + pk[j + 3] = (((uint32_t)m[i + 5] >> 2) | ((uint32_t)m[i + 6] << 6)); + i += 7; + } +#endif + + for (j = 0; j < 32; j++) { + seed[j] = m[i + j]; + } +} + + +void oqs_rlwe_msrln16_encode_B(const uint32_t *pk, const uint32_t *rvec, unsigned char *m) { + // Bob's message encoding + unsigned int i = 0, j; + +#if defined(RLWE_ASM_AVX2) + oqs_rlwe_msrln16_encode_asm(pk, m); +#else + for (j = 0; j < 1024; j += 4) { + m[i] = (unsigned char)(pk[j] & 
0xFF); + m[i + 1] = (unsigned char)((pk[j] >> 8) | ((pk[j + 1] & 0x03) << 6)); + m[i + 2] = (unsigned char)((pk[j + 1] >> 2) & 0xFF); + m[i + 3] = (unsigned char)((pk[j + 1] >> 10) | ((pk[j + 2] & 0x0F) << 4)); + m[i + 4] = (unsigned char)((pk[j + 2] >> 4) & 0xFF); + m[i + 5] = (unsigned char)((pk[j + 2] >> 12) | ((pk[j + 3] & 0x3F) << 2)); + m[i + 6] = (unsigned char)(pk[j + 3] >> 6); + i += 7; + } +#endif + + i = 0; + for (j = 0; j < 1024 / 4; j++) { + m[1792 + j] = (unsigned char)(rvec[i] | (rvec[i + 1] << 2) | (rvec[i + 2] << 4) | (rvec[i + 3] << 6)); + i += 4; + } +} + + +void oqs_rlwe_msrln16_decode_B(unsigned char *m, uint32_t *pk, uint32_t *rvec) { + // Bob's message decoding + unsigned int i = 0, j; + +#if defined(RLWE_ASM_AVX2) + oqs_rlwe_msrln16_decode_asm(m, pk); + i = 1792; +#else + for (j = 0; j < 1024; j += 4) { + pk[j] = ((uint32_t)m[i] | (((uint32_t)m[i + 1] & 0x3F) << 8)); + pk[j + 1] = (((uint32_t)m[i + 1] >> 6) | ((uint32_t)m[i + 2] << 2) | (((uint32_t)m[i + 3] & 0x0F) << 10)); + pk[j + 2] = (((uint32_t)m[i + 3] >> 4) | ((uint32_t)m[i + 4] << 4) | (((uint32_t)m[i + 5] & 0x03) << 12)); + pk[j + 3] = (((uint32_t)m[i + 5] >> 2) | ((uint32_t)m[i + 6] << 6)); + i += 7; + } +#endif + + i = 0; + for (j = 0; j < 1024 / 4; j++) { + rvec[i] = (uint32_t)(m[1792 + j] & 0x03); + rvec[i + 1] = (uint32_t)((m[1792 + j] >> 2) & 0x03); + rvec[i + 2] = (uint32_t)((m[1792 + j] >> 4) & 0x03); + rvec[i + 3] = (uint32_t)(m[1792 + j] >> 6); + i += 4; + } +} + + +static __inline uint32_t Abs(int32_t value) { + // Compute absolute value + uint32_t mask; + + mask = (uint32_t)(value >> 31); + return ((mask ^ value) - mask); +} + + +CRYPTO_STATUS oqs_rlwe_msrln16_HelpRec(const uint32_t *x, uint32_t *rvec, OQS_RAND *rand) { + // Reconciliation helper + unsigned int i, j, norm; + unsigned char bit, random_bits[32]; + uint32_t v0[4], v1[4]; + // OQS integration note: call to aux API replaced with direct call to OQS_RAND + rand->rand_n(rand, random_bits, 32); + +#if 
defined(RLWE_ASM_AVX2) + oqs_rlwe_msrln16_helprec_asm(x, rvec, random_bits); +#else + for (i = 0; i < 256; i++) { + bit = 1 & (random_bits[i >> 3] >> (i & 0x07)); + rvec[i] = (x[i] << 1) - bit; + rvec[i + 256] = (x[i + 256] << 1) - bit; + rvec[i + 512] = (x[i + 512] << 1) - bit; + rvec[i + 768] = (x[i + 768] << 1) - bit; + + norm = 0; + v0[0] = 4; + v0[1] = 4; + v0[2] = 4; + v0[3] = 4; + v1[0] = 3; + v1[1] = 3; + v1[2] = 3; + v1[3] = 3; + for (j = 0; j < 4; j++) { + v0[j] -= (rvec[i + 256 * j] - OQS_RLWE_MSRLN16_PARAMETER_Q4 ) >> 31; + v0[j] -= (rvec[i + 256 * j] - OQS_RLWE_MSRLN16_PARAMETER_3Q4) >> 31; + v0[j] -= (rvec[i + 256 * j] - OQS_RLWE_MSRLN16_PARAMETER_5Q4) >> 31; + v0[j] -= (rvec[i + 256 * j] - OQS_RLWE_MSRLN16_PARAMETER_7Q4) >> 31; + v1[j] -= (rvec[i + 256 * j] - OQS_RLWE_MSRLN16_PARAMETER_Q2 ) >> 31; + v1[j] -= (rvec[i + 256 * j] - OQS_RLWE_MSRLN16_PARAMETER_Q ) >> 31; + v1[j] -= (rvec[i + 256 * j] - OQS_RLWE_MSRLN16_PARAMETER_3Q2) >> 31; + norm += Abs(2 * rvec[i + 256 * j] - OQS_RLWE_MSRLN16_PARAMETER_Q * v0[j]); + } + + norm = (uint32_t)((int32_t)(norm - OQS_RLWE_MSRLN16_PARAMETER_Q) >> 31); // If norm < q then norm = 0xff...ff, else norm = 0 + v0[0] = (norm & (v0[0] ^ v1[0])) ^ v1[0]; + v0[1] = (norm & (v0[1] ^ v1[1])) ^ v1[1]; + v0[2] = (norm & (v0[2] ^ v1[2])) ^ v1[2]; + v0[3] = (norm & (v0[3] ^ v1[3])) ^ v1[3]; + rvec[i] = (v0[0] - v0[3]) & 0x03; + rvec[i + 256] = (v0[1] - v0[3]) & 0x03; + rvec[i + 512] = (v0[2] - v0[3]) & 0x03; + rvec[i + 768] = ((v0[3] << 1) + (1 & ~norm)) & 0x03; + } +#endif + + return CRYPTO_SUCCESS; +} + + +static __inline uint32_t LDDecode(int32_t *t) { + // Low-density decoding + unsigned int i, norm = 0; + uint32_t mask1, mask2, value; + int32_t cneg = -8 * OQS_RLWE_MSRLN16_PARAMETER_Q; + + for (i = 0; i < 4; i++) { + mask1 = t[i] >> 31; // If t[i] < 0 then mask1 = 0xff...ff, else mask1 = 0 + mask2 = (4 * OQS_RLWE_MSRLN16_PARAMETER_Q - (int32_t)Abs(t[i])) >> 31; // If 4*PARAMETER_Q > Abs(t[i]) then mask2 = 0, else mask2 = 
0xff...ff + + value = ((mask1 & (8 * OQS_RLWE_MSRLN16_PARAMETER_Q ^ cneg)) ^ cneg); + norm += Abs(t[i] + (mask2 & value)); + } + + return ((8 * OQS_RLWE_MSRLN16_PARAMETER_Q - norm) >> 31) ^ 1; // If norm <= 8*PARAMETER_Q then return 1, else return 0 +} + + +void oqs_rlwe_msrln16_Rec(const uint32_t *x, const uint32_t *rvec, unsigned char *key) { + // Reconciliation: derives the 32-byte key from x and the reconciliation vector rvec + + +#if defined(RLWE_ASM_AVX2) + oqs_rlwe_msrln16_rec_asm(x, rvec, key); +#else + unsigned int i; + uint32_t t[4]; + + for (i = 0; i < 32; i++) { + key[i] = 0; + } + for (i = 0; i < 256; i++) { + t[0] = 8 * x[i] - (2 * rvec[i] + rvec[i + 768]) * OQS_RLWE_MSRLN16_PARAMETER_Q; + t[1] = 8 * x[i + 256] - (2 * rvec[i + 256] + rvec[i + 768]) * OQS_RLWE_MSRLN16_PARAMETER_Q; + t[2] = 8 * x[i + 512] - (2 * rvec[i + 512] + rvec[i + 768]) * OQS_RLWE_MSRLN16_PARAMETER_Q; + t[3] = 8 * x[i + 768] - (rvec[i + 768]) * OQS_RLWE_MSRLN16_PARAMETER_Q; + + key[i >> 3] |= (unsigned char)LDDecode((int32_t *)t) << (i & 0x07); + } +#endif +} + + +CRYPTO_STATUS oqs_rlwe_msrln16_get_error(int32_t *e, OQS_RAND *rand) { + // Error sampling: fills e with N small signed values derived from 3*N random bytes drawn from rand + uint32_t pstream[3 * OQS_RLWE_MSRLN16_PARAMETER_N / 4]; // random stream stored as 32-bit words so the word-wise reads below are aligned and well-typed + unsigned char *stream = (unsigned char *)pstream; // byte view of the same 3*N-byte buffer, filled by rand_n + uint32_t acc1, acc2, temp; + uint8_t *pacc1 = (uint8_t *)&acc1, *pacc2 = (uint8_t *)&acc2; + unsigned int i, j; + + // OQS integration note: call to aux API replaced with direct call to OQS_RAND + rand->rand_n(rand, stream, 3 * OQS_RLWE_MSRLN16_PARAMETER_N); + +#if defined(RLWE_ASM_AVX2) + oqs_rlwe_msrln16_error_sampling_asm(stream, e); +#else + for (i = 0; i < OQS_RLWE_MSRLN16_PARAMETER_N / 4; i++) { + acc1 = 0; + acc2 = 0; + for (j = 0; j < 8; j++) { + acc1 += (pstream[i] >> j) & 0x01010101; + acc2 += (pstream[i + OQS_RLWE_MSRLN16_PARAMETER_N / 4] >> j) & 0x01010101; + } + for (j = 0; j < 4; j++) { + temp = pstream[i + 2 * OQS_RLWE_MSRLN16_PARAMETER_N / 4] >> j; + acc1 += temp & 0x01010101; + acc2 += (temp >> 4) & 0x01010101; + } + e[2 * i] = pacc1[0] - pacc1[1]; + e[2 * i + 1] = pacc1[2] - 
pacc1[3]; + e[2 * i + OQS_RLWE_MSRLN16_PARAMETER_N / 2] = pacc2[0] - pacc2[1]; + e[2 * i + OQS_RLWE_MSRLN16_PARAMETER_N / 2 + 1] = pacc2[2] - pacc2[3]; + } +#endif + + return CRYPTO_SUCCESS; +} + +CRYPTO_STATUS oqs_rlwe_msrln16_generate_a(uint32_t *a, const unsigned char *seed) { + // Generation of parameter a: fills a[0..N-1] with values below q taken from a SHAKE128 stream keyed by seed + // OQS integration note: call to aux API replaced with direct call to shake128 + unsigned int pos = 0, ctr = 0; + uint16_t val; + unsigned int nblocks = 16; + uint8_t buf[SHAKE128_RATE * 16]; + unsigned char state[SHAKE128_STATE_SIZE] = { 0 }; + FIPS202_SHAKE128_Absorb(seed, OQS_RLWE_MSRLN16_SEED_BYTES, state, sizeof(state)); + FIPS202_SHAKE128_Squeeze(state, (unsigned char *)buf, nblocks * SHAKE128_RATE); + + while (ctr < OQS_RLWE_MSRLN16_PARAMETER_N) { + val = (buf[pos] | ((uint16_t)buf[pos + 1] << 8)) & 0x3fff; // low 14 bits of a little-endian 16-bit word + if (val < OQS_RLWE_MSRLN16_PARAMETER_Q) { // rejection sampling: keep only candidates below q + a[ctr++] = val; + } + pos += 2; + if (pos > SHAKE128_RATE * nblocks - 2) { // buffer consumed: squeeze one more SHAKE128_RATE-byte block + nblocks = 1; + FIPS202_SHAKE128_Squeeze(state, (unsigned char *)buf, nblocks * SHAKE128_RATE); + pos = 0; + } + } + + return CRYPTO_SUCCESS; +} + + +CRYPTO_STATUS oqs_rlwe_msrln16_KeyGeneration_A(int32_t *SecretKeyA, unsigned char *PublicKeyA, OQS_RAND *rand) { + // Alice's key generation + // It produces a private key SecretKeyA and computes the public key PublicKeyA. + // Outputs: the private key SecretKeyA that consists of a 32-bit signed 1024-element array (4096 bytes in total) + // the public key PublicKeyA that occupies 1824 bytes + // rand supplies the public seed and the error-sampling randomness; no pLatticeCrypto context is taken by this function. 
+ uint32_t a[OQS_RLWE_MSRLN16_PARAMETER_N]; + int32_t e[OQS_RLWE_MSRLN16_PARAMETER_N]; + unsigned char seed[OQS_RLWE_MSRLN16_SEED_BYTES]; + CRYPTO_STATUS Status = CRYPTO_ERROR_UNKNOWN; + + rand->rand_n(rand, seed, OQS_RLWE_MSRLN16_SEED_BYTES); // draw the seed from which a is derived + Status = oqs_rlwe_msrln16_generate_a(a, seed); + if (Status != CRYPTO_SUCCESS) { + goto cleanup; + } + + Status = oqs_rlwe_msrln16_get_error(SecretKeyA, rand); + if (Status != CRYPTO_SUCCESS) { + goto cleanup; + } + Status = oqs_rlwe_msrln16_get_error(e, rand); + if (Status != CRYPTO_SUCCESS) { + goto cleanup; + } + oqs_rlwe_msrln16_NTT_CT_std2rev_12289(SecretKeyA, psi_rev_ntt1024_12289, OQS_RLWE_MSRLN16_PARAMETER_N); + oqs_rlwe_msrln16_NTT_CT_std2rev_12289(e, psi_rev_ntt1024_12289, OQS_RLWE_MSRLN16_PARAMETER_N); + oqs_rlwe_msrln16_smul(e, 3, OQS_RLWE_MSRLN16_PARAMETER_N); + + oqs_rlwe_msrln16_pmuladd((int32_t *)a, SecretKeyA, e, (int32_t *)a, OQS_RLWE_MSRLN16_PARAMETER_N); // a <- a * SecretKeyA + e, component-wise (in place) + oqs_rlwe_msrln16_correction((int32_t *)a, OQS_RLWE_MSRLN16_PARAMETER_Q, OQS_RLWE_MSRLN16_PARAMETER_N); + oqs_rlwe_msrln16_encode_A(a, seed, PublicKeyA); // public key = packed coefficients of a followed by the seed + +cleanup: + oqs_rlwe_msrln16_clear_words((void *)e, OQS_RLWE_MSRLN16_NBYTES_TO_NWORDS(4 * OQS_RLWE_MSRLN16_PARAMETER_N)); // wipe the temporary error polynomial on every exit path + + return Status; +} + + +CRYPTO_STATUS oqs_rlwe_msrln16_SecretAgreement_B(unsigned char *PublicKeyA, unsigned char *SharedSecretB, unsigned char *PublicKeyB, OQS_RAND *rand) { + // Bob's key generation and shared secret computation + // It produces a private key and computes the public key PublicKeyB. In combination with Alice's public key PublicKeyA, it computes + // the shared secret SharedSecretB. + // Input: Alice's public key PublicKeyA that consists of 1824 bytes + // Outputs: the public key PublicKeyB that occupies 2048 bytes. + // the 256-bit shared secret SharedSecretB. + // rand supplies the randomness for error sampling and for the reconciliation helper; no pLatticeCrypto context is taken by this function. 
+ uint32_t pk_A[OQS_RLWE_MSRLN16_PARAMETER_N], a[OQS_RLWE_MSRLN16_PARAMETER_N], v[OQS_RLWE_MSRLN16_PARAMETER_N], r[OQS_RLWE_MSRLN16_PARAMETER_N]; + int32_t sk_B[OQS_RLWE_MSRLN16_PARAMETER_N], e[OQS_RLWE_MSRLN16_PARAMETER_N]; + unsigned char seed[OQS_RLWE_MSRLN16_SEED_BYTES]; + CRYPTO_STATUS Status = CRYPTO_ERROR_UNKNOWN; + + oqs_rlwe_msrln16_decode_A(PublicKeyA, pk_A, seed); + Status = oqs_rlwe_msrln16_generate_a(a, seed); + if (Status != CRYPTO_SUCCESS) { + goto cleanup; + } + + Status = oqs_rlwe_msrln16_get_error(sk_B, rand); + if (Status != CRYPTO_SUCCESS) { + goto cleanup; + } + Status = oqs_rlwe_msrln16_get_error(e, rand); + if (Status != CRYPTO_SUCCESS) { + goto cleanup; + } + oqs_rlwe_msrln16_NTT_CT_std2rev_12289(sk_B, psi_rev_ntt1024_12289, OQS_RLWE_MSRLN16_PARAMETER_N); + oqs_rlwe_msrln16_NTT_CT_std2rev_12289(e, psi_rev_ntt1024_12289, OQS_RLWE_MSRLN16_PARAMETER_N); + oqs_rlwe_msrln16_smul(e, 3, OQS_RLWE_MSRLN16_PARAMETER_N); + + oqs_rlwe_msrln16_pmuladd((int32_t *)a, sk_B, e, (int32_t *)a, OQS_RLWE_MSRLN16_PARAMETER_N); + oqs_rlwe_msrln16_correction((int32_t *)a, OQS_RLWE_MSRLN16_PARAMETER_Q, OQS_RLWE_MSRLN16_PARAMETER_N); + + Status = oqs_rlwe_msrln16_get_error(e, rand); + if (Status != CRYPTO_SUCCESS) { + goto cleanup; + } + oqs_rlwe_msrln16_NTT_CT_std2rev_12289(e, psi_rev_ntt1024_12289, OQS_RLWE_MSRLN16_PARAMETER_N); + oqs_rlwe_msrln16_smul(e, 81, OQS_RLWE_MSRLN16_PARAMETER_N); + + oqs_rlwe_msrln16_pmuladd((int32_t *)pk_A, sk_B, e, (int32_t *)v, OQS_RLWE_MSRLN16_PARAMETER_N); + oqs_rlwe_msrln16_INTT_GS_rev2std_12289((int32_t *)v, omegainv_rev_ntt1024_12289, omegainv10N_rev_ntt1024_12289, Ninv11_ntt1024_12289, OQS_RLWE_MSRLN16_PARAMETER_N); + oqs_rlwe_msrln16_two_reduce12289((int32_t *)v, OQS_RLWE_MSRLN16_PARAMETER_N); +#if !defined(RLWE_ASM_AVX2) + oqs_rlwe_msrln16_correction((int32_t *)v, OQS_RLWE_MSRLN16_PARAMETER_Q, OQS_RLWE_MSRLN16_PARAMETER_N); +#endif + + Status = oqs_rlwe_msrln16_HelpRec(v, r, rand); + if (Status != CRYPTO_SUCCESS) { + goto 
cleanup; + } + oqs_rlwe_msrln16_Rec(v, r, SharedSecretB); + oqs_rlwe_msrln16_encode_B(a, r, PublicKeyB); + +cleanup: + oqs_rlwe_msrln16_clear_words((void *)sk_B, OQS_RLWE_MSRLN16_NBYTES_TO_NWORDS(4 * OQS_RLWE_MSRLN16_PARAMETER_N)); + oqs_rlwe_msrln16_clear_words((void *)e, OQS_RLWE_MSRLN16_NBYTES_TO_NWORDS(4 * OQS_RLWE_MSRLN16_PARAMETER_N)); + oqs_rlwe_msrln16_clear_words((void *)a, OQS_RLWE_MSRLN16_NBYTES_TO_NWORDS(4 * OQS_RLWE_MSRLN16_PARAMETER_N)); + oqs_rlwe_msrln16_clear_words((void *)v, OQS_RLWE_MSRLN16_NBYTES_TO_NWORDS(4 * OQS_RLWE_MSRLN16_PARAMETER_N)); + oqs_rlwe_msrln16_clear_words((void *)r, OQS_RLWE_MSRLN16_NBYTES_TO_NWORDS(4 * OQS_RLWE_MSRLN16_PARAMETER_N)); + + return Status; +} + + +CRYPTO_STATUS oqs_rlwe_msrln16_SecretAgreement_A(unsigned char *PublicKeyB, int32_t *SecretKeyA, unsigned char *SharedSecretA) { + // Alice's shared secret computation + // It computes the shared secret SharedSecretA using Bob's public key PublicKeyB and Alice's private key SecretKeyA. + // Inputs: Bob's public key PublicKeyB that consists of 2048 bytes + // the private key SecretKeyA that consists of a 32-bit signed 1024-element array (4096 bytes in total) + // Output: the 256-bit shared secret SharedSecretA. 
+ uint32_t u[OQS_RLWE_MSRLN16_PARAMETER_N], r[OQS_RLWE_MSRLN16_PARAMETER_N]; + CRYPTO_STATUS Status = CRYPTO_SUCCESS; + + oqs_rlwe_msrln16_decode_B(PublicKeyB, u, r); + + oqs_rlwe_msrln16_pmul(SecretKeyA, (int32_t *)u, (int32_t *)u, OQS_RLWE_MSRLN16_PARAMETER_N); + oqs_rlwe_msrln16_INTT_GS_rev2std_12289((int32_t *)u, omegainv_rev_ntt1024_12289, omegainv10N_rev_ntt1024_12289, Ninv11_ntt1024_12289, OQS_RLWE_MSRLN16_PARAMETER_N); + oqs_rlwe_msrln16_two_reduce12289((int32_t *)u, OQS_RLWE_MSRLN16_PARAMETER_N); +#if !defined(RLWE_ASM_AVX2) + oqs_rlwe_msrln16_correction((int32_t *)u, OQS_RLWE_MSRLN16_PARAMETER_Q, OQS_RLWE_MSRLN16_PARAMETER_N); +#endif + + oqs_rlwe_msrln16_Rec(u, r, SharedSecretA); + +// Cleanup + oqs_rlwe_msrln16_clear_words((void *)u, OQS_RLWE_MSRLN16_NBYTES_TO_NWORDS(4 * OQS_RLWE_MSRLN16_PARAMETER_N)); + oqs_rlwe_msrln16_clear_words((void *)r, OQS_RLWE_MSRLN16_NBYTES_TO_NWORDS(4 * OQS_RLWE_MSRLN16_PARAMETER_N)); + + return Status; +} diff --git a/src/kex_rlwe_msrln16/LatticeCrypto_priv.h b/src/kex_rlwe_msrln16/LatticeCrypto_priv.h new file mode 100644 index 000000000..bdef23dfe --- /dev/null +++ b/src/kex_rlwe_msrln16/LatticeCrypto_priv.h @@ -0,0 +1,122 @@ +/**************************************************************************************** +* LatticeCrypto: an efficient post-quantum Ring-Learning With Errors cryptography library +* +* Copyright (c) Microsoft Corporation. All rights reserved. 
+* +* +* Abstract: internal header file +* +*****************************************************************************************/ + +#ifndef __LatticeCrypto_priv_H__ +#define __LatticeCrypto_priv_H__ + + +// For C++ +#ifdef __cplusplus +extern "C" { +#endif + +#include +#include "LatticeCrypto.h" + + +// Basic constants +#define OQS_RLWE_MSRLN16_PARAMETER_N 1024 +#define OQS_RLWE_MSRLN16_PARAMETER_Q 12289 +#define OQS_RLWE_MSRLN16_SEED_BYTES (256/8) // Seed length in bytes; parenthesized so the macro expands safely inside larger expressions +#define OQS_RLWE_MSRLN16_PARAMETER_Q4 3073 +#define OQS_RLWE_MSRLN16_PARAMETER_3Q4 9217 +#define OQS_RLWE_MSRLN16_PARAMETER_5Q4 15362 +#define OQS_RLWE_MSRLN16_PARAMETER_7Q4 21506 +#define OQS_RLWE_MSRLN16_PARAMETER_Q2 6145 +#define OQS_RLWE_MSRLN16_PARAMETER_3Q2 18434 + + +// Macro definitions + +#define OQS_RLWE_MSRLN16_NBITS_TO_NWORDS(nbits) (((nbits)+(sizeof(digit_t)*8)-1)/(sizeof(digit_t)*8)) // Conversion macro from number of bits to number of computer words +#define OQS_RLWE_MSRLN16_NBYTES_TO_NWORDS(nbytes) (((nbytes)+sizeof(digit_t)-1)/sizeof(digit_t)) // Conversion macro from number of bytes to number of computer words + +// Macro to avoid compiler warnings when detecting unreferenced parameters +#define OQS_RLWE_MSRLN16_UNREFERENCED_PARAMETER(PAR) (PAR) + + +/******************** Function prototypes *******************/ +/******************* Polynomial functions *******************/ + +// Forward NTT +void oqs_rlwe_msrln16_NTT_CT_std2rev_12289(int32_t *a, const int32_t *psi_rev, unsigned int N); +void oqs_rlwe_msrln16_NTT_CT_std2rev_12289_asm(int32_t *a, const int32_t *psi_rev, unsigned int N); + +// Inverse NTT +void oqs_rlwe_msrln16_INTT_GS_rev2std_12289(int32_t *a, const int32_t *omegainv_rev, const int32_t omegainv1N_rev, const int32_t Ninv, unsigned int N); +void oqs_rlwe_msrln16_INTT_GS_rev2std_12289_asm(int32_t *a, const int32_t *omegainv_rev, const int32_t omegainv1N_rev, const int32_t Ninv, unsigned int N); + +// Reduction modulo q +int32_t oqs_rlwe_msrln16_reduce12289(int64_t a); + +// Two merged 
reductions modulo q +int32_t oqs_rlwe_msrln16_reduce12289_2x(int64_t a); + +// Two consecutive reductions modulo q +void oqs_rlwe_msrln16_two_reduce12289(int32_t *a, unsigned int N); +void oqs_rlwe_msrln16_two_reduce12289_asm(int32_t *a, unsigned int N); + +// Correction modulo q +void oqs_rlwe_msrln16_correction(int32_t *a, int32_t p, unsigned int N); + +// Component-wise multiplication +void oqs_rlwe_msrln16_pmul(int32_t *a, int32_t *b, int32_t *c, unsigned int N); +void oqs_rlwe_msrln16_pmul_asm(int32_t *a, int32_t *b, int32_t *c, unsigned int N); + +// Component-wise multiplication and addition +void oqs_rlwe_msrln16_pmuladd(int32_t *a, int32_t *b, int32_t *c, int32_t *d, unsigned int N); +void oqs_rlwe_msrln16_pmuladd_asm(int32_t *a, int32_t *b, int32_t *c, int32_t *d, unsigned int N); + +// Component-wise multiplication with scalar +void oqs_rlwe_msrln16_smul(int32_t *a, int32_t scalar, unsigned int N); + +/******************* Key exchange functions *******************/ + +// Alice's message encoding +void oqs_rlwe_msrln16_encode_A(const uint32_t *pk, const unsigned char *seed, unsigned char *m); + +// Alice's message decoding +void oqs_rlwe_msrln16_decode_A(const unsigned char *m, uint32_t *pk, unsigned char *seed); + +// Bob's message encoding +void oqs_rlwe_msrln16_encode_B(const uint32_t *pk, const uint32_t *rvec, unsigned char *m); + +// Bob's message decoding +void oqs_rlwe_msrln16_decode_B(unsigned char *m, uint32_t *pk, uint32_t *rvec); + +// Partial message encoding/decoding (assembly optimized) +void oqs_rlwe_msrln16_encode_asm(const uint32_t *pk, unsigned char *m); +void oqs_rlwe_msrln16_decode_asm(const unsigned char *m, uint32_t *pk); + +// Reconciliation helper +CRYPTO_STATUS oqs_rlwe_msrln16_HelpRec(const uint32_t *x, uint32_t *rvec, OQS_RAND *rand); + +// Partial reconciliation helper (assembly optimized) +void oqs_rlwe_msrln16_helprec_asm(const uint32_t *x, uint32_t *rvec, unsigned char *random_bits); + +// Reconciliation +void 
oqs_rlwe_msrln16_Rec(const uint32_t *x, const uint32_t *rvec, unsigned char *key); +void oqs_rlwe_msrln16_rec_asm(const uint32_t *x, const uint32_t *rvec, unsigned char *key); + +// Error sampling +CRYPTO_STATUS oqs_rlwe_msrln16_get_error(int32_t *e, OQS_RAND *rand); + +// Partial error sampling (assembly optimized) +void oqs_rlwe_msrln16_error_sampling_asm(unsigned char *stream, int32_t *e); + +// Generation of parameter a +CRYPTO_STATUS oqs_rlwe_msrln16_generate_a(uint32_t *a, const unsigned char *seed); + +#ifdef __cplusplus +} +#endif + + +#endif diff --git a/src/kex_rlwe_msrln16/License.txt b/src/kex_rlwe_msrln16/License.txt new file mode 100644 index 000000000..4340e43be --- /dev/null +++ b/src/kex_rlwe_msrln16/License.txt @@ -0,0 +1,25 @@ +LatticeCrypto + +Copyright (c) Microsoft Corporation +All rights reserved. + +MIT License + +Permission is hereby granted, free of charge, to any person obtaining a copy of this software and +associated documentation files (the ""Software""), to deal in the Software without restriction, +including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, +subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all copies or substantial +portions of the Software. + +THE SOFTWARE IS PROVIDED *AS IS*, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT +LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE +SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ + +The library uses the public domain implementation of SHAKE128 by the Keccak team; see the header +of shake128.c for details. diff --git a/src/kex_rlwe_msrln16/README.txt b/src/kex_rlwe_msrln16/README.txt new file mode 100644 index 000000000..6ab71dbec --- /dev/null +++ b/src/kex_rlwe_msrln16/README.txt @@ -0,0 +1,42 @@ + LatticeCrypto v1.0 (C Edition) + ============================== + +LatticeCrypto is a post-quantum secure cryptography library based on the Ring-Learning with Errors (R-LWE) +problem. The version 1.0 of the library implements the instantiation of Peikert's key exchange [1] due to +Alkim, Ducas, Pöppelmann and Schwabe [2], and incorporates novel techniques to provide higher performance. + +The library [3] was developed by Microsoft Research for experimentation purposes. + +*** THE ORIGINAL README HAS BEEN TRIMMED LEAVING ONLY THE INFO RELEVANT FOR THE OQS INTEGRATION *** + +1. CONTENTS: + -------- + +/ - Library C and header files +AMD64/ - Optimized implementation of the NTT for x64 platforms +generic/ - Implementation of the NTT in portable C +README.txt - This readme file + + +2. MAIN FEATURES: + ------------- + +- Support arithmetic functions for computations in power-of-2 cyclotomic rings that are the basis for + implementing Ring-LWE-based cryptographic algorithms. +- Support key exchange providing at least 128 bits of quantum and classical security. +- All functions evaluating secret data have regular, constant-time execution, which provides protection + against timing and cache attacks. +- Basic implementation of the underlying arithmetic functions using portable C to enable support on + a wide range of platforms including x64, x86 and ARM. +- Optional high-performance implementation of the underlying arithmetic functions for x64 platforms on + Linux using assembly and AVX2 vector instructions. + + +REFERENCES +---------- + +[1] C. 
Peikert, "Lattice cryptography for the internet", in Post-Quantum Cryptography - 6th International + Workshop (PQCrypto 2014), LNCS 8772, pp. 197-219. Springer, 2014. +[2] E. Alkim, L. Ducas, T. Pöppelmann and P. Schwabe, "Post-quantum key exchange - a new hope", IACR + Cryptology ePrint Archive, Report 2015/1092, 2015. +[3] https://www.microsoft.com/en-us/research/project/lattice-cryptography-library/ diff --git a/src/kex_rlwe_msrln16/external/shake128.c b/src/kex_rlwe_msrln16/external/shake128.c new file mode 100644 index 000000000..53b9d0e09 --- /dev/null +++ b/src/kex_rlwe_msrln16/external/shake128.c @@ -0,0 +1,332 @@ +#if defined(WINDOWS) +#define UNUSED +#else +#define UNUSED __attribute__ ((unused)) +#endif + +/* +Original implementation modified to allow splitting the absorb and squeeze +phases of Keccak. +*/ + +/* +Implementation by the Keccak, Keyak and Ketje Teams, namely, Guido Bertoni, +Joan Daemen, Michaël Peeters, Gilles Van Assche and Ronny Van Keer, hereby +denoted as "the implementer". + +For more information, feedback or questions, please refer to our websites: +http://keccak.noekeon.org/ +http://keyak.noekeon.org/ +http://ketje.noekeon.org/ + +To the extent possible under law, the implementer has waived all copyright +and related or neighboring rights to the source code in this file. +http://creativecommons.org/publicdomain/zero/1.0/ +*/ + +/* +================================================================ +The purpose of this source file is to demonstrate a readable and compact +implementation of all the Keccak instances approved in the FIPS 202 standard, +including the hash functions and the extendable-output functions (XOFs). + +We focused on clarity and on source-code compactness, +rather than on the performance. + +The advantages of this implementation are: + + The source code is compact, after removing the comments, that is. :-) + + There are no tables with arbitrary constants. 
+ + For clarity, the comments link the operations to the specifications using + the same notation as much as possible. + + There is no restriction in cryptographic features. In particular, + the SHAKE128 and SHAKE256 XOFs can produce any output length. + + The code does not use much RAM, as all operations are done in place. + +The drawbacks of this implementation are: + - There is no message queue. The whole message must be ready in a buffer. + - It is not optimized for performance. + +The implementation is even simpler on a little endian platform. Just define the +LITTLE_ENDIAN symbol in that case. + +For a more complete set of implementations, please refer to +the Keccak Code Package at https://github.com/gvanas/KeccakCodePackage + +For more information, please refer to: + * [Keccak Reference] http://keccak.noekeon.org/Keccak-reference-3.0.pdf + * [Keccak Specifications Summary] http://keccak.noekeon.org/specs_summary.html + +This file uses UTF-8 encoding, as some comments use Greek letters. +================================================================ +*/ + +/** + * Function to compute the Keccak[r, c] sponge function over a given input. + * @param rate The value of the rate r. + * @param capacity The value of the capacity c. + * @param input Pointer to the input message. + * @param inputByteLen The number of input bytes provided in the input message. + * @param delimitedSuffix Bits that will be automatically appended to the end + * of the input message, as in domain separation. + * This is a byte containing from 0 to 7 bits + * These n bits must be in the least significant bit positions + * and must be delimited with a bit 1 at position n + * (counting from 0=LSB to 7=MSB) and followed by bits 0 + * from position n+1 to position 7. + * Some examples: + * - If no bits are to be appended, then @a delimitedSuffix must be 0x01. + * - If the 2-bit sequence 0,1 is to be appended (as for SHA3-*), @a delimitedSuffix must be 0x06. 
+ * - If the 4-bit sequence 1,1,1,1 is to be appended (as for SHAKE*), @a delimitedSuffix must be 0x1F. + * - If the 7-bit sequence 1,1,0,1,0,0,0 is to be absorbed, @a delimitedSuffix must be 0x8B. + * @param output Pointer to the buffer where to store the output. + * @param outputByteLen The number of output bytes desired. + * @pre One must have r+c=1600 and the rate a multiple of 8 bits in this implementation. + */ +UNUSED static void Keccak(unsigned int rate, unsigned int capacity, const unsigned char *input, unsigned long long int inputByteLen, unsigned char delimitedSuffix, unsigned char *output, unsigned long long int outputByteLen); + +/* + * Performs the Keccak absorb phase. Same parameters as the Keccak function, but a SHAKE128_STATE_SIZE-byte state must also be provided. + * The Keccak_squeeze function can be called successively to generate output. + */ +static void Keccak_absorb(unsigned int rate, unsigned int capacity, const unsigned char *input, unsigned long long int inputByteLen, unsigned char delimitedSuffix, unsigned char* state, unsigned int stateLen); + +/* + * Performs the Keccak squeeze phase. Same parameters as the Keccak function, but a SHAKE128_STATE_SIZE-byte state must also be provided. + * The Keccak_absorb function must be called first. 
+ */ +static void Keccak_squeeze(unsigned int rate, unsigned int capacity, unsigned char* state, unsigned char *output, unsigned long long int outputByteLen); + +static void FIPS202_SHAKE128_Absorb(const unsigned char *input, unsigned int inputByteLen, unsigned char* state, unsigned int stateLen) +{ + Keccak_absorb(1344, 256, input, inputByteLen, 0x1F, state, stateLen); +} + +static void FIPS202_SHAKE128_Squeeze(unsigned char* state, unsigned char *output, int outputByteLen) +{ + Keccak_squeeze(1344, 256, state, output, outputByteLen); +} + +UNUSED static void FIPS202_SHAKE128(const unsigned char *input, unsigned int inputByteLen, unsigned char *output, int outputByteLen, unsigned int stateLen) +{ + unsigned char state[200] = { 0 }; + FIPS202_SHAKE128_Absorb(input, inputByteLen, state, stateLen); + FIPS202_SHAKE128_Squeeze(state, output, outputByteLen); +} + +/* +================================================================ +Technicalities +================================================================ +*/ + +typedef unsigned char UINT8; +typedef unsigned long long int UINT64; +typedef UINT64 tKeccakLane; + +#ifndef LITTLE_ENDIAN +/** Function to load a 64-bit value using the little-endian (LE) convention. + * On a LE platform, this could be greatly simplified using a cast. + */ +static UINT64 load64(const UINT8 *x) +{ + int i; + UINT64 u=0; + + for(i=7; i>=0; --i) { + u <<= 8; + u |= x[i]; + } + return u; +} + +/** Function to store a 64-bit value using the little-endian (LE) convention. + * On a LE platform, this could be greatly simplified using a cast. + */ +static void store64(UINT8 *x, UINT64 u) +{ + unsigned int i; + + for(i=0; i<8; ++i) { + x[i] = u; + u >>= 8; + } +} + +/** Function to XOR into a 64-bit value using the little-endian (LE) convention. + * On a LE platform, this could be greatly simplified using a cast. 
+ */ +static void xor64(UINT8 *x, UINT64 u) +{ + unsigned int i; + + for(i=0; i<8; ++i) { + x[i] ^= u; + u >>= 8; + } +} +#endif + +/* +================================================================ +A readable and compact implementation of the Keccak-f[1600] permutation. +================================================================ +*/ + +#define ROL64(a, offset) ((((UINT64)a) << offset) ^ (((UINT64)a) >> (64-offset))) +#define i(x, y) ((x)+5*(y)) + +#ifdef LITTLE_ENDIAN + #define readLane(x, y) (((tKeccakLane*)state)[i(x, y)]) + #define writeLane(x, y, lane) (((tKeccakLane*)state)[i(x, y)]) = (lane) + #define XORLane(x, y, lane) (((tKeccakLane*)state)[i(x, y)]) ^= (lane) +#else + #define readLane(x, y) load64((UINT8*)state+sizeof(tKeccakLane)*i(x, y)) + #define writeLane(x, y, lane) store64((UINT8*)state+sizeof(tKeccakLane)*i(x, y), lane) + #define XORLane(x, y, lane) xor64((UINT8*)state+sizeof(tKeccakLane)*i(x, y), lane) +#endif + +/** + * Function that computes the linear feedback shift register (LFSR) used to + * define the round constants (see [Keccak Reference, Section 1.2]). + */ +static int LFSR86540(UINT8 *LFSR) +{ + int result = ((*LFSR) & 0x01) != 0; + if (((*LFSR) & 0x80) != 0) + // Primitive polynomial over GF(2): x^8+x^6+x^5+x^4+1 + (*LFSR) = ((*LFSR) << 1) ^ 0x71; + else + (*LFSR) <<= 1; + return result; +} + +/** + * Function that computes the Keccak-f[1600] permutation on the given state. 
+ */ +static void KeccakF1600_StatePermute(void *state) +{ + unsigned int round, x, y, j, t; + UINT8 LFSRstate = 0x01; + + for(round=0; round<24; round++) { + { // === θ step (see [Keccak Reference, Section 2.3.2]) === + tKeccakLane C[5], D; + + // Compute the parity of the columns + for(x=0; x<5; x++) + C[x] = readLane(x, 0) ^ readLane(x, 1) ^ readLane(x, 2) ^ readLane(x, 3) ^ readLane(x, 4); + for(x=0; x<5; x++) { + // Compute the θ effect for a given column + D = C[(x+4)%5] ^ ROL64(C[(x+1)%5], 1); + // Add the θ effect to the whole column + for (y=0; y<5; y++) + XORLane(x, y, D); + } + } + + { // === ρ and π steps (see [Keccak Reference, Sections 2.3.3 and 2.3.4]) === + tKeccakLane current, temp; + // Start at coordinates (1 0) + x = 1; y = 0; + current = readLane(x, y); + // Iterate over ((0 1)(2 3))^t * (1 0) for 0 ≤ t ≤ 23 + for(t=0; t<24; t++) { + // Compute the rotation constant r = (t+1)(t+2)/2 + unsigned int r = ((t+1)*(t+2)/2)%64; + // Compute ((0 1)(2 3)) * (x y) + unsigned int Y = (2*x+3*y)%5; x = y; y = Y; + // Swap current and state(x,y), and rotate + temp = readLane(x, y); + writeLane(x, y, ROL64(current, r)); + current = temp; + } + } + + { // === χ step (see [Keccak Reference, Section 2.3.1]) === + tKeccakLane temp[5]; + for(y=0; y<5; y++) { + // Take a copy of the plane + for(x=0; x<5; x++) + temp[x] = readLane(x, y); + // Compute χ on the plane + for(x=0; x<5; x++) + writeLane(x, y, temp[x] ^((~temp[(x+1)%5]) & temp[(x+2)%5])); + } + } + + { // === ι step (see [Keccak Reference, Section 2.3.5]) === + for(j=0; j<7; j++) { + unsigned int bitPosition = (1< +#define MIN(a, b) ((a) < (b) ? 
(a) : (b)) + +void Keccak_absorb(unsigned int rate, unsigned int capacity, const unsigned char *input, unsigned long long int inputByteLen, unsigned char delimitedSuffix, unsigned char* state, unsigned int stateLen) +{ + unsigned int rateInBytes = rate/8; + unsigned int blockSize = 0; + unsigned int i; + + if (((rate + capacity) != 1600) || ((rate % 8) != 0)) + return; + + // === Initialize the state === + memset(state, 0, stateLen); + + // === Absorb all the input blocks === + while(inputByteLen > 0) { + blockSize = MIN(inputByteLen, rateInBytes); + for(i=0; i 0) { + blockSize = MIN(outputByteLen, rateInBytes); + memcpy(output, state, blockSize); + output += blockSize; + outputByteLen -= blockSize; + + if (outputByteLen > 0) + KeccakF1600_StatePermute(state); + } +} diff --git a/src/kex_rlwe_msrln16/external/shake128.h b/src/kex_rlwe_msrln16/external/shake128.h new file mode 100644 index 000000000..8181e7266 --- /dev/null +++ b/src/kex_rlwe_msrln16/external/shake128.h @@ -0,0 +1,24 @@ +#ifndef SHAKE128_H +#define SHAKE128_H + +#define SHAKE128_STATE_SIZE 200 +#define SHAKE128_RATE 168 + +/* + * Computes SHAKE128 on the array "input" of length "inputByteLen", resulting in "outputByteLen" bytes stored in "output". + */ +static void FIPS202_SHAKE128(const unsigned char *input, unsigned int inputByteLen, unsigned char *output, int outputByteLen, unsigned int stateLen); + +/* + * Performs the absorb phase of SHAKE128: ingests the "inputByteLen" bytes stored in "input"; storing the internal + * SHAKE128 state of length SHAKE128_STATE_SIZE in "state". + */ +static void FIPS202_SHAKE128_Absorb(const unsigned char *input, unsigned int inputByteLen, unsigned char* state, unsigned int stateLen); + +/* + * Performs the squeeze phase of SHAKE128: generates "outputByteLen" bytes stored in "output" from the "state" of length + * SHAKE128_STATE_SIZE. Must be preceeded by a call to FIPS202_SHAKE128_Absorb. 
+ */
+static void FIPS202_SHAKE128_Squeeze(unsigned char* state, unsigned char *output, int outputByteLen);
+
+#endif
diff --git a/src/kex_rlwe_msrln16/generic/ntt.c b/src/kex_rlwe_msrln16/generic/ntt.c
new file mode 100644
index 000000000..e39a95720
--- /dev/null
+++ b/src/kex_rlwe_msrln16/generic/ntt.c
@@ -0,0 +1,204 @@
+/****************************************************************************************
+* LatticeCrypto: an efficient post-quantum Ring-Learning With Errors cryptography library
+*
+* Copyright (c) Microsoft Corporation. All rights reserved.
+*
+*
+* Abstract: NTT functions and other polynomial operations
+*
+*****************************************************************************************/
+
+#include "../LatticeCrypto_priv.h"
+
+// Selects the low 12 bits of a value; used to split operands in the reductions below.
+const uint32_t mask12 = ((uint64_t)1 << 12) - 1;
+
+
+int32_t oqs_rlwe_msrln16_reduce12289(int64_t a)
+{ // Reduction modulo q
+    // Writes a = c0 + c1*2^12 and returns 3*c0 - c1. Since 3*2^12 = 12288 = -1 (mod 12289),
+    // the result is congruent to 3*a modulo 12289; it is not normalized to [0, q).
+    int32_t c0, c1;
+
+    c0 = (int32_t)(a & mask12);
+    c1 = (int32_t)(a >> 12);
+
+    return (3*c0 - c1);
+}
+
+
+int32_t oqs_rlwe_msrln16_reduce12289_2x(int64_t a)
+{ // Two merged reductions modulo q
+    // Writes a = c0 + c1*2^12 + c2*2^24 and returns 9*c0 - 3*c1 + c2, which is congruent to
+    // 9*a modulo 12289 (the factor two chained single reductions would introduce).
+    int32_t c0, c1, c2;
+
+    c0 = (int32_t)(a & mask12);
+    c1 = (int32_t)((a >> 12) & mask12);
+    c2 = (int32_t)(a >> 24);
+
+    return (9*c0 - 3*c1 + c2);
+}
+
+
+void oqs_rlwe_msrln16_NTT_CT_std2rev_12289(int32_t* a, const int32_t* psi_rev, unsigned int N)
+{ // Forward NTT
+    // Transforms a[0..N-1] in place using the twiddle factors in psi_rev.
+    // NOTE: the middle stage below hard-codes 128 groups of stride 4, which lines up with
+    // the generic stages around it only when N = 1024.
+    unsigned int m, i, j, j1, j2, k = N;
+    int32_t S, U, V;
+
+    // Stages m = 1, 2, ..., 64: one reduction per butterfly.
+    for (m = 1; m < 128; m = 2*m) {
+        k = k >> 1;
+        for (i = 0; i < m; i++) {
+            j1 = 2*i*k;
+            j2 = j1+k-1;
+            S = psi_rev[m+i];
+            for (j = j1; j <= j2; j++) {
+                U = a[j];
+                V = oqs_rlwe_msrln16_reduce12289((int64_t)a[j+k]*S);
+                a[j] = U+V;
+                a[j+k] = U-V;
+            }
+        }
+    }
+
+    // Stage m = 128: both butterfly inputs are reduced here, presumably to bound the growth
+    // of the unreduced sums/differences accumulated by the earlier stages.
+    k = 4;
+    for (i = 0; i < 128; i++) {
+        j1 = 8*i;
+        j2 = j1+3;
+        S = psi_rev[i+128];
+        for (j = j1; j <= j2; j++) {
+            U = oqs_rlwe_msrln16_reduce12289((int64_t)a[j]);
+            V = oqs_rlwe_msrln16_reduce12289_2x((int64_t)a[j+4]*S);
+            a[j] = U+V;
+            a[j+4] = U-V;
+        }
+    }
+
+    // Remaining stages m = 256, ..., N/2.
+    for (m = 256; m < N; m = 2*m) {
+        k = k >> 1;
+        for (i = 0; i < m; i++) {
+            j1 = 2*i*k;
+            j2 = j1+k-1;
+            S = psi_rev[m+i];
+            for (j = j1; j <= j2; j++) {
+                U = a[j];
+                V = oqs_rlwe_msrln16_reduce12289((int64_t)a[j+k]*S);
+                a[j] = U+V;
+                a[j+k] = U-V;
+            }
+        }
+    }
+    return;
+}
+
+
+void oqs_rlwe_msrln16_INTT_GS_rev2std_12289(int32_t* a, const int32_t* omegainv_rev, const int32_t omegainv1N_rev, const int32_t Ninv, unsigned int N)
+{ // Inverse NTT
+    // Transforms a[0..N-1] in place. The final pass folds the scaling constants Ninv and
+    // omegainv1N_rev into the last butterfly layer.
+    unsigned int m, h, i, j, j1, j2, k = 1;
+    int32_t S, U, V;
+    int64_t temp;
+
+    for (m = N; m > 2; m >>= 1) {
+        j1 = 0;
+        h = m >> 1;
+        for (i = 0; i < h; i++) {
+            j2 = j1+k-1;
+            S = omegainv_rev[h+i];
+            for (j = j1; j <= j2; j++) {
+                U = a[j];
+                V = a[j+k];
+                a[j] = U+V;
+                temp = (int64_t)(U-V)*S;
+                if (m == 32) {
+                    // Extra reduction of both outputs at this one stage.
+                    a[j] = oqs_rlwe_msrln16_reduce12289((int64_t)a[j]);
+                    a[j+k] = oqs_rlwe_msrln16_reduce12289_2x(temp);
+                } else {
+                    a[j+k] = oqs_rlwe_msrln16_reduce12289(temp);
+                }
+            }
+            j1 = j1+2*k;
+        }
+        k = 2*k;
+    }
+    // Last layer: k == N/2 here (for N a power of two).
+    for (j = 0; j < k; j++) {
+        U = a[j];
+        V = a[j+k];
+        a[j] = oqs_rlwe_msrln16_reduce12289((int64_t)(U+V)*Ninv);
+        a[j+k] = oqs_rlwe_msrln16_reduce12289((int64_t)(U-V)*omegainv1N_rev);
+    }
+    return;
+}
+
+
+void oqs_rlwe_msrln16_two_reduce12289(int32_t* a, unsigned int N)
+{ // Two consecutive reductions modulo q
+    unsigned int i;
+
+    for (i = 0; i < N; i++) {
+        a[i] = oqs_rlwe_msrln16_reduce12289((int64_t)a[i]);
+        a[i] = oqs_rlwe_msrln16_reduce12289((int64_t)a[i]);
+    }
+}
+
+
+void oqs_rlwe_msrln16_pmul(int32_t* a, int32_t* b, int32_t* c, unsigned int N)
+{ // Component-wise multiplication
+    // c[i] = a[i]*b[i], passed through two reductions.
+    unsigned int i;
+
+    for (i = 0; i < N; i++) {
+        c[i] = oqs_rlwe_msrln16_reduce12289((int64_t)a[i]*b[i]);
+        c[i] = oqs_rlwe_msrln16_reduce12289((int64_t)c[i]);
+    }
+}
+
+
+void oqs_rlwe_msrln16_pmuladd(int32_t* a, int32_t* b, int32_t* c, int32_t* d, unsigned int N)
+{ // Component-wise multiplication and addition
+    // d[i] = a[i]*b[i] + c[i], passed through two reductions.
+    unsigned int i;
+
+    for (i = 0; i < N; i++) {
+        d[i] = oqs_rlwe_msrln16_reduce12289((int64_t)a[i]*b[i] + c[i]);
+        d[i] = oqs_rlwe_msrln16_reduce12289((int64_t)d[i]);
+    }
+}
+
+
+void oqs_rlwe_msrln16_smul(int32_t* a, int32_t scalar, unsigned int N)
+{ // Component-wise multiplication with scalar
+    // No reduction is applied; the product is stored as a plain int32_t.
+    unsigned int i;
+
+    for (i = 0; i < N; i++) {
+        a[i] = a[i]*scalar;
+    }
+}
+
+
+void oqs_rlwe_msrln16_correction(int32_t* a, int32_t p, unsigned int N)
+{ // Correction modulo q
+    // Branch-free: maps each a[i] in [-p, 2*p) to its representative in [0, p).
+    // The sign mask must replicate bit 31, so the shift is 8*sizeof(int32_t) - 1; the
+    // previous 4*sizeof(int32_t) - 1 (= 15) only produced 0/-1 for |a[i]| < 2^15.
+    unsigned int i;
+    int32_t mask;
+
+    for (i = 0; i < N; i++) {
+        mask = a[i] >> (8*sizeof(int32_t) - 1);
+        a[i] += (p & mask) - p;
+        mask = a[i] >> (8*sizeof(int32_t) - 1);
+        a[i] += (p & mask);
+    }
+}
diff --git a/src/kex_rlwe_msrln16/kex_rlwe_msrln16.c b/src/kex_rlwe_msrln16/kex_rlwe_msrln16.c
new file mode 100644
index 000000000..f5a801d42
--- /dev/null
+++ b/src/kex_rlwe_msrln16/kex_rlwe_msrln16.c
@@ -0,0 +1,193 @@
+#if defined(WINDOWS)
+#define UNUSED
+#else
+#define UNUSED __attribute__ ((unused))
+#endif
+
+#include <stdlib.h>
+#include <string.h>
+#if !defined(WINDOWS)
+#include <strings.h>
+#include <unistd.h>
+#endif
+
+#include <oqs/kex.h>
+#include <oqs/rand.h>
+
+#include "kex_rlwe_msrln16.h"
+#include "LatticeCrypto.h"
+#include "LatticeCrypto_priv.h"
+
+/* Allocates an OQS_KEX object wired to the MSR LN16 ring-LWE key exchange.
+ * rand is stored in the object and later passed to the key-generation routines.
+ * Returns NULL if memory cannot be allocated; otherwise the caller releases the
+ * object with OQS_KEX_rlwe_msrln16_free(). */
+OQS_KEX *OQS_KEX_rlwe_msrln16_new(OQS_RAND *rand) {
+
+	OQS_KEX *k = malloc(sizeof(OQS_KEX));
+	if (k == NULL) {
+		return NULL;
+	}
+
+	k->ctx = NULL;
+	k->method_name = strdup("RLWE MSR LN16");
+	if (k->method_name == NULL) {
+		/* Out of memory: do not hand back a half-initialized object. */
+		free(k);
+		return NULL;
+	}
+	k->estimated_classical_security = 128;
+	k->estimated_quantum_security = 128;
+	k->seed = NULL;
+	k->seed_len = 0;
+	k->named_parameters = NULL;
+	k->rand = rand;
+	k->params = NULL;
+	k->alice_0 = &OQS_KEX_rlwe_msrln16_alice_0;
+	k->bob = &OQS_KEX_rlwe_msrln16_bob;
+	k->alice_1 = &OQS_KEX_rlwe_msrln16_alice_1;
+	k->alice_priv_free = &OQS_KEX_rlwe_msrln16_alice_priv_free;
+	k->free = &OQS_KEX_rlwe_msrln16_free;
+
+	return k;
+}
+
+/* Alice's first step: generates her private key and her public message.
+ * On success returns 1 and hands ownership of *alice_priv and *alice_msg to the caller.
+ * On failure returns 0 with both output pointers set to NULL. */
+int OQS_KEX_rlwe_msrln16_alice_0(OQS_KEX *k, void **alice_priv, uint8_t **alice_msg, size_t *alice_msg_len) {
+
+	int ret;
+
+	*alice_priv = NULL;
+	/* alice_msg is alice's public key */
+	*alice_msg = NULL;
+
+	*alice_msg = malloc(OQS_RLWE_MSRLN16_PKA_BYTES);
+	if (*alice_msg == NULL) {
+		goto err;
+	}
+	*alice_priv = malloc(1024 * sizeof(uint32_t));
+	if (*alice_priv == NULL) {
+		goto err;
+	}
+
+	if (oqs_rlwe_msrln16_KeyGeneration_A((int32_t *) *alice_priv, (unsigned char *) *alice_msg, k->rand) != CRYPTO_SUCCESS) {
+		goto err;
+	}
+	*alice_msg_len = OQS_RLWE_MSRLN16_PKA_BYTES;
+
+	ret = 1;
+	goto cleanup;
+
+err:
+	ret = 0;
+	/* Reset the outputs after freeing so the caller never sees a dangling pointer. */
+	free(*alice_msg);
+	*alice_msg = NULL;
+	free(*alice_priv);
+	*alice_priv = NULL;
+
+cleanup:
+	return ret;
+}
+
+/* Bob's step: consumes Alice's message, produces Bob's reply and the shared key.
+ * On success returns 1 and hands ownership of *bob_msg and *key to the caller.
+ * On failure (including a wrong alice_msg_len) returns 0 with both set to NULL. */
+int OQS_KEX_rlwe_msrln16_bob(OQS_KEX *k, const uint8_t *alice_msg, const size_t alice_msg_len, uint8_t **bob_msg, size_t *bob_msg_len, uint8_t **key, size_t *key_len) {
+
+	int ret;
+
+	*bob_msg = NULL;
+	*key = NULL;
+
+	if (alice_msg_len != OQS_RLWE_MSRLN16_PKA_BYTES) {
+		goto err;
+	}
+	*bob_msg = malloc(OQS_RLWE_MSRLN16_PKB_BYTES);
+	if (*bob_msg == NULL) {
+		goto err;
+	}
+	*key = malloc(OQS_RLWE_MSRLN16_SHAREDKEY_BYTES);
+	if (*key == NULL) {
+		goto err;
+	}
+
+	if (oqs_rlwe_msrln16_SecretAgreement_B((unsigned char *) alice_msg, (unsigned char *) *key, (unsigned char *) *bob_msg, k->rand) != CRYPTO_SUCCESS) {
+		goto err;
+	}
+
+	*key_len = OQS_RLWE_MSRLN16_SHAREDKEY_BYTES;
+	*bob_msg_len = OQS_RLWE_MSRLN16_PKB_BYTES;
+
+	ret = 1;
+	goto cleanup;
+
+err:
+	ret = 0;
+	/* Reset the outputs after freeing so the caller never sees a dangling pointer. */
+	free(*bob_msg);
+	*bob_msg = NULL;
+	free(*key);
+	*key = NULL;
+
+cleanup:
+
+	return ret;
+}
+
+/* Alice's second step: derives the shared key from her private key and Bob's reply.
+ * On success returns 1 and hands ownership of *key to the caller.
+ * On failure (including a wrong bob_msg_len) returns 0 with *key set to NULL. */
+int OQS_KEX_rlwe_msrln16_alice_1(UNUSED OQS_KEX *k, const void *alice_priv, const uint8_t *bob_msg, const size_t bob_msg_len, uint8_t **key, size_t *key_len) {
+
+	int ret;
+
+	*key = NULL;
+
+	if (bob_msg_len != OQS_RLWE_MSRLN16_PKB_BYTES) {
+		goto err;
+	}
+
+	*key = malloc(OQS_RLWE_MSRLN16_SHAREDKEY_BYTES);
+	if (*key == NULL) {
+		goto err;
+	}
+
+	if (oqs_rlwe_msrln16_SecretAgreement_A((unsigned char *) bob_msg, (int32_t *) alice_priv, (unsigned char *) *key) != CRYPTO_SUCCESS) {
+		goto err;
+	}
+
+	*key_len = OQS_RLWE_MSRLN16_SHAREDKEY_BYTES;
+
+	ret = 1;
+	goto cleanup;
+
+err:
+	ret = 0;
+	/* Reset the output after freeing so the caller never sees a dangling pointer. */
+	free(*key);
+	*key = NULL;
+
+cleanup:
+
+	return ret;
+}
+
+/* Releases the private key allocated by OQS_KEX_rlwe_msrln16_alice_0(). */
+void OQS_KEX_rlwe_msrln16_alice_priv_free(UNUSED OQS_KEX *k, void *alice_priv) {
+	if (alice_priv) {
+		free(alice_priv);
+	}
+}
+
+/* Releases an object created by OQS_KEX_rlwe_msrln16_new(); NULL is ignored. */
+void OQS_KEX_rlwe_msrln16_free(OQS_KEX *k) {
+	if (!k) {
+		return;
+	}
+	free(k->method_name);
+	k->method_name = NULL;
+	free(k);
+}
diff --git a/src/kex_rlwe_msrln16/kex_rlwe_msrln16.h b/src/kex_rlwe_msrln16/kex_rlwe_msrln16.h
new file mode 100644
index 000000000..ad1ee4f52
--- /dev/null
+++ b/src/kex_rlwe_msrln16/kex_rlwe_msrln16.h
@@ -0,0 +1,24 @@
+/**
+ * \file kex_rlwe_msrln16.h
+ * \brief Header for ring-LWE key exchange protocol from the Microsoft LatticeCrypto library
+ */
+
+#ifndef __OQS_KEX_RLWE_MSRLN16_H
+#define __OQS_KEX_RLWE_MSRLN16_H
+
+#include <stddef.h>
+#include <stdint.h>
+
+#include <oqs/kex.h>
+#include <oqs/rand.h>
+
+OQS_KEX *OQS_KEX_rlwe_msrln16_new(OQS_RAND *rand);
+
+int OQS_KEX_rlwe_msrln16_alice_0(OQS_KEX *k, void **alice_priv, uint8_t **alice_msg, size_t *alice_msg_len);
+int OQS_KEX_rlwe_msrln16_bob(OQS_KEX *k, const uint8_t *alice_msg, const size_t alice_msg_len, uint8_t **bob_msg, size_t *bob_msg_len, uint8_t **key, size_t *key_len);
+int OQS_KEX_rlwe_msrln16_alice_1(OQS_KEX *k, const void *alice_priv, const uint8_t *bob_msg, const size_t bob_msg_len, uint8_t **key, size_t *key_len);
+
+void OQS_KEX_rlwe_msrln16_alice_priv_free(OQS_KEX *k, void *alice_priv);
+void OQS_KEX_rlwe_msrln16_free(OQS_KEX *k);
+
+#endif
diff --git a/src/kex_rlwe_msrln16/ntt_constants.c b/src/kex_rlwe_msrln16/ntt_constants.c
new file mode 100644
index 000000000..c286ca515
--- /dev/null
+++ b/src/kex_rlwe_msrln16/ntt_constants.c
@@ -0,0 +1,145 @@
+/****************************************************************************************
+* LatticeCrypto: an efficient post-quantum Ring-Learning With Errors cryptography library
+*
+* Copyright (c) Microsoft Corporation. All rights reserved.
+* +* +* Abstract: fixed constants for the Number Theoretic Transform (NTT) +* +*****************************************************************************************/ + +#include "LatticeCrypto_priv.h" + + +// N^-1 * prime_scale^-8 +const int32_t Ninv8_ntt1024_12289 = 8350; +// N^-1 * prime_scale^-7 * omegainv_rev_ntt1024_12289[1] +const int32_t omegainv7N_rev_ntt1024_12289 = 795; +// N^-1 * prime_scale^-11 +const int32_t Ninv11_ntt1024_12289 = 2585; +// N^-1 * prime_scale^-10 * omegainv_rev_ntt1024_12289[1] +const int32_t omegainv10N_rev_ntt1024_12289 = 10953; + + +// Index-reversed matrices containing powers of psi (psi_rev_nttxxx_yyy) and inverse powers of omega (omegainv_rev_nttxxx_yyy), +// where xxx is parameter N and yyy is the prime q. + +const int32_t psi_rev_ntt1024_12289[1024] = { + 8193, 493, 6845, 9908, 1378, 10377, 7952, 435, 10146, 1065, 404, 7644, 1207, 3248, 11121, 5277, 2437, 3646, 2987, 6022, 9867, 6250, 10102, 9723, 1002, 7278, 4284, 7201, + 875, 3780, 1607, 4976, 8146, 4714, 242, 1537, 3704, 9611, 5019, 545, 5084, 10657, 4885, 11272, 3066, 12262, 3763, 10849, 2912, 5698, 11935, 4861, 7277, 9808, 11244, 2859, + 7188, 1067, 2401, 11847, 390, 11516, 8511, 3833, 2780, 7094, 4895, 1484, 2305, 5042, 8236, 2645, 7875, 9442, 2174, 7917, 1689, 3364, 4057, 3271, 10863, 4654, 1777, 10626, + 3636, 7351, 9585, 6998, 160, 3149, 4437, 12286, 10123, 3915, 7370, 12176, 4048, 2249, 2884, 1153, 9103, 6882, 2126, 10659, 3510, 5332, 2865, 9919, 9320, 8311, 9603, 9042, + 3016, 12046, 9289, 11618, 7098, 3136, 9890, 3400, 2178, 1544, 5559, 420, 8304, 4905, 476, 3531, 9326, 4896, 9923, 3051, 3091, 81, 1000, 4320, 1177, 8034, 9521, 10654, 11563, + 7678, 10436, 12149, 3014, 9088, 5086, 1326, 11119, 2319, 11334, 790, 2747, 7443, 3135, 3712, 1062, 9995, 7484, 8736, 9283, 2744, 11726, 2975, 9664, 949, 7468, 9650, 7266, + 5828, 6561, 7698, 3328, 6512, 1351, 7311, 8155, 5736, 722, 10984, 4043, 7143, 10810, 1, 8668, 2545, 3504, 8747, 11077, 1646, 9094, 5860, 1759, 8582, 
3694, 7110, 8907, 11934, + 8058, 9741, 9558, 3932, 5911, 4890, 3637, 8830, 5542, 12144, 5755, 7657, 7901, 11029, 11955, 9863, 10861, 1696, 3284, 2881, 7197, 2089, 9000, 2013, 729, 9048, 11809, 2842, + 11267, 9, 6498, 544, 2468, 339, 1381, 2525, 8112, 3584, 6958, 4989, 10616, 8011, 5374, 9452, 12159, 4354, 9893, 7837, 3296, 8340, 7222, 2197, 118, 2476, 5767, 827, 8541, + 11336, 3434, 3529, 2908, 12071, 2361, 1843, 3030, 8174, 6147, 9842, 8326, 576, 10335, 10238, 10484, 9407, 11836, 5908, 418, 3772, 7515, 5429, 7552, 10996, 12133, 2767, 3969, + 8298, 6413, 10008, 2031, 5333, 10800, 9789, 10706, 5942, 1263, 49, 5915, 10806, 11939, 10777, 1815, 5383, 3202, 4493, 6920, 10232, 1975, 8532, 2925, 347, 4754, 1858, 11863, + 8974, 9551, 5868, 9634, 5735, 11566, 12115, 10596, 3009, 6190, 11994, 6523, 652, 3762, 9370, 4016, 4077, 8561, 4049, 5990, 11130, 11143, 948, 325, 1404, 6992, 6119, 8333, + 10929, 1200, 5184, 2555, 6122, 1594, 10327, 7183, 5961, 2692, 12121, 4298, 3329, 5919, 4433, 8455, 7032, 1747, 3123, 3054, 6803, 5782, 10723, 9341, 2503, 683, 2459, 3656, + 64, 4240, 3570, 835, 6065, 4046, 11580, 10970, 3150, 10331, 4322, 2078, 1112, 4079, 11231, 441, 922, 1050, 4536, 6844, 8429, 2683, 11099, 3818, 6171, 8500, 12142, 6833, 4449, + 4749, 6752, 7500, 7822, 8214, 6974, 7965, 7373, 2169, 522, 5079, 3262, 10316, 6715, 1278, 9945, 3514, 11248, 11271, 5925, 468, 3988, 382, 11973, 5339, 6843, 6196, 8579, 2033, + 8291, 1922, 3879, 11035, 973, 6854, 10930, 5206, 6760, 3199, 56, 3565, 654, 1702, 10302, 5862, 6153, 5415, 8646, 11889, 10561, 7341, 6152, 7232, 4698, 8844, 4780, 10240, 4912, + 1321, 12097, 7048, 2920, 3127, 4169, 11502, 3482, 11279, 5468, 5874, 11612, 6055, 8953, 52, 3174, 10966, 9523, 151, 2127, 3957, 2839, 9784, 6383, 1579, 431, 7507, 5886, 3029, + 6695, 4213, 504, 11684, 2302, 8689, 9026, 4624, 6212, 11868, 4080, 6221, 8687, 1003, 8757, 241, 58, 5009, 10333, 885, 6281, 3438, 9445, 11314, 8077, 6608, 3477, 142, 1105, + 8841, 343, 4538, 1908, 1208, 4727, 7078, 
10423, 10125, 6873, 11573, 10179, 416, 814, 1705, 2450, 8700, 717, 9307, 1373, 8186, 2429, 10568, 10753, 7228, 11071, 438, 8774, 5993, + 3278, 4209, 6877, 3449, 1136, 3708, 3238, 2926, 1826, 4489, 3171, 8024, 8611, 1928, 464, 3205, 8930, 7080, 1092, 10900, 10221, 11943, 4404, 9126, 4032, 7449, 6127, 8067, 10763, + 125, 540, 8921, 8062, 612, 8051, 12229, 9572, 9089, 10754, 10029, 68, 6453, 7723, 4781, 4924, 1014, 448, 3942, 5232, 1327, 8682, 3744, 7326, 3056, 9761, 5845, 5588, 412, 7187, + 3975, 4883, 3087, 6454, 2257, 7784, 5676, 1417, 8400, 11710, 5596, 5987, 9175, 2769, 5966, 212, 6555, 11113, 5508, 11014, 1125, 4860, 10844, 1131, 4267, 6636, 2275, 9828, 5063, + 4176, 3765, 1518, 8794, 4564, 10224, 5826, 3534, 3961, 4145, 10533, 506, 11034, 6505, 10897, 2674, 10077, 3338, 9013, 3511, 6811, 11111, 2776, 1165, 2575, 8881, 10347, 377, + 4578, 11914, 10669, 10104, 392, 10453, 425, 9489, 193, 2231, 6197, 1038, 11366, 6204, 8122, 2894, 3654, 10975, 10545, 6599, 2455, 11951, 3947, 20, 5002, 5163, 4608, 8946, 8170, + 10138, 1522, 8665, 10397, 3344, 5598, 10964, 6565, 11260, 1945, 11041, 9847, 7174, 4939, 2148, 6330, 3959, 5797, 4913, 3528, 8054, 3825, 8914, 9998, 4335, 8896, 9342, 3982, + 6680, 11653, 7790, 6617, 1737, 622, 10485, 10886, 6195, 7100, 1687, 406, 12143, 5268, 9389, 12050, 994, 7735, 5464, 7383, 4670, 512, 364, 9929, 3028, 5216, 5518, 1226, 7550, + 8038, 7043, 7814, 11053, 3017, 3121, 7584, 2600, 11232, 6780, 12085, 5219, 1409, 9600, 4605, 8151, 12109, 463, 8882, 8308, 10821, 9247, 10945, 9806, 2054, 6203, 6643, 3120, + 6105, 8348, 8536, 6919, 8753, 11007, 8717, 9457, 2021, 9060, 4730, 3929, 10583, 3723, 845, 1936, 7, 5054, 3154, 3285, 4360, 3805, 11522, 2213, 4153, 12239, 12073, 5526, 769, + 4099, 3944, 5604, 5530, 11024, 9282, 2171, 3480, 7434, 8520, 3232, 11996, 9656, 1406, 2945, 5349, 7207, 4590, 11607, 11309, 5202, 844, 7082, 4050, 8016, 9068, 9694, 8452, 7000, + 5662, 567, 2941, 8619, 3808, 4987, 2373, 5135, 63, 7605, 3360, 11839, 10345, 578, 6921, 
7628, 510, 5386, 2622, 7806, 5703, 10783, 9224, 11379, 5900, 4719, 11538, 3502, 5789, + 10631, 5618, 826, 5043, 3090, 10891, 9951, 7596, 2293, 11872, 6151, 3469, 4443, 8871, 1555, 1802, 5103, 1891, 1223, 2334, 7878, 1590, 881, 365, 1927, 11274, 4510, 9652, 2946, + 6828, 1280, 614, 10918, 12265, 7250, 6742, 9804, 11385, 2276, 11307, 2593, 879, 7899, 8071, 3454, 8531, 3795, 9021, 5776, 1849, 7766, 7988, 457, 8, 530, 9663, 7785, 11511, 3578, + 7592, 10588, 3466, 8972, 9757, 3332, 139, 2046, 2940, 10808, 9332, 874, 2301, 5650, 12119, 150, 648, 8000, 9982, 9416, 2827, 2434, 11498, 6481, 12268, 9754, 11169, 11823, 11259, + 3821, 10608, 2929, 6263, 4649, 6320, 9687, 10388, 502, 5118, 8496, 6226, 10716, 8443, 7624, 6883, 9269, 6616, 8620, 5287, 944, 7519, 6125, 1882, 11249, 10254, 5410, 1251, 1790, + 5275, 8449, 10447, 4113, 72, 2828, 4352, 7455, 2712, 11048, 7911, 3451, 4094, 6508, 3045, 11194, 2643, 1783, 7211, 4974, 7724, 9811, 9449, 3019, 4194, 2730, 6878, 10421, 2253, + 4518, 9195, 7469, 11129, 9173, 12100, 1763, 2209, 9617, 5170, 865, 1279, 1694, 10759, 8420, 4423, 10555, 3815, 5832, 10939 +}; + + +const int32_t omegainv_rev_ntt1024_12289[1024] = { + 8193, 11796, 2381, 5444, 11854, 4337, 1912, 10911, 7012, 1168, 9041, 11082, 4645, 11885, 11224, 2143, 7313, 10682, 8509, 11414, 5088, 8005, 5011, 11287, 2566, 2187, 6039, 2422, + 6267, 9302, 8643, 9852, 8456, 3778, 773, 11899, 442, 9888, 11222, 5101, 9430, 1045, 2481, 5012, 7428, 354, 6591, 9377, 1440, 8526, 27, 9223, 1017, 7404, 1632, 7205, 11744, 7270, + 2678, 8585, 10752, 12047, 7575, 4143, 8758, 11813, 7384, 3985, 11869, 6730, 10745, 10111, 8889, 2399, 9153, 5191, 671, 3000, 243, 9273, 3247, 2686, 3978, 2969, 2370, 9424, 6957, + 8779, 1630, 10163, 5407, 3186, 11136, 9405, 10040, 8241, 113, 4919, 8374, 2166, 3, 7852, 9140, 12129, 5291, 2704, 4938, 8653, 1663, 10512, 7635, 1426, 9018, 8232, 8925, 10600, 4372, + 10115, 2847, 4414, 9644, 4053, 7247, 9984, 10805, 7394, 5195, 9509, 953, 3748, 11462, 6522, 9813, 12171, 
10092, 5067, 3949, 8993, 4452, 2396, 7935, 130, 2837, 6915, 4278, 1673, 7300, + 5331, 8705, 4177, 9764, 10908, 11950, 9821, 11745, 5791, 12280, 1022, 9447, 480, 3241, 11560, 10276, 3289, 10200, 5092, 9408, 9005, 10593, 1428, 2426, 334, 1260, 4388, 4632, 6534, + 145, 6747, 3459, 8652, 7399, 6378, 8357, 2731, 2548, 4231, 355, 3382, 5179, 8595, 3707, 10530, 6429, 3195, 10643, 1212, 3542, 8785, 9744, 3621, 12288, 1479, 5146, 8246, 1305, 11567, + 6553, 4134, 4978, 10938, 5777, 8961, 4591, 5728, 6461, 5023, 2639, 4821, 11340, 2625, 9314, 563, 9545, 3006, 3553, 4805, 2294, 11227, 8577, 9154, 4846, 9542, 11499, 955, 9970, 1170, + 10963, 7203, 3201, 9275, 140, 1853, 4611, 726, 1635, 2768, 4255, 11112, 7969, 11289, 12208, 9198, 9238, 2366, 7393, 2963, 11184, 12147, 8812, 5681, 4212, 975, 2844, 8851, 6008, 11404, + 1956, 7280, 12231, 12048, 3532, 11286, 3602, 6068, 8209, 421, 6077, 7665, 3263, 3600, 9987, 605, 11785, 8076, 5594, 9260, 6403, 4782, 11858, 10710, 5906, 2505, 9450, 8332, 10162, + 12138, 2766, 1323, 9115, 12237, 3336, 6234, 677, 6415, 6821, 1010, 8807, 787, 8120, 9162, 9369, 5241, 192, 10968, 7377, 2049, 7509, 3445, 7591, 5057, 6137, 4948, 1728, 400, 3643, + 6874, 6136, 6427, 1987, 10587, 11635, 8724, 12233, 9090, 5529, 7083, 1359, 5435, 11316, 1254, 8410, 10367, 3998, 10256, 3710, 6093, 5446, 6950, 316, 11907, 8301, 11821, 6364, 1018, + 1041, 8775, 2344, 11011, 5574, 1973, 9027, 7210, 11767, 10120, 4916, 4324, 5315, 4075, 4467, 4789, 5537, 7540, 7840, 5456, 147, 3789, 6118, 8471, 1190, 9606, 3860, 5445, 7753, 11239, + 11367, 11848, 1058, 8210, 11177, 10211, 7967, 1958, 9139, 1319, 709, 8243, 6224, 11454, 8719, 8049, 12225, 8633, 9830, 11606, 9786, 2948, 1566, 6507, 5486, 9235, 9166, 10542, 5257, + 3834, 7856, 6370, 8960, 7991, 168, 9597, 6328, 5106, 1962, 10695, 6167, 9734, 7105, 11089, 1360, 3956, 6170, 5297, 10885, 11964, 11341, 1146, 1159, 6299, 8240, 3728, 8212, 8273, 2919, + 8527, 11637, 5766, 295, 6099, 9280, 1693, 174, 723, 6554, 2655, 6421, 2738, 3315, 
426, 10431, 7535, 11942, 9364, 3757, 10314, 2057, 5369, 7796, 9087, 6906, 10474, 1512, 350, 1483, + 6374, 12240, 11026, 6347, 1583, 2500, 1489, 6956, 10258, 2281, 5876, 3991, 8320, 9522, 156, 1293, 4737, 6860, 4774, 8517, 11871, 6381, 453, 2882, 1805, 2051, 1954, 11713, 3963, 2447, + 6142, 4115, 9259, 10446, 9928, 218, 9381, 8760, 8855, 1350, 6457, 8474, 1734, 7866, 3869, 1530, 10595, 11010, 11424, 7119, 2672, 10080, 10526, 189, 3116, 1160, 4820, 3094, 7771, 10036, + 1868, 5411, 9559, 8095, 9270, 2840, 2478, 4565, 7315, 5078, 10506, 9646, 1095, 9244, 5781, 8195, 8838, 4378, 1241, 9577, 4834, 7937, 9461, 12217, 8176, 1842, 3840, 7014, 10499, 11038, + 6879, 2035, 1040, 10407, 6164, 4770, 11345, 7002, 3669, 5673, 3020, 5406, 4665, 3846, 1573, 6063, 3793, 7171, 11787, 1901, 2602, 5969, 7640, 6026, 9360, 1681, 8468, 1030, 466, 1120, + 2535, 21, 5808, 791, 9855, 9462, 2873, 2307, 4289, 11641, 12139, 170, 6639, 9988, 11415, 2957, 1481, 9349, 10243, 12150, 8957, 2532, 3317, 8823, 1701, 4697, 8711, 778, 4504, 2626, + 11759, 12281, 11832, 4301, 4523, 10440, 6513, 3268, 8494, 3758, 8835, 4218, 4390, 11410, 9696, 982, 10013, 904, 2485, 5547, 5039, 24, 1371, 11675, 11009, 5461, 9343, 2637, 7779, 1015, + 10362, 11924, 11408, 10699, 4411, 9955, 11066, 10398, 7186, 10487, 10734, 3418, 7846, 8820, 6138, 417, 9996, 4693, 2338, 1398, 9199, 7246, 11463, 6671, 1658, 6500, 8787, 751, 7570, + 6389, 910, 3065, 1506, 6586, 4483, 9667, 6903, 11779, 4661, 5368, 11711, 1944, 450, 8929, 4684, 12226, 7154, 9916, 7302, 8481, 3670, 9348, 11722, 6627, 5289, 3837, 2595, 3221, 4273, + 8239, 5207, 11445, 7087, 980, 682, 7699, 5082, 6940, 9344, 10883, 2633, 293, 9057, 3769, 4855, 8809, 10118, 3007, 1265, 6759, 6685, 8345, 8190, 11520, 6763, 216, 50, 8136, 10076, 767, + 8484, 7929, 9004, 9135, 7235, 12282, 10353, 11444, 8566, 1706, 8360, 7559, 3229, 10268, 2832, 3572, 1282, 3536, 5370, 3753, 3941, 6184, 9169, 5646, 6086, 10235, 2483, 1344, 3042, 1468, + 3981, 3407, 11826, 180, 4138, 7684, 2689, 10880, 
7070, 204, 5509, 1057, 9689, 4705, 9168, 9272, 1236, 4475, 5246, 4251, 4739, 11063, 6771, 7073, 9261, 2360, 11925, 11777, 7619, 4906, + 6825, 4554, 11295, 239, 2900, 7021, 146, 11883, 10602, 5189, 6094, 1403, 1804, 11667, 10552, 5672, 4499, 636, 5609, 8307, 2947, 3393, 7954, 2291, 3375, 8464, 4235, 8761, 7376, 6492, + 8330, 5959, 10141, 7350, 5115, 2442, 1248, 10344, 1029, 5724, 1325, 6691, 8945, 1892, 3624, 10767, 2151, 4119, 3343, 7681, 7126, 7287, 12269, 8342, 338, 9834, 5690, 1744, 1314, 8635, + 9395, 4167, 6085, 923, 11251, 6092, 10058, 12096, 2800, 11864, 1836, 11897, 2185, 1620, 375, 7711, 11912, 1942, 3408, 9714, 11124, 9513, 1178, 5478, 8778, 3276, 8951, 2212, 9615, 1392, + 5784, 1255, 11783, 1756, 8144, 8328, 8755, 6463, 2065, 7725, 3495, 10771, 8524, 8113, 7226, 2461, 10014, 5653, 8022, 11158, 1445, 7429, 11164, 1275, 6781, 1176, 5734, 12077, 6323, 9520, + 3114, 6302, 6693, 579, 3889, 10872, 6613, 4505, 10032, 5835, 9202, 7406, 8314, 5102, 11877, 6701, 6444, 2528, 9233, 4963, 8545, 3607, 10962, 7057, 8347, 11841, 11275, 7365, 7508, 4566, + 5836, 12221, 2260, 1535, 3200, 2717, 60, 4238, 11677, 4227, 3368, 11749, 12164, 1526, 4222, 6162, 4840, 8257, 3163, 7885, 346, 2068, 1389, 11197, 5209, 3359, 9084, 11825, 10361, 3678, + 4265, 9118, 7800, 10463, 9363, 9051, 8581, 11153, 8840, 5412, 8080, 9011, 6296, 3515, 11851, 1218, 5061, 1536, 1721, 9860, 4103, 10916, 2982, 11572, 3589, 9839, 10584, 11475, 11873, + 2110, 716, 5416, 2164, 1866, 5211, 7562, 11081, 10381, 7751, 11946, 3448 +}; + + +const int32_t psi_rev_ntt512_12289[512] = { + 8193, 493, 6845, 9908, 1378, 10377, 7952, 435, 10146, 1065, 404, 7644, 1207, 3248, 11121, 5277, 2437, 3646, 2987, 6022, 9867, 6250, 10102, 9723, 1002, 7278, 4284, 7201, 875, 3780, 1607, + 4976, 8146, 4714, 242, 1537, 3704, 9611, 5019, 545, 5084, 10657, 4885, 11272, 3066, 12262, 3763, 10849, 2912, 5698, 11935, 4861, 7277, 9808, 11244, 2859, 7188, 1067, 2401, 11847, 390, + 11516, 8511, 3833, 2780, 7094, 4895, 1484, 2305, 5042, 8236, 
2645, 7875, 9442, 2174, 7917, 1689, 3364, 4057, 3271, 10863, 4654, 1777, 10626, 3636, 7351, 9585, 6998, 160, 3149, 4437, + 12286, 10123, 3915, 7370, 12176, 4048, 2249, 2884, 1153, 9103, 6882, 2126, 10659, 3510, 5332, 2865, 9919, 9320, 8311, 9603, 9042, 3016, 12046, 9289, 11618, 7098, 3136, 9890, 3400, 2178, + 1544, 5559, 420, 8304, 4905, 476, 3531, 9326, 4896, 9923, 3051, 3091, 81, 1000, 4320, 1177, 8034, 9521, 10654, 11563, 7678, 10436, 12149, 3014, 9088, 5086, 1326, 11119, 2319, 11334, 790, + 2747, 7443, 3135, 3712, 1062, 9995, 7484, 8736, 9283, 2744, 11726, 2975, 9664, 949, 7468, 9650, 7266, 5828, 6561, 7698, 3328, 6512, 1351, 7311, 8155, 5736, 722, 10984, 4043, 7143, 10810, + 1, 8668, 2545, 3504, 8747, 11077, 1646, 9094, 5860, 1759, 8582, 3694, 7110, 8907, 11934, 8058, 9741, 9558, 3932, 5911, 4890, 3637, 8830, 5542, 12144, 5755, 7657, 7901, 11029, 11955, 9863, + 10861, 1696, 3284, 2881, 7197, 2089, 9000, 2013, 729, 9048, 11809, 2842, 11267, 9, 6498, 544, 2468, 339, 1381, 2525, 8112, 3584, 6958, 4989, 10616, 8011, 5374, 9452, 12159, 4354, 9893, + 7837, 3296, 8340, 7222, 2197, 118, 2476, 5767, 827, 8541, 11336, 8855, 8760, 9381, 218, 9928, 10446, 9259, 4115, 6142, 2447, 3963, 11713, 1954, 2051, 1805, 2882, 453, 6381, 11871, 8517, + 4774, 6860, 4737, 1293, 156, 9522, 8320, 3991, 5876, 2281, 10258, 6956, 1489, 2500, 1583, 6347, 11026, 12240, 6374, 1483, 350, 1512, 10474, 6906, 9087, 7796, 5369, 2057, 10314, 3757, + 9364, 11942, 7535, 10431, 426, 3315, 2738, 6421, 2655, 6554, 723, 174, 1693, 9280, 6099, 295, 5766, 11637, 8527, 2919, 8273, 8212, 3728, 8240, 6299, 1159, 1146, 11341, 11964, 10885, 5297, + 6170, 3956, 1360, 11089, 7105, 9734, 6167, 10695, 1962, 5106, 6328, 9597, 168, 7991, 8960, 6370, 7856, 3834, 5257, 10542, 9166, 9235, 5486, 6507, 1566, 2948, 9786, 11606, 9830, 8633, + 12225, 8049, 8719, 11454, 6224, 8243, 709, 1319, 9139, 1958, 7967, 10211, 11177, 8210, 1058, 11848, 11367, 11239, 7753, 5445, 3860, 9606, 1190, 8471, 6118, 3789, 147, 5456, 7840, 7540, 
+ 5537, 4789, 4467, 4075, 5315, 4324, 4916, 10120, 11767, 7210, 9027, 1973, 5574, 11011, 2344, 8775, 1041, 1018, 6364, 11821, 8301, 11907, 316, 6950, 5446, 6093, 3710, 10256, 3998, 10367, + 8410, 1254, 11316, 5435, 1359, 7083, 5529, 9090, 12233, 8724, 11635, 10587, 1987, 6427, 6136, 6874, 3643, 400, 1728, 4948, 6137, 5057, 7591, 3445, 7509, 2049, 7377, 10968, 192, 5241, 9369, + 9162, 8120, 787, 8807, 1010, 6821, 6415, 677, 6234, 3336, 12237, 9115, 1323, 2766, 12138, 10162, 8332, 9450, 2505, 5906, 10710, 11858, 4782, 6403, 9260, 5594, 8076, 11785, 605, 9987, 3600, + 3263, 7665, 6077, 421, 8209, 6068, 3602, 11286, 3532, 12048, 12231, 7280, 1956, 11404, 6008, 8851, 2844, 975, 4212, 5681, 8812, 12147, 11184 +}; + + +const int32_t omegainv_rev_ntt512_12289[512] = { + 8193, 11796, 2381, 5444, 11854, 4337, 1912, 10911, 7012, 1168, 9041, 11082, 4645, 11885, 11224, 2143, 7313, 10682, 8509, 11414, 5088, 8005, 5011, 11287, 2566, 2187, 6039, 2422, 6267, 9302, + 8643, 9852, 8456, 3778, 773, 11899, 442, 9888, 11222, 5101, 9430, 1045, 2481, 5012, 7428, 354, 6591, 9377, 1440, 8526, 27, 9223, 1017, 7404, 1632, 7205, 11744, 7270, 2678, 8585, 10752, + 12047, 7575, 4143, 8758, 11813, 7384, 3985, 11869, 6730, 10745, 10111, 8889, 2399, 9153, 5191, 671, 3000, 243, 9273, 3247, 2686, 3978, 2969, 2370, 9424, 6957, 8779, 1630, 10163, 5407, 3186, + 11136, 9405, 10040, 8241, 113, 4919, 8374, 2166, 3, 7852, 9140, 12129, 5291, 2704, 4938, 8653, 1663, 10512, 7635, 1426, 9018, 8232, 8925, 10600, 4372, 10115, 2847, 4414, 9644, 4053, 7247, + 9984, 10805, 7394, 5195, 9509, 953, 3748, 11462, 6522, 9813, 12171, 10092, 5067, 3949, 8993, 4452, 2396, 7935, 130, 2837, 6915, 4278, 1673, 7300, 5331, 8705, 4177, 9764, 10908, 11950, 9821, + 11745, 5791, 12280, 1022, 9447, 480, 3241, 11560, 10276, 3289, 10200, 5092, 9408, 9005, 10593, 1428, 2426, 334, 1260, 4388, 4632, 6534, 145, 6747, 3459, 8652, 7399, 6378, 8357, 2731, 2548, + 4231, 355, 3382, 5179, 8595, 3707, 10530, 6429, 3195, 10643, 1212, 3542, 8785, 
9744, 3621, 12288, 1479, 5146, 8246, 1305, 11567, 6553, 4134, 4978, 10938, 5777, 8961, 4591, 5728, 6461, 5023, + 2639, 4821, 11340, 2625, 9314, 563, 9545, 3006, 3553, 4805, 2294, 11227, 8577, 9154, 4846, 9542, 11499, 955, 9970, 1170, 10963, 7203, 3201, 9275, 140, 1853, 4611, 726, 1635, 2768, 4255, + 11112, 7969, 11289, 12208, 9198, 9238, 2366, 7393, 2963, 1105, 142, 3477, 6608, 8077, 11314, 9445, 3438, 6281, 885, 10333, 5009, 58, 241, 8757, 1003, 8687, 6221, 4080, 11868, 6212, 4624, + 9026, 8689, 2302, 11684, 504, 4213, 6695, 3029, 5886, 7507, 431, 1579, 6383, 9784, 2839, 3957, 2127, 151, 9523, 10966, 3174, 52, 8953, 6055, 11612, 5874, 5468, 11279, 3482, 11502, 4169, + 3127, 2920, 7048, 12097, 1321, 4912, 10240, 4780, 8844, 4698, 7232, 6152, 7341, 10561, 11889, 8646, 5415, 6153, 5862, 10302, 1702, 654, 3565, 56, 3199, 6760, 5206, 10930, 6854, 973, 11035, + 3879, 1922, 8291, 2033, 8579, 6196, 6843, 5339, 11973, 382, 3988, 468, 5925, 11271, 11248, 3514, 9945, 1278, 6715, 10316, 3262, 5079, 522, 2169, 7373, 7965, 6974, 8214, 7822, 7500, 6752, + 4749, 4449, 6833, 12142, 8500, 6171, 3818, 11099, 2683, 8429, 6844, 4536, 1050, 922, 441, 11231, 4079, 1112, 2078, 4322, 10331, 3150, 10970, 11580, 4046, 6065, 835, 3570, 4240, 64, 3656, + 2459, 683, 2503, 9341, 10723, 5782, 6803, 3054, 3123, 1747, 7032, 8455, 4433, 5919, 3329, 4298, 12121, 2692, 5961, 7183, 10327, 1594, 6122, 2555, 5184, 1200, 10929, 8333, 6119, 6992, 1404, + 325, 948, 11143, 11130, 5990, 4049, 8561, 4077, 4016, 9370, 3762, 652, 6523, 11994, 6190, 3009, 10596, 12115, 11566, 5735, 9634, 5868, 9551, 8974, 11863, 1858, 4754, 347, 2925, 8532, 1975, + 10232, 6920, 4493, 3202, 5383, 1815, 10777, 11939, 10806, 5915, 49, 1263, 5942, 10706, 9789, 10800, 5333, 2031, 10008, 6413, 8298, 3969, 2767, 12133, 10996, 7552, 5429, 7515, 3772, 418, 5908, + 11836, 9407, 10484, 10238, 10335, 576, 8326, 9842, 6147, 8174, 3030, 1843, 2361, 12071, 2908, 3529, 3434 +};