kit

kit
git clone https://git.ryansepassi.com/git/kit.git
Log | Files | Refs | README

monocypher.c (100271B)


      1 // Monocypher version 4.0.2
      2 //
      3 // This file is dual-licensed.  Choose whichever licence you want from
      4 // the two licences listed below.
      5 //
      6 // The first licence is a regular 2-clause BSD licence.  The second licence
      7 // is the CC-0 from Creative Commons. It is intended to release Monocypher
      8 // to the public domain.  The BSD licence serves as a fallback option.
      9 //
     10 // SPDX-License-Identifier: BSD-2-Clause OR CC0-1.0
     11 //
     12 // ------------------------------------------------------------------------
     13 //
     14 // Copyright (c) 2017-2020, Loup Vaillant
     15 // All rights reserved.
     16 //
     17 //
     18 // Redistribution and use in source and binary forms, with or without
     19 // modification, are permitted provided that the following conditions are
     20 // met:
     21 //
     22 // 1. Redistributions of source code must retain the above copyright
     23 //    notice, this list of conditions and the following disclaimer.
     24 //
     25 // 2. Redistributions in binary form must reproduce the above copyright
     26 //    notice, this list of conditions and the following disclaimer in the
     27 //    documentation and/or other materials provided with the
     28 //    distribution.
     29 //
     30 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
     31 // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
     32 // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
     33 // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
     34 // HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
     35 // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
     36 // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
     37 // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
     38 // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
     39 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
     40 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     41 //
     42 // ------------------------------------------------------------------------
     43 //
     44 // Written in 2017-2020 by Loup Vaillant
     45 //
     46 // To the extent possible under law, the author(s) have dedicated all copyright
     47 // and related neighboring rights to this software to the public domain
     48 // worldwide.  This software is distributed without any warranty.
     49 //
     50 // You should have received a copy of the CC0 Public Domain Dedication along
     51 // with this software.  If not, see
     52 // <https://creativecommons.org/publicdomain/zero/1.0/>
     53 
     54 #include "monocypher.h"
     55 
     56 #ifdef MONOCYPHER_CPP_NAMESPACE
     57 namespace MONOCYPHER_CPP_NAMESPACE {
     58 #endif
     59 
     60 /////////////////
     61 /// Utilities ///
     62 /////////////////
     63 #define FOR_T(type, i, start, end) for (type i = (start); i < (end); i++)
     64 #define FOR(i, start, end)         FOR_T(size_t, i, start, end)
     65 #define COPY(dst, src, size)       FOR(_i_, 0, size) (dst)[_i_] = (src)[_i_]
     66 #define ZERO(buf, size)            FOR(_i_, 0, size) (buf)[_i_] = 0
     67 #define WIPE_CTX(ctx)              crypto_wipe(ctx   , sizeof(*(ctx)))
     68 #define WIPE_BUFFER(buffer)        crypto_wipe(buffer, sizeof(buffer))
     69 #define MIN(a, b)                  ((a) <= (b) ? (a) : (b))
     70 #define MAX(a, b)                  ((a) >= (b) ? (a) : (b))
     71 
     72 typedef int8_t   i8;
     73 typedef uint8_t  u8;
     74 typedef int16_t  i16;
     75 typedef uint32_t u32;
     76 typedef int32_t  i32;
     77 typedef int64_t  i64;
     78 typedef uint64_t u64;
     79 
     80 static const u8 zero[128] = {0};
     81 
     82 // returns the smallest positive integer y such that
     83 // (x + y) % pow_2  == 0
     84 // Basically, y is the "gap" missing to align x.
     85 // Only works when pow_2 is a power of 2.
     86 // Note: we use ~x+1 instead of -x to avoid compiler warnings
     87 static size_t gap(size_t x, size_t pow_2)
     88 {
     89 	return (~x + 1) & (pow_2 - 1);
     90 }
     91 
     92 static u32 load24_le(const u8 s[3])
     93 {
     94 	return
     95 		((u32)s[0] <<  0) |
     96 		((u32)s[1] <<  8) |
     97 		((u32)s[2] << 16);
     98 }
     99 
    100 static u32 load32_le(const u8 s[4])
    101 {
    102 	return
    103 		((u32)s[0] <<  0) |
    104 		((u32)s[1] <<  8) |
    105 		((u32)s[2] << 16) |
    106 		((u32)s[3] << 24);
    107 }
    108 
    109 static u64 load64_le(const u8 s[8])
    110 {
    111 	return load32_le(s) | ((u64)load32_le(s+4) << 32);
    112 }
    113 
    114 static void store32_le(u8 out[4], u32 in)
    115 {
    116 	out[0] =  in        & 0xff;
    117 	out[1] = (in >>  8) & 0xff;
    118 	out[2] = (in >> 16) & 0xff;
    119 	out[3] = (in >> 24) & 0xff;
    120 }
    121 
    122 static void store64_le(u8 out[8], u64 in)
    123 {
    124 	store32_le(out    , (u32)in );
    125 	store32_le(out + 4, in >> 32);
    126 }
    127 
    128 static void load32_le_buf (u32 *dst, const u8 *src, size_t size) {
    129 	FOR(i, 0, size) { dst[i] = load32_le(src + i*4); }
    130 }
    131 static void load64_le_buf (u64 *dst, const u8 *src, size_t size) {
    132 	FOR(i, 0, size) { dst[i] = load64_le(src + i*8); }
    133 }
    134 static void store32_le_buf(u8 *dst, const u32 *src, size_t size) {
    135 	FOR(i, 0, size) { store32_le(dst + i*4, src[i]); }
    136 }
    137 static void store64_le_buf(u8 *dst, const u64 *src, size_t size) {
    138 	FOR(i, 0, size) { store64_le(dst + i*8, src[i]); }
    139 }
    140 
    141 static u64 rotr64(u64 x, u64 n) { return (x >> n) ^ (x << (64 - n)); }
    142 static u32 rotl32(u32 x, u32 n) { return (x << n) ^ (x >> (32 - n)); }
    143 
    144 static int neq0(u64 diff)
    145 {
    146 	// constant time comparison to zero
    147 	// return diff != 0 ? -1 : 0
    148 	u64 half = (diff >> 32) | ((u32)diff);
    149 	return (1 & ((half - 1) >> 32)) - 1;
    150 }
    151 
    152 static u64 x16(const u8 a[16], const u8 b[16])
    153 {
    154 	return (load64_le(a + 0) ^ load64_le(b + 0))
    155 		|  (load64_le(a + 8) ^ load64_le(b + 8));
    156 }
    157 static u64 x32(const u8 a[32],const u8 b[32]){return x16(a,b)| x16(a+16, b+16);}
    158 static u64 x64(const u8 a[64],const u8 b[64]){return x32(a,b)| x32(a+32, b+32);}
    159 int crypto_verify16(const u8 a[16], const u8 b[16]){ return neq0(x16(a, b)); }
    160 int crypto_verify32(const u8 a[32], const u8 b[32]){ return neq0(x32(a, b)); }
    161 int crypto_verify64(const u8 a[64], const u8 b[64]){ return neq0(x64(a, b)); }
    162 
    163 void crypto_wipe(void *secret, size_t size)
    164 {
    165 	volatile u8 *v_secret = (u8*)secret;
    166 	ZERO(v_secret, size);
    167 }
    168 
    169 /////////////////
    170 /// Chacha 20 ///
    171 /////////////////
    172 #define QUARTERROUND(a, b, c, d)	\
    173 	a += b;  d = rotl32(d ^ a, 16); \
    174 	c += d;  b = rotl32(b ^ c, 12); \
    175 	a += b;  d = rotl32(d ^ a,  8); \
    176 	c += d;  b = rotl32(b ^ c,  7)
    177 
    178 static void chacha20_rounds(u32 out[16], const u32 in[16])
    179 {
    180 	// The temporary variables make Chacha20 10% faster.
    181 	u32 t0  = in[ 0];  u32 t1  = in[ 1];  u32 t2  = in[ 2];  u32 t3  = in[ 3];
    182 	u32 t4  = in[ 4];  u32 t5  = in[ 5];  u32 t6  = in[ 6];  u32 t7  = in[ 7];
    183 	u32 t8  = in[ 8];  u32 t9  = in[ 9];  u32 t10 = in[10];  u32 t11 = in[11];
    184 	u32 t12 = in[12];  u32 t13 = in[13];  u32 t14 = in[14];  u32 t15 = in[15];
    185 
    186 	FOR (i, 0, 10) { // 20 rounds, 2 rounds per loop.
    187 		QUARTERROUND(t0, t4, t8 , t12); // column 0
    188 		QUARTERROUND(t1, t5, t9 , t13); // column 1
    189 		QUARTERROUND(t2, t6, t10, t14); // column 2
    190 		QUARTERROUND(t3, t7, t11, t15); // column 3
    191 		QUARTERROUND(t0, t5, t10, t15); // diagonal 0
    192 		QUARTERROUND(t1, t6, t11, t12); // diagonal 1
    193 		QUARTERROUND(t2, t7, t8 , t13); // diagonal 2
    194 		QUARTERROUND(t3, t4, t9 , t14); // diagonal 3
    195 	}
    196 	out[ 0] = t0;   out[ 1] = t1;   out[ 2] = t2;   out[ 3] = t3;
    197 	out[ 4] = t4;   out[ 5] = t5;   out[ 6] = t6;   out[ 7] = t7;
    198 	out[ 8] = t8;   out[ 9] = t9;   out[10] = t10;  out[11] = t11;
    199 	out[12] = t12;  out[13] = t13;  out[14] = t14;  out[15] = t15;
    200 }
    201 
    202 static const u8 *chacha20_constant = (const u8*)"expand 32-byte k"; // 16 bytes
    203 
    204 void crypto_chacha20_h(u8 out[32], const u8 key[32], const u8 in [16])
    205 {
    206 	u32 block[16];
    207 	load32_le_buf(block     , chacha20_constant, 4);
    208 	load32_le_buf(block +  4, key              , 8);
    209 	load32_le_buf(block + 12, in               , 4);
    210 
    211 	chacha20_rounds(block, block);
    212 
    213 	// prevent reversal of the rounds by revealing only half of the buffer.
    214 	store32_le_buf(out   , block   , 4); // constant
    215 	store32_le_buf(out+16, block+12, 4); // counter and nonce
    216 	WIPE_BUFFER(block);
    217 }
    218 
    219 u64 crypto_chacha20_djb(u8 *cipher_text, const u8 *plain_text,
    220                         size_t text_size, const u8 key[32], const u8 nonce[8],
    221                         u64 ctr)
    222 {
    223 	u32 input[16];
    224 	load32_le_buf(input     , chacha20_constant, 4);
    225 	load32_le_buf(input +  4, key              , 8);
    226 	load32_le_buf(input + 14, nonce            , 2);
    227 	input[12] = (u32) ctr;
    228 	input[13] = (u32)(ctr >> 32);
    229 
    230 	// Whole blocks
    231 	u32    pool[16];
    232 	size_t nb_blocks = text_size >> 6;
    233 	FOR (i, 0, nb_blocks) {
    234 		chacha20_rounds(pool, input);
    235 		if (plain_text != 0) {
    236 			FOR (j, 0, 16) {
    237 				u32 p = pool[j] + input[j];
    238 				store32_le(cipher_text, p ^ load32_le(plain_text));
    239 				cipher_text += 4;
    240 				plain_text  += 4;
    241 			}
    242 		} else {
    243 			FOR (j, 0, 16) {
    244 				u32 p = pool[j] + input[j];
    245 				store32_le(cipher_text, p);
    246 				cipher_text += 4;
    247 			}
    248 		}
    249 		input[12]++;
    250 		if (input[12] == 0) {
    251 			input[13]++;
    252 		}
    253 	}
    254 	text_size &= 63;
    255 
    256 	// Last (incomplete) block
    257 	if (text_size > 0) {
    258 		if (plain_text == 0) {
    259 			plain_text = zero;
    260 		}
    261 		chacha20_rounds(pool, input);
    262 		u8 tmp[64];
    263 		FOR (i, 0, 16) {
    264 			store32_le(tmp + i*4, pool[i] + input[i]);
    265 		}
    266 		FOR (i, 0, text_size) {
    267 			cipher_text[i] = tmp[i] ^ plain_text[i];
    268 		}
    269 		WIPE_BUFFER(tmp);
    270 	}
    271 	ctr = input[12] + ((u64)input[13] << 32) + (text_size > 0);
    272 
    273 	WIPE_BUFFER(pool);
    274 	WIPE_BUFFER(input);
    275 	return ctr;
    276 }
    277 
    278 u32 crypto_chacha20_ietf(u8 *cipher_text, const u8 *plain_text,
    279                          size_t text_size,
    280                          const u8 key[32], const u8 nonce[12], u32 ctr)
    281 {
    282 	u64 big_ctr = ctr + ((u64)load32_le(nonce) << 32);
    283 	return (u32)crypto_chacha20_djb(cipher_text, plain_text, text_size,
    284 	                                key, nonce + 4, big_ctr);
    285 }
    286 
    287 u64 crypto_chacha20_x(u8 *cipher_text, const u8 *plain_text,
    288                       size_t text_size,
    289                       const u8 key[32], const u8 nonce[24], u64 ctr)
    290 {
    291 	u8 sub_key[32];
    292 	crypto_chacha20_h(sub_key, key, nonce);
    293 	ctr = crypto_chacha20_djb(cipher_text, plain_text, text_size,
    294 	                          sub_key, nonce + 16, ctr);
    295 	WIPE_BUFFER(sub_key);
    296 	return ctr;
    297 }
    298 
    299 /////////////////
    300 /// Poly 1305 ///
    301 /////////////////
    302 
    303 // h = (h + c) * r
    304 // preconditions:
    305 //   ctx->h <= 4_ffffffff_ffffffff_ffffffff_ffffffff
    306 //   ctx->r <=   0ffffffc_0ffffffc_0ffffffc_0fffffff
    307 //   end    <= 1
    308 // Postcondition:
    309 //   ctx->h <= 4_ffffffff_ffffffff_ffffffff_ffffffff
    310 static void poly_blocks(crypto_poly1305_ctx *ctx, const u8 *in,
    311                         size_t nb_blocks, unsigned end)
    312 {
    313 	// Local all the things!
    314 	const u32 r0 = ctx->r[0];
    315 	const u32 r1 = ctx->r[1];
    316 	const u32 r2 = ctx->r[2];
    317 	const u32 r3 = ctx->r[3];
    318 	const u32 rr0 = (r0 >> 2) * 5;  // lose 2 bits...
    319 	const u32 rr1 = (r1 >> 2) + r1; // rr1 == (r1 >> 2) * 5
    320 	const u32 rr2 = (r2 >> 2) + r2; // rr1 == (r2 >> 2) * 5
    321 	const u32 rr3 = (r3 >> 2) + r3; // rr1 == (r3 >> 2) * 5
    322 	const u32 rr4 = r0 & 3;         // ...recover 2 bits
    323 	u32 h0 = ctx->h[0];
    324 	u32 h1 = ctx->h[1];
    325 	u32 h2 = ctx->h[2];
    326 	u32 h3 = ctx->h[3];
    327 	u32 h4 = ctx->h[4];
    328 
    329 	FOR (i, 0, nb_blocks) {
    330 		// h + c, without carry propagation
    331 		const u64 s0 = (u64)h0 + load32_le(in);  in += 4;
    332 		const u64 s1 = (u64)h1 + load32_le(in);  in += 4;
    333 		const u64 s2 = (u64)h2 + load32_le(in);  in += 4;
    334 		const u64 s3 = (u64)h3 + load32_le(in);  in += 4;
    335 		const u32 s4 =      h4 + end;
    336 
    337 		// (h + c) * r, without carry propagation
    338 		const u64 x0 = s0*r0+ s1*rr3+ s2*rr2+ s3*rr1+ s4*rr0;
    339 		const u64 x1 = s0*r1+ s1*r0 + s2*rr3+ s3*rr2+ s4*rr1;
    340 		const u64 x2 = s0*r2+ s1*r1 + s2*r0 + s3*rr3+ s4*rr2;
    341 		const u64 x3 = s0*r3+ s1*r2 + s2*r1 + s3*r0 + s4*rr3;
    342 		const u32 x4 =                                s4*rr4;
    343 
    344 		// partial reduction modulo 2^130 - 5
    345 		const u32 u5 = x4 + (x3 >> 32); // u5 <= 7ffffff5
    346 		const u64 u0 = (u5 >>  2) * 5 + (x0 & 0xffffffff);
    347 		const u64 u1 = (u0 >> 32)     + (x1 & 0xffffffff) + (x0 >> 32);
    348 		const u64 u2 = (u1 >> 32)     + (x2 & 0xffffffff) + (x1 >> 32);
    349 		const u64 u3 = (u2 >> 32)     + (x3 & 0xffffffff) + (x2 >> 32);
    350 		const u32 u4 = (u3 >> 32)     + (u5 & 3); // u4 <= 4
    351 
    352 		// Update the hash
    353 		h0 = u0 & 0xffffffff;
    354 		h1 = u1 & 0xffffffff;
    355 		h2 = u2 & 0xffffffff;
    356 		h3 = u3 & 0xffffffff;
    357 		h4 = u4;
    358 	}
    359 	ctx->h[0] = h0;
    360 	ctx->h[1] = h1;
    361 	ctx->h[2] = h2;
    362 	ctx->h[3] = h3;
    363 	ctx->h[4] = h4;
    364 }
    365 
    366 void crypto_poly1305_init(crypto_poly1305_ctx *ctx, const u8 key[32])
    367 {
    368 	ZERO(ctx->h, 5); // Initial hash is zero
    369 	ctx->c_idx = 0;
    370 	// load r and pad (r has some of its bits cleared)
    371 	load32_le_buf(ctx->r  , key   , 4);
    372 	load32_le_buf(ctx->pad, key+16, 4);
    373 	FOR (i, 0, 1) { ctx->r[i] &= 0x0fffffff; }
    374 	FOR (i, 1, 4) { ctx->r[i] &= 0x0ffffffc; }
    375 }
    376 
    377 void crypto_poly1305_update(crypto_poly1305_ctx *ctx,
    378                             const u8 *message, size_t message_size)
    379 {
    380 	// Avoid undefined NULL pointer increments with empty messages
    381 	if (message_size == 0) {
    382 		return;
    383 	}
    384 
    385 	// Align ourselves with block boundaries
    386 	size_t aligned = MIN(gap(ctx->c_idx, 16), message_size);
    387 	FOR (i, 0, aligned) {
    388 		ctx->c[ctx->c_idx] = *message;
    389 		ctx->c_idx++;
    390 		message++;
    391 		message_size--;
    392 	}
    393 
    394 	// If block is complete, process it
    395 	if (ctx->c_idx == 16) {
    396 		poly_blocks(ctx, ctx->c, 1, 1);
    397 		ctx->c_idx = 0;
    398 	}
    399 
    400 	// Process the message block by block
    401 	size_t nb_blocks = message_size >> 4;
    402 	poly_blocks(ctx, message, nb_blocks, 1);
    403 	message      += nb_blocks << 4;
    404 	message_size &= 15;
    405 
    406 	// remaining bytes (we never complete a block here)
    407 	FOR (i, 0, message_size) {
    408 		ctx->c[ctx->c_idx] = message[i];
    409 		ctx->c_idx++;
    410 	}
    411 }
    412 
    413 void crypto_poly1305_final(crypto_poly1305_ctx *ctx, u8 mac[16])
    414 {
    415 	// Process the last block (if any)
    416 	// We move the final 1 according to remaining input length
    417 	// (this will add less than 2^130 to the last input block)
    418 	if (ctx->c_idx != 0) {
    419 		ZERO(ctx->c + ctx->c_idx, 16 - ctx->c_idx);
    420 		ctx->c[ctx->c_idx] = 1;
    421 		poly_blocks(ctx, ctx->c, 1, 0);
    422 	}
    423 
    424 	// check if we should subtract 2^130-5 by performing the
    425 	// corresponding carry propagation.
    426 	u64 c = 5;
    427 	FOR (i, 0, 4) {
    428 		c  += ctx->h[i];
    429 		c >>= 32;
    430 	}
    431 	c += ctx->h[4];
    432 	c  = (c >> 2) * 5; // shift the carry back to the beginning
    433 	// c now indicates how many times we should subtract 2^130-5 (0 or 1)
    434 	FOR (i, 0, 4) {
    435 		c += (u64)ctx->h[i] + ctx->pad[i];
    436 		store32_le(mac + i*4, (u32)c);
    437 		c = c >> 32;
    438 	}
    439 	WIPE_CTX(ctx);
    440 }
    441 
    442 void crypto_poly1305(u8     mac[16],  const u8 *message,
    443                      size_t message_size, const u8  key[32])
    444 {
    445 	crypto_poly1305_ctx ctx;
    446 	crypto_poly1305_init  (&ctx, key);
    447 	crypto_poly1305_update(&ctx, message, message_size);
    448 	crypto_poly1305_final (&ctx, mac);
    449 }
    450 
    451 ////////////////
    452 /// BLAKE2 b ///
    453 ////////////////
    454 static const u64 iv[8] = {
    455 	0x6a09e667f3bcc908, 0xbb67ae8584caa73b,
    456 	0x3c6ef372fe94f82b, 0xa54ff53a5f1d36f1,
    457 	0x510e527fade682d1, 0x9b05688c2b3e6c1f,
    458 	0x1f83d9abfb41bd6b, 0x5be0cd19137e2179,
    459 };
    460 
    461 static void blake2b_compress(crypto_blake2b_ctx *ctx, int is_last_block)
    462 {
    463 	static const u8 sigma[12][16] = {
    464 		{  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15 },
    465 		{ 14, 10,  4,  8,  9, 15, 13,  6,  1, 12,  0,  2, 11,  7,  5,  3 },
    466 		{ 11,  8, 12,  0,  5,  2, 15, 13, 10, 14,  3,  6,  7,  1,  9,  4 },
    467 		{  7,  9,  3,  1, 13, 12, 11, 14,  2,  6,  5, 10,  4,  0, 15,  8 },
    468 		{  9,  0,  5,  7,  2,  4, 10, 15, 14,  1, 11, 12,  6,  8,  3, 13 },
    469 		{  2, 12,  6, 10,  0, 11,  8,  3,  4, 13,  7,  5, 15, 14,  1,  9 },
    470 		{ 12,  5,  1, 15, 14, 13,  4, 10,  0,  7,  6,  3,  9,  2,  8, 11 },
    471 		{ 13, 11,  7, 14, 12,  1,  3,  9,  5,  0, 15,  4,  8,  6,  2, 10 },
    472 		{  6, 15, 14,  9, 11,  3,  0,  8, 12,  2, 13,  7,  1,  4, 10,  5 },
    473 		{ 10,  2,  8,  4,  7,  6,  1,  5, 15, 11,  9, 14,  3, 12, 13,  0 },
    474 		{  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15 },
    475 		{ 14, 10,  4,  8,  9, 15, 13,  6,  1, 12,  0,  2, 11,  7,  5,  3 },
    476 	};
    477 
    478 	// increment input offset
    479 	u64   *x = ctx->input_offset;
    480 	size_t y = ctx->input_idx;
    481 	x[0] += y;
    482 	if (x[0] < y) {
    483 		x[1]++;
    484 	}
    485 
    486 	// init work vector
    487 	u64 v0 = ctx->hash[0];  u64 v8  = iv[0];
    488 	u64 v1 = ctx->hash[1];  u64 v9  = iv[1];
    489 	u64 v2 = ctx->hash[2];  u64 v10 = iv[2];
    490 	u64 v3 = ctx->hash[3];  u64 v11 = iv[3];
    491 	u64 v4 = ctx->hash[4];  u64 v12 = iv[4] ^ ctx->input_offset[0];
    492 	u64 v5 = ctx->hash[5];  u64 v13 = iv[5] ^ ctx->input_offset[1];
    493 	u64 v6 = ctx->hash[6];  u64 v14 = iv[6] ^ (u64)~(is_last_block - 1);
    494 	u64 v7 = ctx->hash[7];  u64 v15 = iv[7];
    495 
    496 	// mangle work vector
    497 	u64 *input = ctx->input;
    498 #define BLAKE2_G(a, b, c, d, x, y)	\
    499 	a += b + x;  d = rotr64(d ^ a, 32); \
    500 	c += d;      b = rotr64(b ^ c, 24); \
    501 	a += b + y;  d = rotr64(d ^ a, 16); \
    502 	c += d;      b = rotr64(b ^ c, 63)
    503 #define BLAKE2_ROUND(i)	\
    504 	BLAKE2_G(v0, v4, v8 , v12, input[sigma[i][ 0]], input[sigma[i][ 1]]); \
    505 	BLAKE2_G(v1, v5, v9 , v13, input[sigma[i][ 2]], input[sigma[i][ 3]]); \
    506 	BLAKE2_G(v2, v6, v10, v14, input[sigma[i][ 4]], input[sigma[i][ 5]]); \
    507 	BLAKE2_G(v3, v7, v11, v15, input[sigma[i][ 6]], input[sigma[i][ 7]]); \
    508 	BLAKE2_G(v0, v5, v10, v15, input[sigma[i][ 8]], input[sigma[i][ 9]]); \
    509 	BLAKE2_G(v1, v6, v11, v12, input[sigma[i][10]], input[sigma[i][11]]); \
    510 	BLAKE2_G(v2, v7, v8 , v13, input[sigma[i][12]], input[sigma[i][13]]); \
    511 	BLAKE2_G(v3, v4, v9 , v14, input[sigma[i][14]], input[sigma[i][15]])
    512 
    513 #ifdef BLAKE2_NO_UNROLLING
    514 	FOR (i, 0, 12) {
    515 		BLAKE2_ROUND(i);
    516 	}
    517 #else
    518 	BLAKE2_ROUND(0);  BLAKE2_ROUND(1);  BLAKE2_ROUND(2);  BLAKE2_ROUND(3);
    519 	BLAKE2_ROUND(4);  BLAKE2_ROUND(5);  BLAKE2_ROUND(6);  BLAKE2_ROUND(7);
    520 	BLAKE2_ROUND(8);  BLAKE2_ROUND(9);  BLAKE2_ROUND(10); BLAKE2_ROUND(11);
    521 #endif
    522 
    523 	// update hash
    524 	ctx->hash[0] ^= v0 ^ v8;   ctx->hash[1] ^= v1 ^ v9;
    525 	ctx->hash[2] ^= v2 ^ v10;  ctx->hash[3] ^= v3 ^ v11;
    526 	ctx->hash[4] ^= v4 ^ v12;  ctx->hash[5] ^= v5 ^ v13;
    527 	ctx->hash[6] ^= v6 ^ v14;  ctx->hash[7] ^= v7 ^ v15;
    528 }
    529 
    530 void crypto_blake2b_keyed_init(crypto_blake2b_ctx *ctx, size_t hash_size,
    531                                const u8 *key, size_t key_size)
    532 {
    533 	// initial hash
    534 	COPY(ctx->hash, iv, 8);
    535 	ctx->hash[0] ^= 0x01010000 ^ (key_size << 8) ^ hash_size;
    536 
    537 	ctx->input_offset[0] = 0;  // beginning of the input, no offset
    538 	ctx->input_offset[1] = 0;  // beginning of the input, no offset
    539 	ctx->hash_size       = hash_size;
    540 	ctx->input_idx       = 0;
    541 	ZERO(ctx->input, 16);
    542 
    543 	// if there is a key, the first block is that key (padded with zeroes)
    544 	if (key_size > 0) {
    545 		u8 key_block[128] = {0};
    546 		COPY(key_block, key, key_size);
    547 		// same as calling crypto_blake2b_update(ctx, key_block , 128)
    548 		load64_le_buf(ctx->input, key_block, 16);
    549 		ctx->input_idx = 128;
    550 	}
    551 }
    552 
    553 void crypto_blake2b_init(crypto_blake2b_ctx *ctx, size_t hash_size)
    554 {
    555 	crypto_blake2b_keyed_init(ctx, hash_size, 0, 0);
    556 }
    557 
    558 void crypto_blake2b_update(crypto_blake2b_ctx *ctx,
    559                            const u8 *message, size_t message_size)
    560 {
    561 	// Avoid undefined NULL pointer increments with empty messages
    562 	if (message_size == 0) {
    563 		return;
    564 	}
    565 
    566 	// Align with word boundaries
    567 	if ((ctx->input_idx & 7) != 0) {
    568 		size_t nb_bytes = MIN(gap(ctx->input_idx, 8), message_size);
    569 		size_t word     = ctx->input_idx >> 3;
    570 		size_t byte     = ctx->input_idx & 7;
    571 		FOR (i, 0, nb_bytes) {
    572 			ctx->input[word] |= (u64)message[i] << ((byte + i) << 3);
    573 		}
    574 		ctx->input_idx += nb_bytes;
    575 		message        += nb_bytes;
    576 		message_size   -= nb_bytes;
    577 	}
    578 
    579 	// Align with block boundaries (faster than byte by byte)
    580 	if ((ctx->input_idx & 127) != 0) {
    581 		size_t nb_words = MIN(gap(ctx->input_idx, 128), message_size) >> 3;
    582 		load64_le_buf(ctx->input + (ctx->input_idx >> 3), message, nb_words);
    583 		ctx->input_idx += nb_words << 3;
    584 		message        += nb_words << 3;
    585 		message_size   -= nb_words << 3;
    586 	}
    587 
    588 	// Process block by block
    589 	size_t nb_blocks = message_size >> 7;
    590 	FOR (i, 0, nb_blocks) {
    591 		if (ctx->input_idx == 128) {
    592 			blake2b_compress(ctx, 0);
    593 		}
    594 		load64_le_buf(ctx->input, message, 16);
    595 		message += 128;
    596 		ctx->input_idx = 128;
    597 	}
    598 	message_size &= 127;
    599 
    600 	if (message_size != 0) {
    601 		// Compress block & flush input buffer as needed
    602 		if (ctx->input_idx == 128) {
    603 			blake2b_compress(ctx, 0);
    604 			ctx->input_idx = 0;
    605 		}
    606 		if (ctx->input_idx == 0) {
    607 			ZERO(ctx->input, 16);
    608 		}
    609 		// Fill remaining words (faster than byte by byte)
    610 		size_t nb_words = message_size >> 3;
    611 		load64_le_buf(ctx->input, message, nb_words);
    612 		ctx->input_idx += nb_words << 3;
    613 		message        += nb_words << 3;
    614 		message_size   -= nb_words << 3;
    615 
    616 		// Fill remaining bytes
    617 		FOR (i, 0, message_size) {
    618 			size_t word = ctx->input_idx >> 3;
    619 			size_t byte = ctx->input_idx & 7;
    620 			ctx->input[word] |= (u64)message[i] << (byte << 3);
    621 			ctx->input_idx++;
    622 		}
    623 	}
    624 }
    625 
    626 void crypto_blake2b_final(crypto_blake2b_ctx *ctx, u8 *hash)
    627 {
    628 	blake2b_compress(ctx, 1); // compress the last block
    629 	size_t hash_size = MIN(ctx->hash_size, 64);
    630 	size_t nb_words  = hash_size >> 3;
    631 	store64_le_buf(hash, ctx->hash, nb_words);
    632 	FOR (i, nb_words << 3, hash_size) {
    633 		hash[i] = (ctx->hash[i >> 3] >> (8 * (i & 7))) & 0xff;
    634 	}
    635 	WIPE_CTX(ctx);
    636 }
    637 
    638 void crypto_blake2b_keyed(u8 *hash,          size_t hash_size,
    639                           const u8 *key,     size_t key_size,
    640                           const u8 *message, size_t message_size)
    641 {
    642 	crypto_blake2b_ctx ctx;
    643 	crypto_blake2b_keyed_init(&ctx, hash_size, key, key_size);
    644 	crypto_blake2b_update    (&ctx, message, message_size);
    645 	crypto_blake2b_final     (&ctx, hash);
    646 }
    647 
    648 void crypto_blake2b(u8 *hash, size_t hash_size, const u8 *msg, size_t msg_size)
    649 {
    650 	crypto_blake2b_keyed(hash, hash_size, 0, 0, msg, msg_size);
    651 }
    652 
    653 //////////////
    654 /// Argon2 ///
    655 //////////////
    656 // references to R, Z, Q etc. come from the spec
    657 
    658 // Argon2 operates on 1024 byte blocks.
    659 typedef struct { u64 a[128]; } blk;
    660 
    661 // updates a BLAKE2 hash with a 32 bit word, little endian.
    662 static void blake_update_32(crypto_blake2b_ctx *ctx, u32 input)
    663 {
    664 	u8 buf[4];
    665 	store32_le(buf, input);
    666 	crypto_blake2b_update(ctx, buf, 4);
    667 	WIPE_BUFFER(buf);
    668 }
    669 
    670 static void blake_update_32_buf(crypto_blake2b_ctx *ctx,
    671                                 const u8 *buf, u32 size)
    672 {
    673 	blake_update_32(ctx, size);
    674 	crypto_blake2b_update(ctx, buf, size);
    675 }
    676 
    677 
    678 static void copy_block(blk *o,const blk*in){FOR(i, 0, 128) o->a[i]  = in->a[i];}
    679 static void  xor_block(blk *o,const blk*in){FOR(i, 0, 128) o->a[i] ^= in->a[i];}
    680 
    681 // Hash with a virtually unlimited digest size.
    682 // Doesn't extract more entropy than the base hash function.
    683 // Mainly used for filling a whole kilobyte block with pseudo-random bytes.
    684 // (One could use a stream cipher with a seed hash as the key, but
    685 //  this would introduce another dependency —and point of failure.)
    686 static void extended_hash(u8       *digest, u32 digest_size,
    687                           const u8 *input , u32 input_size)
    688 {
    689 	crypto_blake2b_ctx ctx;
    690 	crypto_blake2b_init  (&ctx, MIN(digest_size, 64));
    691 	blake_update_32      (&ctx, digest_size);
    692 	crypto_blake2b_update(&ctx, input, input_size);
    693 	crypto_blake2b_final (&ctx, digest);
    694 
    695 	if (digest_size > 64) {
    696 		// the conversion to u64 avoids integer overflow on
    697 		// ludicrously big hash sizes.
    698 		u32 r   = (u32)(((u64)digest_size + 31) >> 5) - 2;
    699 		u32 i   =  1;
    700 		u32 in  =  0;
    701 		u32 out = 32;
    702 		while (i < r) {
    703 			// Input and output overlap. This is intentional
    704 			crypto_blake2b(digest + out, 64, digest + in, 64);
    705 			i   +=  1;
    706 			in  += 32;
    707 			out += 32;
    708 		}
    709 		crypto_blake2b(digest + out, digest_size - (32 * r), digest + in , 64);
    710 	}
    711 }
    712 
    713 #define LSB(x) ((u64)(u32)x)
    714 #define G(a, b, c, d)	\
    715 	a += b + ((LSB(a) * LSB(b)) << 1);  d ^= a;  d = rotr64(d, 32); \
    716 	c += d + ((LSB(c) * LSB(d)) << 1);  b ^= c;  b = rotr64(b, 24); \
    717 	a += b + ((LSB(a) * LSB(b)) << 1);  d ^= a;  d = rotr64(d, 16); \
    718 	c += d + ((LSB(c) * LSB(d)) << 1);  b ^= c;  b = rotr64(b, 63)
    719 #define ROUND(v0,  v1,  v2,  v3,  v4,  v5,  v6,  v7,	\
    720               v8,  v9, v10, v11, v12, v13, v14, v15)	\
    721 	G(v0, v4,  v8, v12);  G(v1, v5,  v9, v13); \
    722 	G(v2, v6, v10, v14);  G(v3, v7, v11, v15); \
    723 	G(v0, v5, v10, v15);  G(v1, v6, v11, v12); \
    724 	G(v2, v7,  v8, v13);  G(v3, v4,  v9, v14)
    725 
    726 // Core of the compression function G.  Computes Z from R in place.
    727 static void g_rounds(blk *b)
    728 {
    729 	// column rounds (work_block = Q)
    730 	for (int i = 0; i < 128; i += 16) {
    731 		ROUND(b->a[i   ], b->a[i+ 1], b->a[i+ 2], b->a[i+ 3],
    732 		      b->a[i+ 4], b->a[i+ 5], b->a[i+ 6], b->a[i+ 7],
    733 		      b->a[i+ 8], b->a[i+ 9], b->a[i+10], b->a[i+11],
    734 		      b->a[i+12], b->a[i+13], b->a[i+14], b->a[i+15]);
    735 	}
    736 	// row rounds (b = Z)
    737 	for (int i = 0; i < 16; i += 2) {
    738 		ROUND(b->a[i   ], b->a[i+ 1], b->a[i+ 16], b->a[i+ 17],
    739 		      b->a[i+32], b->a[i+33], b->a[i+ 48], b->a[i+ 49],
    740 		      b->a[i+64], b->a[i+65], b->a[i+ 80], b->a[i+ 81],
    741 		      b->a[i+96], b->a[i+97], b->a[i+112], b->a[i+113]);
    742 	}
    743 }
    744 
    745 const crypto_argon2_extras crypto_argon2_no_extras = { 0, 0, 0, 0 };
    746 
    747 void crypto_argon2(u8 *hash, u32 hash_size, void *work_area,
    748                    crypto_argon2_config config,
    749                    crypto_argon2_inputs inputs,
    750                    crypto_argon2_extras extras)
    751 {
    752 	const u32 segment_size = config.nb_blocks / config.nb_lanes / 4;
    753 	const u32 lane_size    = segment_size * 4;
    754 	const u32 nb_blocks    = lane_size * config.nb_lanes; // rounding down
    755 
    756 	// work area seen as blocks (must be suitably aligned)
    757 	blk *blocks = (blk*)work_area;
    758 	{
    759 		u8 initial_hash[72]; // 64 bytes plus 2 words for future hashes
    760 		crypto_blake2b_ctx ctx;
    761 		crypto_blake2b_init (&ctx, 64);
    762 		blake_update_32     (&ctx, config.nb_lanes ); // p: number of "threads"
    763 		blake_update_32     (&ctx, hash_size);
    764 		blake_update_32     (&ctx, config.nb_blocks);
    765 		blake_update_32     (&ctx, config.nb_passes);
    766 		blake_update_32     (&ctx, 0x13);             // v: version number
    767 		blake_update_32     (&ctx, config.algorithm); // y: Argon2i, Argon2d...
    768 		blake_update_32_buf (&ctx, inputs.pass, inputs.pass_size);
    769 		blake_update_32_buf (&ctx, inputs.salt, inputs.salt_size);
    770 		blake_update_32_buf (&ctx, extras.key,  extras.key_size);
    771 		blake_update_32_buf (&ctx, extras.ad,   extras.ad_size);
    772 		crypto_blake2b_final(&ctx, initial_hash); // fill 64 first bytes only
    773 
    774 		// fill first 2 blocks of each lane
    775 		u8 hash_area[1024];
    776 		FOR_T(u32, l, 0, config.nb_lanes) {
    777 			FOR_T(u32, i, 0, 2) {
    778 				store32_le(initial_hash + 64, i); // first  additional word
    779 				store32_le(initial_hash + 68, l); // second additional word
    780 				extended_hash(hash_area, 1024, initial_hash, 72);
    781 				load64_le_buf(blocks[l * lane_size + i].a, hash_area, 128);
    782 			}
    783 		}
    784 
    785 		WIPE_BUFFER(initial_hash);
    786 		WIPE_BUFFER(hash_area);
    787 	}
    788 
    789 	// Argon2i and Argon2id start with constant time indexing
    790 	int constant_time = config.algorithm != CRYPTO_ARGON2_D;
    791 
    792 	// Fill (and re-fill) the rest of the blocks
    793 	//
    794 	// Note: even though each segment within the same slice can be
    795 	// computed in parallel, (one thread per lane), we are computing
    796 	// them sequentially, because Monocypher doesn't support threads.
    797 	//
    798 	// Yet optimal performance (and therefore security) requires one
    799 	// thread per lane. The only reason Monocypher supports multiple
    800 	// lanes is compatibility.
    801 	blk tmp;
    802 	FOR_T(u32, pass, 0, config.nb_passes) {
    803 		FOR_T(u32, slice, 0, 4) {
    804 			// On the first slice of the first pass,
    805 			// blocks 0 and 1 are already filled, hence pass_offset.
    806 			u32 pass_offset  = pass == 0 && slice == 0 ? 2 : 0;
    807 			u32 slice_offset = slice * segment_size;
    808 
    809 			// Argon2id switches back to non-constant time indexing
    810 			// after the first two slices of the first pass
    811 			if (slice == 2 && config.algorithm == CRYPTO_ARGON2_ID) {
    812 				constant_time = 0;
    813 			}
    814 
    815 			// Each iteration of the following loop may be performed in
    816 			// a separate thread.  All segments must be fully completed
    817 			// before we start filling the next slice.
    818 			FOR_T(u32, segment, 0, config.nb_lanes) {
    819 				blk index_block;
    820 				u32 index_ctr = 1;
    821 				FOR_T (u32, block, pass_offset, segment_size) {
    822 					// Current and previous blocks
    823 					u32  lane_offset   = segment * lane_size;
    824 					blk *segment_start = blocks + lane_offset + slice_offset;
    825 					blk *current       = segment_start + block;
    826 					blk *previous      =
    827 						block == 0 && slice_offset == 0
    828 						? segment_start + lane_size - 1
    829 						: segment_start + block - 1;
    830 
    831 					u64 index_seed;
    832 					if (constant_time) {
    833 						if (block == pass_offset || (block % 128) == 0) {
    834 							// Fill or refresh deterministic indices block
    835 
    836 							// seed the beginning of the block...
    837 							ZERO(index_block.a, 128);
    838 							index_block.a[0] = pass;
    839 							index_block.a[1] = segment;
    840 							index_block.a[2] = slice;
    841 							index_block.a[3] = nb_blocks;
    842 							index_block.a[4] = config.nb_passes;
    843 							index_block.a[5] = config.algorithm;
    844 							index_block.a[6] = index_ctr;
    845 							index_ctr++;
    846 
    847 							// ... then shuffle it
    848 							copy_block(&tmp, &index_block);
    849 							g_rounds  (&index_block);
    850 							xor_block (&index_block, &tmp);
    851 							copy_block(&tmp, &index_block);
    852 							g_rounds  (&index_block);
    853 							xor_block (&index_block, &tmp);
    854 						}
    855 						index_seed = index_block.a[block % 128];
    856 					} else {
    857 						index_seed = previous->a[0];
    858 					}
    859 
    860 					// Establish the reference set.  *Approximately* comprises:
    861 					// - The last 3 slices (if they exist yet)
    862 					// - The already constructed blocks in the current segment
    863 					u32 next_slice   = ((slice + 1) % 4) * segment_size;
    864 					u32 window_start = pass == 0 ? 0     : next_slice;
    865 					u32 nb_segments  = pass == 0 ? slice : 3;
    866 					u64 lane         =
    867 						pass == 0 && slice == 0
    868 						? segment
    869 						: (index_seed >> 32) % config.nb_lanes;
    870 					u32 window_size  =
    871 						nb_segments * segment_size +
    872 						(lane  == segment ? block-1 :
    873 						 block == 0       ? (u32)-1 : 0);
    874 
    875 					// Find reference block
    876 					u64  j1        = index_seed & 0xffffffff; // block selector
    877 					u64  x         = (j1 * j1)         >> 32;
    878 					u64  y         = (window_size * x) >> 32;
    879 					u64  z         = (window_size - 1) - y;
    880 					u64  ref       = (window_start + z) % lane_size;
    881 					u32  index     = lane * lane_size + (u32)ref;
    882 					blk *reference = blocks + index;
    883 
    884 					// Shuffle the previous & reference block
    885 					// into the current block
    886 					copy_block(&tmp, previous);
    887 					xor_block (&tmp, reference);
    888 					if (pass == 0) { copy_block(current, &tmp); }
    889 					else           { xor_block (current, &tmp); }
    890 					g_rounds  (&tmp);
    891 					xor_block (current, &tmp);
    892 				}
    893 			}
    894 		}
    895 	}
    896 
    897 	// Wipe temporary block
    898 	volatile u64* p = tmp.a;
    899 	ZERO(p, 128);
    900 
    901 	// XOR last blocks of each lane
    902 	blk *last_block = blocks + lane_size - 1;
    903 	FOR_T (u32, lane, 1, config.nb_lanes) {
    904 		blk *next_block = last_block + lane_size;
    905 		xor_block(next_block, last_block);
    906 		last_block = next_block;
    907 	}
    908 
    909 	// Serialize last block
    910 	u8 final_block[1024];
    911 	store64_le_buf(final_block, last_block->a, 128);
    912 
    913 	// Wipe work area
    914 	p = (u64*)work_area;
    915 	ZERO(p, 128 * nb_blocks);
    916 
    917 	// Hash the very last block with H' into the output hash
    918 	extended_hash(hash, hash_size, final_block, 1024);
    919 	WIPE_BUFFER(final_block);
    920 }
    921 
    922 ////////////////////////////////////
    923 /// Arithmetic modulo 2^255 - 19 ///
    924 ////////////////////////////////////
    925 //  Originally taken from SUPERCOP's ref10 implementation.
    926 //  A bit bigger than TweetNaCl, over 4 times faster.
    927 
    928 // field element
    929 typedef i32 fe[10];
    930 
    931 // field constants
    932 //
    933 // fe_one      : 1
    934 // sqrtm1      : sqrt(-1)
    935 // d           :     -121665 / 121666
    936 // D2          : 2 * -121665 / 121666
    937 // lop_x, lop_y: low order point in Edwards coordinates
    938 // ufactor     : -sqrt(-1) * 2
    939 // A2          : 486662^2  (A squared)
    940 static const fe fe_one  = {1};
    941 static const fe sqrtm1  = {
    942 	-32595792, -7943725, 9377950, 3500415, 12389472,
    943 	-272473, -25146209, -2005654, 326686, 11406482,
    944 };
    945 static const fe d       = {
    946 	-10913610, 13857413, -15372611, 6949391, 114729,
    947 	-8787816, -6275908, -3247719, -18696448, -12055116,
    948 };
    949 static const fe D2      = {
    950 	-21827239, -5839606, -30745221, 13898782, 229458,
    951 	15978800, -12551817, -6495438, 29715968, 9444199,
    952 };
    953 static const fe lop_x   = {
    954 	21352778, 5345713, 4660180, -8347857, 24143090,
    955 	14568123, 30185756, -12247770, -33528939, 8345319,
    956 };
    957 static const fe lop_y   = {
    958 	-6952922, -1265500, 6862341, -7057498, -4037696,
    959 	-5447722, 31680899, -15325402, -19365852, 1569102,
    960 };
    961 static const fe ufactor = {
    962 	-1917299, 15887451, -18755900, -7000830, -24778944,
    963 	544946, -16816446, 4011309, -653372, 10741468,
    964 };
    965 static const fe A2      = {
    966 	12721188, 3529, 0, 0, 0, 0, 0, 0, 0, 0,
    967 };
    968 
    969 static void fe_0(fe h) {           ZERO(h  , 10); }
    970 static void fe_1(fe h) { h[0] = 1; ZERO(h+1,  9); }
    971 
    972 static void fe_copy(fe h,const fe f           ){FOR(i,0,10) h[i] =  f[i];      }
    973 static void fe_neg (fe h,const fe f           ){FOR(i,0,10) h[i] = -f[i];      }
    974 static void fe_add (fe h,const fe f,const fe g){FOR(i,0,10) h[i] = f[i] + g[i];}
    975 static void fe_sub (fe h,const fe f,const fe g){FOR(i,0,10) h[i] = f[i] - g[i];}
    976 
    977 static void fe_cswap(fe f, fe g, int b)
    978 {
    979 	i32 mask = -b; // -1 = 0xffffffff
    980 	FOR (i, 0, 10) {
    981 		i32 x = (f[i] ^ g[i]) & mask;
    982 		f[i] = f[i] ^ x;
    983 		g[i] = g[i] ^ x;
    984 	}
    985 }
    986 
    987 static void fe_ccopy(fe f, const fe g, int b)
    988 {
    989 	i32 mask = -b; // -1 = 0xffffffff
    990 	FOR (i, 0, 10) {
    991 		i32 x = (f[i] ^ g[i]) & mask;
    992 		f[i] = f[i] ^ x;
    993 	}
    994 }
    995 
    996 
    997 // Signed carry propagation
    998 // ------------------------
    999 //
   1000 // Let t be a number.  It can be uniquely decomposed thus:
   1001 //
   1002 //    t = h*2^26 + l
   1003 //    such that -2^25 <= l < 2^25
   1004 //
   1005 // Let c = (t + 2^25) / 2^26            (rounded down)
   1006 //     c = (h*2^26 + l + 2^25) / 2^26   (rounded down)
   1007 //     c =  h   +   (l + 2^25) / 2^26   (rounded down)
   1008 //     c =  h                           (exactly)
   1009 // Because 0 <= l + 2^25 < 2^26
   1010 //
   1011 // Let u = t          - c*2^26
   1012 //     u = h*2^26 + l - h*2^26
   1013 //     u = l
   1014 // Therefore, -2^25 <= u < 2^25
   1015 //
   1016 // Additionally, if |t| < x, then |h| < x/2^26 (rounded down)
   1017 //
   1018 // Notations:
   1019 // - In C, 1<<25 means 2^25.
   1020 // - In C, x>>25 means floor(x / (2^25)).
   1021 // - All of the above applies with 25 & 24 as well as 26 & 25.
   1022 //
   1023 //
   1024 // Note on negative right shifts
   1025 // -----------------------------
   1026 //
   1027 // In C, x >> n, where x is a negative integer, is implementation
   1028 // defined.  In practice, all platforms do arithmetic shift, which is
   1029 // equivalent to division by 2^26, rounded down.  Some compilers, like
   1030 // GCC, even guarantee it.
   1031 //
   1032 // If we ever stumble upon a platform that does not propagate the sign
   1033 // bit (we won't), visible failures will show at the slightest test, and
   1034 // the signed shifts can be replaced by the following:
   1035 //
   1036 //     typedef struct { i64 x:39; } s25;
   1037 //     typedef struct { i64 x:38; } s26;
   1038 //     i64 shift25(i64 x) { s25 s; s.x = ((u64)x)>>25; return s.x; }
   1039 //     i64 shift26(i64 x) { s26 s; s.x = ((u64)x)>>26; return s.x; }
   1040 //
   1041 // Current compilers cannot optimise this, causing a 30% drop in
   1042 // performance.  Fairly expensive for something that never happens.
   1043 //
   1044 //
   1045 // Precondition
   1046 // ------------
   1047 //
   1048 // |t0|       < 2^63
   1049 // |t1|..|t9| < 2^62
   1050 //
   1051 // Algorithm
   1052 // ---------
   1053 // c   = t0 + 2^25 / 2^26   -- |c|  <= 2^36
   1054 // t0 -= c * 2^26           -- |t0| <= 2^25
   1055 // t1 += c                  -- |t1| <= 2^63
   1056 //
   1057 // c   = t4 + 2^25 / 2^26   -- |c|  <= 2^36
   1058 // t4 -= c * 2^26           -- |t4| <= 2^25
   1059 // t5 += c                  -- |t5| <= 2^63
   1060 //
   1061 // c   = t1 + 2^24 / 2^25   -- |c|  <= 2^38
   1062 // t1 -= c * 2^25           -- |t1| <= 2^24
   1063 // t2 += c                  -- |t2| <= 2^63
   1064 //
   1065 // c   = t5 + 2^24 / 2^25   -- |c|  <= 2^38
   1066 // t5 -= c * 2^25           -- |t5| <= 2^24
   1067 // t6 += c                  -- |t6| <= 2^63
   1068 //
   1069 // c   = t2 + 2^25 / 2^26   -- |c|  <= 2^37
   1070 // t2 -= c * 2^26           -- |t2| <= 2^25        < 1.1 * 2^25  (final t2)
   1071 // t3 += c                  -- |t3| <= 2^63
   1072 //
   1073 // c   = t6 + 2^25 / 2^26   -- |c|  <= 2^37
   1074 // t6 -= c * 2^26           -- |t6| <= 2^25        < 1.1 * 2^25  (final t6)
   1075 // t7 += c                  -- |t7| <= 2^63
   1076 //
   1077 // c   = t3 + 2^24 / 2^25   -- |c|  <= 2^38
   1078 // t3 -= c * 2^25           -- |t3| <= 2^24        < 1.1 * 2^24  (final t3)
   1079 // t4 += c                  -- |t4| <= 2^25 + 2^38 < 2^39
   1080 //
   1081 // c   = t7 + 2^24 / 2^25   -- |c|  <= 2^38
   1082 // t7 -= c * 2^25           -- |t7| <= 2^24        < 1.1 * 2^24  (final t7)
   1083 // t8 += c                  -- |t8| <= 2^63
   1084 //
   1085 // c   = t4 + 2^25 / 2^26   -- |c|  <= 2^13
   1086 // t4 -= c * 2^26           -- |t4| <= 2^25        < 1.1 * 2^25  (final t4)
   1087 // t5 += c                  -- |t5| <= 2^24 + 2^13 < 1.1 * 2^24  (final t5)
   1088 //
   1089 // c   = t8 + 2^25 / 2^26   -- |c|  <= 2^37
   1090 // t8 -= c * 2^26           -- |t8| <= 2^25        < 1.1 * 2^25  (final t8)
   1091 // t9 += c                  -- |t9| <= 2^63
   1092 //
   1093 // c   = t9 + 2^24 / 2^25   -- |c|  <= 2^38
   1094 // t9 -= c * 2^25           -- |t9| <= 2^24        < 1.1 * 2^24  (final t9)
   1095 // t0 += c * 19             -- |t0| <= 2^25 + 2^38*19 < 2^44
   1096 //
   1097 // c   = t0 + 2^25 / 2^26   -- |c|  <= 2^18
   1098 // t0 -= c * 2^26           -- |t0| <= 2^25        < 1.1 * 2^25  (final t0)
   1099 // t1 += c                  -- |t1| <= 2^24 + 2^18 < 1.1 * 2^24  (final t1)
   1100 //
   1101 // Postcondition
   1102 // -------------
   1103 //   |t0|, |t2|, |t4|, |t6|, |t8|  <  1.1 * 2^25
   1104 //   |t1|, |t3|, |t5|, |t7|, |t9|  <  1.1 * 2^24
   1105 #define FE_CARRY	\
   1106 	i64 c; \
   1107 	c = (t0 + ((i64)1<<25)) >> 26;  t0 -= c * ((i64)1 << 26);  t1 += c; \
   1108 	c = (t4 + ((i64)1<<25)) >> 26;  t4 -= c * ((i64)1 << 26);  t5 += c; \
   1109 	c = (t1 + ((i64)1<<24)) >> 25;  t1 -= c * ((i64)1 << 25);  t2 += c; \
   1110 	c = (t5 + ((i64)1<<24)) >> 25;  t5 -= c * ((i64)1 << 25);  t6 += c; \
   1111 	c = (t2 + ((i64)1<<25)) >> 26;  t2 -= c * ((i64)1 << 26);  t3 += c; \
   1112 	c = (t6 + ((i64)1<<25)) >> 26;  t6 -= c * ((i64)1 << 26);  t7 += c; \
   1113 	c = (t3 + ((i64)1<<24)) >> 25;  t3 -= c * ((i64)1 << 25);  t4 += c; \
   1114 	c = (t7 + ((i64)1<<24)) >> 25;  t7 -= c * ((i64)1 << 25);  t8 += c; \
   1115 	c = (t4 + ((i64)1<<25)) >> 26;  t4 -= c * ((i64)1 << 26);  t5 += c; \
   1116 	c = (t8 + ((i64)1<<25)) >> 26;  t8 -= c * ((i64)1 << 26);  t9 += c; \
   1117 	c = (t9 + ((i64)1<<24)) >> 25;  t9 -= c * ((i64)1 << 25);  t0 += c * 19; \
   1118 	c = (t0 + ((i64)1<<25)) >> 26;  t0 -= c * ((i64)1 << 26);  t1 += c; \
   1119 	h[0]=(i32)t0;  h[1]=(i32)t1;  h[2]=(i32)t2;  h[3]=(i32)t3;  h[4]=(i32)t4; \
   1120 	h[5]=(i32)t5;  h[6]=(i32)t6;  h[7]=(i32)t7;  h[8]=(i32)t8;  h[9]=(i32)t9
   1121 
   1122 // Decodes a field element from a byte buffer.
   1123 // mask specifies how many bits we ignore.
   1124 // Traditionally we ignore 1. It's useful for EdDSA,
   1125 // which uses that bit to denote the sign of x.
   1126 // Elligator however uses positive representatives,
   1127 // which means ignoring 2 bits instead.
   1128 static void fe_frombytes_mask(fe h, const u8 s[32], unsigned nb_mask)
   1129 {
   1130 	u32 mask = 0xffffff >> nb_mask;
   1131 	i64 t0 =  load32_le(s);                    // t0 < 2^32
   1132 	i64 t1 =  load24_le(s +  4) << 6;          // t1 < 2^30
   1133 	i64 t2 =  load24_le(s +  7) << 5;          // t2 < 2^29
   1134 	i64 t3 =  load24_le(s + 10) << 3;          // t3 < 2^27
   1135 	i64 t4 =  load24_le(s + 13) << 2;          // t4 < 2^26
   1136 	i64 t5 =  load32_le(s + 16);               // t5 < 2^32
   1137 	i64 t6 =  load24_le(s + 20) << 7;          // t6 < 2^31
   1138 	i64 t7 =  load24_le(s + 23) << 5;          // t7 < 2^29
   1139 	i64 t8 =  load24_le(s + 26) << 4;          // t8 < 2^28
   1140 	i64 t9 = (load24_le(s + 29) & mask) << 2;  // t9 < 2^25
   1141 	FE_CARRY;                                  // Carry precondition OK
   1142 }
   1143 
   1144 static void fe_frombytes(fe h, const u8 s[32])
   1145 {
   1146 	fe_frombytes_mask(h, s, 1);
   1147 }
   1148 
   1149 
   1150 // Precondition
   1151 //   |h[0]|, |h[2]|, |h[4]|, |h[6]|, |h[8]|  <  1.1 * 2^25
   1152 //   |h[1]|, |h[3]|, |h[5]|, |h[7]|, |h[9]|  <  1.1 * 2^24
   1153 //
   1154 // Therefore, |h| < 2^255-19
   1155 // There are two possibilities:
   1156 //
   1157 // - If h is positive, all we need to do is reduce its individual
   1158 //   limbs down to their tight positive range.
   1159 // - If h is negative, we also need to add 2^255-19 to it.
   1160 //   Or just remove 19 and chop off any excess bit.
   1161 static void fe_tobytes(u8 s[32], const fe h)
   1162 {
   1163 	i32 t[10];
   1164 	COPY(t, h, 10);
   1165 	i32 q = (19 * t[9] + (((i32) 1) << 24)) >> 25;
   1166 	//                 |t9|                    < 1.1 * 2^24
   1167 	//  -1.1 * 2^24  <  t9                     < 1.1 * 2^24
   1168 	//  -21  * 2^24  <  19 * t9                < 21  * 2^24
   1169 	//  -2^29        <  19 * t9 + 2^24         < 2^29
   1170 	//  -2^29 / 2^25 < (19 * t9 + 2^24) / 2^25 < 2^29 / 2^25
   1171 	//  -16          < (19 * t9 + 2^24) / 2^25 < 16
   1172 	FOR (i, 0, 5) {
   1173 		q += t[2*i  ]; q >>= 26; // q = 0 or -1
   1174 		q += t[2*i+1]; q >>= 25; // q = 0 or -1
   1175 	}
   1176 	// q =  0 iff h >= 0
   1177 	// q = -1 iff h <  0
   1178 	// Adding q * 19 to h reduces h to its proper range.
   1179 	q *= 19;  // Shift carry back to the beginning
   1180 	FOR (i, 0, 5) {
   1181 		t[i*2  ] += q;  q = t[i*2  ] >> 26;  t[i*2  ] -= q * ((i32)1 << 26);
   1182 		t[i*2+1] += q;  q = t[i*2+1] >> 25;  t[i*2+1] -= q * ((i32)1 << 25);
   1183 	}
   1184 	// h is now fully reduced, and q represents the excess bit.
   1185 
   1186 	store32_le(s +  0, ((u32)t[0] >>  0) | ((u32)t[1] << 26));
   1187 	store32_le(s +  4, ((u32)t[1] >>  6) | ((u32)t[2] << 19));
   1188 	store32_le(s +  8, ((u32)t[2] >> 13) | ((u32)t[3] << 13));
   1189 	store32_le(s + 12, ((u32)t[3] >> 19) | ((u32)t[4] <<  6));
   1190 	store32_le(s + 16, ((u32)t[5] >>  0) | ((u32)t[6] << 25));
   1191 	store32_le(s + 20, ((u32)t[6] >>  7) | ((u32)t[7] << 19));
   1192 	store32_le(s + 24, ((u32)t[7] >> 13) | ((u32)t[8] << 12));
   1193 	store32_le(s + 28, ((u32)t[8] >> 20) | ((u32)t[9] <<  6));
   1194 
   1195 	WIPE_BUFFER(t);
   1196 }
   1197 
   1198 // Precondition
   1199 // -------------
   1200 //   |f0|, |f2|, |f4|, |f6|, |f8|  <  1.65 * 2^26
   1201 //   |f1|, |f3|, |f5|, |f7|, |f9|  <  1.65 * 2^25
   1202 //
   1203 //   |g0|, |g2|, |g4|, |g6|, |g8|  <  1.65 * 2^26
   1204 //   |g1|, |g3|, |g5|, |g7|, |g9|  <  1.65 * 2^25
   1205 static void fe_mul_small(fe h, const fe f, i32 g)
   1206 {
   1207 	i64 t0 = f[0] * (i64) g;  i64 t1 = f[1] * (i64) g;
   1208 	i64 t2 = f[2] * (i64) g;  i64 t3 = f[3] * (i64) g;
   1209 	i64 t4 = f[4] * (i64) g;  i64 t5 = f[5] * (i64) g;
   1210 	i64 t6 = f[6] * (i64) g;  i64 t7 = f[7] * (i64) g;
   1211 	i64 t8 = f[8] * (i64) g;  i64 t9 = f[9] * (i64) g;
   1212 	// |t0|, |t2|, |t4|, |t6|, |t8|  <  1.65 * 2^26 * 2^31  < 2^58
   1213 	// |t1|, |t3|, |t5|, |t7|, |t9|  <  1.65 * 2^25 * 2^31  < 2^57
   1214 
   1215 	FE_CARRY; // Carry precondition OK
   1216 }
   1217 
   1218 // Precondition
   1219 // -------------
   1220 //   |f0|, |f2|, |f4|, |f6|, |f8|  <  1.65 * 2^26
   1221 //   |f1|, |f3|, |f5|, |f7|, |f9|  <  1.65 * 2^25
   1222 //
   1223 //   |g0|, |g2|, |g4|, |g6|, |g8|  <  1.65 * 2^26
   1224 //   |g1|, |g3|, |g5|, |g7|, |g9|  <  1.65 * 2^25
   1225 static void fe_mul(fe h, const fe f, const fe g)
   1226 {
   1227 	// Everything is unrolled and put in temporary variables.
   1228 	// We could roll the loop, but that would make curve25519 twice as slow.
   1229 	i32 f0 = f[0]; i32 f1 = f[1]; i32 f2 = f[2]; i32 f3 = f[3]; i32 f4 = f[4];
   1230 	i32 f5 = f[5]; i32 f6 = f[6]; i32 f7 = f[7]; i32 f8 = f[8]; i32 f9 = f[9];
   1231 	i32 g0 = g[0]; i32 g1 = g[1]; i32 g2 = g[2]; i32 g3 = g[3]; i32 g4 = g[4];
   1232 	i32 g5 = g[5]; i32 g6 = g[6]; i32 g7 = g[7]; i32 g8 = g[8]; i32 g9 = g[9];
   1233 	i32 F1 = f1*2; i32 F3 = f3*2; i32 F5 = f5*2; i32 F7 = f7*2; i32 F9 = f9*2;
   1234 	i32 G1 = g1*19;  i32 G2 = g2*19;  i32 G3 = g3*19;
   1235 	i32 G4 = g4*19;  i32 G5 = g5*19;  i32 G6 = g6*19;
   1236 	i32 G7 = g7*19;  i32 G8 = g8*19;  i32 G9 = g9*19;
   1237 	// |F1|, |F3|, |F5|, |F7|, |F9|  <  1.65 * 2^26
   1238 	// |G0|, |G2|, |G4|, |G6|, |G8|  <  2^31
   1239 	// |G1|, |G3|, |G5|, |G7|, |G9|  <  2^30
   1240 
   1241 	i64 t0 = f0*(i64)g0 + F1*(i64)G9 + f2*(i64)G8 + F3*(i64)G7 + f4*(i64)G6
   1242 	       + F5*(i64)G5 + f6*(i64)G4 + F7*(i64)G3 + f8*(i64)G2 + F9*(i64)G1;
   1243 	i64 t1 = f0*(i64)g1 + f1*(i64)g0 + f2*(i64)G9 + f3*(i64)G8 + f4*(i64)G7
   1244 	       + f5*(i64)G6 + f6*(i64)G5 + f7*(i64)G4 + f8*(i64)G3 + f9*(i64)G2;
   1245 	i64 t2 = f0*(i64)g2 + F1*(i64)g1 + f2*(i64)g0 + F3*(i64)G9 + f4*(i64)G8
   1246 	       + F5*(i64)G7 + f6*(i64)G6 + F7*(i64)G5 + f8*(i64)G4 + F9*(i64)G3;
   1247 	i64 t3 = f0*(i64)g3 + f1*(i64)g2 + f2*(i64)g1 + f3*(i64)g0 + f4*(i64)G9
   1248 	       + f5*(i64)G8 + f6*(i64)G7 + f7*(i64)G6 + f8*(i64)G5 + f9*(i64)G4;
   1249 	i64 t4 = f0*(i64)g4 + F1*(i64)g3 + f2*(i64)g2 + F3*(i64)g1 + f4*(i64)g0
   1250 	       + F5*(i64)G9 + f6*(i64)G8 + F7*(i64)G7 + f8*(i64)G6 + F9*(i64)G5;
   1251 	i64 t5 = f0*(i64)g5 + f1*(i64)g4 + f2*(i64)g3 + f3*(i64)g2 + f4*(i64)g1
   1252 	       + f5*(i64)g0 + f6*(i64)G9 + f7*(i64)G8 + f8*(i64)G7 + f9*(i64)G6;
   1253 	i64 t6 = f0*(i64)g6 + F1*(i64)g5 + f2*(i64)g4 + F3*(i64)g3 + f4*(i64)g2
   1254 	       + F5*(i64)g1 + f6*(i64)g0 + F7*(i64)G9 + f8*(i64)G8 + F9*(i64)G7;
   1255 	i64 t7 = f0*(i64)g7 + f1*(i64)g6 + f2*(i64)g5 + f3*(i64)g4 + f4*(i64)g3
   1256 	       + f5*(i64)g2 + f6*(i64)g1 + f7*(i64)g0 + f8*(i64)G9 + f9*(i64)G8;
   1257 	i64 t8 = f0*(i64)g8 + F1*(i64)g7 + f2*(i64)g6 + F3*(i64)g5 + f4*(i64)g4
   1258 	       + F5*(i64)g3 + f6*(i64)g2 + F7*(i64)g1 + f8*(i64)g0 + F9*(i64)G9;
   1259 	i64 t9 = f0*(i64)g9 + f1*(i64)g8 + f2*(i64)g7 + f3*(i64)g6 + f4*(i64)g5
   1260 	       + f5*(i64)g4 + f6*(i64)g3 + f7*(i64)g2 + f8*(i64)g1 + f9*(i64)g0;
   1261 	// t0 < 0.67 * 2^61
   1262 	// t1 < 0.41 * 2^61
   1263 	// t2 < 0.52 * 2^61
   1264 	// t3 < 0.32 * 2^61
   1265 	// t4 < 0.38 * 2^61
   1266 	// t5 < 0.22 * 2^61
   1267 	// t6 < 0.23 * 2^61
   1268 	// t7 < 0.13 * 2^61
   1269 	// t8 < 0.09 * 2^61
   1270 	// t9 < 0.03 * 2^61
   1271 
   1272 	FE_CARRY; // Everything below 2^62, Carry precondition OK
   1273 }
   1274 
   1275 // Precondition
   1276 // -------------
   1277 //   |f0|, |f2|, |f4|, |f6|, |f8|  <  1.65 * 2^26
   1278 //   |f1|, |f3|, |f5|, |f7|, |f9|  <  1.65 * 2^25
   1279 //
   1280 // Note: we could use fe_mul() for this, but this is significantly faster
   1281 static void fe_sq(fe h, const fe f)
   1282 {
   1283 	i32 f0 = f[0]; i32 f1 = f[1]; i32 f2 = f[2]; i32 f3 = f[3]; i32 f4 = f[4];
   1284 	i32 f5 = f[5]; i32 f6 = f[6]; i32 f7 = f[7]; i32 f8 = f[8]; i32 f9 = f[9];
   1285 	i32 f0_2  = f0*2;   i32 f1_2  = f1*2;   i32 f2_2  = f2*2;   i32 f3_2 = f3*2;
   1286 	i32 f4_2  = f4*2;   i32 f5_2  = f5*2;   i32 f6_2  = f6*2;   i32 f7_2 = f7*2;
   1287 	i32 f5_38 = f5*38;  i32 f6_19 = f6*19;  i32 f7_38 = f7*38;
   1288 	i32 f8_19 = f8*19;  i32 f9_38 = f9*38;
   1289 	// |f0_2| , |f2_2| , |f4_2| , |f6_2| , |f8_2|  <  1.65 * 2^27
   1290 	// |f1_2| , |f3_2| , |f5_2| , |f7_2| , |f9_2|  <  1.65 * 2^26
   1291 	// |f5_38|, |f6_19|, |f7_38|, |f8_19|, |f9_38| <  2^31
   1292 
   1293 	i64 t0 = f0  *(i64)f0    + f1_2*(i64)f9_38 + f2_2*(i64)f8_19
   1294 	       + f3_2*(i64)f7_38 + f4_2*(i64)f6_19 + f5  *(i64)f5_38;
   1295 	i64 t1 = f0_2*(i64)f1    + f2  *(i64)f9_38 + f3_2*(i64)f8_19
   1296 	       + f4  *(i64)f7_38 + f5_2*(i64)f6_19;
   1297 	i64 t2 = f0_2*(i64)f2    + f1_2*(i64)f1    + f3_2*(i64)f9_38
   1298 	       + f4_2*(i64)f8_19 + f5_2*(i64)f7_38 + f6  *(i64)f6_19;
   1299 	i64 t3 = f0_2*(i64)f3    + f1_2*(i64)f2    + f4  *(i64)f9_38
   1300 	       + f5_2*(i64)f8_19 + f6  *(i64)f7_38;
   1301 	i64 t4 = f0_2*(i64)f4    + f1_2*(i64)f3_2  + f2  *(i64)f2
   1302 	       + f5_2*(i64)f9_38 + f6_2*(i64)f8_19 + f7  *(i64)f7_38;
   1303 	i64 t5 = f0_2*(i64)f5    + f1_2*(i64)f4    + f2_2*(i64)f3
   1304 	       + f6  *(i64)f9_38 + f7_2*(i64)f8_19;
   1305 	i64 t6 = f0_2*(i64)f6    + f1_2*(i64)f5_2  + f2_2*(i64)f4
   1306 	       + f3_2*(i64)f3    + f7_2*(i64)f9_38 + f8  *(i64)f8_19;
   1307 	i64 t7 = f0_2*(i64)f7    + f1_2*(i64)f6    + f2_2*(i64)f5
   1308 	       + f3_2*(i64)f4    + f8  *(i64)f9_38;
   1309 	i64 t8 = f0_2*(i64)f8    + f1_2*(i64)f7_2  + f2_2*(i64)f6
   1310 	       + f3_2*(i64)f5_2  + f4  *(i64)f4    + f9  *(i64)f9_38;
   1311 	i64 t9 = f0_2*(i64)f9    + f1_2*(i64)f8    + f2_2*(i64)f7
   1312 	       + f3_2*(i64)f6    + f4  *(i64)f5_2;
   1313 	// t0 < 0.67 * 2^61
   1314 	// t1 < 0.41 * 2^61
   1315 	// t2 < 0.52 * 2^61
   1316 	// t3 < 0.32 * 2^61
   1317 	// t4 < 0.38 * 2^61
   1318 	// t5 < 0.22 * 2^61
   1319 	// t6 < 0.23 * 2^61
   1320 	// t7 < 0.13 * 2^61
   1321 	// t8 < 0.09 * 2^61
   1322 	// t9 < 0.03 * 2^61
   1323 
   1324 	FE_CARRY;
   1325 }
   1326 
   1327 //  Parity check.  Returns 0 if even, 1 if odd
   1328 static int fe_isodd(const fe f)
   1329 {
   1330 	u8 s[32];
   1331 	fe_tobytes(s, f);
   1332 	u8 isodd = s[0] & 1;
   1333 	WIPE_BUFFER(s);
   1334 	return isodd;
   1335 }
   1336 
   1337 // Returns 1 if equal, 0 if not equal
   1338 static int fe_isequal(const fe f, const fe g)
   1339 {
   1340 	u8 fs[32];
   1341 	u8 gs[32];
   1342 	fe_tobytes(fs, f);
   1343 	fe_tobytes(gs, g);
   1344 	int isdifferent = crypto_verify32(fs, gs);
   1345 	WIPE_BUFFER(fs);
   1346 	WIPE_BUFFER(gs);
   1347 	return 1 + isdifferent;
   1348 }
   1349 
   1350 // Inverse square root.
   1351 // Returns true if x is a square, false otherwise.
   1352 // After the call:
   1353 //   isr = sqrt(1/x)        if x is a non-zero square.
   1354 //   isr = sqrt(sqrt(-1)/x) if x is not a square.
   1355 //   isr = 0                if x is zero.
   1356 // We do not guarantee the sign of the square root.
   1357 //
   1358 // Notes:
   1359 // Let quartic = x^((p-1)/4)
   1360 //
   1361 // x^((p-1)/2) = chi(x)
   1362 // quartic^2   = chi(x)
   1363 // quartic     = sqrt(chi(x))
   1364 // quartic     = 1 or -1 or sqrt(-1) or -sqrt(-1)
   1365 //
   1366 // Note that x is a square if quartic is 1 or -1
   1367 // There are 4 cases to consider:
   1368 //
   1369 // if   quartic         = 1  (x is a square)
   1370 // then x^((p-1)/4)     = 1
   1371 //      x^((p-5)/4) * x = 1
   1372 //      x^((p-5)/4)     = 1/x
   1373 //      x^((p-5)/8)     = sqrt(1/x) or -sqrt(1/x)
   1374 //
   1375 // if   quartic                = -1  (x is a square)
   1376 // then x^((p-1)/4)            = -1
   1377 //      x^((p-5)/4) * x        = -1
   1378 //      x^((p-5)/4)            = -1/x
   1379 //      x^((p-5)/8)            = sqrt(-1)   / sqrt(x)
   1380 //      x^((p-5)/8) * sqrt(-1) = sqrt(-1)^2 / sqrt(x)
   1381 //      x^((p-5)/8) * sqrt(-1) = -1/sqrt(x)
   1382 //      x^((p-5)/8) * sqrt(-1) = -sqrt(1/x) or sqrt(1/x)
   1383 //
   1384 // if   quartic         = sqrt(-1)  (x is not a square)
   1385 // then x^((p-1)/4)     = sqrt(-1)
   1386 //      x^((p-5)/4) * x = sqrt(-1)
   1387 //      x^((p-5)/4)     = sqrt(-1)/x
   1388 //      x^((p-5)/8)     = sqrt(sqrt(-1)/x) or -sqrt(sqrt(-1)/x)
   1389 //
   1390 // Note that the product of two non-squares is always a square:
   1391 //   For any non-squares a and b, chi(a) = -1 and chi(b) = -1.
   1392 //   Since chi(x) = x^((p-1)/2), chi(a)*chi(b) = chi(a*b) = 1.
   1393 //   Therefore a*b is a square.
   1394 //
   1395 //   Since sqrt(-1) and x are both non-squares, their product is a
   1396 //   square, and we can compute their square root.
   1397 //
   1398 // if   quartic                = -sqrt(-1)  (x is not a square)
   1399 // then x^((p-1)/4)            = -sqrt(-1)
   1400 //      x^((p-5)/4) * x        = -sqrt(-1)
   1401 //      x^((p-5)/4)            = -sqrt(-1)/x
   1402 //      x^((p-5)/8)            = sqrt(-sqrt(-1)/x)
   1403 //      x^((p-5)/8)            = sqrt( sqrt(-1)/x) * sqrt(-1)
   1404 //      x^((p-5)/8) * sqrt(-1) = sqrt( sqrt(-1)/x) * sqrt(-1)^2
   1405 //      x^((p-5)/8) * sqrt(-1) = sqrt( sqrt(-1)/x) * -1
   1406 //      x^((p-5)/8) * sqrt(-1) = -sqrt(sqrt(-1)/x) or sqrt(sqrt(-1)/x)
   1407 static int invsqrt(fe isr, const fe x)
   1408 {
   1409 	fe t0, t1, t2;
   1410 
   1411 	// t0 = x^((p-5)/8)
   1412 	// Can be achieved with a simple double & add ladder,
   1413 	// but it would be slower.
   1414 	fe_sq(t0, x);
   1415 	fe_sq(t1,t0);                     fe_sq(t1, t1);    fe_mul(t1, x, t1);
   1416 	fe_mul(t0, t0, t1);
   1417 	fe_sq(t0, t0);                                      fe_mul(t0, t1, t0);
   1418 	fe_sq(t1, t0);  FOR (i, 1,   5) { fe_sq(t1, t1); }  fe_mul(t0, t1, t0);
   1419 	fe_sq(t1, t0);  FOR (i, 1,  10) { fe_sq(t1, t1); }  fe_mul(t1, t1, t0);
   1420 	fe_sq(t2, t1);  FOR (i, 1,  20) { fe_sq(t2, t2); }  fe_mul(t1, t2, t1);
   1421 	fe_sq(t1, t1);  FOR (i, 1,  10) { fe_sq(t1, t1); }  fe_mul(t0, t1, t0);
   1422 	fe_sq(t1, t0);  FOR (i, 1,  50) { fe_sq(t1, t1); }  fe_mul(t1, t1, t0);
   1423 	fe_sq(t2, t1);  FOR (i, 1, 100) { fe_sq(t2, t2); }  fe_mul(t1, t2, t1);
   1424 	fe_sq(t1, t1);  FOR (i, 1,  50) { fe_sq(t1, t1); }  fe_mul(t0, t1, t0);
   1425 	fe_sq(t0, t0);  FOR (i, 1,   2) { fe_sq(t0, t0); }  fe_mul(t0, t0, x);
   1426 
   1427 	// quartic = x^((p-1)/4)
   1428 	i32 *quartic = t1;
   1429 	fe_sq (quartic, t0);
   1430 	fe_mul(quartic, quartic, x);
   1431 
   1432 	i32 *check = t2;
   1433 	fe_0  (check);          int z0 = fe_isequal(x      , check);
   1434 	fe_1  (check);          int p1 = fe_isequal(quartic, check);
   1435 	fe_neg(check, check );  int m1 = fe_isequal(quartic, check);
   1436 	fe_neg(check, sqrtm1);  int ms = fe_isequal(quartic, check);
   1437 
   1438 	// if quartic == -1 or sqrt(-1)
   1439 	// then  isr = x^((p-1)/4) * sqrt(-1)
   1440 	// else  isr = x^((p-1)/4)
   1441 	fe_mul(isr, t0, sqrtm1);
   1442 	fe_ccopy(isr, t0, 1 - (m1 | ms));
   1443 
   1444 	WIPE_BUFFER(t0);
   1445 	WIPE_BUFFER(t1);
   1446 	WIPE_BUFFER(t2);
   1447 	return p1 | m1 | z0;
   1448 }
   1449 
   1450 // Inverse in terms of inverse square root.
   1451 // Requires two additional squarings to get rid of the sign.
   1452 //
   1453 //   1/x = x * (+invsqrt(x^2))^2
   1454 //       = x * (-invsqrt(x^2))^2
   1455 //
   1456 // A fully optimised exponentiation by p-1 would save 6 field
   1457 // multiplications, but it would require more code.
   1458 static void fe_invert(fe out, const fe x)
   1459 {
   1460 	fe tmp;
   1461 	fe_sq(tmp, x);
   1462 	invsqrt(tmp, tmp);
   1463 	fe_sq(tmp, tmp);
   1464 	fe_mul(out, tmp, x);
   1465 	WIPE_BUFFER(tmp);
   1466 }
   1467 
   1468 // trim a scalar for scalar multiplication
   1469 void crypto_eddsa_trim_scalar(u8 out[32], const u8 in[32])
   1470 {
   1471 	COPY(out, in, 32);
   1472 	out[ 0] &= 248;
   1473 	out[31] &= 127;
   1474 	out[31] |= 64;
   1475 }
   1476 
   1477 // get bit from scalar at position i
   1478 static int scalar_bit(const u8 s[32], int i)
   1479 {
   1480 	if (i < 0) { return 0; } // handle -1 for sliding windows
   1481 	return (s[i>>3] >> (i&7)) & 1;
   1482 }
   1483 
   1484 ///////////////
   1485 /// X-25519 /// Taken from SUPERCOP's ref10 implementation.
   1486 ///////////////
   1487 static void scalarmult(u8 q[32], const u8 scalar[32], const u8 p[32],
   1488                        int nb_bits)
   1489 {
   1490 	// computes the scalar product
   1491 	fe x1;
   1492 	fe_frombytes(x1, p);
   1493 
   1494 	// computes the actual scalar product (the result is in x2 and z2)
   1495 	fe x2, z2, x3, z3, t0, t1;
   1496 	// Montgomery ladder
   1497 	// In projective coordinates, to avoid divisions: x = X / Z
   1498 	// We don't care about the y coordinate, it's only 1 bit of information
   1499 	fe_1(x2);        fe_0(z2); // "zero" point
   1500 	fe_copy(x3, x1); fe_1(z3); // "one"  point
   1501 	int swap = 0;
   1502 	for (int pos = nb_bits-1; pos >= 0; --pos) {
   1503 		// constant time conditional swap before ladder step
   1504 		int b = scalar_bit(scalar, pos);
   1505 		swap ^= b; // xor trick avoids swapping at the end of the loop
   1506 		fe_cswap(x2, x3, swap);
   1507 		fe_cswap(z2, z3, swap);
   1508 		swap = b;  // anticipates one last swap after the loop
   1509 
   1510 		// Montgomery ladder step: replaces (P2, P3) by (P2*2, P2+P3)
   1511 		// with differential addition
   1512 		fe_sub(t0, x3, z3);
   1513 		fe_sub(t1, x2, z2);
   1514 		fe_add(x2, x2, z2);
   1515 		fe_add(z2, x3, z3);
   1516 		fe_mul(z3, t0, x2);
   1517 		fe_mul(z2, z2, t1);
   1518 		fe_sq (t0, t1    );
   1519 		fe_sq (t1, x2    );
   1520 		fe_add(x3, z3, z2);
   1521 		fe_sub(z2, z3, z2);
   1522 		fe_mul(x2, t1, t0);
   1523 		fe_sub(t1, t1, t0);
   1524 		fe_sq (z2, z2    );
   1525 		fe_mul_small(z3, t1, 121666);
   1526 		fe_sq (x3, x3    );
   1527 		fe_add(t0, t0, z3);
   1528 		fe_mul(z3, x1, z2);
   1529 		fe_mul(z2, t1, t0);
   1530 	}
   1531 	// last swap is necessary to compensate for the xor trick
   1532 	// Note: after this swap, P3 == P2 + P1.
   1533 	fe_cswap(x2, x3, swap);
   1534 	fe_cswap(z2, z3, swap);
   1535 
   1536 	// normalises the coordinates: x == X / Z
   1537 	fe_invert(z2, z2);
   1538 	fe_mul(x2, x2, z2);
   1539 	fe_tobytes(q, x2);
   1540 
   1541 	WIPE_BUFFER(x1);
   1542 	WIPE_BUFFER(x2);  WIPE_BUFFER(z2);  WIPE_BUFFER(t0);
   1543 	WIPE_BUFFER(x3);  WIPE_BUFFER(z3);  WIPE_BUFFER(t1);
   1544 }
   1545 
   1546 void crypto_x25519(u8       raw_shared_secret[32],
   1547                    const u8 your_secret_key  [32],
   1548                    const u8 their_public_key [32])
   1549 {
   1550 	// restrict the possible scalar values
   1551 	u8 e[32];
   1552 	crypto_eddsa_trim_scalar(e, your_secret_key);
   1553 	scalarmult(raw_shared_secret, e, their_public_key, 255);
   1554 	WIPE_BUFFER(e);
   1555 }
   1556 
   1557 void crypto_x25519_public_key(u8       public_key[32],
   1558                               const u8 secret_key[32])
   1559 {
   1560 	static const u8 base_point[32] = {9};
   1561 	crypto_x25519(public_key, secret_key, base_point);
   1562 }
   1563 
   1564 ///////////////////////////
   1565 /// Arithmetic modulo L ///
   1566 ///////////////////////////
   1567 static const u32 L[8] = {
   1568 	0x5cf5d3ed, 0x5812631a, 0xa2f79cd6, 0x14def9de,
   1569 	0x00000000, 0x00000000, 0x00000000, 0x10000000,
   1570 };
   1571 
   1572 //  p = a*b + p
   1573 static void multiply(u32 p[16], const u32 a[8], const u32 b[8])
   1574 {
   1575 	FOR (i, 0, 8) {
   1576 		u64 carry = 0;
   1577 		FOR (j, 0, 8) {
   1578 			carry  += p[i+j] + (u64)a[i] * b[j];
   1579 			p[i+j]  = (u32)carry;
   1580 			carry >>= 32;
   1581 		}
   1582 		p[i+8] = (u32)carry;
   1583 	}
   1584 }
   1585 
   1586 static int is_above_l(const u32 x[8])
   1587 {
   1588 	// We work with L directly, in a 2's complement encoding
   1589 	// (-L == ~L + 1)
   1590 	u64 carry = 1;
   1591 	FOR (i, 0, 8) {
   1592 		carry  += (u64)x[i] + (~L[i] & 0xffffffff);
   1593 		carry >>= 32;
   1594 	}
   1595 	return (int)carry; // carry is either 0 or 1
   1596 }
   1597 
   1598 // Final reduction modulo L, by conditionally removing L.
   1599 // if x < l     , then r = x
   1600 // if l <= x 2*l, then r = x-l
   1601 // otherwise the result will be wrong
   1602 static void remove_l(u32 r[8], const u32 x[8])
   1603 {
   1604 	u64 carry = (u64)is_above_l(x);
   1605 	u32 mask  = ~(u32)carry + 1; // carry == 0 or 1
   1606 	FOR (i, 0, 8) {
   1607 		carry += (u64)x[i] + (~L[i] & mask);
   1608 		r[i]   = (u32)carry;
   1609 		carry >>= 32;
   1610 	}
   1611 }
   1612 
   1613 // Full reduction modulo L (Barrett reduction)
   1614 static void mod_l(u8 reduced[32], const u32 x[16])
   1615 {
   1616 	static const u32 r[9] = {
   1617 		0x0a2c131b,0xed9ce5a3,0x086329a7,0x2106215d,
   1618 		0xffffffeb,0xffffffff,0xffffffff,0xffffffff,0xf,
   1619 	};
   1620 	// xr = x * r
   1621 	u32 xr[25] = {0};
   1622 	FOR (i, 0, 9) {
   1623 		u64 carry = 0;
   1624 		FOR (j, 0, 16) {
   1625 			carry  += xr[i+j] + (u64)r[i] * x[j];
   1626 			xr[i+j] = (u32)carry;
   1627 			carry >>= 32;
   1628 		}
   1629 		xr[i+16] = (u32)carry;
   1630 	}
   1631 	// xr = floor(xr / 2^512) * L
   1632 	// Since the result is guaranteed to be below 2*L,
   1633 	// it is enough to only compute the first 256 bits.
   1634 	// The division is performed by saying xr[i+16]. (16 * 32 = 512)
   1635 	ZERO(xr, 8);
   1636 	FOR (i, 0, 8) {
   1637 		u64 carry = 0;
   1638 		FOR (j, 0, 8-i) {
   1639 			carry   += xr[i+j] + (u64)xr[i+16] * L[j];
   1640 			xr[i+j] = (u32)carry;
   1641 			carry >>= 32;
   1642 		}
   1643 	}
   1644 	// xr = x - xr
   1645 	u64 carry = 1;
   1646 	FOR (i, 0, 8) {
   1647 		carry  += (u64)x[i] + (~xr[i] & 0xffffffff);
   1648 		xr[i]   = (u32)carry;
   1649 		carry >>= 32;
   1650 	}
   1651 	// Final reduction modulo L (conditional subtraction)
   1652 	remove_l(xr, xr);
   1653 	store32_le_buf(reduced, xr, 8);
   1654 
   1655 	WIPE_BUFFER(xr);
   1656 }
   1657 
   1658 void crypto_eddsa_reduce(u8 reduced[32], const u8 expanded[64])
   1659 {
   1660 	u32 x[16];
   1661 	load32_le_buf(x, expanded, 16);
   1662 	mod_l(reduced, x);
   1663 	WIPE_BUFFER(x);
   1664 }
   1665 
   1666 // r = (a * b) + c
   1667 void crypto_eddsa_mul_add(u8 r[32],
   1668                           const u8 a[32], const u8 b[32], const u8 c[32])
   1669 {
   1670 	u32 A[8];  load32_le_buf(A, a, 8);
   1671 	u32 B[8];  load32_le_buf(B, b, 8);
   1672 	u32 p[16]; load32_le_buf(p, c, 8);  ZERO(p + 8, 8);
   1673 	multiply(p, A, B);
   1674 	mod_l(r, p);
   1675 	WIPE_BUFFER(p);
   1676 	WIPE_BUFFER(A);
   1677 	WIPE_BUFFER(B);
   1678 }
   1679 
   1680 ///////////////
   1681 /// Ed25519 ///
   1682 ///////////////
   1683 
   1684 // Point (group element, ge) in a twisted Edwards curve,
   1685 // in extended projective coordinates.
   1686 // ge        : x  = X/Z, y  = Y/Z, T  = XY/Z
   1687 // ge_cached : Yp = X+Y, Ym = X-Y, T2 = T*D2
   1688 // ge_precomp: Z  = 1
   1689 typedef struct { fe X;  fe Y;  fe Z; fe T;  } ge;
   1690 typedef struct { fe Yp; fe Ym; fe Z; fe T2; } ge_cached;
   1691 typedef struct { fe Yp; fe Ym;       fe T2; } ge_precomp;
   1692 
   1693 static void ge_zero(ge *p)
   1694 {
   1695 	fe_0(p->X);
   1696 	fe_1(p->Y);
   1697 	fe_1(p->Z);
   1698 	fe_0(p->T);
   1699 }
   1700 
   1701 static void ge_tobytes(u8 s[32], const ge *h)
   1702 {
   1703 	fe recip, x, y;
   1704 	fe_invert(recip, h->Z);
   1705 	fe_mul(x, h->X, recip);
   1706 	fe_mul(y, h->Y, recip);
   1707 	fe_tobytes(s, y);
   1708 	s[31] ^= fe_isodd(x) << 7;
   1709 
   1710 	WIPE_BUFFER(recip);
   1711 	WIPE_BUFFER(x);
   1712 	WIPE_BUFFER(y);
   1713 }
   1714 
   1715 // h = -s, where s is a point encoded in 32 bytes
   1716 //
   1717 // Variable time!  Inputs must not be secret!
   1718 // => Use only to *check* signatures.
   1719 //
   1720 // From the specifications:
   1721 //   The encoding of s contains y and the sign of x
   1722 //   x = sqrt((y^2 - 1) / (d*y^2 + 1))
   1723 // In extended coordinates:
   1724 //   X = x, Y = y, Z = 1, T = x*y
   1725 //
   1726 //    Note that num * den is a square iff num / den is a square
   1727 //    If num * den is not a square, the point was not on the curve.
   1728 // From the above:
   1729 //   Let num =   y^2 - 1
   1730 //   Let den = d*y^2 + 1
   1731 //   x = sqrt((y^2 - 1) / (d*y^2 + 1))
   1732 //   x = sqrt(num / den)
   1733 //   x = sqrt(num^2 / (num * den))
   1734 //   x = num * sqrt(1 / (num * den))
   1735 //
   1736 // Therefore, we can just compute:
   1737 //   num =   y^2 - 1
   1738 //   den = d*y^2 + 1
   1739 //   isr = invsqrt(num * den)  // abort if not square
   1740 //   x   = num * isr
   1741 // Finally, negate x if its sign is not as specified.
   1742 static int ge_frombytes_neg_vartime(ge *h, const u8 s[32])
   1743 {
   1744 	fe_frombytes(h->Y, s);
   1745 	fe_1(h->Z);
   1746 	fe_sq (h->T, h->Y);        // t =   y^2
   1747 	fe_mul(h->X, h->T, d   );  // x = d*y^2
   1748 	fe_sub(h->T, h->T, h->Z);  // t =   y^2 - 1
   1749 	fe_add(h->X, h->X, h->Z);  // x = d*y^2 + 1
   1750 	fe_mul(h->X, h->T, h->X);  // x = (y^2 - 1) * (d*y^2 + 1)
   1751 	int is_square = invsqrt(h->X, h->X);
   1752 	if (!is_square) {
   1753 		return -1;             // Not on the curve, abort
   1754 	}
   1755 	fe_mul(h->X, h->T, h->X);  // x = sqrt((y^2 - 1) / (d*y^2 + 1))
   1756 	if (fe_isodd(h->X) == (s[31] >> 7)) {
   1757 		fe_neg(h->X, h->X);
   1758 	}
   1759 	fe_mul(h->T, h->X, h->Y);
   1760 	return 0;
   1761 }
   1762 
   1763 static void ge_cache(ge_cached *c, const ge *p)
   1764 {
   1765 	fe_add (c->Yp, p->Y, p->X);
   1766 	fe_sub (c->Ym, p->Y, p->X);
   1767 	fe_copy(c->Z , p->Z      );
   1768 	fe_mul (c->T2, p->T, D2  );
   1769 }
   1770 
   1771 // Internal buffers are not wiped! Inputs must not be secret!
   1772 // => Use only to *check* signatures.
   1773 static void ge_add(ge *s, const ge *p, const ge_cached *q)
   1774 {
   1775 	fe a, b;
   1776 	fe_add(a   , p->Y, p->X );
   1777 	fe_sub(b   , p->Y, p->X );
   1778 	fe_mul(a   , a   , q->Yp);
   1779 	fe_mul(b   , b   , q->Ym);
   1780 	fe_add(s->Y, a   , b    );
   1781 	fe_sub(s->X, a   , b    );
   1782 
   1783 	fe_add(s->Z, p->Z, p->Z );
   1784 	fe_mul(s->Z, s->Z, q->Z );
   1785 	fe_mul(s->T, p->T, q->T2);
   1786 	fe_add(a   , s->Z, s->T );
   1787 	fe_sub(b   , s->Z, s->T );
   1788 
   1789 	fe_mul(s->T, s->X, s->Y);
   1790 	fe_mul(s->X, s->X, b   );
   1791 	fe_mul(s->Y, s->Y, a   );
   1792 	fe_mul(s->Z, a   , b   );
   1793 }
   1794 
   1795 // Internal buffers are not wiped! Inputs must not be secret!
   1796 // => Use only to *check* signatures.
   1797 static void ge_sub(ge *s, const ge *p, const ge_cached *q)
   1798 {
   1799 	ge_cached neg;
   1800 	fe_copy(neg.Ym, q->Yp);
   1801 	fe_copy(neg.Yp, q->Ym);
   1802 	fe_copy(neg.Z , q->Z );
   1803 	fe_neg (neg.T2, q->T2);
   1804 	ge_add(s, p, &neg);
   1805 }
   1806 
   1807 static void ge_madd(ge *s, const ge *p, const ge_precomp *q, fe a, fe b)
   1808 {
   1809 	fe_add(a   , p->Y, p->X );
   1810 	fe_sub(b   , p->Y, p->X );
   1811 	fe_mul(a   , a   , q->Yp);
   1812 	fe_mul(b   , b   , q->Ym);
   1813 	fe_add(s->Y, a   , b    );
   1814 	fe_sub(s->X, a   , b    );
   1815 
   1816 	fe_add(s->Z, p->Z, p->Z );
   1817 	fe_mul(s->T, p->T, q->T2);
   1818 	fe_add(a   , s->Z, s->T );
   1819 	fe_sub(b   , s->Z, s->T );
   1820 
   1821 	fe_mul(s->T, s->X, s->Y);
   1822 	fe_mul(s->X, s->X, b   );
   1823 	fe_mul(s->Y, s->Y, a   );
   1824 	fe_mul(s->Z, a   , b   );
   1825 }
   1826 
   1827 // Internal buffers are not wiped! Inputs must not be secret!
   1828 // => Use only to *check* signatures.
   1829 static void ge_msub(ge *s, const ge *p, const ge_precomp *q, fe a, fe b)
   1830 {
   1831 	ge_precomp neg;
   1832 	fe_copy(neg.Ym, q->Yp);
   1833 	fe_copy(neg.Yp, q->Ym);
   1834 	fe_neg (neg.T2, q->T2);
   1835 	ge_madd(s, p, &neg, a, b);
   1836 }
   1837 
   1838 static void ge_double(ge *s, const ge *p, ge *q)
   1839 {
   1840 	fe_sq (q->X, p->X);
   1841 	fe_sq (q->Y, p->Y);
   1842 	fe_sq (q->Z, p->Z);          // qZ = pZ^2
   1843 	fe_mul_small(q->Z, q->Z, 2); // qZ = pZ^2 * 2
   1844 	fe_add(q->T, p->X, p->Y);
   1845 	fe_sq (s->T, q->T);
   1846 	fe_add(q->T, q->Y, q->X);
   1847 	fe_sub(q->Y, q->Y, q->X);
   1848 	fe_sub(q->X, s->T, q->T);
   1849 	fe_sub(q->Z, q->Z, q->Y);
   1850 
   1851 	fe_mul(s->X, q->X , q->Z);
   1852 	fe_mul(s->Y, q->T , q->Y);
   1853 	fe_mul(s->Z, q->Y , q->Z);
   1854 	fe_mul(s->T, q->X , q->T);
   1855 }
   1856 
   1857 // 5-bit signed window in cached format (Niels coordinates, Z=1)
   1858 static const ge_precomp b_window[8] = {
   1859 	{{25967493,-14356035,29566456,3660896,-12694345,
   1860 	  4014787,27544626,-11754271,-6079156,2047605,},
   1861 	 {-12545711,934262,-2722910,3049990,-727428,
   1862 	  9406986,12720692,5043384,19500929,-15469378,},
   1863 	 {-8738181,4489570,9688441,-14785194,10184609,
   1864 	  -12363380,29287919,11864899,-24514362,-4438546,},},
   1865 	{{15636291,-9688557,24204773,-7912398,616977,
   1866 	  -16685262,27787600,-14772189,28944400,-1550024,},
   1867 	 {16568933,4717097,-11556148,-1102322,15682896,
   1868 	  -11807043,16354577,-11775962,7689662,11199574,},
   1869 	 {30464156,-5976125,-11779434,-15670865,23220365,
   1870 	  15915852,7512774,10017326,-17749093,-9920357,},},
   1871 	{{10861363,11473154,27284546,1981175,-30064349,
   1872 	  12577861,32867885,14515107,-15438304,10819380,},
   1873 	 {4708026,6336745,20377586,9066809,-11272109,
   1874 	  6594696,-25653668,12483688,-12668491,5581306,},
   1875 	 {19563160,16186464,-29386857,4097519,10237984,
   1876 	  -4348115,28542350,13850243,-23678021,-15815942,},},
   1877 	{{5153746,9909285,1723747,-2777874,30523605,
   1878 	  5516873,19480852,5230134,-23952439,-15175766,},
   1879 	 {-30269007,-3463509,7665486,10083793,28475525,
   1880 	  1649722,20654025,16520125,30598449,7715701,},
   1881 	 {28881845,14381568,9657904,3680757,-20181635,
   1882 	  7843316,-31400660,1370708,29794553,-1409300,},},
   1883 	{{-22518993,-6692182,14201702,-8745502,-23510406,
   1884 	  8844726,18474211,-1361450,-13062696,13821877,},
   1885 	 {-6455177,-7839871,3374702,-4740862,-27098617,
   1886 	  -10571707,31655028,-7212327,18853322,-14220951,},
   1887 	 {4566830,-12963868,-28974889,-12240689,-7602672,
   1888 	  -2830569,-8514358,-10431137,2207753,-3209784,},},
   1889 	{{-25154831,-4185821,29681144,7868801,-6854661,
   1890 	  -9423865,-12437364,-663000,-31111463,-16132436,},
   1891 	 {25576264,-2703214,7349804,-11814844,16472782,
   1892 	  9300885,3844789,15725684,171356,6466918,},
   1893 	 {23103977,13316479,9739013,-16149481,817875,
   1894 	  -15038942,8965339,-14088058,-30714912,16193877,},},
   1895 	{{-33521811,3180713,-2394130,14003687,-16903474,
   1896 	  -16270840,17238398,4729455,-18074513,9256800,},
   1897 	 {-25182317,-4174131,32336398,5036987,-21236817,
   1898 	  11360617,22616405,9761698,-19827198,630305,},
   1899 	 {-13720693,2639453,-24237460,-7406481,9494427,
   1900 	  -5774029,-6554551,-15960994,-2449256,-14291300,},},
   1901 	{{-3151181,-5046075,9282714,6866145,-31907062,
   1902 	  -863023,-18940575,15033784,25105118,-7894876,},
   1903 	 {-24326370,15950226,-31801215,-14592823,-11662737,
   1904 	  -5090925,1573892,-2625887,2198790,-15804619,},
   1905 	 {-3099351,10324967,-2241613,7453183,-5446979,
   1906 	  -2735503,-13812022,-16236442,-32461234,-12290683,},},
   1907 };
   1908 
   1909 // Incremental sliding windows (left to right)
   1910 // Based on Roberto Maria Avanzi[2005]
   1911 typedef struct {
   1912 	i16 next_index; // position of the next signed digit
   1913 	i8  next_digit; // next signed digit (odd number below 2^window_width)
   1914 	u8  next_check; // point at which we must check for a new window
   1915 } slide_ctx;
   1916 
   1917 static void slide_init(slide_ctx *ctx, const u8 scalar[32])
   1918 {
   1919 	// scalar is guaranteed to be below L, either because we checked (s),
   1920 	// or because we reduced it modulo L (h_ram). L is under 2^253, so
   1921 	// so bits 253 to 255 are guaranteed to be zero. No need to test them.
   1922 	//
   1923 	// Note however that L is very close to 2^252, so bit 252 is almost
   1924 	// always zero.  If we were to start at bit 251, the tests wouldn't
   1925 	// catch the off-by-one error (constructing one that does would be
   1926 	// prohibitively expensive).
   1927 	//
   1928 	// We should still check bit 252, though.
   1929 	int i = 252;
   1930 	while (i > 0 && scalar_bit(scalar, i) == 0) {
   1931 		i--;
   1932 	}
   1933 	ctx->next_check = (u8)(i + 1);
   1934 	ctx->next_index = -1;
   1935 	ctx->next_digit = -1;
   1936 }
   1937 
   1938 static int slide_step(slide_ctx *ctx, int width, int i, const u8 scalar[32])
   1939 {
   1940 	if (i == ctx->next_check) {
   1941 		if (scalar_bit(scalar, i) == scalar_bit(scalar, i - 1)) {
   1942 			ctx->next_check--;
   1943 		} else {
   1944 			// compute digit of next window
   1945 			int w = MIN(width, i + 1);
   1946 			int v = -(scalar_bit(scalar, i) << (w-1));
   1947 			FOR_T (int, j, 0, w-1) {
   1948 				v += scalar_bit(scalar, i-(w-1)+j) << j;
   1949 			}
   1950 			v += scalar_bit(scalar, i-w);
   1951 			int lsb = v & (~v + 1); // smallest bit of v
   1952 			int s   =               // log2(lsb)
   1953 				(((lsb & 0xAA) != 0) << 0) |
   1954 				(((lsb & 0xCC) != 0) << 1) |
   1955 				(((lsb & 0xF0) != 0) << 2);
   1956 			ctx->next_index  = (i16)(i-(w-1)+s);
   1957 			ctx->next_digit  = (i8) (v >> s   );
   1958 			ctx->next_check -= (u8) w;
   1959 		}
   1960 	}
   1961 	return i == ctx->next_index ? ctx->next_digit: 0;
   1962 }
   1963 
   1964 #define P_W_WIDTH 3 // Affects the size of the stack
   1965 #define B_W_WIDTH 5 // Affects the size of the binary
   1966 #define P_W_SIZE  (1<<(P_W_WIDTH-2))
   1967 
   1968 int crypto_eddsa_check_equation(const u8 signature[64], const u8 public_key[32],
   1969                                 const u8 h[32])
   1970 {
   1971 	ge minus_A; // -public_key
   1972 	ge minus_R; // -first_half_of_signature
   1973 	const u8 *s = signature + 32;
   1974 
   1975 	// Check that A and R are on the curve
   1976 	// Check that 0 <= S < L (prevents malleability)
   1977 	// *Allow* non-cannonical encoding for A and R
   1978 	{
   1979 		u32 s32[8];
   1980 		load32_le_buf(s32, s, 8);
   1981 		if (ge_frombytes_neg_vartime(&minus_A, public_key) ||
   1982 		    ge_frombytes_neg_vartime(&minus_R, signature)  ||
   1983 		    is_above_l(s32)) {
   1984 			return -1;
   1985 		}
   1986 	}
   1987 
   1988 	// look-up table for minus_A
   1989 	ge_cached lutA[P_W_SIZE];
   1990 	{
   1991 		ge minus_A2, tmp;
   1992 		ge_double(&minus_A2, &minus_A, &tmp);
   1993 		ge_cache(&lutA[0], &minus_A);
   1994 		FOR (i, 1, P_W_SIZE) {
   1995 			ge_add(&tmp, &minus_A2, &lutA[i-1]);
   1996 			ge_cache(&lutA[i], &tmp);
   1997 		}
   1998 	}
   1999 
   2000 	// sum = [s]B - [h]A
   2001 	// Merged double and add ladder, fused with sliding
   2002 	slide_ctx h_slide;  slide_init(&h_slide, h);
   2003 	slide_ctx s_slide;  slide_init(&s_slide, s);
   2004 	int i = MAX(h_slide.next_check, s_slide.next_check);
   2005 	ge *sum = &minus_A; // reuse minus_A for the sum
   2006 	ge_zero(sum);
   2007 	while (i >= 0) {
   2008 		ge tmp;
   2009 		ge_double(sum, sum, &tmp);
   2010 		int h_digit = slide_step(&h_slide, P_W_WIDTH, i, h);
   2011 		int s_digit = slide_step(&s_slide, B_W_WIDTH, i, s);
   2012 		if (h_digit > 0) { ge_add(sum, sum, &lutA[ h_digit / 2]); }
   2013 		if (h_digit < 0) { ge_sub(sum, sum, &lutA[-h_digit / 2]); }
   2014 		fe t1, t2;
   2015 		if (s_digit > 0) { ge_madd(sum, sum, b_window +  s_digit/2, t1, t2); }
   2016 		if (s_digit < 0) { ge_msub(sum, sum, b_window + -s_digit/2, t1, t2); }
   2017 		i--;
   2018 	}
   2019 
   2020 	// Compare [8](sum-R) and the zero point
   2021 	// The multiplication by 8 eliminates any low-order component
   2022 	// and ensures consistency with batched verification.
   2023 	ge_cached cached;
   2024 	u8 check[32];
   2025 	static const u8 zero_point[32] = {1}; // Point of order 1
   2026 	ge_cache(&cached, &minus_R);
   2027 	ge_add(sum, sum, &cached);
   2028 	ge_double(sum, sum, &minus_R); // reuse minus_R as temporary
   2029 	ge_double(sum, sum, &minus_R); // reuse minus_R as temporary
   2030 	ge_double(sum, sum, &minus_R); // reuse minus_R as temporary
   2031 	ge_tobytes(check, sum);
   2032 	return crypto_verify32(check, zero_point);
   2033 }
   2034 
   2035 // 5-bit signed comb in cached format (Niels coordinates, Z=1)
   2036 static const ge_precomp b_comb_low[8] = {
   2037 	{{-6816601,-2324159,-22559413,124364,18015490,
   2038 	  8373481,19993724,1979872,-18549925,9085059,},
   2039 	 {10306321,403248,14839893,9633706,8463310,
   2040 	  -8354981,-14305673,14668847,26301366,2818560,},
   2041 	 {-22701500,-3210264,-13831292,-2927732,-16326337,
   2042 	  -14016360,12940910,177905,12165515,-2397893,},},
   2043 	{{-12282262,-7022066,9920413,-3064358,-32147467,
   2044 	  2927790,22392436,-14852487,2719975,16402117,},
   2045 	 {-7236961,-4729776,2685954,-6525055,-24242706,
   2046 	  -15940211,-6238521,14082855,10047669,12228189,},
   2047 	 {-30495588,-12893761,-11161261,3539405,-11502464,
   2048 	  16491580,-27286798,-15030530,-7272871,-15934455,},},
   2049 	{{17650926,582297,-860412,-187745,-12072900,
   2050 	  -10683391,-20352381,15557840,-31072141,-5019061,},
   2051 	 {-6283632,-2259834,-4674247,-4598977,-4089240,
   2052 	  12435688,-31278303,1060251,6256175,10480726,},
   2053 	 {-13871026,2026300,-21928428,-2741605,-2406664,
   2054 	  -8034988,7355518,15733500,-23379862,7489131,},},
   2055 	{{6883359,695140,23196907,9644202,-33430614,
   2056 	  11354760,-20134606,6388313,-8263585,-8491918,},
   2057 	 {-7716174,-13605463,-13646110,14757414,-19430591,
   2058 	  -14967316,10359532,-11059670,-21935259,12082603,},
   2059 	 {-11253345,-15943946,10046784,5414629,24840771,
   2060 	  8086951,-6694742,9868723,15842692,-16224787,},},
   2061 	{{9639399,11810955,-24007778,-9320054,3912937,
   2062 	  -9856959,996125,-8727907,-8919186,-14097242,},
   2063 	 {7248867,14468564,25228636,-8795035,14346339,
   2064 	  8224790,6388427,-7181107,6468218,-8720783,},
   2065 	 {15513115,15439095,7342322,-10157390,18005294,
   2066 	  -7265713,2186239,4884640,10826567,7135781,},},
   2067 	{{-14204238,5297536,-5862318,-6004934,28095835,
   2068 	  4236101,-14203318,1958636,-16816875,3837147,},
   2069 	 {-5511166,-13176782,-29588215,12339465,15325758,
   2070 	  -15945770,-8813185,11075932,-19608050,-3776283,},
   2071 	 {11728032,9603156,-4637821,-5304487,-7827751,
   2072 	  2724948,31236191,-16760175,-7268616,14799772,},},
   2073 	{{-28842672,4840636,-12047946,-9101456,-1445464,
   2074 	  381905,-30977094,-16523389,1290540,12798615,},
   2075 	 {27246947,-10320914,14792098,-14518944,5302070,
   2076 	  -8746152,-3403974,-4149637,-27061213,10749585,},
   2077 	 {25572375,-6270368,-15353037,16037944,1146292,
   2078 	  32198,23487090,9585613,24714571,-1418265,},},
   2079 	{{19844825,282124,-17583147,11004019,-32004269,
   2080 	  -2716035,6105106,-1711007,-21010044,14338445,},
   2081 	 {8027505,8191102,-18504907,-12335737,25173494,
   2082 	  -5923905,15446145,7483684,-30440441,10009108,},
   2083 	 {-14134701,-4174411,10246585,-14677495,33553567,
   2084 	  -14012935,23366126,15080531,-7969992,7663473,},},
   2085 };
   2086 
   2087 static const ge_precomp b_comb_high[8] = {
   2088 	{{33055887,-4431773,-521787,6654165,951411,
   2089 	  -6266464,-5158124,6995613,-5397442,-6985227,},
   2090 	 {4014062,6967095,-11977872,3960002,8001989,
   2091 	  5130302,-2154812,-1899602,-31954493,-16173976,},
   2092 	 {16271757,-9212948,23792794,731486,-25808309,
   2093 	  -3546396,6964344,-4767590,10976593,10050757,},},
   2094 	{{2533007,-4288439,-24467768,-12387405,-13450051,
   2095 	  14542280,12876301,13893535,15067764,8594792,},
   2096 	 {20073501,-11623621,3165391,-13119866,13188608,
   2097 	  -11540496,-10751437,-13482671,29588810,2197295,},
   2098 	 {-1084082,11831693,6031797,14062724,14748428,
   2099 	  -8159962,-20721760,11742548,31368706,13161200,},},
   2100 	{{2050412,-6457589,15321215,5273360,25484180,
   2101 	  124590,-18187548,-7097255,-6691621,-14604792,},
   2102 	 {9938196,2162889,-6158074,-1711248,4278932,
   2103 	  -2598531,-22865792,-7168500,-24323168,11746309,},
   2104 	 {-22691768,-14268164,5965485,9383325,20443693,
   2105 	  5854192,28250679,-1381811,-10837134,13717818,},},
   2106 	{{-8495530,16382250,9548884,-4971523,-4491811,
   2107 	  -3902147,6182256,-12832479,26628081,10395408,},
   2108 	 {27329048,-15853735,7715764,8717446,-9215518,
   2109 	  -14633480,28982250,-5668414,4227628,242148,},
   2110 	 {-13279943,-7986904,-7100016,8764468,-27276630,
   2111 	  3096719,29678419,-9141299,3906709,11265498,},},
   2112 	{{11918285,15686328,-17757323,-11217300,-27548967,
   2113 	  4853165,-27168827,6807359,6871949,-1075745,},
   2114 	 {-29002610,13984323,-27111812,-2713442,28107359,
   2115 	  -13266203,6155126,15104658,3538727,-7513788,},
   2116 	 {14103158,11233913,-33165269,9279850,31014152,
   2117 	  4335090,-1827936,4590951,13960841,12787712,},},
   2118 	{{1469134,-16738009,33411928,13942824,8092558,
   2119 	  -8778224,-11165065,1437842,22521552,-2792954,},
   2120 	 {31352705,-4807352,-25327300,3962447,12541566,
   2121 	  -9399651,-27425693,7964818,-23829869,5541287,},
   2122 	 {-25732021,-6864887,23848984,3039395,-9147354,
   2123 	  6022816,-27421653,10590137,25309915,-1584678,},},
   2124 	{{-22951376,5048948,31139401,-190316,-19542447,
   2125 	  -626310,-17486305,-16511925,-18851313,-12985140,},
   2126 	 {-9684890,14681754,30487568,7717771,-10829709,
   2127 	  9630497,30290549,-10531496,-27798994,-13812825,},
   2128 	 {5827835,16097107,-24501327,12094619,7413972,
   2129 	  11447087,28057551,-1793987,-14056981,4359312,},},
   2130 	{{26323183,2342588,-21887793,-1623758,-6062284,
   2131 	  2107090,-28724907,9036464,-19618351,-13055189,},
   2132 	 {-29697200,14829398,-4596333,14220089,-30022969,
   2133 	  2955645,12094100,-13693652,-5941445,7047569,},
   2134 	 {-3201977,14413268,-12058324,-16417589,-9035655,
   2135 	  -7224648,9258160,1399236,30397584,-5684634,},},
   2136 };
   2137 
   2138 static void lookup_add(ge *p, ge_precomp *tmp_c, fe tmp_a, fe tmp_b,
   2139                        const ge_precomp comb[8], const u8 scalar[32], int i)
   2140 {
   2141 	u8 teeth = (u8)((scalar_bit(scalar, i)          ) +
   2142 	                (scalar_bit(scalar, i + 32) << 1) +
   2143 	                (scalar_bit(scalar, i + 64) << 2) +
   2144 	                (scalar_bit(scalar, i + 96) << 3));
   2145 	u8 high  = teeth >> 3;
   2146 	u8 index = (teeth ^ (high - 1)) & 7;
   2147 	FOR (j, 0, 8) {
   2148 		i32 select = 1 & (((j ^ index) - 1) >> 8);
   2149 		fe_ccopy(tmp_c->Yp, comb[j].Yp, select);
   2150 		fe_ccopy(tmp_c->Ym, comb[j].Ym, select);
   2151 		fe_ccopy(tmp_c->T2, comb[j].T2, select);
   2152 	}
   2153 	fe_neg(tmp_a, tmp_c->T2);
   2154 	fe_cswap(tmp_c->T2, tmp_a    , high ^ 1);
   2155 	fe_cswap(tmp_c->Yp, tmp_c->Ym, high ^ 1);
   2156 	ge_madd(p, p, tmp_c, tmp_a, tmp_b);
   2157 }
   2158 
   2159 // p = [scalar]B, where B is the base point
   2160 static void ge_scalarmult_base(ge *p, const u8 scalar[32])
   2161 {
   2162 	// twin 4-bits signed combs, from Mike Hamburg's
   2163 	// Fast and compact elliptic-curve cryptography (2012)
   2164 	// 1 / 2 modulo L
   2165 	static const u8 half_mod_L[32] = {
   2166 		247,233,122,46,141,49,9,44,107,206,123,81,239,124,111,10,
   2167 		0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,8,
   2168 	};
   2169 	// (2^256 - 1) / 2 modulo L
   2170 	static const u8 half_ones[32] = {
   2171 		142,74,204,70,186,24,118,107,184,231,190,57,250,173,119,99,
   2172 		255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,7,
   2173 	};
   2174 
   2175 	// All bits set form: 1 means 1, 0 means -1
   2176 	u8 s_scalar[32];
   2177 	crypto_eddsa_mul_add(s_scalar, scalar, half_mod_L, half_ones);
   2178 
   2179 	// Double and add ladder
   2180 	fe tmp_a, tmp_b;  // temporaries for addition
   2181 	ge_precomp tmp_c; // temporary for comb lookup
   2182 	ge tmp_d;         // temporary for doubling
   2183 	fe_1(tmp_c.Yp);
   2184 	fe_1(tmp_c.Ym);
   2185 	fe_0(tmp_c.T2);
   2186 
   2187 	// Save a double on the first iteration
   2188 	ge_zero(p);
   2189 	lookup_add(p, &tmp_c, tmp_a, tmp_b, b_comb_low , s_scalar, 31);
   2190 	lookup_add(p, &tmp_c, tmp_a, tmp_b, b_comb_high, s_scalar, 31+128);
   2191 	// Regular double & add for the rest
   2192 	for (int i = 30; i >= 0; i--) {
   2193 		ge_double(p, p, &tmp_d);
   2194 		lookup_add(p, &tmp_c, tmp_a, tmp_b, b_comb_low , s_scalar, i);
   2195 		lookup_add(p, &tmp_c, tmp_a, tmp_b, b_comb_high, s_scalar, i+128);
   2196 	}
   2197 	// Note: we could save one addition at the end if we assumed the
   2198 	// scalar fit in 252 bits.  Which it does in practice if it is
   2199 	// selected at random.  However, non-random, non-hashed scalars
   2200 	// *can* overflow 252 bits in practice.  Better account for that
   2201 	// than leaving that kind of subtle corner case.
   2202 
   2203 	WIPE_BUFFER(tmp_a);  WIPE_CTX(&tmp_d);
   2204 	WIPE_BUFFER(tmp_b);  WIPE_CTX(&tmp_c);
   2205 	WIPE_BUFFER(s_scalar);
   2206 }
   2207 
   2208 void crypto_eddsa_scalarbase(u8 point[32], const u8 scalar[32])
   2209 {
   2210 	ge P;
   2211 	ge_scalarmult_base(&P, scalar);
   2212 	ge_tobytes(point, &P);
   2213 	WIPE_CTX(&P);
   2214 }
   2215 
   2216 void crypto_eddsa_key_pair(u8 secret_key[64], u8 public_key[32], u8 seed[32])
   2217 {
   2218 	// To allow overlaps, observable writes happen in this order:
   2219 	// 1. seed
   2220 	// 2. secret_key
   2221 	// 3. public_key
   2222 	u8 a[64];
   2223 	COPY(a, seed, 32);
   2224 	crypto_wipe(seed, 32);
   2225 	COPY(secret_key, a, 32);
   2226 	crypto_blake2b(a, 64, a, 32);
   2227 	crypto_eddsa_trim_scalar(a, a);
   2228 	crypto_eddsa_scalarbase(secret_key + 32, a);
   2229 	COPY(public_key, secret_key + 32, 32);
   2230 	WIPE_BUFFER(a);
   2231 }
   2232 
   2233 static void hash_reduce(u8 h[32],
   2234                         const u8 *a, size_t a_size,
   2235                         const u8 *b, size_t b_size,
   2236                         const u8 *c, size_t c_size)
   2237 {
   2238 	u8 hash[64];
   2239 	crypto_blake2b_ctx ctx;
   2240 	crypto_blake2b_init  (&ctx, 64);
   2241 	crypto_blake2b_update(&ctx, a, a_size);
   2242 	crypto_blake2b_update(&ctx, b, b_size);
   2243 	crypto_blake2b_update(&ctx, c, c_size);
   2244 	crypto_blake2b_final (&ctx, hash);
   2245 	crypto_eddsa_reduce(h, hash);
   2246 }
   2247 
   2248 // Digital signature of a message with from a secret key.
   2249 //
   2250 // The secret key comprises two parts:
   2251 // - The seed that generates the key (secret_key[ 0..31])
   2252 // - The public key                  (secret_key[32..63])
   2253 //
   2254 // The seed and the public key are bundled together to make sure users
   2255 // don't use mismatched seeds and public keys, which would instantly
   2256 // leak the secret scalar and allow forgeries (allowing this to happen
   2257 // has resulted in critical vulnerabilities in the wild).
   2258 //
   2259 // The seed is hashed to derive the secret scalar and a secret prefix.
   2260 // The sole purpose of the prefix is to generate a secret random nonce.
   2261 // The properties of that nonce must be as follows:
   2262 // - Unique: we need a different one for each message.
   2263 // - Secret: third parties must not be able to predict it.
   2264 // - Random: any detectable bias would break all security.
   2265 //
   2266 // There are two ways to achieve these properties.  The obvious one is
   2267 // to simply generate a random number.  Here that would be a parameter
   2268 // (Monocypher doesn't have an RNG).  It works, but then users may reuse
   2269 // the nonce by accident, which _also_ leaks the secret scalar and
   2270 // allows forgeries.  This has happened in the wild too.
   2271 //
   2272 // This is no good, so instead we generate that nonce deterministically
   2273 // by reducing modulo L a hash of the secret prefix and the message.
   2274 // The secret prefix makes the nonce unpredictable, the message makes it
   2275 // unique, and the hash/reduce removes all bias.
   2276 //
   2277 // The cost of that safety is hashing the message twice.  If that cost
   2278 // is unacceptable, there are two alternatives:
   2279 //
   2280 // - Signing a hash of the message instead of the message itself.  This
   2281 //   is fine as long as the hash is collision resistant. It is not
   2282 //   compatible with existing "pure" signatures, but at least it's safe.
   2283 //
   2284 // - Using a random nonce.  Please exercise **EXTREME CAUTION** if you
   2285 //   ever do that.  It is absolutely **critical** that the nonce is
   2286 //   really an unbiased random number between 0 and L-1, never reused,
   2287 //   and wiped immediately.
   2288 //
   2289 //   To lower the likelihood of complete catastrophe if the RNG is
   2290 //   either flawed or misused, you can hash the RNG output together with
   2291 //   the secret prefix and the beginning of the message, and use the
   2292 //   reduction of that hash instead of the RNG output itself.  It's not
   2293 //   foolproof (you'd need to hash the whole message) but it helps.
   2294 //
   2295 // Signing a message involves the following operations:
   2296 //
   2297 //   scalar, prefix = HASH(secret_key)
   2298 //   r              = HASH(prefix || message) % L
   2299 //   R              = [r]B
   2300 //   h              = HASH(R || public_key || message) % L
   2301 //   S              = ((h * a) + r) % L
   2302 //   signature      = R || S
   2303 void crypto_eddsa_sign(u8 signature [64], const u8 secret_key[64],
   2304                        const u8 *message, size_t message_size)
   2305 {
   2306 	u8 a[64];  // secret scalar and prefix
   2307 	u8 r[32];  // secret deterministic "random" nonce
   2308 	u8 h[32];  // publically verifiable hash of the message (not wiped)
   2309 	u8 R[32];  // first half of the signature (allows overlapping inputs)
   2310 
   2311 	crypto_blake2b(a, 64, secret_key, 32);
   2312 	crypto_eddsa_trim_scalar(a, a);
   2313 	hash_reduce(r, a + 32, 32, message, message_size, 0, 0);
   2314 	crypto_eddsa_scalarbase(R, r);
   2315 	hash_reduce(h, R, 32, secret_key + 32, 32, message, message_size);
   2316 	COPY(signature, R, 32);
   2317 	crypto_eddsa_mul_add(signature + 32, h, a, r);
   2318 
   2319 	WIPE_BUFFER(a);
   2320 	WIPE_BUFFER(r);
   2321 }
   2322 
   2323 // To check the signature R, S of the message M with the public key A,
   2324 // there are 3 steps:
   2325 //
   2326 //   compute h = HASH(R || A || message) % L
   2327 //   check that A is on the curve.
   2328 //   check that R == [s]B - [h]A
   2329 //
   2330 // The last two steps are done in crypto_eddsa_check_equation()
   2331 int crypto_eddsa_check(const u8  signature[64], const u8 public_key[32],
   2332                        const u8 *message, size_t message_size)
   2333 {
   2334 	u8 h[32];
   2335 	hash_reduce(h, signature, 32, public_key, 32, message, message_size);
   2336 	return crypto_eddsa_check_equation(signature, public_key, h);
   2337 }
   2338 
   2339 /////////////////////////
   2340 /// EdDSA <--> X25519 ///
   2341 /////////////////////////
   2342 void crypto_eddsa_to_x25519(u8 x25519[32], const u8 eddsa[32])
   2343 {
   2344 	// (u, v) = ((1+y)/(1-y), sqrt(-486664)*u/x)
   2345 	// Only converting y to u, the sign of x is ignored.
   2346 	fe t1, t2;
   2347 	fe_frombytes(t2, eddsa);
   2348 	fe_add(t1, fe_one, t2);
   2349 	fe_sub(t2, fe_one, t2);
   2350 	fe_invert(t2, t2);
   2351 	fe_mul(t1, t1, t2);
   2352 	fe_tobytes(x25519, t1);
   2353 	WIPE_BUFFER(t1);
   2354 	WIPE_BUFFER(t2);
   2355 }
   2356 
   2357 void crypto_x25519_to_eddsa(u8 eddsa[32], const u8 x25519[32])
   2358 {
   2359 	// (x, y) = (sqrt(-486664)*u/v, (u-1)/(u+1))
   2360 	// Only converting u to y, x is assumed positive.
   2361 	fe t1, t2;
   2362 	fe_frombytes(t2, x25519);
   2363 	fe_sub(t1, t2, fe_one);
   2364 	fe_add(t2, t2, fe_one);
   2365 	fe_invert(t2, t2);
   2366 	fe_mul(t1, t1, t2);
   2367 	fe_tobytes(eddsa, t1);
   2368 	WIPE_BUFFER(t1);
   2369 	WIPE_BUFFER(t2);
   2370 }
   2371 
   2372 /////////////////////////////////////////////
   2373 /// Dirty ephemeral public key generation ///
   2374 /////////////////////////////////////////////
   2375 
   2376 // Those functions generates a public key, *without* clearing the
   2377 // cofactor.  Sending that key over the network leaks 3 bits of the
   2378 // private key.  Use only to generate ephemeral keys that will be hidden
   2379 // with crypto_curve_to_hidden().
   2380 //
   2381 // The public key is otherwise compatible with crypto_x25519(), which
   2382 // properly clears the cofactor.
   2383 //
   2384 // Note that the distribution of the resulting public keys is almost
   2385 // uniform.  Flipping the sign of the v coordinate (not provided by this
   2386 // function), covers the entire key space almost perfectly, where
   2387 // "almost" means a 2^-128 bias (undetectable).  This uniformity is
   2388 // needed to ensure the proper randomness of the resulting
   2389 // representatives (once we apply crypto_curve_to_hidden()).
   2390 //
   2391 // Recall that Curve25519 has order C = 2^255 + e, with e < 2^128 (not
   2392 // to be confused with the prime order of the main subgroup, L, which is
   2393 // 8 times less than that).
   2394 //
   2395 // Generating all points would require us to multiply a point of order C
   2396 // (the base point plus any point of order 8) by all scalars from 0 to
   2397 // C-1.  Clamping limits us to scalars between 2^254 and 2^255 - 1. But
   2398 // by negating the resulting point at random, we also cover scalars from
   2399 // -2^255 + 1 to -2^254 (which modulo C is congruent to e+1 to 2^254 + e).
   2400 //
   2401 // In practice:
   2402 // - Scalars from 0         to e + 1     are never generated
   2403 // - Scalars from 2^255     to 2^255 + e are never generated
   2404 // - Scalars from 2^254 + 1 to 2^254 + e are generated twice
   2405 //
   2406 // Since e < 2^128, detecting this bias requires observing over 2^100
   2407 // representatives from a given source (this will never happen), *and*
   2408 // recovering enough of the private key to determine that they do, or do
   2409 // not, belong to the biased set (this practically requires solving
   2410 // discrete logarithm, which is conjecturally intractable).
   2411 //
   2412 // In practice, this means the bias is impossible to detect.
   2413 
   2414 // s + (x*L) % 8*L
   2415 // Guaranteed to fit in 256 bits iff s fits in 255 bits.
   2416 //   L             < 2^253
   2417 //   x%8           < 2^3
   2418 //   L * (x%8)     < 2^255
   2419 //   s             < 2^255
   2420 //   s + L * (x%8) < 2^256
   2421 static void add_xl(u8 s[32], u8 x)
   2422 {
   2423 	u64 mod8  = x & 7;
   2424 	u64 carry = 0;
   2425 	FOR (i , 0, 8) {
   2426 		carry = carry + load32_le(s + 4*i) + L[i] * mod8;
   2427 		store32_le(s + 4*i, (u32)carry);
   2428 		carry >>= 32;
   2429 	}
   2430 }
   2431 
   2432 // "Small" dirty ephemeral key.
   2433 // Use if you need to shrink the size of the binary, and can afford to
   2434 // slow down by a factor of two (compared to the fast version)
   2435 //
   2436 // This version works by decoupling the cofactor from the main factor.
   2437 //
   2438 // - The trimmed scalar determines the main factor
   2439 // - The clamped bits of the scalar determine the cofactor.
   2440 //
   2441 // Cofactor and main factor are combined into a single scalar, which is
   2442 // then multiplied by a point of order 8*L (unlike the base point, which
   2443 // has prime order).  That "dirty" base point is the addition of the
   2444 // regular base point (9), and a point of order 8.
   2445 void crypto_x25519_dirty_small(u8 public_key[32], const u8 secret_key[32])
   2446 {
   2447 	// Base point of order 8*L
   2448 	// Raw scalar multiplication with it does not clear the cofactor,
   2449 	// and the resulting public key will reveal 3 bits of the scalar.
   2450 	//
   2451 	// The low order component of this base point  has been chosen
   2452 	// to yield the same results as crypto_x25519_dirty_fast().
   2453 	static const u8 dirty_base_point[32] = {
   2454 		0xd8, 0x86, 0x1a, 0xa2, 0x78, 0x7a, 0xd9, 0x26,
   2455 		0x8b, 0x74, 0x74, 0xb6, 0x82, 0xe3, 0xbe, 0xc3,
   2456 		0xce, 0x36, 0x9a, 0x1e, 0x5e, 0x31, 0x47, 0xa2,
   2457 		0x6d, 0x37, 0x7c, 0xfd, 0x20, 0xb5, 0xdf, 0x75,
   2458 	};
   2459 	// separate the main factor & the cofactor of the scalar
   2460 	u8 scalar[32];
   2461 	crypto_eddsa_trim_scalar(scalar, secret_key);
   2462 
   2463 	// Separate the main factor and the cofactor
   2464 	//
   2465 	// The scalar is trimmed, so its cofactor is cleared.  The three
   2466 	// least significant bits however still have a main factor.  We must
   2467 	// remove it for X25519 compatibility.
   2468 	//
   2469 	//   cofactor = lsb * L            (modulo 8*L)
   2470 	//   combined = scalar + cofactor  (modulo 8*L)
   2471 	add_xl(scalar, secret_key[0]);
   2472 	scalarmult(public_key, scalar, dirty_base_point, 256);
   2473 	WIPE_BUFFER(scalar);
   2474 }
   2475 
   2476 // Select low order point
   2477 // We're computing the [cofactor]lop scalar multiplication, where:
   2478 //
   2479 //   cofactor = tweak & 7.
   2480 //   lop      = (lop_x, lop_y)
   2481 //   lop_x    = sqrt((sqrt(d + 1) + 1) / d)
   2482 //   lop_y    = -lop_x * sqrtm1
   2483 //
   2484 // The low order point has order 8. There are 4 such points.  We've
   2485 // chosen the one whose both coordinates are positive (below p/2).
   2486 // The 8 low order points are as follows:
   2487 //
   2488 // [0]lop = ( 0       ,  1    )
   2489 // [1]lop = ( lop_x   ,  lop_y)
   2490 // [2]lop = ( sqrt(-1), -0    )
   2491 // [3]lop = ( lop_x   , -lop_y)
   2492 // [4]lop = (-0       , -1    )
   2493 // [5]lop = (-lop_x   , -lop_y)
   2494 // [6]lop = (-sqrt(-1),  0    )
   2495 // [7]lop = (-lop_x   ,  lop_y)
   2496 //
   2497 // The x coordinate is either 0, sqrt(-1), lop_x, or their opposite.
   2498 // The y coordinate is either 0,      -1 , lop_y, or their opposite.
   2499 // The pattern for both is the same, except for a rotation of 2 (modulo 8)
   2500 //
   2501 // This helper function captures the pattern, and we can use it thus:
   2502 //
   2503 //    select_lop(x, lop_x, sqrtm1, cofactor);
   2504 //    select_lop(y, lop_y, fe_one, cofactor + 2);
   2505 //
   2506 // This is faster than an actual scalar multiplication,
   2507 // and requires less code than naive constant time look up.
   2508 static void select_lop(fe out, const fe x, const fe k, u8 cofactor)
   2509 {
   2510 	fe tmp;
   2511 	fe_0(out);
   2512 	fe_ccopy(out, k  , (cofactor >> 1) & 1); // bit 1
   2513 	fe_ccopy(out, x  , (cofactor >> 0) & 1); // bit 0
   2514 	fe_neg  (tmp, out);
   2515 	fe_ccopy(out, tmp, (cofactor >> 2) & 1); // bit 2
   2516 	WIPE_BUFFER(tmp);
   2517 }
   2518 
   2519 // "Fast" dirty ephemeral key
   2520 // We use this one by default.
   2521 //
   2522 // This version works by performing a regular scalar multiplication,
   2523 // then add a low order point.  The scalar multiplication is done in
   2524 // Edwards space for more speed (*2 compared to the "small" version).
   2525 // The cost is a bigger binary for programs that don't also sign messages.
   2526 void crypto_x25519_dirty_fast(u8 public_key[32], const u8 secret_key[32])
   2527 {
   2528 	// Compute clean scalar multiplication
   2529 	u8 scalar[32];
   2530 	ge pk;
   2531 	crypto_eddsa_trim_scalar(scalar, secret_key);
   2532 	ge_scalarmult_base(&pk, scalar);
   2533 
   2534 	// Compute low order point
   2535 	fe t1, t2;
   2536 	select_lop(t1, lop_x, sqrtm1, secret_key[0]);
   2537 	select_lop(t2, lop_y, fe_one, secret_key[0] + 2);
   2538 	ge_precomp low_order_point;
   2539 	fe_add(low_order_point.Yp, t2, t1);
   2540 	fe_sub(low_order_point.Ym, t2, t1);
   2541 	fe_mul(low_order_point.T2, t2, t1);
   2542 	fe_mul(low_order_point.T2, low_order_point.T2, D2);
   2543 
   2544 	// Add low order point to the public key
   2545 	ge_madd(&pk, &pk, &low_order_point, t1, t2);
   2546 
   2547 	// Convert to Montgomery u coordinate (we ignore the sign)
   2548 	fe_add(t1, pk.Z, pk.Y);
   2549 	fe_sub(t2, pk.Z, pk.Y);
   2550 	fe_invert(t2, t2);
   2551 	fe_mul(t1, t1, t2);
   2552 
   2553 	fe_tobytes(public_key, t1);
   2554 
   2555 	WIPE_BUFFER(t1);    WIPE_CTX(&pk);
   2556 	WIPE_BUFFER(t2);    WIPE_CTX(&low_order_point);
   2557 	WIPE_BUFFER(scalar);
   2558 }
   2559 
   2560 ///////////////////
   2561 /// Elligator 2 ///
   2562 ///////////////////
   2563 static const fe A = {486662};
   2564 
   2565 // Elligator direct map
   2566 //
   2567 // Computes the point corresponding to a representative, encoded in 32
   2568 // bytes (little Endian).  Since positive representatives fits in 254
   2569 // bits, The two most significant bits are ignored.
   2570 //
   2571 // From the paper:
   2572 // w = -A / (fe(1) + non_square * r^2)
   2573 // e = chi(w^3 + A*w^2 + w)
   2574 // u = e*w - (fe(1)-e)*(A//2)
   2575 // v = -e * sqrt(u^3 + A*u^2 + u)
   2576 //
   2577 // We ignore v because we don't need it for X25519 (the Montgomery
   2578 // ladder only uses u).
   2579 //
   2580 // Note that e is either 0, 1 or -1
   2581 // if e = 0    u = 0  and v = 0
   2582 // if e = 1    u = w
   2583 // if e = -1   u = -w - A = w * non_square * r^2
   2584 //
   2585 // Let r1 = non_square * r^2
   2586 // Let r2 = 1 + r1
   2587 // Note that r2 cannot be zero, -1/non_square is not a square.
   2588 // We can (tediously) verify that:
   2589 //   w^3 + A*w^2 + w = (A^2*r1 - r2^2) * A / r2^3
   2590 // Therefore:
   2591 //   chi(w^3 + A*w^2 + w) = chi((A^2*r1 - r2^2) * (A / r2^3))
   2592 //   chi(w^3 + A*w^2 + w) = chi((A^2*r1 - r2^2) * (A / r2^3)) * 1
   2593 //   chi(w^3 + A*w^2 + w) = chi((A^2*r1 - r2^2) * (A / r2^3)) * chi(r2^6)
   2594 //   chi(w^3 + A*w^2 + w) = chi((A^2*r1 - r2^2) * (A / r2^3)  *     r2^6)
   2595 //   chi(w^3 + A*w^2 + w) = chi((A^2*r1 - r2^2) *  A * r2^3)
   2596 // Corollary:
   2597 //   e =  1 if (A^2*r1 - r2^2) *  A * r2^3) is a non-zero square
   2598 //   e = -1 if (A^2*r1 - r2^2) *  A * r2^3) is not a square
   2599 //   Note that w^3 + A*w^2 + w (and therefore e) can never be zero:
   2600 //     w^3 + A*w^2 + w = w * (w^2 + A*w + 1)
   2601 //     w^3 + A*w^2 + w = w * (w^2 + A*w + A^2/4 - A^2/4 + 1)
   2602 //     w^3 + A*w^2 + w = w * (w + A/2)^2        - A^2/4 + 1)
   2603 //     which is zero only if:
   2604 //       w = 0                   (impossible)
   2605 //       (w + A/2)^2 = A^2/4 - 1 (impossible, because A^2/4-1 is not a square)
   2606 //
   2607 // Let isr   = invsqrt((A^2*r1 - r2^2) *  A * r2^3)
   2608 //     isr   = sqrt(1        / ((A^2*r1 - r2^2) *  A * r2^3)) if e =  1
   2609 //     isr   = sqrt(sqrt(-1) / ((A^2*r1 - r2^2) *  A * r2^3)) if e = -1
   2610 //
   2611 // if e = 1
   2612 //   let u1 = -A * (A^2*r1 - r2^2) * A * r2^2 * isr^2
   2613 //       u1 = w
   2614 //       u1 = u
   2615 //
   2616 // if e = -1
   2617 //   let ufactor = -non_square * sqrt(-1) * r^2
   2618 //   let vfactor = sqrt(ufactor)
   2619 //   let u2 = -A * (A^2*r1 - r2^2) * A * r2^2 * isr^2 * ufactor
   2620 //       u2 = w * -1 * -non_square * r^2
   2621 //       u2 = w * non_square * r^2
   2622 //       u2 = u
   2623 void crypto_elligator_map(u8 curve[32], const u8 hidden[32])
   2624 {
   2625 	fe r, u, t1, t2, t3;
   2626 	fe_frombytes_mask(r, hidden, 2); // r is encoded in 254 bits.
   2627 	fe_sq(r, r);
   2628 	fe_add(t1, r, r);
   2629 	fe_add(u, t1, fe_one);
   2630 	fe_sq (t2, u);
   2631 	fe_mul(t3, A2, t1);
   2632 	fe_sub(t3, t3, t2);
   2633 	fe_mul(t3, t3, A);
   2634 	fe_mul(t1, t2, u);
   2635 	fe_mul(t1, t3, t1);
   2636 	int is_square = invsqrt(t1, t1);
   2637 	fe_mul(u, r, ufactor);
   2638 	fe_ccopy(u, fe_one, is_square);
   2639 	fe_sq (t1, t1);
   2640 	fe_mul(u, u, A);
   2641 	fe_mul(u, u, t3);
   2642 	fe_mul(u, u, t2);
   2643 	fe_mul(u, u, t1);
   2644 	fe_neg(u, u);
   2645 	fe_tobytes(curve, u);
   2646 
   2647 	WIPE_BUFFER(t1);  WIPE_BUFFER(r);
   2648 	WIPE_BUFFER(t2);  WIPE_BUFFER(u);
   2649 	WIPE_BUFFER(t3);
   2650 }
   2651 
   2652 // Elligator inverse map
   2653 //
   2654 // Computes the representative of a point, if possible.  If not, it does
   2655 // nothing and returns -1.  Note that the success of the operation
   2656 // depends only on the point (more precisely its u coordinate).  The
   2657 // tweak parameter is used only upon success
   2658 //
   2659 // The tweak should be a random byte.  Beyond that, its contents are an
   2660 // implementation detail. Currently, the tweak comprises:
   2661 // - Bit  1  : sign of the v coordinate (0 if positive, 1 if negative)
   2662 // - Bit  2-5: not used
   2663 // - Bits 6-7: random padding
   2664 //
   2665 // From the paper:
   2666 // Let sq = -non_square * u * (u+A)
   2667 // if sq is not a square, or u = -A, there is no mapping
   2668 // Assuming there is a mapping:
   2669 //    if v is positive: r = sqrt(-u     / (non_square * (u+A)))
   2670 //    if v is negative: r = sqrt(-(u+A) / (non_square * u    ))
   2671 //
   2672 // We compute isr = invsqrt(-non_square * u * (u+A))
   2673 // if it wasn't a square, abort.
   2674 // else, isr = sqrt(-1 / (non_square * u * (u+A))
   2675 //
   2676 // If v is positive, we return isr * u:
   2677 //   isr * u = sqrt(-1 / (non_square * u * (u+A)) * u
   2678 //   isr * u = sqrt(-u / (non_square * (u+A))
   2679 //
   2680 // If v is negative, we return isr * (u+A):
   2681 //   isr * (u+A) = sqrt(-1     / (non_square * u * (u+A)) * (u+A)
   2682 //   isr * (u+A) = sqrt(-(u+A) / (non_square * u)
   2683 int crypto_elligator_rev(u8 hidden[32], const u8 public_key[32], u8 tweak)
   2684 {
   2685 	fe t1, t2, t3;
   2686 	fe_frombytes(t1, public_key);    // t1 = u
   2687 
   2688 	fe_add(t2, t1, A);               // t2 = u + A
   2689 	fe_mul(t3, t1, t2);
   2690 	fe_mul_small(t3, t3, -2);
   2691 	int is_square = invsqrt(t3, t3); // t3 = sqrt(-1 / non_square * u * (u+A))
   2692 	if (is_square) {
   2693 		// The only variable time bit.  This ultimately reveals how many
   2694 		// tries it took us to find a representable key.
   2695 		// This does not affect security as long as we try keys at random.
   2696 
   2697 		fe_ccopy    (t1, t2, tweak & 1); // multiply by u if v is positive,
   2698 		fe_mul      (t3, t1, t3);        // multiply by u+A otherwise
   2699 		fe_mul_small(t1, t3, 2);
   2700 		fe_neg      (t2, t3);
   2701 		fe_ccopy    (t3, t2, fe_isodd(t1));
   2702 		fe_tobytes(hidden, t3);
   2703 
   2704 		// Pad with two random bits
   2705 		hidden[31] |= tweak & 0xc0;
   2706 	}
   2707 
   2708 	WIPE_BUFFER(t1);
   2709 	WIPE_BUFFER(t2);
   2710 	WIPE_BUFFER(t3);
   2711 	return is_square - 1;
   2712 }
   2713 
   2714 void crypto_elligator_key_pair(u8 hidden[32], u8 secret_key[32], u8 seed[32])
   2715 {
   2716 	u8 pk [32]; // public key
   2717 	u8 buf[64]; // seed + representative
   2718 	COPY(buf + 32, seed, 32);
   2719 	do {
   2720 		crypto_chacha20_djb(buf, 0, 64, buf+32, zero, 0);
   2721 		crypto_x25519_dirty_fast(pk, buf); // or the "small" version
   2722 	} while(crypto_elligator_rev(buf+32, pk, buf[32]));
   2723 	// Note that the return value of crypto_elligator_rev() is
   2724 	// independent from its tweak parameter.
   2725 	// Therefore, buf[32] is not actually reused.  Either we loop one
   2726 	// more time and buf[32] is used for the new seed, or we succeeded,
   2727 	// and buf[32] becomes the tweak parameter.
   2728 
   2729 	crypto_wipe(seed, 32);
   2730 	COPY(hidden    , buf + 32, 32);
   2731 	COPY(secret_key, buf     , 32);
   2732 	WIPE_BUFFER(buf);
   2733 	WIPE_BUFFER(pk);
   2734 }
   2735 
   2736 ///////////////////////
   2737 /// Scalar division ///
   2738 ///////////////////////
   2739 
   2740 // Montgomery reduction.
   2741 // Divides x by (2^256), and reduces the result modulo L
   2742 //
   2743 // Precondition:
   2744 //   x < L * 2^256
   2745 // Constants:
   2746 //   r = 2^256                 (makes division by r trivial)
   2747 //   k = (r * (1/r) - 1) // L  (1/r is computed modulo L   )
   2748 // Algorithm:
   2749 //   s = (x * k) % r
   2750 //   t = x + s*L      (t is always a multiple of r)
   2751 //   u = (t/r) % L    (u is always below 2*L, conditional subtraction is enough)
   2752 static void redc(u32 u[8], u32 x[16])
   2753 {
   2754 	static const u32 k[8] = {
   2755 		0x12547e1b, 0xd2b51da3, 0xfdba84ff, 0xb1a206f2,
   2756 		0xffa36bea, 0x14e75438, 0x6fe91836, 0x9db6c6f2,
   2757 	};
   2758 
   2759 	// s = x * k (modulo 2^256)
   2760 	// This is cheaper than the full multiplication.
   2761 	u32 s[8] = {0};
   2762 	FOR (i, 0, 8) {
   2763 		u64 carry = 0;
   2764 		FOR (j, 0, 8-i) {
   2765 			carry  += s[i+j] + (u64)x[i] * k[j];
   2766 			s[i+j]  = (u32)carry;
   2767 			carry >>= 32;
   2768 		}
   2769 	}
   2770 	u32 t[16] = {0};
   2771 	multiply(t, s, L);
   2772 
   2773 	// t = t + x
   2774 	u64 carry = 0;
   2775 	FOR (i, 0, 16) {
   2776 		carry  += (u64)t[i] + x[i];
   2777 		t[i]    = (u32)carry;
   2778 		carry >>= 32;
   2779 	}
   2780 
   2781 	// u = (t / 2^256) % L
   2782 	// Note that t / 2^256 is always below 2*L,
   2783 	// So a constant time conditional subtraction is enough
   2784 	remove_l(u, t+8);
   2785 
   2786 	WIPE_BUFFER(s);
   2787 	WIPE_BUFFER(t);
   2788 }
   2789 
   2790 void crypto_x25519_inverse(u8 blind_salt [32], const u8 private_key[32],
   2791                            const u8 curve_point[32])
   2792 {
   2793 	static const  u8 Lm2[32] = { // L - 2
   2794 		0xeb, 0xd3, 0xf5, 0x5c, 0x1a, 0x63, 0x12, 0x58,
   2795 		0xd6, 0x9c, 0xf7, 0xa2, 0xde, 0xf9, 0xde, 0x14,
   2796 		0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
   2797 		0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x10,
   2798 	};
   2799 	// 1 in Montgomery form
   2800 	u32 m_inv [8] = {
   2801 		0x8d98951d, 0xd6ec3174, 0x737dcf70, 0xc6ef5bf4,
   2802 		0xfffffffe, 0xffffffff, 0xffffffff, 0x0fffffff,
   2803 	};
   2804 
   2805 	u8 scalar[32];
   2806 	crypto_eddsa_trim_scalar(scalar, private_key);
   2807 
   2808 	// Convert the scalar in Montgomery form
   2809 	// m_scl = scalar * 2^256 (modulo L)
   2810 	u32 m_scl[8];
   2811 	{
   2812 		u32 tmp[16];
   2813 		ZERO(tmp, 8);
   2814 		load32_le_buf(tmp+8, scalar, 8);
   2815 		mod_l(scalar, tmp);
   2816 		load32_le_buf(m_scl, scalar, 8);
   2817 		WIPE_BUFFER(tmp); // Wipe ASAP to save stack space
   2818 	}
   2819 
   2820 	// Compute the inverse
   2821 	u32 product[16];
   2822 	for (int i = 252; i >= 0; i--) {
   2823 		ZERO(product, 16);
   2824 		multiply(product, m_inv, m_inv);
   2825 		redc(m_inv, product);
   2826 		if (scalar_bit(Lm2, i)) {
   2827 			ZERO(product, 16);
   2828 			multiply(product, m_inv, m_scl);
   2829 			redc(m_inv, product);
   2830 		}
   2831 	}
   2832 	// Convert the inverse *out* of Montgomery form
   2833 	// scalar = m_inv / 2^256 (modulo L)
   2834 	COPY(product, m_inv, 8);
   2835 	ZERO(product + 8, 8);
   2836 	redc(m_inv, product);
   2837 	store32_le_buf(scalar, m_inv, 8); // the *inverse* of the scalar
   2838 
   2839 	// Clear the cofactor of scalar:
   2840 	//   cleared = scalar * (3*L + 1)      (modulo 8*L)
   2841 	//   cleared = scalar + scalar * 3 * L (modulo 8*L)
   2842 	// Note that (scalar * 3) is reduced modulo 8, so we only need the
   2843 	// first byte.
   2844 	add_xl(scalar, scalar[0] * 3);
   2845 
   2846 	// Recall that 8*L < 2^256. However it is also very close to
   2847 	// 2^255. If we spanned the ladder over 255 bits, random tests
   2848 	// wouldn't catch the off-by-one error.
   2849 	scalarmult(blind_salt, scalar, curve_point, 256);
   2850 
   2851 	WIPE_BUFFER(scalar);   WIPE_BUFFER(m_scl);
   2852 	WIPE_BUFFER(product);  WIPE_BUFFER(m_inv);
   2853 }
   2854 
   2855 ////////////////////////////////
   2856 /// Authenticated encryption ///
   2857 ////////////////////////////////
   2858 static void lock_auth(u8 mac[16], const u8  auth_key[32],
   2859                       const u8 *ad         , size_t ad_size,
   2860                       const u8 *cipher_text, size_t text_size)
   2861 {
   2862 	u8 sizes[16]; // Not secret, not wiped
   2863 	store64_le(sizes + 0, ad_size);
   2864 	store64_le(sizes + 8, text_size);
   2865 	crypto_poly1305_ctx poly_ctx;           // auto wiped...
   2866 	crypto_poly1305_init  (&poly_ctx, auth_key);
   2867 	crypto_poly1305_update(&poly_ctx, ad         , ad_size);
   2868 	crypto_poly1305_update(&poly_ctx, zero       , gap(ad_size, 16));
   2869 	crypto_poly1305_update(&poly_ctx, cipher_text, text_size);
   2870 	crypto_poly1305_update(&poly_ctx, zero       , gap(text_size, 16));
   2871 	crypto_poly1305_update(&poly_ctx, sizes      , 16);
   2872 	crypto_poly1305_final (&poly_ctx, mac); // ...here
   2873 }
   2874 
   2875 void crypto_aead_init_x(crypto_aead_ctx *ctx,
   2876                         u8 const key[32], const u8 nonce[24])
   2877 {
   2878 	crypto_chacha20_h(ctx->key, key, nonce);
   2879 	COPY(ctx->nonce, nonce + 16, 8);
   2880 	ctx->counter = 0;
   2881 }
   2882 
   2883 void crypto_aead_init_djb(crypto_aead_ctx *ctx,
   2884                           const u8 key[32], const u8 nonce[8])
   2885 {
   2886 	COPY(ctx->key  , key  , 32);
   2887 	COPY(ctx->nonce, nonce,  8);
   2888 	ctx->counter = 0;
   2889 }
   2890 
   2891 void crypto_aead_init_ietf(crypto_aead_ctx *ctx,
   2892                            const u8 key[32], const u8 nonce[12])
   2893 {
   2894 	COPY(ctx->key  , key      , 32);
   2895 	COPY(ctx->nonce, nonce + 4,  8);
   2896 	ctx->counter = (u64)load32_le(nonce) << 32;
   2897 }
   2898 
   2899 void crypto_aead_write(crypto_aead_ctx *ctx, u8 *cipher_text, u8 mac[16],
   2900                        const u8 *ad,         size_t ad_size,
   2901                        const u8 *plain_text, size_t text_size)
   2902 {
   2903 	u8 auth_key[64]; // the last 32 bytes are used for rekeying.
   2904 	crypto_chacha20_djb(auth_key, 0, 64, ctx->key, ctx->nonce, ctx->counter);
   2905 	crypto_chacha20_djb(cipher_text, plain_text, text_size,
   2906 	                    ctx->key, ctx->nonce, ctx->counter + 1);
   2907 	lock_auth(mac, auth_key, ad, ad_size, cipher_text, text_size);
   2908 	COPY(ctx->key, auth_key + 32, 32);
   2909 	WIPE_BUFFER(auth_key);
   2910 }
   2911 
   2912 int crypto_aead_read(crypto_aead_ctx *ctx, u8 *plain_text, const u8 mac[16],
   2913                      const u8 *ad,          size_t ad_size,
   2914                      const u8 *cipher_text, size_t text_size)
   2915 {
   2916 	u8 auth_key[64]; // the last 32 bytes are used for rekeying.
   2917 	u8 real_mac[16];
   2918 	crypto_chacha20_djb(auth_key, 0, 64, ctx->key, ctx->nonce, ctx->counter);
   2919 	lock_auth(real_mac, auth_key, ad, ad_size, cipher_text, text_size);
   2920 	int mismatch = crypto_verify16(mac, real_mac);
   2921 	if (!mismatch) {
   2922 		crypto_chacha20_djb(plain_text, cipher_text, text_size,
   2923 		                    ctx->key, ctx->nonce, ctx->counter + 1);
   2924 		COPY(ctx->key, auth_key + 32, 32);
   2925 	}
   2926 	WIPE_BUFFER(auth_key);
   2927 	WIPE_BUFFER(real_mac);
   2928 	return mismatch;
   2929 }
   2930 
   2931 void crypto_aead_lock(u8 *cipher_text, u8 mac[16], const u8 key[32],
   2932                       const u8  nonce[24], const u8 *ad, size_t ad_size,
   2933                       const u8 *plain_text, size_t text_size)
   2934 {
   2935 	crypto_aead_ctx ctx;
   2936 	crypto_aead_init_x(&ctx, key, nonce);
   2937 	crypto_aead_write(&ctx, cipher_text, mac, ad, ad_size,
   2938 	                  plain_text, text_size);
   2939 	crypto_wipe(&ctx, sizeof(ctx));
   2940 }
   2941 
   2942 int crypto_aead_unlock(u8 *plain_text, const u8  mac[16], const u8 key[32],
   2943                        const u8 nonce[24], const u8 *ad, size_t ad_size,
   2944                        const u8 *cipher_text, size_t text_size)
   2945 {
   2946 	crypto_aead_ctx ctx;
   2947 	crypto_aead_init_x(&ctx, key, nonce);
   2948 	int mismatch = crypto_aead_read(&ctx, plain_text, mac, ad, ad_size,
   2949 	                                cipher_text, text_size);
   2950 	crypto_wipe(&ctx, sizeof(ctx));
   2951 	return mismatch;
   2952 }
   2953 
   2954 #ifdef MONOCYPHER_CPP_NAMESPACE
   2955 }
   2956 #endif