kit

kit
git clone https://git.ryansepassi.com/git/kit.git
Log | Files | Refs | README

commit 2b750a220d91fbbd5b41fbed5be909b1445eb1c7
parent 56f95449e7d0bcdd0570ee1898933546cfa4e54c
Author: Ryan Sepassi <rsepassi@gmail.com>
Date:   Thu, 28 May 2026 18:15:35 -0700

pkg: vendor deps

Diffstat:
MMakefile | 2++
Mdoc/DISTRIBUTE.md | 62++++++++++++++++++++++++++++----------------------------------
Mdriver/dist/blake2b.c | 37++++---------------------------------
Mdriver/dist/blake2b.h | 12++++--------
Mdriver/dist/cfpkg.c | 8++++----
Mdriver/dist/cfpkg.h | 2+-
Mdriver/dist/deflate.c | 2103+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++--
Mdriver/dist/deflate.h | 14+++++---------
Mdriver/dist/dist.h | 15+++++++--------
Mdriver/dist/ed25519.c | 32++++++--------------------------
Mdriver/dist/ed25519.h | 8+-------
Mdriver/dist/lz4.c | 41++++++++++++++++++++++++++---------------
Mdriver/dist/minisig.c | 29+++++++++++++++++++----------
Mdriver/dist/minisig.h | 15+++++++--------
Adriver/dist/vendor/lz4/LICENSE | 24++++++++++++++++++++++++
Adriver/dist/vendor/lz4/lz4.c | 2829+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Adriver/dist/vendor/lz4/lz4.h | 884+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Adriver/dist/vendor/monocypher/LICENCE.md | 167+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Adriver/dist/vendor/monocypher/monocypher-ed25519.c | 500+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Adriver/dist/vendor/monocypher/monocypher-ed25519.h | 140+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Adriver/dist/vendor/monocypher/monocypher.c | 2956+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Adriver/dist/vendor/monocypher/monocypher.h | 321+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Mdriver/pkg.c | 2+-
Atest/pkg/run.sh | 92+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Mtest/test.mk | 6+++++-
25 files changed, 10085 insertions(+), 216 deletions(-)

diff --git a/Makefile b/Makefile @@ -351,6 +351,8 @@ ifeq ($(CFREE_TOOL_PKG_ENABLED),1) DRIVER_TOOL_SRCS += driver/pkg.c DRIVER_TOOL_SRCS += driver/dist/dist.c driver/dist/b64.c \ driver/dist/blake2b.c driver/dist/ed25519.c \ + driver/dist/vendor/monocypher/monocypher.c \ + driver/dist/vendor/monocypher/monocypher-ed25519.c \ driver/dist/tar.c driver/dist/deflate.c \ driver/dist/lz4.c driver/dist/cfpkg.c \ driver/dist/manifest.c driver/dist/minisig.c \ diff --git a/doc/DISTRIBUTE.md b/doc/DISTRIBUTE.md @@ -15,7 +15,7 @@ The signed object is always the logical manifest's literal byte stream. The package id is: ``` -package-id = BLAKE2b-512(logical manifest literal bytes) +package-id = BLAKE2b-256(logical manifest literal bytes) ``` The detached minisign signature covers the manifest. Its trusted comment also @@ -45,15 +45,16 @@ part of the v2 package format. | Primitive | Purpose | Status | |---|---|---| | tar | portable archive container | real | -| gzip/DEFLATE | portable compression | gzip/stored blocks real; compressed DEFLATE stubbed | -| BLAKE2b | package id, whole-file hashes, Merkle hashing, minisign prehash | API real; math stubbed | -| Ed25519 | minisign signature scheme | API real; math stubbed | +| gzip/DEFLATE | portable compression | real; miniz-derived raw DEFLATE | +| BLAKE2b-256 | package id, whole-file hashes, Merkle hashing | real; Monocypher 4.0.2 | +| minisign prehash | detached signature compatibility | real; stock minisign-compatible | +| Ed25519 | minisign signature scheme | real; Monocypher 4.0.2 | | base64 | minisign key/signature text | real | -| LZ4 block | native chunk compression | API real; implementation stubbed | +| LZ4 block | native chunk compression | real; upstream liblz4 1.10.0 | -The stubs are deterministic and insecure. They let the package pipeline run -end-to-end now; replacing the stub bodies with real vendored primitives should -not require format or caller changes. +The compression APIs are deterministic and vendored into the driver-side +distribution subsystem; package format callers do not depend on external +compression libraries. ## Logical manifest @@ -75,7 +76,7 @@ id = 0 path = bin/hello kind = exe size = 16384 -blake2b = <whole-file-blake2b-512> +blake2b = <whole-file-blake2b-256> root = <artifact-merkle-root> entry = true @@ -103,7 +104,7 @@ Top-level fields: | `path` | yes | unpacked artifact path | | `kind` | yes | `exe`, `dso`, `obj`, `wasm`, `lib`, `data`, or `source` | | `size` | yes | uncompressed byte length | -| `blake2b` | yes | BLAKE2b-512 of the whole artifact bytes | +| `blake2b` | yes | BLAKE2b-256 of the whole artifact bytes | | `root` | yes | artifact Merkle root | | `target` | no | cfree target triple | | `entry` | no | `true` if runnable under jit/emu/wasm | @@ -123,10 +124,10 @@ Artifacts are split into fixed 64 KiB raw chunks. The final chunk may be shorter. The tree is deterministic and deliberately simple: ``` -leaf = BLAKE2b("cfpkg2 leaf v1" || artifact-id || chunk-index || - raw-size || raw-bytes) -node = BLAKE2b("cfpkg2 node v1" || left-hash || right-hash) -root = BLAKE2b("cfpkg2 root v1" || "artifact" || top-hash) +leaf = BLAKE2b-256("cfpkg2 leaf v1" || artifact-id || chunk-index || + raw-size || raw-bytes) +node = BLAKE2b-256("cfpkg2 node v1" || left-hash || right-hash) +root = BLAKE2b-256("cfpkg2 root v1" || "artifact" || top-hash) ``` At each level, adjacent hashes are paired left-to-right. If a level has an odd @@ -134,7 +135,7 @@ final hash, that hash is promoted unchanged to the next level. Empty artifacts use a separate domain: ``` -root = BLAKE2b("cfpkg2 root v1" || "artifact-empty" || artifact-id || 0) +root = BLAKE2b-256("cfpkg2 root v1" || "artifact-empty" || artifact-id || 0) ``` This avoids virtual padding, duplicated leaves, and power-of-two tree rules. @@ -203,20 +204,20 @@ manifest: ```ini cfree-encoding 2 -package-id = <BLAKE2b-512 of logical manifest> +package-id = <BLAKE2b-256 of logical manifest> format = cfpkg hash = blake2b-merkle-v1 index-offset = 1024 -index-size = 240 +index-size = 144 index-root = <authenticated index region root> -content-offset = 1264 +content-offset = 1168 content-size = 16384 content-root = <authenticated content region root> chunk-size = 65536 alignment = 16 ``` -Each binary index record is fixed-size and little-endian: +Each binary index record is 144 bytes and little-endian: ``` artifact-id u64 @@ -226,9 +227,9 @@ stored-size u64 raw-size u64 compression u32 # 0 = none, 1 = lz4-block-v1 reserved u32 -stored-hash BLAKE2b-512 -raw-hash BLAKE2b-512 -leaf-hash BLAKE2b-512 +stored-hash BLAKE2b-256 +raw-hash BLAKE2b-256 +leaf-hash BLAKE2b-256 ``` Native verification: @@ -239,10 +240,9 @@ Native verification: 4. Confirm descriptor `package-id`, offsets, sizes, roots, chunk size, and alignment match the container. 5. Verify stored chunk hashes before decoding. -6. Decode each chunk (`none` is implemented; `lz4-block-v1` is recognized but - stubbed until LZ4 is vendored). +6. Decode each chunk (`none` and `lz4-block-v1` are implemented). 7. Verify raw chunk hashes, leaf hashes, artifact roots, and whole-file - BLAKE2b hashes. + BLAKE2b-256 hashes. ## CLI @@ -269,13 +269,7 @@ Implemented: - native `.cfpkg` create/verify/unpack/inspect, - minisign-compatible key/signature file layout, - trusted-keys store and opt-in TOFU, -- BLAKE2b streaming API and package Merkle helpers, -- raw LZ4 block API stubs, +- Monocypher-backed BLAKE2b streaming API and package Merkle helpers, +- Monocypher-backed Ed25519 minisign key/signature operations, +- raw LZ4 block compression/decompression, - driver-side SHA-256 removal for distribution. - -Still stubbed/insecure: - -- `driver/dist/blake2b.c`, -- `driver/dist/ed25519.c`, -- compressed DEFLATE support in `driver/dist/deflate.c`, -- raw LZ4 block compression/decompression in `driver/dist/lz4.c`. diff --git a/driver/dist/blake2b.c b/driver/dist/blake2b.c @@ -1,47 +1,18 @@ #include "blake2b.h" -/* STUB digest. See blake2b.h. A counter-driven FNV/xorshift sponge: enough - * avalanche to distinguish inputs in tests, but cryptographically worthless. */ - -#define DIST_B2_FNV_OFFSET 0x100000001b3cbf29ULL -#define DIST_B2_FNV_PRIME 0x00000100000001b3ULL - -static uint64_t b2_absorb(const uint8_t* data, size_t len, uint64_t h) { - size_t i; - for (i = 0; i < len; ++i) { - h ^= data[i]; - h *= DIST_B2_FNV_PRIME; - h ^= h >> 31; - } - return h; -} - void dist_blake2b_init(DistBlake2b* s, size_t out_len) { - s->h = DIST_B2_FNV_OFFSET ^ (uint64_t)out_len; - s->len = 0; - s->out_len = out_len; + crypto_blake2b_init(&s->ctx, out_len); } void dist_blake2b_update(DistBlake2b* s, const uint8_t* data, size_t len) { - s->h = b2_absorb(data, len, s->h); - s->len += (uint64_t)len; + crypto_blake2b_update(&s->ctx, data, len); } void dist_blake2b_final(DistBlake2b* s, uint8_t* out) { - uint64_t base = s->h ^ (s->len * 0x9e3779b97f4a7c15ULL); - size_t i; - for (i = 0; i < s->out_len; ++i) { - uint64_t h = base ^ (0xc2b2ae3d27d4eb4fULL * (uint64_t)(i + 1)); - h *= DIST_B2_FNV_PRIME; - h ^= h >> 27; - out[i] = (uint8_t)(h >> 32); - } + crypto_blake2b_final(&s->ctx, out); } void dist_blake2b(uint8_t out[DIST_BLAKE2B_LEN], const uint8_t* data, size_t len) { - DistBlake2b s; - dist_blake2b_init(&s, DIST_BLAKE2B_LEN); - dist_blake2b_update(&s, data, len); - dist_blake2b_final(&s, out); + crypto_blake2b(out, DIST_BLAKE2B_LEN, data, len); } diff --git a/driver/dist/blake2b.h b/driver/dist/blake2b.h @@ -5,16 +5,12 @@ #include <stdint.h> #include "dist.h" +#include "vendor/monocypher/monocypher.h" -/* minisign prehash and v2 package/content hash. Real minisign hashes the - * signed file with BLAKE2b-512 and signs the 64-byte digest. - * - * STUB: deterministic non-cryptographic 64-byte digest. Not BLAKE2b. Replace - * with the real implementation when vendoring crypto. */ +/* v2 package/content hash. Minisign signatures use their own 64-byte BLAKE2b + * prehash path in minisig.c for stock minisign compatibility. */ typedef struct DistBlake2b { - uint64_t h; - uint64_t len; - size_t out_len; + crypto_blake2b_ctx ctx; } DistBlake2b; void dist_blake2b_init(DistBlake2b* s, size_t out_len); diff --git a/driver/dist/cfpkg.c b/driver/dist/cfpkg.c @@ -249,8 +249,8 @@ void dist_cfpkg_encode_index_record(uint8_t out[DIST_CFPKG_INDEX_RECORD_SIZE], put_u64le(out + 32, r->raw_size); put_u32le(out + 40, r->compression); memcpy(out + 48, r->stored_hash, DIST_BLAKE2B_LEN); - memcpy(out + 112, r->raw_hash, DIST_BLAKE2B_LEN); - memcpy(out + 176, r->leaf_hash, DIST_BLAKE2B_LEN); + memcpy(out + 80, r->raw_hash, DIST_BLAKE2B_LEN); + memcpy(out + 112, r->leaf_hash, DIST_BLAKE2B_LEN); } int dist_cfpkg_decode_index_record(const uint8_t* data, size_t len, @@ -263,8 +263,8 @@ int dist_cfpkg_decode_index_record(const uint8_t* data, size_t len, r->raw_size = get_u64le(data + 32); r->compression = get_u32le(data + 40); memcpy(r->stored_hash, data + 48, DIST_BLAKE2B_LEN); - memcpy(r->raw_hash, data + 112, DIST_BLAKE2B_LEN); - memcpy(r->leaf_hash, data + 176, DIST_BLAKE2B_LEN); + memcpy(r->raw_hash, data + 80, DIST_BLAKE2B_LEN); + memcpy(r->leaf_hash, data + 112, DIST_BLAKE2B_LEN); return DIST_OK; } diff --git a/driver/dist/cfpkg.h b/driver/dist/cfpkg.h @@ -12,7 +12,7 @@ #define DIST_CFPKG_HEADER_SIZE 160u #define DIST_CFPKG_ALIGNMENT 16u #define DIST_CFPKG_CHUNK_SIZE_DEFAULT 65536u -#define DIST_CFPKG_INDEX_RECORD_SIZE 240u +#define DIST_CFPKG_INDEX_RECORD_SIZE 144u typedef enum DistCfpkgCompression { DIST_CFPKG_COMP_NONE = 0, diff --git a/driver/dist/deflate.c b/driver/dist/deflate.c @@ -2,13 +2,1956 @@ #include <string.h> +/* + * Private raw DEFLATE/INFLATE codec. + * + * Extracted from miniz (public domain), via xOS. Provides the core DEFLATE + * algorithm without malloc, file I/O, or ZIP support. + * + * Features: + * - No dynamic allocation (all state in user-provided structures) + * - No file I/O + * - Supports raw deflate and zlib-wrapped streams + * + * Structure sizes: + * - xdeflate_compressor: ~285 KB (or ~140 KB with XDEFLATE_LESS_MEMORY=1) + * - xdeflate_decompressor: ~11 KB + */ + +/* ============================================================================ + * Configuration + * ============================================================================ + */ + +/* Set to 1 to reduce compressor memory usage (slightly slower) */ +#ifndef XDEFLATE_LESS_MEMORY +#define XDEFLATE_LESS_MEMORY 1 +#endif + +/* Use 64-bit bit buffer on 64-bit platforms */ +#if defined(__x86_64__) || defined(_M_X64) || defined(__aarch64__) || \ + defined(__LP64__) +#define XDEFLATE_HAS_64BIT_REGISTERS 1 +#else +#define XDEFLATE_HAS_64BIT_REGISTERS 0 +#endif + +/* ============================================================================ + * Compression API + * ============================================================================ + */ + +/* Compression flags (OR together) */ +enum { + XDEFLATE_HUFFMAN_ONLY = 0, /* No LZ77, Huffman only */ + XDEFLATE_DEFAULT_MAX_PROBES = 128, /* Default hash probe depth */ + XDEFLATE_MAX_PROBES_MASK = 0xFFF, /* Max probes in low 12 bits */ + + XDEFLATE_WRITE_ZLIB_HEADER = 0x01000, /* Output zlib header + adler32 */ + XDEFLATE_COMPUTE_ADLER32 = 0x02000, /* Compute adler32 even without header */ + XDEFLATE_GREEDY_PARSING = 0x04000, /* Fast greedy vs lazy parsing */ + XDEFLATE_NONDETERMINISTIC = + 0x08000, /* Faster init, non-deterministic output */ + XDEFLATE_RLE_MATCHES = 0x10000, /* Only distance-1 matches */ + XDEFLATE_FILTER_MATCHES = 0x20000, /* Skip matches <= 5 chars */ + XDEFLATE_FORCE_STATIC_BLOCKS = 0x40000, + XDEFLATE_FORCE_RAW_BLOCKS = 0x80000 +}; + +/* Compression status */ +typedef enum { + XDEFLATE_STATUS_BAD_PARAM = -2, + XDEFLATE_STATUS_PUT_BUF_FAILED = -1, + XDEFLATE_STATUS_OKAY = 0, + XDEFLATE_STATUS_DONE = 1 +} xdeflate_status; + +/* Flush modes */ +typedef enum { + XDEFLATE_NO_FLUSH = 0, + XDEFLATE_SYNC_FLUSH = 2, + XDEFLATE_FULL_FLUSH = 3, + XDEFLATE_FINISH = 4 +} xdeflate_flush; + +/* Internal constants */ +enum { + XDEFLATE_MAX_HUFF_TABLES = 3, + XDEFLATE_MAX_HUFF_SYMBOLS_0 = 288, + XDEFLATE_MAX_HUFF_SYMBOLS_1 = 32, + XDEFLATE_MAX_HUFF_SYMBOLS_2 = 19, + XDEFLATE_MAX_HUFF_SYMBOLS = 288, + XDEFLATE_LZ_DICT_SIZE = 32768, + XDEFLATE_LZ_DICT_SIZE_MASK = XDEFLATE_LZ_DICT_SIZE - 1, + XDEFLATE_MIN_MATCH_LEN = 3, + XDEFLATE_MAX_MATCH_LEN = 258 +}; + +#if XDEFLATE_LESS_MEMORY +enum { + XDEFLATE_LZ_CODE_BUF_SIZE = 24 * 1024, + XDEFLATE_OUT_BUF_SIZE = (XDEFLATE_LZ_CODE_BUF_SIZE * 13) / 10, + XDEFLATE_LZ_HASH_BITS = 12, + XDEFLATE_LEVEL1_HASH_SIZE_MASK = 4095, + XDEFLATE_LZ_HASH_SHIFT = (XDEFLATE_LZ_HASH_BITS + 2) / 3, + XDEFLATE_LZ_HASH_SIZE = 1 << XDEFLATE_LZ_HASH_BITS +}; +#else +enum { + XDEFLATE_LZ_CODE_BUF_SIZE = 64 * 1024, + XDEFLATE_OUT_BUF_SIZE = (XDEFLATE_LZ_CODE_BUF_SIZE * 13) / 10, + XDEFLATE_LZ_HASH_BITS = 15, + XDEFLATE_LEVEL1_HASH_SIZE_MASK = 4095, + XDEFLATE_LZ_HASH_SHIFT = (XDEFLATE_LZ_HASH_BITS + 2) / 3, + XDEFLATE_LZ_HASH_SIZE = 1 << XDEFLATE_LZ_HASH_BITS +}; +#endif + +/* Output callback type */ +typedef int (*xdeflate_put_buf_func)(const void* buf, int len, void* user); + +/* Compressor state (~285 KB or ~140 KB with XDEFLATE_LESS_MEMORY) */ +typedef struct { + xdeflate_put_buf_func m_pPut_buf_func; + void* m_pPut_buf_user; + uint32_t m_flags, m_max_probes[2]; + int m_greedy_parsing; + uint32_t m_adler32, m_lookahead_pos, m_lookahead_size, m_dict_size; + uint8_t *m_pLZ_code_buf, *m_pLZ_flags, *m_pOutput_buf, *m_pOutput_buf_end; + uint32_t m_num_flags_left, m_total_lz_bytes, m_lz_code_buf_dict_pos; + uint32_t m_bits_in, m_bit_buffer; + uint32_t m_saved_match_dist, m_saved_match_len, m_saved_lit; + uint32_t m_output_flush_ofs, m_output_flush_remaining; + uint32_t m_finished, m_block_index, m_wants_to_finish; + xdeflate_status m_prev_return_status; + const void* m_pIn_buf; + void* m_pOut_buf; + size_t *m_pIn_buf_size, *m_pOut_buf_size; + xdeflate_flush m_flush; + const uint8_t* m_pSrc; + size_t m_src_buf_left, m_out_buf_ofs; + uint8_t m_dict[XDEFLATE_LZ_DICT_SIZE + XDEFLATE_MAX_MATCH_LEN - 1]; + uint16_t m_huff_count[XDEFLATE_MAX_HUFF_TABLES][XDEFLATE_MAX_HUFF_SYMBOLS]; + uint16_t m_huff_codes[XDEFLATE_MAX_HUFF_TABLES][XDEFLATE_MAX_HUFF_SYMBOLS]; + uint8_t m_huff_code_sizes[XDEFLATE_MAX_HUFF_TABLES] + [XDEFLATE_MAX_HUFF_SYMBOLS]; + uint8_t m_lz_code_buf[XDEFLATE_LZ_CODE_BUF_SIZE]; + uint16_t m_next[XDEFLATE_LZ_DICT_SIZE]; + uint16_t m_hash[XDEFLATE_LZ_HASH_SIZE]; + uint8_t m_output_buf[XDEFLATE_OUT_BUF_SIZE]; +} xdeflate_compressor; + +/* + * xdeflate_init - Initialize compressor + * + * @d: Compressor state (caller-provided) + * @put_buf_func: Output callback (NULL to use xdeflate_compress with output + * buffer) + * @put_buf_user: User data for callback + * @flags: Compression flags (XDEFLATE_* ORed together) + * + * Returns XDEFLATE_STATUS_OKAY on success. + * No corresponding deinit needed - no allocations made. + */ +xdeflate_status xdeflate_init(xdeflate_compressor* d, + xdeflate_put_buf_func put_buf_func, + void* put_buf_user, int flags); + +/* + * xdeflate_compress - Compress data + * + * @d: Compressor state + * @in_buf: Input data + * @in_buf_size: Input size (updated to bytes consumed) + * @out_buf: Output buffer + * @out_buf_size: Output capacity (updated to bytes written) + * @flush: Flush mode + * + * Returns status code. + */ +xdeflate_status xdeflate_compress(xdeflate_compressor* d, const void* in_buf, + size_t* in_buf_size, void* out_buf, + size_t* out_buf_size, xdeflate_flush flush); + +/* + * xdeflate_get_adler32 - Get current adler32 checksum + */ +uint32_t xdeflate_get_adler32(xdeflate_compressor* d); + +/* + * xdeflate_create_flags - Create compression flags from zlib-style parameters + * + * @level: 0-10 (0=store, 1=fastest, 10=best) + * @window_bits: -15 (raw deflate) or 15 (zlib header) + * @strategy: 0=default, 1=filtered, 2=huffman, 3=rle, 4=fixed + */ +uint32_t xdeflate_create_flags(int level, int window_bits, int strategy); + +/* ============================================================================ + * Decompression API + * ============================================================================ + */ + +/* Decompression flags */ +enum { + XINFLATE_FLAG_PARSE_ZLIB_HEADER = 1, /* Input has zlib header */ + XINFLATE_FLAG_HAS_MORE_INPUT = 2, /* More input available */ + XINFLATE_FLAG_USING_NON_WRAPPING_OUTPUT_BUF = + 4, /* Output buffer >= uncompressed size */ + XINFLATE_FLAG_COMPUTE_ADLER32 = 8 /* Compute adler32 checksum */ +}; + +/* Decompression status */ +typedef enum { + XINFLATE_STATUS_FAILED_CANNOT_MAKE_PROGRESS = -4, + XINFLATE_STATUS_BAD_PARAM = -3, + XINFLATE_STATUS_ADLER32_MISMATCH = -2, + XINFLATE_STATUS_FAILED = -1, + XINFLATE_STATUS_DONE = 0, + XINFLATE_STATUS_NEEDS_MORE_INPUT = 1, + XINFLATE_STATUS_HAS_MORE_OUTPUT = 2 +} xinflate_status; + +/* Internal constants */ +enum { + XINFLATE_MAX_HUFF_TABLES = 3, + XINFLATE_MAX_HUFF_SYMBOLS_0 = 288, + XINFLATE_MAX_HUFF_SYMBOLS_1 = 32, + XINFLATE_MAX_HUFF_SYMBOLS_2 = 19, + XINFLATE_FAST_LOOKUP_BITS = 10, + XINFLATE_FAST_LOOKUP_SIZE = 1 << XINFLATE_FAST_LOOKUP_BITS, + XINFLATE_LZ_DICT_SIZE = 32768 +}; + +#if XDEFLATE_HAS_64BIT_REGISTERS +typedef uint64_t xinflate_bit_buf_t; +#define XINFLATE_BITBUF_SIZE 64 +#else +typedef uint32_t xinflate_bit_buf_t; +#define XINFLATE_BITBUF_SIZE 32 +#endif + +/* Decompressor state (~11 KB) */ +typedef struct { + uint32_t m_state, m_num_bits, m_zhdr0, m_zhdr1; + uint32_t m_z_adler32, m_final, m_type, m_check_adler32; + uint32_t m_dist, m_counter, m_num_extra; + uint32_t m_table_sizes[XINFLATE_MAX_HUFF_TABLES]; + xinflate_bit_buf_t m_bit_buf; + size_t m_dist_from_out_buf_start; + int16_t m_look_up[XINFLATE_MAX_HUFF_TABLES][XINFLATE_FAST_LOOKUP_SIZE]; + int16_t m_tree_0[XINFLATE_MAX_HUFF_SYMBOLS_0 * 2]; + int16_t m_tree_1[XINFLATE_MAX_HUFF_SYMBOLS_1 * 2]; + int16_t m_tree_2[XINFLATE_MAX_HUFF_SYMBOLS_2 * 2]; + uint8_t m_code_size_0[XINFLATE_MAX_HUFF_SYMBOLS_0]; + uint8_t m_code_size_1[XINFLATE_MAX_HUFF_SYMBOLS_1]; + uint8_t m_code_size_2[XINFLATE_MAX_HUFF_SYMBOLS_2]; + uint8_t m_raw_header[4]; + uint8_t m_len_codes[XINFLATE_MAX_HUFF_SYMBOLS_0 + + XINFLATE_MAX_HUFF_SYMBOLS_1 + 137]; +} xinflate_decompressor; + +/* Initialize decompressor */ +#define xinflate_init(r) \ + do { \ + (r)->m_state = 0; \ + } while (0) + +/* Get adler32 after decompression */ +#define xinflate_get_adler32(r) ((r)->m_check_adler32) + +/* + * xinflate_decompress - Decompress data + * + * @r: Decompressor state + * @in_buf_next: Input buffer pointer + * @in_buf_size: Input size (updated to bytes consumed) + * @out_buf_start: Start of output buffer (for dictionary lookback) + * @out_buf_next: Current output position + * @out_buf_size: Output capacity (updated to bytes written) + * @flags: Decompression flags + * + * Returns status code. + */ +xinflate_status xinflate_decompress(xinflate_decompressor* r, + const uint8_t* in_buf_next, + size_t* in_buf_size, uint8_t* out_buf_start, + uint8_t* out_buf_next, size_t* out_buf_size, + uint32_t flags); + +/* ============================================================================ + * Convenience Functions + * ============================================================================ + */ + +#define XINFLATE_DECOMPRESS_MEM_TO_MEM_FAILED ((size_t)(-1)) + +/* + * xinflate_decompress_mem_to_mem - Decompress memory to memory + * + * @out_buf: Output buffer + * @out_buf_len: Output buffer size + * @src_buf: Compressed input + * @src_buf_len: Input size + * @flags: Decompression flags + * + * Returns bytes written, or XINFLATE_DECOMPRESS_MEM_TO_MEM_FAILED on error. + */ +size_t xinflate_decompress_mem_to_mem(void* out_buf, size_t out_buf_len, + const void* src_buf, size_t src_buf_len, + int flags); + +/* ============================================================================ + * Internal Macros + * ============================================================================ + */ + +#define XDEFLATE_MIN(a, b) (((a) < (b)) ? (a) : (b)) +#define XDEFLATE_MAX(a, b) (((a) > (b)) ? (a) : (b)) + +#define XDEFLATE_CLEAR_ARR(arr) memset(arr, 0, sizeof(arr)) +#define XDEFLATE_CLEAR_OBJ(obj) memset(&(obj), 0, sizeof(obj)) + +#define XDEFLATE_ASSERT(x) ((void)(x)) + +#define XDEFLATE_MACRO_END while (0) + +#if defined(_MSC_VER) +#define XDEFLATE_FORCEINLINE __forceinline +#elif defined(__GNUC__) || defined(__clang__) +#define XDEFLATE_FORCEINLINE static inline __attribute__((always_inline)) +#else +#define XDEFLATE_FORCEINLINE static inline +#endif + +/* Detect platform capabilities */ +#if defined(_M_IX86) || defined(_M_X64) || defined(__i386__) || \ + defined(__x86_64__) +#define XDEFLATE_X86_OR_X64 1 +#else +#define XDEFLATE_X86_OR_X64 0 +#endif + +#if defined(__BYTE_ORDER__) && defined(__ORDER_LITTLE_ENDIAN__) +#if (__BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__) +#define XDEFLATE_LITTLE_ENDIAN 1 +#else +#define XDEFLATE_LITTLE_ENDIAN 0 +#endif +#elif XDEFLATE_X86_OR_X64 +#define XDEFLATE_LITTLE_ENDIAN 1 +#else +#define XDEFLATE_LITTLE_ENDIAN 0 +#endif + +/* Allow unaligned loads on x86/x64 */ +#if XDEFLATE_X86_OR_X64 +#define XDEFLATE_USE_UNALIGNED_LOADS 1 +#else +#define XDEFLATE_USE_UNALIGNED_LOADS 0 +#endif + +/* Use memcpy for unaligned access (safer, usually optimized away) */ +#define XDEFLATE_UNALIGNED_USE_MEMCPY 1 + +/* Read/checksum helpers */ +#if !XDEFLATE_HAS_64BIT_REGISTERS +static uint16_t xdeflate_read_le16(const uint8_t* p) { + return (uint16_t)(p[0] | ((uint16_t)p[1] << 8)); +} +#endif + +static uint32_t xdeflate_read_le32(const uint8_t* p) { + return (uint32_t)p[0] | ((uint32_t)p[1] << 8) | + ((uint32_t)p[2] << 16) | ((uint32_t)p[3] << 24); +} + +static uint32_t xdeflate_adler32_update(uint32_t adler, const uint8_t* data, + size_t len) { + uint32_t s1 = adler & 0xffffu; + uint32_t s2 = adler >> 16; + while (len) { + size_t block = len < 5552u ? len : 5552u; + size_t i; + for (i = 0; i < block; ++i) { + s1 += data[i]; + s2 += s1; + } + s1 %= 65521u; + s2 %= 65521u; + data += block; + len -= block; + } + return (s2 << 16) | s1; +} + +/* ============================================================================ + * Compression Implementation + * ============================================================================ + */ + +/* Lookup tables for length/distance encoding */ +static const uint16_t s_len_sym[256] = { + 257, 258, 259, 260, 261, 262, 263, 264, 265, 265, 266, 266, 267, 267, 268, + 268, 269, 269, 269, 269, 270, 270, 270, 270, 271, 271, 271, 271, 272, 272, + 272, 272, 273, 273, 273, 273, 273, 273, 273, 273, 274, 274, 274, 274, 274, + 274, 274, 274, 275, 275, 275, 275, 275, 275, 275, 275, 276, 276, 276, 276, + 276, 276, 276, 276, 277, 277, 277, 277, 277, 277, 277, 277, 277, 277, 277, + 277, 277, 277, 277, 277, 278, 278, 278, 278, 278, 278, 278, 278, 278, 278, + 278, 278, 278, 278, 278, 278, 279, 279, 279, 279, 279, 279, 279, 279, 279, + 279, 279, 279, 279, 279, 279, 279, 280, 280, 280, 280, 280, 280, 280, 280, + 280, 280, 280, 280, 280, 280, 280, 280, 281, 281, 281, 281, 281, 281, 281, + 281, 281, 281, 281, 281, 281, 281, 281, 281, 281, 281, 281, 281, 281, 281, + 281, 281, 281, 281, 281, 281, 281, 281, 281, 281, 282, 282, 282, 282, 282, + 282, 282, 282, 282, 282, 282, 282, 282, 282, 282, 282, 282, 282, 282, 282, + 282, 282, 282, 282, 282, 282, 282, 282, 282, 282, 282, 282, 283, 283, 283, + 283, 283, 283, 283, 283, 283, 283, 283, 283, 283, 283, 283, 283, 283, 283, + 283, 283, 283, 283, 283, 283, 283, 283, 283, 283, 283, 283, 283, 283, 284, + 284, 284, 284, 284, 284, 284, 284, 284, 284, 284, 284, 284, 284, 284, 284, + 284, 284, 284, 284, 284, 284, 284, 284, 284, 284, 284, 284, 284, 284, 284, + 285}; + +static const uint8_t s_len_extra[256] = { + 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, + 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 0}; + +static const uint8_t s_small_dist_sym[512] = { + 0, 1, 2, 3, 4, 4, 5, 5, 6, 6, 6, 6, 7, 7, 7, 7, 8, 8, 8, + 8, 8, 8, 8, 8, 9, 9, 9, 9, 9, 9, 9, 9, 10, 10, 10, 10, 10, 10, + 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 11, 11, 11, 11, 11, 11, 11, 11, 11, + 11, 11, 11, 11, 11, 11, 11, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, + 12, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, + 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, + 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, + 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, + 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, + 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, + 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17}; + +static const uint8_t s_small_dist_extra[512] = { + 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, + 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, + 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, + 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, + 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, + 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, + 6, 6, 6, 6, 6, 6, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, + 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, + 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, + 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, + 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, + 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, + 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, + 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, + 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, + 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, + 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7}; + +static const uint8_t s_large_dist_sym[128] = { + 0, 0, 18, 19, 20, 20, 21, 21, 22, 22, 22, 22, 23, 23, 23, 23, 24, 24, 24, + 24, 24, 24, 24, 24, 25, 25, 25, 25, 25, 25, 25, 25, 26, 26, 26, 26, 26, 26, + 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 27, 27, 27, 27, 27, 27, 27, 27, 27, + 27, 27, 27, 27, 27, 27, 27, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, + 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, + 28, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, + 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29}; + +static const uint8_t s_large_dist_extra[128] = { + 0, 0, 8, 8, 9, 9, 9, 9, 10, 10, 10, 10, 10, 10, 10, 10, 11, 11, 11, + 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 12, 12, 12, 12, 12, 12, + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, + 12, 12, 12, 12, 12, 12, 12, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13}; + +static const uint32_t s_bitmasks[17] = { + 0x0000, 0x0001, 0x0003, 0x0007, 0x000F, 0x001F, 0x003F, 0x007F, 0x00FF, + 0x01FF, 0x03FF, 0x07FF, 0x0FFF, 0x1FFF, 0x3FFF, 0x7FFF, 0xFFFF}; + +static const uint32_t s_num_probes[11] = {0, 1, 6, 32, 16, 32, + 128, 256, 512, 768, 1500}; + +/* Symbol frequency for radix sort */ +typedef struct { + uint16_t m_key, m_sym_index; +} sym_freq; + +static sym_freq* radix_sort_syms(uint32_t num_syms, sym_freq* pSyms0, + sym_freq* pSyms1) { + uint32_t total_passes = 2, pass_shift, pass, i, hist[256 * 2]; + sym_freq *pCur_syms = pSyms0, *pNew_syms = pSyms1; + XDEFLATE_CLEAR_ARR(hist); + for (i = 0; i < num_syms; i++) { + uint32_t freq = pSyms0[i].m_key; + hist[freq & 0xFF]++; + hist[256 + ((freq >> 8) & 0xFF)]++; + } + while ((total_passes > 1) && (num_syms == hist[(total_passes - 1) * 256])) + total_passes--; + for (pass_shift = 0, pass = 0; pass < total_passes; pass++, pass_shift += 8) { + const uint32_t* pHist = &hist[pass << 8]; + uint32_t offsets[256], cur_ofs = 0; + for (i = 0; i < 256; i++) { + offsets[i] = cur_ofs; + cur_ofs += pHist[i]; + } + for (i = 0; i < num_syms; i++) + pNew_syms[offsets[(pCur_syms[i].m_key >> pass_shift) & 0xFF]++] = + pCur_syms[i]; + { + sym_freq* t = pCur_syms; + pCur_syms = pNew_syms; + pNew_syms = t; + } + } + return pCur_syms; +} + +static void calculate_minimum_redundancy(sym_freq* A, int n) { + int root, leaf, next, avbl, used, dpth; + if (n == 0) + return; + else if (n == 1) { + A[0].m_key = 1; + return; + } + A[0].m_key += A[1].m_key; + root = 0; + leaf = 2; + for (next = 1; next < n - 1; next++) { + if (leaf >= n || A[root].m_key < A[leaf].m_key) { + A[next].m_key = A[root].m_key; + A[root++].m_key = (uint16_t)next; + } else + A[next].m_key = A[leaf++].m_key; + if (leaf >= n || (root < next && A[root].m_key < A[leaf].m_key)) { + A[next].m_key = (uint16_t)(A[next].m_key + A[root].m_key); + A[root++].m_key = (uint16_t)next; + } else + A[next].m_key = (uint16_t)(A[next].m_key + A[leaf++].m_key); + } + A[n - 2].m_key = 0; + for (next = n - 3; next >= 0; next--) + A[next].m_key = A[A[next].m_key].m_key + 1; + avbl = 1; + used = dpth = 0; + root = n - 2; + next = n - 1; + while (avbl > 0) { + while (root >= 0 && (int)A[root].m_key == dpth) { + used++; + root--; + } + while (avbl > used) { + A[next--].m_key = (uint16_t)(dpth); + avbl--; + } + avbl = 2 * used; + dpth++; + used = 0; + } +} + +#define MAX_SUPPORTED_HUFF_CODESIZE 32 + +static void huffman_enforce_max_code_size(int* pNum_codes, int code_list_len, + int max_code_size) { + int i; + uint32_t total = 0; + if (code_list_len <= 1) return; + for (i = max_code_size + 1; i <= MAX_SUPPORTED_HUFF_CODESIZE; i++) + pNum_codes[max_code_size] += pNum_codes[i]; + for (i = max_code_size; i > 0; i--) + total += (((uint32_t)pNum_codes[i]) << (max_code_size - i)); + while (total != (1UL << max_code_size)) { + pNum_codes[max_code_size]--; + for (i = max_code_size - 1; i > 0; i--) + if (pNum_codes[i]) { + pNum_codes[i]--; + pNum_codes[i + 1] += 2; + break; + } + total--; + } +} + +static void optimize_huffman_table(xdeflate_compressor* d, int table_num, + int table_len, int code_size_limit, + int static_table) { + int i, j, l, num_codes[1 + MAX_SUPPORTED_HUFF_CODESIZE]; + uint32_t next_code[MAX_SUPPORTED_HUFF_CODESIZE + 1]; + XDEFLATE_CLEAR_ARR(num_codes); + if (static_table) { + for (i = 0; i < table_len; i++) + num_codes[d->m_huff_code_sizes[table_num][i]]++; + } else { + sym_freq syms0[XDEFLATE_MAX_HUFF_SYMBOLS], syms1[XDEFLATE_MAX_HUFF_SYMBOLS], + *pSyms; + int num_used_syms = 0; + const uint16_t* pSym_count = &d->m_huff_count[table_num][0]; + for (i = 0; i < table_len; i++) + if (pSym_count[i]) { + syms0[num_used_syms].m_key = (uint16_t)pSym_count[i]; + syms0[num_used_syms++].m_sym_index = (uint16_t)i; + } + + pSyms = radix_sort_syms(num_used_syms, syms0, syms1); + calculate_minimum_redundancy(pSyms, num_used_syms); + + for (i = 0; i < num_used_syms; i++) num_codes[pSyms[i].m_key]++; + + huffman_enforce_max_code_size(num_codes, num_used_syms, code_size_limit); + + XDEFLATE_CLEAR_ARR(d->m_huff_code_sizes[table_num]); + XDEFLATE_CLEAR_ARR(d->m_huff_codes[table_num]); + for (i = 1, j = num_used_syms; i <= code_size_limit; i++) + for (l = num_codes[i]; l > 0; l--) + d->m_huff_code_sizes[table_num][pSyms[--j].m_sym_index] = (uint8_t)(i); + } + + next_code[1] = 0; + for (j = 0, i = 2; i <= code_size_limit; i++) + next_code[i] = j = ((j + num_codes[i - 1]) << 1); + + for (i = 0; i < table_len; i++) { + uint32_t rev_code = 0, code, code_size; + if ((code_size = d->m_huff_code_sizes[table_num][i]) == 0) continue; + code = next_code[code_size]++; + for (l = code_size; l > 0; l--, code >>= 1) + rev_code = (rev_code << 1) | (code & 1); + d->m_huff_codes[table_num][i] = (uint16_t)rev_code; + } +} + +#define PUT_BITS(b, l) \ + do { \ + uint32_t bits = b; \ + uint32_t len = l; \ + XDEFLATE_ASSERT(bits <= ((1U << len) - 1U)); \ + d->m_bit_buffer |= (bits << d->m_bits_in); \ + d->m_bits_in += len; \ + while (d->m_bits_in >= 8) { \ + if (d->m_pOutput_buf < d->m_pOutput_buf_end) \ + *d->m_pOutput_buf++ = (uint8_t)(d->m_bit_buffer); \ + d->m_bit_buffer >>= 8; \ + d->m_bits_in -= 8; \ + } \ + } \ + XDEFLATE_MACRO_END + +#define RLE_PREV_CODE_SIZE() \ + { \ + if (rle_repeat_count) { \ + if (rle_repeat_count < 3) { \ + d->m_huff_count[2][prev_code_size] = \ + (uint16_t)(d->m_huff_count[2][prev_code_size] + rle_repeat_count); \ + while (rle_repeat_count--) \ + packed_code_sizes[num_packed_code_sizes++] = prev_code_size; \ + } else { \ + d->m_huff_count[2][16] = (uint16_t)(d->m_huff_count[2][16] + 1); \ + packed_code_sizes[num_packed_code_sizes++] = 16; \ + packed_code_sizes[num_packed_code_sizes++] = \ + (uint8_t)(rle_repeat_count - 3); \ + } \ + rle_repeat_count = 0; \ + } \ + } + +#define RLE_ZERO_CODE_SIZE() \ + { \ + if (rle_z_count) { \ + if (rle_z_count < 3) { \ + d->m_huff_count[2][0] = \ + (uint16_t)(d->m_huff_count[2][0] + rle_z_count); \ + while (rle_z_count--) packed_code_sizes[num_packed_code_sizes++] = 0; \ + } else if (rle_z_count <= 10) { \ + d->m_huff_count[2][17] = (uint16_t)(d->m_huff_count[2][17] + 1); \ + packed_code_sizes[num_packed_code_sizes++] = 17; \ + packed_code_sizes[num_packed_code_sizes++] = \ + (uint8_t)(rle_z_count - 3); \ + } else { \ + d->m_huff_count[2][18] = (uint16_t)(d->m_huff_count[2][18] + 1); \ + packed_code_sizes[num_packed_code_sizes++] = 18; \ + packed_code_sizes[num_packed_code_sizes++] = \ + (uint8_t)(rle_z_count - 11); \ + } \ + rle_z_count = 0; \ + } \ + } + +static const uint8_t s_packed_code_size_syms_swizzle[] = { + 16, 17, 18, 0, 8, 7, 9, 6, 10, 5, 11, 4, 12, 3, 13, 2, 14, 1, 15}; + +static void start_dynamic_block(xdeflate_compressor* d) { + int num_lit_codes, num_dist_codes, num_bit_lengths; + uint32_t i, total_code_sizes_to_pack, num_packed_code_sizes, rle_z_count, + rle_repeat_count, packed_code_sizes_index; + uint8_t code_sizes_to_pack[XDEFLATE_MAX_HUFF_SYMBOLS_0 + + XDEFLATE_MAX_HUFF_SYMBOLS_1]; + uint8_t packed_code_sizes[XDEFLATE_MAX_HUFF_SYMBOLS_0 + + XDEFLATE_MAX_HUFF_SYMBOLS_1]; + uint8_t prev_code_size = 0xFF; + + d->m_huff_count[0][256] = 1; + + optimize_huffman_table(d, 0, XDEFLATE_MAX_HUFF_SYMBOLS_0, 15, 0); + optimize_huffman_table(d, 1, XDEFLATE_MAX_HUFF_SYMBOLS_1, 15, 0); + + for (num_lit_codes = 286; num_lit_codes > 257; num_lit_codes--) + if (d->m_huff_code_sizes[0][num_lit_codes - 1]) break; + for (num_dist_codes = 30; num_dist_codes > 1; num_dist_codes--) + if (d->m_huff_code_sizes[1][num_dist_codes - 1]) break; + + memcpy(code_sizes_to_pack, &d->m_huff_code_sizes[0][0], num_lit_codes); + memcpy(code_sizes_to_pack + num_lit_codes, &d->m_huff_code_sizes[1][0], + num_dist_codes); + total_code_sizes_to_pack = num_lit_codes + num_dist_codes; + num_packed_code_sizes = 0; + rle_z_count = 0; + rle_repeat_count = 0; + + memset(&d->m_huff_count[2][0], 0, + sizeof(d->m_huff_count[2][0]) * XDEFLATE_MAX_HUFF_SYMBOLS_2); + for (i = 0; i < total_code_sizes_to_pack; i++) { + uint8_t code_size = code_sizes_to_pack[i]; + if (!code_size) { + RLE_PREV_CODE_SIZE(); + if (++rle_z_count == 138) { + RLE_ZERO_CODE_SIZE(); + } + } else { + RLE_ZERO_CODE_SIZE(); + if (code_size != prev_code_size) { + RLE_PREV_CODE_SIZE(); + d->m_huff_count[2][code_size] = + (uint16_t)(d->m_huff_count[2][code_size] + 1); + packed_code_sizes[num_packed_code_sizes++] = code_size; + } else if (++rle_repeat_count == 6) { + RLE_PREV_CODE_SIZE(); + } + } + prev_code_size = code_size; + } + if (rle_repeat_count) { + RLE_PREV_CODE_SIZE(); + } else { + RLE_ZERO_CODE_SIZE(); + } + + optimize_huffman_table(d, 2, XDEFLATE_MAX_HUFF_SYMBOLS_2, 7, 0); + + PUT_BITS(2, 2); + + PUT_BITS(num_lit_codes - 257, 5); + PUT_BITS(num_dist_codes - 1, 5); + + for (num_bit_lengths = 18; num_bit_lengths >= 0; num_bit_lengths--) + if (d->m_huff_code_sizes[2] + [s_packed_code_size_syms_swizzle[num_bit_lengths]]) + break; + num_bit_lengths = XDEFLATE_MAX(4, (num_bit_lengths + 1)); + PUT_BITS(num_bit_lengths - 4, 4); + for (i = 0; (int)i < num_bit_lengths; i++) + PUT_BITS(d->m_huff_code_sizes[2][s_packed_code_size_syms_swizzle[i]], 3); + + for (packed_code_sizes_index = 0; + packed_code_sizes_index < num_packed_code_sizes;) { + uint32_t code = packed_code_sizes[packed_code_sizes_index++]; + XDEFLATE_ASSERT(code < XDEFLATE_MAX_HUFF_SYMBOLS_2); + PUT_BITS(d->m_huff_codes[2][code], d->m_huff_code_sizes[2][code]); + if (code >= 16) + PUT_BITS(packed_code_sizes[packed_code_sizes_index++], + "\02\03\07"[code - 16]); + } +} + +static void start_static_block(xdeflate_compressor* d) { + uint32_t i; + uint8_t* p = &d->m_huff_code_sizes[0][0]; + + for (i = 0; i <= 143; ++i) *p++ = 8; + for (; i <= 255; ++i) *p++ = 9; + for (; i <= 279; ++i) *p++ = 7; + for (; i <= 287; ++i) *p++ = 8; + + memset(d->m_huff_code_sizes[1], 5, 32); + + optimize_huffman_table(d, 0, 288, 15, 1); + optimize_huffman_table(d, 1, 32, 15, 1); + + PUT_BITS(1, 2); +} + +static int compress_lz_codes(xdeflate_compressor* d) { + uint32_t flags; + uint8_t* pLZ_codes; + + flags = 1; + for (pLZ_codes = d->m_lz_code_buf; pLZ_codes < d->m_pLZ_code_buf; + flags >>= 1) { + if (flags == 1) flags = *pLZ_codes++ | 0x100; + if (flags & 1) { + uint32_t sym, num_extra_bits; + uint32_t match_len = pLZ_codes[0], + match_dist = (pLZ_codes[1] | (pLZ_codes[2] << 8)); + pLZ_codes += 3; + + XDEFLATE_ASSERT(d->m_huff_code_sizes[0][s_len_sym[match_len]]); + PUT_BITS(d->m_huff_codes[0][s_len_sym[match_len]], + d->m_huff_code_sizes[0][s_len_sym[match_len]]); + PUT_BITS(match_len & s_bitmasks[s_len_extra[match_len]], + s_len_extra[match_len]); + + if (match_dist < 512) { + sym = s_small_dist_sym[match_dist]; + num_extra_bits = s_small_dist_extra[match_dist]; + } else { + sym = s_large_dist_sym[match_dist >> 8]; + num_extra_bits = s_large_dist_extra[match_dist >> 8]; + } + XDEFLATE_ASSERT(d->m_huff_code_sizes[1][sym]); + PUT_BITS(d->m_huff_codes[1][sym], d->m_huff_code_sizes[1][sym]); + PUT_BITS(match_dist & s_bitmasks[num_extra_bits], num_extra_bits); + } else { + uint32_t lit = *pLZ_codes++; + XDEFLATE_ASSERT(d->m_huff_code_sizes[0][lit]); + PUT_BITS(d->m_huff_codes[0][lit], d->m_huff_code_sizes[0][lit]); + } + } + + PUT_BITS(d->m_huff_codes[0][256], d->m_huff_code_sizes[0][256]); + + return (d->m_pOutput_buf < d->m_pOutput_buf_end); +} + +static int compress_block(xdeflate_compressor* d, int static_block) { + if (static_block) + start_static_block(d); + else + start_dynamic_block(d); + return compress_lz_codes(d); +} + +static int flush_block(xdeflate_compressor* d, int flush) { + uint32_t saved_bit_buf, saved_bits_in; + uint8_t* pSaved_output_buf; + int comp_block_succeeded = 0; + int n, use_raw_block = + ((d->m_flags & XDEFLATE_FORCE_RAW_BLOCKS) != 0) && + (d->m_lookahead_pos - d->m_lz_code_buf_dict_pos) <= d->m_dict_size; + uint8_t* pOutput_buf_start = + ((d->m_pPut_buf_func == NULL) && + ((*d->m_pOut_buf_size - d->m_out_buf_ofs) >= XDEFLATE_OUT_BUF_SIZE)) + ? ((uint8_t*)d->m_pOut_buf + d->m_out_buf_ofs) + : d->m_output_buf; + + d->m_pOutput_buf = pOutput_buf_start; + d->m_pOutput_buf_end = d->m_pOutput_buf + XDEFLATE_OUT_BUF_SIZE - 16; + + XDEFLATE_ASSERT(!d->m_output_flush_remaining); + d->m_output_flush_ofs = 0; + d->m_output_flush_remaining = 0; + + *d->m_pLZ_flags = (uint8_t)(*d->m_pLZ_flags >> d->m_num_flags_left); + d->m_pLZ_code_buf -= (d->m_num_flags_left == 8); + + if ((d->m_flags & XDEFLATE_WRITE_ZLIB_HEADER) && (!d->m_block_index)) { + const uint8_t cmf = 0x78; + uint8_t flg, flevel = 3; + uint32_t header, i, mz_un = sizeof(s_num_probes) / sizeof(uint32_t); + + for (i = 0; i < mz_un; i++) + if (s_num_probes[i] == (d->m_flags & 0xFFF)) break; + + if (i < 2) + flevel = 0; + else if (i < 6) + flevel = 1; + else if (i == 6) + flevel = 2; + + header = cmf << 8 | (flevel << 6); + header += 31 - (header % 31); + flg = header & 0xFF; + + PUT_BITS(cmf, 8); + PUT_BITS(flg, 8); + } + + PUT_BITS(flush == XDEFLATE_FINISH, 1); + + pSaved_output_buf = d->m_pOutput_buf; + saved_bit_buf = d->m_bit_buffer; + saved_bits_in = d->m_bits_in; + + if (!use_raw_block) + comp_block_succeeded = + compress_block(d, (d->m_flags & XDEFLATE_FORCE_STATIC_BLOCKS) || + (d->m_total_lz_bytes < 48)); + + if (((use_raw_block) || + ((d->m_total_lz_bytes) && ((d->m_pOutput_buf - pSaved_output_buf + 1U) >= + d->m_total_lz_bytes))) && + ((d->m_lookahead_pos - d->m_lz_code_buf_dict_pos) <= d->m_dict_size)) { + uint32_t i; + d->m_pOutput_buf = pSaved_output_buf; + d->m_bit_buffer = saved_bit_buf; + d->m_bits_in = saved_bits_in; + PUT_BITS(0, 2); + if (d->m_bits_in) { + PUT_BITS(0, 8 - d->m_bits_in); + } + for (i = 2; i; --i, d->m_total_lz_bytes ^= 0xFFFF) { + PUT_BITS(d->m_total_lz_bytes & 0xFFFF, 16); + } + for (i = 0; i < d->m_total_lz_bytes; ++i) { + PUT_BITS(d->m_dict[(d->m_lz_code_buf_dict_pos + i) & + XDEFLATE_LZ_DICT_SIZE_MASK], + 8); + } + } else if (!comp_block_succeeded) { + d->m_pOutput_buf = pSaved_output_buf; + d->m_bit_buffer = saved_bit_buf; + d->m_bits_in = saved_bits_in; + compress_block(d, 1); + } + + if (flush) { + if (flush == XDEFLATE_FINISH) { + if (d->m_bits_in) { + PUT_BITS(0, 8 - d->m_bits_in); + } + if (d->m_flags & XDEFLATE_WRITE_ZLIB_HEADER) { + uint32_t i, a = d->m_adler32; + for (i = 0; i < 4; i++) { + PUT_BITS((a >> 24) & 0xFF, 8); + a <<= 8; + } + } + } else { + uint32_t i, z = 0; + PUT_BITS(0, 3); + if (d->m_bits_in) { + PUT_BITS(0, 8 - d->m_bits_in); + } + for (i = 2; i; --i, z ^= 0xFFFF) { + PUT_BITS(z & 0xFFFF, 16); + } + } + } + + XDEFLATE_ASSERT(d->m_pOutput_buf < d->m_pOutput_buf_end); + + memset(&d->m_huff_count[0][0], 0, + sizeof(d->m_huff_count[0][0]) * XDEFLATE_MAX_HUFF_SYMBOLS_0); + memset(&d->m_huff_count[1][0], 0, + sizeof(d->m_huff_count[1][0]) * XDEFLATE_MAX_HUFF_SYMBOLS_1); + + d->m_pLZ_code_buf = d->m_lz_code_buf + 1; + d->m_pLZ_flags = d->m_lz_code_buf; + d->m_num_flags_left = 8; + d->m_lz_code_buf_dict_pos += d->m_total_lz_bytes; + d->m_total_lz_bytes = 0; + d->m_block_index++; + + if ((n = (int)(d->m_pOutput_buf - pOutput_buf_start)) != 0) { + if (d->m_pPut_buf_func) { + *d->m_pIn_buf_size = d->m_pSrc - (const uint8_t*)d->m_pIn_buf; + if (!(*d->m_pPut_buf_func)(d->m_output_buf, n, d->m_pPut_buf_user)) + return (d->m_prev_return_status = XDEFLATE_STATUS_PUT_BUF_FAILED); + } else if (pOutput_buf_start == d->m_output_buf) { + int bytes_to_copy = (int)XDEFLATE_MIN( + (size_t)n, (size_t)(*d->m_pOut_buf_size - d->m_out_buf_ofs)); + memcpy((uint8_t*)d->m_pOut_buf + d->m_out_buf_ofs, d->m_output_buf, + bytes_to_copy); + d->m_out_buf_ofs += bytes_to_copy; + if ((n -= bytes_to_copy) != 0) { + d->m_output_flush_ofs = bytes_to_copy; + d->m_output_flush_remaining = n; + } + } else { + d->m_out_buf_ofs += n; + } + } + + return d->m_output_flush_remaining; +} + +XDEFLATE_FORCEINLINE void find_match(xdeflate_compressor* d, + uint32_t lookahead_pos, uint32_t max_dist, + uint32_t max_match_len, + uint32_t* pMatch_dist, + uint32_t* pMatch_len) { + uint32_t dist, pos = lookahead_pos & XDEFLATE_LZ_DICT_SIZE_MASK, + match_len = *pMatch_len, probe_pos = pos, next_probe_pos, + probe_len; + uint32_t num_probes_left = d->m_max_probes[match_len >= 32]; + const uint8_t *s = d->m_dict + pos, *p, *q; + uint8_t c0 = d->m_dict[pos + match_len], c1 = d->m_dict[pos + match_len - 1]; + XDEFLATE_ASSERT(max_match_len <= XDEFLATE_MAX_MATCH_LEN); + if (max_match_len <= match_len) return; + for (;;) { + for (;;) { + if (--num_probes_left == 0) return; + next_probe_pos = d->m_next[probe_pos]; + if ((!next_probe_pos) || + ((dist = (uint16_t)(lookahead_pos - next_probe_pos)) > max_dist)) + return; + probe_pos = next_probe_pos & XDEFLATE_LZ_DICT_SIZE_MASK; + if ((d->m_dict[probe_pos + match_len] == c0) && + (d->m_dict[probe_pos + match_len - 1] == c1)) + break; + } + if (!dist) break; + p = s; + q = d->m_dict + probe_pos; + for (probe_len = 0; probe_len < max_match_len; probe_len++) + if (*p++ != *q++) break; + if (probe_len > match_len) { + *pMatch_dist = dist; + if ((*pMatch_len = match_len = probe_len) == max_match_len) return; + c0 = d->m_dict[pos + match_len]; + c1 = d->m_dict[pos + match_len - 1]; + } + } +} + +XDEFLATE_FORCEINLINE void record_literal(xdeflate_compressor* d, uint8_t lit) { + d->m_total_lz_bytes++; + *d->m_pLZ_code_buf++ = lit; + *d->m_pLZ_flags = (uint8_t)(*d->m_pLZ_flags >> 1); + if (--d->m_num_flags_left == 0) { + d->m_num_flags_left = 8; + d->m_pLZ_flags = d->m_pLZ_code_buf++; + } + d->m_huff_count[0][lit]++; +} + +XDEFLATE_FORCEINLINE void record_match(xdeflate_compressor* d, + uint32_t match_len, + uint32_t match_dist) { + uint32_t s0, s1; + + XDEFLATE_ASSERT((match_len >= XDEFLATE_MIN_MATCH_LEN) && (match_dist >= 1) && + (match_dist <= XDEFLATE_LZ_DICT_SIZE)); + + d->m_total_lz_bytes += match_len; + + d->m_pLZ_code_buf[0] = (uint8_t)(match_len - XDEFLATE_MIN_MATCH_LEN); + + match_dist -= 1; + d->m_pLZ_code_buf[1] = (uint8_t)(match_dist & 0xFF); + d->m_pLZ_code_buf[2] = (uint8_t)(match_dist >> 8); + d->m_pLZ_code_buf += 3; + + *d->m_pLZ_flags = (uint8_t)((*d->m_pLZ_flags >> 1) | 0x80); + if (--d->m_num_flags_left == 0) { + d->m_num_flags_left = 8; + d->m_pLZ_flags = d->m_pLZ_code_buf++; + } + + s0 = s_small_dist_sym[match_dist & 511]; + s1 = s_large_dist_sym[(match_dist >> 8) & 127]; + d->m_huff_count[1][(match_dist < 512) ? s0 : s1]++; + d->m_huff_count[0][s_len_sym[match_len - XDEFLATE_MIN_MATCH_LEN]]++; +} + +static int compress_normal(xdeflate_compressor* d) { + const uint8_t* pSrc = d->m_pSrc; + size_t src_buf_left = d->m_src_buf_left; + xdeflate_flush flush = d->m_flush; + + while ((src_buf_left) || ((flush) && (d->m_lookahead_size))) { + uint32_t len_to_move, cur_match_dist, cur_match_len, cur_pos; + if ((d->m_lookahead_size + d->m_dict_size) >= + (XDEFLATE_MIN_MATCH_LEN - 1)) { + uint32_t dst_pos = (d->m_lookahead_pos + d->m_lookahead_size) & + XDEFLATE_LZ_DICT_SIZE_MASK; + uint32_t ins_pos = d->m_lookahead_pos + d->m_lookahead_size - 2; + uint32_t hash = (d->m_dict[ins_pos & XDEFLATE_LZ_DICT_SIZE_MASK] + << XDEFLATE_LZ_HASH_SHIFT) ^ + d->m_dict[(ins_pos + 1) & XDEFLATE_LZ_DICT_SIZE_MASK]; + uint32_t num_bytes_to_process = (uint32_t)XDEFLATE_MIN( + src_buf_left, XDEFLATE_MAX_MATCH_LEN - d->m_lookahead_size); + const uint8_t* pSrc_end = pSrc ? pSrc + num_bytes_to_process : NULL; + src_buf_left -= num_bytes_to_process; + d->m_lookahead_size += num_bytes_to_process; + while (pSrc != pSrc_end) { + uint8_t c = *pSrc++; + d->m_dict[dst_pos] = c; + if (dst_pos < (XDEFLATE_MAX_MATCH_LEN - 1)) + d->m_dict[XDEFLATE_LZ_DICT_SIZE + dst_pos] = c; + hash = ((hash << XDEFLATE_LZ_HASH_SHIFT) ^ c) & + (XDEFLATE_LZ_HASH_SIZE - 1); + d->m_next[ins_pos & XDEFLATE_LZ_DICT_SIZE_MASK] = d->m_hash[hash]; + d->m_hash[hash] = (uint16_t)(ins_pos); + dst_pos = (dst_pos + 1) & XDEFLATE_LZ_DICT_SIZE_MASK; + ins_pos++; + } + } else { + while ((src_buf_left) && (d->m_lookahead_size < XDEFLATE_MAX_MATCH_LEN)) { + uint8_t c = *pSrc++; + uint32_t dst_pos = (d->m_lookahead_pos + d->m_lookahead_size) & + XDEFLATE_LZ_DICT_SIZE_MASK; + src_buf_left--; + d->m_dict[dst_pos] = c; + if (dst_pos < (XDEFLATE_MAX_MATCH_LEN - 1)) + d->m_dict[XDEFLATE_LZ_DICT_SIZE + dst_pos] = c; + if ((++d->m_lookahead_size + d->m_dict_size) >= + XDEFLATE_MIN_MATCH_LEN) { + uint32_t ins_pos = d->m_lookahead_pos + (d->m_lookahead_size - 1) - 2; + uint32_t hash = + ((d->m_dict[ins_pos & XDEFLATE_LZ_DICT_SIZE_MASK] + << (XDEFLATE_LZ_HASH_SHIFT * 2)) ^ + (d->m_dict[(ins_pos + 1) & XDEFLATE_LZ_DICT_SIZE_MASK] + << XDEFLATE_LZ_HASH_SHIFT) ^ + c) & + (XDEFLATE_LZ_HASH_SIZE - 1); + d->m_next[ins_pos & XDEFLATE_LZ_DICT_SIZE_MASK] = d->m_hash[hash]; + d->m_hash[hash] = (uint16_t)(ins_pos); + } + } + } + d->m_dict_size = XDEFLATE_MIN(XDEFLATE_LZ_DICT_SIZE - d->m_lookahead_size, + d->m_dict_size); + if ((!flush) && (d->m_lookahead_size < XDEFLATE_MAX_MATCH_LEN)) break; + + len_to_move = 1; + cur_match_dist = 0; + cur_match_len = d->m_saved_match_len ? d->m_saved_match_len + : (XDEFLATE_MIN_MATCH_LEN - 1); + cur_pos = d->m_lookahead_pos & XDEFLATE_LZ_DICT_SIZE_MASK; + if (d->m_flags & (XDEFLATE_RLE_MATCHES | XDEFLATE_FORCE_RAW_BLOCKS)) { + if ((d->m_dict_size) && (!(d->m_flags & XDEFLATE_FORCE_RAW_BLOCKS))) { + uint8_t c = d->m_dict[(cur_pos - 1) & XDEFLATE_LZ_DICT_SIZE_MASK]; + cur_match_len = 0; + while (cur_match_len < d->m_lookahead_size) { + if (d->m_dict[cur_pos + cur_match_len] != c) break; + cur_match_len++; + } + if (cur_match_len < XDEFLATE_MIN_MATCH_LEN) + cur_match_len = 0; + else + cur_match_dist = 1; + } + } else { + find_match(d, d->m_lookahead_pos, d->m_dict_size, d->m_lookahead_size, + &cur_match_dist, &cur_match_len); + } + if (((cur_match_len == XDEFLATE_MIN_MATCH_LEN) && + (cur_match_dist >= 8U * 1024U)) || + (cur_pos == cur_match_dist) || + ((d->m_flags & XDEFLATE_FILTER_MATCHES) && (cur_match_len <= 5))) { + cur_match_dist = cur_match_len = 0; + } + if (d->m_saved_match_len) { + if (cur_match_len > d->m_saved_match_len) { + record_literal(d, (uint8_t)d->m_saved_lit); + if (cur_match_len >= 128) { + record_match(d, cur_match_len, cur_match_dist); + d->m_saved_match_len = 0; + len_to_move = cur_match_len; + } else { + d->m_saved_lit = d->m_dict[cur_pos]; + d->m_saved_match_dist = cur_match_dist; + d->m_saved_match_len = cur_match_len; + } + } else { + record_match(d, d->m_saved_match_len, d->m_saved_match_dist); + len_to_move = d->m_saved_match_len - 1; + d->m_saved_match_len = 0; + } + } else if (!cur_match_dist) + record_literal(d, + d->m_dict[XDEFLATE_MIN(cur_pos, sizeof(d->m_dict) - 1)]); + else if ((d->m_greedy_parsing) || (d->m_flags & XDEFLATE_RLE_MATCHES) || + (cur_match_len >= 128)) { + record_match(d, cur_match_len, cur_match_dist); + len_to_move = cur_match_len; + } else { + d->m_saved_lit = d->m_dict[XDEFLATE_MIN(cur_pos, sizeof(d->m_dict) - 1)]; + d->m_saved_match_dist = cur_match_dist; + d->m_saved_match_len = cur_match_len; + } + d->m_lookahead_pos += len_to_move; + XDEFLATE_ASSERT(d->m_lookahead_size >= len_to_move); + d->m_lookahead_size -= len_to_move; + d->m_dict_size = XDEFLATE_MIN(d->m_dict_size + len_to_move, + (uint32_t)XDEFLATE_LZ_DICT_SIZE); + if ((d->m_pLZ_code_buf > + &d->m_lz_code_buf[XDEFLATE_LZ_CODE_BUF_SIZE - 8]) || + ((d->m_total_lz_bytes > 31 * 1024) && + (((((uint32_t)(d->m_pLZ_code_buf - d->m_lz_code_buf) * 115) >> 7) >= + d->m_total_lz_bytes) || + (d->m_flags & XDEFLATE_FORCE_RAW_BLOCKS)))) { + int n; + d->m_pSrc = pSrc; + d->m_src_buf_left = src_buf_left; + if ((n = flush_block(d, 0)) != 0) return (n < 0) ? 0 : 1; + } + } + + d->m_pSrc = pSrc; + d->m_src_buf_left = src_buf_left; + return 1; +} + +static xdeflate_status flush_output_buffer(xdeflate_compressor* d) { + if (d->m_pIn_buf_size) { + *d->m_pIn_buf_size = d->m_pSrc - (const uint8_t*)d->m_pIn_buf; + } + + if (d->m_pOut_buf_size) { + size_t n = XDEFLATE_MIN(*d->m_pOut_buf_size - d->m_out_buf_ofs, + d->m_output_flush_remaining); + memcpy((uint8_t*)d->m_pOut_buf + d->m_out_buf_ofs, + d->m_output_buf + d->m_output_flush_ofs, n); + d->m_output_flush_ofs += (uint32_t)n; + d->m_output_flush_remaining -= (uint32_t)n; + d->m_out_buf_ofs += n; + + *d->m_pOut_buf_size = d->m_out_buf_ofs; + } + + return (d->m_finished && !d->m_output_flush_remaining) ? XDEFLATE_STATUS_DONE + : XDEFLATE_STATUS_OKAY; +} + +/* ============================================================================ + * Public Compression API + * ============================================================================ + */ + +xdeflate_status xdeflate_init(xdeflate_compressor* d, + xdeflate_put_buf_func put_buf_func, + void* put_buf_user, int flags) { + d->m_pPut_buf_func = put_buf_func; + d->m_pPut_buf_user = put_buf_user; + d->m_flags = (uint32_t)(flags); + d->m_max_probes[0] = 1 + ((flags & 0xFFF) + 2) / 3; + d->m_greedy_parsing = (flags & XDEFLATE_GREEDY_PARSING) != 0; + d->m_max_probes[1] = 1 + (((flags & 0xFFF) >> 2) + 2) / 3; + if (!(flags & XDEFLATE_NONDETERMINISTIC)) XDEFLATE_CLEAR_ARR(d->m_hash); + d->m_lookahead_pos = d->m_lookahead_size = d->m_dict_size = + d->m_total_lz_bytes = d->m_lz_code_buf_dict_pos = d->m_bits_in = 0; + d->m_output_flush_ofs = d->m_output_flush_remaining = d->m_finished = + d->m_block_index = d->m_bit_buffer = d->m_wants_to_finish = 0; + d->m_pLZ_code_buf = d->m_lz_code_buf + 1; + d->m_pLZ_flags = d->m_lz_code_buf; + *d->m_pLZ_flags = 0; + d->m_num_flags_left = 8; + d->m_pOutput_buf = d->m_output_buf; + d->m_pOutput_buf_end = d->m_output_buf; + d->m_prev_return_status = XDEFLATE_STATUS_OKAY; + d->m_saved_match_dist = d->m_saved_match_len = d->m_saved_lit = 0; + d->m_adler32 = 1; + d->m_pIn_buf = NULL; + d->m_pOut_buf = NULL; + d->m_pIn_buf_size = NULL; + d->m_pOut_buf_size = NULL; + d->m_flush = XDEFLATE_NO_FLUSH; + d->m_pSrc = NULL; + d->m_src_buf_left = 0; + d->m_out_buf_ofs = 0; + if (!(flags & XDEFLATE_NONDETERMINISTIC)) XDEFLATE_CLEAR_ARR(d->m_dict); + memset(&d->m_huff_count[0][0], 0, + sizeof(d->m_huff_count[0][0]) * XDEFLATE_MAX_HUFF_SYMBOLS_0); + memset(&d->m_huff_count[1][0], 0, + sizeof(d->m_huff_count[1][0]) * XDEFLATE_MAX_HUFF_SYMBOLS_1); + return XDEFLATE_STATUS_OKAY; +} + +xdeflate_status xdeflate_compress(xdeflate_compressor* d, const void* in_buf, + size_t* in_buf_size, void* out_buf, + size_t* out_buf_size, xdeflate_flush flush) { + if (!d) { + if (in_buf_size) *in_buf_size = 0; + if (out_buf_size) *out_buf_size = 0; + return XDEFLATE_STATUS_BAD_PARAM; + } + + d->m_pIn_buf = in_buf; + d->m_pIn_buf_size = in_buf_size; + d->m_pOut_buf = out_buf; + d->m_pOut_buf_size = out_buf_size; + d->m_pSrc = (const uint8_t*)(in_buf); + d->m_src_buf_left = in_buf_size ? *in_buf_size : 0; + d->m_out_buf_ofs = 0; + d->m_flush = flush; + + if (((d->m_pPut_buf_func != NULL) == + ((out_buf != NULL) || (out_buf_size != NULL))) || + (d->m_prev_return_status != XDEFLATE_STATUS_OKAY) || + (d->m_wants_to_finish && (flush != XDEFLATE_FINISH)) || + (in_buf_size && *in_buf_size && !in_buf) || + (out_buf_size && *out_buf_size && !out_buf)) { + if (in_buf_size) *in_buf_size = 0; + if (out_buf_size) *out_buf_size = 0; + return (d->m_prev_return_status = XDEFLATE_STATUS_BAD_PARAM); + } + d->m_wants_to_finish |= (flush == XDEFLATE_FINISH); + + if ((d->m_output_flush_remaining) || (d->m_finished)) + return (d->m_prev_return_status = flush_output_buffer(d)); + + if (!compress_normal(d)) return d->m_prev_return_status; + + if ((d->m_flags & (XDEFLATE_WRITE_ZLIB_HEADER | XDEFLATE_COMPUTE_ADLER32)) && + (in_buf)) + d->m_adler32 = xdeflate_adler32_update( + d->m_adler32, (const uint8_t*)in_buf, + (size_t)(d->m_pSrc - (const uint8_t*)in_buf)); + + if ((flush) && (!d->m_lookahead_size) && (!d->m_src_buf_left) && + (!d->m_output_flush_remaining)) { + if (flush_block(d, flush) < 0) return d->m_prev_return_status; + d->m_finished = (flush == XDEFLATE_FINISH); + if (flush == XDEFLATE_FULL_FLUSH) { + XDEFLATE_CLEAR_ARR(d->m_hash); + XDEFLATE_CLEAR_ARR(d->m_next); + d->m_dict_size = 0; + } + } + + return (d->m_prev_return_status = flush_output_buffer(d)); +} + +uint32_t xdeflate_get_adler32(xdeflate_compressor* d) { return d->m_adler32; } + +uint32_t xdeflate_create_flags(int level, int window_bits, int strategy) { + uint32_t comp_flags = + s_num_probes[(level >= 0) ? XDEFLATE_MIN(10, level) : 6] | + ((level <= 3) ? XDEFLATE_GREEDY_PARSING : 0); + if (window_bits > 0) comp_flags |= XDEFLATE_WRITE_ZLIB_HEADER; + + if (!level) + comp_flags |= XDEFLATE_FORCE_RAW_BLOCKS; + else if (strategy == 1) /* filtered */ + comp_flags |= XDEFLATE_FILTER_MATCHES; + else if (strategy == 2) /* huffman only */ + comp_flags &= ~XDEFLATE_MAX_PROBES_MASK; + else if (strategy == 4) /* fixed */ + comp_flags |= XDEFLATE_FORCE_STATIC_BLOCKS; + else if (strategy == 3) /* rle */ + comp_flags |= XDEFLATE_RLE_MATCHES; + + return comp_flags; +} + +/* ============================================================================ + * Decompression Implementation + * ============================================================================ + */ + +#define XINFLATE_MEMCPY(d, s, l) memcpy(d, s, l) +#define XINFLATE_MEMSET(p, c, l) memset(p, c, l) + +#define XINFLATE_CR_BEGIN \ + switch (r->m_state) { \ + case 0: +#define XINFLATE_CR_RETURN(state_index, result) \ + do { \ + status = result; \ + r->m_state = state_index; \ + goto common_exit; \ + case state_index:; \ + } \ + XDEFLATE_MACRO_END +#define XINFLATE_CR_RETURN_FOREVER(state_index, result) \ + do { \ + for (;;) { \ + XINFLATE_CR_RETURN(state_index, result); \ + } \ + } \ + XDEFLATE_MACRO_END +#define XINFLATE_CR_FINISH } + +#define XINFLATE_GET_BYTE(state_index, c) \ + do { \ + while (pIn_buf_cur >= pIn_buf_end) { \ + XINFLATE_CR_RETURN(state_index, \ + (decomp_flags & XINFLATE_FLAG_HAS_MORE_INPUT) \ + ? XINFLATE_STATUS_NEEDS_MORE_INPUT \ + : XINFLATE_STATUS_FAILED_CANNOT_MAKE_PROGRESS); \ + } \ + (c) = *pIn_buf_cur++; \ + } \ + XDEFLATE_MACRO_END + +#define XINFLATE_NEED_BITS(state_index, n) \ + do { \ + uint32_t c; \ + XINFLATE_GET_BYTE(state_index, c); \ + bit_buf |= (((xinflate_bit_buf_t)c) << num_bits); \ + num_bits += 8; \ + } while (num_bits < (uint32_t)(n)) +#define XINFLATE_SKIP_BITS(state_index, n) \ + do { \ + if (num_bits < (uint32_t)(n)) { \ + XINFLATE_NEED_BITS(state_index, n); \ + } \ + bit_buf >>= (n); \ + num_bits -= (n); \ + } \ + XDEFLATE_MACRO_END +#define XINFLATE_GET_BITS(state_index, b, n) \ + do { \ + if (num_bits < (uint32_t)(n)) { \ + XINFLATE_NEED_BITS(state_index, n); \ + } \ + (b) = bit_buf & ((1 << (n)) - 1); \ + bit_buf >>= (n); \ + num_bits -= (n); \ + } \ + XDEFLATE_MACRO_END + +#define XINFLATE_HUFF_BITBUF_FILL(state_index, pLookUp, pTree) \ + do { \ + temp = (pLookUp)[bit_buf & (XINFLATE_FAST_LOOKUP_SIZE - 1)]; \ + if (temp >= 0) { \ + code_len = temp >> 9; \ + if ((code_len) && (num_bits >= code_len)) break; \ + } else if (num_bits > XINFLATE_FAST_LOOKUP_BITS) { \ + code_len = XINFLATE_FAST_LOOKUP_BITS; \ + do { \ + temp = (pTree)[~temp + ((bit_buf >> code_len++) & 1)]; \ + } while ((temp < 0) && (num_bits >= (code_len + 1))); \ + if (temp >= 0) break; \ + } \ + XINFLATE_GET_BYTE(state_index, c); \ + bit_buf |= (((xinflate_bit_buf_t)c) << num_bits); \ + num_bits += 8; \ + } while (num_bits < 15); + +#define XINFLATE_HUFF_DECODE(state_index, sym, pLookUp, pTree) \ + do { \ + int temp; \ + uint32_t code_len, c; \ + if (num_bits < 15) { \ + if ((pIn_buf_end - pIn_buf_cur) < 2) { \ + XINFLATE_HUFF_BITBUF_FILL(state_index, pLookUp, pTree); \ + } else { \ + bit_buf |= (((xinflate_bit_buf_t)pIn_buf_cur[0]) << num_bits) | \ + (((xinflate_bit_buf_t)pIn_buf_cur[1]) << (num_bits + 8)); \ + pIn_buf_cur += 2; \ + num_bits += 16; \ + } \ + } \ + if ((temp = (pLookUp)[bit_buf & (XINFLATE_FAST_LOOKUP_SIZE - 1)]) >= 0) \ + code_len = temp >> 9, temp &= 511; \ + else { \ + code_len = XINFLATE_FAST_LOOKUP_BITS; \ + do { \ + temp = (pTree)[~temp + ((bit_buf >> code_len++) & 1)]; \ + } while (temp < 0); \ + } \ + (sym) = temp; \ + bit_buf >>= code_len; \ + num_bits -= code_len; \ + } \ + XDEFLATE_MACRO_END + +static void xinflate_clear_tree(xinflate_decompressor* r) { + if (r->m_type == 0) + XDEFLATE_CLEAR_ARR(r->m_tree_0); + else if (r->m_type == 1) + XDEFLATE_CLEAR_ARR(r->m_tree_1); + else + XDEFLATE_CLEAR_ARR(r->m_tree_2); +} + +xinflate_status xinflate_decompress( + xinflate_decompressor* r, const uint8_t* pIn_buf_next, size_t* pIn_buf_size, + uint8_t* pOut_buf_start, uint8_t* pOut_buf_next, size_t* pOut_buf_size, + uint32_t decomp_flags) { + static const uint16_t s_length_base[31] = { + 3, 4, 5, 6, 7, 8, 9, 10, 11, 13, 15, 17, 19, 23, 27, 31, + 35, 43, 51, 59, 67, 83, 99, 115, 131, 163, 195, 227, 258, 0, 0}; + static const uint8_t s_length_extra[31] = {0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, + 1, 2, 2, 2, 2, 3, 3, 3, 3, 4, 4, + 4, 4, 5, 5, 5, 5, 0, 0, 0}; + static const uint16_t s_dist_base[32] = { + 1, 2, 3, 4, 5, 7, 9, 13, 17, 25, 33, + 49, 65, 97, 129, 193, 257, 385, 513, 769, 1025, 1537, + 2049, 3073, 4097, 6145, 8193, 12289, 16385, 24577, 0, 0}; + static const uint8_t s_dist_extra[32] = { + 0, 0, 0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, + 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13}; + static const uint8_t s_length_dezigzag[19] = { + 16, 17, 18, 0, 8, 7, 9, 6, 10, 5, 11, 4, 12, 3, 13, 2, 14, 1, 15}; + static const uint16_t s_min_table_sizes[3] = {257, 1, 4}; + + int16_t* pTrees[3]; + uint8_t* pCode_sizes[3]; + + xinflate_status status = XINFLATE_STATUS_FAILED; + uint32_t num_bits, dist, counter, num_extra; + xinflate_bit_buf_t bit_buf; + const uint8_t *pIn_buf_cur = pIn_buf_next, *const pIn_buf_end = + pIn_buf_next + *pIn_buf_size; + uint8_t *pOut_buf_cur = pOut_buf_next, *const pOut_buf_end = + pOut_buf_next ? pOut_buf_next + + *pOut_buf_size + : NULL; + size_t out_buf_size_mask = + (decomp_flags & XINFLATE_FLAG_USING_NON_WRAPPING_OUTPUT_BUF) + ? (size_t)-1 + : ((pOut_buf_next - pOut_buf_start) + *pOut_buf_size) - 1, + dist_from_out_buf_start; + + if (((out_buf_size_mask + 1) & out_buf_size_mask) || + (pOut_buf_next < pOut_buf_start)) { + *pIn_buf_size = *pOut_buf_size = 0; + return XINFLATE_STATUS_BAD_PARAM; + } + + pTrees[0] = r->m_tree_0; + pTrees[1] = r->m_tree_1; + pTrees[2] = r->m_tree_2; + pCode_sizes[0] = r->m_code_size_0; + pCode_sizes[1] = r->m_code_size_1; + pCode_sizes[2] = r->m_code_size_2; + + num_bits = r->m_num_bits; + bit_buf = r->m_bit_buf; + dist = r->m_dist; + counter = r->m_counter; + num_extra = r->m_num_extra; + dist_from_out_buf_start = r->m_dist_from_out_buf_start; + XINFLATE_CR_BEGIN + + bit_buf = num_bits = dist = counter = num_extra = r->m_zhdr0 = r->m_zhdr1 = 0; + r->m_z_adler32 = r->m_check_adler32 = 1; + if (decomp_flags & XINFLATE_FLAG_PARSE_ZLIB_HEADER) { + XINFLATE_GET_BYTE(1, r->m_zhdr0); + XINFLATE_GET_BYTE(2, r->m_zhdr1); + counter = (((r->m_zhdr0 * 256 + r->m_zhdr1) % 31 != 0) || + (r->m_zhdr1 & 32) || ((r->m_zhdr0 & 15) != 8)); + if (!(decomp_flags & XINFLATE_FLAG_USING_NON_WRAPPING_OUTPUT_BUF)) + counter |= (((1U << (8U + (r->m_zhdr0 >> 4))) > 32768U) || + ((out_buf_size_mask + 1) < + (size_t)((size_t)1 << (8U + (r->m_zhdr0 >> 4))))); + if (counter) { + XINFLATE_CR_RETURN_FOREVER(36, XINFLATE_STATUS_FAILED); + } + } + + do { + XINFLATE_GET_BITS(3, r->m_final, 3); + r->m_type = r->m_final >> 1; + if (r->m_type == 0) { + XINFLATE_SKIP_BITS(5, num_bits & 7); + for (counter = 0; counter < 4; ++counter) { + if (num_bits) + XINFLATE_GET_BITS(6, r->m_raw_header[counter], 8); + else + XINFLATE_GET_BYTE(7, r->m_raw_header[counter]); + } + if ((counter = (r->m_raw_header[0] | (r->m_raw_header[1] << 8))) != + (uint32_t)(0xFFFF ^ + (r->m_raw_header[2] | (r->m_raw_header[3] << 8)))) { + XINFLATE_CR_RETURN_FOREVER(39, XINFLATE_STATUS_FAILED); + } + while ((counter) && (num_bits)) { + XINFLATE_GET_BITS(51, dist, 8); + while (pOut_buf_cur >= pOut_buf_end) { + XINFLATE_CR_RETURN(52, XINFLATE_STATUS_HAS_MORE_OUTPUT); + } + *pOut_buf_cur++ = (uint8_t)dist; + counter--; + } + while (counter) { + size_t n; + while (pOut_buf_cur >= pOut_buf_end) { + XINFLATE_CR_RETURN(9, XINFLATE_STATUS_HAS_MORE_OUTPUT); + } + while (pIn_buf_cur >= pIn_buf_end) { + XINFLATE_CR_RETURN(38, + (decomp_flags & XINFLATE_FLAG_HAS_MORE_INPUT) + ? XINFLATE_STATUS_NEEDS_MORE_INPUT + : XINFLATE_STATUS_FAILED_CANNOT_MAKE_PROGRESS); + } + n = XDEFLATE_MIN(XDEFLATE_MIN((size_t)(pOut_buf_end - pOut_buf_cur), + (size_t)(pIn_buf_end - pIn_buf_cur)), + counter); + XINFLATE_MEMCPY(pOut_buf_cur, pIn_buf_cur, n); + pIn_buf_cur += n; + pOut_buf_cur += n; + counter -= (uint32_t)n; + } + } else if (r->m_type == 3) { + XINFLATE_CR_RETURN_FOREVER(10, XINFLATE_STATUS_FAILED); + } else { + if (r->m_type == 1) { + uint8_t* p = r->m_code_size_0; + uint32_t i; + r->m_table_sizes[0] = 288; + r->m_table_sizes[1] = 32; + XINFLATE_MEMSET(r->m_code_size_1, 5, 32); + for (i = 0; i <= 143; ++i) *p++ = 8; + for (; i <= 255; ++i) *p++ = 9; + for (; i <= 279; ++i) *p++ = 7; + for (; i <= 287; ++i) *p++ = 8; + } else { + for (counter = 0; counter < 3; counter++) { + XINFLATE_GET_BITS(11, r->m_table_sizes[counter], + "\05\05\04"[counter]); + r->m_table_sizes[counter] += s_min_table_sizes[counter]; + } + XDEFLATE_CLEAR_ARR(r->m_code_size_2); + for (counter = 0; counter < r->m_table_sizes[2]; counter++) { + uint32_t s; + XINFLATE_GET_BITS(14, s, 3); + r->m_code_size_2[s_length_dezigzag[counter]] = (uint8_t)s; + } + r->m_table_sizes[2] = 19; + } + for (; (int)r->m_type >= 0; r->m_type--) { + int tree_next, tree_cur; + int16_t* pLookUp; + int16_t* pTree; + uint8_t* pCode_size; + uint32_t i, j, used_syms, total, sym_index, next_code[17], + total_syms[16]; + pLookUp = r->m_look_up[r->m_type]; + pTree = pTrees[r->m_type]; + pCode_size = pCode_sizes[r->m_type]; + XDEFLATE_CLEAR_ARR(total_syms); + XINFLATE_MEMSET(pLookUp, 0, sizeof(r->m_look_up[0])); + xinflate_clear_tree(r); + for (i = 0; i < r->m_table_sizes[r->m_type]; ++i) + total_syms[pCode_size[i]]++; + used_syms = 0, total = 0; + next_code[0] = next_code[1] = 0; + for (i = 1; i <= 15; ++i) { + used_syms += total_syms[i]; + next_code[i + 1] = (total = ((total + total_syms[i]) << 1)); + } + if ((65536 != total) && (used_syms > 1)) { + XINFLATE_CR_RETURN_FOREVER(35, XINFLATE_STATUS_FAILED); + } + for (tree_next = -1, sym_index = 0; + sym_index < r->m_table_sizes[r->m_type]; ++sym_index) { + uint32_t rev_code = 0, l, cur_code, code_size = pCode_size[sym_index]; + if (!code_size) continue; + cur_code = next_code[code_size]++; + for (l = code_size; l > 0; l--, cur_code >>= 1) + rev_code = (rev_code << 1) | (cur_code & 1); + if (code_size <= XINFLATE_FAST_LOOKUP_BITS) { + int16_t k = (int16_t)((code_size << 9) | sym_index); + while (rev_code < XINFLATE_FAST_LOOKUP_SIZE) { + pLookUp[rev_code] = k; + rev_code += (1 << code_size); + } + continue; + } + if (0 == (tree_cur = + pLookUp[rev_code & (XINFLATE_FAST_LOOKUP_SIZE - 1)])) { + pLookUp[rev_code & (XINFLATE_FAST_LOOKUP_SIZE - 1)] = + (int16_t)tree_next; + tree_cur = tree_next; + tree_next -= 2; + } + rev_code >>= (XINFLATE_FAST_LOOKUP_BITS - 1); + for (j = code_size; j > (XINFLATE_FAST_LOOKUP_BITS + 1); j--) { + tree_cur -= ((rev_code >>= 1) & 1); + if (!pTree[-tree_cur - 1]) { + pTree[-tree_cur - 1] = (int16_t)tree_next; + tree_cur = tree_next; + tree_next -= 2; + } else + tree_cur = pTree[-tree_cur - 1]; + } + tree_cur -= ((rev_code >>= 1) & 1); + pTree[-tree_cur - 1] = (int16_t)sym_index; + } + if (r->m_type == 2) { + for (counter = 0; + counter < (r->m_table_sizes[0] + r->m_table_sizes[1]);) { + uint32_t s; + XINFLATE_HUFF_DECODE(16, dist, r->m_look_up[2], r->m_tree_2); + if (dist < 16) { + r->m_len_codes[counter++] = (uint8_t)dist; + continue; + } + if ((dist == 16) && (!counter)) { + XINFLATE_CR_RETURN_FOREVER(17, XINFLATE_STATUS_FAILED); + } + num_extra = "\02\03\07"[dist - 16]; + XINFLATE_GET_BITS(18, s, num_extra); + s += "\03\03\013"[dist - 16]; + XINFLATE_MEMSET(r->m_len_codes + counter, + (dist == 16) ? r->m_len_codes[counter - 1] : 0, s); + counter += s; + } + if ((r->m_table_sizes[0] + r->m_table_sizes[1]) != counter) { + XINFLATE_CR_RETURN_FOREVER(21, XINFLATE_STATUS_FAILED); + } + XINFLATE_MEMCPY(r->m_code_size_0, r->m_len_codes, + r->m_table_sizes[0]); + XINFLATE_MEMCPY(r->m_code_size_1, + r->m_len_codes + r->m_table_sizes[0], + r->m_table_sizes[1]); + } + } + for (;;) { + uint8_t* pSrc; + for (;;) { + if (((pIn_buf_end - pIn_buf_cur) < 4) || + ((pOut_buf_end - pOut_buf_cur) < 2)) { + XINFLATE_HUFF_DECODE(23, counter, r->m_look_up[0], r->m_tree_0); + if (counter >= 256) break; + while (pOut_buf_cur >= pOut_buf_end) { + XINFLATE_CR_RETURN(24, XINFLATE_STATUS_HAS_MORE_OUTPUT); + } + *pOut_buf_cur++ = (uint8_t)counter; + } else { + int sym2; + uint32_t code_len; +#if XDEFLATE_HAS_64BIT_REGISTERS + if (num_bits < 30) { + bit_buf |= (((xinflate_bit_buf_t)xdeflate_read_le32(pIn_buf_cur)) + << num_bits); + pIn_buf_cur += 4; + num_bits += 32; + } +#else + if (num_bits < 15) { + bit_buf |= (((xinflate_bit_buf_t)xdeflate_read_le16(pIn_buf_cur)) + << num_bits); + pIn_buf_cur += 2; + num_bits += 16; + } +#endif + if ((sym2 = r->m_look_up[0][bit_buf & + (XINFLATE_FAST_LOOKUP_SIZE - 1)]) >= 0) + code_len = sym2 >> 9; + else { + code_len = XINFLATE_FAST_LOOKUP_BITS; + do { + sym2 = r->m_tree_0[~sym2 + ((bit_buf >> code_len++) & 1)]; + } while (sym2 < 0); + } + counter = sym2; + bit_buf >>= code_len; + num_bits -= code_len; + if (counter & 256) break; + +#if !XDEFLATE_HAS_64BIT_REGISTERS + if (num_bits < 15) { + bit_buf |= (((xinflate_bit_buf_t)xdeflate_read_le16(pIn_buf_cur)) + << num_bits); + pIn_buf_cur += 2; + num_bits += 16; + } +#endif + if ((sym2 = r->m_look_up[0][bit_buf & + (XINFLATE_FAST_LOOKUP_SIZE - 1)]) >= 0) + code_len = sym2 >> 9; + else { + code_len = XINFLATE_FAST_LOOKUP_BITS; + do { + sym2 = r->m_tree_0[~sym2 + ((bit_buf >> code_len++) & 1)]; + } while (sym2 < 0); + } + bit_buf >>= code_len; + num_bits -= code_len; + + pOut_buf_cur[0] = (uint8_t)counter; + if (sym2 & 256) { + pOut_buf_cur++; + counter = sym2; + break; + } + pOut_buf_cur[1] = (uint8_t)sym2; + pOut_buf_cur += 2; + } + } + if ((counter &= 511) == 256) break; + + num_extra = s_length_extra[counter - 257]; + counter = s_length_base[counter - 257]; + if (num_extra) { + uint32_t extra_bits; + XINFLATE_GET_BITS(25, extra_bits, num_extra); + counter += extra_bits; + } + + XINFLATE_HUFF_DECODE(26, dist, r->m_look_up[1], r->m_tree_1); + num_extra = s_dist_extra[dist]; + dist = s_dist_base[dist]; + if (num_extra) { + uint32_t extra_bits; + XINFLATE_GET_BITS(27, extra_bits, num_extra); + dist += extra_bits; + } + + dist_from_out_buf_start = pOut_buf_cur - pOut_buf_start; + if ((dist == 0 || dist > dist_from_out_buf_start || + dist_from_out_buf_start == 0) && + (decomp_flags & XINFLATE_FLAG_USING_NON_WRAPPING_OUTPUT_BUF)) { + XINFLATE_CR_RETURN_FOREVER(37, XINFLATE_STATUS_FAILED); + } + + pSrc = pOut_buf_start + + ((dist_from_out_buf_start - dist) & out_buf_size_mask); + + if ((XDEFLATE_MAX(pOut_buf_cur, pSrc) + counter) > pOut_buf_end) { + while (counter--) { + while (pOut_buf_cur >= pOut_buf_end) { + XINFLATE_CR_RETURN(53, XINFLATE_STATUS_HAS_MORE_OUTPUT); + } + *pOut_buf_cur++ = + pOut_buf_start[(dist_from_out_buf_start++ - dist) & + out_buf_size_mask]; + } + continue; + } + while (counter > 2) { + pOut_buf_cur[0] = pSrc[0]; + pOut_buf_cur[1] = pSrc[1]; + pOut_buf_cur[2] = pSrc[2]; + pOut_buf_cur += 3; + pSrc += 3; + counter -= 3; + } + if (counter > 0) { + pOut_buf_cur[0] = pSrc[0]; + if (counter > 1) pOut_buf_cur[1] = pSrc[1]; + pOut_buf_cur += counter; + } + } + } + } while (!(r->m_final & 1)); + + XINFLATE_SKIP_BITS(32, num_bits & 7); + while ((pIn_buf_cur > pIn_buf_next) && (num_bits >= 8)) { + --pIn_buf_cur; + num_bits -= 8; + } + bit_buf &= ~(~(xinflate_bit_buf_t)0 << num_bits); + XDEFLATE_ASSERT(!num_bits); + + if (decomp_flags & XINFLATE_FLAG_PARSE_ZLIB_HEADER) { + for (counter = 0; counter < 4; ++counter) { + uint32_t s; + if (num_bits) + XINFLATE_GET_BITS(41, s, 8); + else + XINFLATE_GET_BYTE(42, s); + r->m_z_adler32 = (r->m_z_adler32 << 8) | s; + } + } + XINFLATE_CR_RETURN_FOREVER(34, XINFLATE_STATUS_DONE); + + XINFLATE_CR_FINISH + +common_exit: + if ((status != XINFLATE_STATUS_NEEDS_MORE_INPUT) && + (status != XINFLATE_STATUS_FAILED_CANNOT_MAKE_PROGRESS)) { + while ((pIn_buf_cur > pIn_buf_next) && (num_bits >= 8)) { + --pIn_buf_cur; + num_bits -= 8; + } + } + r->m_num_bits = num_bits; + r->m_bit_buf = bit_buf & ~(~(xinflate_bit_buf_t)0 << num_bits); + r->m_dist = dist; + r->m_counter = counter; + r->m_num_extra = num_extra; + r->m_dist_from_out_buf_start = dist_from_out_buf_start; + *pIn_buf_size = pIn_buf_cur - pIn_buf_next; + *pOut_buf_size = pOut_buf_cur - pOut_buf_next; + if ((decomp_flags & + (XINFLATE_FLAG_PARSE_ZLIB_HEADER | XINFLATE_FLAG_COMPUTE_ADLER32)) && + (status >= 0)) { + const uint8_t* ptr = pOut_buf_next; + size_t buf_len = *pOut_buf_size; + uint32_t i, s1 = r->m_check_adler32 & 0xffff, s2 = r->m_check_adler32 >> 16; + size_t block_len = buf_len % 5552; + while (buf_len) { + for (i = 0; i + 7 < block_len; i += 8, ptr += 8) { + s1 += ptr[0], s2 += s1; + s1 += ptr[1], s2 += s1; + s1 += ptr[2], s2 += s1; + s1 += ptr[3], s2 += s1; + s1 += ptr[4], s2 += s1; + s1 += ptr[5], s2 += s1; + s1 += ptr[6], s2 += s1; + s1 += ptr[7], s2 += s1; + } + for (; i < block_len; ++i) s1 += *ptr++, s2 += s1; + s1 %= 65521U, s2 %= 65521U; + buf_len -= block_len; + block_len = 5552; + } + r->m_check_adler32 = (s2 << 16) + s1; + if ((status == XINFLATE_STATUS_DONE) && + (decomp_flags & XINFLATE_FLAG_PARSE_ZLIB_HEADER) && + (r->m_check_adler32 != r->m_z_adler32)) + status = XINFLATE_STATUS_ADLER32_MISMATCH; + } + return status; +} + +size_t xinflate_decompress_mem_to_mem(void* out_buf, size_t out_buf_len, + const void* src_buf, size_t src_buf_len, + int flags) { + xinflate_decompressor decomp; + xinflate_status status; + xinflate_init(&decomp); + status = + xinflate_decompress(&decomp, (const uint8_t*)src_buf, &src_buf_len, + (uint8_t*)out_buf, (uint8_t*)out_buf, &out_buf_len, + (flags & ~XINFLATE_FLAG_HAS_MORE_INPUT) | + XINFLATE_FLAG_USING_NON_WRAPPING_OUTPUT_BUF); + return (status != XINFLATE_STATUS_DONE) + ? XINFLATE_DECOMPRESS_MEM_TO_MEM_FAILED + : out_buf_len; +} + #define GZ_MAGIC0 0x1fu #define GZ_MAGIC1 0x8bu #define GZ_METHOD_DEFLATE 0x08u +#define GZ_FLG_FTEXT 0x01u +#define GZ_FLG_FHCRC 0x02u +#define GZ_FLG_FEXTRA 0x04u +#define GZ_FLG_FNAME 0x08u +#define GZ_FLG_FCOMMENT 0x10u +#define GZ_FLG_RESERVED 0xe0u #define GZ_OS_UNKNOWN 0xffu #define GZ_HEADER_LEN 10u #define GZ_TRAILER_LEN 8u -#define DEFLATE_STORED_MAX 0xffffu /* max stored-block payload */ static uint32_t crc32_update(uint32_t crc, const uint8_t* data, size_t len) { size_t i; @@ -28,11 +1971,6 @@ static int gz_write(CfreeWriter* out, const void* data, size_t n) { return cfree_writer_write(out, data, n) == CFREE_OK ? DIST_OK : DIST_ERR; } -static void put_u16le(uint8_t* p, uint16_t v) { - p[0] = (uint8_t)(v & 0xffu); - p[1] = (uint8_t)((v >> 8) & 0xffu); -} - static void put_u32le(uint8_t* p, uint32_t v) { p[0] = (uint8_t)(v & 0xffu); p[1] = (uint8_t)((v >> 8) & 0xffu); @@ -49,11 +1987,78 @@ static uint32_t get_u32le(const uint8_t* p) { ((uint32_t)p[3] << 24); } +static int gz_put_deflate(const void* data, int len, void* user) { + if (len <= 0) return 1; + return gz_write((CfreeWriter*)user, data, (size_t)len) == DIST_OK; +} + +static int gz_skip_header_bytes(const uint8_t* data, size_t trailer_off, + size_t* off, size_t n, uint32_t* hcrc) { + if (n > trailer_off - *off) return DIST_ERR; + *hcrc = crc32_update(*hcrc, data + *off, n); + *off += n; + return DIST_OK; +} + +static int gz_skip_header_zstr(const uint8_t* data, size_t trailer_off, + size_t* off, uint32_t* hcrc) { + while (*off < trailer_off) { + uint8_t c = data[*off]; + *hcrc = crc32_update(*hcrc, data + *off, 1); + ++*off; + if (c == 0) return DIST_OK; + } + return DIST_ERR; +} + +static int gz_parse_header(const uint8_t* data, size_t len, size_t* body_off) { + size_t off = GZ_HEADER_LEN; + size_t trailer_off; + uint32_t hcrc; + uint8_t flg; + + if (len < GZ_HEADER_LEN + GZ_TRAILER_LEN) return DIST_ERR; + trailer_off = len - GZ_TRAILER_LEN; + if (data[0] != GZ_MAGIC0 || data[1] != GZ_MAGIC1 || + data[2] != GZ_METHOD_DEFLATE) { + return DIST_ERR; + } + + flg = data[3]; + if (flg & GZ_FLG_RESERVED) return DIST_ERR; + hcrc = crc32_update(0, data, GZ_HEADER_LEN); + + if (flg & GZ_FLG_FEXTRA) { + uint16_t xlen; + if (off + 2u > trailer_off) return DIST_ERR; + xlen = get_u16le(data + off); + if (gz_skip_header_bytes(data, trailer_off, &off, 2u, &hcrc) != DIST_OK || + gz_skip_header_bytes(data, trailer_off, &off, xlen, &hcrc) != DIST_OK) + return DIST_ERR; + } + if ((flg & GZ_FLG_FNAME) && + gz_skip_header_zstr(data, trailer_off, &off, &hcrc) != DIST_OK) + return DIST_ERR; + if ((flg & GZ_FLG_FCOMMENT) && + gz_skip_header_zstr(data, trailer_off, &off, &hcrc) != DIST_OK) + return DIST_ERR; + if (flg & GZ_FLG_FHCRC) { + if (off + 2u > trailer_off) return DIST_ERR; + if (get_u16le(data + off) != (uint16_t)hcrc) return DIST_ERR; + off += 2u; + } + + *body_off = off; + return DIST_OK; +} + int dist_gz_compress(CfreeWriter* out, const uint8_t* data, size_t len) { + xdeflate_compressor def; + xdeflate_status st; uint8_t hdr[GZ_HEADER_LEN]; - uint8_t blk[5]; uint8_t trailer[GZ_TRAILER_LEN]; - size_t off = 0; + size_t in_len = len; + uint32_t flags; memset(hdr, 0, sizeof hdr); hdr[0] = GZ_MAGIC0; @@ -63,20 +2068,12 @@ int dist_gz_compress(CfreeWriter* out, const uint8_t* data, size_t len) { hdr[9] = GZ_OS_UNKNOWN; if (gz_write(out, hdr, sizeof hdr) != DIST_OK) return DIST_ERR; - /* One or more stored DEFLATE blocks. Each: header byte (BFINAL in bit 0, - * BTYPE=00 in bits 1-2), then LEN and ~LEN little-endian, then raw bytes. */ - do { - size_t chunk = len - off; - int last; - if (chunk > DEFLATE_STORED_MAX) chunk = DEFLATE_STORED_MAX; - last = (off + chunk == len); - blk[0] = last ? 0x01u : 0x00u; - put_u16le(blk + 1, (uint16_t)chunk); - put_u16le(blk + 3, (uint16_t)(~(uint16_t)chunk)); - if (gz_write(out, blk, sizeof blk) != DIST_OK) return DIST_ERR; - if (chunk && gz_write(out, data + off, chunk) != DIST_OK) return DIST_ERR; - off += chunk; - } while (off < len); + flags = xdeflate_create_flags(6, -15, 0); + if (xdeflate_init(&def, gz_put_deflate, out, (int)flags) != + XDEFLATE_STATUS_OKAY) + return DIST_ERR; + st = xdeflate_compress(&def, data, &in_len, NULL, NULL, XDEFLATE_FINISH); + if (st != XDEFLATE_STATUS_DONE || in_len != len) return DIST_ERR; put_u32le(trailer, crc32_update(0, data, len)); put_u32le(trailer + 4, (uint32_t)len); @@ -84,40 +2081,44 @@ int dist_gz_compress(CfreeWriter* out, const uint8_t* data, size_t len) { } int dist_gz_decompress(CfreeWriter* out, const uint8_t* data, size_t len) { - size_t off = GZ_HEADER_LEN; + xinflate_decompressor inf; + uint8_t ring[XINFLATE_LZ_DICT_SIZE]; + size_t off; + size_t comp_len; + size_t in_ofs = 0; + uint64_t total = 0; uint32_t crc = 0; - uint32_t total = 0; int done = 0; - if (len < GZ_HEADER_LEN + GZ_TRAILER_LEN) return DIST_ERR; - if (data[0] != GZ_MAGIC0 || data[1] != GZ_MAGIC1 || - data[2] != GZ_METHOD_DEFLATE) { - return DIST_ERR; - } - if (data[3] != 0u) return DIST_ERR; /* stub emits no header flags */ + if (gz_parse_header(data, len, &off) != DIST_OK) return DIST_ERR; + comp_len = (len - GZ_TRAILER_LEN) - off; + xinflate_init(&inf); while (!done) { - uint8_t bhdr; - uint16_t blen, bnlen; - if (off + 5u > len) return DIST_ERR; - bhdr = data[off]; - if (((bhdr >> 1) & 0x3u) != 0u) return DIST_ERR; /* non-stored block */ - done = (bhdr & 0x1u) != 0u; - blen = get_u16le(data + off + 1); - bnlen = get_u16le(data + off + 3); - if ((uint16_t)~blen != bnlen) return DIST_ERR; - off += 5u; - if (off + blen > len) return DIST_ERR; - if (blen) { - if (gz_write(out, data + off, blen) != DIST_OK) return DIST_ERR; - crc = crc32_update(crc, data + off, blen); - total += blen; - off += blen; - } - } - - if (off + GZ_TRAILER_LEN > len) return DIST_ERR; + size_t ring_ofs = (size_t)total & (XINFLATE_LZ_DICT_SIZE - 1u); + size_t in_avail = comp_len - in_ofs; + size_t out_avail = XINFLATE_LZ_DICT_SIZE - ring_ofs; + xinflate_status st = + xinflate_decompress(&inf, data + off + in_ofs, &in_avail, ring, + ring + ring_ofs, &out_avail, 0); + in_ofs += in_avail; + if (out_avail) { + if (gz_write(out, ring + ring_ofs, out_avail) != DIST_OK) + return DIST_ERR; + crc = crc32_update(crc, ring + ring_ofs, out_avail); + if (total + out_avail < total) return DIST_ERR; + total += out_avail; + } + if (st == XINFLATE_STATUS_DONE) { + done = 1; + } else if (st != XINFLATE_STATUS_HAS_MORE_OUTPUT) { + return DIST_ERR; + } + } + + if (in_ofs != comp_len) return DIST_ERR; + off = len - GZ_TRAILER_LEN; if (get_u32le(data + off) != crc) return DIST_ERR; - if (get_u32le(data + off + 4) != total) return DIST_ERR; + if (get_u32le(data + off + 4) != (uint32_t)total) return DIST_ERR; return DIST_OK; } diff --git a/driver/dist/deflate.h b/driver/dist/deflate.h @@ -7,19 +7,15 @@ #include "dist.h" -/* gzip wrap/unwrap for the payload archive. - * - * The gzip container, CRC32, and DEFLATE *stored* blocks are real: the output - * is a valid `.gz` that stock `gunzip` can read. What is STUBBED is actual - * compression — `dist_gz_compress` emits stored (uncompressed) blocks, and - * `dist_gz_decompress` only understands stored blocks (it errors on a - * compressed block). Real LZ77 + Huffman is the work to vendor later. */ +/* gzip wrap/unwrap for the payload archive. The gzip container, CRC32, and + * raw DEFLATE payload are real; output is a valid `.gz` that stock `gunzip` + * can read. */ -/* Wrap `data` in a gzip stream of stored blocks, writing to `out`. */ +/* Compress `data` into a gzip stream, writing to `out`. */ int dist_gz_compress(CfreeWriter* out, const uint8_t* data, size_t len); /* Unwrap a gzip stream into `out`. Returns DIST_ERR on a malformed stream, a - * CRC/size mismatch, or a non-stored DEFLATE block. */ + * CRC/size mismatch, or invalid DEFLATE payload. */ int dist_gz_decompress(CfreeWriter* out, const uint8_t* data, size_t len); #endif diff --git a/driver/dist/dist.h b/driver/dist/dist.h @@ -7,23 +7,22 @@ /* Shared constants and small helpers for the code-distribution subsystem * (`cfree pkg`). See doc/DISTRIBUTE.md for the design. * - * The crypto/compression primitives under driver/dist/ are STUBS during - * bootstrap: deterministic, functional, and INSECURE. They exist so the - * end-to-end packaging pipeline (manifest -> sign -> bundle -> verify -> - * unpack) can be exercised before the real vendored implementations land. - * Every stub is marked as such at its definition. */ + * Cryptographic and compression primitives under driver/dist/ are vendored so + * the package pipeline (manifest -> sign -> bundle -> verify -> unpack) has no + * runtime dependency on host crypto/compression libraries. */ /* Primitive output sizes. */ -#define DIST_BLAKE2B_LEN 64u +#define DIST_BLAKE2B_LEN 32u #define DIST_HASH_LEN DIST_BLAKE2B_LEN +#define DIST_MINISIG_PREHASH_LEN 64u #define DIST_ED25519_PK_LEN 32u #define DIST_ED25519_SK_LEN 64u #define DIST_ED25519_SIG_LEN 64u #define DIST_ED25519_SEED_LEN 32u #define DIST_KEYID_LEN 8u -/* Fixed parse/build capacities. The stub phase avoids dynamic arrays; these - * caps bound a single package's metadata and member count. */ +/* Fixed parse/build capacities. These caps bound a single package's metadata + * and member count. */ #define DIST_MAX_ARTIFACTS 64u #define DIST_MAX_DEPS 64u #define DIST_MAX_FILES 256u diff --git a/driver/dist/ed25519.c b/driver/dist/ed25519.c @@ -2,43 +2,23 @@ #include <string.h> -#include "blake2b.h" - -/* STUB scheme. See ed25519.h. pk == seed; signatures are derived purely from - * the seed and message via the stub hash, and verification recomputes them - * from pk (== seed). Insecure by construction; deterministic and - * round-tripping for pipeline testing. */ +#include "vendor/monocypher/monocypher-ed25519.h" void dist_ed25519_keypair(uint8_t pk[DIST_ED25519_PK_LEN], uint8_t sk[DIST_ED25519_SK_LEN], const uint8_t seed[DIST_ED25519_SEED_LEN]) { - memcpy(pk, seed, DIST_ED25519_SEED_LEN); - memcpy(sk, seed, DIST_ED25519_SEED_LEN); - memcpy(sk + DIST_ED25519_SEED_LEN, pk, DIST_ED25519_PK_LEN); -} - -/* Derive a signature from a 32-byte seed and a message digest. */ -static void stub_sign_with_seed(uint8_t sig[DIST_ED25519_SIG_LEN], - const uint8_t seed[DIST_ED25519_SEED_LEN], - const uint8_t* msg, size_t msglen) { - DistBlake2b h; - static const uint8_t dom[] = "cfree stub ed25519"; - dist_blake2b_init(&h, DIST_ED25519_SIG_LEN); - dist_blake2b_update(&h, dom, sizeof dom - 1); - dist_blake2b_update(&h, seed, DIST_ED25519_SEED_LEN); - dist_blake2b_update(&h, msg, msglen); - dist_blake2b_final(&h, sig); + uint8_t seed_copy[DIST_ED25519_SEED_LEN]; + memcpy(seed_copy, seed, sizeof seed_copy); + crypto_ed25519_key_pair(sk, pk, seed_copy); } void dist_ed25519_sign(uint8_t sig[DIST_ED25519_SIG_LEN], const uint8_t* msg, size_t msglen, const uint8_t sk[DIST_ED25519_SK_LEN]) { - stub_sign_with_seed(sig, sk, msg, msglen); /* seed = sk[0:32] */ + crypto_ed25519_sign(sig, sk, msg, msglen); } int dist_ed25519_verify(const uint8_t sig[DIST_ED25519_SIG_LEN], const uint8_t* msg, size_t msglen, const uint8_t pk[DIST_ED25519_PK_LEN]) { - uint8_t expect[DIST_ED25519_SIG_LEN]; - stub_sign_with_seed(expect, pk, msg, msglen); /* pk == seed in the stub */ - return memcmp(expect, sig, DIST_ED25519_SIG_LEN) == 0 ? 1 : 0; + return crypto_ed25519_check(sig, pk, msg, msglen) == 0 ? 1 : 0; } diff --git a/driver/dist/ed25519.h b/driver/dist/ed25519.h @@ -6,13 +6,7 @@ #include "dist.h" -/* Signature scheme used by minisign. - * - * STUB: this is NOT Ed25519 and provides NO security. The "public key" is the - * seed itself, so anyone holding the public key can forge signatures. It is a - * deterministic placeholder that verifies correctly end-to-end so the - * packaging/trust pipeline can be exercised. Replace with real Ed25519 - * (and a real CSPRNG for keygen) before trusting any of this. */ +/* Signature scheme used by minisign. */ /* Derive a keypair from a 32-byte seed. */ void dist_ed25519_keypair(uint8_t pk[DIST_ED25519_PK_LEN], diff --git a/driver/dist/lz4.c b/driver/dist/lz4.c @@ -1,28 +1,39 @@ #include "lz4.h" -/* STUB: raw LZ4 block support is an external vendored primitive. The API is - * real so the package format and callers will not change when the - * implementation lands. */ +#include <limits.h> + +#define LZ4_STATIC_LINKING_ONLY_DISABLE_MEMORY_ALLOCATION 1 +#define LZ4LIB_VISIBILITY +#include "vendor/lz4/lz4.c" size_t dist_lz4_compress_bound(size_t raw_len) { - return raw_len + raw_len / 255u + 16u; + int bound; + if (raw_len > (size_t)LZ4_MAX_INPUT_SIZE) return 0; + bound = LZ4_compressBound((int)raw_len); + if (bound <= 0) return 0; + return (size_t)bound; } int dist_lz4_compress_block(uint8_t* dst, size_t dst_cap, size_t* dst_len, const uint8_t* src, size_t src_len) { - (void)dst; - (void)dst_cap; - (void)dst_len; - (void)src; - (void)src_len; - return DIST_ERR; + int n; + if (!dst || !dst_len || (!src && src_len != 0)) return DIST_ERR; + if (src_len > (size_t)LZ4_MAX_INPUT_SIZE || dst_cap > (size_t)INT_MAX) + return DIST_ERR; + n = LZ4_compress_default((const char*)src, (char*)dst, (int)src_len, + (int)dst_cap); + if (n <= 0) return DIST_ERR; + *dst_len = (size_t)n; + return DIST_OK; } int dist_lz4_decompress_block(uint8_t* dst, size_t dst_len, const uint8_t* src, size_t src_len) { - (void)dst; - (void)dst_len; - (void)src; - (void)src_len; - return DIST_ERR; + int n; + if (!dst || (!src && src_len != 0)) return DIST_ERR; + if (dst_len > (size_t)INT_MAX || src_len > (size_t)INT_MAX) return DIST_ERR; + n = LZ4_decompress_safe((const char*)src, (char*)dst, (int)src_len, + (int)dst_len); + if (n != (int)dst_len) return DIST_ERR; + return DIST_OK; } diff --git a/driver/dist/minisig.c b/driver/dist/minisig.c @@ -19,7 +19,7 @@ #define MS_ALG_LEN 2u #define MS_SALT_LEN 32u #define MS_LIMIT_LEN 8u -#define MS_CHK_LEN 32u /* BLAKE2b-256 checksum (first 32 of the stub's 64) */ +#define MS_CHK_LEN 32u /* BLAKE2b-256 checksum */ /* Public-key payload: sig_alg || keyid || pk. */ #define KEY_BLOB_LEN (MS_ALG_LEN + DIST_KEYID_LEN + DIST_ED25519_PK_LEN) @@ -50,12 +50,13 @@ static void seckey_checksum(uint8_t out[MS_CHK_LEN], const uint8_t keyid[DIST_KEYID_LEN], const uint8_t sk[DIST_ED25519_SK_LEN]) { uint8_t buf[MS_ALG_LEN + DIST_KEYID_LEN + DIST_ED25519_SK_LEN]; - uint8_t full[DIST_BLAKE2B_LEN]; + DistBlake2b h; memcpy(buf, MS_SIGALG, MS_ALG_LEN); memcpy(buf + MS_ALG_LEN, keyid, DIST_KEYID_LEN); memcpy(buf + MS_ALG_LEN + DIST_KEYID_LEN, sk, DIST_ED25519_SK_LEN); - dist_blake2b(full, buf, sizeof buf); - memcpy(out, full, MS_CHK_LEN); + dist_blake2b_init(&h, MS_CHK_LEN); + dist_blake2b_update(&h, buf, sizeof buf); + dist_blake2b_final(&h, out); } static int emit_str(CfreeWriter* out, const char* s) { @@ -135,6 +136,14 @@ static int decode_line(const uint8_t* data, size_t len, size_t idx, return got == want ? DIST_OK : DIST_ERR; } +static void minisig_prehash(uint8_t out[DIST_MINISIG_PREHASH_LEN], + const uint8_t* msg, size_t msglen) { + DistBlake2b h; + dist_blake2b_init(&h, DIST_MINISIG_PREHASH_LEN); + dist_blake2b_update(&h, msg, msglen); + dist_blake2b_final(&h, out); +} + int dist_minisig_parse_pubkey(const uint8_t* data, size_t len, uint8_t pk[DIST_ED25519_PK_LEN], uint8_t keyid[DIST_KEYID_LEN]) { @@ -171,7 +180,7 @@ int dist_minisig_sign(CfreeWriter* out, const uint8_t* msg, size_t msglen, const uint8_t keyid[DIST_KEYID_LEN], const char* untrusted_comment, const char* trusted_comment) { - uint8_t prehash[DIST_BLAKE2B_LEN]; + uint8_t prehash[DIST_MINISIG_PREHASH_LEN]; uint8_t sig[DIST_ED25519_SIG_LEN]; uint8_t gsig[DIST_ED25519_SIG_LEN]; uint8_t line1[SIG_LINE1_LEN]; @@ -181,8 +190,8 @@ int dist_minisig_sign(CfreeWriter* out, const uint8_t* msg, size_t msglen, if (tclen >= DIST_TRUSTED_COMMENT_MAX) return DIST_ERR; - /* Signature over the BLAKE2b prehash of the message. */ - dist_blake2b(prehash, msg, msglen); + /* Signature over stock minisign's 64-byte BLAKE2b prehash. */ + minisig_prehash(prehash, msg, msglen); dist_ed25519_sign(sig, prehash, sizeof prehash, sk); /* Global signature also covers the trusted comment. */ @@ -222,7 +231,7 @@ int dist_minisig_verify(const uint8_t* sig, size_t siglen, const uint8_t* msg, char* out_trusted, size_t trusted_cap) { uint8_t blob[SIG_LINE1_LEN]; uint8_t gsig[DIST_ED25519_SIG_LEN]; - uint8_t prehash[DIST_BLAKE2B_LEN]; + uint8_t prehash[DIST_MINISIG_PREHASH_LEN]; uint8_t gmsg[DIST_ED25519_SIG_LEN + DIST_TRUSTED_COMMENT_MAX]; char tcline[SIG_LINE_MAX]; const char* tc; @@ -236,9 +245,9 @@ int dist_minisig_verify(const uint8_t* sig, size_t siglen, const uint8_t* msg, prehashed = (memcmp(blob, MS_SIGALG_HASHED, MS_ALG_LEN) == 0); if (!prehashed && memcmp(blob, MS_SIGALG, MS_ALG_LEN) != 0) return DIST_ERR; - /* "ED" signs a BLAKE2b prehash of the message; "Ed" signs it raw. */ + /* "ED" signs stock minisign's 64-byte BLAKE2b prehash; "Ed" signs raw. */ if (prehashed) { - dist_blake2b(prehash, msg, msglen); + minisig_prehash(prehash, msg, msglen); if (!dist_ed25519_verify(sigbytes, prehash, sizeof prehash, pk)) return DIST_ERR; } else if (!dist_ed25519_verify(sigbytes, msg, msglen, pk)) { diff --git a/driver/dist/minisig.h b/driver/dist/minisig.h @@ -10,18 +10,17 @@ /* minisign key and signature files, using minisign's exact on-disk byte * layout: * - public key : base64( "Ed" || keyid[8] || pk[32] ) - * - signature : base64( "ED" || keyid[8] || sig[64] ) over a BLAKE2b - * prehash, plus a global signature over the trusted comment + * - signature : base64( "ED" || keyid[8] || sig[64] ) over stock + * minisign's 64-byte prehash, plus a global signature over + * the trusted comment * - secret key : base64( "Ed" || kdf_alg[2] || "B2" || salt[32] || * opslimit[8] || memlimit[8] || keyid[8] || sk[64] || * chk[32] ), passwordless (kdf_alg = {0,0}, no scrypt). * - * So once the real Ed25519/BLAKE2b primitives are vendored, these files are - * interchangeable with stock `minisign` — a passwordless minisign key/sig can - * be pointed at directly. Password-encrypted secret keys (kdf_alg = "Sc") are - * recognized and rejected with a clear error until scrypt is vendored. The - * crypto values themselves are STUBBED today (see the dist primitive headers). - * See doc/DISTRIBUTE.md. */ + * These files are interchangeable with stock `minisign`: a passwordless + * minisign key/sig can be pointed at directly. Password-encrypted secret keys + * (kdf_alg = "Sc") are recognized and rejected with a clear error until scrypt + * is vendored. See doc/DISTRIBUTE.md. */ /* parse_seckey return codes beyond DIST_OK/DIST_ERR. */ #define DIST_ENCRYPTED 2 /* kdf_alg = "Sc": needs scrypt (not yet vendored) */ diff --git a/driver/dist/vendor/lz4/LICENSE b/driver/dist/vendor/lz4/LICENSE @@ -0,0 +1,24 @@ +LZ4 Library +Copyright (c) 2011-2020, Yann Collet +All rights reserved. + +Redistribution and use in source and binary forms, with or without modification, +are permitted provided that the following conditions are met: + +* Redistributions of source code must retain the above copyright notice, this + list of conditions and the following disclaimer. + +* Redistributions in binary form must reproduce the above copyright notice, this + list of conditions and the following disclaimer in the documentation and/or + other materials provided with the distribution. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR +ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON +ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. diff --git a/driver/dist/vendor/lz4/lz4.c b/driver/dist/vendor/lz4/lz4.c @@ -0,0 +1,2829 @@ +/* + LZ4 - Fast LZ compression algorithm + Copyright (C) 2011-2023, Yann Collet. + + BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php) + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above + copyright notice, this list of conditions and the following disclaimer + in the documentation and/or other materials provided with the + distribution. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + + You can contact the author at : + - LZ4 homepage : http://www.lz4.org + - LZ4 source repository : https://github.com/lz4/lz4 +*/ + +/*-************************************ +* Tuning parameters +**************************************/ +/* + * LZ4_HEAPMODE : + * Select how stateless compression functions like `LZ4_compress_default()` + * allocate memory for their hash table, + * in memory stack (0:default, fastest), or in memory heap (1:requires malloc()). + */ +#ifndef LZ4_HEAPMODE +# define LZ4_HEAPMODE 0 +#endif + +/* + * LZ4_ACCELERATION_DEFAULT : + * Select "acceleration" for LZ4_compress_fast() when parameter value <= 0 + */ +#define LZ4_ACCELERATION_DEFAULT 1 +/* + * LZ4_ACCELERATION_MAX : + * Any "acceleration" value higher than this threshold + * get treated as LZ4_ACCELERATION_MAX instead (fix #876) + */ +#define LZ4_ACCELERATION_MAX 65537 + + +/*-************************************ +* CPU Feature Detection +**************************************/ +/* LZ4_FORCE_MEMORY_ACCESS + * By default, access to unaligned memory is controlled by `memcpy()`, which is safe and portable. + * Unfortunately, on some target/compiler combinations, the generated assembly is sub-optimal. + * The below switch allow to select different access method for improved performance. + * Method 0 (default) : use `memcpy()`. Safe and portable. + * Method 1 : `__packed` statement. It depends on compiler extension (ie, not portable). + * This method is safe if your compiler supports it, and *generally* as fast or faster than `memcpy`. + * Method 2 : direct access. This method is portable but violate C standard. + * It can generate buggy code on targets which assembly generation depends on alignment. + * But in some circumstances, it's the only known way to get the most performance (ie GCC + ARMv6) + * See https://fastcompression.blogspot.fr/2015/08/accessing-unaligned-memory.html for details. + * Prefer these methods in priority order (0 > 1 > 2) + */ +#ifndef LZ4_FORCE_MEMORY_ACCESS /* can be defined externally */ +# if defined(__GNUC__) && \ + ( defined(__ARM_ARCH_6__) || defined(__ARM_ARCH_6J__) || defined(__ARM_ARCH_6K__) \ + || defined(__ARM_ARCH_6Z__) || defined(__ARM_ARCH_6ZK__) || defined(__ARM_ARCH_6T2__) ) +# define LZ4_FORCE_MEMORY_ACCESS 2 +# elif (defined(__INTEL_COMPILER) && !defined(_WIN32)) || defined(__GNUC__) || defined(_MSC_VER) +# define LZ4_FORCE_MEMORY_ACCESS 1 +# endif +#endif + +/* + * LZ4_FORCE_SW_BITCOUNT + * Define this parameter if your target system or compiler does not support hardware bit count + */ +#if defined(_MSC_VER) && defined(_WIN32_WCE) /* Visual Studio for WinCE doesn't support Hardware bit count */ +# undef LZ4_FORCE_SW_BITCOUNT /* avoid double def */ +# define LZ4_FORCE_SW_BITCOUNT +#endif + + + +/*-************************************ +* Dependency +**************************************/ +/* + * LZ4_SRC_INCLUDED: + * Amalgamation flag, whether lz4.c is included + */ +#ifndef LZ4_SRC_INCLUDED +# define LZ4_SRC_INCLUDED 1 +#endif + +#ifndef LZ4_DISABLE_DEPRECATE_WARNINGS +# define LZ4_DISABLE_DEPRECATE_WARNINGS /* due to LZ4_decompress_safe_withPrefix64k */ +#endif + +#ifndef LZ4_STATIC_LINKING_ONLY +# define LZ4_STATIC_LINKING_ONLY +#endif +#include "lz4.h" +/* see also "memory routines" below */ + + +/*-************************************ +* Compiler Options +**************************************/ +#if defined(_MSC_VER) && (_MSC_VER >= 1400) /* Visual Studio 2005+ */ +# include <intrin.h> /* only present in VS2005+ */ +# pragma warning(disable : 4127) /* disable: C4127: conditional expression is constant */ +# pragma warning(disable : 6237) /* disable: C6237: conditional expression is always 0 */ +# pragma warning(disable : 6239) /* disable: C6239: (<non-zero constant> && <expression>) always evaluates to the result of <expression> */ +# pragma warning(disable : 6240) /* disable: C6240: (<expression> && <non-zero constant>) always evaluates to the result of <expression> */ +# pragma warning(disable : 6326) /* disable: C6326: Potential comparison of a constant with another constant */ +#endif /* _MSC_VER */ + +#ifndef LZ4_FORCE_INLINE +# if defined (_MSC_VER) && !defined (__clang__) /* MSVC */ +# define LZ4_FORCE_INLINE static __forceinline +# else +# if defined (__cplusplus) || defined (__STDC_VERSION__) && __STDC_VERSION__ >= 199901L /* C99 */ +# if defined (__GNUC__) || defined (__clang__) +# define LZ4_FORCE_INLINE static inline __attribute__((always_inline)) +# else +# define LZ4_FORCE_INLINE static inline +# endif +# else +# define LZ4_FORCE_INLINE static +# endif /* __STDC_VERSION__ */ +# endif /* _MSC_VER */ +#endif /* LZ4_FORCE_INLINE */ + +/* LZ4_FORCE_O2 and LZ4_FORCE_INLINE + * gcc on ppc64le generates an unrolled SIMDized loop for LZ4_wildCopy8, + * together with a simple 8-byte copy loop as a fall-back path. + * However, this optimization hurts the decompression speed by >30%, + * because the execution does not go to the optimized loop + * for typical compressible data, and all of the preamble checks + * before going to the fall-back path become useless overhead. + * This optimization happens only with the -O3 flag, and -O2 generates + * a simple 8-byte copy loop. + * With gcc on ppc64le, all of the LZ4_decompress_* and LZ4_wildCopy8 + * functions are annotated with __attribute__((optimize("O2"))), + * and also LZ4_wildCopy8 is forcibly inlined, so that the O2 attribute + * of LZ4_wildCopy8 does not affect the compression speed. + */ +#if defined(__PPC64__) && defined(__LITTLE_ENDIAN__) && defined(__GNUC__) && !defined(__clang__) +# define LZ4_FORCE_O2 __attribute__((optimize("O2"))) +# undef LZ4_FORCE_INLINE +# define LZ4_FORCE_INLINE static __inline __attribute__((optimize("O2"),always_inline)) +#else +# define LZ4_FORCE_O2 +#endif + +#if (defined(__GNUC__) && (__GNUC__ >= 3)) || (defined(__INTEL_COMPILER) && (__INTEL_COMPILER >= 800)) || defined(__clang__) +# define expect(expr,value) (__builtin_expect ((expr),(value)) ) +#else +# define expect(expr,value) (expr) +#endif + +#ifndef likely +#define likely(expr) expect((expr) != 0, 1) +#endif +#ifndef unlikely +#define unlikely(expr) expect((expr) != 0, 0) +#endif + +/* Should the alignment test prove unreliable, for some reason, + * it can be disabled by setting LZ4_ALIGN_TEST to 0 */ +#ifndef LZ4_ALIGN_TEST /* can be externally provided */ +# define LZ4_ALIGN_TEST 1 +#endif + + +/*-************************************ +* Memory routines +**************************************/ + +/*! LZ4_STATIC_LINKING_ONLY_DISABLE_MEMORY_ALLOCATION : + * Disable relatively high-level LZ4/HC functions that use dynamic memory + * allocation functions (malloc(), calloc(), free()). + * + * Note that this is a compile-time switch. And since it disables + * public/stable LZ4 v1 API functions, we don't recommend using this + * symbol to generate a library for distribution. + * + * The following public functions are removed when this symbol is defined. + * - lz4 : LZ4_createStream, LZ4_freeStream, + * LZ4_createStreamDecode, LZ4_freeStreamDecode, LZ4_create (deprecated) + * - lz4hc : LZ4_createStreamHC, LZ4_freeStreamHC, + * LZ4_createHC (deprecated), LZ4_freeHC (deprecated) + * - lz4frame, lz4file : All LZ4F_* functions + */ +#if defined(LZ4_STATIC_LINKING_ONLY_DISABLE_MEMORY_ALLOCATION) +# define ALLOC(s) lz4_error_memory_allocation_is_disabled +# define ALLOC_AND_ZERO(s) lz4_error_memory_allocation_is_disabled +# define FREEMEM(p) lz4_error_memory_allocation_is_disabled +#elif defined(LZ4_USER_MEMORY_FUNCTIONS) +/* memory management functions can be customized by user project. + * Below functions must exist somewhere in the Project + * and be available at link time */ +void* LZ4_malloc(size_t s); +void* LZ4_calloc(size_t n, size_t s); +void LZ4_free(void* p); +# define ALLOC(s) LZ4_malloc(s) +# define ALLOC_AND_ZERO(s) LZ4_calloc(1,s) +# define FREEMEM(p) LZ4_free(p) +#else +# include <stdlib.h> /* malloc, calloc, free */ +# define ALLOC(s) malloc(s) +# define ALLOC_AND_ZERO(s) calloc(1,s) +# define FREEMEM(p) free(p) +#endif + +#if ! LZ4_FREESTANDING +# include <string.h> /* memset, memcpy */ +#endif +#if !defined(LZ4_memset) +# define LZ4_memset(p,v,s) memset((p),(v),(s)) +#endif +#define MEM_INIT(p,v,s) LZ4_memset((p),(v),(s)) + + +/*-************************************ +* Common Constants +**************************************/ +#define MINMATCH 4 + +#define WILDCOPYLENGTH 8 +#define LASTLITERALS 5 /* see ../doc/lz4_Block_format.md#parsing-restrictions */ +#define MFLIMIT 12 /* see ../doc/lz4_Block_format.md#parsing-restrictions */ +#define MATCH_SAFEGUARD_DISTANCE ((2*WILDCOPYLENGTH) - MINMATCH) /* ensure it's possible to write 2 x wildcopyLength without overflowing output buffer */ +#define FASTLOOP_SAFE_DISTANCE 64 +static const int LZ4_minLength = (MFLIMIT+1); + +#define KB *(1 <<10) +#define MB *(1 <<20) +#define GB *(1U<<30) + +#define LZ4_DISTANCE_ABSOLUTE_MAX 65535 +#if (LZ4_DISTANCE_MAX > LZ4_DISTANCE_ABSOLUTE_MAX) /* max supported by LZ4 format */ +# error "LZ4_DISTANCE_MAX is too big : must be <= 65535" +#endif + +#define ML_BITS 4 +#define ML_MASK ((1U<<ML_BITS)-1) +#define RUN_BITS (8-ML_BITS) +#define RUN_MASK ((1U<<RUN_BITS)-1) + + +/*-************************************ +* Error detection +**************************************/ +#if defined(LZ4_DEBUG) && (LZ4_DEBUG>=1) +# include <assert.h> +#else +# ifndef assert +# define assert(condition) ((void)0) +# endif +#endif + +#define LZ4_STATIC_ASSERT(c) { enum { LZ4_static_assert = 1/(int)(!!(c)) }; } /* use after variable declarations */ + +#if defined(LZ4_DEBUG) && (LZ4_DEBUG>=2) +# include <stdio.h> + static int g_debuglog_enable = 1; +# define DEBUGLOG(l, ...) { \ + if ((g_debuglog_enable) && (l<=LZ4_DEBUG)) { \ + fprintf(stderr, __FILE__ " %i: ", __LINE__); \ + fprintf(stderr, __VA_ARGS__); \ + fprintf(stderr, " \n"); \ + } } +#else +# define DEBUGLOG(l, ...) {} /* disabled */ +#endif + +static int LZ4_isAligned(const void* ptr, size_t alignment) +{ + return ((size_t)ptr & (alignment -1)) == 0; +} + + +/*-************************************ +* Types +**************************************/ +#include <limits.h> +#if defined(__cplusplus) || (defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */) +# include <stdint.h> + typedef uint8_t BYTE; + typedef uint16_t U16; + typedef uint32_t U32; + typedef int32_t S32; + typedef uint64_t U64; + typedef uintptr_t uptrval; +#else +# if UINT_MAX != 4294967295UL +# error "LZ4 code (when not C++ or C99) assumes that sizeof(int) == 4" +# endif + typedef unsigned char BYTE; + typedef unsigned short U16; + typedef unsigned int U32; + typedef signed int S32; + typedef unsigned long long U64; + typedef size_t uptrval; /* generally true, except OpenVMS-64 */ +#endif + +#if defined(__x86_64__) + typedef U64 reg_t; /* 64-bits in x32 mode */ +#else + typedef size_t reg_t; /* 32-bits in x32 mode */ +#endif + +typedef enum { + notLimited = 0, + limitedOutput = 1, + fillOutput = 2 +} limitedOutput_directive; + + +/*-************************************ +* Reading and writing into memory +**************************************/ + +/** + * LZ4 relies on memcpy with a constant size being inlined. In freestanding + * environments, the compiler can't assume the implementation of memcpy() is + * standard compliant, so it can't apply its specialized memcpy() inlining + * logic. When possible, use __builtin_memcpy() to tell the compiler to analyze + * memcpy() as if it were standard compliant, so it can inline it in freestanding + * environments. This is needed when decompressing the Linux Kernel, for example. + */ +#if !defined(LZ4_memcpy) +# if defined(__GNUC__) && (__GNUC__ >= 4) +# define LZ4_memcpy(dst, src, size) __builtin_memcpy(dst, src, size) +# else +# define LZ4_memcpy(dst, src, size) memcpy(dst, src, size) +# endif +#endif + +#if !defined(LZ4_memmove) +# if defined(__GNUC__) && (__GNUC__ >= 4) +# define LZ4_memmove __builtin_memmove +# else +# define LZ4_memmove memmove +# endif +#endif + +static unsigned LZ4_isLittleEndian(void) +{ + const union { U32 u; BYTE c[4]; } one = { 1 }; /* don't use static : performance detrimental */ + return one.c[0]; +} + +#if defined(__GNUC__) || defined(__INTEL_COMPILER) +#define LZ4_PACK( __Declaration__ ) __Declaration__ __attribute__((__packed__)) +#elif defined(_MSC_VER) +#define LZ4_PACK( __Declaration__ ) __pragma( pack(push, 1) ) __Declaration__ __pragma( pack(pop)) +#endif + +#if defined(LZ4_FORCE_MEMORY_ACCESS) && (LZ4_FORCE_MEMORY_ACCESS==2) +/* lie to the compiler about data alignment; use with caution */ + +static U16 LZ4_read16(const void* memPtr) { return *(const U16*) memPtr; } +static U32 LZ4_read32(const void* memPtr) { return *(const U32*) memPtr; } +static reg_t LZ4_read_ARCH(const void* memPtr) { return *(const reg_t*) memPtr; } + +static void LZ4_write16(void* memPtr, U16 value) { *(U16*)memPtr = value; } +static void LZ4_write32(void* memPtr, U32 value) { *(U32*)memPtr = value; } + +#elif defined(LZ4_FORCE_MEMORY_ACCESS) && (LZ4_FORCE_MEMORY_ACCESS==1) + +/* __pack instructions are safer, but compiler specific, hence potentially problematic for some compilers */ +/* currently only defined for gcc and icc */ +LZ4_PACK(typedef struct { U16 u16; }) LZ4_unalign16; +LZ4_PACK(typedef struct { U32 u32; }) LZ4_unalign32; +LZ4_PACK(typedef struct { reg_t uArch; }) LZ4_unalignST; + +static U16 LZ4_read16(const void* ptr) { return ((const LZ4_unalign16*)ptr)->u16; } +static U32 LZ4_read32(const void* ptr) { return ((const LZ4_unalign32*)ptr)->u32; } +static reg_t LZ4_read_ARCH(const void* ptr) { return ((const LZ4_unalignST*)ptr)->uArch; } + +static void LZ4_write16(void* memPtr, U16 value) { ((LZ4_unalign16*)memPtr)->u16 = value; } +static void LZ4_write32(void* memPtr, U32 value) { ((LZ4_unalign32*)memPtr)->u32 = value; } + +#else /* safe and portable access using memcpy() */ + +static U16 LZ4_read16(const void* memPtr) +{ + U16 val; LZ4_memcpy(&val, memPtr, sizeof(val)); return val; +} + +static U32 LZ4_read32(const void* memPtr) +{ + U32 val; LZ4_memcpy(&val, memPtr, sizeof(val)); return val; +} + +static reg_t LZ4_read_ARCH(const void* memPtr) +{ + reg_t val; LZ4_memcpy(&val, memPtr, sizeof(val)); return val; +} + +static void LZ4_write16(void* memPtr, U16 value) +{ + LZ4_memcpy(memPtr, &value, sizeof(value)); +} + +static void LZ4_write32(void* memPtr, U32 value) +{ + LZ4_memcpy(memPtr, &value, sizeof(value)); +} + +#endif /* LZ4_FORCE_MEMORY_ACCESS */ + + +static U16 LZ4_readLE16(const void* memPtr) +{ + if (LZ4_isLittleEndian()) { + return LZ4_read16(memPtr); + } else { + const BYTE* p = (const BYTE*)memPtr; + return (U16)((U16)p[0] | (p[1]<<8)); + } +} + +#ifdef LZ4_STATIC_LINKING_ONLY_ENDIANNESS_INDEPENDENT_OUTPUT +static U32 LZ4_readLE32(const void* memPtr) +{ + if (LZ4_isLittleEndian()) { + return LZ4_read32(memPtr); + } else { + const BYTE* p = (const BYTE*)memPtr; + return (U32)p[0] | (p[1]<<8) | (p[2]<<16) | (p[3]<<24); + } +} +#endif + +static void LZ4_writeLE16(void* memPtr, U16 value) +{ + if (LZ4_isLittleEndian()) { + LZ4_write16(memPtr, value); + } else { + BYTE* p = (BYTE*)memPtr; + p[0] = (BYTE) value; + p[1] = (BYTE)(value>>8); + } +} + +/* customized variant of memcpy, which can overwrite up to 8 bytes beyond dstEnd */ +LZ4_FORCE_INLINE +void LZ4_wildCopy8(void* dstPtr, const void* srcPtr, void* dstEnd) +{ + BYTE* d = (BYTE*)dstPtr; + const BYTE* s = (const BYTE*)srcPtr; + BYTE* const e = (BYTE*)dstEnd; + + do { LZ4_memcpy(d,s,8); d+=8; s+=8; } while (d<e); +} + +static const unsigned inc32table[8] = {0, 1, 2, 1, 0, 4, 4, 4}; +static const int dec64table[8] = {0, 0, 0, -1, -4, 1, 2, 3}; + + +#ifndef LZ4_FAST_DEC_LOOP +# if defined __i386__ || defined _M_IX86 || defined __x86_64__ || defined _M_X64 +# define LZ4_FAST_DEC_LOOP 1 +# elif defined(__aarch64__) && defined(__APPLE__) +# define LZ4_FAST_DEC_LOOP 1 +# elif defined(__aarch64__) && !defined(__clang__) + /* On non-Apple aarch64, we disable this optimization for clang because + * on certain mobile chipsets, performance is reduced with clang. For + * more information refer to https://github.com/lz4/lz4/pull/707 */ +# define LZ4_FAST_DEC_LOOP 1 +# else +# define LZ4_FAST_DEC_LOOP 0 +# endif +#endif + +#if LZ4_FAST_DEC_LOOP + +LZ4_FORCE_INLINE void +LZ4_memcpy_using_offset_base(BYTE* dstPtr, const BYTE* srcPtr, BYTE* dstEnd, const size_t offset) +{ + assert(srcPtr + offset == dstPtr); + if (offset < 8) { + LZ4_write32(dstPtr, 0); /* silence an msan warning when offset==0 */ + dstPtr[0] = srcPtr[0]; + dstPtr[1] = srcPtr[1]; + dstPtr[2] = srcPtr[2]; + dstPtr[3] = srcPtr[3]; + srcPtr += inc32table[offset]; + LZ4_memcpy(dstPtr+4, srcPtr, 4); + srcPtr -= dec64table[offset]; + dstPtr += 8; + } else { + LZ4_memcpy(dstPtr, srcPtr, 8); + dstPtr += 8; + srcPtr += 8; + } + + LZ4_wildCopy8(dstPtr, srcPtr, dstEnd); +} + +/* customized variant of memcpy, which can overwrite up to 32 bytes beyond dstEnd + * this version copies two times 16 bytes (instead of one time 32 bytes) + * because it must be compatible with offsets >= 16. */ +LZ4_FORCE_INLINE void +LZ4_wildCopy32(void* dstPtr, const void* srcPtr, void* dstEnd) +{ + BYTE* d = (BYTE*)dstPtr; + const BYTE* s = (const BYTE*)srcPtr; + BYTE* const e = (BYTE*)dstEnd; + + do { LZ4_memcpy(d,s,16); LZ4_memcpy(d+16,s+16,16); d+=32; s+=32; } while (d<e); +} + +/* LZ4_memcpy_using_offset() presumes : + * - dstEnd >= dstPtr + MINMATCH + * - there is at least 12 bytes available to write after dstEnd */ +LZ4_FORCE_INLINE void +LZ4_memcpy_using_offset(BYTE* dstPtr, const BYTE* srcPtr, BYTE* dstEnd, const size_t offset) +{ + BYTE v[8]; + + assert(dstEnd >= dstPtr + MINMATCH); + + switch(offset) { + case 1: + MEM_INIT(v, *srcPtr, 8); + break; + case 2: + LZ4_memcpy(v, srcPtr, 2); + LZ4_memcpy(&v[2], srcPtr, 2); +#if defined(_MSC_VER) && (_MSC_VER <= 1937) /* MSVC 2022 ver 17.7 or earlier */ +# pragma warning(push) +# pragma warning(disable : 6385) /* warning C6385: Reading invalid data from 'v'. */ +#endif + LZ4_memcpy(&v[4], v, 4); +#if defined(_MSC_VER) && (_MSC_VER <= 1937) /* MSVC 2022 ver 17.7 or earlier */ +# pragma warning(pop) +#endif + break; + case 4: + LZ4_memcpy(v, srcPtr, 4); + LZ4_memcpy(&v[4], srcPtr, 4); + break; + default: + LZ4_memcpy_using_offset_base(dstPtr, srcPtr, dstEnd, offset); + return; + } + + LZ4_memcpy(dstPtr, v, 8); + dstPtr += 8; + while (dstPtr < dstEnd) { + LZ4_memcpy(dstPtr, v, 8); + dstPtr += 8; + } +} +#endif + + +/*-************************************ +* Common functions +**************************************/ +static unsigned LZ4_NbCommonBytes (reg_t val) +{ + assert(val != 0); + if (LZ4_isLittleEndian()) { + if (sizeof(val) == 8) { +# if defined(_MSC_VER) && (_MSC_VER >= 1800) && (defined(_M_AMD64) && !defined(_M_ARM64EC)) && !defined(LZ4_FORCE_SW_BITCOUNT) +/*-************************************************************************************************* +* ARM64EC is a Microsoft-designed ARM64 ABI compatible with AMD64 applications on ARM64 Windows 11. +* The ARM64EC ABI does not support AVX/AVX2/AVX512 instructions, nor their relevant intrinsics +* including _tzcnt_u64. Therefore, we need to neuter the _tzcnt_u64 code path for ARM64EC. +****************************************************************************************************/ +# if defined(__clang__) && (__clang_major__ < 10) + /* Avoid undefined clang-cl intrinsics issue. + * See https://github.com/lz4/lz4/pull/1017 for details. */ + return (unsigned)__builtin_ia32_tzcnt_u64(val) >> 3; +# else + /* x64 CPUS without BMI support interpret `TZCNT` as `REP BSF` */ + return (unsigned)_tzcnt_u64(val) >> 3; +# endif +# elif defined(_MSC_VER) && defined(_WIN64) && !defined(LZ4_FORCE_SW_BITCOUNT) + unsigned long r = 0; + _BitScanForward64(&r, (U64)val); + return (unsigned)r >> 3; +# elif (defined(__clang__) || (defined(__GNUC__) && ((__GNUC__ > 3) || \ + ((__GNUC__ == 3) && (__GNUC_MINOR__ >= 4))))) && \ + !defined(LZ4_FORCE_SW_BITCOUNT) + return (unsigned)__builtin_ctzll((U64)val) >> 3; +# else + const U64 m = 0x0101010101010101ULL; + val ^= val - 1; + return (unsigned)(((U64)((val & (m - 1)) * m)) >> 56); +# endif + } else /* 32 bits */ { +# if defined(_MSC_VER) && (_MSC_VER >= 1400) && !defined(LZ4_FORCE_SW_BITCOUNT) + unsigned long r; + _BitScanForward(&r, (U32)val); + return (unsigned)r >> 3; +# elif (defined(__clang__) || (defined(__GNUC__) && ((__GNUC__ > 3) || \ + ((__GNUC__ == 3) && (__GNUC_MINOR__ >= 4))))) && \ + !defined(__TINYC__) && !defined(LZ4_FORCE_SW_BITCOUNT) + return (unsigned)__builtin_ctz((U32)val) >> 3; +# else + const U32 m = 0x01010101; + return (unsigned)((((val - 1) ^ val) & (m - 1)) * m) >> 24; +# endif + } + } else /* Big Endian CPU */ { + if (sizeof(val)==8) { +# if (defined(__clang__) || (defined(__GNUC__) && ((__GNUC__ > 3) || \ + ((__GNUC__ == 3) && (__GNUC_MINOR__ >= 4))))) && \ + !defined(__TINYC__) && !defined(LZ4_FORCE_SW_BITCOUNT) + return (unsigned)__builtin_clzll((U64)val) >> 3; +# else +#if 1 + /* this method is probably faster, + * but adds a 128 bytes lookup table */ + static const unsigned char ctz7_tab[128] = { + 7, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, + 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, + 5, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, + 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, + 6, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, + 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, + 5, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, + 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, + }; + U64 const mask = 0x0101010101010101ULL; + U64 const t = (((val >> 8) - mask) | val) & mask; + return ctz7_tab[(t * 0x0080402010080402ULL) >> 57]; +#else + /* this method doesn't consume memory space like the previous one, + * but it contains several branches, + * that may end up slowing execution */ + static const U32 by32 = sizeof(val)*4; /* 32 on 64 bits (goal), 16 on 32 bits. + Just to avoid some static analyzer complaining about shift by 32 on 32-bits target. + Note that this code path is never triggered in 32-bits mode. */ + unsigned r; + if (!(val>>by32)) { r=4; } else { r=0; val>>=by32; } + if (!(val>>16)) { r+=2; val>>=8; } else { val>>=24; } + r += (!val); + return r; +#endif +# endif + } else /* 32 bits */ { +# if (defined(__clang__) || (defined(__GNUC__) && ((__GNUC__ > 3) || \ + ((__GNUC__ == 3) && (__GNUC_MINOR__ >= 4))))) && \ + !defined(LZ4_FORCE_SW_BITCOUNT) + return (unsigned)__builtin_clz((U32)val) >> 3; +# else + val >>= 8; + val = ((((val + 0x00FFFF00) | 0x00FFFFFF) + val) | + (val + 0x00FF0000)) >> 24; + return (unsigned)val ^ 3; +# endif + } + } +} + + +#define STEPSIZE sizeof(reg_t) +LZ4_FORCE_INLINE +unsigned LZ4_count(const BYTE* pIn, const BYTE* pMatch, const BYTE* pInLimit) +{ + const BYTE* const pStart = pIn; + + if (likely(pIn < pInLimit-(STEPSIZE-1))) { + reg_t const diff = LZ4_read_ARCH(pMatch) ^ LZ4_read_ARCH(pIn); + if (!diff) { + pIn+=STEPSIZE; pMatch+=STEPSIZE; + } else { + return LZ4_NbCommonBytes(diff); + } } + + while (likely(pIn < pInLimit-(STEPSIZE-1))) { + reg_t const diff = LZ4_read_ARCH(pMatch) ^ LZ4_read_ARCH(pIn); + if (!diff) { pIn+=STEPSIZE; pMatch+=STEPSIZE; continue; } + pIn += LZ4_NbCommonBytes(diff); + return (unsigned)(pIn - pStart); + } + + if ((STEPSIZE==8) && (pIn<(pInLimit-3)) && (LZ4_read32(pMatch) == LZ4_read32(pIn))) { pIn+=4; pMatch+=4; } + if ((pIn<(pInLimit-1)) && (LZ4_read16(pMatch) == LZ4_read16(pIn))) { pIn+=2; pMatch+=2; } + if ((pIn<pInLimit) && (*pMatch == *pIn)) pIn++; + return (unsigned)(pIn - pStart); +} + + +#ifndef LZ4_COMMONDEFS_ONLY +/*-************************************ +* Local Constants +**************************************/ +static const int LZ4_64Klimit = ((64 KB) + (MFLIMIT-1)); +static const U32 LZ4_skipTrigger = 6; /* Increase this value ==> compression run slower on incompressible data */ + + +/*-************************************ +* Local Structures and types +**************************************/ +typedef enum { clearedTable = 0, byPtr, byU32, byU16 } tableType_t; + +/** + * This enum distinguishes several different modes of accessing previous + * content in the stream. + * + * - noDict : There is no preceding content. + * - withPrefix64k : Table entries up to ctx->dictSize before the current blob + * blob being compressed are valid and refer to the preceding + * content (of length ctx->dictSize), which is available + * contiguously preceding in memory the content currently + * being compressed. + * - usingExtDict : Like withPrefix64k, but the preceding content is somewhere + * else in memory, starting at ctx->dictionary with length + * ctx->dictSize. + * - usingDictCtx : Everything concerning the preceding content is + * in a separate context, pointed to by ctx->dictCtx. + * ctx->dictionary, ctx->dictSize, and table entries + * in the current context that refer to positions + * preceding the beginning of the current compression are + * ignored. Instead, ctx->dictCtx->dictionary and ctx->dictCtx + * ->dictSize describe the location and size of the preceding + * content, and matches are found by looking in the ctx + * ->dictCtx->hashTable. + */ +typedef enum { noDict = 0, withPrefix64k, usingExtDict, usingDictCtx } dict_directive; +typedef enum { noDictIssue = 0, dictSmall } dictIssue_directive; + + +/*-************************************ +* Local Utils +**************************************/ +int LZ4_versionNumber (void) { return LZ4_VERSION_NUMBER; } +const char* LZ4_versionString(void) { return LZ4_VERSION_STRING; } +int LZ4_compressBound(int isize) { return LZ4_COMPRESSBOUND(isize); } +int LZ4_sizeofState(void) { return sizeof(LZ4_stream_t); } + + +/*-**************************************** +* Internal Definitions, used only in Tests +*******************************************/ +#if defined (__cplusplus) +extern "C" { +#endif + +int LZ4_compress_forceExtDict (LZ4_stream_t* LZ4_dict, const char* source, char* dest, int srcSize); + +int LZ4_decompress_safe_forceExtDict(const char* source, char* dest, + int compressedSize, int maxOutputSize, + const void* dictStart, size_t dictSize); +int LZ4_decompress_safe_partial_forceExtDict(const char* source, char* dest, + int compressedSize, int targetOutputSize, int dstCapacity, + const void* dictStart, size_t dictSize); +#if defined (__cplusplus) +} +#endif + +/*-****************************** +* Compression functions +********************************/ +LZ4_FORCE_INLINE U32 LZ4_hash4(U32 sequence, tableType_t const tableType) +{ + if (tableType == byU16) + return ((sequence * 2654435761U) >> ((MINMATCH*8)-(LZ4_HASHLOG+1))); + else + return ((sequence * 2654435761U) >> ((MINMATCH*8)-LZ4_HASHLOG)); +} + +LZ4_FORCE_INLINE U32 LZ4_hash5(U64 sequence, tableType_t const tableType) +{ + const U32 hashLog = (tableType == byU16) ? LZ4_HASHLOG+1 : LZ4_HASHLOG; + if (LZ4_isLittleEndian()) { + const U64 prime5bytes = 889523592379ULL; + return (U32)(((sequence << 24) * prime5bytes) >> (64 - hashLog)); + } else { + const U64 prime8bytes = 11400714785074694791ULL; + return (U32)(((sequence >> 24) * prime8bytes) >> (64 - hashLog)); + } +} + +LZ4_FORCE_INLINE U32 LZ4_hashPosition(const void* const p, tableType_t const tableType) +{ + if ((sizeof(reg_t)==8) && (tableType != byU16)) return LZ4_hash5(LZ4_read_ARCH(p), tableType); + +#ifdef LZ4_STATIC_LINKING_ONLY_ENDIANNESS_INDEPENDENT_OUTPUT + return LZ4_hash4(LZ4_readLE32(p), tableType); +#else + return LZ4_hash4(LZ4_read32(p), tableType); +#endif +} + +LZ4_FORCE_INLINE void LZ4_clearHash(U32 h, void* tableBase, tableType_t const tableType) +{ + switch (tableType) + { + default: /* fallthrough */ + case clearedTable: { /* illegal! */ assert(0); return; } + case byPtr: { const BYTE** hashTable = (const BYTE**)tableBase; hashTable[h] = NULL; return; } + case byU32: { U32* hashTable = (U32*) tableBase; hashTable[h] = 0; return; } + case byU16: { U16* hashTable = (U16*) tableBase; hashTable[h] = 0; return; } + } +} + +LZ4_FORCE_INLINE void LZ4_putIndexOnHash(U32 idx, U32 h, void* tableBase, tableType_t const tableType) +{ + switch (tableType) + { + default: /* fallthrough */ + case clearedTable: /* fallthrough */ + case byPtr: { /* illegal! */ assert(0); return; } + case byU32: { U32* hashTable = (U32*) tableBase; hashTable[h] = idx; return; } + case byU16: { U16* hashTable = (U16*) tableBase; assert(idx < 65536); hashTable[h] = (U16)idx; return; } + } +} + +/* LZ4_putPosition*() : only used in byPtr mode */ +LZ4_FORCE_INLINE void LZ4_putPositionOnHash(const BYTE* p, U32 h, + void* tableBase, tableType_t const tableType) +{ + const BYTE** const hashTable = (const BYTE**)tableBase; + assert(tableType == byPtr); (void)tableType; + hashTable[h] = p; +} + +LZ4_FORCE_INLINE void LZ4_putPosition(const BYTE* p, void* tableBase, tableType_t tableType) +{ + U32 const h = LZ4_hashPosition(p, tableType); + LZ4_putPositionOnHash(p, h, tableBase, tableType); +} + +/* LZ4_getIndexOnHash() : + * Index of match position registered in hash table. + * hash position must be calculated by using base+index, or dictBase+index. + * Assumption 1 : only valid if tableType == byU32 or byU16. + * Assumption 2 : h is presumed valid (within limits of hash table) + */ +LZ4_FORCE_INLINE U32 LZ4_getIndexOnHash(U32 h, const void* tableBase, tableType_t tableType) +{ + LZ4_STATIC_ASSERT(LZ4_MEMORY_USAGE > 2); + if (tableType == byU32) { + const U32* const hashTable = (const U32*) tableBase; + assert(h < (1U << (LZ4_MEMORY_USAGE-2))); + return hashTable[h]; + } + if (tableType == byU16) { + const U16* const hashTable = (const U16*) tableBase; + assert(h < (1U << (LZ4_MEMORY_USAGE-1))); + return hashTable[h]; + } + assert(0); return 0; /* forbidden case */ +} + +static const BYTE* LZ4_getPositionOnHash(U32 h, const void* tableBase, tableType_t tableType) +{ + assert(tableType == byPtr); (void)tableType; + { const BYTE* const* hashTable = (const BYTE* const*) tableBase; return hashTable[h]; } +} + +LZ4_FORCE_INLINE const BYTE* +LZ4_getPosition(const BYTE* p, + const void* tableBase, tableType_t tableType) +{ + U32 const h = LZ4_hashPosition(p, tableType); + return LZ4_getPositionOnHash(h, tableBase, tableType); +} + +LZ4_FORCE_INLINE void +LZ4_prepareTable(LZ4_stream_t_internal* const cctx, + const int inputSize, + const tableType_t tableType) { + /* If the table hasn't been used, it's guaranteed to be zeroed out, and is + * therefore safe to use no matter what mode we're in. Otherwise, we figure + * out if it's safe to leave as is or whether it needs to be reset. + */ + if ((tableType_t)cctx->tableType != clearedTable) { + assert(inputSize >= 0); + if ((tableType_t)cctx->tableType != tableType + || ((tableType == byU16) && cctx->currentOffset + (unsigned)inputSize >= 0xFFFFU) + || ((tableType == byU32) && cctx->currentOffset > 1 GB) + || tableType == byPtr + || inputSize >= 4 KB) + { + DEBUGLOG(4, "LZ4_prepareTable: Resetting table in %p", cctx); + MEM_INIT(cctx->hashTable, 0, LZ4_HASHTABLESIZE); + cctx->currentOffset = 0; + cctx->tableType = (U32)clearedTable; + } else { + DEBUGLOG(4, "LZ4_prepareTable: Re-use hash table (no reset)"); + } + } + + /* Adding a gap, so all previous entries are > LZ4_DISTANCE_MAX back, + * is faster than compressing without a gap. + * However, compressing with currentOffset == 0 is faster still, + * so we preserve that case. + */ + if (cctx->currentOffset != 0 && tableType == byU32) { + DEBUGLOG(5, "LZ4_prepareTable: adding 64KB to currentOffset"); + cctx->currentOffset += 64 KB; + } + + /* Finally, clear history */ + cctx->dictCtx = NULL; + cctx->dictionary = NULL; + cctx->dictSize = 0; +} + +/** LZ4_compress_generic_validated() : + * inlined, to ensure branches are decided at compilation time. + * The following conditions are presumed already validated: + * - source != NULL + * - inputSize > 0 + */ +LZ4_FORCE_INLINE int LZ4_compress_generic_validated( + LZ4_stream_t_internal* const cctx, + const char* const source, + char* const dest, + const int inputSize, + int* inputConsumed, /* only written when outputDirective == fillOutput */ + const int maxOutputSize, + const limitedOutput_directive outputDirective, + const tableType_t tableType, + const dict_directive dictDirective, + const dictIssue_directive dictIssue, + const int acceleration) +{ + int result; + const BYTE* ip = (const BYTE*)source; + + U32 const startIndex = cctx->currentOffset; + const BYTE* base = (const BYTE*)source - startIndex; + const BYTE* lowLimit; + + const LZ4_stream_t_internal* dictCtx = (const LZ4_stream_t_internal*) cctx->dictCtx; + const BYTE* const dictionary = + dictDirective == usingDictCtx ? dictCtx->dictionary : cctx->dictionary; + const U32 dictSize = + dictDirective == usingDictCtx ? dictCtx->dictSize : cctx->dictSize; + const U32 dictDelta = + (dictDirective == usingDictCtx) ? startIndex - dictCtx->currentOffset : 0; /* make indexes in dictCtx comparable with indexes in current context */ + + int const maybe_extMem = (dictDirective == usingExtDict) || (dictDirective == usingDictCtx); + U32 const prefixIdxLimit = startIndex - dictSize; /* used when dictDirective == dictSmall */ + const BYTE* const dictEnd = dictionary ? dictionary + dictSize : dictionary; + const BYTE* anchor = (const BYTE*) source; + const BYTE* const iend = ip + inputSize; + const BYTE* const mflimitPlusOne = iend - MFLIMIT + 1; + const BYTE* const matchlimit = iend - LASTLITERALS; + + /* the dictCtx currentOffset is indexed on the start of the dictionary, + * while a dictionary in the current context precedes the currentOffset */ + const BYTE* dictBase = (dictionary == NULL) ? NULL : + (dictDirective == usingDictCtx) ? + dictionary + dictSize - dictCtx->currentOffset : + dictionary + dictSize - startIndex; + + BYTE* op = (BYTE*) dest; + BYTE* const olimit = op + maxOutputSize; + + U32 offset = 0; + U32 forwardH; + + DEBUGLOG(5, "LZ4_compress_generic_validated: srcSize=%i, tableType=%u", inputSize, tableType); + assert(ip != NULL); + if (tableType == byU16) assert(inputSize<LZ4_64Klimit); /* Size too large (not within 64K limit) */ + if (tableType == byPtr) assert(dictDirective==noDict); /* only supported use case with byPtr */ + /* If init conditions are not met, we don't have to mark stream + * as having dirty context, since no action was taken yet */ + if (outputDirective == fillOutput && maxOutputSize < 1) { return 0; } /* Impossible to store anything */ + assert(acceleration >= 1); + + lowLimit = (const BYTE*)source - (dictDirective == withPrefix64k ? dictSize : 0); + + /* Update context state */ + if (dictDirective == usingDictCtx) { + /* Subsequent linked blocks can't use the dictionary. */ + /* Instead, they use the block we just compressed. */ + cctx->dictCtx = NULL; + cctx->dictSize = (U32)inputSize; + } else { + cctx->dictSize += (U32)inputSize; + } + cctx->currentOffset += (U32)inputSize; + cctx->tableType = (U32)tableType; + + if (inputSize<LZ4_minLength) goto _last_literals; /* Input too small, no compression (all literals) */ + + /* First Byte */ + { U32 const h = LZ4_hashPosition(ip, tableType); + if (tableType == byPtr) { + LZ4_putPositionOnHash(ip, h, cctx->hashTable, byPtr); + } else { + LZ4_putIndexOnHash(startIndex, h, cctx->hashTable, tableType); + } } + ip++; forwardH = LZ4_hashPosition(ip, tableType); + + /* Main Loop */ + for ( ; ; ) { + const BYTE* match; + BYTE* token; + const BYTE* filledIp; + + /* Find a match */ + if (tableType == byPtr) { + const BYTE* forwardIp = ip; + int step = 1; + int searchMatchNb = acceleration << LZ4_skipTrigger; + do { + U32 const h = forwardH; + ip = forwardIp; + forwardIp += step; + step = (searchMatchNb++ >> LZ4_skipTrigger); + + if (unlikely(forwardIp > mflimitPlusOne)) goto _last_literals; + assert(ip < mflimitPlusOne); + + match = LZ4_getPositionOnHash(h, cctx->hashTable, tableType); + forwardH = LZ4_hashPosition(forwardIp, tableType); + LZ4_putPositionOnHash(ip, h, cctx->hashTable, tableType); + + } while ( (match+LZ4_DISTANCE_MAX < ip) + || (LZ4_read32(match) != LZ4_read32(ip)) ); + + } else { /* byU32, byU16 */ + + const BYTE* forwardIp = ip; + int step = 1; + int searchMatchNb = acceleration << LZ4_skipTrigger; + do { + U32 const h = forwardH; + U32 const current = (U32)(forwardIp - base); + U32 matchIndex = LZ4_getIndexOnHash(h, cctx->hashTable, tableType); + assert(matchIndex <= current); + assert(forwardIp - base < (ptrdiff_t)(2 GB - 1)); + ip = forwardIp; + forwardIp += step; + step = (searchMatchNb++ >> LZ4_skipTrigger); + + if (unlikely(forwardIp > mflimitPlusOne)) goto _last_literals; + assert(ip < mflimitPlusOne); + + if (dictDirective == usingDictCtx) { + if (matchIndex < startIndex) { + /* there was no match, try the dictionary */ + assert(tableType == byU32); + matchIndex = LZ4_getIndexOnHash(h, dictCtx->hashTable, byU32); + match = dictBase + matchIndex; + matchIndex += dictDelta; /* make dictCtx index comparable with current context */ + lowLimit = dictionary; + } else { + match = base + matchIndex; + lowLimit = (const BYTE*)source; + } + } else if (dictDirective == usingExtDict) { + if (matchIndex < startIndex) { + DEBUGLOG(7, "extDict candidate: matchIndex=%5u < startIndex=%5u", matchIndex, startIndex); + assert(startIndex - matchIndex >= MINMATCH); + assert(dictBase); + match = dictBase + matchIndex; + lowLimit = dictionary; + } else { + match = base + matchIndex; + lowLimit = (const BYTE*)source; + } + } else { /* single continuous memory segment */ + match = base + matchIndex; + } + forwardH = LZ4_hashPosition(forwardIp, tableType); + LZ4_putIndexOnHash(current, h, cctx->hashTable, tableType); + + DEBUGLOG(7, "candidate at pos=%u (offset=%u \n", matchIndex, current - matchIndex); + if ((dictIssue == dictSmall) && (matchIndex < prefixIdxLimit)) { continue; } /* match outside of valid area */ + assert(matchIndex < current); + if ( ((tableType != byU16) || (LZ4_DISTANCE_MAX < LZ4_DISTANCE_ABSOLUTE_MAX)) + && (matchIndex+LZ4_DISTANCE_MAX < current)) { + continue; + } /* too far */ + assert((current - matchIndex) <= LZ4_DISTANCE_MAX); /* match now expected within distance */ + + if (LZ4_read32(match) == LZ4_read32(ip)) { + if (maybe_extMem) offset = current - matchIndex; + break; /* match found */ + } + + } while(1); + } + + /* Catch up */ + filledIp = ip; + assert(ip > anchor); /* this is always true as ip has been advanced before entering the main loop */ + if ((match > lowLimit) && unlikely(ip[-1] == match[-1])) { + do { ip--; match--; } while (((ip > anchor) & (match > lowLimit)) && (unlikely(ip[-1] == match[-1]))); + } + + /* Encode Literals */ + { unsigned const litLength = (unsigned)(ip - anchor); + token = op++; + if ((outputDirective == limitedOutput) && /* Check output buffer overflow */ + (unlikely(op + litLength + (2 + 1 + LASTLITERALS) + (litLength/255) > olimit)) ) { + return 0; /* cannot compress within `dst` budget. Stored indexes in hash table are nonetheless fine */ + } + if ((outputDirective == fillOutput) && + (unlikely(op + (litLength+240)/255 /* litlen */ + litLength /* literals */ + 2 /* offset */ + 1 /* token */ + MFLIMIT - MINMATCH /* min last literals so last match is <= end - MFLIMIT */ > olimit))) { + op--; + goto _last_literals; + } + if (litLength >= RUN_MASK) { + unsigned len = litLength - RUN_MASK; + *token = (RUN_MASK<<ML_BITS); + for(; len >= 255 ; len-=255) *op++ = 255; + *op++ = (BYTE)len; + } + else *token = (BYTE)(litLength<<ML_BITS); + + /* Copy Literals */ + LZ4_wildCopy8(op, anchor, op+litLength); + op+=litLength; + DEBUGLOG(6, "seq.start:%i, literals=%u, match.start:%i", + (int)(anchor-(const BYTE*)source), litLength, (int)(ip-(const BYTE*)source)); + } + +_next_match: + /* at this stage, the following variables must be correctly set : + * - ip : at start of LZ operation + * - match : at start of previous pattern occurrence; can be within current prefix, or within extDict + * - offset : if maybe_ext_memSegment==1 (constant) + * - lowLimit : must be == dictionary to mean "match is within extDict"; must be == source otherwise + * - token and *token : position to write 4-bits for match length; higher 4-bits for literal length supposed already written + */ + + if ((outputDirective == fillOutput) && + (op + 2 /* offset */ + 1 /* token */ + MFLIMIT - MINMATCH /* min last literals so last match is <= end - MFLIMIT */ > olimit)) { + /* the match was too close to the end, rewind and go to last literals */ + op = token; + goto _last_literals; + } + + /* Encode Offset */ + if (maybe_extMem) { /* static test */ + DEBUGLOG(6, " with offset=%u (ext if > %i)", offset, (int)(ip - (const BYTE*)source)); + assert(offset <= LZ4_DISTANCE_MAX && offset > 0); + LZ4_writeLE16(op, (U16)offset); op+=2; + } else { + DEBUGLOG(6, " with offset=%u (same segment)", (U32)(ip - match)); + assert(ip-match <= LZ4_DISTANCE_MAX); + LZ4_writeLE16(op, (U16)(ip - match)); op+=2; + } + + /* Encode MatchLength */ + { unsigned matchCode; + + if ( (dictDirective==usingExtDict || dictDirective==usingDictCtx) + && (lowLimit==dictionary) /* match within extDict */ ) { + const BYTE* limit = ip + (dictEnd-match); + assert(dictEnd > match); + if (limit > matchlimit) limit = matchlimit; + matchCode = LZ4_count(ip+MINMATCH, match+MINMATCH, limit); + ip += (size_t)matchCode + MINMATCH; + if (ip==limit) { + unsigned const more = LZ4_count(limit, (const BYTE*)source, matchlimit); + matchCode += more; + ip += more; + } + DEBUGLOG(6, " with matchLength=%u starting in extDict", matchCode+MINMATCH); + } else { + matchCode = LZ4_count(ip+MINMATCH, match+MINMATCH, matchlimit); + ip += (size_t)matchCode + MINMATCH; + DEBUGLOG(6, " with matchLength=%u", matchCode+MINMATCH); + } + + if ((outputDirective) && /* Check output buffer overflow */ + (unlikely(op + (1 + LASTLITERALS) + (matchCode+240)/255 > olimit)) ) { + if (outputDirective == fillOutput) { + /* Match description too long : reduce it */ + U32 newMatchCode = 15 /* in token */ - 1 /* to avoid needing a zero byte */ + ((U32)(olimit - op) - 1 - LASTLITERALS) * 255; + ip -= matchCode - newMatchCode; + assert(newMatchCode < matchCode); + matchCode = newMatchCode; + if (unlikely(ip <= filledIp)) { + /* We have already filled up to filledIp so if ip ends up less than filledIp + * we have positions in the hash table beyond the current position. This is + * a problem if we reuse the hash table. So we have to remove these positions + * from the hash table. + */ + const BYTE* ptr; + DEBUGLOG(5, "Clearing %u positions", (U32)(filledIp - ip)); + for (ptr = ip; ptr <= filledIp; ++ptr) { + U32 const h = LZ4_hashPosition(ptr, tableType); + LZ4_clearHash(h, cctx->hashTable, tableType); + } + } + } else { + assert(outputDirective == limitedOutput); + return 0; /* cannot compress within `dst` budget. Stored indexes in hash table are nonetheless fine */ + } + } + if (matchCode >= ML_MASK) { + *token += ML_MASK; + matchCode -= ML_MASK; + LZ4_write32(op, 0xFFFFFFFF); + while (matchCode >= 4*255) { + op+=4; + LZ4_write32(op, 0xFFFFFFFF); + matchCode -= 4*255; + } + op += matchCode / 255; + *op++ = (BYTE)(matchCode % 255); + } else + *token += (BYTE)(matchCode); + } + /* Ensure we have enough space for the last literals. */ + assert(!(outputDirective == fillOutput && op + 1 + LASTLITERALS > olimit)); + + anchor = ip; + + /* Test end of chunk */ + if (ip >= mflimitPlusOne) break; + + /* Fill table */ + { U32 const h = LZ4_hashPosition(ip-2, tableType); + if (tableType == byPtr) { + LZ4_putPositionOnHash(ip-2, h, cctx->hashTable, byPtr); + } else { + U32 const idx = (U32)((ip-2) - base); + LZ4_putIndexOnHash(idx, h, cctx->hashTable, tableType); + } } + + /* Test next position */ + if (tableType == byPtr) { + + match = LZ4_getPosition(ip, cctx->hashTable, tableType); + LZ4_putPosition(ip, cctx->hashTable, tableType); + if ( (match+LZ4_DISTANCE_MAX >= ip) + && (LZ4_read32(match) == LZ4_read32(ip)) ) + { token=op++; *token=0; goto _next_match; } + + } else { /* byU32, byU16 */ + + U32 const h = LZ4_hashPosition(ip, tableType); + U32 const current = (U32)(ip-base); + U32 matchIndex = LZ4_getIndexOnHash(h, cctx->hashTable, tableType); + assert(matchIndex < current); + if (dictDirective == usingDictCtx) { + if (matchIndex < startIndex) { + /* there was no match, try the dictionary */ + assert(tableType == byU32); + matchIndex = LZ4_getIndexOnHash(h, dictCtx->hashTable, byU32); + match = dictBase + matchIndex; + lowLimit = dictionary; /* required for match length counter */ + matchIndex += dictDelta; + } else { + match = base + matchIndex; + lowLimit = (const BYTE*)source; /* required for match length counter */ + } + } else if (dictDirective==usingExtDict) { + if (matchIndex < startIndex) { + assert(dictBase); + match = dictBase + matchIndex; + lowLimit = dictionary; /* required for match length counter */ + } else { + match = base + matchIndex; + lowLimit = (const BYTE*)source; /* required for match length counter */ + } + } else { /* single memory segment */ + match = base + matchIndex; + } + LZ4_putIndexOnHash(current, h, cctx->hashTable, tableType); + assert(matchIndex < current); + if ( ((dictIssue==dictSmall) ? (matchIndex >= prefixIdxLimit) : 1) + && (((tableType==byU16) && (LZ4_DISTANCE_MAX == LZ4_DISTANCE_ABSOLUTE_MAX)) ? 1 : (matchIndex+LZ4_DISTANCE_MAX >= current)) + && (LZ4_read32(match) == LZ4_read32(ip)) ) { + token=op++; + *token=0; + if (maybe_extMem) offset = current - matchIndex; + DEBUGLOG(6, "seq.start:%i, literals=%u, match.start:%i", + (int)(anchor-(const BYTE*)source), 0, (int)(ip-(const BYTE*)source)); + goto _next_match; + } + } + + /* Prepare next loop */ + forwardH = LZ4_hashPosition(++ip, tableType); + + } + +_last_literals: + /* Encode Last Literals */ + { size_t lastRun = (size_t)(iend - anchor); + if ( (outputDirective) && /* Check output buffer overflow */ + (op + lastRun + 1 + ((lastRun+255-RUN_MASK)/255) > olimit)) { + if (outputDirective == fillOutput) { + /* adapt lastRun to fill 'dst' */ + assert(olimit >= op); + lastRun = (size_t)(olimit-op) - 1/*token*/; + lastRun -= (lastRun + 256 - RUN_MASK) / 256; /*additional length tokens*/ + } else { + assert(outputDirective == limitedOutput); + return 0; /* cannot compress within `dst` budget. Stored indexes in hash table are nonetheless fine */ + } + } + DEBUGLOG(6, "Final literal run : %i literals", (int)lastRun); + if (lastRun >= RUN_MASK) { + size_t accumulator = lastRun - RUN_MASK; + *op++ = RUN_MASK << ML_BITS; + for(; accumulator >= 255 ; accumulator-=255) *op++ = 255; + *op++ = (BYTE) accumulator; + } else { + *op++ = (BYTE)(lastRun<<ML_BITS); + } + LZ4_memcpy(op, anchor, lastRun); + ip = anchor + lastRun; + op += lastRun; + } + + if (outputDirective == fillOutput) { + *inputConsumed = (int) (((const char*)ip)-source); + } + result = (int)(((char*)op) - dest); + assert(result > 0); + DEBUGLOG(5, "LZ4_compress_generic: compressed %i bytes into %i bytes", inputSize, result); + return result; +} + +/** LZ4_compress_generic() : + * inlined, to ensure branches are decided at compilation time; + * takes care of src == (NULL, 0) + * and forward the rest to LZ4_compress_generic_validated */ +LZ4_FORCE_INLINE int LZ4_compress_generic( + LZ4_stream_t_internal* const cctx, + const char* const src, + char* const dst, + const int srcSize, + int *inputConsumed, /* only written when outputDirective == fillOutput */ + const int dstCapacity, + const limitedOutput_directive outputDirective, + const tableType_t tableType, + const dict_directive dictDirective, + const dictIssue_directive dictIssue, + const int acceleration) +{ + DEBUGLOG(5, "LZ4_compress_generic: srcSize=%i, dstCapacity=%i", + srcSize, dstCapacity); + + if ((U32)srcSize > (U32)LZ4_MAX_INPUT_SIZE) { return 0; } /* Unsupported srcSize, too large (or negative) */ + if (srcSize == 0) { /* src == NULL supported if srcSize == 0 */ + if (outputDirective != notLimited && dstCapacity <= 0) return 0; /* no output, can't write anything */ + DEBUGLOG(5, "Generating an empty block"); + assert(outputDirective == notLimited || dstCapacity >= 1); + assert(dst != NULL); + dst[0] = 0; + if (outputDirective == fillOutput) { + assert (inputConsumed != NULL); + *inputConsumed = 0; + } + return 1; + } + assert(src != NULL); + + return LZ4_compress_generic_validated(cctx, src, dst, srcSize, + inputConsumed, /* only written into if outputDirective == fillOutput */ + dstCapacity, outputDirective, + tableType, dictDirective, dictIssue, acceleration); +} + + +int LZ4_compress_fast_extState(void* state, const char* source, char* dest, int inputSize, int maxOutputSize, int acceleration) +{ + LZ4_stream_t_internal* const ctx = & LZ4_initStream(state, sizeof(LZ4_stream_t)) -> internal_donotuse; + assert(ctx != NULL); + if (acceleration < 1) acceleration = LZ4_ACCELERATION_DEFAULT; + if (acceleration > LZ4_ACCELERATION_MAX) acceleration = LZ4_ACCELERATION_MAX; + if (maxOutputSize >= LZ4_compressBound(inputSize)) { + if (inputSize < LZ4_64Klimit) { + return LZ4_compress_generic(ctx, source, dest, inputSize, NULL, 0, notLimited, byU16, noDict, noDictIssue, acceleration); + } else { + const tableType_t tableType = ((sizeof(void*)==4) && ((uptrval)source > LZ4_DISTANCE_MAX)) ? byPtr : byU32; + return LZ4_compress_generic(ctx, source, dest, inputSize, NULL, 0, notLimited, tableType, noDict, noDictIssue, acceleration); + } + } else { + if (inputSize < LZ4_64Klimit) { + return LZ4_compress_generic(ctx, source, dest, inputSize, NULL, maxOutputSize, limitedOutput, byU16, noDict, noDictIssue, acceleration); + } else { + const tableType_t tableType = ((sizeof(void*)==4) && ((uptrval)source > LZ4_DISTANCE_MAX)) ? byPtr : byU32; + return LZ4_compress_generic(ctx, source, dest, inputSize, NULL, maxOutputSize, limitedOutput, tableType, noDict, noDictIssue, acceleration); + } + } +} + +/** + * LZ4_compress_fast_extState_fastReset() : + * A variant of LZ4_compress_fast_extState(). + * + * Using this variant avoids an expensive initialization step. It is only safe + * to call if the state buffer is known to be correctly initialized already + * (see comment in lz4.h on LZ4_resetStream_fast() for a definition of + * "correctly initialized"). + */ +int LZ4_compress_fast_extState_fastReset(void* state, const char* src, char* dst, int srcSize, int dstCapacity, int acceleration) +{ + LZ4_stream_t_internal* const ctx = &((LZ4_stream_t*)state)->internal_donotuse; + if (acceleration < 1) acceleration = LZ4_ACCELERATION_DEFAULT; + if (acceleration > LZ4_ACCELERATION_MAX) acceleration = LZ4_ACCELERATION_MAX; + assert(ctx != NULL); + + if (dstCapacity >= LZ4_compressBound(srcSize)) { + if (srcSize < LZ4_64Klimit) { + const tableType_t tableType = byU16; + LZ4_prepareTable(ctx, srcSize, tableType); + if (ctx->currentOffset) { + return LZ4_compress_generic(ctx, src, dst, srcSize, NULL, 0, notLimited, tableType, noDict, dictSmall, acceleration); + } else { + return LZ4_compress_generic(ctx, src, dst, srcSize, NULL, 0, notLimited, tableType, noDict, noDictIssue, acceleration); + } + } else { + const tableType_t tableType = ((sizeof(void*)==4) && ((uptrval)src > LZ4_DISTANCE_MAX)) ? byPtr : byU32; + LZ4_prepareTable(ctx, srcSize, tableType); + return LZ4_compress_generic(ctx, src, dst, srcSize, NULL, 0, notLimited, tableType, noDict, noDictIssue, acceleration); + } + } else { + if (srcSize < LZ4_64Klimit) { + const tableType_t tableType = byU16; + LZ4_prepareTable(ctx, srcSize, tableType); + if (ctx->currentOffset) { + return LZ4_compress_generic(ctx, src, dst, srcSize, NULL, dstCapacity, limitedOutput, tableType, noDict, dictSmall, acceleration); + } else { + return LZ4_compress_generic(ctx, src, dst, srcSize, NULL, dstCapacity, limitedOutput, tableType, noDict, noDictIssue, acceleration); + } + } else { + const tableType_t tableType = ((sizeof(void*)==4) && ((uptrval)src > LZ4_DISTANCE_MAX)) ? byPtr : byU32; + LZ4_prepareTable(ctx, srcSize, tableType); + return LZ4_compress_generic(ctx, src, dst, srcSize, NULL, dstCapacity, limitedOutput, tableType, noDict, noDictIssue, acceleration); + } + } +} + + +int LZ4_compress_fast(const char* src, char* dest, int srcSize, int dstCapacity, int acceleration) +{ + int result; +#if (LZ4_HEAPMODE) + LZ4_stream_t* const ctxPtr = (LZ4_stream_t*)ALLOC(sizeof(LZ4_stream_t)); /* malloc-calloc always properly aligned */ + if (ctxPtr == NULL) return 0; +#else + LZ4_stream_t ctx; + LZ4_stream_t* const ctxPtr = &ctx; +#endif + result = LZ4_compress_fast_extState(ctxPtr, src, dest, srcSize, dstCapacity, acceleration); + +#if (LZ4_HEAPMODE) + FREEMEM(ctxPtr); +#endif + return result; +} + + +int LZ4_compress_default(const char* src, char* dst, int srcSize, int dstCapacity) +{ + return LZ4_compress_fast(src, dst, srcSize, dstCapacity, 1); +} + + +/* Note!: This function leaves the stream in an unclean/broken state! + * It is not safe to subsequently use the same state with a _fastReset() or + * _continue() call without resetting it. */ +static int LZ4_compress_destSize_extState_internal(LZ4_stream_t* state, const char* src, char* dst, int* srcSizePtr, int targetDstSize, int acceleration) +{ + void* const s = LZ4_initStream(state, sizeof (*state)); + assert(s != NULL); (void)s; + + if (targetDstSize >= LZ4_compressBound(*srcSizePtr)) { /* compression success is guaranteed */ + return LZ4_compress_fast_extState(state, src, dst, *srcSizePtr, targetDstSize, acceleration); + } else { + if (*srcSizePtr < LZ4_64Klimit) { + return LZ4_compress_generic(&state->internal_donotuse, src, dst, *srcSizePtr, srcSizePtr, targetDstSize, fillOutput, byU16, noDict, noDictIssue, acceleration); + } else { + tableType_t const addrMode = ((sizeof(void*)==4) && ((uptrval)src > LZ4_DISTANCE_MAX)) ? byPtr : byU32; + return LZ4_compress_generic(&state->internal_donotuse, src, dst, *srcSizePtr, srcSizePtr, targetDstSize, fillOutput, addrMode, noDict, noDictIssue, acceleration); + } } +} + +int LZ4_compress_destSize_extState(void* state, const char* src, char* dst, int* srcSizePtr, int targetDstSize, int acceleration) +{ + int const r = LZ4_compress_destSize_extState_internal((LZ4_stream_t*)state, src, dst, srcSizePtr, targetDstSize, acceleration); + /* clean the state on exit */ + LZ4_initStream(state, sizeof (LZ4_stream_t)); + return r; +} + + +int LZ4_compress_destSize(const char* src, char* dst, int* srcSizePtr, int targetDstSize) +{ +#if (LZ4_HEAPMODE) + LZ4_stream_t* const ctx = (LZ4_stream_t*)ALLOC(sizeof(LZ4_stream_t)); /* malloc-calloc always properly aligned */ + if (ctx == NULL) return 0; +#else + LZ4_stream_t ctxBody; + LZ4_stream_t* const ctx = &ctxBody; +#endif + + int result = LZ4_compress_destSize_extState_internal(ctx, src, dst, srcSizePtr, targetDstSize, 1); + +#if (LZ4_HEAPMODE) + FREEMEM(ctx); +#endif + return result; +} + + + +/*-****************************** +* Streaming functions +********************************/ + +#if !defined(LZ4_STATIC_LINKING_ONLY_DISABLE_MEMORY_ALLOCATION) +LZ4_stream_t* LZ4_createStream(void) +{ + LZ4_stream_t* const lz4s = (LZ4_stream_t*)ALLOC(sizeof(LZ4_stream_t)); + LZ4_STATIC_ASSERT(sizeof(LZ4_stream_t) >= sizeof(LZ4_stream_t_internal)); + DEBUGLOG(4, "LZ4_createStream %p", lz4s); + if (lz4s == NULL) return NULL; + LZ4_initStream(lz4s, sizeof(*lz4s)); + return lz4s; +} +#endif + +static size_t LZ4_stream_t_alignment(void) +{ +#if LZ4_ALIGN_TEST + typedef struct { char c; LZ4_stream_t t; } t_a; + return sizeof(t_a) - sizeof(LZ4_stream_t); +#else + return 1; /* effectively disabled */ +#endif +} + +LZ4_stream_t* LZ4_initStream (void* buffer, size_t size) +{ + DEBUGLOG(5, "LZ4_initStream"); + if (buffer == NULL) { return NULL; } + if (size < sizeof(LZ4_stream_t)) { return NULL; } + if (!LZ4_isAligned(buffer, LZ4_stream_t_alignment())) return NULL; + MEM_INIT(buffer, 0, sizeof(LZ4_stream_t_internal)); + return (LZ4_stream_t*)buffer; +} + +/* resetStream is now deprecated, + * prefer initStream() which is more general */ +void LZ4_resetStream (LZ4_stream_t* LZ4_stream) +{ + DEBUGLOG(5, "LZ4_resetStream (ctx:%p)", LZ4_stream); + MEM_INIT(LZ4_stream, 0, sizeof(LZ4_stream_t_internal)); +} + +void LZ4_resetStream_fast(LZ4_stream_t* ctx) { + LZ4_prepareTable(&(ctx->internal_donotuse), 0, byU32); +} + +#if !defined(LZ4_STATIC_LINKING_ONLY_DISABLE_MEMORY_ALLOCATION) +int LZ4_freeStream (LZ4_stream_t* LZ4_stream) +{ + if (!LZ4_stream) return 0; /* support free on NULL */ + DEBUGLOG(5, "LZ4_freeStream %p", LZ4_stream); + FREEMEM(LZ4_stream); + return (0); +} +#endif + + +typedef enum { _ld_fast, _ld_slow } LoadDict_mode_e; +#define HASH_UNIT sizeof(reg_t) +int LZ4_loadDict_internal(LZ4_stream_t* LZ4_dict, + const char* dictionary, int dictSize, + LoadDict_mode_e _ld) +{ + LZ4_stream_t_internal* const dict = &LZ4_dict->internal_donotuse; + const tableType_t tableType = byU32; + const BYTE* p = (const BYTE*)dictionary; + const BYTE* const dictEnd = p + dictSize; + U32 idx32; + + DEBUGLOG(4, "LZ4_loadDict (%i bytes from %p into %p)", dictSize, dictionary, LZ4_dict); + + /* It's necessary to reset the context, + * and not just continue it with prepareTable() + * to avoid any risk of generating overflowing matchIndex + * when compressing using this dictionary */ + LZ4_resetStream(LZ4_dict); + + /* We always increment the offset by 64 KB, since, if the dict is longer, + * we truncate it to the last 64k, and if it's shorter, we still want to + * advance by a whole window length so we can provide the guarantee that + * there are only valid offsets in the window, which allows an optimization + * in LZ4_compress_fast_continue() where it uses noDictIssue even when the + * dictionary isn't a full 64k. */ + dict->currentOffset += 64 KB; + + if (dictSize < (int)HASH_UNIT) { + return 0; + } + + if ((dictEnd - p) > 64 KB) p = dictEnd - 64 KB; + dict->dictionary = p; + dict->dictSize = (U32)(dictEnd - p); + dict->tableType = (U32)tableType; + idx32 = dict->currentOffset - dict->dictSize; + + while (p <= dictEnd-HASH_UNIT) { + U32 const h = LZ4_hashPosition(p, tableType); + /* Note: overwriting => favors positions end of dictionary */ + LZ4_putIndexOnHash(idx32, h, dict->hashTable, tableType); + p+=3; idx32+=3; + } + + if (_ld == _ld_slow) { + /* Fill hash table with additional references, to improve compression capability */ + p = dict->dictionary; + idx32 = dict->currentOffset - dict->dictSize; + while (p <= dictEnd-HASH_UNIT) { + U32 const h = LZ4_hashPosition(p, tableType); + U32 const limit = dict->currentOffset - 64 KB; + if (LZ4_getIndexOnHash(h, dict->hashTable, tableType) <= limit) { + /* Note: not overwriting => favors positions beginning of dictionary */ + LZ4_putIndexOnHash(idx32, h, dict->hashTable, tableType); + } + p++; idx32++; + } + } + + return (int)dict->dictSize; +} + +int LZ4_loadDict(LZ4_stream_t* LZ4_dict, const char* dictionary, int dictSize) +{ + return LZ4_loadDict_internal(LZ4_dict, dictionary, dictSize, _ld_fast); +} + +int LZ4_loadDictSlow(LZ4_stream_t* LZ4_dict, const char* dictionary, int dictSize) +{ + return LZ4_loadDict_internal(LZ4_dict, dictionary, dictSize, _ld_slow); +} + +void LZ4_attach_dictionary(LZ4_stream_t* workingStream, const LZ4_stream_t* dictionaryStream) +{ + const LZ4_stream_t_internal* dictCtx = (dictionaryStream == NULL) ? NULL : + &(dictionaryStream->internal_donotuse); + + DEBUGLOG(4, "LZ4_attach_dictionary (%p, %p, size %u)", + workingStream, dictionaryStream, + dictCtx != NULL ? dictCtx->dictSize : 0); + + if (dictCtx != NULL) { + /* If the current offset is zero, we will never look in the + * external dictionary context, since there is no value a table + * entry can take that indicate a miss. In that case, we need + * to bump the offset to something non-zero. + */ + if (workingStream->internal_donotuse.currentOffset == 0) { + workingStream->internal_donotuse.currentOffset = 64 KB; + } + + /* Don't actually attach an empty dictionary. + */ + if (dictCtx->dictSize == 0) { + dictCtx = NULL; + } + } + workingStream->internal_donotuse.dictCtx = dictCtx; +} + + +static void LZ4_renormDictT(LZ4_stream_t_internal* LZ4_dict, int nextSize) +{ + assert(nextSize >= 0); + if (LZ4_dict->currentOffset + (unsigned)nextSize > 0x80000000) { /* potential ptrdiff_t overflow (32-bits mode) */ + /* rescale hash table */ + U32 const delta = LZ4_dict->currentOffset - 64 KB; + const BYTE* dictEnd = LZ4_dict->dictionary + LZ4_dict->dictSize; + int i; + DEBUGLOG(4, "LZ4_renormDictT"); + for (i=0; i<LZ4_HASH_SIZE_U32; i++) { + if (LZ4_dict->hashTable[i] < delta) LZ4_dict->hashTable[i]=0; + else LZ4_dict->hashTable[i] -= delta; + } + LZ4_dict->currentOffset = 64 KB; + if (LZ4_dict->dictSize > 64 KB) LZ4_dict->dictSize = 64 KB; + LZ4_dict->dictionary = dictEnd - LZ4_dict->dictSize; + } +} + + +int LZ4_compress_fast_continue (LZ4_stream_t* LZ4_stream, + const char* source, char* dest, + int inputSize, int maxOutputSize, + int acceleration) +{ + const tableType_t tableType = byU32; + LZ4_stream_t_internal* const streamPtr = &LZ4_stream->internal_donotuse; + const char* dictEnd = streamPtr->dictSize ? (const char*)streamPtr->dictionary + streamPtr->dictSize : NULL; + + DEBUGLOG(5, "LZ4_compress_fast_continue (inputSize=%i, dictSize=%u)", inputSize, streamPtr->dictSize); + + LZ4_renormDictT(streamPtr, inputSize); /* fix index overflow */ + if (acceleration < 1) acceleration = LZ4_ACCELERATION_DEFAULT; + if (acceleration > LZ4_ACCELERATION_MAX) acceleration = LZ4_ACCELERATION_MAX; + + /* invalidate tiny dictionaries */ + if ( (streamPtr->dictSize < 4) /* tiny dictionary : not enough for a hash */ + && (dictEnd != source) /* prefix mode */ + && (inputSize > 0) /* tolerance : don't lose history, in case next invocation would use prefix mode */ + && (streamPtr->dictCtx == NULL) /* usingDictCtx */ + ) { + DEBUGLOG(5, "LZ4_compress_fast_continue: dictSize(%u) at addr:%p is too small", streamPtr->dictSize, streamPtr->dictionary); + /* remove dictionary existence from history, to employ faster prefix mode */ + streamPtr->dictSize = 0; + streamPtr->dictionary = (const BYTE*)source; + dictEnd = source; + } + + /* Check overlapping input/dictionary space */ + { const char* const sourceEnd = source + inputSize; + if ((sourceEnd > (const char*)streamPtr->dictionary) && (sourceEnd < dictEnd)) { + streamPtr->dictSize = (U32)(dictEnd - sourceEnd); + if (streamPtr->dictSize > 64 KB) streamPtr->dictSize = 64 KB; + if (streamPtr->dictSize < 4) streamPtr->dictSize = 0; + streamPtr->dictionary = (const BYTE*)dictEnd - streamPtr->dictSize; + } + } + + /* prefix mode : source data follows dictionary */ + if (dictEnd == source) { + if ((streamPtr->dictSize < 64 KB) && (streamPtr->dictSize < streamPtr->currentOffset)) + return LZ4_compress_generic(streamPtr, source, dest, inputSize, NULL, maxOutputSize, limitedOutput, tableType, withPrefix64k, dictSmall, acceleration); + else + return LZ4_compress_generic(streamPtr, source, dest, inputSize, NULL, maxOutputSize, limitedOutput, tableType, withPrefix64k, noDictIssue, acceleration); + } + + /* external dictionary mode */ + { int result; + if (streamPtr->dictCtx) { + /* We depend here on the fact that dictCtx'es (produced by + * LZ4_loadDict) guarantee that their tables contain no references + * to offsets between dictCtx->currentOffset - 64 KB and + * dictCtx->currentOffset - dictCtx->dictSize. This makes it safe + * to use noDictIssue even when the dict isn't a full 64 KB. + */ + if (inputSize > 4 KB) { + /* For compressing large blobs, it is faster to pay the setup + * cost to copy the dictionary's tables into the active context, + * so that the compression loop is only looking into one table. + */ + LZ4_memcpy(streamPtr, streamPtr->dictCtx, sizeof(*streamPtr)); + result = LZ4_compress_generic(streamPtr, source, dest, inputSize, NULL, maxOutputSize, limitedOutput, tableType, usingExtDict, noDictIssue, acceleration); + } else { + result = LZ4_compress_generic(streamPtr, source, dest, inputSize, NULL, maxOutputSize, limitedOutput, tableType, usingDictCtx, noDictIssue, acceleration); + } + } else { /* small data <= 4 KB */ + if ((streamPtr->dictSize < 64 KB) && (streamPtr->dictSize < streamPtr->currentOffset)) { + result = LZ4_compress_generic(streamPtr, source, dest, inputSize, NULL, maxOutputSize, limitedOutput, tableType, usingExtDict, dictSmall, acceleration); + } else { + result = LZ4_compress_generic(streamPtr, source, dest, inputSize, NULL, maxOutputSize, limitedOutput, tableType, usingExtDict, noDictIssue, acceleration); + } + } + streamPtr->dictionary = (const BYTE*)source; + streamPtr->dictSize = (U32)inputSize; + return result; + } +} + + +/* Hidden debug function, to force-test external dictionary mode */ +int LZ4_compress_forceExtDict (LZ4_stream_t* LZ4_dict, const char* source, char* dest, int srcSize) +{ + LZ4_stream_t_internal* const streamPtr = &LZ4_dict->internal_donotuse; + int result; + + LZ4_renormDictT(streamPtr, srcSize); + + if ((streamPtr->dictSize < 64 KB) && (streamPtr->dictSize < streamPtr->currentOffset)) { + result = LZ4_compress_generic(streamPtr, source, dest, srcSize, NULL, 0, notLimited, byU32, usingExtDict, dictSmall, 1); + } else { + result = LZ4_compress_generic(streamPtr, source, dest, srcSize, NULL, 0, notLimited, byU32, usingExtDict, noDictIssue, 1); + } + + streamPtr->dictionary = (const BYTE*)source; + streamPtr->dictSize = (U32)srcSize; + + return result; +} + + +/*! LZ4_saveDict() : + * If previously compressed data block is not guaranteed to remain available at its memory location, + * save it into a safer place (char* safeBuffer). + * Note : no need to call LZ4_loadDict() afterwards, dictionary is immediately usable, + * one can therefore call LZ4_compress_fast_continue() right after. + * @return : saved dictionary size in bytes (necessarily <= dictSize), or 0 if error. + */ +int LZ4_saveDict (LZ4_stream_t* LZ4_dict, char* safeBuffer, int dictSize) +{ + LZ4_stream_t_internal* const dict = &LZ4_dict->internal_donotuse; + + DEBUGLOG(5, "LZ4_saveDict : dictSize=%i, safeBuffer=%p", dictSize, safeBuffer); + + if ((U32)dictSize > 64 KB) { dictSize = 64 KB; } /* useless to define a dictionary > 64 KB */ + if ((U32)dictSize > dict->dictSize) { dictSize = (int)dict->dictSize; } + + if (safeBuffer == NULL) assert(dictSize == 0); + if (dictSize > 0) { + const BYTE* const previousDictEnd = dict->dictionary + dict->dictSize; + assert(dict->dictionary); + LZ4_memmove(safeBuffer, previousDictEnd - dictSize, (size_t)dictSize); + } + + dict->dictionary = (const BYTE*)safeBuffer; + dict->dictSize = (U32)dictSize; + + return dictSize; +} + + + +/*-******************************* + * Decompression functions + ********************************/ + +typedef enum { decode_full_block = 0, partial_decode = 1 } earlyEnd_directive; + +#undef MIN +#define MIN(a,b) ( (a) < (b) ? (a) : (b) ) + + +/* variant for decompress_unsafe() + * does not know end of input + * presumes input is well formed + * note : will consume at least one byte */ +static size_t read_long_length_no_check(const BYTE** pp) +{ + size_t b, l = 0; + do { b = **pp; (*pp)++; l += b; } while (b==255); + DEBUGLOG(6, "read_long_length_no_check: +length=%zu using %zu input bytes", l, l/255 + 1) + return l; +} + +/* core decoder variant for LZ4_decompress_fast*() + * for legacy support only : these entry points are deprecated. + * - Presumes input is correctly formed (no defense vs malformed inputs) + * - Does not know input size (presume input buffer is "large enough") + * - Decompress a full block (only) + * @return : nb of bytes read from input. + * Note : this variant is not optimized for speed, just for maintenance. + * the goal is to remove support of decompress_fast*() variants by v2.0 +**/ +LZ4_FORCE_INLINE int +LZ4_decompress_unsafe_generic( + const BYTE* const istart, + BYTE* const ostart, + int decompressedSize, + + size_t prefixSize, + const BYTE* const dictStart, /* only if dict==usingExtDict */ + const size_t dictSize /* note: =0 if dictStart==NULL */ + ) +{ + const BYTE* ip = istart; + BYTE* op = (BYTE*)ostart; + BYTE* const oend = ostart + decompressedSize; + const BYTE* const prefixStart = ostart - prefixSize; + + DEBUGLOG(5, "LZ4_decompress_unsafe_generic"); + if (dictStart == NULL) assert(dictSize == 0); + + while (1) { + /* start new sequence */ + unsigned token = *ip++; + + /* literals */ + { size_t ll = token >> ML_BITS; + if (ll==15) { + /* long literal length */ + ll += read_long_length_no_check(&ip); + } + if ((size_t)(oend-op) < ll) return -1; /* output buffer overflow */ + LZ4_memmove(op, ip, ll); /* support in-place decompression */ + op += ll; + ip += ll; + if ((size_t)(oend-op) < MFLIMIT) { + if (op==oend) break; /* end of block */ + DEBUGLOG(5, "invalid: literals end at distance %zi from end of block", oend-op); + /* incorrect end of block : + * last match must start at least MFLIMIT==12 bytes before end of output block */ + return -1; + } } + + /* match */ + { size_t ml = token & 15; + size_t const offset = LZ4_readLE16(ip); + ip+=2; + + if (ml==15) { + /* long literal length */ + ml += read_long_length_no_check(&ip); + } + ml += MINMATCH; + + if ((size_t)(oend-op) < ml) return -1; /* output buffer overflow */ + + { const BYTE* match = op - offset; + + /* out of range */ + if (offset > (size_t)(op - prefixStart) + dictSize) { + DEBUGLOG(6, "offset out of range"); + return -1; + } + + /* check special case : extDict */ + if (offset > (size_t)(op - prefixStart)) { + /* extDict scenario */ + const BYTE* const dictEnd = dictStart + dictSize; + const BYTE* extMatch = dictEnd - (offset - (size_t)(op-prefixStart)); + size_t const extml = (size_t)(dictEnd - extMatch); + if (extml > ml) { + /* match entirely within extDict */ + LZ4_memmove(op, extMatch, ml); + op += ml; + ml = 0; + } else { + /* match split between extDict & prefix */ + LZ4_memmove(op, extMatch, extml); + op += extml; + ml -= extml; + } + match = prefixStart; + } + + /* match copy - slow variant, supporting overlap copy */ + { size_t u; + for (u=0; u<ml; u++) { + op[u] = match[u]; + } } } + op += ml; + if ((size_t)(oend-op) < LASTLITERALS) { + DEBUGLOG(5, "invalid: match ends at distance %zi from end of block", oend-op); + /* incorrect end of block : + * last match must stop at least LASTLITERALS==5 bytes before end of output block */ + return -1; + } + } /* match */ + } /* main loop */ + return (int)(ip - istart); +} + + +/* Read the variable-length literal or match length. + * + * @ip : input pointer + * @ilimit : position after which if length is not decoded, the input is necessarily corrupted. + * @initial_check - check ip >= ipmax before start of loop. Returns initial_error if so. + * @error (output) - error code. Must be set to 0 before call. +**/ +typedef size_t Rvl_t; +static const Rvl_t rvl_error = (Rvl_t)(-1); +LZ4_FORCE_INLINE Rvl_t +read_variable_length(const BYTE** ip, const BYTE* ilimit, + int initial_check) +{ + Rvl_t s, length = 0; + assert(ip != NULL); + assert(*ip != NULL); + assert(ilimit != NULL); + if (initial_check && unlikely((*ip) >= ilimit)) { /* read limit reached */ + return rvl_error; + } + s = **ip; + (*ip)++; + length += s; + if (unlikely((*ip) > ilimit)) { /* read limit reached */ + return rvl_error; + } + /* accumulator overflow detection (32-bit mode only) */ + if ((sizeof(length) < 8) && unlikely(length > ((Rvl_t)(-1)/2)) ) { + return rvl_error; + } + if (likely(s != 255)) return length; + do { + s = **ip; + (*ip)++; + length += s; + if (unlikely((*ip) > ilimit)) { /* read limit reached */ + return rvl_error; + } + /* accumulator overflow detection (32-bit mode only) */ + if ((sizeof(length) < 8) && unlikely(length > ((Rvl_t)(-1)/2)) ) { + return rvl_error; + } + } while (s == 255); + + return length; +} + +/*! LZ4_decompress_generic() : + * This generic decompression function covers all use cases. + * It shall be instantiated several times, using different sets of directives. + * Note that it is important for performance that this function really get inlined, + * in order to remove useless branches during compilation optimization. + */ +LZ4_FORCE_INLINE int +LZ4_decompress_generic( + const char* const src, + char* const dst, + int srcSize, + int outputSize, /* If endOnInput==endOnInputSize, this value is `dstCapacity` */ + + earlyEnd_directive partialDecoding, /* full, partial */ + dict_directive dict, /* noDict, withPrefix64k, usingExtDict */ + const BYTE* const lowPrefix, /* always <= dst, == dst when no prefix */ + const BYTE* const dictStart, /* only if dict==usingExtDict */ + const size_t dictSize /* note : = 0 if noDict */ + ) +{ + if ((src == NULL) || (outputSize < 0)) { return -1; } + + { const BYTE* ip = (const BYTE*) src; + const BYTE* const iend = ip + srcSize; + + BYTE* op = (BYTE*) dst; + BYTE* const oend = op + outputSize; + BYTE* cpy; + + const BYTE* const dictEnd = (dictStart == NULL) ? NULL : dictStart + dictSize; + + const int checkOffset = (dictSize < (int)(64 KB)); + + + /* Set up the "end" pointers for the shortcut. */ + const BYTE* const shortiend = iend - 14 /*maxLL*/ - 2 /*offset*/; + const BYTE* const shortoend = oend - 14 /*maxLL*/ - 18 /*maxML*/; + + const BYTE* match; + size_t offset; + unsigned token; + size_t length; + + + DEBUGLOG(5, "LZ4_decompress_generic (srcSize:%i, dstSize:%i)", srcSize, outputSize); + + /* Special cases */ + assert(lowPrefix <= op); + if (unlikely(outputSize==0)) { + /* Empty output buffer */ + if (partialDecoding) return 0; + return ((srcSize==1) && (*ip==0)) ? 0 : -1; + } + if (unlikely(srcSize==0)) { return -1; } + + /* LZ4_FAST_DEC_LOOP: + * designed for modern OoO performance cpus, + * where copying reliably 32-bytes is preferable to an unpredictable branch. + * note : fast loop may show a regression for some client arm chips. */ +#if LZ4_FAST_DEC_LOOP + if ((oend - op) < FASTLOOP_SAFE_DISTANCE) { + DEBUGLOG(6, "move to safe decode loop"); + goto safe_decode; + } + + /* Fast loop : decode sequences as long as output < oend-FASTLOOP_SAFE_DISTANCE */ + DEBUGLOG(6, "using fast decode loop"); + while (1) { + /* Main fastloop assertion: We can always wildcopy FASTLOOP_SAFE_DISTANCE */ + assert(oend - op >= FASTLOOP_SAFE_DISTANCE); + assert(ip < iend); + token = *ip++; + length = token >> ML_BITS; /* literal length */ + DEBUGLOG(7, "blockPos%6u: litLength token = %u", (unsigned)(op-(BYTE*)dst), (unsigned)length); + + /* decode literal length */ + if (length == RUN_MASK) { + size_t const addl = read_variable_length(&ip, iend-RUN_MASK, 1); + if (addl == rvl_error) { + DEBUGLOG(6, "error reading long literal length"); + goto _output_error; + } + length += addl; + if (unlikely((uptrval)(op)+length<(uptrval)(op))) { goto _output_error; } /* overflow detection */ + if (unlikely((uptrval)(ip)+length<(uptrval)(ip))) { goto _output_error; } /* overflow detection */ + + /* copy literals */ + LZ4_STATIC_ASSERT(MFLIMIT >= WILDCOPYLENGTH); + if ((op+length>oend-32) || (ip+length>iend-32)) { goto safe_literal_copy; } + LZ4_wildCopy32(op, ip, op+length); + ip += length; op += length; + } else if (ip <= iend-(16 + 1/*max lit + offset + nextToken*/)) { + /* We don't need to check oend, since we check it once for each loop below */ + DEBUGLOG(7, "copy %u bytes in a 16-bytes stripe", (unsigned)length); + /* Literals can only be <= 14, but hope compilers optimize better when copy by a register size */ + LZ4_memcpy(op, ip, 16); + ip += length; op += length; + } else { + goto safe_literal_copy; + } + + /* get offset */ + offset = LZ4_readLE16(ip); ip+=2; + DEBUGLOG(6, "blockPos%6u: offset = %u", (unsigned)(op-(BYTE*)dst), (unsigned)offset); + match = op - offset; + assert(match <= op); /* overflow check */ + + /* get matchlength */ + length = token & ML_MASK; + DEBUGLOG(7, " match length token = %u (len==%u)", (unsigned)length, (unsigned)length+MINMATCH); + + if (length == ML_MASK) { + size_t const addl = read_variable_length(&ip, iend - LASTLITERALS + 1, 0); + if (addl == rvl_error) { + DEBUGLOG(5, "error reading long match length"); + goto _output_error; + } + length += addl; + length += MINMATCH; + DEBUGLOG(7, " long match length == %u", (unsigned)length); + if (unlikely((uptrval)(op)+length<(uptrval)op)) { goto _output_error; } /* overflow detection */ + if (op + length >= oend - FASTLOOP_SAFE_DISTANCE) { + goto safe_match_copy; + } + } else { + length += MINMATCH; + if (op + length >= oend - FASTLOOP_SAFE_DISTANCE) { + DEBUGLOG(7, "moving to safe_match_copy (ml==%u)", (unsigned)length); + goto safe_match_copy; + } + + /* Fastpath check: skip LZ4_wildCopy32 when true */ + if ((dict == withPrefix64k) || (match >= lowPrefix)) { + if (offset >= 8) { + assert(match >= lowPrefix); + assert(match <= op); + assert(op + 18 <= oend); + + LZ4_memcpy(op, match, 8); + LZ4_memcpy(op+8, match+8, 8); + LZ4_memcpy(op+16, match+16, 2); + op += length; + continue; + } } } + + if ( checkOffset && (unlikely(match + dictSize < lowPrefix)) ) { + DEBUGLOG(5, "Error : pos=%zi, offset=%zi => outside buffers", op-lowPrefix, op-match); + goto _output_error; + } + /* match starting within external dictionary */ + if ((dict==usingExtDict) && (match < lowPrefix)) { + assert(dictEnd != NULL); + if (unlikely(op+length > oend-LASTLITERALS)) { + if (partialDecoding) { + DEBUGLOG(7, "partialDecoding: dictionary match, close to dstEnd"); + length = MIN(length, (size_t)(oend-op)); + } else { + DEBUGLOG(6, "end-of-block condition violated") + goto _output_error; + } } + + if (length <= (size_t)(lowPrefix-match)) { + /* match fits entirely within external dictionary : just copy */ + LZ4_memmove(op, dictEnd - (lowPrefix-match), length); + op += length; + } else { + /* match stretches into both external dictionary and current block */ + size_t const copySize = (size_t)(lowPrefix - match); + size_t const restSize = length - copySize; + LZ4_memcpy(op, dictEnd - copySize, copySize); + op += copySize; + if (restSize > (size_t)(op - lowPrefix)) { /* overlap copy */ + BYTE* const endOfMatch = op + restSize; + const BYTE* copyFrom = lowPrefix; + while (op < endOfMatch) { *op++ = *copyFrom++; } + } else { + LZ4_memcpy(op, lowPrefix, restSize); + op += restSize; + } } + continue; + } + + /* copy match within block */ + cpy = op + length; + + assert((op <= oend) && (oend-op >= 32)); + if (unlikely(offset<16)) { + LZ4_memcpy_using_offset(op, match, cpy, offset); + } else { + LZ4_wildCopy32(op, match, cpy); + } + + op = cpy; /* wildcopy correction */ + } + safe_decode: +#endif + + /* Main Loop : decode remaining sequences where output < FASTLOOP_SAFE_DISTANCE */ + DEBUGLOG(6, "using safe decode loop"); + while (1) { + assert(ip < iend); + token = *ip++; + length = token >> ML_BITS; /* literal length */ + DEBUGLOG(7, "blockPos%6u: litLength token = %u", (unsigned)(op-(BYTE*)dst), (unsigned)length); + + /* A two-stage shortcut for the most common case: + * 1) If the literal length is 0..14, and there is enough space, + * enter the shortcut and copy 16 bytes on behalf of the literals + * (in the fast mode, only 8 bytes can be safely copied this way). + * 2) Further if the match length is 4..18, copy 18 bytes in a similar + * manner; but we ensure that there's enough space in the output for + * those 18 bytes earlier, upon entering the shortcut (in other words, + * there is a combined check for both stages). + */ + if ( (length != RUN_MASK) + /* strictly "less than" on input, to re-enter the loop with at least one byte */ + && likely((ip < shortiend) & (op <= shortoend)) ) { + /* Copy the literals */ + LZ4_memcpy(op, ip, 16); + op += length; ip += length; + + /* The second stage: prepare for match copying, decode full info. + * If it doesn't work out, the info won't be wasted. */ + length = token & ML_MASK; /* match length */ + DEBUGLOG(7, "blockPos%6u: matchLength token = %u (len=%u)", (unsigned)(op-(BYTE*)dst), (unsigned)length, (unsigned)length + 4); + offset = LZ4_readLE16(ip); ip += 2; + match = op - offset; + assert(match <= op); /* check overflow */ + + /* Do not deal with overlapping matches. */ + if ( (length != ML_MASK) + && (offset >= 8) + && (dict==withPrefix64k || match >= lowPrefix) ) { + /* Copy the match. */ + LZ4_memcpy(op + 0, match + 0, 8); + LZ4_memcpy(op + 8, match + 8, 8); + LZ4_memcpy(op +16, match +16, 2); + op += length + MINMATCH; + /* Both stages worked, load the next token. */ + continue; + } + + /* The second stage didn't work out, but the info is ready. + * Propel it right to the point of match copying. */ + goto _copy_match; + } + + /* decode literal length */ + if (length == RUN_MASK) { + size_t const addl = read_variable_length(&ip, iend-RUN_MASK, 1); + if (addl == rvl_error) { goto _output_error; } + length += addl; + if (unlikely((uptrval)(op)+length<(uptrval)(op))) { goto _output_error; } /* overflow detection */ + if (unlikely((uptrval)(ip)+length<(uptrval)(ip))) { goto _output_error; } /* overflow detection */ + } + +#if LZ4_FAST_DEC_LOOP + safe_literal_copy: +#endif + /* copy literals */ + cpy = op+length; + + LZ4_STATIC_ASSERT(MFLIMIT >= WILDCOPYLENGTH); + if ((cpy>oend-MFLIMIT) || (ip+length>iend-(2+1+LASTLITERALS))) { + /* We've either hit the input parsing restriction or the output parsing restriction. + * In the normal scenario, decoding a full block, it must be the last sequence, + * otherwise it's an error (invalid input or dimensions). + * In partialDecoding scenario, it's necessary to ensure there is no buffer overflow. + */ + if (partialDecoding) { + /* Since we are partial decoding we may be in this block because of the output parsing + * restriction, which is not valid since the output buffer is allowed to be undersized. + */ + DEBUGLOG(7, "partialDecoding: copying literals, close to input or output end") + DEBUGLOG(7, "partialDecoding: literal length = %u", (unsigned)length); + DEBUGLOG(7, "partialDecoding: remaining space in dstBuffer : %i", (int)(oend - op)); + DEBUGLOG(7, "partialDecoding: remaining space in srcBuffer : %i", (int)(iend - ip)); + /* Finishing in the middle of a literals segment, + * due to lack of input. + */ + if (ip+length > iend) { + length = (size_t)(iend-ip); + cpy = op + length; + } + /* Finishing in the middle of a literals segment, + * due to lack of output space. + */ + if (cpy > oend) { + cpy = oend; + assert(op<=oend); + length = (size_t)(oend-op); + } + } else { + /* We must be on the last sequence (or invalid) because of the parsing limitations + * so check that we exactly consume the input and don't overrun the output buffer. + */ + if ((ip+length != iend) || (cpy > oend)) { + DEBUGLOG(5, "should have been last run of literals") + DEBUGLOG(5, "ip(%p) + length(%i) = %p != iend (%p)", ip, (int)length, ip+length, iend); + DEBUGLOG(5, "or cpy(%p) > (oend-MFLIMIT)(%p)", cpy, oend-MFLIMIT); + DEBUGLOG(5, "after writing %u bytes / %i bytes available", (unsigned)(op-(BYTE*)dst), outputSize); + goto _output_error; + } + } + LZ4_memmove(op, ip, length); /* supports overlapping memory regions, for in-place decompression scenarios */ + ip += length; + op += length; + /* Necessarily EOF when !partialDecoding. + * When partialDecoding, it is EOF if we've either + * filled the output buffer or + * can't proceed with reading an offset for following match. + */ + if (!partialDecoding || (cpy == oend) || (ip >= (iend-2))) { + break; + } + } else { + LZ4_wildCopy8(op, ip, cpy); /* can overwrite up to 8 bytes beyond cpy */ + ip += length; op = cpy; + } + + /* get offset */ + offset = LZ4_readLE16(ip); ip+=2; + match = op - offset; + + /* get matchlength */ + length = token & ML_MASK; + DEBUGLOG(7, "blockPos%6u: matchLength token = %u", (unsigned)(op-(BYTE*)dst), (unsigned)length); + + _copy_match: + if (length == ML_MASK) { + size_t const addl = read_variable_length(&ip, iend - LASTLITERALS + 1, 0); + if (addl == rvl_error) { goto _output_error; } + length += addl; + if (unlikely((uptrval)(op)+length<(uptrval)op)) goto _output_error; /* overflow detection */ + } + length += MINMATCH; + +#if LZ4_FAST_DEC_LOOP + safe_match_copy: +#endif + if ((checkOffset) && (unlikely(match + dictSize < lowPrefix))) goto _output_error; /* Error : offset outside buffers */ + /* match starting within external dictionary */ + if ((dict==usingExtDict) && (match < lowPrefix)) { + assert(dictEnd != NULL); + if (unlikely(op+length > oend-LASTLITERALS)) { + if (partialDecoding) length = MIN(length, (size_t)(oend-op)); + else goto _output_error; /* doesn't respect parsing restriction */ + } + + if (length <= (size_t)(lowPrefix-match)) { + /* match fits entirely within external dictionary : just copy */ + LZ4_memmove(op, dictEnd - (lowPrefix-match), length); + op += length; + } else { + /* match stretches into both external dictionary and current block */ + size_t const copySize = (size_t)(lowPrefix - match); + size_t const restSize = length - copySize; + LZ4_memcpy(op, dictEnd - copySize, copySize); + op += copySize; + if (restSize > (size_t)(op - lowPrefix)) { /* overlap copy */ + BYTE* const endOfMatch = op + restSize; + const BYTE* copyFrom = lowPrefix; + while (op < endOfMatch) *op++ = *copyFrom++; + } else { + LZ4_memcpy(op, lowPrefix, restSize); + op += restSize; + } } + continue; + } + assert(match >= lowPrefix); + + /* copy match within block */ + cpy = op + length; + + /* partialDecoding : may end anywhere within the block */ + assert(op<=oend); + if (partialDecoding && (cpy > oend-MATCH_SAFEGUARD_DISTANCE)) { + size_t const mlen = MIN(length, (size_t)(oend-op)); + const BYTE* const matchEnd = match + mlen; + BYTE* const copyEnd = op + mlen; + if (matchEnd > op) { /* overlap copy */ + while (op < copyEnd) { *op++ = *match++; } + } else { + LZ4_memcpy(op, match, mlen); + } + op = copyEnd; + if (op == oend) { break; } + continue; + } + + if (unlikely(offset<8)) { + LZ4_write32(op, 0); /* silence msan warning when offset==0 */ + op[0] = match[0]; + op[1] = match[1]; + op[2] = match[2]; + op[3] = match[3]; + match += inc32table[offset]; + LZ4_memcpy(op+4, match, 4); + match -= dec64table[offset]; + } else { + LZ4_memcpy(op, match, 8); + match += 8; + } + op += 8; + + if (unlikely(cpy > oend-MATCH_SAFEGUARD_DISTANCE)) { + BYTE* const oCopyLimit = oend - (WILDCOPYLENGTH-1); + if (cpy > oend-LASTLITERALS) { goto _output_error; } /* Error : last LASTLITERALS bytes must be literals (uncompressed) */ + if (op < oCopyLimit) { + LZ4_wildCopy8(op, match, oCopyLimit); + match += oCopyLimit - op; + op = oCopyLimit; + } + while (op < cpy) { *op++ = *match++; } + } else { + LZ4_memcpy(op, match, 8); + if (length > 16) { LZ4_wildCopy8(op+8, match+8, cpy); } + } + op = cpy; /* wildcopy correction */ + } + + /* end of decoding */ + DEBUGLOG(5, "decoded %i bytes", (int) (((char*)op)-dst)); + return (int) (((char*)op)-dst); /* Nb of output bytes decoded */ + + /* Overflow error detected */ + _output_error: + return (int) (-(((const char*)ip)-src))-1; + } +} + + +/*===== Instantiate the API decoding functions. =====*/ + +LZ4_FORCE_O2 +int LZ4_decompress_safe(const char* source, char* dest, int compressedSize, int maxDecompressedSize) +{ + return LZ4_decompress_generic(source, dest, compressedSize, maxDecompressedSize, + decode_full_block, noDict, + (BYTE*)dest, NULL, 0); +} + +LZ4_FORCE_O2 +int LZ4_decompress_safe_partial(const char* src, char* dst, int compressedSize, int targetOutputSize, int dstCapacity) +{ + dstCapacity = MIN(targetOutputSize, dstCapacity); + return LZ4_decompress_generic(src, dst, compressedSize, dstCapacity, + partial_decode, + noDict, (BYTE*)dst, NULL, 0); +} + +LZ4_FORCE_O2 +int LZ4_decompress_fast(const char* source, char* dest, int originalSize) +{ + DEBUGLOG(5, "LZ4_decompress_fast"); + return LZ4_decompress_unsafe_generic( + (const BYTE*)source, (BYTE*)dest, originalSize, + 0, NULL, 0); +} + +/*===== Instantiate a few more decoding cases, used more than once. =====*/ + +LZ4_FORCE_O2 /* Exported, an obsolete API function. */ +int LZ4_decompress_safe_withPrefix64k(const char* source, char* dest, int compressedSize, int maxOutputSize) +{ + return LZ4_decompress_generic(source, dest, compressedSize, maxOutputSize, + decode_full_block, withPrefix64k, + (BYTE*)dest - 64 KB, NULL, 0); +} + +LZ4_FORCE_O2 +static int LZ4_decompress_safe_partial_withPrefix64k(const char* source, char* dest, int compressedSize, int targetOutputSize, int dstCapacity) +{ + dstCapacity = MIN(targetOutputSize, dstCapacity); + return LZ4_decompress_generic(source, dest, compressedSize, dstCapacity, + partial_decode, withPrefix64k, + (BYTE*)dest - 64 KB, NULL, 0); +} + +/* Another obsolete API function, paired with the previous one. */ +int LZ4_decompress_fast_withPrefix64k(const char* source, char* dest, int originalSize) +{ + return LZ4_decompress_unsafe_generic( + (const BYTE*)source, (BYTE*)dest, originalSize, + 64 KB, NULL, 0); +} + +LZ4_FORCE_O2 +static int LZ4_decompress_safe_withSmallPrefix(const char* source, char* dest, int compressedSize, int maxOutputSize, + size_t prefixSize) +{ + return LZ4_decompress_generic(source, dest, compressedSize, maxOutputSize, + decode_full_block, noDict, + (BYTE*)dest-prefixSize, NULL, 0); +} + +LZ4_FORCE_O2 +static int LZ4_decompress_safe_partial_withSmallPrefix(const char* source, char* dest, int compressedSize, int targetOutputSize, int dstCapacity, + size_t prefixSize) +{ + dstCapacity = MIN(targetOutputSize, dstCapacity); + return LZ4_decompress_generic(source, dest, compressedSize, dstCapacity, + partial_decode, noDict, + (BYTE*)dest-prefixSize, NULL, 0); +} + +LZ4_FORCE_O2 +int LZ4_decompress_safe_forceExtDict(const char* source, char* dest, + int compressedSize, int maxOutputSize, + const void* dictStart, size_t dictSize) +{ + DEBUGLOG(5, "LZ4_decompress_safe_forceExtDict"); + return LZ4_decompress_generic(source, dest, compressedSize, maxOutputSize, + decode_full_block, usingExtDict, + (BYTE*)dest, (const BYTE*)dictStart, dictSize); +} + +LZ4_FORCE_O2 +int LZ4_decompress_safe_partial_forceExtDict(const char* source, char* dest, + int compressedSize, int targetOutputSize, int dstCapacity, + const void* dictStart, size_t dictSize) +{ + dstCapacity = MIN(targetOutputSize, dstCapacity); + return LZ4_decompress_generic(source, dest, compressedSize, dstCapacity, + partial_decode, usingExtDict, + (BYTE*)dest, (const BYTE*)dictStart, dictSize); +} + +LZ4_FORCE_O2 +static int LZ4_decompress_fast_extDict(const char* source, char* dest, int originalSize, + const void* dictStart, size_t dictSize) +{ + return LZ4_decompress_unsafe_generic( + (const BYTE*)source, (BYTE*)dest, originalSize, + 0, (const BYTE*)dictStart, dictSize); +} + +/* The "double dictionary" mode, for use with e.g. ring buffers: the first part + * of the dictionary is passed as prefix, and the second via dictStart + dictSize. + * These routines are used only once, in LZ4_decompress_*_continue(). + */ +LZ4_FORCE_INLINE +int LZ4_decompress_safe_doubleDict(const char* source, char* dest, int compressedSize, int maxOutputSize, + size_t prefixSize, const void* dictStart, size_t dictSize) +{ + return LZ4_decompress_generic(source, dest, compressedSize, maxOutputSize, + decode_full_block, usingExtDict, + (BYTE*)dest-prefixSize, (const BYTE*)dictStart, dictSize); +} + +/*===== streaming decompression functions =====*/ + +#if !defined(LZ4_STATIC_LINKING_ONLY_DISABLE_MEMORY_ALLOCATION) +LZ4_streamDecode_t* LZ4_createStreamDecode(void) +{ + LZ4_STATIC_ASSERT(sizeof(LZ4_streamDecode_t) >= sizeof(LZ4_streamDecode_t_internal)); + return (LZ4_streamDecode_t*) ALLOC_AND_ZERO(sizeof(LZ4_streamDecode_t)); +} + +int LZ4_freeStreamDecode (LZ4_streamDecode_t* LZ4_stream) +{ + if (LZ4_stream == NULL) { return 0; } /* support free on NULL */ + FREEMEM(LZ4_stream); + return 0; +} +#endif + +/*! LZ4_setStreamDecode() : + * Use this function to instruct where to find the dictionary. + * This function is not necessary if previous data is still available where it was decoded. + * Loading a size of 0 is allowed (same effect as no dictionary). + * @return : 1 if OK, 0 if error + */ +int LZ4_setStreamDecode (LZ4_streamDecode_t* LZ4_streamDecode, const char* dictionary, int dictSize) +{ + LZ4_streamDecode_t_internal* lz4sd = &LZ4_streamDecode->internal_donotuse; + lz4sd->prefixSize = (size_t)dictSize; + if (dictSize) { + assert(dictionary != NULL); + lz4sd->prefixEnd = (const BYTE*) dictionary + dictSize; + } else { + lz4sd->prefixEnd = (const BYTE*) dictionary; + } + lz4sd->externalDict = NULL; + lz4sd->extDictSize = 0; + return 1; +} + +/*! LZ4_decoderRingBufferSize() : + * when setting a ring buffer for streaming decompression (optional scenario), + * provides the minimum size of this ring buffer + * to be compatible with any source respecting maxBlockSize condition. + * Note : in a ring buffer scenario, + * blocks are presumed decompressed next to each other. + * When not enough space remains for next block (remainingSize < maxBlockSize), + * decoding resumes from beginning of ring buffer. + * @return : minimum ring buffer size, + * or 0 if there is an error (invalid maxBlockSize). + */ +int LZ4_decoderRingBufferSize(int maxBlockSize) +{ + if (maxBlockSize < 0) return 0; + if (maxBlockSize > LZ4_MAX_INPUT_SIZE) return 0; + if (maxBlockSize < 16) maxBlockSize = 16; + return LZ4_DECODER_RING_BUFFER_SIZE(maxBlockSize); +} + +/* +*_continue() : + These decoding functions allow decompression of multiple blocks in "streaming" mode. + Previously decoded blocks must still be available at the memory position where they were decoded. + If it's not possible, save the relevant part of decoded data into a safe buffer, + and indicate where it stands using LZ4_setStreamDecode() +*/ +LZ4_FORCE_O2 +int LZ4_decompress_safe_continue (LZ4_streamDecode_t* LZ4_streamDecode, const char* source, char* dest, int compressedSize, int maxOutputSize) +{ + LZ4_streamDecode_t_internal* lz4sd = &LZ4_streamDecode->internal_donotuse; + int result; + + if (lz4sd->prefixSize == 0) { + /* The first call, no dictionary yet. */ + assert(lz4sd->extDictSize == 0); + result = LZ4_decompress_safe(source, dest, compressedSize, maxOutputSize); + if (result <= 0) return result; + lz4sd->prefixSize = (size_t)result; + lz4sd->prefixEnd = (BYTE*)dest + result; + } else if (lz4sd->prefixEnd == (BYTE*)dest) { + /* They're rolling the current segment. */ + if (lz4sd->prefixSize >= 64 KB - 1) + result = LZ4_decompress_safe_withPrefix64k(source, dest, compressedSize, maxOutputSize); + else if (lz4sd->extDictSize == 0) + result = LZ4_decompress_safe_withSmallPrefix(source, dest, compressedSize, maxOutputSize, + lz4sd->prefixSize); + else + result = LZ4_decompress_safe_doubleDict(source, dest, compressedSize, maxOutputSize, + lz4sd->prefixSize, lz4sd->externalDict, lz4sd->extDictSize); + if (result <= 0) return result; + lz4sd->prefixSize += (size_t)result; + lz4sd->prefixEnd += result; + } else { + /* The buffer wraps around, or they're switching to another buffer. */ + lz4sd->extDictSize = lz4sd->prefixSize; + lz4sd->externalDict = lz4sd->prefixEnd - lz4sd->extDictSize; + result = LZ4_decompress_safe_forceExtDict(source, dest, compressedSize, maxOutputSize, + lz4sd->externalDict, lz4sd->extDictSize); + if (result <= 0) return result; + lz4sd->prefixSize = (size_t)result; + lz4sd->prefixEnd = (BYTE*)dest + result; + } + + return result; +} + +LZ4_FORCE_O2 int +LZ4_decompress_fast_continue (LZ4_streamDecode_t* LZ4_streamDecode, + const char* source, char* dest, int originalSize) +{ + LZ4_streamDecode_t_internal* const lz4sd = + (assert(LZ4_streamDecode!=NULL), &LZ4_streamDecode->internal_donotuse); + int result; + + DEBUGLOG(5, "LZ4_decompress_fast_continue (toDecodeSize=%i)", originalSize); + assert(originalSize >= 0); + + if (lz4sd->prefixSize == 0) { + DEBUGLOG(5, "first invocation : no prefix nor extDict"); + assert(lz4sd->extDictSize == 0); + result = LZ4_decompress_fast(source, dest, originalSize); + if (result <= 0) return result; + lz4sd->prefixSize = (size_t)originalSize; + lz4sd->prefixEnd = (BYTE*)dest + originalSize; + } else if (lz4sd->prefixEnd == (BYTE*)dest) { + DEBUGLOG(5, "continue using existing prefix"); + result = LZ4_decompress_unsafe_generic( + (const BYTE*)source, (BYTE*)dest, originalSize, + lz4sd->prefixSize, + lz4sd->externalDict, lz4sd->extDictSize); + if (result <= 0) return result; + lz4sd->prefixSize += (size_t)originalSize; + lz4sd->prefixEnd += originalSize; + } else { + DEBUGLOG(5, "prefix becomes extDict"); + lz4sd->extDictSize = lz4sd->prefixSize; + lz4sd->externalDict = lz4sd->prefixEnd - lz4sd->extDictSize; + result = LZ4_decompress_fast_extDict(source, dest, originalSize, + lz4sd->externalDict, lz4sd->extDictSize); + if (result <= 0) return result; + lz4sd->prefixSize = (size_t)originalSize; + lz4sd->prefixEnd = (BYTE*)dest + originalSize; + } + + return result; +} + + +/* +Advanced decoding functions : +*_usingDict() : + These decoding functions work the same as "_continue" ones, + the dictionary must be explicitly provided within parameters +*/ + +int LZ4_decompress_safe_usingDict(const char* source, char* dest, int compressedSize, int maxOutputSize, const char* dictStart, int dictSize) +{ + if (dictSize==0) + return LZ4_decompress_safe(source, dest, compressedSize, maxOutputSize); + if (dictStart+dictSize == dest) { + if (dictSize >= 64 KB - 1) { + return LZ4_decompress_safe_withPrefix64k(source, dest, compressedSize, maxOutputSize); + } + assert(dictSize >= 0); + return LZ4_decompress_safe_withSmallPrefix(source, dest, compressedSize, maxOutputSize, (size_t)dictSize); + } + assert(dictSize >= 0); + return LZ4_decompress_safe_forceExtDict(source, dest, compressedSize, maxOutputSize, dictStart, (size_t)dictSize); +} + +int LZ4_decompress_safe_partial_usingDict(const char* source, char* dest, int compressedSize, int targetOutputSize, int dstCapacity, const char* dictStart, int dictSize) +{ + if (dictSize==0) + return LZ4_decompress_safe_partial(source, dest, compressedSize, targetOutputSize, dstCapacity); + if (dictStart+dictSize == dest) { + if (dictSize >= 64 KB - 1) { + return LZ4_decompress_safe_partial_withPrefix64k(source, dest, compressedSize, targetOutputSize, dstCapacity); + } + assert(dictSize >= 0); + return LZ4_decompress_safe_partial_withSmallPrefix(source, dest, compressedSize, targetOutputSize, dstCapacity, (size_t)dictSize); + } + assert(dictSize >= 0); + return LZ4_decompress_safe_partial_forceExtDict(source, dest, compressedSize, targetOutputSize, dstCapacity, dictStart, (size_t)dictSize); +} + +int LZ4_decompress_fast_usingDict(const char* source, char* dest, int originalSize, const char* dictStart, int dictSize) +{ + if (dictSize==0 || dictStart+dictSize == dest) + return LZ4_decompress_unsafe_generic( + (const BYTE*)source, (BYTE*)dest, originalSize, + (size_t)dictSize, NULL, 0); + assert(dictSize >= 0); + return LZ4_decompress_fast_extDict(source, dest, originalSize, dictStart, (size_t)dictSize); +} + + +/*=************************************************* +* Obsolete Functions +***************************************************/ +/* obsolete compression functions */ +int LZ4_compress_limitedOutput(const char* source, char* dest, int inputSize, int maxOutputSize) +{ + return LZ4_compress_default(source, dest, inputSize, maxOutputSize); +} +int LZ4_compress(const char* src, char* dest, int srcSize) +{ + return LZ4_compress_default(src, dest, srcSize, LZ4_compressBound(srcSize)); +} +int LZ4_compress_limitedOutput_withState (void* state, const char* src, char* dst, int srcSize, int dstSize) +{ + return LZ4_compress_fast_extState(state, src, dst, srcSize, dstSize, 1); +} +int LZ4_compress_withState (void* state, const char* src, char* dst, int srcSize) +{ + return LZ4_compress_fast_extState(state, src, dst, srcSize, LZ4_compressBound(srcSize), 1); +} +int LZ4_compress_limitedOutput_continue (LZ4_stream_t* LZ4_stream, const char* src, char* dst, int srcSize, int dstCapacity) +{ + return LZ4_compress_fast_continue(LZ4_stream, src, dst, srcSize, dstCapacity, 1); +} +int LZ4_compress_continue (LZ4_stream_t* LZ4_stream, const char* source, char* dest, int inputSize) +{ + return LZ4_compress_fast_continue(LZ4_stream, source, dest, inputSize, LZ4_compressBound(inputSize), 1); +} + +/* +These decompression functions are deprecated and should no longer be used. +They are only provided here for compatibility with older user programs. +- LZ4_uncompress is totally equivalent to LZ4_decompress_fast +- LZ4_uncompress_unknownOutputSize is totally equivalent to LZ4_decompress_safe +*/ +int LZ4_uncompress (const char* source, char* dest, int outputSize) +{ + return LZ4_decompress_fast(source, dest, outputSize); +} +int LZ4_uncompress_unknownOutputSize (const char* source, char* dest, int isize, int maxOutputSize) +{ + return LZ4_decompress_safe(source, dest, isize, maxOutputSize); +} + +/* Obsolete Streaming functions */ + +int LZ4_sizeofStreamState(void) { return sizeof(LZ4_stream_t); } + +int LZ4_resetStreamState(void* state, char* inputBuffer) +{ + (void)inputBuffer; + LZ4_resetStream((LZ4_stream_t*)state); + return 0; +} + +#if !defined(LZ4_STATIC_LINKING_ONLY_DISABLE_MEMORY_ALLOCATION) +void* LZ4_create (char* inputBuffer) +{ + (void)inputBuffer; + return LZ4_createStream(); +} +#endif + +char* LZ4_slideInputBuffer (void* state) +{ + /* avoid const char * -> char * conversion warning */ + return (char *)(uptrval)((LZ4_stream_t*)state)->internal_donotuse.dictionary; +} + +#endif /* LZ4_COMMONDEFS_ONLY */ diff --git a/driver/dist/vendor/lz4/lz4.h b/driver/dist/vendor/lz4/lz4.h @@ -0,0 +1,884 @@ +/* + * LZ4 - Fast LZ compression algorithm + * Header File + * Copyright (C) 2011-2023, Yann Collet. + + BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php) + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above + copyright notice, this list of conditions and the following disclaimer + in the documentation and/or other materials provided with the + distribution. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + + You can contact the author at : + - LZ4 homepage : http://www.lz4.org + - LZ4 source repository : https://github.com/lz4/lz4 +*/ +#if defined (__cplusplus) +extern "C" { +#endif + +#ifndef LZ4_H_2983827168210 +#define LZ4_H_2983827168210 + +/* --- Dependency --- */ +#include <stddef.h> /* size_t */ + + +/** + Introduction + + LZ4 is lossless compression algorithm, providing compression speed >500 MB/s per core, + scalable with multi-cores CPU. It features an extremely fast decoder, with speed in + multiple GB/s per core, typically reaching RAM speed limits on multi-core systems. + + The LZ4 compression library provides in-memory compression and decompression functions. + It gives full buffer control to user. + Compression can be done in: + - a single step (described as Simple Functions) + - a single step, reusing a context (described in Advanced Functions) + - unbounded multiple steps (described as Streaming compression) + + lz4.h generates and decodes LZ4-compressed blocks (doc/lz4_Block_format.md). + Decompressing such a compressed block requires additional metadata. + Exact metadata depends on exact decompression function. + For the typical case of LZ4_decompress_safe(), + metadata includes block's compressed size, and maximum bound of decompressed size. + Each application is free to encode and pass such metadata in whichever way it wants. + + lz4.h only handle blocks, it can not generate Frames. + + Blocks are different from Frames (doc/lz4_Frame_format.md). + Frames bundle both blocks and metadata in a specified manner. + Embedding metadata is required for compressed data to be self-contained and portable. + Frame format is delivered through a companion API, declared in lz4frame.h. + The `lz4` CLI can only manage frames. +*/ + +/*^*************************************************************** +* Export parameters +*****************************************************************/ +/* +* LZ4_DLL_EXPORT : +* Enable exporting of functions when building a Windows DLL +* LZ4LIB_VISIBILITY : +* Control library symbols visibility. +*/ +#ifndef LZ4LIB_VISIBILITY +# if defined(__GNUC__) && (__GNUC__ >= 4) +# define LZ4LIB_VISIBILITY __attribute__ ((visibility ("default"))) +# else +# define LZ4LIB_VISIBILITY +# endif +#endif +#if defined(LZ4_DLL_EXPORT) && (LZ4_DLL_EXPORT==1) +# define LZ4LIB_API __declspec(dllexport) LZ4LIB_VISIBILITY +#elif defined(LZ4_DLL_IMPORT) && (LZ4_DLL_IMPORT==1) +# define LZ4LIB_API __declspec(dllimport) LZ4LIB_VISIBILITY /* It isn't required but allows to generate better code, saving a function pointer load from the IAT and an indirect jump.*/ +#else +# define LZ4LIB_API LZ4LIB_VISIBILITY +#endif + +/*! LZ4_FREESTANDING : + * When this macro is set to 1, it enables "freestanding mode" that is + * suitable for typical freestanding environment which doesn't support + * standard C library. + * + * - LZ4_FREESTANDING is a compile-time switch. + * - It requires the following macros to be defined: + * LZ4_memcpy, LZ4_memmove, LZ4_memset. + * - It only enables LZ4/HC functions which don't use heap. + * All LZ4F_* functions are not supported. + * - See tests/freestanding.c to check its basic setup. + */ +#if defined(LZ4_FREESTANDING) && (LZ4_FREESTANDING == 1) +# define LZ4_HEAPMODE 0 +# define LZ4HC_HEAPMODE 0 +# define LZ4_STATIC_LINKING_ONLY_DISABLE_MEMORY_ALLOCATION 1 +# if !defined(LZ4_memcpy) +# error "LZ4_FREESTANDING requires macro 'LZ4_memcpy'." +# endif +# if !defined(LZ4_memset) +# error "LZ4_FREESTANDING requires macro 'LZ4_memset'." +# endif +# if !defined(LZ4_memmove) +# error "LZ4_FREESTANDING requires macro 'LZ4_memmove'." +# endif +#elif ! defined(LZ4_FREESTANDING) +# define LZ4_FREESTANDING 0 +#endif + + +/*------ Version ------*/ +#define LZ4_VERSION_MAJOR 1 /* for breaking interface changes */ +#define LZ4_VERSION_MINOR 10 /* for new (non-breaking) interface capabilities */ +#define LZ4_VERSION_RELEASE 0 /* for tweaks, bug-fixes, or development */ + +#define LZ4_VERSION_NUMBER (LZ4_VERSION_MAJOR *100*100 + LZ4_VERSION_MINOR *100 + LZ4_VERSION_RELEASE) + +#define LZ4_LIB_VERSION LZ4_VERSION_MAJOR.LZ4_VERSION_MINOR.LZ4_VERSION_RELEASE +#define LZ4_QUOTE(str) #str +#define LZ4_EXPAND_AND_QUOTE(str) LZ4_QUOTE(str) +#define LZ4_VERSION_STRING LZ4_EXPAND_AND_QUOTE(LZ4_LIB_VERSION) /* requires v1.7.3+ */ + +LZ4LIB_API int LZ4_versionNumber (void); /**< library version number; useful to check dll version; requires v1.3.0+ */ +LZ4LIB_API const char* LZ4_versionString (void); /**< library version string; useful to check dll version; requires v1.7.5+ */ + + +/*-************************************ +* Tuning memory usage +**************************************/ +/*! + * LZ4_MEMORY_USAGE : + * Can be selected at compile time, by setting LZ4_MEMORY_USAGE. + * Memory usage formula : N->2^N Bytes (examples : 10 -> 1KB; 12 -> 4KB ; 16 -> 64KB; 20 -> 1MB) + * Increasing memory usage improves compression ratio, generally at the cost of speed. + * Reduced memory usage may improve speed at the cost of ratio, thanks to better cache locality. + * Default value is 14, for 16KB, which nicely fits into most L1 caches. + */ +#ifndef LZ4_MEMORY_USAGE +# define LZ4_MEMORY_USAGE LZ4_MEMORY_USAGE_DEFAULT +#endif + +/* These are absolute limits, they should not be changed by users */ +#define LZ4_MEMORY_USAGE_MIN 10 +#define LZ4_MEMORY_USAGE_DEFAULT 14 +#define LZ4_MEMORY_USAGE_MAX 20 + +#if (LZ4_MEMORY_USAGE < LZ4_MEMORY_USAGE_MIN) +# error "LZ4_MEMORY_USAGE is too small !" +#endif + +#if (LZ4_MEMORY_USAGE > LZ4_MEMORY_USAGE_MAX) +# error "LZ4_MEMORY_USAGE is too large !" +#endif + +/*-************************************ +* Simple Functions +**************************************/ +/*! LZ4_compress_default() : + * Compresses 'srcSize' bytes from buffer 'src' + * into already allocated 'dst' buffer of size 'dstCapacity'. + * Compression is guaranteed to succeed if 'dstCapacity' >= LZ4_compressBound(srcSize). + * It also runs faster, so it's a recommended setting. + * If the function cannot compress 'src' into a more limited 'dst' budget, + * compression stops *immediately*, and the function result is zero. + * In which case, 'dst' content is undefined (invalid). + * srcSize : max supported value is LZ4_MAX_INPUT_SIZE. + * dstCapacity : size of buffer 'dst' (which must be already allocated) + * @return : the number of bytes written into buffer 'dst' (necessarily <= dstCapacity) + * or 0 if compression fails + * Note : This function is protected against buffer overflow scenarios (never writes outside 'dst' buffer, nor read outside 'source' buffer). + */ +LZ4LIB_API int LZ4_compress_default(const char* src, char* dst, int srcSize, int dstCapacity); + +/*! LZ4_decompress_safe() : + * @compressedSize : is the exact complete size of the compressed block. + * @dstCapacity : is the size of destination buffer (which must be already allocated), + * presumed an upper bound of decompressed size. + * @return : the number of bytes decompressed into destination buffer (necessarily <= dstCapacity) + * If destination buffer is not large enough, decoding will stop and output an error code (negative value). + * If the source stream is detected malformed, the function will stop decoding and return a negative result. + * Note 1 : This function is protected against malicious data packets : + * it will never writes outside 'dst' buffer, nor read outside 'source' buffer, + * even if the compressed block is maliciously modified to order the decoder to do these actions. + * In such case, the decoder stops immediately, and considers the compressed block malformed. + * Note 2 : compressedSize and dstCapacity must be provided to the function, the compressed block does not contain them. + * The implementation is free to send / store / derive this information in whichever way is most beneficial. + * If there is a need for a different format which bundles together both compressed data and its metadata, consider looking at lz4frame.h instead. + */ +LZ4LIB_API int LZ4_decompress_safe (const char* src, char* dst, int compressedSize, int dstCapacity); + + +/*-************************************ +* Advanced Functions +**************************************/ +#define LZ4_MAX_INPUT_SIZE 0x7E000000 /* 2 113 929 216 bytes */ +#define LZ4_COMPRESSBOUND(isize) ((unsigned)(isize) > (unsigned)LZ4_MAX_INPUT_SIZE ? 0 : (isize) + ((isize)/255) + 16) + +/*! LZ4_compressBound() : + Provides the maximum size that LZ4 compression may output in a "worst case" scenario (input data not compressible) + This function is primarily useful for memory allocation purposes (destination buffer size). + Macro LZ4_COMPRESSBOUND() is also provided for compilation-time evaluation (stack memory allocation for example). + Note that LZ4_compress_default() compresses faster when dstCapacity is >= LZ4_compressBound(srcSize) + inputSize : max supported value is LZ4_MAX_INPUT_SIZE + return : maximum output size in a "worst case" scenario + or 0, if input size is incorrect (too large or negative) +*/ +LZ4LIB_API int LZ4_compressBound(int inputSize); + +/*! LZ4_compress_fast() : + Same as LZ4_compress_default(), but allows selection of "acceleration" factor. + The larger the acceleration value, the faster the algorithm, but also the lesser the compression. + It's a trade-off. It can be fine tuned, with each successive value providing roughly +~3% to speed. + An acceleration value of "1" is the same as regular LZ4_compress_default() + Values <= 0 will be replaced by LZ4_ACCELERATION_DEFAULT (currently == 1, see lz4.c). + Values > LZ4_ACCELERATION_MAX will be replaced by LZ4_ACCELERATION_MAX (currently == 65537, see lz4.c). +*/ +LZ4LIB_API int LZ4_compress_fast (const char* src, char* dst, int srcSize, int dstCapacity, int acceleration); + + +/*! LZ4_compress_fast_extState() : + * Same as LZ4_compress_fast(), using an externally allocated memory space for its state. + * Use LZ4_sizeofState() to know how much memory must be allocated, + * and allocate it on 8-bytes boundaries (using `malloc()` typically). + * Then, provide this buffer as `void* state` to compression function. + */ +LZ4LIB_API int LZ4_sizeofState(void); +LZ4LIB_API int LZ4_compress_fast_extState (void* state, const char* src, char* dst, int srcSize, int dstCapacity, int acceleration); + +/*! LZ4_compress_destSize() : + * Reverse the logic : compresses as much data as possible from 'src' buffer + * into already allocated buffer 'dst', of size >= 'dstCapacity'. + * This function either compresses the entire 'src' content into 'dst' if it's large enough, + * or fill 'dst' buffer completely with as much data as possible from 'src'. + * note: acceleration parameter is fixed to "default". + * + * *srcSizePtr : in+out parameter. Initially contains size of input. + * Will be modified to indicate how many bytes where read from 'src' to fill 'dst'. + * New value is necessarily <= input value. + * @return : Nb bytes written into 'dst' (necessarily <= dstCapacity) + * or 0 if compression fails. + * + * Note : from v1.8.2 to v1.9.1, this function had a bug (fixed in v1.9.2+): + * the produced compressed content could, in specific circumstances, + * require to be decompressed into a destination buffer larger + * by at least 1 byte than the content to decompress. + * If an application uses `LZ4_compress_destSize()`, + * it's highly recommended to update liblz4 to v1.9.2 or better. + * If this can't be done or ensured, + * the receiving decompression function should provide + * a dstCapacity which is > decompressedSize, by at least 1 byte. + * See https://github.com/lz4/lz4/issues/859 for details + */ +LZ4LIB_API int LZ4_compress_destSize(const char* src, char* dst, int* srcSizePtr, int targetDstSize); + +/*! LZ4_decompress_safe_partial() : + * Decompress an LZ4 compressed block, of size 'srcSize' at position 'src', + * into destination buffer 'dst' of size 'dstCapacity'. + * Up to 'targetOutputSize' bytes will be decoded. + * The function stops decoding on reaching this objective. + * This can be useful to boost performance + * whenever only the beginning of a block is required. + * + * @return : the number of bytes decoded in `dst` (necessarily <= targetOutputSize) + * If source stream is detected malformed, function returns a negative result. + * + * Note 1 : @return can be < targetOutputSize, if compressed block contains less data. + * + * Note 2 : targetOutputSize must be <= dstCapacity + * + * Note 3 : this function effectively stops decoding on reaching targetOutputSize, + * so dstCapacity is kind of redundant. + * This is because in older versions of this function, + * decoding operation would still write complete sequences. + * Therefore, there was no guarantee that it would stop writing at exactly targetOutputSize, + * it could write more bytes, though only up to dstCapacity. + * Some "margin" used to be required for this operation to work properly. + * Thankfully, this is no longer necessary. + * The function nonetheless keeps the same signature, in an effort to preserve API compatibility. + * + * Note 4 : If srcSize is the exact size of the block, + * then targetOutputSize can be any value, + * including larger than the block's decompressed size. + * The function will, at most, generate block's decompressed size. + * + * Note 5 : If srcSize is _larger_ than block's compressed size, + * then targetOutputSize **MUST** be <= block's decompressed size. + * Otherwise, *silent corruption will occur*. + */ +LZ4LIB_API int LZ4_decompress_safe_partial (const char* src, char* dst, int srcSize, int targetOutputSize, int dstCapacity); + + +/*-********************************************* +* Streaming Compression Functions +***********************************************/ +typedef union LZ4_stream_u LZ4_stream_t; /* incomplete type (defined later) */ + +/*! + Note about RC_INVOKED + + - RC_INVOKED is predefined symbol of rc.exe (the resource compiler which is part of MSVC/Visual Studio). + https://docs.microsoft.com/en-us/windows/win32/menurc/predefined-macros + + - Since rc.exe is a legacy compiler, it truncates long symbol (> 30 chars) + and reports warning "RC4011: identifier truncated". + + - To eliminate the warning, we surround long preprocessor symbol with + "#if !defined(RC_INVOKED) ... #endif" block that means + "skip this block when rc.exe is trying to read it". +*/ +#if !defined(RC_INVOKED) /* https://docs.microsoft.com/en-us/windows/win32/menurc/predefined-macros */ +#if !defined(LZ4_STATIC_LINKING_ONLY_DISABLE_MEMORY_ALLOCATION) +LZ4LIB_API LZ4_stream_t* LZ4_createStream(void); +LZ4LIB_API int LZ4_freeStream (LZ4_stream_t* streamPtr); +#endif /* !defined(LZ4_STATIC_LINKING_ONLY_DISABLE_MEMORY_ALLOCATION) */ +#endif + +/*! LZ4_resetStream_fast() : v1.9.0+ + * Use this to prepare an LZ4_stream_t for a new chain of dependent blocks + * (e.g., LZ4_compress_fast_continue()). + * + * An LZ4_stream_t must be initialized once before usage. + * This is automatically done when created by LZ4_createStream(). + * However, should the LZ4_stream_t be simply declared on stack (for example), + * it's necessary to initialize it first, using LZ4_initStream(). + * + * After init, start any new stream with LZ4_resetStream_fast(). + * A same LZ4_stream_t can be re-used multiple times consecutively + * and compress multiple streams, + * provided that it starts each new stream with LZ4_resetStream_fast(). + * + * LZ4_resetStream_fast() is much faster than LZ4_initStream(), + * but is not compatible with memory regions containing garbage data. + * + * Note: it's only useful to call LZ4_resetStream_fast() + * in the context of streaming compression. + * The *extState* functions perform their own resets. + * Invoking LZ4_resetStream_fast() before is redundant, and even counterproductive. + */ +LZ4LIB_API void LZ4_resetStream_fast (LZ4_stream_t* streamPtr); + +/*! LZ4_loadDict() : + * Use this function to reference a static dictionary into LZ4_stream_t. + * The dictionary must remain available during compression. + * LZ4_loadDict() triggers a reset, so any previous data will be forgotten. + * The same dictionary will have to be loaded on decompression side for successful decoding. + * Dictionary are useful for better compression of small data (KB range). + * While LZ4 itself accepts any input as dictionary, dictionary efficiency is also a topic. + * When in doubt, employ the Zstandard's Dictionary Builder. + * Loading a size of 0 is allowed, and is the same as reset. + * @return : loaded dictionary size, in bytes (note: only the last 64 KB are loaded) + */ +LZ4LIB_API int LZ4_loadDict (LZ4_stream_t* streamPtr, const char* dictionary, int dictSize); + +/*! LZ4_loadDictSlow() : v1.10.0+ + * Same as LZ4_loadDict(), + * but uses a bit more cpu to reference the dictionary content more thoroughly. + * This is expected to slightly improve compression ratio. + * The extra-cpu cost is likely worth it if the dictionary is re-used across multiple sessions. + * @return : loaded dictionary size, in bytes (note: only the last 64 KB are loaded) + */ +LZ4LIB_API int LZ4_loadDictSlow(LZ4_stream_t* streamPtr, const char* dictionary, int dictSize); + +/*! LZ4_attach_dictionary() : stable since v1.10.0 + * + * This allows efficient re-use of a static dictionary multiple times. + * + * Rather than re-loading the dictionary buffer into a working context before + * each compression, or copying a pre-loaded dictionary's LZ4_stream_t into a + * working LZ4_stream_t, this function introduces a no-copy setup mechanism, + * in which the working stream references @dictionaryStream in-place. + * + * Several assumptions are made about the state of @dictionaryStream. + * Currently, only states which have been prepared by LZ4_loadDict() or + * LZ4_loadDictSlow() should be expected to work. + * + * Alternatively, the provided @dictionaryStream may be NULL, + * in which case any existing dictionary stream is unset. + * + * If a dictionary is provided, it replaces any pre-existing stream history. + * The dictionary contents are the only history that can be referenced and + * logically immediately precede the data compressed in the first subsequent + * compression call. + * + * The dictionary will only remain attached to the working stream through the + * first compression call, at the end of which it is cleared. + * @dictionaryStream stream (and source buffer) must remain in-place / accessible / unchanged + * through the completion of the compression session. + * + * Note: there is no equivalent LZ4_attach_*() method on the decompression side + * because there is no initialization cost, hence no need to share the cost across multiple sessions. + * To decompress LZ4 blocks using dictionary, attached or not, + * just employ the regular LZ4_setStreamDecode() for streaming, + * or the stateless LZ4_decompress_safe_usingDict() for one-shot decompression. + */ +LZ4LIB_API void +LZ4_attach_dictionary(LZ4_stream_t* workingStream, + const LZ4_stream_t* dictionaryStream); + +/*! LZ4_compress_fast_continue() : + * Compress 'src' content using data from previously compressed blocks, for better compression ratio. + * 'dst' buffer must be already allocated. + * If dstCapacity >= LZ4_compressBound(srcSize), compression is guaranteed to succeed, and runs faster. + * + * @return : size of compressed block + * or 0 if there is an error (typically, cannot fit into 'dst'). + * + * Note 1 : Each invocation to LZ4_compress_fast_continue() generates a new block. + * Each block has precise boundaries. + * Each block must be decompressed separately, calling LZ4_decompress_*() with relevant metadata. + * It's not possible to append blocks together and expect a single invocation of LZ4_decompress_*() to decompress them together. + * + * Note 2 : The previous 64KB of source data is __assumed__ to remain present, unmodified, at same address in memory ! + * + * Note 3 : When input is structured as a double-buffer, each buffer can have any size, including < 64 KB. + * Make sure that buffers are separated, by at least one byte. + * This construction ensures that each block only depends on previous block. + * + * Note 4 : If input buffer is a ring-buffer, it can have any size, including < 64 KB. + * + * Note 5 : After an error, the stream status is undefined (invalid), it can only be reset or freed. + */ +LZ4LIB_API int LZ4_compress_fast_continue (LZ4_stream_t* streamPtr, const char* src, char* dst, int srcSize, int dstCapacity, int acceleration); + +/*! LZ4_saveDict() : + * If last 64KB data cannot be guaranteed to remain available at its current memory location, + * save it into a safer place (char* safeBuffer). + * This is schematically equivalent to a memcpy() followed by LZ4_loadDict(), + * but is much faster, because LZ4_saveDict() doesn't need to rebuild tables. + * @return : saved dictionary size in bytes (necessarily <= maxDictSize), or 0 if error. + */ +LZ4LIB_API int LZ4_saveDict (LZ4_stream_t* streamPtr, char* safeBuffer, int maxDictSize); + + +/*-********************************************** +* Streaming Decompression Functions +* Bufferless synchronous API +************************************************/ +typedef union LZ4_streamDecode_u LZ4_streamDecode_t; /* tracking context */ + +/*! LZ4_createStreamDecode() and LZ4_freeStreamDecode() : + * creation / destruction of streaming decompression tracking context. + * A tracking context can be re-used multiple times. + */ +#if !defined(RC_INVOKED) /* https://docs.microsoft.com/en-us/windows/win32/menurc/predefined-macros */ +#if !defined(LZ4_STATIC_LINKING_ONLY_DISABLE_MEMORY_ALLOCATION) +LZ4LIB_API LZ4_streamDecode_t* LZ4_createStreamDecode(void); +LZ4LIB_API int LZ4_freeStreamDecode (LZ4_streamDecode_t* LZ4_stream); +#endif /* !defined(LZ4_STATIC_LINKING_ONLY_DISABLE_MEMORY_ALLOCATION) */ +#endif + +/*! LZ4_setStreamDecode() : + * An LZ4_streamDecode_t context can be allocated once and re-used multiple times. + * Use this function to start decompression of a new stream of blocks. + * A dictionary can optionally be set. Use NULL or size 0 for a reset order. + * Dictionary is presumed stable : it must remain accessible and unmodified during next decompression. + * @return : 1 if OK, 0 if error + */ +LZ4LIB_API int LZ4_setStreamDecode (LZ4_streamDecode_t* LZ4_streamDecode, const char* dictionary, int dictSize); + +/*! LZ4_decoderRingBufferSize() : v1.8.2+ + * Note : in a ring buffer scenario (optional), + * blocks are presumed decompressed next to each other + * up to the moment there is not enough remaining space for next block (remainingSize < maxBlockSize), + * at which stage it resumes from beginning of ring buffer. + * When setting such a ring buffer for streaming decompression, + * provides the minimum size of this ring buffer + * to be compatible with any source respecting maxBlockSize condition. + * @return : minimum ring buffer size, + * or 0 if there is an error (invalid maxBlockSize). + */ +LZ4LIB_API int LZ4_decoderRingBufferSize(int maxBlockSize); +#define LZ4_DECODER_RING_BUFFER_SIZE(maxBlockSize) (65536 + 14 + (maxBlockSize)) /* for static allocation; maxBlockSize presumed valid */ + +/*! LZ4_decompress_safe_continue() : + * This decoding function allows decompression of consecutive blocks in "streaming" mode. + * The difference with the usual independent blocks is that + * new blocks are allowed to find references into former blocks. + * A block is an unsplittable entity, and must be presented entirely to the decompression function. + * LZ4_decompress_safe_continue() only accepts one block at a time. + * It's modeled after `LZ4_decompress_safe()` and behaves similarly. + * + * @LZ4_streamDecode : decompression state, tracking the position in memory of past data + * @compressedSize : exact complete size of one compressed block. + * @dstCapacity : size of destination buffer (which must be already allocated), + * must be an upper bound of decompressed size. + * @return : number of bytes decompressed into destination buffer (necessarily <= dstCapacity) + * If destination buffer is not large enough, decoding will stop and output an error code (negative value). + * If the source stream is detected malformed, the function will stop decoding and return a negative result. + * + * The last 64KB of previously decoded data *must* remain available and unmodified + * at the memory position where they were previously decoded. + * If less than 64KB of data has been decoded, all the data must be present. + * + * Special : if decompression side sets a ring buffer, it must respect one of the following conditions : + * - Decompression buffer size is _at least_ LZ4_decoderRingBufferSize(maxBlockSize). + * maxBlockSize is the maximum size of any single block. It can have any value > 16 bytes. + * In which case, encoding and decoding buffers do not need to be synchronized. + * Actually, data can be produced by any source compliant with LZ4 format specification, and respecting maxBlockSize. + * - Synchronized mode : + * Decompression buffer size is _exactly_ the same as compression buffer size, + * and follows exactly same update rule (block boundaries at same positions), + * and decoding function is provided with exact decompressed size of each block (exception for last block of the stream), + * _then_ decoding & encoding ring buffer can have any size, including small ones ( < 64 KB). + * - Decompression buffer is larger than encoding buffer, by a minimum of maxBlockSize more bytes. + * In which case, encoding and decoding buffers do not need to be synchronized, + * and encoding ring buffer can have any size, including small ones ( < 64 KB). + * + * Whenever these conditions are not possible, + * save the last 64KB of decoded data into a safe buffer where it can't be modified during decompression, + * then indicate where this data is saved using LZ4_setStreamDecode(), before decompressing next block. +*/ +LZ4LIB_API int +LZ4_decompress_safe_continue (LZ4_streamDecode_t* LZ4_streamDecode, + const char* src, char* dst, + int srcSize, int dstCapacity); + + +/*! LZ4_decompress_safe_usingDict() : + * Works the same as + * a combination of LZ4_setStreamDecode() followed by LZ4_decompress_safe_continue() + * However, it's stateless: it doesn't need any LZ4_streamDecode_t state. + * Dictionary is presumed stable : it must remain accessible and unmodified during decompression. + * Performance tip : Decompression speed can be substantially increased + * when dst == dictStart + dictSize. + */ +LZ4LIB_API int +LZ4_decompress_safe_usingDict(const char* src, char* dst, + int srcSize, int dstCapacity, + const char* dictStart, int dictSize); + +/*! LZ4_decompress_safe_partial_usingDict() : + * Behaves the same as LZ4_decompress_safe_partial() + * with the added ability to specify a memory segment for past data. + * Performance tip : Decompression speed can be substantially increased + * when dst == dictStart + dictSize. + */ +LZ4LIB_API int +LZ4_decompress_safe_partial_usingDict(const char* src, char* dst, + int compressedSize, + int targetOutputSize, int maxOutputSize, + const char* dictStart, int dictSize); + +#endif /* LZ4_H_2983827168210 */ + + +/*^************************************* + * !!!!!! STATIC LINKING ONLY !!!!!! + ***************************************/ + +/*-**************************************************************************** + * Experimental section + * + * Symbols declared in this section must be considered unstable. Their + * signatures or semantics may change, or they may be removed altogether in the + * future. They are therefore only safe to depend on when the caller is + * statically linked against the library. + * + * To protect against unsafe usage, not only are the declarations guarded, + * the definitions are hidden by default + * when building LZ4 as a shared/dynamic library. + * + * In order to access these declarations, + * define LZ4_STATIC_LINKING_ONLY in your application + * before including LZ4's headers. + * + * In order to make their implementations accessible dynamically, you must + * define LZ4_PUBLISH_STATIC_FUNCTIONS when building the LZ4 library. + ******************************************************************************/ + +#ifdef LZ4_STATIC_LINKING_ONLY + +#ifndef LZ4_STATIC_3504398509 +#define LZ4_STATIC_3504398509 + +#ifdef LZ4_PUBLISH_STATIC_FUNCTIONS +# define LZ4LIB_STATIC_API LZ4LIB_API +#else +# define LZ4LIB_STATIC_API +#endif + + +/*! LZ4_compress_fast_extState_fastReset() : + * A variant of LZ4_compress_fast_extState(). + * + * Using this variant avoids an expensive initialization step. + * It is only safe to call if the state buffer is known to be correctly initialized already + * (see above comment on LZ4_resetStream_fast() for a definition of "correctly initialized"). + * From a high level, the difference is that + * this function initializes the provided state with a call to something like LZ4_resetStream_fast() + * while LZ4_compress_fast_extState() starts with a call to LZ4_resetStream(). + */ +LZ4LIB_STATIC_API int LZ4_compress_fast_extState_fastReset (void* state, const char* src, char* dst, int srcSize, int dstCapacity, int acceleration); + +/*! LZ4_compress_destSize_extState() : introduced in v1.10.0 + * Same as LZ4_compress_destSize(), but using an externally allocated state. + * Also: exposes @acceleration + */ +int LZ4_compress_destSize_extState(void* state, const char* src, char* dst, int* srcSizePtr, int targetDstSize, int acceleration); + +/*! In-place compression and decompression + * + * It's possible to have input and output sharing the same buffer, + * for highly constrained memory environments. + * In both cases, it requires input to lay at the end of the buffer, + * and decompression to start at beginning of the buffer. + * Buffer size must feature some margin, hence be larger than final size. + * + * |<------------------------buffer--------------------------------->| + * |<-----------compressed data--------->| + * |<-----------decompressed size------------------>| + * |<----margin---->| + * + * This technique is more useful for decompression, + * since decompressed size is typically larger, + * and margin is short. + * + * In-place decompression will work inside any buffer + * which size is >= LZ4_DECOMPRESS_INPLACE_BUFFER_SIZE(decompressedSize). + * This presumes that decompressedSize > compressedSize. + * Otherwise, it means compression actually expanded data, + * and it would be more efficient to store such data with a flag indicating it's not compressed. + * This can happen when data is not compressible (already compressed, or encrypted). + * + * For in-place compression, margin is larger, as it must be able to cope with both + * history preservation, requiring input data to remain unmodified up to LZ4_DISTANCE_MAX, + * and data expansion, which can happen when input is not compressible. + * As a consequence, buffer size requirements are much higher, + * and memory savings offered by in-place compression are more limited. + * + * There are ways to limit this cost for compression : + * - Reduce history size, by modifying LZ4_DISTANCE_MAX. + * Note that it is a compile-time constant, so all compressions will apply this limit. + * Lower values will reduce compression ratio, except when input_size < LZ4_DISTANCE_MAX, + * so it's a reasonable trick when inputs are known to be small. + * - Require the compressor to deliver a "maximum compressed size". + * This is the `dstCapacity` parameter in `LZ4_compress*()`. + * When this size is < LZ4_COMPRESSBOUND(inputSize), then compression can fail, + * in which case, the return code will be 0 (zero). + * The caller must be ready for these cases to happen, + * and typically design a backup scheme to send data uncompressed. + * The combination of both techniques can significantly reduce + * the amount of margin required for in-place compression. + * + * In-place compression can work in any buffer + * which size is >= (maxCompressedSize) + * with maxCompressedSize == LZ4_COMPRESSBOUND(srcSize) for guaranteed compression success. + * LZ4_COMPRESS_INPLACE_BUFFER_SIZE() depends on both maxCompressedSize and LZ4_DISTANCE_MAX, + * so it's possible to reduce memory requirements by playing with them. + */ + +#define LZ4_DECOMPRESS_INPLACE_MARGIN(compressedSize) (((compressedSize) >> 8) + 32) +#define LZ4_DECOMPRESS_INPLACE_BUFFER_SIZE(decompressedSize) ((decompressedSize) + LZ4_DECOMPRESS_INPLACE_MARGIN(decompressedSize)) /**< note: presumes that compressedSize < decompressedSize. note2: margin is overestimated a bit, since it could use compressedSize instead */ + +#ifndef LZ4_DISTANCE_MAX /* history window size; can be user-defined at compile time */ +# define LZ4_DISTANCE_MAX 65535 /* set to maximum value by default */ +#endif + +#define LZ4_COMPRESS_INPLACE_MARGIN (LZ4_DISTANCE_MAX + 32) /* LZ4_DISTANCE_MAX can be safely replaced by srcSize when it's smaller */ +#define LZ4_COMPRESS_INPLACE_BUFFER_SIZE(maxCompressedSize) ((maxCompressedSize) + LZ4_COMPRESS_INPLACE_MARGIN) /**< maxCompressedSize is generally LZ4_COMPRESSBOUND(inputSize), but can be set to any lower value, with the risk that compression can fail (return code 0(zero)) */ + +#endif /* LZ4_STATIC_3504398509 */ +#endif /* LZ4_STATIC_LINKING_ONLY */ + + + +#ifndef LZ4_H_98237428734687 +#define LZ4_H_98237428734687 + +/*-************************************************************ + * Private Definitions + ************************************************************** + * Do not use these definitions directly. + * They are only exposed to allow static allocation of `LZ4_stream_t` and `LZ4_streamDecode_t`. + * Accessing members will expose user code to API and/or ABI break in future versions of the library. + **************************************************************/ +#define LZ4_HASHLOG (LZ4_MEMORY_USAGE-2) +#define LZ4_HASHTABLESIZE (1 << LZ4_MEMORY_USAGE) +#define LZ4_HASH_SIZE_U32 (1 << LZ4_HASHLOG) /* required as macro for static allocation */ + +#if defined(__cplusplus) || (defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */) +# include <stdint.h> + typedef int8_t LZ4_i8; + typedef uint8_t LZ4_byte; + typedef uint16_t LZ4_u16; + typedef uint32_t LZ4_u32; +#else + typedef signed char LZ4_i8; + typedef unsigned char LZ4_byte; + typedef unsigned short LZ4_u16; + typedef unsigned int LZ4_u32; +#endif + +/*! LZ4_stream_t : + * Never ever use below internal definitions directly ! + * These definitions are not API/ABI safe, and may change in future versions. + * If you need static allocation, declare or allocate an LZ4_stream_t object. +**/ + +typedef struct LZ4_stream_t_internal LZ4_stream_t_internal; +struct LZ4_stream_t_internal { + LZ4_u32 hashTable[LZ4_HASH_SIZE_U32]; + const LZ4_byte* dictionary; + const LZ4_stream_t_internal* dictCtx; + LZ4_u32 currentOffset; + LZ4_u32 tableType; + LZ4_u32 dictSize; + /* Implicit padding to ensure structure is aligned */ +}; + +#define LZ4_STREAM_MINSIZE ((1UL << (LZ4_MEMORY_USAGE)) + 32) /* static size, for inter-version compatibility */ +union LZ4_stream_u { + char minStateSize[LZ4_STREAM_MINSIZE]; + LZ4_stream_t_internal internal_donotuse; +}; /* previously typedef'd to LZ4_stream_t */ + + +/*! LZ4_initStream() : v1.9.0+ + * An LZ4_stream_t structure must be initialized at least once. + * This is automatically done when invoking LZ4_createStream(), + * but it's not when the structure is simply declared on stack (for example). + * + * Use LZ4_initStream() to properly initialize a newly declared LZ4_stream_t. + * It can also initialize any arbitrary buffer of sufficient size, + * and will @return a pointer of proper type upon initialization. + * + * Note : initialization fails if size and alignment conditions are not respected. + * In which case, the function will @return NULL. + * Note2: An LZ4_stream_t structure guarantees correct alignment and size. + * Note3: Before v1.9.0, use LZ4_resetStream() instead +**/ +LZ4LIB_API LZ4_stream_t* LZ4_initStream (void* stateBuffer, size_t size); + + +/*! LZ4_streamDecode_t : + * Never ever use below internal definitions directly ! + * These definitions are not API/ABI safe, and may change in future versions. + * If you need static allocation, declare or allocate an LZ4_streamDecode_t object. +**/ +typedef struct { + const LZ4_byte* externalDict; + const LZ4_byte* prefixEnd; + size_t extDictSize; + size_t prefixSize; +} LZ4_streamDecode_t_internal; + +#define LZ4_STREAMDECODE_MINSIZE 32 +union LZ4_streamDecode_u { + char minStateSize[LZ4_STREAMDECODE_MINSIZE]; + LZ4_streamDecode_t_internal internal_donotuse; +} ; /* previously typedef'd to LZ4_streamDecode_t */ + + + +/*-************************************ +* Obsolete Functions +**************************************/ + +/*! Deprecation warnings + * + * Deprecated functions make the compiler generate a warning when invoked. + * This is meant to invite users to update their source code. + * Should deprecation warnings be a problem, it is generally possible to disable them, + * typically with -Wno-deprecated-declarations for gcc + * or _CRT_SECURE_NO_WARNINGS in Visual. + * + * Another method is to define LZ4_DISABLE_DEPRECATE_WARNINGS + * before including the header file. + */ +#ifdef LZ4_DISABLE_DEPRECATE_WARNINGS +# define LZ4_DEPRECATED(message) /* disable deprecation warnings */ +#else +# if defined (__cplusplus) && (__cplusplus >= 201402) /* C++14 or greater */ +# define LZ4_DEPRECATED(message) [[deprecated(message)]] +# elif defined(_MSC_VER) +# define LZ4_DEPRECATED(message) __declspec(deprecated(message)) +# elif defined(__clang__) || (defined(__GNUC__) && (__GNUC__ * 10 + __GNUC_MINOR__ >= 45)) +# define LZ4_DEPRECATED(message) __attribute__((deprecated(message))) +# elif defined(__GNUC__) && (__GNUC__ * 10 + __GNUC_MINOR__ >= 31) +# define LZ4_DEPRECATED(message) __attribute__((deprecated)) +# else +# pragma message("WARNING: LZ4_DEPRECATED needs custom implementation for this compiler") +# define LZ4_DEPRECATED(message) /* disabled */ +# endif +#endif /* LZ4_DISABLE_DEPRECATE_WARNINGS */ + +/*! Obsolete compression functions (since v1.7.3) */ +LZ4_DEPRECATED("use LZ4_compress_default() instead") LZ4LIB_API int LZ4_compress (const char* src, char* dest, int srcSize); +LZ4_DEPRECATED("use LZ4_compress_default() instead") LZ4LIB_API int LZ4_compress_limitedOutput (const char* src, char* dest, int srcSize, int maxOutputSize); +LZ4_DEPRECATED("use LZ4_compress_fast_extState() instead") LZ4LIB_API int LZ4_compress_withState (void* state, const char* source, char* dest, int inputSize); +LZ4_DEPRECATED("use LZ4_compress_fast_extState() instead") LZ4LIB_API int LZ4_compress_limitedOutput_withState (void* state, const char* source, char* dest, int inputSize, int maxOutputSize); +LZ4_DEPRECATED("use LZ4_compress_fast_continue() instead") LZ4LIB_API int LZ4_compress_continue (LZ4_stream_t* LZ4_streamPtr, const char* source, char* dest, int inputSize); +LZ4_DEPRECATED("use LZ4_compress_fast_continue() instead") LZ4LIB_API int LZ4_compress_limitedOutput_continue (LZ4_stream_t* LZ4_streamPtr, const char* source, char* dest, int inputSize, int maxOutputSize); + +/*! Obsolete decompression functions (since v1.8.0) */ +LZ4_DEPRECATED("use LZ4_decompress_fast() instead") LZ4LIB_API int LZ4_uncompress (const char* source, char* dest, int outputSize); +LZ4_DEPRECATED("use LZ4_decompress_safe() instead") LZ4LIB_API int LZ4_uncompress_unknownOutputSize (const char* source, char* dest, int isize, int maxOutputSize); + +/* Obsolete streaming functions (since v1.7.0) + * degraded functionality; do not use! + * + * In order to perform streaming compression, these functions depended on data + * that is no longer tracked in the state. They have been preserved as well as + * possible: using them will still produce a correct output. However, they don't + * actually retain any history between compression calls. The compression ratio + * achieved will therefore be no better than compressing each chunk + * independently. + */ +LZ4_DEPRECATED("Use LZ4_createStream() instead") LZ4LIB_API void* LZ4_create (char* inputBuffer); +LZ4_DEPRECATED("Use LZ4_createStream() instead") LZ4LIB_API int LZ4_sizeofStreamState(void); +LZ4_DEPRECATED("Use LZ4_resetStream() instead") LZ4LIB_API int LZ4_resetStreamState(void* state, char* inputBuffer); +LZ4_DEPRECATED("Use LZ4_saveDict() instead") LZ4LIB_API char* LZ4_slideInputBuffer (void* state); + +/*! Obsolete streaming decoding functions (since v1.7.0) */ +LZ4_DEPRECATED("use LZ4_decompress_safe_usingDict() instead") LZ4LIB_API int LZ4_decompress_safe_withPrefix64k (const char* src, char* dst, int compressedSize, int maxDstSize); +LZ4_DEPRECATED("use LZ4_decompress_fast_usingDict() instead") LZ4LIB_API int LZ4_decompress_fast_withPrefix64k (const char* src, char* dst, int originalSize); + +/*! Obsolete LZ4_decompress_fast variants (since v1.9.0) : + * These functions used to be faster than LZ4_decompress_safe(), + * but this is no longer the case. They are now slower. + * This is because LZ4_decompress_fast() doesn't know the input size, + * and therefore must progress more cautiously into the input buffer to not read beyond the end of block. + * On top of that `LZ4_decompress_fast()` is not protected vs malformed or malicious inputs, making it a security liability. + * As a consequence, LZ4_decompress_fast() is strongly discouraged, and deprecated. + * + * The last remaining LZ4_decompress_fast() specificity is that + * it can decompress a block without knowing its compressed size. + * Such functionality can be achieved in a more secure manner + * by employing LZ4_decompress_safe_partial(). + * + * Parameters: + * originalSize : is the uncompressed size to regenerate. + * `dst` must be already allocated, its size must be >= 'originalSize' bytes. + * @return : number of bytes read from source buffer (== compressed size). + * The function expects to finish at block's end exactly. + * If the source stream is detected malformed, the function stops decoding and returns a negative result. + * note : LZ4_decompress_fast*() requires originalSize. Thanks to this information, it never writes past the output buffer. + * However, since it doesn't know its 'src' size, it may read an unknown amount of input, past input buffer bounds. + * Also, since match offsets are not validated, match reads from 'src' may underflow too. + * These issues never happen if input (compressed) data is correct. + * But they may happen if input data is invalid (error or intentional tampering). + * As a consequence, use these functions in trusted environments with trusted data **only**. + */ +LZ4_DEPRECATED("This function is deprecated and unsafe. Consider using LZ4_decompress_safe_partial() instead") +LZ4LIB_API int LZ4_decompress_fast (const char* src, char* dst, int originalSize); +LZ4_DEPRECATED("This function is deprecated and unsafe. Consider migrating towards LZ4_decompress_safe_continue() instead. " + "Note that the contract will change (requires block's compressed size, instead of decompressed size)") +LZ4LIB_API int LZ4_decompress_fast_continue (LZ4_streamDecode_t* LZ4_streamDecode, const char* src, char* dst, int originalSize); +LZ4_DEPRECATED("This function is deprecated and unsafe. Consider using LZ4_decompress_safe_partial_usingDict() instead") +LZ4LIB_API int LZ4_decompress_fast_usingDict (const char* src, char* dst, int originalSize, const char* dictStart, int dictSize); + +/*! LZ4_resetStream() : + * An LZ4_stream_t structure must be initialized at least once. + * This is done with LZ4_initStream(), or LZ4_resetStream(). + * Consider switching to LZ4_initStream(), + * invoking LZ4_resetStream() will trigger deprecation warnings in the future. + */ +LZ4LIB_API void LZ4_resetStream (LZ4_stream_t* streamPtr); + + +#endif /* LZ4_H_98237428734687 */ + + +#if defined (__cplusplus) +} +#endif diff --git a/driver/dist/vendor/monocypher/LICENCE.md b/driver/dist/vendor/monocypher/LICENCE.md @@ -0,0 +1,167 @@ +Monocypher as a whole is dual-licensed. Choose whichever licence you +want from the two licences listed below. + +The first licence is a regular 2-clause BSD licence. The second licence +is the CC-0 from Creative Commons. It is intended to release Monocypher +to the public domain. The BSD licence serves as a fallback option. + +See the individual files for specific information about who contributed +to what file during which years. See below for special notes. + +Licence 1 (2-clause BSD) +------------------------ + +Copyright (c) 2017-2023, Loup Vaillant +Copyright (c) 2017-2019, Michael Savage +Copyright (c) 2017-2023, Fabio Scotoni +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + +1. Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the + distribution. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + + +Licence 2 (CC-0) +---------------- + +> CREATIVE COMMONS CORPORATION IS NOT A LAW FIRM AND DOES NOT PROVIDE +> LEGAL SERVICES. DISTRIBUTION OF THIS DOCUMENT DOES NOT CREATE AN +> ATTORNEY-CLIENT RELATIONSHIP. CREATIVE COMMONS PROVIDES THIS +> INFORMATION ON AN "AS-IS" BASIS. CREATIVE COMMONS MAKES NO WARRANTIES +> REGARDING THE USE OF THIS DOCUMENT OR THE INFORMATION OR WORKS +> PROVIDED HEREUNDER, AND DISCLAIMS LIABILITY FOR DAMAGES RESULTING FROM +> THE USE OF THIS DOCUMENT OR THE INFORMATION OR WORKS PROVIDED +> HEREUNDER. + +### Statement of Purpose + +The laws of most jurisdictions throughout the world automatically confer +exclusive Copyright and Related Rights (defined below) upon the creator +and subsequent owner(s) (each and all, an "owner") of an original work +of authorship and/or a database (each, a "Work"). + +Certain owners wish to permanently relinquish those rights to a Work for +the purpose of contributing to a commons of creative, cultural and +scientific works ("Commons") that the public can reliably and without +fear of later claims of infringement build upon, modify, incorporate in +other works, reuse and redistribute as freely as possible in any form +whatsoever and for any purposes, including without limitation commercial +purposes. These owners may contribute to the Commons to promote the +ideal of a free culture and the further production of creative, cultural +and scientific works, or to gain reputation or greater distribution for +their Work in part through the use and efforts of others. + +For these and/or other purposes and motivations, and without any +expectation of additional consideration or compensation, the person +associating CC0 with a Work (the "Affirmer"), to the extent that he or +she is an owner of Copyright and Related Rights in the Work, voluntarily +elects to apply CC0 to the Work and publicly distribute the Work under +its terms, with knowledge of his or her Copyright and Related Rights in +the Work and the meaning and intended legal effect of CC0 on those +rights. + +1. **Copyright and Related Rights.** A Work made available under CC0 may + be protected by copyright and related or neighboring rights + ("Copyright and Related Rights"). Copyright and Related Rights + include, but are not limited to, the following: + + - the right to reproduce, adapt, distribute, perform, display, + communicate, and translate a Work; + - moral rights retained by the original author(s) and/or + performer(s); publicity and privacy rights pertaining to a person's + image or likeness depicted in a Work; + - rights protecting against unfair competition in regards to a Work, + subject to the limitations in paragraph 4(a), below; + - rights protecting the extraction, dissemination, use and reuse of + data in a Work; + - database rights (such as those arising under Directive 96/9/EC of + the European Parliament and of the Council of 11 March 1996 on the + legal protection of databases, and under any national + implementation thereof, including any amended or successor version + of such directive); and + - other similar, equivalent or corresponding rights throughout the + world based on applicable law or treaty, and any national + implementations thereof. + +2. **Waiver.** To the greatest extent permitted by, but not in + contravention of, applicable law, Affirmer hereby overtly, fully, + permanently, irrevocably and unconditionally waives, abandons, and + surrenders all of Affirmer's Copyright and Related Rights and + associated claims and causes of action, whether now known or unknown + (including existing as well as future claims and causes of action), + in the Work (i) in all territories worldwide, (ii) for the maximum + duration provided by applicable law or treaty (including future time + extensions), (iii) in any current or future medium and for any number + of copies, and (iv) for any purpose whatsoever, including without + limitation commercial, advertising or promotional purposes (the + "Waiver"). Affirmer makes the Waiver for the benefit of each member + of the public at large and to the detriment of Affirmer's heirs and + successors, fully intending that such Waiver shall not be subject to + revocation, rescission, cancellation, termination, or any other legal + or equitable action to disrupt the quiet enjoyment of the Work by the + public as contemplated by Affirmer's express Statement of Purpose. + +3. **Public License Fallback.** Should any part of the Waiver for any + reason be judged legally invalid or ineffective under applicable law, + then the Waiver shall be preserved to the maximum extent permitted + taking into account Affirmer's express Statement of Purpose. In + addition, to the extent the Waiver is so judged Affirmer hereby + grants to each affected person a royalty-free, non transferable, non + sublicensable, non exclusive, irrevocable and unconditional license + to exercise Affirmer's Copyright and Related Rights in the Work (i) + in all territories worldwide, (ii) for the maximum duration provided + by applicable law or treaty (including future time extensions), (iii) + in any current or future medium and for any number of copies, and + (iv) for any purpose whatsoever, including without limitation + commercial, advertising or promotional purposes (the "License"). The + License shall be deemed effective as of the date CC0 was applied by + Affirmer to the Work. Should any part of the License for any reason + be judged legally invalid or ineffective under applicable law, such + partial invalidity or ineffectiveness shall not invalidate the + remainder of the License, and in such case Affirmer hereby affirms + that he or she will not (i) exercise any of his or her remaining + Copyright and Related Rights in the Work or (ii) assert any + associated claims and causes of action with respect to the Work, in + either case contrary to Affirmer's express Statement of Purpose. + +4. **Limitations and Disclaimers.** + + - No trademark or patent rights held by Affirmer are waived, + abandoned, surrendered, licensed or otherwise affected by this + document. + - Affirmer offers the Work as-is and makes no representations or + warranties of any kind concerning the Work, express, implied, + statutory or otherwise, including without limitation warranties of + title, merchantability, fitness for a particular purpose, non + infringement, or the absence of latent or other defects, accuracy, + or the present or absence of errors, whether or not discoverable, + all to the greatest extent permissible under applicable law. + - Affirmer disclaims responsibility for clearing rights of other + persons that may apply to the Work or any use thereof, including + without limitation any person's Copyright and Related Rights in the + Work. Further, Affirmer disclaims responsibility for obtaining any + necessary consents, permissions or other rights required for any + use of the Work. + - Affirmer understands and acknowledges that Creative Commons is not + a party to this document and has no duty or obligation with respect + to this CC0 or use of the Work. diff --git a/driver/dist/vendor/monocypher/monocypher-ed25519.c b/driver/dist/vendor/monocypher/monocypher-ed25519.c @@ -0,0 +1,500 @@ +// Monocypher version 4.0.2 +// +// This file is dual-licensed. Choose whichever licence you want from +// the two licences listed below. +// +// The first licence is a regular 2-clause BSD licence. The second licence +// is the CC-0 from Creative Commons. It is intended to release Monocypher +// to the public domain. The BSD licence serves as a fallback option. +// +// SPDX-License-Identifier: BSD-2-Clause OR CC0-1.0 +// +// ------------------------------------------------------------------------ +// +// Copyright (c) 2017-2019, Loup Vaillant +// All rights reserved. +// +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the +// distribution. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// ------------------------------------------------------------------------ +// +// Written in 2017-2019 by Loup Vaillant +// +// To the extent possible under law, the author(s) have dedicated all copyright +// and related neighboring rights to this software to the public domain +// worldwide. This software is distributed without any warranty. +// +// You should have received a copy of the CC0 Public Domain Dedication along +// with this software. If not, see +// <https://creativecommons.org/publicdomain/zero/1.0/> + +#include "monocypher-ed25519.h" + +#ifdef MONOCYPHER_CPP_NAMESPACE +namespace MONOCYPHER_CPP_NAMESPACE { +#endif + +///////////////// +/// Utilities /// +///////////////// +#define FOR(i, min, max) for (size_t i = min; i < max; i++) +#define COPY(dst, src, size) FOR(_i_, 0, size) (dst)[_i_] = (src)[_i_] +#define ZERO(buf, size) FOR(_i_, 0, size) (buf)[_i_] = 0 +#define WIPE_CTX(ctx) crypto_wipe(ctx , sizeof(*(ctx))) +#define WIPE_BUFFER(buffer) crypto_wipe(buffer, sizeof(buffer)) +#define MIN(a, b) ((a) <= (b) ? (a) : (b)) +typedef uint8_t u8; +typedef uint64_t u64; + +// Returns the smallest positive integer y such that +// (x + y) % pow_2 == 0 +// Basically, it's how many bytes we need to add to "align" x. +// Only works when pow_2 is a power of 2. +// Note: we use ~x+1 instead of -x to avoid compiler warnings +static size_t align(size_t x, size_t pow_2) +{ + return (~x + 1) & (pow_2 - 1); +} + +static u64 load64_be(const u8 s[8]) +{ + return((u64)s[0] << 56) + | ((u64)s[1] << 48) + | ((u64)s[2] << 40) + | ((u64)s[3] << 32) + | ((u64)s[4] << 24) + | ((u64)s[5] << 16) + | ((u64)s[6] << 8) + | (u64)s[7]; +} + +static void store64_be(u8 out[8], u64 in) +{ + out[0] = (in >> 56) & 0xff; + out[1] = (in >> 48) & 0xff; + out[2] = (in >> 40) & 0xff; + out[3] = (in >> 32) & 0xff; + out[4] = (in >> 24) & 0xff; + out[5] = (in >> 16) & 0xff; + out[6] = (in >> 8) & 0xff; + out[7] = in & 0xff; +} + +static void load64_be_buf (u64 *dst, const u8 *src, size_t size) { + FOR(i, 0, size) { dst[i] = load64_be(src + i*8); } +} + +/////////////// +/// SHA 512 /// +/////////////// +static u64 rot(u64 x, int c ) { return (x >> c) | (x << (64 - c)); } +static u64 ch (u64 x, u64 y, u64 z) { return (x & y) ^ (~x & z); } +static u64 maj(u64 x, u64 y, u64 z) { return (x & y) ^ ( x & z) ^ (y & z); } +static u64 big_sigma0(u64 x) { return rot(x, 28) ^ rot(x, 34) ^ rot(x, 39); } +static u64 big_sigma1(u64 x) { return rot(x, 14) ^ rot(x, 18) ^ rot(x, 41); } +static u64 lit_sigma0(u64 x) { return rot(x, 1) ^ rot(x, 8) ^ (x >> 7); } +static u64 lit_sigma1(u64 x) { return rot(x, 19) ^ rot(x, 61) ^ (x >> 6); } + +static const u64 K[80] = { + 0x428a2f98d728ae22,0x7137449123ef65cd,0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc, + 0x3956c25bf348b538,0x59f111f1b605d019,0x923f82a4af194f9b,0xab1c5ed5da6d8118, + 0xd807aa98a3030242,0x12835b0145706fbe,0x243185be4ee4b28c,0x550c7dc3d5ffb4e2, + 0x72be5d74f27b896f,0x80deb1fe3b1696b1,0x9bdc06a725c71235,0xc19bf174cf692694, + 0xe49b69c19ef14ad2,0xefbe4786384f25e3,0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65, + 0x2de92c6f592b0275,0x4a7484aa6ea6e483,0x5cb0a9dcbd41fbd4,0x76f988da831153b5, + 0x983e5152ee66dfab,0xa831c66d2db43210,0xb00327c898fb213f,0xbf597fc7beef0ee4, + 0xc6e00bf33da88fc2,0xd5a79147930aa725,0x06ca6351e003826f,0x142929670a0e6e70, + 0x27b70a8546d22ffc,0x2e1b21385c26c926,0x4d2c6dfc5ac42aed,0x53380d139d95b3df, + 0x650a73548baf63de,0x766a0abb3c77b2a8,0x81c2c92e47edaee6,0x92722c851482353b, + 0xa2bfe8a14cf10364,0xa81a664bbc423001,0xc24b8b70d0f89791,0xc76c51a30654be30, + 0xd192e819d6ef5218,0xd69906245565a910,0xf40e35855771202a,0x106aa07032bbd1b8, + 0x19a4c116b8d2d0c8,0x1e376c085141ab53,0x2748774cdf8eeb99,0x34b0bcb5e19b48a8, + 0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb,0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3, + 0x748f82ee5defb2fc,0x78a5636f43172f60,0x84c87814a1f0ab72,0x8cc702081a6439ec, + 0x90befffa23631e28,0xa4506cebde82bde9,0xbef9a3f7b2c67915,0xc67178f2e372532b, + 0xca273eceea26619c,0xd186b8c721c0c207,0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178, + 0x06f067aa72176fba,0x0a637dc5a2c898a6,0x113f9804bef90dae,0x1b710b35131c471b, + 0x28db77f523047d84,0x32caab7b40c72493,0x3c9ebe0a15c9bebc,0x431d67c49c100d4c, + 0x4cc5d4becb3e42b6,0x597f299cfc657e2a,0x5fcb6fab3ad6faec,0x6c44198c4a475817 +}; + +static void sha512_compress(crypto_sha512_ctx *ctx) +{ + u64 a = ctx->hash[0]; u64 b = ctx->hash[1]; + u64 c = ctx->hash[2]; u64 d = ctx->hash[3]; + u64 e = ctx->hash[4]; u64 f = ctx->hash[5]; + u64 g = ctx->hash[6]; u64 h = ctx->hash[7]; + + FOR (j, 0, 16) { + u64 in = K[j] + ctx->input[j]; + u64 t1 = big_sigma1(e) + ch (e, f, g) + h + in; + u64 t2 = big_sigma0(a) + maj(a, b, c); + h = g; g = f; f = e; e = d + t1; + d = c; c = b; b = a; a = t1 + t2; + } + size_t i16 = 0; + FOR(i, 1, 5) { + i16 += 16; + FOR (j, 0, 16) { + ctx->input[j] += lit_sigma1(ctx->input[(j- 2) & 15]); + ctx->input[j] += lit_sigma0(ctx->input[(j-15) & 15]); + ctx->input[j] += ctx->input[(j- 7) & 15]; + u64 in = K[i16 + j] + ctx->input[j]; + u64 t1 = big_sigma1(e) + ch (e, f, g) + h + in; + u64 t2 = big_sigma0(a) + maj(a, b, c); + h = g; g = f; f = e; e = d + t1; + d = c; c = b; b = a; a = t1 + t2; + } + } + + ctx->hash[0] += a; ctx->hash[1] += b; + ctx->hash[2] += c; ctx->hash[3] += d; + ctx->hash[4] += e; ctx->hash[5] += f; + ctx->hash[6] += g; ctx->hash[7] += h; +} + +// Write 1 input byte +static void sha512_set_input(crypto_sha512_ctx *ctx, u8 input) +{ + size_t word = ctx->input_idx >> 3; + size_t byte = ctx->input_idx & 7; + ctx->input[word] |= (u64)input << (8 * (7 - byte)); +} + +// Increment a 128-bit "word". +static void sha512_incr(u64 x[2], u64 y) +{ + x[1] += y; + if (x[1] < y) { + x[0]++; + } +} + +void crypto_sha512_init(crypto_sha512_ctx *ctx) +{ + ctx->hash[0] = 0x6a09e667f3bcc908; + ctx->hash[1] = 0xbb67ae8584caa73b; + ctx->hash[2] = 0x3c6ef372fe94f82b; + ctx->hash[3] = 0xa54ff53a5f1d36f1; + ctx->hash[4] = 0x510e527fade682d1; + ctx->hash[5] = 0x9b05688c2b3e6c1f; + ctx->hash[6] = 0x1f83d9abfb41bd6b; + ctx->hash[7] = 0x5be0cd19137e2179; + ctx->input_size[0] = 0; + ctx->input_size[1] = 0; + ctx->input_idx = 0; + ZERO(ctx->input, 16); +} + +void crypto_sha512_update(crypto_sha512_ctx *ctx, + const u8 *message, size_t message_size) +{ + // Avoid undefined NULL pointer increments with empty messages + if (message_size == 0) { + return; + } + + // Align ourselves with word boundaries + if ((ctx->input_idx & 7) != 0) { + size_t nb_bytes = MIN(align(ctx->input_idx, 8), message_size); + FOR (i, 0, nb_bytes) { + sha512_set_input(ctx, message[i]); + ctx->input_idx++; + } + message += nb_bytes; + message_size -= nb_bytes; + } + + // Align ourselves with block boundaries + if ((ctx->input_idx & 127) != 0) { + size_t nb_words = MIN(align(ctx->input_idx, 128), message_size) >> 3; + load64_be_buf(ctx->input + (ctx->input_idx >> 3), message, nb_words); + ctx->input_idx += nb_words << 3; + message += nb_words << 3; + message_size -= nb_words << 3; + } + + // Compress block if needed + if (ctx->input_idx == 128) { + sha512_incr(ctx->input_size, 1024); // size is in bits + sha512_compress(ctx); + ctx->input_idx = 0; + ZERO(ctx->input, 16); + } + + // Process the message block by block + FOR (i, 0, message_size >> 7) { // number of blocks + load64_be_buf(ctx->input, message, 16); + sha512_incr(ctx->input_size, 1024); // size is in bits + sha512_compress(ctx); + ctx->input_idx = 0; + ZERO(ctx->input, 16); + message += 128; + } + message_size &= 127; + + if (message_size != 0) { + // Remaining words + size_t nb_words = message_size >> 3; + load64_be_buf(ctx->input, message, nb_words); + ctx->input_idx += nb_words << 3; + message += nb_words << 3; + message_size -= nb_words << 3; + + // Remaining bytes + FOR (i, 0, message_size) { + sha512_set_input(ctx, message[i]); + ctx->input_idx++; + } + } +} + +void crypto_sha512_final(crypto_sha512_ctx *ctx, u8 hash[64]) +{ + // Add padding bit + if (ctx->input_idx == 0) { + ZERO(ctx->input, 16); + } + sha512_set_input(ctx, 128); + + // Update size + sha512_incr(ctx->input_size, ctx->input_idx * 8); + + // Compress penultimate block (if any) + if (ctx->input_idx > 111) { + sha512_compress(ctx); + ZERO(ctx->input, 14); + } + // Compress last block + ctx->input[14] = ctx->input_size[0]; + ctx->input[15] = ctx->input_size[1]; + sha512_compress(ctx); + + // Copy hash to output (big endian) + FOR (i, 0, 8) { + store64_be(hash + i*8, ctx->hash[i]); + } + + WIPE_CTX(ctx); +} + +void crypto_sha512(u8 hash[64], const u8 *message, size_t message_size) +{ + crypto_sha512_ctx ctx; + crypto_sha512_init (&ctx); + crypto_sha512_update(&ctx, message, message_size); + crypto_sha512_final (&ctx, hash); +} + +//////////////////// +/// HMAC SHA 512 /// +//////////////////// +void crypto_sha512_hmac_init(crypto_sha512_hmac_ctx *ctx, + const u8 *key, size_t key_size) +{ + // hash key if it is too long + if (key_size > 128) { + crypto_sha512(ctx->key, key, key_size); + key = ctx->key; + key_size = 64; + } + // Compute inner key: padded key XOR 0x36 + FOR (i, 0, key_size) { ctx->key[i] = key[i] ^ 0x36; } + FOR (i, key_size, 128) { ctx->key[i] = 0x36; } + // Start computing inner hash + crypto_sha512_init (&ctx->ctx); + crypto_sha512_update(&ctx->ctx, ctx->key, 128); +} + +void crypto_sha512_hmac_update(crypto_sha512_hmac_ctx *ctx, + const u8 *message, size_t message_size) +{ + crypto_sha512_update(&ctx->ctx, message, message_size); +} + +void crypto_sha512_hmac_final(crypto_sha512_hmac_ctx *ctx, u8 hmac[64]) +{ + // Finish computing inner hash + crypto_sha512_final(&ctx->ctx, hmac); + // Compute outer key: padded key XOR 0x5c + FOR (i, 0, 128) { + ctx->key[i] ^= 0x36 ^ 0x5c; + } + // Compute outer hash + crypto_sha512_init (&ctx->ctx); + crypto_sha512_update(&ctx->ctx, ctx->key , 128); + crypto_sha512_update(&ctx->ctx, hmac, 64); + crypto_sha512_final (&ctx->ctx, hmac); // outer hash + WIPE_CTX(ctx); +} + +void crypto_sha512_hmac(u8 hmac[64], const u8 *key, size_t key_size, + const u8 *message, size_t message_size) +{ + crypto_sha512_hmac_ctx ctx; + crypto_sha512_hmac_init (&ctx, key, key_size); + crypto_sha512_hmac_update(&ctx, message, message_size); + crypto_sha512_hmac_final (&ctx, hmac); +} + +//////////////////// +/// HKDF SHA 512 /// +//////////////////// +void crypto_sha512_hkdf_expand(u8 *okm, size_t okm_size, + const u8 *prk, size_t prk_size, + const u8 *info, size_t info_size) +{ + int not_first = 0; + u8 ctr = 1; + u8 blk[64]; + + while (okm_size > 0) { + size_t out_size = MIN(okm_size, sizeof(blk)); + + crypto_sha512_hmac_ctx ctx; + crypto_sha512_hmac_init(&ctx, prk , prk_size); + if (not_first) { + // For some reason HKDF uses some kind of CBC mode. + // For some reason CTR mode alone wasn't enough. + // Like what, they didn't trust HMAC in 2010? Really?? + crypto_sha512_hmac_update(&ctx, blk , sizeof(blk)); + } + crypto_sha512_hmac_update(&ctx, info, info_size); + crypto_sha512_hmac_update(&ctx, &ctr, 1); + crypto_sha512_hmac_final(&ctx, blk); + + COPY(okm, blk, out_size); + + not_first = 1; + okm += out_size; + okm_size -= out_size; + ctr++; + } +} + +void crypto_sha512_hkdf(u8 *okm , size_t okm_size, + const u8 *ikm , size_t ikm_size, + const u8 *salt, size_t salt_size, + const u8 *info, size_t info_size) +{ + // Extract + u8 prk[64]; + crypto_sha512_hmac(prk, salt, salt_size, ikm, ikm_size); + + // Expand + crypto_sha512_hkdf_expand(okm, okm_size, prk, sizeof(prk), info, info_size); +} + +/////////////// +/// Ed25519 /// +/////////////// +void crypto_ed25519_key_pair(u8 secret_key[64], u8 public_key[32], u8 seed[32]) +{ + u8 a[64]; + COPY(a, seed, 32); // a[ 0..31] = seed + crypto_wipe(seed, 32); + COPY(secret_key, a, 32); // secret key = seed + crypto_sha512(a, a, 32); // a[ 0..31] = scalar + crypto_eddsa_trim_scalar(a, a); // a[ 0..31] = trimmed scalar + crypto_eddsa_scalarbase(public_key, a); // public key = [trimmed scalar]B + COPY(secret_key + 32, public_key, 32); // secret key includes public half + WIPE_BUFFER(a); +} + +static void hash_reduce(u8 h[32], + const u8 *a, size_t a_size, + const u8 *b, size_t b_size, + const u8 *c, size_t c_size, + const u8 *d, size_t d_size) +{ + u8 hash[64]; + crypto_sha512_ctx ctx; + crypto_sha512_init (&ctx); + crypto_sha512_update(&ctx, a, a_size); + crypto_sha512_update(&ctx, b, b_size); + crypto_sha512_update(&ctx, c, c_size); + crypto_sha512_update(&ctx, d, d_size); + crypto_sha512_final (&ctx, hash); + crypto_eddsa_reduce(h, hash); +} + +static void ed25519_dom_sign(u8 signature [64], const u8 secret_key[32], + const u8 *dom, size_t dom_size, + const u8 *message, size_t message_size) +{ + u8 a[64]; // secret scalar and prefix + u8 r[32]; // secret deterministic "random" nonce + u8 h[32]; // publically verifiable hash of the message (not wiped) + u8 R[32]; // first half of the signature (allows overlapping inputs) + const u8 *pk = secret_key + 32; + + crypto_sha512(a, secret_key, 32); + crypto_eddsa_trim_scalar(a, a); + hash_reduce(r, dom, dom_size, a + 32, 32, message, message_size, 0, 0); + crypto_eddsa_scalarbase(R, r); + hash_reduce(h, dom, dom_size, R, 32, pk, 32, message, message_size); + COPY(signature, R, 32); + crypto_eddsa_mul_add(signature + 32, h, a, r); + + WIPE_BUFFER(a); + WIPE_BUFFER(r); +} + +void crypto_ed25519_sign(u8 signature [64], const u8 secret_key[64], + const u8 *message, size_t message_size) +{ + ed25519_dom_sign(signature, secret_key, 0, 0, message, message_size); +} + +int crypto_ed25519_check(const u8 signature[64], const u8 public_key[32], + const u8 *msg, size_t msg_size) +{ + u8 h_ram[32]; + hash_reduce(h_ram, signature, 32, public_key, 32, msg, msg_size, 0, 0); + return crypto_eddsa_check_equation(signature, public_key, h_ram); +} + +static const u8 domain[34] = "SigEd25519 no Ed25519 collisions\1"; + +void crypto_ed25519_ph_sign(uint8_t signature[64], const uint8_t secret_key[64], + const uint8_t message_hash[64]) +{ + ed25519_dom_sign(signature, secret_key, domain, sizeof(domain), + message_hash, 64); +} + +int crypto_ed25519_ph_check(const uint8_t sig[64], const uint8_t pk[32], + const uint8_t msg_hash[64]) +{ + u8 h_ram[32]; + hash_reduce(h_ram, domain, sizeof(domain), sig, 32, pk, 32, msg_hash, 64); + return crypto_eddsa_check_equation(sig, pk, h_ram); +} + + +#ifdef MONOCYPHER_CPP_NAMESPACE +} +#endif diff --git a/driver/dist/vendor/monocypher/monocypher-ed25519.h b/driver/dist/vendor/monocypher/monocypher-ed25519.h @@ -0,0 +1,140 @@ +// Monocypher version 4.0.2 +// +// This file is dual-licensed. Choose whichever licence you want from +// the two licences listed below. +// +// The first licence is a regular 2-clause BSD licence. The second licence +// is the CC-0 from Creative Commons. It is intended to release Monocypher +// to the public domain. The BSD licence serves as a fallback option. +// +// SPDX-License-Identifier: BSD-2-Clause OR CC0-1.0 +// +// ------------------------------------------------------------------------ +// +// Copyright (c) 2017-2019, Loup Vaillant +// All rights reserved. +// +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the +// distribution. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// ------------------------------------------------------------------------ +// +// Written in 2017-2019 by Loup Vaillant +// +// To the extent possible under law, the author(s) have dedicated all copyright +// and related neighboring rights to this software to the public domain +// worldwide. This software is distributed without any warranty. +// +// You should have received a copy of the CC0 Public Domain Dedication along +// with this software. If not, see +// <https://creativecommons.org/publicdomain/zero/1.0/> + +#ifndef ED25519_H +#define ED25519_H + +#include "monocypher.h" + +#ifdef MONOCYPHER_CPP_NAMESPACE +namespace MONOCYPHER_CPP_NAMESPACE { +#elif defined(__cplusplus) +extern "C" { +#endif + +//////////////////////// +/// Type definitions /// +//////////////////////// + +// Do not rely on the size or content on any of those types, +// they may change without notice. +typedef struct { + uint64_t hash[8]; + uint64_t input[16]; + uint64_t input_size[2]; + size_t input_idx; +} crypto_sha512_ctx; + +typedef struct { + uint8_t key[128]; + crypto_sha512_ctx ctx; +} crypto_sha512_hmac_ctx; + + +// SHA 512 +// ------- +void crypto_sha512_init (crypto_sha512_ctx *ctx); +void crypto_sha512_update(crypto_sha512_ctx *ctx, + const uint8_t *message, size_t message_size); +void crypto_sha512_final (crypto_sha512_ctx *ctx, uint8_t hash[64]); +void crypto_sha512(uint8_t hash[64], + const uint8_t *message, size_t message_size); + +// SHA 512 HMAC +// ------------ +void crypto_sha512_hmac_init(crypto_sha512_hmac_ctx *ctx, + const uint8_t *key, size_t key_size); +void crypto_sha512_hmac_update(crypto_sha512_hmac_ctx *ctx, + const uint8_t *message, size_t message_size); +void crypto_sha512_hmac_final(crypto_sha512_hmac_ctx *ctx, uint8_t hmac[64]); +void crypto_sha512_hmac(uint8_t hmac[64], + const uint8_t *key , size_t key_size, + const uint8_t *message, size_t message_size); + +// SHA 512 HKDF +// ------------ +void crypto_sha512_hkdf_expand(uint8_t *okm, size_t okm_size, + const uint8_t *prk, size_t prk_size, + const uint8_t *info, size_t info_size); +void crypto_sha512_hkdf(uint8_t *okm , size_t okm_size, + const uint8_t *ikm , size_t ikm_size, + const uint8_t *salt, size_t salt_size, + const uint8_t *info, size_t info_size); + +// Ed25519 +// ------- +// Signatures (EdDSA with curve25519 + SHA-512) +// -------------------------------------------- +void crypto_ed25519_key_pair(uint8_t secret_key[64], + uint8_t public_key[32], + uint8_t seed[32]); +void crypto_ed25519_sign(uint8_t signature [64], + const uint8_t secret_key[64], + const uint8_t *message, size_t message_size); +int crypto_ed25519_check(const uint8_t signature [64], + const uint8_t public_key[32], + const uint8_t *message, size_t message_size); + +// Pre-hash variants +void crypto_ed25519_ph_sign(uint8_t signature [64], + const uint8_t secret_key [64], + const uint8_t message_hash[64]); +int crypto_ed25519_ph_check(const uint8_t signature [64], + const uint8_t public_key [32], + const uint8_t message_hash[64]); + +#ifdef __cplusplus +} +#endif + +#endif // ED25519_H diff --git a/driver/dist/vendor/monocypher/monocypher.c b/driver/dist/vendor/monocypher/monocypher.c @@ -0,0 +1,2956 @@ +// Monocypher version 4.0.2 +// +// This file is dual-licensed. Choose whichever licence you want from +// the two licences listed below. +// +// The first licence is a regular 2-clause BSD licence. The second licence +// is the CC-0 from Creative Commons. It is intended to release Monocypher +// to the public domain. The BSD licence serves as a fallback option. +// +// SPDX-License-Identifier: BSD-2-Clause OR CC0-1.0 +// +// ------------------------------------------------------------------------ +// +// Copyright (c) 2017-2020, Loup Vaillant +// All rights reserved. +// +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the +// distribution. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// ------------------------------------------------------------------------ +// +// Written in 2017-2020 by Loup Vaillant +// +// To the extent possible under law, the author(s) have dedicated all copyright +// and related neighboring rights to this software to the public domain +// worldwide. This software is distributed without any warranty. +// +// You should have received a copy of the CC0 Public Domain Dedication along +// with this software. If not, see +// <https://creativecommons.org/publicdomain/zero/1.0/> + +#include "monocypher.h" + +#ifdef MONOCYPHER_CPP_NAMESPACE +namespace MONOCYPHER_CPP_NAMESPACE { +#endif + +///////////////// +/// Utilities /// +///////////////// +#define FOR_T(type, i, start, end) for (type i = (start); i < (end); i++) +#define FOR(i, start, end) FOR_T(size_t, i, start, end) +#define COPY(dst, src, size) FOR(_i_, 0, size) (dst)[_i_] = (src)[_i_] +#define ZERO(buf, size) FOR(_i_, 0, size) (buf)[_i_] = 0 +#define WIPE_CTX(ctx) crypto_wipe(ctx , sizeof(*(ctx))) +#define WIPE_BUFFER(buffer) crypto_wipe(buffer, sizeof(buffer)) +#define MIN(a, b) ((a) <= (b) ? (a) : (b)) +#define MAX(a, b) ((a) >= (b) ? (a) : (b)) + +typedef int8_t i8; +typedef uint8_t u8; +typedef int16_t i16; +typedef uint32_t u32; +typedef int32_t i32; +typedef int64_t i64; +typedef uint64_t u64; + +static const u8 zero[128] = {0}; + +// returns the smallest positive integer y such that +// (x + y) % pow_2 == 0 +// Basically, y is the "gap" missing to align x. +// Only works when pow_2 is a power of 2. +// Note: we use ~x+1 instead of -x to avoid compiler warnings +static size_t gap(size_t x, size_t pow_2) +{ + return (~x + 1) & (pow_2 - 1); +} + +static u32 load24_le(const u8 s[3]) +{ + return + ((u32)s[0] << 0) | + ((u32)s[1] << 8) | + ((u32)s[2] << 16); +} + +static u32 load32_le(const u8 s[4]) +{ + return + ((u32)s[0] << 0) | + ((u32)s[1] << 8) | + ((u32)s[2] << 16) | + ((u32)s[3] << 24); +} + +static u64 load64_le(const u8 s[8]) +{ + return load32_le(s) | ((u64)load32_le(s+4) << 32); +} + +static void store32_le(u8 out[4], u32 in) +{ + out[0] = in & 0xff; + out[1] = (in >> 8) & 0xff; + out[2] = (in >> 16) & 0xff; + out[3] = (in >> 24) & 0xff; +} + +static void store64_le(u8 out[8], u64 in) +{ + store32_le(out , (u32)in ); + store32_le(out + 4, in >> 32); +} + +static void load32_le_buf (u32 *dst, const u8 *src, size_t size) { + FOR(i, 0, size) { dst[i] = load32_le(src + i*4); } +} +static void load64_le_buf (u64 *dst, const u8 *src, size_t size) { + FOR(i, 0, size) { dst[i] = load64_le(src + i*8); } +} +static void store32_le_buf(u8 *dst, const u32 *src, size_t size) { + FOR(i, 0, size) { store32_le(dst + i*4, src[i]); } +} +static void store64_le_buf(u8 *dst, const u64 *src, size_t size) { + FOR(i, 0, size) { store64_le(dst + i*8, src[i]); } +} + +static u64 rotr64(u64 x, u64 n) { return (x >> n) ^ (x << (64 - n)); } +static u32 rotl32(u32 x, u32 n) { return (x << n) ^ (x >> (32 - n)); } + +static int neq0(u64 diff) +{ + // constant time comparison to zero + // return diff != 0 ? -1 : 0 + u64 half = (diff >> 32) | ((u32)diff); + return (1 & ((half - 1) >> 32)) - 1; +} + +static u64 x16(const u8 a[16], const u8 b[16]) +{ + return (load64_le(a + 0) ^ load64_le(b + 0)) + | (load64_le(a + 8) ^ load64_le(b + 8)); +} +static u64 x32(const u8 a[32],const u8 b[32]){return x16(a,b)| x16(a+16, b+16);} +static u64 x64(const u8 a[64],const u8 b[64]){return x32(a,b)| x32(a+32, b+32);} +int crypto_verify16(const u8 a[16], const u8 b[16]){ return neq0(x16(a, b)); } +int crypto_verify32(const u8 a[32], const u8 b[32]){ return neq0(x32(a, b)); } +int crypto_verify64(const u8 a[64], const u8 b[64]){ return neq0(x64(a, b)); } + +void crypto_wipe(void *secret, size_t size) +{ + volatile u8 *v_secret = (u8*)secret; + ZERO(v_secret, size); +} + +///////////////// +/// Chacha 20 /// +///////////////// +#define QUARTERROUND(a, b, c, d) \ + a += b; d = rotl32(d ^ a, 16); \ + c += d; b = rotl32(b ^ c, 12); \ + a += b; d = rotl32(d ^ a, 8); \ + c += d; b = rotl32(b ^ c, 7) + +static void chacha20_rounds(u32 out[16], const u32 in[16]) +{ + // The temporary variables make Chacha20 10% faster. + u32 t0 = in[ 0]; u32 t1 = in[ 1]; u32 t2 = in[ 2]; u32 t3 = in[ 3]; + u32 t4 = in[ 4]; u32 t5 = in[ 5]; u32 t6 = in[ 6]; u32 t7 = in[ 7]; + u32 t8 = in[ 8]; u32 t9 = in[ 9]; u32 t10 = in[10]; u32 t11 = in[11]; + u32 t12 = in[12]; u32 t13 = in[13]; u32 t14 = in[14]; u32 t15 = in[15]; + + FOR (i, 0, 10) { // 20 rounds, 2 rounds per loop. + QUARTERROUND(t0, t4, t8 , t12); // column 0 + QUARTERROUND(t1, t5, t9 , t13); // column 1 + QUARTERROUND(t2, t6, t10, t14); // column 2 + QUARTERROUND(t3, t7, t11, t15); // column 3 + QUARTERROUND(t0, t5, t10, t15); // diagonal 0 + QUARTERROUND(t1, t6, t11, t12); // diagonal 1 + QUARTERROUND(t2, t7, t8 , t13); // diagonal 2 + QUARTERROUND(t3, t4, t9 , t14); // diagonal 3 + } + out[ 0] = t0; out[ 1] = t1; out[ 2] = t2; out[ 3] = t3; + out[ 4] = t4; out[ 5] = t5; out[ 6] = t6; out[ 7] = t7; + out[ 8] = t8; out[ 9] = t9; out[10] = t10; out[11] = t11; + out[12] = t12; out[13] = t13; out[14] = t14; out[15] = t15; +} + +static const u8 *chacha20_constant = (const u8*)"expand 32-byte k"; // 16 bytes + +void crypto_chacha20_h(u8 out[32], const u8 key[32], const u8 in [16]) +{ + u32 block[16]; + load32_le_buf(block , chacha20_constant, 4); + load32_le_buf(block + 4, key , 8); + load32_le_buf(block + 12, in , 4); + + chacha20_rounds(block, block); + + // prevent reversal of the rounds by revealing only half of the buffer. + store32_le_buf(out , block , 4); // constant + store32_le_buf(out+16, block+12, 4); // counter and nonce + WIPE_BUFFER(block); +} + +u64 crypto_chacha20_djb(u8 *cipher_text, const u8 *plain_text, + size_t text_size, const u8 key[32], const u8 nonce[8], + u64 ctr) +{ + u32 input[16]; + load32_le_buf(input , chacha20_constant, 4); + load32_le_buf(input + 4, key , 8); + load32_le_buf(input + 14, nonce , 2); + input[12] = (u32) ctr; + input[13] = (u32)(ctr >> 32); + + // Whole blocks + u32 pool[16]; + size_t nb_blocks = text_size >> 6; + FOR (i, 0, nb_blocks) { + chacha20_rounds(pool, input); + if (plain_text != 0) { + FOR (j, 0, 16) { + u32 p = pool[j] + input[j]; + store32_le(cipher_text, p ^ load32_le(plain_text)); + cipher_text += 4; + plain_text += 4; + } + } else { + FOR (j, 0, 16) { + u32 p = pool[j] + input[j]; + store32_le(cipher_text, p); + cipher_text += 4; + } + } + input[12]++; + if (input[12] == 0) { + input[13]++; + } + } + text_size &= 63; + + // Last (incomplete) block + if (text_size > 0) { + if (plain_text == 0) { + plain_text = zero; + } + chacha20_rounds(pool, input); + u8 tmp[64]; + FOR (i, 0, 16) { + store32_le(tmp + i*4, pool[i] + input[i]); + } + FOR (i, 0, text_size) { + cipher_text[i] = tmp[i] ^ plain_text[i]; + } + WIPE_BUFFER(tmp); + } + ctr = input[12] + ((u64)input[13] << 32) + (text_size > 0); + + WIPE_BUFFER(pool); + WIPE_BUFFER(input); + return ctr; +} + +u32 crypto_chacha20_ietf(u8 *cipher_text, const u8 *plain_text, + size_t text_size, + const u8 key[32], const u8 nonce[12], u32 ctr) +{ + u64 big_ctr = ctr + ((u64)load32_le(nonce) << 32); + return (u32)crypto_chacha20_djb(cipher_text, plain_text, text_size, + key, nonce + 4, big_ctr); +} + +u64 crypto_chacha20_x(u8 *cipher_text, const u8 *plain_text, + size_t text_size, + const u8 key[32], const u8 nonce[24], u64 ctr) +{ + u8 sub_key[32]; + crypto_chacha20_h(sub_key, key, nonce); + ctr = crypto_chacha20_djb(cipher_text, plain_text, text_size, + sub_key, nonce + 16, ctr); + WIPE_BUFFER(sub_key); + return ctr; +} + +///////////////// +/// Poly 1305 /// +///////////////// + +// h = (h + c) * r +// preconditions: +// ctx->h <= 4_ffffffff_ffffffff_ffffffff_ffffffff +// ctx->r <= 0ffffffc_0ffffffc_0ffffffc_0fffffff +// end <= 1 +// Postcondition: +// ctx->h <= 4_ffffffff_ffffffff_ffffffff_ffffffff +static void poly_blocks(crypto_poly1305_ctx *ctx, const u8 *in, + size_t nb_blocks, unsigned end) +{ + // Local all the things! + const u32 r0 = ctx->r[0]; + const u32 r1 = ctx->r[1]; + const u32 r2 = ctx->r[2]; + const u32 r3 = ctx->r[3]; + const u32 rr0 = (r0 >> 2) * 5; // lose 2 bits... + const u32 rr1 = (r1 >> 2) + r1; // rr1 == (r1 >> 2) * 5 + const u32 rr2 = (r2 >> 2) + r2; // rr1 == (r2 >> 2) * 5 + const u32 rr3 = (r3 >> 2) + r3; // rr1 == (r3 >> 2) * 5 + const u32 rr4 = r0 & 3; // ...recover 2 bits + u32 h0 = ctx->h[0]; + u32 h1 = ctx->h[1]; + u32 h2 = ctx->h[2]; + u32 h3 = ctx->h[3]; + u32 h4 = ctx->h[4]; + + FOR (i, 0, nb_blocks) { + // h + c, without carry propagation + const u64 s0 = (u64)h0 + load32_le(in); in += 4; + const u64 s1 = (u64)h1 + load32_le(in); in += 4; + const u64 s2 = (u64)h2 + load32_le(in); in += 4; + const u64 s3 = (u64)h3 + load32_le(in); in += 4; + const u32 s4 = h4 + end; + + // (h + c) * r, without carry propagation + const u64 x0 = s0*r0+ s1*rr3+ s2*rr2+ s3*rr1+ s4*rr0; + const u64 x1 = s0*r1+ s1*r0 + s2*rr3+ s3*rr2+ s4*rr1; + const u64 x2 = s0*r2+ s1*r1 + s2*r0 + s3*rr3+ s4*rr2; + const u64 x3 = s0*r3+ s1*r2 + s2*r1 + s3*r0 + s4*rr3; + const u32 x4 = s4*rr4; + + // partial reduction modulo 2^130 - 5 + const u32 u5 = x4 + (x3 >> 32); // u5 <= 7ffffff5 + const u64 u0 = (u5 >> 2) * 5 + (x0 & 0xffffffff); + const u64 u1 = (u0 >> 32) + (x1 & 0xffffffff) + (x0 >> 32); + const u64 u2 = (u1 >> 32) + (x2 & 0xffffffff) + (x1 >> 32); + const u64 u3 = (u2 >> 32) + (x3 & 0xffffffff) + (x2 >> 32); + const u32 u4 = (u3 >> 32) + (u5 & 3); // u4 <= 4 + + // Update the hash + h0 = u0 & 0xffffffff; + h1 = u1 & 0xffffffff; + h2 = u2 & 0xffffffff; + h3 = u3 & 0xffffffff; + h4 = u4; + } + ctx->h[0] = h0; + ctx->h[1] = h1; + ctx->h[2] = h2; + ctx->h[3] = h3; + ctx->h[4] = h4; +} + +void crypto_poly1305_init(crypto_poly1305_ctx *ctx, const u8 key[32]) +{ + ZERO(ctx->h, 5); // Initial hash is zero + ctx->c_idx = 0; + // load r and pad (r has some of its bits cleared) + load32_le_buf(ctx->r , key , 4); + load32_le_buf(ctx->pad, key+16, 4); + FOR (i, 0, 1) { ctx->r[i] &= 0x0fffffff; } + FOR (i, 1, 4) { ctx->r[i] &= 0x0ffffffc; } +} + +void crypto_poly1305_update(crypto_poly1305_ctx *ctx, + const u8 *message, size_t message_size) +{ + // Avoid undefined NULL pointer increments with empty messages + if (message_size == 0) { + return; + } + + // Align ourselves with block boundaries + size_t aligned = MIN(gap(ctx->c_idx, 16), message_size); + FOR (i, 0, aligned) { + ctx->c[ctx->c_idx] = *message; + ctx->c_idx++; + message++; + message_size--; + } + + // If block is complete, process it + if (ctx->c_idx == 16) { + poly_blocks(ctx, ctx->c, 1, 1); + ctx->c_idx = 0; + } + + // Process the message block by block + size_t nb_blocks = message_size >> 4; + poly_blocks(ctx, message, nb_blocks, 1); + message += nb_blocks << 4; + message_size &= 15; + + // remaining bytes (we never complete a block here) + FOR (i, 0, message_size) { + ctx->c[ctx->c_idx] = message[i]; + ctx->c_idx++; + } +} + +void crypto_poly1305_final(crypto_poly1305_ctx *ctx, u8 mac[16]) +{ + // Process the last block (if any) + // We move the final 1 according to remaining input length + // (this will add less than 2^130 to the last input block) + if (ctx->c_idx != 0) { + ZERO(ctx->c + ctx->c_idx, 16 - ctx->c_idx); + ctx->c[ctx->c_idx] = 1; + poly_blocks(ctx, ctx->c, 1, 0); + } + + // check if we should subtract 2^130-5 by performing the + // corresponding carry propagation. + u64 c = 5; + FOR (i, 0, 4) { + c += ctx->h[i]; + c >>= 32; + } + c += ctx->h[4]; + c = (c >> 2) * 5; // shift the carry back to the beginning + // c now indicates how many times we should subtract 2^130-5 (0 or 1) + FOR (i, 0, 4) { + c += (u64)ctx->h[i] + ctx->pad[i]; + store32_le(mac + i*4, (u32)c); + c = c >> 32; + } + WIPE_CTX(ctx); +} + +void crypto_poly1305(u8 mac[16], const u8 *message, + size_t message_size, const u8 key[32]) +{ + crypto_poly1305_ctx ctx; + crypto_poly1305_init (&ctx, key); + crypto_poly1305_update(&ctx, message, message_size); + crypto_poly1305_final (&ctx, mac); +} + +//////////////// +/// BLAKE2 b /// +//////////////// +static const u64 iv[8] = { + 0x6a09e667f3bcc908, 0xbb67ae8584caa73b, + 0x3c6ef372fe94f82b, 0xa54ff53a5f1d36f1, + 0x510e527fade682d1, 0x9b05688c2b3e6c1f, + 0x1f83d9abfb41bd6b, 0x5be0cd19137e2179, +}; + +static void blake2b_compress(crypto_blake2b_ctx *ctx, int is_last_block) +{ + static const u8 sigma[12][16] = { + { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 }, + { 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 }, + { 11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4 }, + { 7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8 }, + { 9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13 }, + { 2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9 }, + { 12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11 }, + { 13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10 }, + { 6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5 }, + { 10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13, 0 }, + { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 }, + { 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 }, + }; + + // increment input offset + u64 *x = ctx->input_offset; + size_t y = ctx->input_idx; + x[0] += y; + if (x[0] < y) { + x[1]++; + } + + // init work vector + u64 v0 = ctx->hash[0]; u64 v8 = iv[0]; + u64 v1 = ctx->hash[1]; u64 v9 = iv[1]; + u64 v2 = ctx->hash[2]; u64 v10 = iv[2]; + u64 v3 = ctx->hash[3]; u64 v11 = iv[3]; + u64 v4 = ctx->hash[4]; u64 v12 = iv[4] ^ ctx->input_offset[0]; + u64 v5 = ctx->hash[5]; u64 v13 = iv[5] ^ ctx->input_offset[1]; + u64 v6 = ctx->hash[6]; u64 v14 = iv[6] ^ (u64)~(is_last_block - 1); + u64 v7 = ctx->hash[7]; u64 v15 = iv[7]; + + // mangle work vector + u64 *input = ctx->input; +#define BLAKE2_G(a, b, c, d, x, y) \ + a += b + x; d = rotr64(d ^ a, 32); \ + c += d; b = rotr64(b ^ c, 24); \ + a += b + y; d = rotr64(d ^ a, 16); \ + c += d; b = rotr64(b ^ c, 63) +#define BLAKE2_ROUND(i) \ + BLAKE2_G(v0, v4, v8 , v12, input[sigma[i][ 0]], input[sigma[i][ 1]]); \ + BLAKE2_G(v1, v5, v9 , v13, input[sigma[i][ 2]], input[sigma[i][ 3]]); \ + BLAKE2_G(v2, v6, v10, v14, input[sigma[i][ 4]], input[sigma[i][ 5]]); \ + BLAKE2_G(v3, v7, v11, v15, input[sigma[i][ 6]], input[sigma[i][ 7]]); \ + BLAKE2_G(v0, v5, v10, v15, input[sigma[i][ 8]], input[sigma[i][ 9]]); \ + BLAKE2_G(v1, v6, v11, v12, input[sigma[i][10]], input[sigma[i][11]]); \ + BLAKE2_G(v2, v7, v8 , v13, input[sigma[i][12]], input[sigma[i][13]]); \ + BLAKE2_G(v3, v4, v9 , v14, input[sigma[i][14]], input[sigma[i][15]]) + +#ifdef BLAKE2_NO_UNROLLING + FOR (i, 0, 12) { + BLAKE2_ROUND(i); + } +#else + BLAKE2_ROUND(0); BLAKE2_ROUND(1); BLAKE2_ROUND(2); BLAKE2_ROUND(3); + BLAKE2_ROUND(4); BLAKE2_ROUND(5); BLAKE2_ROUND(6); BLAKE2_ROUND(7); + BLAKE2_ROUND(8); BLAKE2_ROUND(9); BLAKE2_ROUND(10); BLAKE2_ROUND(11); +#endif + + // update hash + ctx->hash[0] ^= v0 ^ v8; ctx->hash[1] ^= v1 ^ v9; + ctx->hash[2] ^= v2 ^ v10; ctx->hash[3] ^= v3 ^ v11; + ctx->hash[4] ^= v4 ^ v12; ctx->hash[5] ^= v5 ^ v13; + ctx->hash[6] ^= v6 ^ v14; ctx->hash[7] ^= v7 ^ v15; +} + +void crypto_blake2b_keyed_init(crypto_blake2b_ctx *ctx, size_t hash_size, + const u8 *key, size_t key_size) +{ + // initial hash + COPY(ctx->hash, iv, 8); + ctx->hash[0] ^= 0x01010000 ^ (key_size << 8) ^ hash_size; + + ctx->input_offset[0] = 0; // beginning of the input, no offset + ctx->input_offset[1] = 0; // beginning of the input, no offset + ctx->hash_size = hash_size; + ctx->input_idx = 0; + ZERO(ctx->input, 16); + + // if there is a key, the first block is that key (padded with zeroes) + if (key_size > 0) { + u8 key_block[128] = {0}; + COPY(key_block, key, key_size); + // same as calling crypto_blake2b_update(ctx, key_block , 128) + load64_le_buf(ctx->input, key_block, 16); + ctx->input_idx = 128; + } +} + +void crypto_blake2b_init(crypto_blake2b_ctx *ctx, size_t hash_size) +{ + crypto_blake2b_keyed_init(ctx, hash_size, 0, 0); +} + +void crypto_blake2b_update(crypto_blake2b_ctx *ctx, + const u8 *message, size_t message_size) +{ + // Avoid undefined NULL pointer increments with empty messages + if (message_size == 0) { + return; + } + + // Align with word boundaries + if ((ctx->input_idx & 7) != 0) { + size_t nb_bytes = MIN(gap(ctx->input_idx, 8), message_size); + size_t word = ctx->input_idx >> 3; + size_t byte = ctx->input_idx & 7; + FOR (i, 0, nb_bytes) { + ctx->input[word] |= (u64)message[i] << ((byte + i) << 3); + } + ctx->input_idx += nb_bytes; + message += nb_bytes; + message_size -= nb_bytes; + } + + // Align with block boundaries (faster than byte by byte) + if ((ctx->input_idx & 127) != 0) { + size_t nb_words = MIN(gap(ctx->input_idx, 128), message_size) >> 3; + load64_le_buf(ctx->input + (ctx->input_idx >> 3), message, nb_words); + ctx->input_idx += nb_words << 3; + message += nb_words << 3; + message_size -= nb_words << 3; + } + + // Process block by block + size_t nb_blocks = message_size >> 7; + FOR (i, 0, nb_blocks) { + if (ctx->input_idx == 128) { + blake2b_compress(ctx, 0); + } + load64_le_buf(ctx->input, message, 16); + message += 128; + ctx->input_idx = 128; + } + message_size &= 127; + + if (message_size != 0) { + // Compress block & flush input buffer as needed + if (ctx->input_idx == 128) { + blake2b_compress(ctx, 0); + ctx->input_idx = 0; + } + if (ctx->input_idx == 0) { + ZERO(ctx->input, 16); + } + // Fill remaining words (faster than byte by byte) + size_t nb_words = message_size >> 3; + load64_le_buf(ctx->input, message, nb_words); + ctx->input_idx += nb_words << 3; + message += nb_words << 3; + message_size -= nb_words << 3; + + // Fill remaining bytes + FOR (i, 0, message_size) { + size_t word = ctx->input_idx >> 3; + size_t byte = ctx->input_idx & 7; + ctx->input[word] |= (u64)message[i] << (byte << 3); + ctx->input_idx++; + } + } +} + +void crypto_blake2b_final(crypto_blake2b_ctx *ctx, u8 *hash) +{ + blake2b_compress(ctx, 1); // compress the last block + size_t hash_size = MIN(ctx->hash_size, 64); + size_t nb_words = hash_size >> 3; + store64_le_buf(hash, ctx->hash, nb_words); + FOR (i, nb_words << 3, hash_size) { + hash[i] = (ctx->hash[i >> 3] >> (8 * (i & 7))) & 0xff; + } + WIPE_CTX(ctx); +} + +void crypto_blake2b_keyed(u8 *hash, size_t hash_size, + const u8 *key, size_t key_size, + const u8 *message, size_t message_size) +{ + crypto_blake2b_ctx ctx; + crypto_blake2b_keyed_init(&ctx, hash_size, key, key_size); + crypto_blake2b_update (&ctx, message, message_size); + crypto_blake2b_final (&ctx, hash); +} + +void crypto_blake2b(u8 *hash, size_t hash_size, const u8 *msg, size_t msg_size) +{ + crypto_blake2b_keyed(hash, hash_size, 0, 0, msg, msg_size); +} + +////////////// +/// Argon2 /// +////////////// +// references to R, Z, Q etc. come from the spec + +// Argon2 operates on 1024 byte blocks. +typedef struct { u64 a[128]; } blk; + +// updates a BLAKE2 hash with a 32 bit word, little endian. +static void blake_update_32(crypto_blake2b_ctx *ctx, u32 input) +{ + u8 buf[4]; + store32_le(buf, input); + crypto_blake2b_update(ctx, buf, 4); + WIPE_BUFFER(buf); +} + +static void blake_update_32_buf(crypto_blake2b_ctx *ctx, + const u8 *buf, u32 size) +{ + blake_update_32(ctx, size); + crypto_blake2b_update(ctx, buf, size); +} + + +static void copy_block(blk *o,const blk*in){FOR(i, 0, 128) o->a[i] = in->a[i];} +static void xor_block(blk *o,const blk*in){FOR(i, 0, 128) o->a[i] ^= in->a[i];} + +// Hash with a virtually unlimited digest size. +// Doesn't extract more entropy than the base hash function. +// Mainly used for filling a whole kilobyte block with pseudo-random bytes. +// (One could use a stream cipher with a seed hash as the key, but +// this would introduce another dependency —and point of failure.) +static void extended_hash(u8 *digest, u32 digest_size, + const u8 *input , u32 input_size) +{ + crypto_blake2b_ctx ctx; + crypto_blake2b_init (&ctx, MIN(digest_size, 64)); + blake_update_32 (&ctx, digest_size); + crypto_blake2b_update(&ctx, input, input_size); + crypto_blake2b_final (&ctx, digest); + + if (digest_size > 64) { + // the conversion to u64 avoids integer overflow on + // ludicrously big hash sizes. + u32 r = (u32)(((u64)digest_size + 31) >> 5) - 2; + u32 i = 1; + u32 in = 0; + u32 out = 32; + while (i < r) { + // Input and output overlap. This is intentional + crypto_blake2b(digest + out, 64, digest + in, 64); + i += 1; + in += 32; + out += 32; + } + crypto_blake2b(digest + out, digest_size - (32 * r), digest + in , 64); + } +} + +#define LSB(x) ((u64)(u32)x) +#define G(a, b, c, d) \ + a += b + ((LSB(a) * LSB(b)) << 1); d ^= a; d = rotr64(d, 32); \ + c += d + ((LSB(c) * LSB(d)) << 1); b ^= c; b = rotr64(b, 24); \ + a += b + ((LSB(a) * LSB(b)) << 1); d ^= a; d = rotr64(d, 16); \ + c += d + ((LSB(c) * LSB(d)) << 1); b ^= c; b = rotr64(b, 63) +#define ROUND(v0, v1, v2, v3, v4, v5, v6, v7, \ + v8, v9, v10, v11, v12, v13, v14, v15) \ + G(v0, v4, v8, v12); G(v1, v5, v9, v13); \ + G(v2, v6, v10, v14); G(v3, v7, v11, v15); \ + G(v0, v5, v10, v15); G(v1, v6, v11, v12); \ + G(v2, v7, v8, v13); G(v3, v4, v9, v14) + +// Core of the compression function G. Computes Z from R in place. +static void g_rounds(blk *b) +{ + // column rounds (work_block = Q) + for (int i = 0; i < 128; i += 16) { + ROUND(b->a[i ], b->a[i+ 1], b->a[i+ 2], b->a[i+ 3], + b->a[i+ 4], b->a[i+ 5], b->a[i+ 6], b->a[i+ 7], + b->a[i+ 8], b->a[i+ 9], b->a[i+10], b->a[i+11], + b->a[i+12], b->a[i+13], b->a[i+14], b->a[i+15]); + } + // row rounds (b = Z) + for (int i = 0; i < 16; i += 2) { + ROUND(b->a[i ], b->a[i+ 1], b->a[i+ 16], b->a[i+ 17], + b->a[i+32], b->a[i+33], b->a[i+ 48], b->a[i+ 49], + b->a[i+64], b->a[i+65], b->a[i+ 80], b->a[i+ 81], + b->a[i+96], b->a[i+97], b->a[i+112], b->a[i+113]); + } +} + +const crypto_argon2_extras crypto_argon2_no_extras = { 0, 0, 0, 0 }; + +void crypto_argon2(u8 *hash, u32 hash_size, void *work_area, + crypto_argon2_config config, + crypto_argon2_inputs inputs, + crypto_argon2_extras extras) +{ + const u32 segment_size = config.nb_blocks / config.nb_lanes / 4; + const u32 lane_size = segment_size * 4; + const u32 nb_blocks = lane_size * config.nb_lanes; // rounding down + + // work area seen as blocks (must be suitably aligned) + blk *blocks = (blk*)work_area; + { + u8 initial_hash[72]; // 64 bytes plus 2 words for future hashes + crypto_blake2b_ctx ctx; + crypto_blake2b_init (&ctx, 64); + blake_update_32 (&ctx, config.nb_lanes ); // p: number of "threads" + blake_update_32 (&ctx, hash_size); + blake_update_32 (&ctx, config.nb_blocks); + blake_update_32 (&ctx, config.nb_passes); + blake_update_32 (&ctx, 0x13); // v: version number + blake_update_32 (&ctx, config.algorithm); // y: Argon2i, Argon2d... + blake_update_32_buf (&ctx, inputs.pass, inputs.pass_size); + blake_update_32_buf (&ctx, inputs.salt, inputs.salt_size); + blake_update_32_buf (&ctx, extras.key, extras.key_size); + blake_update_32_buf (&ctx, extras.ad, extras.ad_size); + crypto_blake2b_final(&ctx, initial_hash); // fill 64 first bytes only + + // fill first 2 blocks of each lane + u8 hash_area[1024]; + FOR_T(u32, l, 0, config.nb_lanes) { + FOR_T(u32, i, 0, 2) { + store32_le(initial_hash + 64, i); // first additional word + store32_le(initial_hash + 68, l); // second additional word + extended_hash(hash_area, 1024, initial_hash, 72); + load64_le_buf(blocks[l * lane_size + i].a, hash_area, 128); + } + } + + WIPE_BUFFER(initial_hash); + WIPE_BUFFER(hash_area); + } + + // Argon2i and Argon2id start with constant time indexing + int constant_time = config.algorithm != CRYPTO_ARGON2_D; + + // Fill (and re-fill) the rest of the blocks + // + // Note: even though each segment within the same slice can be + // computed in parallel, (one thread per lane), we are computing + // them sequentially, because Monocypher doesn't support threads. + // + // Yet optimal performance (and therefore security) requires one + // thread per lane. The only reason Monocypher supports multiple + // lanes is compatibility. + blk tmp; + FOR_T(u32, pass, 0, config.nb_passes) { + FOR_T(u32, slice, 0, 4) { + // On the first slice of the first pass, + // blocks 0 and 1 are already filled, hence pass_offset. + u32 pass_offset = pass == 0 && slice == 0 ? 2 : 0; + u32 slice_offset = slice * segment_size; + + // Argon2id switches back to non-constant time indexing + // after the first two slices of the first pass + if (slice == 2 && config.algorithm == CRYPTO_ARGON2_ID) { + constant_time = 0; + } + + // Each iteration of the following loop may be performed in + // a separate thread. All segments must be fully completed + // before we start filling the next slice. + FOR_T(u32, segment, 0, config.nb_lanes) { + blk index_block; + u32 index_ctr = 1; + FOR_T (u32, block, pass_offset, segment_size) { + // Current and previous blocks + u32 lane_offset = segment * lane_size; + blk *segment_start = blocks + lane_offset + slice_offset; + blk *current = segment_start + block; + blk *previous = + block == 0 && slice_offset == 0 + ? segment_start + lane_size - 1 + : segment_start + block - 1; + + u64 index_seed; + if (constant_time) { + if (block == pass_offset || (block % 128) == 0) { + // Fill or refresh deterministic indices block + + // seed the beginning of the block... + ZERO(index_block.a, 128); + index_block.a[0] = pass; + index_block.a[1] = segment; + index_block.a[2] = slice; + index_block.a[3] = nb_blocks; + index_block.a[4] = config.nb_passes; + index_block.a[5] = config.algorithm; + index_block.a[6] = index_ctr; + index_ctr++; + + // ... then shuffle it + copy_block(&tmp, &index_block); + g_rounds (&index_block); + xor_block (&index_block, &tmp); + copy_block(&tmp, &index_block); + g_rounds (&index_block); + xor_block (&index_block, &tmp); + } + index_seed = index_block.a[block % 128]; + } else { + index_seed = previous->a[0]; + } + + // Establish the reference set. *Approximately* comprises: + // - The last 3 slices (if they exist yet) + // - The already constructed blocks in the current segment + u32 next_slice = ((slice + 1) % 4) * segment_size; + u32 window_start = pass == 0 ? 0 : next_slice; + u32 nb_segments = pass == 0 ? slice : 3; + u64 lane = + pass == 0 && slice == 0 + ? segment + : (index_seed >> 32) % config.nb_lanes; + u32 window_size = + nb_segments * segment_size + + (lane == segment ? block-1 : + block == 0 ? (u32)-1 : 0); + + // Find reference block + u64 j1 = index_seed & 0xffffffff; // block selector + u64 x = (j1 * j1) >> 32; + u64 y = (window_size * x) >> 32; + u64 z = (window_size - 1) - y; + u64 ref = (window_start + z) % lane_size; + u32 index = lane * lane_size + (u32)ref; + blk *reference = blocks + index; + + // Shuffle the previous & reference block + // into the current block + copy_block(&tmp, previous); + xor_block (&tmp, reference); + if (pass == 0) { copy_block(current, &tmp); } + else { xor_block (current, &tmp); } + g_rounds (&tmp); + xor_block (current, &tmp); + } + } + } + } + + // Wipe temporary block + volatile u64* p = tmp.a; + ZERO(p, 128); + + // XOR last blocks of each lane + blk *last_block = blocks + lane_size - 1; + FOR_T (u32, lane, 1, config.nb_lanes) { + blk *next_block = last_block + lane_size; + xor_block(next_block, last_block); + last_block = next_block; + } + + // Serialize last block + u8 final_block[1024]; + store64_le_buf(final_block, last_block->a, 128); + + // Wipe work area + p = (u64*)work_area; + ZERO(p, 128 * nb_blocks); + + // Hash the very last block with H' into the output hash + extended_hash(hash, hash_size, final_block, 1024); + WIPE_BUFFER(final_block); +} + +//////////////////////////////////// +/// Arithmetic modulo 2^255 - 19 /// +//////////////////////////////////// +// Originally taken from SUPERCOP's ref10 implementation. +// A bit bigger than TweetNaCl, over 4 times faster. + +// field element +typedef i32 fe[10]; + +// field constants +// +// fe_one : 1 +// sqrtm1 : sqrt(-1) +// d : -121665 / 121666 +// D2 : 2 * -121665 / 121666 +// lop_x, lop_y: low order point in Edwards coordinates +// ufactor : -sqrt(-1) * 2 +// A2 : 486662^2 (A squared) +static const fe fe_one = {1}; +static const fe sqrtm1 = { + -32595792, -7943725, 9377950, 3500415, 12389472, + -272473, -25146209, -2005654, 326686, 11406482, +}; +static const fe d = { + -10913610, 13857413, -15372611, 6949391, 114729, + -8787816, -6275908, -3247719, -18696448, -12055116, +}; +static const fe D2 = { + -21827239, -5839606, -30745221, 13898782, 229458, + 15978800, -12551817, -6495438, 29715968, 9444199, +}; +static const fe lop_x = { + 21352778, 5345713, 4660180, -8347857, 24143090, + 14568123, 30185756, -12247770, -33528939, 8345319, +}; +static const fe lop_y = { + -6952922, -1265500, 6862341, -7057498, -4037696, + -5447722, 31680899, -15325402, -19365852, 1569102, +}; +static const fe ufactor = { + -1917299, 15887451, -18755900, -7000830, -24778944, + 544946, -16816446, 4011309, -653372, 10741468, +}; +static const fe A2 = { + 12721188, 3529, 0, 0, 0, 0, 0, 0, 0, 0, +}; + +static void fe_0(fe h) { ZERO(h , 10); } +static void fe_1(fe h) { h[0] = 1; ZERO(h+1, 9); } + +static void fe_copy(fe h,const fe f ){FOR(i,0,10) h[i] = f[i]; } +static void fe_neg (fe h,const fe f ){FOR(i,0,10) h[i] = -f[i]; } +static void fe_add (fe h,const fe f,const fe g){FOR(i,0,10) h[i] = f[i] + g[i];} +static void fe_sub (fe h,const fe f,const fe g){FOR(i,0,10) h[i] = f[i] - g[i];} + +static void fe_cswap(fe f, fe g, int b) +{ + i32 mask = -b; // -1 = 0xffffffff + FOR (i, 0, 10) { + i32 x = (f[i] ^ g[i]) & mask; + f[i] = f[i] ^ x; + g[i] = g[i] ^ x; + } +} + +static void fe_ccopy(fe f, const fe g, int b) +{ + i32 mask = -b; // -1 = 0xffffffff + FOR (i, 0, 10) { + i32 x = (f[i] ^ g[i]) & mask; + f[i] = f[i] ^ x; + } +} + + +// Signed carry propagation +// ------------------------ +// +// Let t be a number. It can be uniquely decomposed thus: +// +// t = h*2^26 + l +// such that -2^25 <= l < 2^25 +// +// Let c = (t + 2^25) / 2^26 (rounded down) +// c = (h*2^26 + l + 2^25) / 2^26 (rounded down) +// c = h + (l + 2^25) / 2^26 (rounded down) +// c = h (exactly) +// Because 0 <= l + 2^25 < 2^26 +// +// Let u = t - c*2^26 +// u = h*2^26 + l - h*2^26 +// u = l +// Therefore, -2^25 <= u < 2^25 +// +// Additionally, if |t| < x, then |h| < x/2^26 (rounded down) +// +// Notations: +// - In C, 1<<25 means 2^25. +// - In C, x>>25 means floor(x / (2^25)). +// - All of the above applies with 25 & 24 as well as 26 & 25. +// +// +// Note on negative right shifts +// ----------------------------- +// +// In C, x >> n, where x is a negative integer, is implementation +// defined. In practice, all platforms do arithmetic shift, which is +// equivalent to division by 2^26, rounded down. Some compilers, like +// GCC, even guarantee it. +// +// If we ever stumble upon a platform that does not propagate the sign +// bit (we won't), visible failures will show at the slightest test, and +// the signed shifts can be replaced by the following: +// +// typedef struct { i64 x:39; } s25; +// typedef struct { i64 x:38; } s26; +// i64 shift25(i64 x) { s25 s; s.x = ((u64)x)>>25; return s.x; } +// i64 shift26(i64 x) { s26 s; s.x = ((u64)x)>>26; return s.x; } +// +// Current compilers cannot optimise this, causing a 30% drop in +// performance. Fairly expensive for something that never happens. +// +// +// Precondition +// ------------ +// +// |t0| < 2^63 +// |t1|..|t9| < 2^62 +// +// Algorithm +// --------- +// c = t0 + 2^25 / 2^26 -- |c| <= 2^36 +// t0 -= c * 2^26 -- |t0| <= 2^25 +// t1 += c -- |t1| <= 2^63 +// +// c = t4 + 2^25 / 2^26 -- |c| <= 2^36 +// t4 -= c * 2^26 -- |t4| <= 2^25 +// t5 += c -- |t5| <= 2^63 +// +// c = t1 + 2^24 / 2^25 -- |c| <= 2^38 +// t1 -= c * 2^25 -- |t1| <= 2^24 +// t2 += c -- |t2| <= 2^63 +// +// c = t5 + 2^24 / 2^25 -- |c| <= 2^38 +// t5 -= c * 2^25 -- |t5| <= 2^24 +// t6 += c -- |t6| <= 2^63 +// +// c = t2 + 2^25 / 2^26 -- |c| <= 2^37 +// t2 -= c * 2^26 -- |t2| <= 2^25 < 1.1 * 2^25 (final t2) +// t3 += c -- |t3| <= 2^63 +// +// c = t6 + 2^25 / 2^26 -- |c| <= 2^37 +// t6 -= c * 2^26 -- |t6| <= 2^25 < 1.1 * 2^25 (final t6) +// t7 += c -- |t7| <= 2^63 +// +// c = t3 + 2^24 / 2^25 -- |c| <= 2^38 +// t3 -= c * 2^25 -- |t3| <= 2^24 < 1.1 * 2^24 (final t3) +// t4 += c -- |t4| <= 2^25 + 2^38 < 2^39 +// +// c = t7 + 2^24 / 2^25 -- |c| <= 2^38 +// t7 -= c * 2^25 -- |t7| <= 2^24 < 1.1 * 2^24 (final t7) +// t8 += c -- |t8| <= 2^63 +// +// c = t4 + 2^25 / 2^26 -- |c| <= 2^13 +// t4 -= c * 2^26 -- |t4| <= 2^25 < 1.1 * 2^25 (final t4) +// t5 += c -- |t5| <= 2^24 + 2^13 < 1.1 * 2^24 (final t5) +// +// c = t8 + 2^25 / 2^26 -- |c| <= 2^37 +// t8 -= c * 2^26 -- |t8| <= 2^25 < 1.1 * 2^25 (final t8) +// t9 += c -- |t9| <= 2^63 +// +// c = t9 + 2^24 / 2^25 -- |c| <= 2^38 +// t9 -= c * 2^25 -- |t9| <= 2^24 < 1.1 * 2^24 (final t9) +// t0 += c * 19 -- |t0| <= 2^25 + 2^38*19 < 2^44 +// +// c = t0 + 2^25 / 2^26 -- |c| <= 2^18 +// t0 -= c * 2^26 -- |t0| <= 2^25 < 1.1 * 2^25 (final t0) +// t1 += c -- |t1| <= 2^24 + 2^18 < 1.1 * 2^24 (final t1) +// +// Postcondition +// ------------- +// |t0|, |t2|, |t4|, |t6|, |t8| < 1.1 * 2^25 +// |t1|, |t3|, |t5|, |t7|, |t9| < 1.1 * 2^24 +#define FE_CARRY \ + i64 c; \ + c = (t0 + ((i64)1<<25)) >> 26; t0 -= c * ((i64)1 << 26); t1 += c; \ + c = (t4 + ((i64)1<<25)) >> 26; t4 -= c * ((i64)1 << 26); t5 += c; \ + c = (t1 + ((i64)1<<24)) >> 25; t1 -= c * ((i64)1 << 25); t2 += c; \ + c = (t5 + ((i64)1<<24)) >> 25; t5 -= c * ((i64)1 << 25); t6 += c; \ + c = (t2 + ((i64)1<<25)) >> 26; t2 -= c * ((i64)1 << 26); t3 += c; \ + c = (t6 + ((i64)1<<25)) >> 26; t6 -= c * ((i64)1 << 26); t7 += c; \ + c = (t3 + ((i64)1<<24)) >> 25; t3 -= c * ((i64)1 << 25); t4 += c; \ + c = (t7 + ((i64)1<<24)) >> 25; t7 -= c * ((i64)1 << 25); t8 += c; \ + c = (t4 + ((i64)1<<25)) >> 26; t4 -= c * ((i64)1 << 26); t5 += c; \ + c = (t8 + ((i64)1<<25)) >> 26; t8 -= c * ((i64)1 << 26); t9 += c; \ + c = (t9 + ((i64)1<<24)) >> 25; t9 -= c * ((i64)1 << 25); t0 += c * 19; \ + c = (t0 + ((i64)1<<25)) >> 26; t0 -= c * ((i64)1 << 26); t1 += c; \ + h[0]=(i32)t0; h[1]=(i32)t1; h[2]=(i32)t2; h[3]=(i32)t3; h[4]=(i32)t4; \ + h[5]=(i32)t5; h[6]=(i32)t6; h[7]=(i32)t7; h[8]=(i32)t8; h[9]=(i32)t9 + +// Decodes a field element from a byte buffer. +// mask specifies how many bits we ignore. +// Traditionally we ignore 1. It's useful for EdDSA, +// which uses that bit to denote the sign of x. +// Elligator however uses positive representatives, +// which means ignoring 2 bits instead. +static void fe_frombytes_mask(fe h, const u8 s[32], unsigned nb_mask) +{ + u32 mask = 0xffffff >> nb_mask; + i64 t0 = load32_le(s); // t0 < 2^32 + i64 t1 = load24_le(s + 4) << 6; // t1 < 2^30 + i64 t2 = load24_le(s + 7) << 5; // t2 < 2^29 + i64 t3 = load24_le(s + 10) << 3; // t3 < 2^27 + i64 t4 = load24_le(s + 13) << 2; // t4 < 2^26 + i64 t5 = load32_le(s + 16); // t5 < 2^32 + i64 t6 = load24_le(s + 20) << 7; // t6 < 2^31 + i64 t7 = load24_le(s + 23) << 5; // t7 < 2^29 + i64 t8 = load24_le(s + 26) << 4; // t8 < 2^28 + i64 t9 = (load24_le(s + 29) & mask) << 2; // t9 < 2^25 + FE_CARRY; // Carry precondition OK +} + +static void fe_frombytes(fe h, const u8 s[32]) +{ + fe_frombytes_mask(h, s, 1); +} + + +// Precondition +// |h[0]|, |h[2]|, |h[4]|, |h[6]|, |h[8]| < 1.1 * 2^25 +// |h[1]|, |h[3]|, |h[5]|, |h[7]|, |h[9]| < 1.1 * 2^24 +// +// Therefore, |h| < 2^255-19 +// There are two possibilities: +// +// - If h is positive, all we need to do is reduce its individual +// limbs down to their tight positive range. +// - If h is negative, we also need to add 2^255-19 to it. +// Or just remove 19 and chop off any excess bit. +static void fe_tobytes(u8 s[32], const fe h) +{ + i32 t[10]; + COPY(t, h, 10); + i32 q = (19 * t[9] + (((i32) 1) << 24)) >> 25; + // |t9| < 1.1 * 2^24 + // -1.1 * 2^24 < t9 < 1.1 * 2^24 + // -21 * 2^24 < 19 * t9 < 21 * 2^24 + // -2^29 < 19 * t9 + 2^24 < 2^29 + // -2^29 / 2^25 < (19 * t9 + 2^24) / 2^25 < 2^29 / 2^25 + // -16 < (19 * t9 + 2^24) / 2^25 < 16 + FOR (i, 0, 5) { + q += t[2*i ]; q >>= 26; // q = 0 or -1 + q += t[2*i+1]; q >>= 25; // q = 0 or -1 + } + // q = 0 iff h >= 0 + // q = -1 iff h < 0 + // Adding q * 19 to h reduces h to its proper range. + q *= 19; // Shift carry back to the beginning + FOR (i, 0, 5) { + t[i*2 ] += q; q = t[i*2 ] >> 26; t[i*2 ] -= q * ((i32)1 << 26); + t[i*2+1] += q; q = t[i*2+1] >> 25; t[i*2+1] -= q * ((i32)1 << 25); + } + // h is now fully reduced, and q represents the excess bit. + + store32_le(s + 0, ((u32)t[0] >> 0) | ((u32)t[1] << 26)); + store32_le(s + 4, ((u32)t[1] >> 6) | ((u32)t[2] << 19)); + store32_le(s + 8, ((u32)t[2] >> 13) | ((u32)t[3] << 13)); + store32_le(s + 12, ((u32)t[3] >> 19) | ((u32)t[4] << 6)); + store32_le(s + 16, ((u32)t[5] >> 0) | ((u32)t[6] << 25)); + store32_le(s + 20, ((u32)t[6] >> 7) | ((u32)t[7] << 19)); + store32_le(s + 24, ((u32)t[7] >> 13) | ((u32)t[8] << 12)); + store32_le(s + 28, ((u32)t[8] >> 20) | ((u32)t[9] << 6)); + + WIPE_BUFFER(t); +} + +// Precondition +// ------------- +// |f0|, |f2|, |f4|, |f6|, |f8| < 1.65 * 2^26 +// |f1|, |f3|, |f5|, |f7|, |f9| < 1.65 * 2^25 +// +// |g0|, |g2|, |g4|, |g6|, |g8| < 1.65 * 2^26 +// |g1|, |g3|, |g5|, |g7|, |g9| < 1.65 * 2^25 +static void fe_mul_small(fe h, const fe f, i32 g) +{ + i64 t0 = f[0] * (i64) g; i64 t1 = f[1] * (i64) g; + i64 t2 = f[2] * (i64) g; i64 t3 = f[3] * (i64) g; + i64 t4 = f[4] * (i64) g; i64 t5 = f[5] * (i64) g; + i64 t6 = f[6] * (i64) g; i64 t7 = f[7] * (i64) g; + i64 t8 = f[8] * (i64) g; i64 t9 = f[9] * (i64) g; + // |t0|, |t2|, |t4|, |t6|, |t8| < 1.65 * 2^26 * 2^31 < 2^58 + // |t1|, |t3|, |t5|, |t7|, |t9| < 1.65 * 2^25 * 2^31 < 2^57 + + FE_CARRY; // Carry precondition OK +} + +// Precondition +// ------------- +// |f0|, |f2|, |f4|, |f6|, |f8| < 1.65 * 2^26 +// |f1|, |f3|, |f5|, |f7|, |f9| < 1.65 * 2^25 +// +// |g0|, |g2|, |g4|, |g6|, |g8| < 1.65 * 2^26 +// |g1|, |g3|, |g5|, |g7|, |g9| < 1.65 * 2^25 +static void fe_mul(fe h, const fe f, const fe g) +{ + // Everything is unrolled and put in temporary variables. + // We could roll the loop, but that would make curve25519 twice as slow. + i32 f0 = f[0]; i32 f1 = f[1]; i32 f2 = f[2]; i32 f3 = f[3]; i32 f4 = f[4]; + i32 f5 = f[5]; i32 f6 = f[6]; i32 f7 = f[7]; i32 f8 = f[8]; i32 f9 = f[9]; + i32 g0 = g[0]; i32 g1 = g[1]; i32 g2 = g[2]; i32 g3 = g[3]; i32 g4 = g[4]; + i32 g5 = g[5]; i32 g6 = g[6]; i32 g7 = g[7]; i32 g8 = g[8]; i32 g9 = g[9]; + i32 F1 = f1*2; i32 F3 = f3*2; i32 F5 = f5*2; i32 F7 = f7*2; i32 F9 = f9*2; + i32 G1 = g1*19; i32 G2 = g2*19; i32 G3 = g3*19; + i32 G4 = g4*19; i32 G5 = g5*19; i32 G6 = g6*19; + i32 G7 = g7*19; i32 G8 = g8*19; i32 G9 = g9*19; + // |F1|, |F3|, |F5|, |F7|, |F9| < 1.65 * 2^26 + // |G0|, |G2|, |G4|, |G6|, |G8| < 2^31 + // |G1|, |G3|, |G5|, |G7|, |G9| < 2^30 + + i64 t0 = f0*(i64)g0 + F1*(i64)G9 + f2*(i64)G8 + F3*(i64)G7 + f4*(i64)G6 + + F5*(i64)G5 + f6*(i64)G4 + F7*(i64)G3 + f8*(i64)G2 + F9*(i64)G1; + i64 t1 = f0*(i64)g1 + f1*(i64)g0 + f2*(i64)G9 + f3*(i64)G8 + f4*(i64)G7 + + f5*(i64)G6 + f6*(i64)G5 + f7*(i64)G4 + f8*(i64)G3 + f9*(i64)G2; + i64 t2 = f0*(i64)g2 + F1*(i64)g1 + f2*(i64)g0 + F3*(i64)G9 + f4*(i64)G8 + + F5*(i64)G7 + f6*(i64)G6 + F7*(i64)G5 + f8*(i64)G4 + F9*(i64)G3; + i64 t3 = f0*(i64)g3 + f1*(i64)g2 + f2*(i64)g1 + f3*(i64)g0 + f4*(i64)G9 + + f5*(i64)G8 + f6*(i64)G7 + f7*(i64)G6 + f8*(i64)G5 + f9*(i64)G4; + i64 t4 = f0*(i64)g4 + F1*(i64)g3 + f2*(i64)g2 + F3*(i64)g1 + f4*(i64)g0 + + F5*(i64)G9 + f6*(i64)G8 + F7*(i64)G7 + f8*(i64)G6 + F9*(i64)G5; + i64 t5 = f0*(i64)g5 + f1*(i64)g4 + f2*(i64)g3 + f3*(i64)g2 + f4*(i64)g1 + + f5*(i64)g0 + f6*(i64)G9 + f7*(i64)G8 + f8*(i64)G7 + f9*(i64)G6; + i64 t6 = f0*(i64)g6 + F1*(i64)g5 + f2*(i64)g4 + F3*(i64)g3 + f4*(i64)g2 + + F5*(i64)g1 + f6*(i64)g0 + F7*(i64)G9 + f8*(i64)G8 + F9*(i64)G7; + i64 t7 = f0*(i64)g7 + f1*(i64)g6 + f2*(i64)g5 + f3*(i64)g4 + f4*(i64)g3 + + f5*(i64)g2 + f6*(i64)g1 + f7*(i64)g0 + f8*(i64)G9 + f9*(i64)G8; + i64 t8 = f0*(i64)g8 + F1*(i64)g7 + f2*(i64)g6 + F3*(i64)g5 + f4*(i64)g4 + + F5*(i64)g3 + f6*(i64)g2 + F7*(i64)g1 + f8*(i64)g0 + F9*(i64)G9; + i64 t9 = f0*(i64)g9 + f1*(i64)g8 + f2*(i64)g7 + f3*(i64)g6 + f4*(i64)g5 + + f5*(i64)g4 + f6*(i64)g3 + f7*(i64)g2 + f8*(i64)g1 + f9*(i64)g0; + // t0 < 0.67 * 2^61 + // t1 < 0.41 * 2^61 + // t2 < 0.52 * 2^61 + // t3 < 0.32 * 2^61 + // t4 < 0.38 * 2^61 + // t5 < 0.22 * 2^61 + // t6 < 0.23 * 2^61 + // t7 < 0.13 * 2^61 + // t8 < 0.09 * 2^61 + // t9 < 0.03 * 2^61 + + FE_CARRY; // Everything below 2^62, Carry precondition OK +} + +// Precondition +// ------------- +// |f0|, |f2|, |f4|, |f6|, |f8| < 1.65 * 2^26 +// |f1|, |f3|, |f5|, |f7|, |f9| < 1.65 * 2^25 +// +// Note: we could use fe_mul() for this, but this is significantly faster +static void fe_sq(fe h, const fe f) +{ + i32 f0 = f[0]; i32 f1 = f[1]; i32 f2 = f[2]; i32 f3 = f[3]; i32 f4 = f[4]; + i32 f5 = f[5]; i32 f6 = f[6]; i32 f7 = f[7]; i32 f8 = f[8]; i32 f9 = f[9]; + i32 f0_2 = f0*2; i32 f1_2 = f1*2; i32 f2_2 = f2*2; i32 f3_2 = f3*2; + i32 f4_2 = f4*2; i32 f5_2 = f5*2; i32 f6_2 = f6*2; i32 f7_2 = f7*2; + i32 f5_38 = f5*38; i32 f6_19 = f6*19; i32 f7_38 = f7*38; + i32 f8_19 = f8*19; i32 f9_38 = f9*38; + // |f0_2| , |f2_2| , |f4_2| , |f6_2| , |f8_2| < 1.65 * 2^27 + // |f1_2| , |f3_2| , |f5_2| , |f7_2| , |f9_2| < 1.65 * 2^26 + // |f5_38|, |f6_19|, |f7_38|, |f8_19|, |f9_38| < 2^31 + + i64 t0 = f0 *(i64)f0 + f1_2*(i64)f9_38 + f2_2*(i64)f8_19 + + f3_2*(i64)f7_38 + f4_2*(i64)f6_19 + f5 *(i64)f5_38; + i64 t1 = f0_2*(i64)f1 + f2 *(i64)f9_38 + f3_2*(i64)f8_19 + + f4 *(i64)f7_38 + f5_2*(i64)f6_19; + i64 t2 = f0_2*(i64)f2 + f1_2*(i64)f1 + f3_2*(i64)f9_38 + + f4_2*(i64)f8_19 + f5_2*(i64)f7_38 + f6 *(i64)f6_19; + i64 t3 = f0_2*(i64)f3 + f1_2*(i64)f2 + f4 *(i64)f9_38 + + f5_2*(i64)f8_19 + f6 *(i64)f7_38; + i64 t4 = f0_2*(i64)f4 + f1_2*(i64)f3_2 + f2 *(i64)f2 + + f5_2*(i64)f9_38 + f6_2*(i64)f8_19 + f7 *(i64)f7_38; + i64 t5 = f0_2*(i64)f5 + f1_2*(i64)f4 + f2_2*(i64)f3 + + f6 *(i64)f9_38 + f7_2*(i64)f8_19; + i64 t6 = f0_2*(i64)f6 + f1_2*(i64)f5_2 + f2_2*(i64)f4 + + f3_2*(i64)f3 + f7_2*(i64)f9_38 + f8 *(i64)f8_19; + i64 t7 = f0_2*(i64)f7 + f1_2*(i64)f6 + f2_2*(i64)f5 + + f3_2*(i64)f4 + f8 *(i64)f9_38; + i64 t8 = f0_2*(i64)f8 + f1_2*(i64)f7_2 + f2_2*(i64)f6 + + f3_2*(i64)f5_2 + f4 *(i64)f4 + f9 *(i64)f9_38; + i64 t9 = f0_2*(i64)f9 + f1_2*(i64)f8 + f2_2*(i64)f7 + + f3_2*(i64)f6 + f4 *(i64)f5_2; + // t0 < 0.67 * 2^61 + // t1 < 0.41 * 2^61 + // t2 < 0.52 * 2^61 + // t3 < 0.32 * 2^61 + // t4 < 0.38 * 2^61 + // t5 < 0.22 * 2^61 + // t6 < 0.23 * 2^61 + // t7 < 0.13 * 2^61 + // t8 < 0.09 * 2^61 + // t9 < 0.03 * 2^61 + + FE_CARRY; +} + +// Parity check. Returns 0 if even, 1 if odd +static int fe_isodd(const fe f) +{ + u8 s[32]; + fe_tobytes(s, f); + u8 isodd = s[0] & 1; + WIPE_BUFFER(s); + return isodd; +} + +// Returns 1 if equal, 0 if not equal +static int fe_isequal(const fe f, const fe g) +{ + u8 fs[32]; + u8 gs[32]; + fe_tobytes(fs, f); + fe_tobytes(gs, g); + int isdifferent = crypto_verify32(fs, gs); + WIPE_BUFFER(fs); + WIPE_BUFFER(gs); + return 1 + isdifferent; +} + +// Inverse square root. +// Returns true if x is a square, false otherwise. +// After the call: +// isr = sqrt(1/x) if x is a non-zero square. +// isr = sqrt(sqrt(-1)/x) if x is not a square. +// isr = 0 if x is zero. +// We do not guarantee the sign of the square root. +// +// Notes: +// Let quartic = x^((p-1)/4) +// +// x^((p-1)/2) = chi(x) +// quartic^2 = chi(x) +// quartic = sqrt(chi(x)) +// quartic = 1 or -1 or sqrt(-1) or -sqrt(-1) +// +// Note that x is a square if quartic is 1 or -1 +// There are 4 cases to consider: +// +// if quartic = 1 (x is a square) +// then x^((p-1)/4) = 1 +// x^((p-5)/4) * x = 1 +// x^((p-5)/4) = 1/x +// x^((p-5)/8) = sqrt(1/x) or -sqrt(1/x) +// +// if quartic = -1 (x is a square) +// then x^((p-1)/4) = -1 +// x^((p-5)/4) * x = -1 +// x^((p-5)/4) = -1/x +// x^((p-5)/8) = sqrt(-1) / sqrt(x) +// x^((p-5)/8) * sqrt(-1) = sqrt(-1)^2 / sqrt(x) +// x^((p-5)/8) * sqrt(-1) = -1/sqrt(x) +// x^((p-5)/8) * sqrt(-1) = -sqrt(1/x) or sqrt(1/x) +// +// if quartic = sqrt(-1) (x is not a square) +// then x^((p-1)/4) = sqrt(-1) +// x^((p-5)/4) * x = sqrt(-1) +// x^((p-5)/4) = sqrt(-1)/x +// x^((p-5)/8) = sqrt(sqrt(-1)/x) or -sqrt(sqrt(-1)/x) +// +// Note that the product of two non-squares is always a square: +// For any non-squares a and b, chi(a) = -1 and chi(b) = -1. +// Since chi(x) = x^((p-1)/2), chi(a)*chi(b) = chi(a*b) = 1. +// Therefore a*b is a square. +// +// Since sqrt(-1) and x are both non-squares, their product is a +// square, and we can compute their square root. +// +// if quartic = -sqrt(-1) (x is not a square) +// then x^((p-1)/4) = -sqrt(-1) +// x^((p-5)/4) * x = -sqrt(-1) +// x^((p-5)/4) = -sqrt(-1)/x +// x^((p-5)/8) = sqrt(-sqrt(-1)/x) +// x^((p-5)/8) = sqrt( sqrt(-1)/x) * sqrt(-1) +// x^((p-5)/8) * sqrt(-1) = sqrt( sqrt(-1)/x) * sqrt(-1)^2 +// x^((p-5)/8) * sqrt(-1) = sqrt( sqrt(-1)/x) * -1 +// x^((p-5)/8) * sqrt(-1) = -sqrt(sqrt(-1)/x) or sqrt(sqrt(-1)/x) +static int invsqrt(fe isr, const fe x) +{ + fe t0, t1, t2; + + // t0 = x^((p-5)/8) + // Can be achieved with a simple double & add ladder, + // but it would be slower. + fe_sq(t0, x); + fe_sq(t1,t0); fe_sq(t1, t1); fe_mul(t1, x, t1); + fe_mul(t0, t0, t1); + fe_sq(t0, t0); fe_mul(t0, t1, t0); + fe_sq(t1, t0); FOR (i, 1, 5) { fe_sq(t1, t1); } fe_mul(t0, t1, t0); + fe_sq(t1, t0); FOR (i, 1, 10) { fe_sq(t1, t1); } fe_mul(t1, t1, t0); + fe_sq(t2, t1); FOR (i, 1, 20) { fe_sq(t2, t2); } fe_mul(t1, t2, t1); + fe_sq(t1, t1); FOR (i, 1, 10) { fe_sq(t1, t1); } fe_mul(t0, t1, t0); + fe_sq(t1, t0); FOR (i, 1, 50) { fe_sq(t1, t1); } fe_mul(t1, t1, t0); + fe_sq(t2, t1); FOR (i, 1, 100) { fe_sq(t2, t2); } fe_mul(t1, t2, t1); + fe_sq(t1, t1); FOR (i, 1, 50) { fe_sq(t1, t1); } fe_mul(t0, t1, t0); + fe_sq(t0, t0); FOR (i, 1, 2) { fe_sq(t0, t0); } fe_mul(t0, t0, x); + + // quartic = x^((p-1)/4) + i32 *quartic = t1; + fe_sq (quartic, t0); + fe_mul(quartic, quartic, x); + + i32 *check = t2; + fe_0 (check); int z0 = fe_isequal(x , check); + fe_1 (check); int p1 = fe_isequal(quartic, check); + fe_neg(check, check ); int m1 = fe_isequal(quartic, check); + fe_neg(check, sqrtm1); int ms = fe_isequal(quartic, check); + + // if quartic == -1 or sqrt(-1) + // then isr = x^((p-1)/4) * sqrt(-1) + // else isr = x^((p-1)/4) + fe_mul(isr, t0, sqrtm1); + fe_ccopy(isr, t0, 1 - (m1 | ms)); + + WIPE_BUFFER(t0); + WIPE_BUFFER(t1); + WIPE_BUFFER(t2); + return p1 | m1 | z0; +} + +// Inverse in terms of inverse square root. +// Requires two additional squarings to get rid of the sign. +// +// 1/x = x * (+invsqrt(x^2))^2 +// = x * (-invsqrt(x^2))^2 +// +// A fully optimised exponentiation by p-1 would save 6 field +// multiplications, but it would require more code. +static void fe_invert(fe out, const fe x) +{ + fe tmp; + fe_sq(tmp, x); + invsqrt(tmp, tmp); + fe_sq(tmp, tmp); + fe_mul(out, tmp, x); + WIPE_BUFFER(tmp); +} + +// trim a scalar for scalar multiplication +void crypto_eddsa_trim_scalar(u8 out[32], const u8 in[32]) +{ + COPY(out, in, 32); + out[ 0] &= 248; + out[31] &= 127; + out[31] |= 64; +} + +// get bit from scalar at position i +static int scalar_bit(const u8 s[32], int i) +{ + if (i < 0) { return 0; } // handle -1 for sliding windows + return (s[i>>3] >> (i&7)) & 1; +} + +/////////////// +/// X-25519 /// Taken from SUPERCOP's ref10 implementation. +/////////////// +static void scalarmult(u8 q[32], const u8 scalar[32], const u8 p[32], + int nb_bits) +{ + // computes the scalar product + fe x1; + fe_frombytes(x1, p); + + // computes the actual scalar product (the result is in x2 and z2) + fe x2, z2, x3, z3, t0, t1; + // Montgomery ladder + // In projective coordinates, to avoid divisions: x = X / Z + // We don't care about the y coordinate, it's only 1 bit of information + fe_1(x2); fe_0(z2); // "zero" point + fe_copy(x3, x1); fe_1(z3); // "one" point + int swap = 0; + for (int pos = nb_bits-1; pos >= 0; --pos) { + // constant time conditional swap before ladder step + int b = scalar_bit(scalar, pos); + swap ^= b; // xor trick avoids swapping at the end of the loop + fe_cswap(x2, x3, swap); + fe_cswap(z2, z3, swap); + swap = b; // anticipates one last swap after the loop + + // Montgomery ladder step: replaces (P2, P3) by (P2*2, P2+P3) + // with differential addition + fe_sub(t0, x3, z3); + fe_sub(t1, x2, z2); + fe_add(x2, x2, z2); + fe_add(z2, x3, z3); + fe_mul(z3, t0, x2); + fe_mul(z2, z2, t1); + fe_sq (t0, t1 ); + fe_sq (t1, x2 ); + fe_add(x3, z3, z2); + fe_sub(z2, z3, z2); + fe_mul(x2, t1, t0); + fe_sub(t1, t1, t0); + fe_sq (z2, z2 ); + fe_mul_small(z3, t1, 121666); + fe_sq (x3, x3 ); + fe_add(t0, t0, z3); + fe_mul(z3, x1, z2); + fe_mul(z2, t1, t0); + } + // last swap is necessary to compensate for the xor trick + // Note: after this swap, P3 == P2 + P1. + fe_cswap(x2, x3, swap); + fe_cswap(z2, z3, swap); + + // normalises the coordinates: x == X / Z + fe_invert(z2, z2); + fe_mul(x2, x2, z2); + fe_tobytes(q, x2); + + WIPE_BUFFER(x1); + WIPE_BUFFER(x2); WIPE_BUFFER(z2); WIPE_BUFFER(t0); + WIPE_BUFFER(x3); WIPE_BUFFER(z3); WIPE_BUFFER(t1); +} + +void crypto_x25519(u8 raw_shared_secret[32], + const u8 your_secret_key [32], + const u8 their_public_key [32]) +{ + // restrict the possible scalar values + u8 e[32]; + crypto_eddsa_trim_scalar(e, your_secret_key); + scalarmult(raw_shared_secret, e, their_public_key, 255); + WIPE_BUFFER(e); +} + +void crypto_x25519_public_key(u8 public_key[32], + const u8 secret_key[32]) +{ + static const u8 base_point[32] = {9}; + crypto_x25519(public_key, secret_key, base_point); +} + +/////////////////////////// +/// Arithmetic modulo L /// +/////////////////////////// +static const u32 L[8] = { + 0x5cf5d3ed, 0x5812631a, 0xa2f79cd6, 0x14def9de, + 0x00000000, 0x00000000, 0x00000000, 0x10000000, +}; + +// p = a*b + p +static void multiply(u32 p[16], const u32 a[8], const u32 b[8]) +{ + FOR (i, 0, 8) { + u64 carry = 0; + FOR (j, 0, 8) { + carry += p[i+j] + (u64)a[i] * b[j]; + p[i+j] = (u32)carry; + carry >>= 32; + } + p[i+8] = (u32)carry; + } +} + +static int is_above_l(const u32 x[8]) +{ + // We work with L directly, in a 2's complement encoding + // (-L == ~L + 1) + u64 carry = 1; + FOR (i, 0, 8) { + carry += (u64)x[i] + (~L[i] & 0xffffffff); + carry >>= 32; + } + return (int)carry; // carry is either 0 or 1 +} + +// Final reduction modulo L, by conditionally removing L. +// if x < l , then r = x +// if l <= x 2*l, then r = x-l +// otherwise the result will be wrong +static void remove_l(u32 r[8], const u32 x[8]) +{ + u64 carry = (u64)is_above_l(x); + u32 mask = ~(u32)carry + 1; // carry == 0 or 1 + FOR (i, 0, 8) { + carry += (u64)x[i] + (~L[i] & mask); + r[i] = (u32)carry; + carry >>= 32; + } +} + +// Full reduction modulo L (Barrett reduction) +static void mod_l(u8 reduced[32], const u32 x[16]) +{ + static const u32 r[9] = { + 0x0a2c131b,0xed9ce5a3,0x086329a7,0x2106215d, + 0xffffffeb,0xffffffff,0xffffffff,0xffffffff,0xf, + }; + // xr = x * r + u32 xr[25] = {0}; + FOR (i, 0, 9) { + u64 carry = 0; + FOR (j, 0, 16) { + carry += xr[i+j] + (u64)r[i] * x[j]; + xr[i+j] = (u32)carry; + carry >>= 32; + } + xr[i+16] = (u32)carry; + } + // xr = floor(xr / 2^512) * L + // Since the result is guaranteed to be below 2*L, + // it is enough to only compute the first 256 bits. + // The division is performed by saying xr[i+16]. (16 * 32 = 512) + ZERO(xr, 8); + FOR (i, 0, 8) { + u64 carry = 0; + FOR (j, 0, 8-i) { + carry += xr[i+j] + (u64)xr[i+16] * L[j]; + xr[i+j] = (u32)carry; + carry >>= 32; + } + } + // xr = x - xr + u64 carry = 1; + FOR (i, 0, 8) { + carry += (u64)x[i] + (~xr[i] & 0xffffffff); + xr[i] = (u32)carry; + carry >>= 32; + } + // Final reduction modulo L (conditional subtraction) + remove_l(xr, xr); + store32_le_buf(reduced, xr, 8); + + WIPE_BUFFER(xr); +} + +void crypto_eddsa_reduce(u8 reduced[32], const u8 expanded[64]) +{ + u32 x[16]; + load32_le_buf(x, expanded, 16); + mod_l(reduced, x); + WIPE_BUFFER(x); +} + +// r = (a * b) + c +void crypto_eddsa_mul_add(u8 r[32], + const u8 a[32], const u8 b[32], const u8 c[32]) +{ + u32 A[8]; load32_le_buf(A, a, 8); + u32 B[8]; load32_le_buf(B, b, 8); + u32 p[16]; load32_le_buf(p, c, 8); ZERO(p + 8, 8); + multiply(p, A, B); + mod_l(r, p); + WIPE_BUFFER(p); + WIPE_BUFFER(A); + WIPE_BUFFER(B); +} + +/////////////// +/// Ed25519 /// +/////////////// + +// Point (group element, ge) in a twisted Edwards curve, +// in extended projective coordinates. +// ge : x = X/Z, y = Y/Z, T = XY/Z +// ge_cached : Yp = X+Y, Ym = X-Y, T2 = T*D2 +// ge_precomp: Z = 1 +typedef struct { fe X; fe Y; fe Z; fe T; } ge; +typedef struct { fe Yp; fe Ym; fe Z; fe T2; } ge_cached; +typedef struct { fe Yp; fe Ym; fe T2; } ge_precomp; + +static void ge_zero(ge *p) +{ + fe_0(p->X); + fe_1(p->Y); + fe_1(p->Z); + fe_0(p->T); +} + +static void ge_tobytes(u8 s[32], const ge *h) +{ + fe recip, x, y; + fe_invert(recip, h->Z); + fe_mul(x, h->X, recip); + fe_mul(y, h->Y, recip); + fe_tobytes(s, y); + s[31] ^= fe_isodd(x) << 7; + + WIPE_BUFFER(recip); + WIPE_BUFFER(x); + WIPE_BUFFER(y); +} + +// h = -s, where s is a point encoded in 32 bytes +// +// Variable time! Inputs must not be secret! +// => Use only to *check* signatures. +// +// From the specifications: +// The encoding of s contains y and the sign of x +// x = sqrt((y^2 - 1) / (d*y^2 + 1)) +// In extended coordinates: +// X = x, Y = y, Z = 1, T = x*y +// +// Note that num * den is a square iff num / den is a square +// If num * den is not a square, the point was not on the curve. +// From the above: +// Let num = y^2 - 1 +// Let den = d*y^2 + 1 +// x = sqrt((y^2 - 1) / (d*y^2 + 1)) +// x = sqrt(num / den) +// x = sqrt(num^2 / (num * den)) +// x = num * sqrt(1 / (num * den)) +// +// Therefore, we can just compute: +// num = y^2 - 1 +// den = d*y^2 + 1 +// isr = invsqrt(num * den) // abort if not square +// x = num * isr +// Finally, negate x if its sign is not as specified. +static int ge_frombytes_neg_vartime(ge *h, const u8 s[32]) +{ + fe_frombytes(h->Y, s); + fe_1(h->Z); + fe_sq (h->T, h->Y); // t = y^2 + fe_mul(h->X, h->T, d ); // x = d*y^2 + fe_sub(h->T, h->T, h->Z); // t = y^2 - 1 + fe_add(h->X, h->X, h->Z); // x = d*y^2 + 1 + fe_mul(h->X, h->T, h->X); // x = (y^2 - 1) * (d*y^2 + 1) + int is_square = invsqrt(h->X, h->X); + if (!is_square) { + return -1; // Not on the curve, abort + } + fe_mul(h->X, h->T, h->X); // x = sqrt((y^2 - 1) / (d*y^2 + 1)) + if (fe_isodd(h->X) == (s[31] >> 7)) { + fe_neg(h->X, h->X); + } + fe_mul(h->T, h->X, h->Y); + return 0; +} + +static void ge_cache(ge_cached *c, const ge *p) +{ + fe_add (c->Yp, p->Y, p->X); + fe_sub (c->Ym, p->Y, p->X); + fe_copy(c->Z , p->Z ); + fe_mul (c->T2, p->T, D2 ); +} + +// Internal buffers are not wiped! Inputs must not be secret! +// => Use only to *check* signatures. +static void ge_add(ge *s, const ge *p, const ge_cached *q) +{ + fe a, b; + fe_add(a , p->Y, p->X ); + fe_sub(b , p->Y, p->X ); + fe_mul(a , a , q->Yp); + fe_mul(b , b , q->Ym); + fe_add(s->Y, a , b ); + fe_sub(s->X, a , b ); + + fe_add(s->Z, p->Z, p->Z ); + fe_mul(s->Z, s->Z, q->Z ); + fe_mul(s->T, p->T, q->T2); + fe_add(a , s->Z, s->T ); + fe_sub(b , s->Z, s->T ); + + fe_mul(s->T, s->X, s->Y); + fe_mul(s->X, s->X, b ); + fe_mul(s->Y, s->Y, a ); + fe_mul(s->Z, a , b ); +} + +// Internal buffers are not wiped! Inputs must not be secret! +// => Use only to *check* signatures. +static void ge_sub(ge *s, const ge *p, const ge_cached *q) +{ + ge_cached neg; + fe_copy(neg.Ym, q->Yp); + fe_copy(neg.Yp, q->Ym); + fe_copy(neg.Z , q->Z ); + fe_neg (neg.T2, q->T2); + ge_add(s, p, &neg); +} + +static void ge_madd(ge *s, const ge *p, const ge_precomp *q, fe a, fe b) +{ + fe_add(a , p->Y, p->X ); + fe_sub(b , p->Y, p->X ); + fe_mul(a , a , q->Yp); + fe_mul(b , b , q->Ym); + fe_add(s->Y, a , b ); + fe_sub(s->X, a , b ); + + fe_add(s->Z, p->Z, p->Z ); + fe_mul(s->T, p->T, q->T2); + fe_add(a , s->Z, s->T ); + fe_sub(b , s->Z, s->T ); + + fe_mul(s->T, s->X, s->Y); + fe_mul(s->X, s->X, b ); + fe_mul(s->Y, s->Y, a ); + fe_mul(s->Z, a , b ); +} + +// Internal buffers are not wiped! Inputs must not be secret! +// => Use only to *check* signatures. +static void ge_msub(ge *s, const ge *p, const ge_precomp *q, fe a, fe b) +{ + ge_precomp neg; + fe_copy(neg.Ym, q->Yp); + fe_copy(neg.Yp, q->Ym); + fe_neg (neg.T2, q->T2); + ge_madd(s, p, &neg, a, b); +} + +static void ge_double(ge *s, const ge *p, ge *q) +{ + fe_sq (q->X, p->X); + fe_sq (q->Y, p->Y); + fe_sq (q->Z, p->Z); // qZ = pZ^2 + fe_mul_small(q->Z, q->Z, 2); // qZ = pZ^2 * 2 + fe_add(q->T, p->X, p->Y); + fe_sq (s->T, q->T); + fe_add(q->T, q->Y, q->X); + fe_sub(q->Y, q->Y, q->X); + fe_sub(q->X, s->T, q->T); + fe_sub(q->Z, q->Z, q->Y); + + fe_mul(s->X, q->X , q->Z); + fe_mul(s->Y, q->T , q->Y); + fe_mul(s->Z, q->Y , q->Z); + fe_mul(s->T, q->X , q->T); +} + +// 5-bit signed window in cached format (Niels coordinates, Z=1) +static const ge_precomp b_window[8] = { + {{25967493,-14356035,29566456,3660896,-12694345, + 4014787,27544626,-11754271,-6079156,2047605,}, + {-12545711,934262,-2722910,3049990,-727428, + 9406986,12720692,5043384,19500929,-15469378,}, + {-8738181,4489570,9688441,-14785194,10184609, + -12363380,29287919,11864899,-24514362,-4438546,},}, + {{15636291,-9688557,24204773,-7912398,616977, + -16685262,27787600,-14772189,28944400,-1550024,}, + {16568933,4717097,-11556148,-1102322,15682896, + -11807043,16354577,-11775962,7689662,11199574,}, + {30464156,-5976125,-11779434,-15670865,23220365, + 15915852,7512774,10017326,-17749093,-9920357,},}, + {{10861363,11473154,27284546,1981175,-30064349, + 12577861,32867885,14515107,-15438304,10819380,}, + {4708026,6336745,20377586,9066809,-11272109, + 6594696,-25653668,12483688,-12668491,5581306,}, + {19563160,16186464,-29386857,4097519,10237984, + -4348115,28542350,13850243,-23678021,-15815942,},}, + {{5153746,9909285,1723747,-2777874,30523605, + 5516873,19480852,5230134,-23952439,-15175766,}, + {-30269007,-3463509,7665486,10083793,28475525, + 1649722,20654025,16520125,30598449,7715701,}, + {28881845,14381568,9657904,3680757,-20181635, + 7843316,-31400660,1370708,29794553,-1409300,},}, + {{-22518993,-6692182,14201702,-8745502,-23510406, + 8844726,18474211,-1361450,-13062696,13821877,}, + {-6455177,-7839871,3374702,-4740862,-27098617, + -10571707,31655028,-7212327,18853322,-14220951,}, + {4566830,-12963868,-28974889,-12240689,-7602672, + -2830569,-8514358,-10431137,2207753,-3209784,},}, + {{-25154831,-4185821,29681144,7868801,-6854661, + -9423865,-12437364,-663000,-31111463,-16132436,}, + {25576264,-2703214,7349804,-11814844,16472782, + 9300885,3844789,15725684,171356,6466918,}, + {23103977,13316479,9739013,-16149481,817875, + -15038942,8965339,-14088058,-30714912,16193877,},}, + {{-33521811,3180713,-2394130,14003687,-16903474, + -16270840,17238398,4729455,-18074513,9256800,}, + {-25182317,-4174131,32336398,5036987,-21236817, + 11360617,22616405,9761698,-19827198,630305,}, + {-13720693,2639453,-24237460,-7406481,9494427, + -5774029,-6554551,-15960994,-2449256,-14291300,},}, + {{-3151181,-5046075,9282714,6866145,-31907062, + -863023,-18940575,15033784,25105118,-7894876,}, + {-24326370,15950226,-31801215,-14592823,-11662737, + -5090925,1573892,-2625887,2198790,-15804619,}, + {-3099351,10324967,-2241613,7453183,-5446979, + -2735503,-13812022,-16236442,-32461234,-12290683,},}, +}; + +// Incremental sliding windows (left to right) +// Based on Roberto Maria Avanzi[2005] +typedef struct { + i16 next_index; // position of the next signed digit + i8 next_digit; // next signed digit (odd number below 2^window_width) + u8 next_check; // point at which we must check for a new window +} slide_ctx; + +static void slide_init(slide_ctx *ctx, const u8 scalar[32]) +{ + // scalar is guaranteed to be below L, either because we checked (s), + // or because we reduced it modulo L (h_ram). L is under 2^253, so + // so bits 253 to 255 are guaranteed to be zero. No need to test them. + // + // Note however that L is very close to 2^252, so bit 252 is almost + // always zero. If we were to start at bit 251, the tests wouldn't + // catch the off-by-one error (constructing one that does would be + // prohibitively expensive). + // + // We should still check bit 252, though. + int i = 252; + while (i > 0 && scalar_bit(scalar, i) == 0) { + i--; + } + ctx->next_check = (u8)(i + 1); + ctx->next_index = -1; + ctx->next_digit = -1; +} + +static int slide_step(slide_ctx *ctx, int width, int i, const u8 scalar[32]) +{ + if (i == ctx->next_check) { + if (scalar_bit(scalar, i) == scalar_bit(scalar, i - 1)) { + ctx->next_check--; + } else { + // compute digit of next window + int w = MIN(width, i + 1); + int v = -(scalar_bit(scalar, i) << (w-1)); + FOR_T (int, j, 0, w-1) { + v += scalar_bit(scalar, i-(w-1)+j) << j; + } + v += scalar_bit(scalar, i-w); + int lsb = v & (~v + 1); // smallest bit of v + int s = // log2(lsb) + (((lsb & 0xAA) != 0) << 0) | + (((lsb & 0xCC) != 0) << 1) | + (((lsb & 0xF0) != 0) << 2); + ctx->next_index = (i16)(i-(w-1)+s); + ctx->next_digit = (i8) (v >> s ); + ctx->next_check -= (u8) w; + } + } + return i == ctx->next_index ? ctx->next_digit: 0; +} + +#define P_W_WIDTH 3 // Affects the size of the stack +#define B_W_WIDTH 5 // Affects the size of the binary +#define P_W_SIZE (1<<(P_W_WIDTH-2)) + +int crypto_eddsa_check_equation(const u8 signature[64], const u8 public_key[32], + const u8 h[32]) +{ + ge minus_A; // -public_key + ge minus_R; // -first_half_of_signature + const u8 *s = signature + 32; + + // Check that A and R are on the curve + // Check that 0 <= S < L (prevents malleability) + // *Allow* non-cannonical encoding for A and R + { + u32 s32[8]; + load32_le_buf(s32, s, 8); + if (ge_frombytes_neg_vartime(&minus_A, public_key) || + ge_frombytes_neg_vartime(&minus_R, signature) || + is_above_l(s32)) { + return -1; + } + } + + // look-up table for minus_A + ge_cached lutA[P_W_SIZE]; + { + ge minus_A2, tmp; + ge_double(&minus_A2, &minus_A, &tmp); + ge_cache(&lutA[0], &minus_A); + FOR (i, 1, P_W_SIZE) { + ge_add(&tmp, &minus_A2, &lutA[i-1]); + ge_cache(&lutA[i], &tmp); + } + } + + // sum = [s]B - [h]A + // Merged double and add ladder, fused with sliding + slide_ctx h_slide; slide_init(&h_slide, h); + slide_ctx s_slide; slide_init(&s_slide, s); + int i = MAX(h_slide.next_check, s_slide.next_check); + ge *sum = &minus_A; // reuse minus_A for the sum + ge_zero(sum); + while (i >= 0) { + ge tmp; + ge_double(sum, sum, &tmp); + int h_digit = slide_step(&h_slide, P_W_WIDTH, i, h); + int s_digit = slide_step(&s_slide, B_W_WIDTH, i, s); + if (h_digit > 0) { ge_add(sum, sum, &lutA[ h_digit / 2]); } + if (h_digit < 0) { ge_sub(sum, sum, &lutA[-h_digit / 2]); } + fe t1, t2; + if (s_digit > 0) { ge_madd(sum, sum, b_window + s_digit/2, t1, t2); } + if (s_digit < 0) { ge_msub(sum, sum, b_window + -s_digit/2, t1, t2); } + i--; + } + + // Compare [8](sum-R) and the zero point + // The multiplication by 8 eliminates any low-order component + // and ensures consistency with batched verification. + ge_cached cached; + u8 check[32]; + static const u8 zero_point[32] = {1}; // Point of order 1 + ge_cache(&cached, &minus_R); + ge_add(sum, sum, &cached); + ge_double(sum, sum, &minus_R); // reuse minus_R as temporary + ge_double(sum, sum, &minus_R); // reuse minus_R as temporary + ge_double(sum, sum, &minus_R); // reuse minus_R as temporary + ge_tobytes(check, sum); + return crypto_verify32(check, zero_point); +} + +// 5-bit signed comb in cached format (Niels coordinates, Z=1) +static const ge_precomp b_comb_low[8] = { + {{-6816601,-2324159,-22559413,124364,18015490, + 8373481,19993724,1979872,-18549925,9085059,}, + {10306321,403248,14839893,9633706,8463310, + -8354981,-14305673,14668847,26301366,2818560,}, + {-22701500,-3210264,-13831292,-2927732,-16326337, + -14016360,12940910,177905,12165515,-2397893,},}, + {{-12282262,-7022066,9920413,-3064358,-32147467, + 2927790,22392436,-14852487,2719975,16402117,}, + {-7236961,-4729776,2685954,-6525055,-24242706, + -15940211,-6238521,14082855,10047669,12228189,}, + {-30495588,-12893761,-11161261,3539405,-11502464, + 16491580,-27286798,-15030530,-7272871,-15934455,},}, + {{17650926,582297,-860412,-187745,-12072900, + -10683391,-20352381,15557840,-31072141,-5019061,}, + {-6283632,-2259834,-4674247,-4598977,-4089240, + 12435688,-31278303,1060251,6256175,10480726,}, + {-13871026,2026300,-21928428,-2741605,-2406664, + -8034988,7355518,15733500,-23379862,7489131,},}, + {{6883359,695140,23196907,9644202,-33430614, + 11354760,-20134606,6388313,-8263585,-8491918,}, + {-7716174,-13605463,-13646110,14757414,-19430591, + -14967316,10359532,-11059670,-21935259,12082603,}, + {-11253345,-15943946,10046784,5414629,24840771, + 8086951,-6694742,9868723,15842692,-16224787,},}, + {{9639399,11810955,-24007778,-9320054,3912937, + -9856959,996125,-8727907,-8919186,-14097242,}, + {7248867,14468564,25228636,-8795035,14346339, + 8224790,6388427,-7181107,6468218,-8720783,}, + {15513115,15439095,7342322,-10157390,18005294, + -7265713,2186239,4884640,10826567,7135781,},}, + {{-14204238,5297536,-5862318,-6004934,28095835, + 4236101,-14203318,1958636,-16816875,3837147,}, + {-5511166,-13176782,-29588215,12339465,15325758, + -15945770,-8813185,11075932,-19608050,-3776283,}, + {11728032,9603156,-4637821,-5304487,-7827751, + 2724948,31236191,-16760175,-7268616,14799772,},}, + {{-28842672,4840636,-12047946,-9101456,-1445464, + 381905,-30977094,-16523389,1290540,12798615,}, + {27246947,-10320914,14792098,-14518944,5302070, + -8746152,-3403974,-4149637,-27061213,10749585,}, + {25572375,-6270368,-15353037,16037944,1146292, + 32198,23487090,9585613,24714571,-1418265,},}, + {{19844825,282124,-17583147,11004019,-32004269, + -2716035,6105106,-1711007,-21010044,14338445,}, + {8027505,8191102,-18504907,-12335737,25173494, + -5923905,15446145,7483684,-30440441,10009108,}, + {-14134701,-4174411,10246585,-14677495,33553567, + -14012935,23366126,15080531,-7969992,7663473,},}, +}; + +static const ge_precomp b_comb_high[8] = { + {{33055887,-4431773,-521787,6654165,951411, + -6266464,-5158124,6995613,-5397442,-6985227,}, + {4014062,6967095,-11977872,3960002,8001989, + 5130302,-2154812,-1899602,-31954493,-16173976,}, + {16271757,-9212948,23792794,731486,-25808309, + -3546396,6964344,-4767590,10976593,10050757,},}, + {{2533007,-4288439,-24467768,-12387405,-13450051, + 14542280,12876301,13893535,15067764,8594792,}, + {20073501,-11623621,3165391,-13119866,13188608, + -11540496,-10751437,-13482671,29588810,2197295,}, + {-1084082,11831693,6031797,14062724,14748428, + -8159962,-20721760,11742548,31368706,13161200,},}, + {{2050412,-6457589,15321215,5273360,25484180, + 124590,-18187548,-7097255,-6691621,-14604792,}, + {9938196,2162889,-6158074,-1711248,4278932, + -2598531,-22865792,-7168500,-24323168,11746309,}, + {-22691768,-14268164,5965485,9383325,20443693, + 5854192,28250679,-1381811,-10837134,13717818,},}, + {{-8495530,16382250,9548884,-4971523,-4491811, + -3902147,6182256,-12832479,26628081,10395408,}, + {27329048,-15853735,7715764,8717446,-9215518, + -14633480,28982250,-5668414,4227628,242148,}, + {-13279943,-7986904,-7100016,8764468,-27276630, + 3096719,29678419,-9141299,3906709,11265498,},}, + {{11918285,15686328,-17757323,-11217300,-27548967, + 4853165,-27168827,6807359,6871949,-1075745,}, + {-29002610,13984323,-27111812,-2713442,28107359, + -13266203,6155126,15104658,3538727,-7513788,}, + {14103158,11233913,-33165269,9279850,31014152, + 4335090,-1827936,4590951,13960841,12787712,},}, + {{1469134,-16738009,33411928,13942824,8092558, + -8778224,-11165065,1437842,22521552,-2792954,}, + {31352705,-4807352,-25327300,3962447,12541566, + -9399651,-27425693,7964818,-23829869,5541287,}, + {-25732021,-6864887,23848984,3039395,-9147354, + 6022816,-27421653,10590137,25309915,-1584678,},}, + {{-22951376,5048948,31139401,-190316,-19542447, + -626310,-17486305,-16511925,-18851313,-12985140,}, + {-9684890,14681754,30487568,7717771,-10829709, + 9630497,30290549,-10531496,-27798994,-13812825,}, + {5827835,16097107,-24501327,12094619,7413972, + 11447087,28057551,-1793987,-14056981,4359312,},}, + {{26323183,2342588,-21887793,-1623758,-6062284, + 2107090,-28724907,9036464,-19618351,-13055189,}, + {-29697200,14829398,-4596333,14220089,-30022969, + 2955645,12094100,-13693652,-5941445,7047569,}, + {-3201977,14413268,-12058324,-16417589,-9035655, + -7224648,9258160,1399236,30397584,-5684634,},}, +}; + +static void lookup_add(ge *p, ge_precomp *tmp_c, fe tmp_a, fe tmp_b, + const ge_precomp comb[8], const u8 scalar[32], int i) +{ + u8 teeth = (u8)((scalar_bit(scalar, i) ) + + (scalar_bit(scalar, i + 32) << 1) + + (scalar_bit(scalar, i + 64) << 2) + + (scalar_bit(scalar, i + 96) << 3)); + u8 high = teeth >> 3; + u8 index = (teeth ^ (high - 1)) & 7; + FOR (j, 0, 8) { + i32 select = 1 & (((j ^ index) - 1) >> 8); + fe_ccopy(tmp_c->Yp, comb[j].Yp, select); + fe_ccopy(tmp_c->Ym, comb[j].Ym, select); + fe_ccopy(tmp_c->T2, comb[j].T2, select); + } + fe_neg(tmp_a, tmp_c->T2); + fe_cswap(tmp_c->T2, tmp_a , high ^ 1); + fe_cswap(tmp_c->Yp, tmp_c->Ym, high ^ 1); + ge_madd(p, p, tmp_c, tmp_a, tmp_b); +} + +// p = [scalar]B, where B is the base point +static void ge_scalarmult_base(ge *p, const u8 scalar[32]) +{ + // twin 4-bits signed combs, from Mike Hamburg's + // Fast and compact elliptic-curve cryptography (2012) + // 1 / 2 modulo L + static const u8 half_mod_L[32] = { + 247,233,122,46,141,49,9,44,107,206,123,81,239,124,111,10, + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,8, + }; + // (2^256 - 1) / 2 modulo L + static const u8 half_ones[32] = { + 142,74,204,70,186,24,118,107,184,231,190,57,250,173,119,99, + 255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,7, + }; + + // All bits set form: 1 means 1, 0 means -1 + u8 s_scalar[32]; + crypto_eddsa_mul_add(s_scalar, scalar, half_mod_L, half_ones); + + // Double and add ladder + fe tmp_a, tmp_b; // temporaries for addition + ge_precomp tmp_c; // temporary for comb lookup + ge tmp_d; // temporary for doubling + fe_1(tmp_c.Yp); + fe_1(tmp_c.Ym); + fe_0(tmp_c.T2); + + // Save a double on the first iteration + ge_zero(p); + lookup_add(p, &tmp_c, tmp_a, tmp_b, b_comb_low , s_scalar, 31); + lookup_add(p, &tmp_c, tmp_a, tmp_b, b_comb_high, s_scalar, 31+128); + // Regular double & add for the rest + for (int i = 30; i >= 0; i--) { + ge_double(p, p, &tmp_d); + lookup_add(p, &tmp_c, tmp_a, tmp_b, b_comb_low , s_scalar, i); + lookup_add(p, &tmp_c, tmp_a, tmp_b, b_comb_high, s_scalar, i+128); + } + // Note: we could save one addition at the end if we assumed the + // scalar fit in 252 bits. Which it does in practice if it is + // selected at random. However, non-random, non-hashed scalars + // *can* overflow 252 bits in practice. Better account for that + // than leaving that kind of subtle corner case. + + WIPE_BUFFER(tmp_a); WIPE_CTX(&tmp_d); + WIPE_BUFFER(tmp_b); WIPE_CTX(&tmp_c); + WIPE_BUFFER(s_scalar); +} + +void crypto_eddsa_scalarbase(u8 point[32], const u8 scalar[32]) +{ + ge P; + ge_scalarmult_base(&P, scalar); + ge_tobytes(point, &P); + WIPE_CTX(&P); +} + +void crypto_eddsa_key_pair(u8 secret_key[64], u8 public_key[32], u8 seed[32]) +{ + // To allow overlaps, observable writes happen in this order: + // 1. seed + // 2. secret_key + // 3. public_key + u8 a[64]; + COPY(a, seed, 32); + crypto_wipe(seed, 32); + COPY(secret_key, a, 32); + crypto_blake2b(a, 64, a, 32); + crypto_eddsa_trim_scalar(a, a); + crypto_eddsa_scalarbase(secret_key + 32, a); + COPY(public_key, secret_key + 32, 32); + WIPE_BUFFER(a); +} + +static void hash_reduce(u8 h[32], + const u8 *a, size_t a_size, + const u8 *b, size_t b_size, + const u8 *c, size_t c_size) +{ + u8 hash[64]; + crypto_blake2b_ctx ctx; + crypto_blake2b_init (&ctx, 64); + crypto_blake2b_update(&ctx, a, a_size); + crypto_blake2b_update(&ctx, b, b_size); + crypto_blake2b_update(&ctx, c, c_size); + crypto_blake2b_final (&ctx, hash); + crypto_eddsa_reduce(h, hash); +} + +// Digital signature of a message with from a secret key. +// +// The secret key comprises two parts: +// - The seed that generates the key (secret_key[ 0..31]) +// - The public key (secret_key[32..63]) +// +// The seed and the public key are bundled together to make sure users +// don't use mismatched seeds and public keys, which would instantly +// leak the secret scalar and allow forgeries (allowing this to happen +// has resulted in critical vulnerabilities in the wild). +// +// The seed is hashed to derive the secret scalar and a secret prefix. +// The sole purpose of the prefix is to generate a secret random nonce. +// The properties of that nonce must be as follows: +// - Unique: we need a different one for each message. +// - Secret: third parties must not be able to predict it. +// - Random: any detectable bias would break all security. +// +// There are two ways to achieve these properties. The obvious one is +// to simply generate a random number. Here that would be a parameter +// (Monocypher doesn't have an RNG). It works, but then users may reuse +// the nonce by accident, which _also_ leaks the secret scalar and +// allows forgeries. This has happened in the wild too. +// +// This is no good, so instead we generate that nonce deterministically +// by reducing modulo L a hash of the secret prefix and the message. +// The secret prefix makes the nonce unpredictable, the message makes it +// unique, and the hash/reduce removes all bias. +// +// The cost of that safety is hashing the message twice. If that cost +// is unacceptable, there are two alternatives: +// +// - Signing a hash of the message instead of the message itself. This +// is fine as long as the hash is collision resistant. It is not +// compatible with existing "pure" signatures, but at least it's safe. +// +// - Using a random nonce. Please exercise **EXTREME CAUTION** if you +// ever do that. It is absolutely **critical** that the nonce is +// really an unbiased random number between 0 and L-1, never reused, +// and wiped immediately. +// +// To lower the likelihood of complete catastrophe if the RNG is +// either flawed or misused, you can hash the RNG output together with +// the secret prefix and the beginning of the message, and use the +// reduction of that hash instead of the RNG output itself. It's not +// foolproof (you'd need to hash the whole message) but it helps. +// +// Signing a message involves the following operations: +// +// scalar, prefix = HASH(secret_key) +// r = HASH(prefix || message) % L +// R = [r]B +// h = HASH(R || public_key || message) % L +// S = ((h * a) + r) % L +// signature = R || S +void crypto_eddsa_sign(u8 signature [64], const u8 secret_key[64], + const u8 *message, size_t message_size) +{ + u8 a[64]; // secret scalar and prefix + u8 r[32]; // secret deterministic "random" nonce + u8 h[32]; // publically verifiable hash of the message (not wiped) + u8 R[32]; // first half of the signature (allows overlapping inputs) + + crypto_blake2b(a, 64, secret_key, 32); + crypto_eddsa_trim_scalar(a, a); + hash_reduce(r, a + 32, 32, message, message_size, 0, 0); + crypto_eddsa_scalarbase(R, r); + hash_reduce(h, R, 32, secret_key + 32, 32, message, message_size); + COPY(signature, R, 32); + crypto_eddsa_mul_add(signature + 32, h, a, r); + + WIPE_BUFFER(a); + WIPE_BUFFER(r); +} + +// To check the signature R, S of the message M with the public key A, +// there are 3 steps: +// +// compute h = HASH(R || A || message) % L +// check that A is on the curve. +// check that R == [s]B - [h]A +// +// The last two steps are done in crypto_eddsa_check_equation() +int crypto_eddsa_check(const u8 signature[64], const u8 public_key[32], + const u8 *message, size_t message_size) +{ + u8 h[32]; + hash_reduce(h, signature, 32, public_key, 32, message, message_size); + return crypto_eddsa_check_equation(signature, public_key, h); +} + +///////////////////////// +/// EdDSA <--> X25519 /// +///////////////////////// +void crypto_eddsa_to_x25519(u8 x25519[32], const u8 eddsa[32]) +{ + // (u, v) = ((1+y)/(1-y), sqrt(-486664)*u/x) + // Only converting y to u, the sign of x is ignored. + fe t1, t2; + fe_frombytes(t2, eddsa); + fe_add(t1, fe_one, t2); + fe_sub(t2, fe_one, t2); + fe_invert(t2, t2); + fe_mul(t1, t1, t2); + fe_tobytes(x25519, t1); + WIPE_BUFFER(t1); + WIPE_BUFFER(t2); +} + +void crypto_x25519_to_eddsa(u8 eddsa[32], const u8 x25519[32]) +{ + // (x, y) = (sqrt(-486664)*u/v, (u-1)/(u+1)) + // Only converting u to y, x is assumed positive. + fe t1, t2; + fe_frombytes(t2, x25519); + fe_sub(t1, t2, fe_one); + fe_add(t2, t2, fe_one); + fe_invert(t2, t2); + fe_mul(t1, t1, t2); + fe_tobytes(eddsa, t1); + WIPE_BUFFER(t1); + WIPE_BUFFER(t2); +} + +///////////////////////////////////////////// +/// Dirty ephemeral public key generation /// +///////////////////////////////////////////// + +// Those functions generates a public key, *without* clearing the +// cofactor. Sending that key over the network leaks 3 bits of the +// private key. Use only to generate ephemeral keys that will be hidden +// with crypto_curve_to_hidden(). +// +// The public key is otherwise compatible with crypto_x25519(), which +// properly clears the cofactor. +// +// Note that the distribution of the resulting public keys is almost +// uniform. Flipping the sign of the v coordinate (not provided by this +// function), covers the entire key space almost perfectly, where +// "almost" means a 2^-128 bias (undetectable). This uniformity is +// needed to ensure the proper randomness of the resulting +// representatives (once we apply crypto_curve_to_hidden()). +// +// Recall that Curve25519 has order C = 2^255 + e, with e < 2^128 (not +// to be confused with the prime order of the main subgroup, L, which is +// 8 times less than that). +// +// Generating all points would require us to multiply a point of order C +// (the base point plus any point of order 8) by all scalars from 0 to +// C-1. Clamping limits us to scalars between 2^254 and 2^255 - 1. But +// by negating the resulting point at random, we also cover scalars from +// -2^255 + 1 to -2^254 (which modulo C is congruent to e+1 to 2^254 + e). +// +// In practice: +// - Scalars from 0 to e + 1 are never generated +// - Scalars from 2^255 to 2^255 + e are never generated +// - Scalars from 2^254 + 1 to 2^254 + e are generated twice +// +// Since e < 2^128, detecting this bias requires observing over 2^100 +// representatives from a given source (this will never happen), *and* +// recovering enough of the private key to determine that they do, or do +// not, belong to the biased set (this practically requires solving +// discrete logarithm, which is conjecturally intractable). +// +// In practice, this means the bias is impossible to detect. + +// s + (x*L) % 8*L +// Guaranteed to fit in 256 bits iff s fits in 255 bits. +// L < 2^253 +// x%8 < 2^3 +// L * (x%8) < 2^255 +// s < 2^255 +// s + L * (x%8) < 2^256 +static void add_xl(u8 s[32], u8 x) +{ + u64 mod8 = x & 7; + u64 carry = 0; + FOR (i , 0, 8) { + carry = carry + load32_le(s + 4*i) + L[i] * mod8; + store32_le(s + 4*i, (u32)carry); + carry >>= 32; + } +} + +// "Small" dirty ephemeral key. +// Use if you need to shrink the size of the binary, and can afford to +// slow down by a factor of two (compared to the fast version) +// +// This version works by decoupling the cofactor from the main factor. +// +// - The trimmed scalar determines the main factor +// - The clamped bits of the scalar determine the cofactor. +// +// Cofactor and main factor are combined into a single scalar, which is +// then multiplied by a point of order 8*L (unlike the base point, which +// has prime order). That "dirty" base point is the addition of the +// regular base point (9), and a point of order 8. +void crypto_x25519_dirty_small(u8 public_key[32], const u8 secret_key[32]) +{ + // Base point of order 8*L + // Raw scalar multiplication with it does not clear the cofactor, + // and the resulting public key will reveal 3 bits of the scalar. + // + // The low order component of this base point has been chosen + // to yield the same results as crypto_x25519_dirty_fast(). + static const u8 dirty_base_point[32] = { + 0xd8, 0x86, 0x1a, 0xa2, 0x78, 0x7a, 0xd9, 0x26, + 0x8b, 0x74, 0x74, 0xb6, 0x82, 0xe3, 0xbe, 0xc3, + 0xce, 0x36, 0x9a, 0x1e, 0x5e, 0x31, 0x47, 0xa2, + 0x6d, 0x37, 0x7c, 0xfd, 0x20, 0xb5, 0xdf, 0x75, + }; + // separate the main factor & the cofactor of the scalar + u8 scalar[32]; + crypto_eddsa_trim_scalar(scalar, secret_key); + + // Separate the main factor and the cofactor + // + // The scalar is trimmed, so its cofactor is cleared. The three + // least significant bits however still have a main factor. We must + // remove it for X25519 compatibility. + // + // cofactor = lsb * L (modulo 8*L) + // combined = scalar + cofactor (modulo 8*L) + add_xl(scalar, secret_key[0]); + scalarmult(public_key, scalar, dirty_base_point, 256); + WIPE_BUFFER(scalar); +} + +// Select low order point +// We're computing the [cofactor]lop scalar multiplication, where: +// +// cofactor = tweak & 7. +// lop = (lop_x, lop_y) +// lop_x = sqrt((sqrt(d + 1) + 1) / d) +// lop_y = -lop_x * sqrtm1 +// +// The low order point has order 8. There are 4 such points. We've +// chosen the one whose both coordinates are positive (below p/2). +// The 8 low order points are as follows: +// +// [0]lop = ( 0 , 1 ) +// [1]lop = ( lop_x , lop_y) +// [2]lop = ( sqrt(-1), -0 ) +// [3]lop = ( lop_x , -lop_y) +// [4]lop = (-0 , -1 ) +// [5]lop = (-lop_x , -lop_y) +// [6]lop = (-sqrt(-1), 0 ) +// [7]lop = (-lop_x , lop_y) +// +// The x coordinate is either 0, sqrt(-1), lop_x, or their opposite. +// The y coordinate is either 0, -1 , lop_y, or their opposite. +// The pattern for both is the same, except for a rotation of 2 (modulo 8) +// +// This helper function captures the pattern, and we can use it thus: +// +// select_lop(x, lop_x, sqrtm1, cofactor); +// select_lop(y, lop_y, fe_one, cofactor + 2); +// +// This is faster than an actual scalar multiplication, +// and requires less code than naive constant time look up. +static void select_lop(fe out, const fe x, const fe k, u8 cofactor) +{ + fe tmp; + fe_0(out); + fe_ccopy(out, k , (cofactor >> 1) & 1); // bit 1 + fe_ccopy(out, x , (cofactor >> 0) & 1); // bit 0 + fe_neg (tmp, out); + fe_ccopy(out, tmp, (cofactor >> 2) & 1); // bit 2 + WIPE_BUFFER(tmp); +} + +// "Fast" dirty ephemeral key +// We use this one by default. +// +// This version works by performing a regular scalar multiplication, +// then add a low order point. The scalar multiplication is done in +// Edwards space for more speed (*2 compared to the "small" version). +// The cost is a bigger binary for programs that don't also sign messages. +void crypto_x25519_dirty_fast(u8 public_key[32], const u8 secret_key[32]) +{ + // Compute clean scalar multiplication + u8 scalar[32]; + ge pk; + crypto_eddsa_trim_scalar(scalar, secret_key); + ge_scalarmult_base(&pk, scalar); + + // Compute low order point + fe t1, t2; + select_lop(t1, lop_x, sqrtm1, secret_key[0]); + select_lop(t2, lop_y, fe_one, secret_key[0] + 2); + ge_precomp low_order_point; + fe_add(low_order_point.Yp, t2, t1); + fe_sub(low_order_point.Ym, t2, t1); + fe_mul(low_order_point.T2, t2, t1); + fe_mul(low_order_point.T2, low_order_point.T2, D2); + + // Add low order point to the public key + ge_madd(&pk, &pk, &low_order_point, t1, t2); + + // Convert to Montgomery u coordinate (we ignore the sign) + fe_add(t1, pk.Z, pk.Y); + fe_sub(t2, pk.Z, pk.Y); + fe_invert(t2, t2); + fe_mul(t1, t1, t2); + + fe_tobytes(public_key, t1); + + WIPE_BUFFER(t1); WIPE_CTX(&pk); + WIPE_BUFFER(t2); WIPE_CTX(&low_order_point); + WIPE_BUFFER(scalar); +} + +/////////////////// +/// Elligator 2 /// +/////////////////// +static const fe A = {486662}; + +// Elligator direct map +// +// Computes the point corresponding to a representative, encoded in 32 +// bytes (little Endian). Since positive representatives fits in 254 +// bits, The two most significant bits are ignored. +// +// From the paper: +// w = -A / (fe(1) + non_square * r^2) +// e = chi(w^3 + A*w^2 + w) +// u = e*w - (fe(1)-e)*(A//2) +// v = -e * sqrt(u^3 + A*u^2 + u) +// +// We ignore v because we don't need it for X25519 (the Montgomery +// ladder only uses u). +// +// Note that e is either 0, 1 or -1 +// if e = 0 u = 0 and v = 0 +// if e = 1 u = w +// if e = -1 u = -w - A = w * non_square * r^2 +// +// Let r1 = non_square * r^2 +// Let r2 = 1 + r1 +// Note that r2 cannot be zero, -1/non_square is not a square. +// We can (tediously) verify that: +// w^3 + A*w^2 + w = (A^2*r1 - r2^2) * A / r2^3 +// Therefore: +// chi(w^3 + A*w^2 + w) = chi((A^2*r1 - r2^2) * (A / r2^3)) +// chi(w^3 + A*w^2 + w) = chi((A^2*r1 - r2^2) * (A / r2^3)) * 1 +// chi(w^3 + A*w^2 + w) = chi((A^2*r1 - r2^2) * (A / r2^3)) * chi(r2^6) +// chi(w^3 + A*w^2 + w) = chi((A^2*r1 - r2^2) * (A / r2^3) * r2^6) +// chi(w^3 + A*w^2 + w) = chi((A^2*r1 - r2^2) * A * r2^3) +// Corollary: +// e = 1 if (A^2*r1 - r2^2) * A * r2^3) is a non-zero square +// e = -1 if (A^2*r1 - r2^2) * A * r2^3) is not a square +// Note that w^3 + A*w^2 + w (and therefore e) can never be zero: +// w^3 + A*w^2 + w = w * (w^2 + A*w + 1) +// w^3 + A*w^2 + w = w * (w^2 + A*w + A^2/4 - A^2/4 + 1) +// w^3 + A*w^2 + w = w * (w + A/2)^2 - A^2/4 + 1) +// which is zero only if: +// w = 0 (impossible) +// (w + A/2)^2 = A^2/4 - 1 (impossible, because A^2/4-1 is not a square) +// +// Let isr = invsqrt((A^2*r1 - r2^2) * A * r2^3) +// isr = sqrt(1 / ((A^2*r1 - r2^2) * A * r2^3)) if e = 1 +// isr = sqrt(sqrt(-1) / ((A^2*r1 - r2^2) * A * r2^3)) if e = -1 +// +// if e = 1 +// let u1 = -A * (A^2*r1 - r2^2) * A * r2^2 * isr^2 +// u1 = w +// u1 = u +// +// if e = -1 +// let ufactor = -non_square * sqrt(-1) * r^2 +// let vfactor = sqrt(ufactor) +// let u2 = -A * (A^2*r1 - r2^2) * A * r2^2 * isr^2 * ufactor +// u2 = w * -1 * -non_square * r^2 +// u2 = w * non_square * r^2 +// u2 = u +void crypto_elligator_map(u8 curve[32], const u8 hidden[32]) +{ + fe r, u, t1, t2, t3; + fe_frombytes_mask(r, hidden, 2); // r is encoded in 254 bits. + fe_sq(r, r); + fe_add(t1, r, r); + fe_add(u, t1, fe_one); + fe_sq (t2, u); + fe_mul(t3, A2, t1); + fe_sub(t3, t3, t2); + fe_mul(t3, t3, A); + fe_mul(t1, t2, u); + fe_mul(t1, t3, t1); + int is_square = invsqrt(t1, t1); + fe_mul(u, r, ufactor); + fe_ccopy(u, fe_one, is_square); + fe_sq (t1, t1); + fe_mul(u, u, A); + fe_mul(u, u, t3); + fe_mul(u, u, t2); + fe_mul(u, u, t1); + fe_neg(u, u); + fe_tobytes(curve, u); + + WIPE_BUFFER(t1); WIPE_BUFFER(r); + WIPE_BUFFER(t2); WIPE_BUFFER(u); + WIPE_BUFFER(t3); +} + +// Elligator inverse map +// +// Computes the representative of a point, if possible. If not, it does +// nothing and returns -1. Note that the success of the operation +// depends only on the point (more precisely its u coordinate). The +// tweak parameter is used only upon success +// +// The tweak should be a random byte. Beyond that, its contents are an +// implementation detail. Currently, the tweak comprises: +// - Bit 1 : sign of the v coordinate (0 if positive, 1 if negative) +// - Bit 2-5: not used +// - Bits 6-7: random padding +// +// From the paper: +// Let sq = -non_square * u * (u+A) +// if sq is not a square, or u = -A, there is no mapping +// Assuming there is a mapping: +// if v is positive: r = sqrt(-u / (non_square * (u+A))) +// if v is negative: r = sqrt(-(u+A) / (non_square * u )) +// +// We compute isr = invsqrt(-non_square * u * (u+A)) +// if it wasn't a square, abort. +// else, isr = sqrt(-1 / (non_square * u * (u+A)) +// +// If v is positive, we return isr * u: +// isr * u = sqrt(-1 / (non_square * u * (u+A)) * u +// isr * u = sqrt(-u / (non_square * (u+A)) +// +// If v is negative, we return isr * (u+A): +// isr * (u+A) = sqrt(-1 / (non_square * u * (u+A)) * (u+A) +// isr * (u+A) = sqrt(-(u+A) / (non_square * u) +int crypto_elligator_rev(u8 hidden[32], const u8 public_key[32], u8 tweak) +{ + fe t1, t2, t3; + fe_frombytes(t1, public_key); // t1 = u + + fe_add(t2, t1, A); // t2 = u + A + fe_mul(t3, t1, t2); + fe_mul_small(t3, t3, -2); + int is_square = invsqrt(t3, t3); // t3 = sqrt(-1 / non_square * u * (u+A)) + if (is_square) { + // The only variable time bit. This ultimately reveals how many + // tries it took us to find a representable key. + // This does not affect security as long as we try keys at random. + + fe_ccopy (t1, t2, tweak & 1); // multiply by u if v is positive, + fe_mul (t3, t1, t3); // multiply by u+A otherwise + fe_mul_small(t1, t3, 2); + fe_neg (t2, t3); + fe_ccopy (t3, t2, fe_isodd(t1)); + fe_tobytes(hidden, t3); + + // Pad with two random bits + hidden[31] |= tweak & 0xc0; + } + + WIPE_BUFFER(t1); + WIPE_BUFFER(t2); + WIPE_BUFFER(t3); + return is_square - 1; +} + +void crypto_elligator_key_pair(u8 hidden[32], u8 secret_key[32], u8 seed[32]) +{ + u8 pk [32]; // public key + u8 buf[64]; // seed + representative + COPY(buf + 32, seed, 32); + do { + crypto_chacha20_djb(buf, 0, 64, buf+32, zero, 0); + crypto_x25519_dirty_fast(pk, buf); // or the "small" version + } while(crypto_elligator_rev(buf+32, pk, buf[32])); + // Note that the return value of crypto_elligator_rev() is + // independent from its tweak parameter. + // Therefore, buf[32] is not actually reused. Either we loop one + // more time and buf[32] is used for the new seed, or we succeeded, + // and buf[32] becomes the tweak parameter. + + crypto_wipe(seed, 32); + COPY(hidden , buf + 32, 32); + COPY(secret_key, buf , 32); + WIPE_BUFFER(buf); + WIPE_BUFFER(pk); +} + +/////////////////////// +/// Scalar division /// +/////////////////////// + +// Montgomery reduction. +// Divides x by (2^256), and reduces the result modulo L +// +// Precondition: +// x < L * 2^256 +// Constants: +// r = 2^256 (makes division by r trivial) +// k = (r * (1/r) - 1) // L (1/r is computed modulo L ) +// Algorithm: +// s = (x * k) % r +// t = x + s*L (t is always a multiple of r) +// u = (t/r) % L (u is always below 2*L, conditional subtraction is enough) +static void redc(u32 u[8], u32 x[16]) +{ + static const u32 k[8] = { + 0x12547e1b, 0xd2b51da3, 0xfdba84ff, 0xb1a206f2, + 0xffa36bea, 0x14e75438, 0x6fe91836, 0x9db6c6f2, + }; + + // s = x * k (modulo 2^256) + // This is cheaper than the full multiplication. + u32 s[8] = {0}; + FOR (i, 0, 8) { + u64 carry = 0; + FOR (j, 0, 8-i) { + carry += s[i+j] + (u64)x[i] * k[j]; + s[i+j] = (u32)carry; + carry >>= 32; + } + } + u32 t[16] = {0}; + multiply(t, s, L); + + // t = t + x + u64 carry = 0; + FOR (i, 0, 16) { + carry += (u64)t[i] + x[i]; + t[i] = (u32)carry; + carry >>= 32; + } + + // u = (t / 2^256) % L + // Note that t / 2^256 is always below 2*L, + // So a constant time conditional subtraction is enough + remove_l(u, t+8); + + WIPE_BUFFER(s); + WIPE_BUFFER(t); +} + +void crypto_x25519_inverse(u8 blind_salt [32], const u8 private_key[32], + const u8 curve_point[32]) +{ + static const u8 Lm2[32] = { // L - 2 + 0xeb, 0xd3, 0xf5, 0x5c, 0x1a, 0x63, 0x12, 0x58, + 0xd6, 0x9c, 0xf7, 0xa2, 0xde, 0xf9, 0xde, 0x14, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x10, + }; + // 1 in Montgomery form + u32 m_inv [8] = { + 0x8d98951d, 0xd6ec3174, 0x737dcf70, 0xc6ef5bf4, + 0xfffffffe, 0xffffffff, 0xffffffff, 0x0fffffff, + }; + + u8 scalar[32]; + crypto_eddsa_trim_scalar(scalar, private_key); + + // Convert the scalar in Montgomery form + // m_scl = scalar * 2^256 (modulo L) + u32 m_scl[8]; + { + u32 tmp[16]; + ZERO(tmp, 8); + load32_le_buf(tmp+8, scalar, 8); + mod_l(scalar, tmp); + load32_le_buf(m_scl, scalar, 8); + WIPE_BUFFER(tmp); // Wipe ASAP to save stack space + } + + // Compute the inverse + u32 product[16]; + for (int i = 252; i >= 0; i--) { + ZERO(product, 16); + multiply(product, m_inv, m_inv); + redc(m_inv, product); + if (scalar_bit(Lm2, i)) { + ZERO(product, 16); + multiply(product, m_inv, m_scl); + redc(m_inv, product); + } + } + // Convert the inverse *out* of Montgomery form + // scalar = m_inv / 2^256 (modulo L) + COPY(product, m_inv, 8); + ZERO(product + 8, 8); + redc(m_inv, product); + store32_le_buf(scalar, m_inv, 8); // the *inverse* of the scalar + + // Clear the cofactor of scalar: + // cleared = scalar * (3*L + 1) (modulo 8*L) + // cleared = scalar + scalar * 3 * L (modulo 8*L) + // Note that (scalar * 3) is reduced modulo 8, so we only need the + // first byte. + add_xl(scalar, scalar[0] * 3); + + // Recall that 8*L < 2^256. However it is also very close to + // 2^255. If we spanned the ladder over 255 bits, random tests + // wouldn't catch the off-by-one error. + scalarmult(blind_salt, scalar, curve_point, 256); + + WIPE_BUFFER(scalar); WIPE_BUFFER(m_scl); + WIPE_BUFFER(product); WIPE_BUFFER(m_inv); +} + +//////////////////////////////// +/// Authenticated encryption /// +//////////////////////////////// +static void lock_auth(u8 mac[16], const u8 auth_key[32], + const u8 *ad , size_t ad_size, + const u8 *cipher_text, size_t text_size) +{ + u8 sizes[16]; // Not secret, not wiped + store64_le(sizes + 0, ad_size); + store64_le(sizes + 8, text_size); + crypto_poly1305_ctx poly_ctx; // auto wiped... + crypto_poly1305_init (&poly_ctx, auth_key); + crypto_poly1305_update(&poly_ctx, ad , ad_size); + crypto_poly1305_update(&poly_ctx, zero , gap(ad_size, 16)); + crypto_poly1305_update(&poly_ctx, cipher_text, text_size); + crypto_poly1305_update(&poly_ctx, zero , gap(text_size, 16)); + crypto_poly1305_update(&poly_ctx, sizes , 16); + crypto_poly1305_final (&poly_ctx, mac); // ...here +} + +void crypto_aead_init_x(crypto_aead_ctx *ctx, + u8 const key[32], const u8 nonce[24]) +{ + crypto_chacha20_h(ctx->key, key, nonce); + COPY(ctx->nonce, nonce + 16, 8); + ctx->counter = 0; +} + +void crypto_aead_init_djb(crypto_aead_ctx *ctx, + const u8 key[32], const u8 nonce[8]) +{ + COPY(ctx->key , key , 32); + COPY(ctx->nonce, nonce, 8); + ctx->counter = 0; +} + +void crypto_aead_init_ietf(crypto_aead_ctx *ctx, + const u8 key[32], const u8 nonce[12]) +{ + COPY(ctx->key , key , 32); + COPY(ctx->nonce, nonce + 4, 8); + ctx->counter = (u64)load32_le(nonce) << 32; +} + +void crypto_aead_write(crypto_aead_ctx *ctx, u8 *cipher_text, u8 mac[16], + const u8 *ad, size_t ad_size, + const u8 *plain_text, size_t text_size) +{ + u8 auth_key[64]; // the last 32 bytes are used for rekeying. + crypto_chacha20_djb(auth_key, 0, 64, ctx->key, ctx->nonce, ctx->counter); + crypto_chacha20_djb(cipher_text, plain_text, text_size, + ctx->key, ctx->nonce, ctx->counter + 1); + lock_auth(mac, auth_key, ad, ad_size, cipher_text, text_size); + COPY(ctx->key, auth_key + 32, 32); + WIPE_BUFFER(auth_key); +} + +int crypto_aead_read(crypto_aead_ctx *ctx, u8 *plain_text, const u8 mac[16], + const u8 *ad, size_t ad_size, + const u8 *cipher_text, size_t text_size) +{ + u8 auth_key[64]; // the last 32 bytes are used for rekeying. + u8 real_mac[16]; + crypto_chacha20_djb(auth_key, 0, 64, ctx->key, ctx->nonce, ctx->counter); + lock_auth(real_mac, auth_key, ad, ad_size, cipher_text, text_size); + int mismatch = crypto_verify16(mac, real_mac); + if (!mismatch) { + crypto_chacha20_djb(plain_text, cipher_text, text_size, + ctx->key, ctx->nonce, ctx->counter + 1); + COPY(ctx->key, auth_key + 32, 32); + } + WIPE_BUFFER(auth_key); + WIPE_BUFFER(real_mac); + return mismatch; +} + +void crypto_aead_lock(u8 *cipher_text, u8 mac[16], const u8 key[32], + const u8 nonce[24], const u8 *ad, size_t ad_size, + const u8 *plain_text, size_t text_size) +{ + crypto_aead_ctx ctx; + crypto_aead_init_x(&ctx, key, nonce); + crypto_aead_write(&ctx, cipher_text, mac, ad, ad_size, + plain_text, text_size); + crypto_wipe(&ctx, sizeof(ctx)); +} + +int crypto_aead_unlock(u8 *plain_text, const u8 mac[16], const u8 key[32], + const u8 nonce[24], const u8 *ad, size_t ad_size, + const u8 *cipher_text, size_t text_size) +{ + crypto_aead_ctx ctx; + crypto_aead_init_x(&ctx, key, nonce); + int mismatch = crypto_aead_read(&ctx, plain_text, mac, ad, ad_size, + cipher_text, text_size); + crypto_wipe(&ctx, sizeof(ctx)); + return mismatch; +} + +#ifdef MONOCYPHER_CPP_NAMESPACE +} +#endif diff --git a/driver/dist/vendor/monocypher/monocypher.h b/driver/dist/vendor/monocypher/monocypher.h @@ -0,0 +1,321 @@ +// Monocypher version 4.0.2 +// +// This file is dual-licensed. Choose whichever licence you want from +// the two licences listed below. +// +// The first licence is a regular 2-clause BSD licence. The second licence +// is the CC-0 from Creative Commons. It is intended to release Monocypher +// to the public domain. The BSD licence serves as a fallback option. +// +// SPDX-License-Identifier: BSD-2-Clause OR CC0-1.0 +// +// ------------------------------------------------------------------------ +// +// Copyright (c) 2017-2019, Loup Vaillant +// All rights reserved. +// +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the +// distribution. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// ------------------------------------------------------------------------ +// +// Written in 2017-2019 by Loup Vaillant +// +// To the extent possible under law, the author(s) have dedicated all copyright +// and related neighboring rights to this software to the public domain +// worldwide. This software is distributed without any warranty. +// +// You should have received a copy of the CC0 Public Domain Dedication along +// with this software. If not, see +// <https://creativecommons.org/publicdomain/zero/1.0/> + +#ifndef MONOCYPHER_H +#define MONOCYPHER_H + +#include <stddef.h> +#include <stdint.h> + +#ifdef MONOCYPHER_CPP_NAMESPACE +namespace MONOCYPHER_CPP_NAMESPACE { +#elif defined(__cplusplus) +extern "C" { +#endif + +// Constant time comparisons +// ------------------------- + +// Return 0 if a and b are equal, -1 otherwise +int crypto_verify16(const uint8_t a[16], const uint8_t b[16]); +int crypto_verify32(const uint8_t a[32], const uint8_t b[32]); +int crypto_verify64(const uint8_t a[64], const uint8_t b[64]); + + +// Erase sensitive data +// -------------------- +void crypto_wipe(void *secret, size_t size); + + +// Authenticated encryption +// ------------------------ +void crypto_aead_lock(uint8_t *cipher_text, + uint8_t mac [16], + const uint8_t key [32], + const uint8_t nonce[24], + const uint8_t *ad, size_t ad_size, + const uint8_t *plain_text, size_t text_size); +int crypto_aead_unlock(uint8_t *plain_text, + const uint8_t mac [16], + const uint8_t key [32], + const uint8_t nonce[24], + const uint8_t *ad, size_t ad_size, + const uint8_t *cipher_text, size_t text_size); + +// Authenticated stream +// -------------------- +typedef struct { + uint64_t counter; + uint8_t key[32]; + uint8_t nonce[8]; +} crypto_aead_ctx; + +void crypto_aead_init_x(crypto_aead_ctx *ctx, + const uint8_t key[32], const uint8_t nonce[24]); +void crypto_aead_init_djb(crypto_aead_ctx *ctx, + const uint8_t key[32], const uint8_t nonce[8]); +void crypto_aead_init_ietf(crypto_aead_ctx *ctx, + const uint8_t key[32], const uint8_t nonce[12]); + +void crypto_aead_write(crypto_aead_ctx *ctx, + uint8_t *cipher_text, + uint8_t mac[16], + const uint8_t *ad , size_t ad_size, + const uint8_t *plain_text, size_t text_size); +int crypto_aead_read(crypto_aead_ctx *ctx, + uint8_t *plain_text, + const uint8_t mac[16], + const uint8_t *ad , size_t ad_size, + const uint8_t *cipher_text, size_t text_size); + + +// General purpose hash (BLAKE2b) +// ------------------------------ + +// Direct interface +void crypto_blake2b(uint8_t *hash, size_t hash_size, + const uint8_t *message, size_t message_size); + +void crypto_blake2b_keyed(uint8_t *hash, size_t hash_size, + const uint8_t *key, size_t key_size, + const uint8_t *message, size_t message_size); + +// Incremental interface +typedef struct { + // Do not rely on the size or contents of this type, + // for they may change without notice. + uint64_t hash[8]; + uint64_t input_offset[2]; + uint64_t input[16]; + size_t input_idx; + size_t hash_size; +} crypto_blake2b_ctx; + +void crypto_blake2b_init(crypto_blake2b_ctx *ctx, size_t hash_size); +void crypto_blake2b_keyed_init(crypto_blake2b_ctx *ctx, size_t hash_size, + const uint8_t *key, size_t key_size); +void crypto_blake2b_update(crypto_blake2b_ctx *ctx, + const uint8_t *message, size_t message_size); +void crypto_blake2b_final(crypto_blake2b_ctx *ctx, uint8_t *hash); + + +// Password key derivation (Argon2) +// -------------------------------- +#define CRYPTO_ARGON2_D 0 +#define CRYPTO_ARGON2_I 1 +#define CRYPTO_ARGON2_ID 2 + +typedef struct { + uint32_t algorithm; // Argon2d, Argon2i, Argon2id + uint32_t nb_blocks; // memory hardness, >= 8 * nb_lanes + uint32_t nb_passes; // CPU hardness, >= 1 (>= 3 recommended for Argon2i) + uint32_t nb_lanes; // parallelism level (single threaded anyway) +} crypto_argon2_config; + +typedef struct { + const uint8_t *pass; + const uint8_t *salt; + uint32_t pass_size; + uint32_t salt_size; // 16 bytes recommended +} crypto_argon2_inputs; + +typedef struct { + const uint8_t *key; // may be NULL if no key + const uint8_t *ad; // may be NULL if no additional data + uint32_t key_size; // 0 if no key (32 bytes recommended otherwise) + uint32_t ad_size; // 0 if no additional data +} crypto_argon2_extras; + +extern const crypto_argon2_extras crypto_argon2_no_extras; + +void crypto_argon2(uint8_t *hash, uint32_t hash_size, void *work_area, + crypto_argon2_config config, + crypto_argon2_inputs inputs, + crypto_argon2_extras extras); + + +// Key exchange (X-25519) +// ---------------------- + +// Shared secrets are not quite random. +// Hash them to derive an actual shared key. +void crypto_x25519_public_key(uint8_t public_key[32], + const uint8_t secret_key[32]); +void crypto_x25519(uint8_t raw_shared_secret[32], + const uint8_t your_secret_key [32], + const uint8_t their_public_key [32]); + +// Conversion to EdDSA +void crypto_x25519_to_eddsa(uint8_t eddsa[32], const uint8_t x25519[32]); + +// scalar "division" +// Used for OPRF. Be aware that exponential blinding is less secure +// than Diffie-Hellman key exchange. +void crypto_x25519_inverse(uint8_t blind_salt [32], + const uint8_t private_key[32], + const uint8_t curve_point[32]); + +// "Dirty" versions of x25519_public_key(). +// Use with crypto_elligator_rev(). +// Leaks 3 bits of the private key. +void crypto_x25519_dirty_small(uint8_t pk[32], const uint8_t sk[32]); +void crypto_x25519_dirty_fast (uint8_t pk[32], const uint8_t sk[32]); + + +// Signatures +// ---------- + +// EdDSA with curve25519 + BLAKE2b +void crypto_eddsa_key_pair(uint8_t secret_key[64], + uint8_t public_key[32], + uint8_t seed[32]); +void crypto_eddsa_sign(uint8_t signature [64], + const uint8_t secret_key[64], + const uint8_t *message, size_t message_size); +int crypto_eddsa_check(const uint8_t signature [64], + const uint8_t public_key[32], + const uint8_t *message, size_t message_size); + +// Conversion to X25519 +void crypto_eddsa_to_x25519(uint8_t x25519[32], const uint8_t eddsa[32]); + +// EdDSA building blocks +void crypto_eddsa_trim_scalar(uint8_t out[32], const uint8_t in[32]); +void crypto_eddsa_reduce(uint8_t reduced[32], const uint8_t expanded[64]); +void crypto_eddsa_mul_add(uint8_t r[32], + const uint8_t a[32], + const uint8_t b[32], + const uint8_t c[32]); +void crypto_eddsa_scalarbase(uint8_t point[32], const uint8_t scalar[32]); +int crypto_eddsa_check_equation(const uint8_t signature[64], + const uint8_t public_key[32], + const uint8_t h_ram[32]); + + +// Chacha20 +// -------- + +// Specialised hash. +// Used to hash X25519 shared secrets. +void crypto_chacha20_h(uint8_t out[32], + const uint8_t key[32], + const uint8_t in [16]); + +// Unauthenticated stream cipher. +// Don't forget to add authentication. +uint64_t crypto_chacha20_djb(uint8_t *cipher_text, + const uint8_t *plain_text, + size_t text_size, + const uint8_t key[32], + const uint8_t nonce[8], + uint64_t ctr); +uint32_t crypto_chacha20_ietf(uint8_t *cipher_text, + const uint8_t *plain_text, + size_t text_size, + const uint8_t key[32], + const uint8_t nonce[12], + uint32_t ctr); +uint64_t crypto_chacha20_x(uint8_t *cipher_text, + const uint8_t *plain_text, + size_t text_size, + const uint8_t key[32], + const uint8_t nonce[24], + uint64_t ctr); + + +// Poly 1305 +// --------- + +// This is a *one time* authenticator. +// Disclosing the mac reveals the key. +// See crypto_lock() on how to use it properly. + +// Direct interface +void crypto_poly1305(uint8_t mac[16], + const uint8_t *message, size_t message_size, + const uint8_t key[32]); + +// Incremental interface +typedef struct { + // Do not rely on the size or contents of this type, + // for they may change without notice. + uint8_t c[16]; // chunk of the message + size_t c_idx; // How many bytes are there in the chunk. + uint32_t r [4]; // constant multiplier (from the secret key) + uint32_t pad[4]; // random number added at the end (from the secret key) + uint32_t h [5]; // accumulated hash +} crypto_poly1305_ctx; + +void crypto_poly1305_init (crypto_poly1305_ctx *ctx, const uint8_t key[32]); +void crypto_poly1305_update(crypto_poly1305_ctx *ctx, + const uint8_t *message, size_t message_size); +void crypto_poly1305_final (crypto_poly1305_ctx *ctx, uint8_t mac[16]); + + +// Elligator 2 +// ----------- + +// Elligator mappings proper +void crypto_elligator_map(uint8_t curve [32], const uint8_t hidden[32]); +int crypto_elligator_rev(uint8_t hidden[32], const uint8_t curve [32], + uint8_t tweak); + +// Easy to use key pair generation +void crypto_elligator_key_pair(uint8_t hidden[32], uint8_t secret_key[32], + uint8_t seed[32]); + +#ifdef __cplusplus +} +#endif + +#endif // MONOCYPHER_H diff --git a/driver/pkg.c b/driver/pkg.c @@ -329,7 +329,7 @@ static int pkg_build_native_regions(const CfreeContext* ctx, PkgInputFile* in, size_t stored_len = 0; if (dist_lz4_compress_block(tmp, sizeof tmp, &stored_len, raw, raw_len) != DIST_OK) { - driver_errf(PKG_TOOL, "create: lz4-block-v1 is stubbed"); + driver_errf(PKG_TOOL, "create: lz4-block-v1 compression failed"); return DIST_ERR; } r.stored_size = stored_len; diff --git a/test/pkg/run.sh b/test/pkg/run.sh @@ -0,0 +1,92 @@ +#!/bin/sh +# Driver-level checks for portable package gzip/DEFLATE handling. + +set -u + +script_dir=$(cd "$(dirname "$0")" && pwd) +repo_root=$(cd "$script_dir/../.." && pwd) + +CFREE="${CFREE:-$repo_root/build/cfree}" + +if [ ! -x "$CFREE" ]; then + echo "pkg: cfree binary not found at $CFREE" >&2 + exit 2 +fi + +work=$(mktemp -d "${TMPDIR:-/tmp}/cfree-pkg-test.XXXXXX") +trap 'rm -rf "$work"' EXIT + +pass=0 +fail=0 + +ok() { + printf 'PASS %s\n' "$1" + pass=$((pass + 1)) +} + +not_ok() { + printf 'FAIL %s\n' "$1" + if [ "$#" -gt 1 ] && [ -s "$2" ]; then + sed 's/^/ | /' "$2" + fi + fail=$((fail + 1)) +} + +mkdir -p "$work/in" +{ + i=0 + while [ "$i" -lt 4096 ]; do + printf 'portable package deflate regression line %04d\n' "$i" + i=$((i + 1)) + done +} > "$work/in/payload.txt" + +if "$CFREE" pkg keygen -o "$work/key" > "$work/keygen.out" 2> "$work/keygen.err"; then + ok "pkg-keygen" +else + not_ok "pkg-keygen" "$work/keygen.err" +fi + +if "$CFREE" pkg create --name deflate-smoke --version 1.0.0 \ + --format tar.gz -s "$work/key.key" -o "$work/pkg.tar.gz" \ + "$work/in/payload.txt" > "$work/create.out" 2> "$work/create.err"; then + ok "pkg-create-targz-deflate" +else + not_ok "pkg-create-targz-deflate" "$work/create.err" +fi + +if gzip -t "$work/pkg.tar.gz" > "$work/gzip-test.out" 2> "$work/gzip-test.err"; then + ok "host-gzip-accepts-cfree-output" +else + not_ok "host-gzip-accepts-cfree-output" "$work/gzip-test.err" +fi + +if "$CFREE" pkg verify -p "$work/key.pub" "$work/pkg.tar.gz" \ + > "$work/verify.out" 2> "$work/verify.err"; then + ok "pkg-verify-cfree-gzip" +else + not_ok "pkg-verify-cfree-gzip" "$work/verify.err" +fi + +if gunzip -c "$work/pkg.tar.gz" > "$work/pkg.tar" 2> "$work/gunzip.err" && + gzip -c "$work/pkg.tar" > "$work/host.tar.gz" 2> "$work/regzip.err" && + "$CFREE" pkg inspect "$work/host.tar.gz" \ + > "$work/inspect-host.out" 2> "$work/inspect-host.err"; then + ok "pkg-inspect-host-gzip" +else + not_ok "pkg-inspect-host-gzip" "$work/inspect-host.err" +fi + +if "$CFREE" pkg verify -p "$work/key.pub" "$work/host.tar.gz" \ + > "$work/verify-host.out" 2> "$work/verify-host.err"; then + ok "pkg-verify-host-gzip" +else + not_ok "pkg-verify-host-gzip" "$work/verify-host.err" +fi + +if [ "$fail" -ne 0 ]; then + printf 'pkg: %d passed, %d failed\n' "$pass" "$fail" + exit 1 +fi + +printf 'pkg: %d passed\n' "$pass" diff --git a/test/test.mk b/test/test.mk @@ -47,6 +47,7 @@ TEST_TARGETS = \ test-driver-cc \ test-driver-objcopy \ test-driver-objdump \ + test-driver-pkg \ test-driver-strings \ test-driver-strip \ test-dwarf \ @@ -111,7 +112,7 @@ DEFAULT_TEST_TARGETS = \ test: $(DEFAULT_TEST_TARGETS) -test-driver: test-driver-cc test-driver-ar test-driver-strip test-driver-objcopy test-driver-objdump test-driver-strings +test-driver: test-driver-cc test-driver-ar test-driver-strip test-driver-objcopy test-driver-objdump test-driver-pkg test-driver-strings test-driver-cc: bin @CFREE=$(abspath $(BIN)) sh test/driver/run.sh @@ -185,6 +186,9 @@ test-driver-objcopy: bin test-driver-objdump: bin @CFREE=$(abspath $(BIN)) sh test/objdump/run.sh +test-driver-pkg: bin + @CFREE=$(abspath $(BIN)) sh test/pkg/run.sh + test-driver-strings: bin @CFREE=$(abspath $(BIN)) sh test/strings/run.sh