/* ======================================================================== */
/*
 * Saturnin-CTR-Cascade (NIST API).
 *
 * bs32x: this implementation expands each Saturnin 16-bit register into
 * the even bits of a 32-bit register. The odd bits can be used to run
 * another instance in parallel.
 */

#include "crypto_aead.h"

#include <stdint.h>
#include <string.h>

/*
 * We represent the sixteen 16-bit registers r0..r15 as sixteen 32-bit
 * variables h0, h1,... hF, using the even bits (bit i of rj becomes
 * bit 2*i of hj). The odd bits optionally receive another instance.
 */

#define DECL_STATE_X2 \
    uint32_t h0, h1, h2, h3, h4, h5, h6, h7; \
    uint32_t h8, h9, hA, hB, hC, hD, hE, hF;

#define EXPAND(x) do { \
        uint32_t xp = (x); \
        xp = (xp & 0x000000FF) | ((xp & 0x0000FF00) << 8); \
        xp = (xp & 0x000F000F) | ((xp & 0x00F000F0) << 4); \
        xp = (xp & 0x03030303) | ((xp & 0x0C0C0C0C) << 2); \
        xp = (xp & 0x11111111) | ((xp & 0x22222222) << 1); \
        (x) = xp; \
    } while (0)

#define DEC256_INNER(d, src) do { \
        d ## 0 = (uint32_t)(src)[ 0] | ((uint32_t)(src)[ 1] << 8); \
        d ## 1 = (uint32_t)(src)[ 2] | ((uint32_t)(src)[ 3] << 8); \
        d ## 2 = (uint32_t)(src)[ 4] | ((uint32_t)(src)[ 5] << 8); \
        d ## 3 = (uint32_t)(src)[ 6] | ((uint32_t)(src)[ 7] << 8); \
        d ## 4 = (uint32_t)(src)[ 8] | ((uint32_t)(src)[ 9] << 8); \
        d ## 5 = (uint32_t)(src)[10] | ((uint32_t)(src)[11] << 8); \
        d ## 6 = (uint32_t)(src)[12] | ((uint32_t)(src)[13] << 8); \
        d ## 7 = (uint32_t)(src)[14] | ((uint32_t)(src)[15] << 8); \
        d ## 8 = (uint32_t)(src)[16] | ((uint32_t)(src)[17] << 8); \
        d ## 9 = (uint32_t)(src)[18] | ((uint32_t)(src)[19] << 8); \
        d ## A = (uint32_t)(src)[20] | ((uint32_t)(src)[21] << 8); \
        d ## B = (uint32_t)(src)[22] | ((uint32_t)(src)[23] << 8); \
        d ## C = (uint32_t)(src)[24] | ((uint32_t)(src)[25] << 8); \
        d ## D = (uint32_t)(src)[26] | ((uint32_t)(src)[27] << 8); \
        d ## E = (uint32_t)(src)[28] | ((uint32_t)(src)[29] << 8); \
        d ## F = (uint32_t)(src)[30] | ((uint32_t)(src)[31] << 8); \
        EXPAND(d ## 0); \
        EXPAND(d ## 1); \
        EXPAND(d ## 2); \
        EXPAND(d ## 3); \
        EXPAND(d ## 4); \
        EXPAND(d ## 5); \
        EXPAND(d ## 6); \
        EXPAND(d ## 7); \
        EXPAND(d ## 8); \
        EXPAND(d ## 9); \
        EXPAND(d ## A); \
        EXPAND(d ## B); \
        EXPAND(d ## C); \
        EXPAND(d ## D); \
        EXPAND(d ## E); \
        EXPAND(d ## F); \
    } while (0)

#define DEC256(src) do { \
        DEC256_INNER(h, src); \
    } while (0)

#define DEC512(src) do { \
        uint32_t m0, m1, m2, m3, m4, m5, m6, m7; \
        uint32_t m8, m9, mA, mB, mC, mD, mE, mF; \
        DEC256_INNER(h, src); \
        DEC256_INNER(m, (src) + 32); \
        h0 |= m0 << 1; \
        h1 |= m1 << 1; \
        h2 |= m2 << 1; \
        h3 |= m3 << 1; \
        h4 |= m4 << 1; \
        h5 |= m5 << 1; \
        h6 |= m6 << 1; \
        h7 |= m7 << 1; \
        h8 |= m8 << 1; \
        h9 |= m9 << 1; \
        hA |= mA << 1; \
        hB |= mB << 1; \
        hC |= mC << 1; \
        hD |= mD << 1; \
        hE |= mE << 1; \
        hF |= mF << 1; \
    } while (0)

#define ENC16(x, dst) do { \
        uint32_t xp = (x); \
        xp = (xp & 0x11111111) | ((xp & 0x44444444) >> 1); \
        xp = (xp & 0x03030303) | ((xp & 0x30303030) >> 2); \
        xp = (xp & 0x000F000F) | ((xp & 0x0F000F00) >> 4); \
        (dst)[ 0] = (uint8_t)xp; \
        (dst)[ 1] = (uint8_t)(xp >> 16); \
    } while (0)

#define ENC256(dst) do { \
        ENC16(h0, (dst) + 0); \
        ENC16(h1, (dst) + 2); \
        ENC16(h2, (dst) + 4); \
        ENC16(h3, (dst) + 6); \
        ENC16(h4, (dst) + 8); \
        ENC16(h5, (dst) + 10); \
        ENC16(h6, (dst) + 12); \
        ENC16(h7, (dst) + 14); \
        ENC16(h8, (dst) + 16); \
        ENC16(h9, (dst) + 18); \
        ENC16(hA, (dst) + 20); \
        ENC16(hB, (dst) + 22); \
        ENC16(hC, (dst) + 24); \
        ENC16(hD, (dst) + 26); \
        ENC16(hE, (dst) + 28); \
        ENC16(hF, (dst) + 30); \
    } while (0)

#define ENC512(dst) do { \
        ENC256(dst); \
        h0 >>= 1; \
        h1 >>= 1; \
        h2 >>= 1; \
        h3 >>= 1; \
        h4 >>= 1; \
        h5 >>= 1; \
        h6 >>= 1; \
        h7 >>= 1; \
        h8 >>= 1; \
        h9 >>= 1; \
        hA >>= 1; \
        hB >>= 1; \
        hC >>= 1; \
        hD >>= 1; \
        hE >>= 1; \
        hF >>= 1; \
        ENC256((dst) + 32); \
    } while (0)
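/*
 * Illustration only (not part of the NIST API): EXPAND spreads a 16-bit
 * word over the even bits of a 32-bit word and ENC16 undoes it, e.g.
 * 0xFFFF expands to 0x55555555. The guard macro SATURNIN_BS32X_SELFTEST
 * and the function below are hypothetical names for a minimal sanity
 * check, assuming a hosted environment with <assert.h>.
 */
#ifdef SATURNIN_BS32X_SELFTEST
#include <assert.h>
static void
saturnin_bs32x_selftest(void)
{
    uint32_t x = 0xFFFF;
    uint8_t out[2];

    EXPAND(x);
    assert(x == 0x55555555);      /* all 16 bits land on even positions */
    ENC16(x, out);
    assert(out[0] == 0xFF && out[1] == 0xFF);   /* ENC16 reverses EXPAND */
}
#endif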
#define SBOX_0_X2(z0, z1, z2, z3) do { \
        uint32_t a, b, c, d; \
        a = z0; \
        b = z1; \
        c = z2; \
        d = z3; \
        a ^= b & c; \
        b ^= a | d; \
        d ^= b | c; \
        c ^= b & d; \
        b ^= a | c; \
        a ^= b | d; \
        z0 = b; \
        z1 = c; \
        z2 = d; \
        z3 = a; \
    } while (0)

#define SBOX_1_X2(z0, z1, z2, z3) do { \
        uint32_t a, b, c, d; \
        a = z0; \
        b = z1; \
        c = z2; \
        d = z3; \
        a ^= b & c; \
        b ^= a | d; \
        d ^= b | c; \
        c ^= b & d; \
        b ^= a | c; \
        a ^= b | d; \
        z0 = d; \
        z1 = b; \
        z2 = a; \
        z3 = c; \
    } while (0)

#define SBOX_X2 do { \
        SBOX_0_X2(h0, h1, h2, h3); \
        SBOX_1_X2(h4, h5, h6, h7); \
        SBOX_0_X2(h8, h9, hA, hB); \
        SBOX_1_X2(hC, hD, hE, hF); \
    } while (0)

#define MDS_X2 do { \
        /* q0 ^= q4; q1 ^= q5; q2 ^= q6; q3 ^= q7; */ \
        h0 ^= h4; \
        h1 ^= h5; \
        h2 ^= h6; \
        h3 ^= h7; \
        h8 ^= hC; \
        h9 ^= hD; \
        hA ^= hE; \
        hB ^= hF; \
        /* MUL(q4, q5, q6, q7); */ \
        h4 ^= h5; \
        hC ^= hD; \
        /* q4 ^= SW(q0); q5 ^= SW(q1); q6 ^= SW(q2); q7 ^= SW(q3); */ \
        h5 ^= h8; \
        h6 ^= h9; \
        h7 ^= hA; \
        h4 ^= hB; \
        hD ^= h0; \
        hE ^= h1; \
        hF ^= h2; \
        hC ^= h3; \
        /* MUL(q0, q1, q2, q3); */ \
        h0 ^= h1; \
        h8 ^= h9; \
        /* MUL(q0, q1, q2, q3); */ \
        h1 ^= h2; \
        h9 ^= hA; \
        /* q0 ^= q4; q1 ^= q5; q2 ^= q6; q3 ^= q7; */ \
        h2 ^= h5; \
        h3 ^= h6; \
        h0 ^= h7; \
        h1 ^= h4; \
        hA ^= hD; \
        hB ^= hE; \
        h8 ^= hF; \
        h9 ^= hC; \
        /* q4 ^= SW(q0); q5 ^= SW(q1); q6 ^= SW(q2); q7 ^= SW(q3); */ \
        h5 ^= hA; \
        h6 ^= hB; \
        h7 ^= h8; \
        h4 ^= h9; \
        hD ^= h2; \
        hE ^= h3; \
        hF ^= h0; \
        hC ^= h1; \
        /* Some register movement to avoid renaming (this should \
           be optimized out by the compiler). */ \
        uint32_t tt; \
        tt = h0; h0 = h2; h2 = tt; \
        tt = h1; h1 = h3; h3 = tt; \
        tt = h4; h4 = h5; h5 = h6; h6 = h7; h7 = tt; \
        tt = h8; h8 = hA; hA = tt; \
        tt = h9; h9 = hB; hB = tt; \
        tt = hC; hC = hD; hD = hE; hE = hF; hF = tt; \
    } while (0)

#define SR_SLICE_X2 do { \
        h4 = ((h4 & 0x3F3F3F3F) << 2) | ((h4 & 0xC0C0C0C0) >> 6); \
        h5 = ((h5 & 0x3F3F3F3F) << 2) | ((h5 & 0xC0C0C0C0) >> 6); \
        h6 = ((h6 & 0x3F3F3F3F) << 2) | ((h6 & 0xC0C0C0C0) >> 6); \
        h7 = ((h7 & 0x3F3F3F3F) << 2) | ((h7 & 0xC0C0C0C0) >> 6); \
        h8 = ((h8 & 0x0F0F0F0F) << 4) | ((h8 & 0xF0F0F0F0) >> 4); \
        h9 = ((h9 & 0x0F0F0F0F) << 4) | ((h9 & 0xF0F0F0F0) >> 4); \
        hA = ((hA & 0x0F0F0F0F) << 4) | ((hA & 0xF0F0F0F0) >> 4); \
        hB = ((hB & 0x0F0F0F0F) << 4) | ((hB & 0xF0F0F0F0) >> 4); \
        hC = ((hC & 0x03030303) << 6) | ((hC & 0xFCFCFCFC) >> 2); \
        hD = ((hD & 0x03030303) << 6) | ((hD & 0xFCFCFCFC) >> 2); \
        hE = ((hE & 0x03030303) << 6) | ((hE & 0xFCFCFCFC) >> 2); \
        hF = ((hF & 0x03030303) << 6) | ((hF & 0xFCFCFCFC) >> 2); \
    } while (0)

#define SR_SLICE_X2_INV do { \
        h4 = ((h4 & 0x03030303) << 6) | ((h4 & 0xFCFCFCFC) >> 2); \
        h5 = ((h5 & 0x03030303) << 6) | ((h5 & 0xFCFCFCFC) >> 2); \
        h6 = ((h6 & 0x03030303) << 6) | ((h6 & 0xFCFCFCFC) >> 2); \
        h7 = ((h7 & 0x03030303) << 6) | ((h7 & 0xFCFCFCFC) >> 2); \
        h8 = ((h8 & 0x0F0F0F0F) << 4) | ((h8 & 0xF0F0F0F0) >> 4); \
        h9 = ((h9 & 0x0F0F0F0F) << 4) | ((h9 & 0xF0F0F0F0) >> 4); \
        hA = ((hA & 0x0F0F0F0F) << 4) | ((hA & 0xF0F0F0F0) >> 4); \
        hB = ((hB & 0x0F0F0F0F) << 4) | ((hB & 0xF0F0F0F0) >> 4); \
        hC = ((hC & 0x3F3F3F3F) << 2) | ((hC & 0xC0C0C0C0) >> 6); \
        hD = ((hD & 0x3F3F3F3F) << 2) | ((hD & 0xC0C0C0C0) >> 6); \
        hE = ((hE & 0x3F3F3F3F) << 2) | ((hE & 0xC0C0C0C0) >> 6); \
        hF = ((hF & 0x3F3F3F3F) << 2) | ((hF & 0xC0C0C0C0) >> 6); \
    } while (0)

#define SR_SHEET_X2 do { \
        h4 = (h4 << 8) | (h4 >> 24); \
        h5 = (h5 << 8) | (h5 >> 24); \
        h6 = (h6 << 8) | (h6 >> 24); \
        h7 = (h7 << 8) | (h7 >> 24); \
        h8 = (h8 << 16) | (h8 >> 16); \
        h9 = (h9 << 16) | (h9 >> 16); \
        hA = (hA << 16) | (hA >> 16); \
        hB = (hB << 16) | (hB >> 16); \
        hC = (hC << 24) | (hC >> 8); \
        hD = (hD << 24) | (hD >> 8); \
        hE = (hE << 24) | (hE >> 8); \
        hF = (hF << 24) | (hF >> 8); \
    } while (0)

#define SR_SHEET_X2_INV do { \
        h4 = (h4 << 24) | (h4 >> 8); \
        h5 = (h5 << 24) | (h5 >> 8); \
        h6 = (h6 << 24) | (h6 >> 8); \
        h7 = (h7 << 24) | (h7 >> 8); \
        h8 = (h8 << 16) | (h8 >> 16); \
        h9 = (h9 << 16) | (h9 >> 16); \
        hA = (hA << 16) | (hA >> 16); \
        hB = (hB << 16) | (hB >> 16); \
        hC = (hC << 8) | (hC >> 24); \
        hD = (hD << 8) | (hD >> 24); \
        hE = (hE << 8) | (hE >> 24); \
        hF = (hF << 8) | (hF >> 24); \
    } while (0)
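/*
 * Illustration only (a sketch, not used by the code): the comments in
 * MDS_X2 refer to the plain bs32 representation, in which a 32-bit word
 * qj holds register rj in its low half and r(j+8) in its high half. In
 * that representation MUL is the linear map
 * (q0, q1, q2, q3) <- (q1, q2, q3, q0 ^ q1) and SW swaps the two 16-bit
 * halves of a word. MDS_X2 performs the same XORs on the split hj
 * registers and defers the MUL renamings to the register swaps at the
 * end of the macro. The guard macro and helper names below are
 * hypothetical.
 */
#ifdef SATURNIN_BS32X_ILLUSTRATION
#define SW_ILLUSTRATION(x)   (((x) << 16) | ((x) >> 16))
static void
mul_illustration(uint32_t *q0, uint32_t *q1, uint32_t *q2, uint32_t *q3)
{
    uint32_t t = *q0;

    *q0 = *q1;
    *q1 = *q2;
    *q2 = *q3;
    *q3 = t ^ *q0;   /* old q0 XOR old q1 */
}
#endif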
#define XOR_KEY_X2 do { \
        h0 ^= keybuf[0x00]; \
        h1 ^= keybuf[0x01]; \
        h2 ^= keybuf[0x02]; \
        h3 ^= keybuf[0x03]; \
        h4 ^= keybuf[0x04]; \
        h5 ^= keybuf[0x05]; \
        h6 ^= keybuf[0x06]; \
        h7 ^= keybuf[0x07]; \
        h8 ^= keybuf[0x08]; \
        h9 ^= keybuf[0x09]; \
        hA ^= keybuf[0x0A]; \
        hB ^= keybuf[0x0B]; \
        hC ^= keybuf[0x0C]; \
        hD ^= keybuf[0x0D]; \
        hE ^= keybuf[0x0E]; \
        hF ^= keybuf[0x0F]; \
    } while (0)

#define XOR_KEY_ROTATED_X2 do { \
        h0 ^= keybuf[0x10]; \
        h1 ^= keybuf[0x11]; \
        h2 ^= keybuf[0x12]; \
        h3 ^= keybuf[0x13]; \
        h4 ^= keybuf[0x14]; \
        h5 ^= keybuf[0x15]; \
        h6 ^= keybuf[0x16]; \
        h7 ^= keybuf[0x17]; \
        h8 ^= keybuf[0x18]; \
        h9 ^= keybuf[0x19]; \
        hA ^= keybuf[0x1A]; \
        hB ^= keybuf[0x1B]; \
        hC ^= keybuf[0x1C]; \
        hD ^= keybuf[0x1D]; \
        hE ^= keybuf[0x1E]; \
        hF ^= keybuf[0x1F]; \
    } while (0)

/*
 * For Saturnin-CTR-Cascade: R = 10; D = 1, 2, 3, 4 or 5.
 */

static const uint32_t RC_10_2_X2[] = {
    0x0C3CCF33, 0x30FC3033, 0x33FFFF00, 0xCC0F333C,
    0x0C00F3C0, 0x00FFC0FF, 0xCFFCFC03, 0x00CF3330,
    0x3CF0C3F3, 0x3FF303CC, 0x0CC000CC, 0x033FCC3C,
    0xF0C3C03C, 0xCCCC303C, 0xC3003C0C, 0xF00303C3,
    0x33F0F3FC, 0x03C00CF0, 0x330FFFFC, 0xCC0000F3
};

static const uint32_t RC_10_3_X2[] = {
    0x0C3CC3C0, 0x30FC033C, 0x33CFCC03, 0xCF0C330F,
    0xC3F33C33, 0x3CF0C0FF, 0xCCF30F00, 0x33C0033C,
    0xF330FFCC, 0x3CC303FF, 0xCFF0FFC3, 0x3CCFFF33,
    0x0F330C33, 0xFFC0FCCF, 0xFCF0FFCC, 0xCF0C03F3,
    0xCC30033F, 0x3FCF0FF3, 0xC330CF30, 0xFF3C0CF0
};

static const uint32_t RC_10_combined_1_4_X2[] = {
    0x0C3C728E, 0x30FECDAA, 0x312D022D, 0xE12231C1,
    0xDCB60D06, 0x6466E2F5, 0xE26C412E, 0xFD54C18E,
    0xEEB28BC1, 0x10230131, 0x6319272E, 0x43AD13AF,
    0xF058127C, 0x3173EF8B, 0x839B52EE, 0xB99A2319,
    0x31B94E48, 0x675903D7, 0xA14402D1, 0x336496D4
};

static const uint32_t RC_10_combined_1_5_X2[] = {
    0x0C3C7A2C, 0x30FEEFA0, 0x310D202F, 0xE32031E3,
    0x561487A4, 0x4C6CE2F5, 0xE066E32C, 0xDF5EE186,
    0x6432A3EB, 0x12030113, 0xE1398D24, 0x690D31A5,
    0x5AF89A76, 0x137B6729, 0xA93BD06E, 0x93902339,
    0x9B39EECA, 0x4F5301D5, 0x016E2259, 0x114C9ED6
};

/*
 * Decode a key into 32-bit words (with bs32x encoding), followed by the
 * rotated key; only the even bits are set (the odd bits, which could
 * hold a second key, are left at zero).
 */
static void
saturnin_x2_key_expand_even(uint32_t *keybuf, const uint8_t *key)
{
    int i;

    for (i = 0; i < 16; i ++) {
        uint32_t w;

        w = (uint32_t)key[i << 1] | ((uint32_t)key[(i << 1) + 1] << 8);
        EXPAND(w);
        keybuf[i] = w;
        keybuf[i + 16] = (w << 22) | (w >> 10);
    }
}
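/*
 * Illustration only (a sketch, not used by the code): an entry of the
 * "combined" round-constant tables above packs the constant of the D=1
 * instance in the even bits and the constant of the D=4 (or D=5)
 * instance in the odd bits, mirroring how the two key schedules are
 * merged in the CTR+Cascade loops below. The guard macro, the helper
 * name and the 16-bit inputs are hypothetical.
 */
#ifdef SATURNIN_BS32X_ILLUSTRATION
static uint32_t
combine_rc_illustration(uint32_t c_even, uint32_t c_odd)
{
    EXPAND(c_even);
    EXPAND(c_odd);
    return c_even | (c_odd << 1);
}
#endif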
/*
 * Perform two parallel Saturnin block encryptions.
 *   R         number of super-rounds
 *   rc        round constants (depends on R and D)
 *   keybuf    key and rotated key (16+16 = 32 words)
 *   buf       the two blocks to encrypt (64 bytes)
 * The encrypted blocks are written back in 'buf'.
 */
static void
saturnin_x2_block_encrypt(int R, const uint32_t *rc,
    const uint32_t *keybuf, uint8_t *buf)
{
    DECL_STATE_X2
    int i;

    /*
     * Decode data into the registers.
     */
    DEC512(buf);

    XOR_KEY_X2;

    /*
     * Run all rounds (two rounds per super-round, two super-rounds
     * per loop iteration).
     */
    for (i = 0; i < R; i += 2) {
        /*
         * Even round.
         */
        SBOX_X2;
        MDS_X2;

        /*
         * Odd round r = 1 mod 4.
         */
        SBOX_X2;
        SR_SLICE_X2;
        MDS_X2;
        SR_SLICE_X2_INV;
        h0 ^= rc[(i << 1) + 0];
        h8 ^= rc[(i << 1) + 1];
        XOR_KEY_ROTATED_X2;

        /*
         * Even round.
         */
        SBOX_X2;
        MDS_X2;

        /*
         * Odd round r = 3 mod 4.
         */
        SBOX_X2;
        SR_SHEET_X2;
        MDS_X2;
        SR_SHEET_X2_INV;
        h0 ^= rc[(i << 1) + 2];
        h8 ^= rc[(i << 1) + 3];
        XOR_KEY_X2;
    }

    /*
     * Encode back the result.
     */
    ENC512(buf);
}

/*
 * XOR 256-bit value a into 256-bit value d. The two arrays shall not
 * overlap.
 */
static inline void
xor32(uint8_t *d, const uint8_t *a)
{
    int i;

    for (i = 0; i < 32; i ++) {
        d[i] ^= a[i];
    }
}

/*
 * Compute the Cascade construction on the AAD. This includes the
 * initialization step. The padded nonce is provided as input; the
 * resulting 32-byte Cascade state is written into 'r'.
 */
static void
do_cascade_aad(uint8_t *r, const uint32_t *keybuf,
    const uint8_t *nonce, const uint8_t *buf, size_t len)
{
    uint8_t tmp[64];
    uint32_t kb2[32];
    size_t u, v, clen;

    memcpy(tmp, nonce, 32);
    memset(tmp + 32, 0, 32);
    saturnin_x2_block_encrypt(10, RC_10_2_X2, keybuf, tmp);
    xor32(tmp, nonce);

    for (u = 0; (u + 31) < len; u += 32) {
        saturnin_x2_key_expand_even(kb2, tmp);
        memcpy(tmp, buf + u, 32);
        saturnin_x2_block_encrypt(10, RC_10_2_X2, kb2, tmp);
        xor32(tmp, buf + u);
    }
    saturnin_x2_key_expand_even(kb2, tmp);
    clen = len - u;
    if (clen > 0) {
        memcpy(tmp, buf + u, clen);
    }
    tmp[clen] = 0x80;
    memset(tmp + clen + 1, 0, 31 - clen);
    saturnin_x2_block_encrypt(10, RC_10_3_X2, kb2, tmp);
    for (v = 0; v < clen; v ++) {
        tmp[v] ^= buf[u + v];
    }
    tmp[clen] ^= 0x80;
    memcpy(r, tmp, 32);
}
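/*
 * Illustration only (a sketch, not used by the code): one Cascade
 * chaining step, in the form used above on intermediate AAD blocks
 * (domain D = 2). The current 32-byte state keys the cipher, the block
 * is encrypted, and the block is XORed into the result, i.e. a
 * Matyas-Meyer-Oseas style chaining. In the CTR+Cascade loops below,
 * the same step runs in the odd bit slot, fused with the counter
 * encryption. The guard macro and function name are hypothetical; the
 * odd slot is simply zeroed here.
 */
#ifdef SATURNIN_BS32X_ILLUSTRATION
static void
cascade_step_illustration(uint8_t *state, const uint8_t *block)
{
    uint32_t kb[32];
    uint8_t tmp[64];

    saturnin_x2_key_expand_even(kb, state);
    memcpy(tmp, block, 32);
    memset(tmp + 32, 0, 32);
    saturnin_x2_block_encrypt(10, RC_10_2_X2, kb, tmp);
    xor32(tmp, block);
    memcpy(state, tmp, 32);
}
#endif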
int
crypto_aead_encrypt(unsigned char *c, unsigned long long *clen,
    const unsigned char *m, unsigned long long mlen,
    const unsigned char *ad, unsigned long long adlen,
    const unsigned char *nsec, const unsigned char *npub,
    const unsigned char *k)
{
    uint32_t keybuf[32], keybuf2[32];
    uint8_t nonce[32], tag[32], tmp[64];
    uint8_t *buf;
    size_t u, len;
    int i;

    /*
     * In this implementation, we limit the input length to less
     * than 2^32-3 blocks (i.e. about 137.4 gigabytes), which allows
     * us to keep the block counter on a single 32-bit integer.
     */
    if ((mlen >> 5) >= 0xFFFFFFFD) {
        return -2;
    }
    len = (size_t)mlen;

    /*
     * Pad the nonce into a 32-byte block.
     */
    (void)nsec;
    memcpy(nonce, npub, 16);
    nonce[16] = 0x80;
    memset(nonce + 17, 0, 15);

    /*
     * Move plaintext to ciphertext buffer.
     */
    memmove(c, m, len);
    buf = (uint8_t *)c;

    /*
     * Expand the key.
     */
    saturnin_x2_key_expand_even(keybuf, (const uint8_t *)k);

    /*
     * Process the AAD.
     */
    do_cascade_aad(tag, keybuf, nonce, (const uint8_t *)ad, (size_t)adlen);

    /*
     * First CTR block and last Cascade block must be processed out
     * of the main loop, since the Cascade operates on the ciphertext.
     */
    if (len >= 32) {
        memset(tmp + 32, 0, 32);
        memcpy(tmp, nonce, 32);
        tmp[31] = 0x01;
        saturnin_x2_block_encrypt(10, RC_10_combined_1_4_X2, keybuf, tmp);
        xor32(buf, tmp);
        memcpy(tmp + 32, tag, 32);

        /*
         * Each loop iteration expects the current Cascade state
         * in tmp[32..63].
         */
        for (u = 32;;) {
            uint32_t ctr;

            memcpy(tmp, nonce, 28);
            ctr = (u >> 5) + 1;
            tmp[28] = (uint8_t)(ctr >> 24);
            tmp[29] = (uint8_t)(ctr >> 16);
            tmp[30] = (uint8_t)(ctr >> 8);
            tmp[31] = (uint8_t)ctr;
            saturnin_x2_key_expand_even(keybuf2, tmp + 32);
            for (i = 0; i < 32; i ++) {
                keybuf2[i] = (keybuf2[i] << 1) | keybuf[i];
            }
            memcpy(tmp + 32, buf + u - 32, 32);
            saturnin_x2_block_encrypt(10, RC_10_combined_1_4_X2,
                keybuf2, tmp);
            xor32(tmp + 32, buf + u - 32);
            if ((u + 31) < len) {
                xor32(buf + u, tmp);
                u += 32;
            } else {
                size_t v, rlen;

                rlen = len - u;
                for (v = 0; v < rlen; v ++) {
                    buf[u + v] ^= tmp[v];
                }
                memcpy(tmp, buf + u, rlen);
                tmp[rlen] = 0x80;
                memset(tmp + rlen + 1, 0, 31 - rlen);
                break;
            }
        }

        /*
         * On exit, the last partial ciphertext block, padded,
         * is in tmp[0..31], and the current Cascade state is
         * in tmp[32..63].
         */
        memcpy(tag, tmp, 32);
        saturnin_x2_key_expand_even(keybuf2, tmp + 32);
        for (i = 0; i < 32; i ++) {
            keybuf2[i] <<= 1;
        }
        memcpy(tmp + 32, tmp, 32);
        saturnin_x2_block_encrypt(10, RC_10_combined_1_5_X2, keybuf2, tmp);
        xor32(tag, tmp + 32);
    } else {
        memset(tmp + 32, 0, 32);
        memcpy(tmp, nonce, 32);
        tmp[31] = 0x01;
        saturnin_x2_block_encrypt(10, RC_10_combined_1_4_X2, keybuf, tmp);
        for (u = 0; u < len; u ++) {
            tmp[u] ^= buf[u];
        }
        memcpy(buf, tmp, len);
        tmp[len] = 0x80;
        memset(tmp + len + 1, 0, 31 - len);
        saturnin_x2_key_expand_even(keybuf2, tag);
        for (i = 0; i < 32; i ++) {
            keybuf2[i] <<= 1;
        }
        memcpy(tag, tmp, 32);
        memcpy(tmp + 32, tmp, 32);
        saturnin_x2_block_encrypt(10, RC_10_combined_1_5_X2, keybuf2, tmp);
        xor32(tag, tmp + 32);
    }

    /*
     * The tag goes at the end of the ciphertext.
     */
    memcpy(c + len, tag, sizeof tag);
    *clen = len + sizeof tag;
    return 0;
}
int
crypto_aead_decrypt(unsigned char *m, unsigned long long *mlen,
    unsigned char *nsec,
    const unsigned char *c, unsigned long long clen,
    const unsigned char *ad, unsigned long long adlen,
    const unsigned char *npub, const unsigned char *k)
{
    uint32_t keybuf[32], keybuf2[32];
    uint8_t nonce[32], tag[32], received_tag[32], tmp[64];
    uint8_t *buf;
    uint32_t ctr;
    size_t u, v, len, rlen;
    unsigned tcc;
    int i;

    /*
     * In this implementation, we limit the input length to less
     * than 2^32-3 blocks (i.e. about 137.4 gigabytes), which allows
     * us to keep the block counter on a single 32-bit integer.
     */
    if ((clen >> 5) >= 0xFFFFFFFE) {
        return -2;
    }
    len = (size_t)clen;
    if (len < sizeof tag) {
        return -1;
    }
    len -= sizeof tag;

    /*
     * Pad the nonce into a 32-byte block.
     */
    (void)nsec;
    memcpy(nonce, npub, 16);
    nonce[16] = 0x80;
    memset(nonce + 17, 0, 15);

    /*
     * Expand the key.
     */
    saturnin_x2_key_expand_even(keybuf, (const uint8_t *)k);

    /*
     * Process the AAD.
     */
    do_cascade_aad(tag, keybuf, nonce, (const uint8_t *)ad, (size_t)adlen);

    /*
     * Move the ciphertext to the plaintext (in-place decryption). We
     * must first copy the tag to a safe place, in case of overlap.
     */
    memcpy(received_tag, c + len, 32);
    memmove(m, c, len);
    buf = (uint8_t *)m;

    /*
     * Do CTR+Cascade. At each iteration, we encrypt the current counter
     * and the ciphertext block. The main loop processes only full
     * blocks. Upon loop entry, the current Cascade state is expected
     * in tmp[32..63].
     */
    memcpy(tmp + 32, tag, 32);
    for (u = 0; (u + 31) < len; u += 32) {
        memcpy(tmp, nonce, 28);
        ctr = (u >> 5) + 1;
        tmp[28] = (uint8_t)(ctr >> 24);
        tmp[29] = (uint8_t)(ctr >> 16);
        tmp[30] = (uint8_t)(ctr >> 8);
        tmp[31] = (uint8_t)ctr;
        saturnin_x2_key_expand_even(keybuf2, tmp + 32);
        for (i = 0; i < 32; i ++) {
            keybuf2[i] = (keybuf2[i] << 1) | keybuf[i];
        }
        memcpy(tmp + 32, buf + u, 32);
        saturnin_x2_block_encrypt(10, RC_10_combined_1_4_X2, keybuf2, tmp);
        xor32(tmp + 32, buf + u);
        xor32(buf + u, tmp);
    }

    memcpy(tmp, nonce, 28);
    ctr = (u >> 5) + 1;
    tmp[28] = (uint8_t)(ctr >> 24);
    tmp[29] = (uint8_t)(ctr >> 16);
    tmp[30] = (uint8_t)(ctr >> 8);
    tmp[31] = (uint8_t)ctr;
    saturnin_x2_key_expand_even(keybuf2, tmp + 32);
    for (i = 0; i < 32; i ++) {
        keybuf2[i] = (keybuf2[i] << 1) | keybuf[i];
    }
    rlen = len - u;
    if (rlen > 0) {
        memcpy(tmp + 32, buf + u, rlen);
    }
    tmp[32 + rlen] = 0x80;
    memset(tmp + 32 + rlen + 1, 0, 31 - rlen);
    memcpy(tag, tmp + 32, 32);
    saturnin_x2_block_encrypt(10, RC_10_combined_1_5_X2, keybuf2, tmp);
    for (v = 0; v < rlen; v ++) {
        buf[u + v] ^= tmp[v];
    }
    xor32(tag, tmp + 32);
    *mlen = len;

    /*
     * Compare the computed tag with the received value. If they
     * match, tcc will be 0; otherwise, tcc will be 1.
     */
    tcc = 0;
    for (u = 0; u < sizeof tag; u ++) {
        tcc |= tag[u] ^ received_tag[u];
    }
    tcc = (tcc + 0xFF) >> 8;

    /*
     * Returned value is 0 on success (tags match), -1 on error.
     */
    return -(int)tcc;
}
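/*
 * Usage sketch (illustration only, not part of the NIST API surface):
 * an encrypt/decrypt round trip through the functions above. Sizes
 * follow the code: 32-byte key, 16-byte public nonce, 32-byte tag
 * appended to the ciphertext. The guard macro, the function name and
 * the all-zero key/nonce values are placeholders.
 */
#ifdef SATURNIN_BS32X_ILLUSTRATION
static int
saturnin_ctr_cascade_roundtrip(void)
{
    unsigned char key[32] = { 0 };
    unsigned char npub[16] = { 0 };
    unsigned char msg[5] = { 'h', 'e', 'l', 'l', 'o' };
    unsigned char ad[3] = { 'a', 'a', 'd' };
    unsigned char ct[sizeof msg + 32];
    unsigned char pt[sizeof msg];
    unsigned long long ctlen, ptlen;

    if (crypto_aead_encrypt(ct, &ctlen, msg, sizeof msg,
        ad, sizeof ad, NULL, npub, key) != 0)
    {
        return -1;
    }
    if (crypto_aead_decrypt(pt, &ptlen, NULL, ct, ctlen,
        ad, sizeof ad, npub, key) != 0)
    {
        return -1;
    }
    return (ptlen == sizeof msg && memcmp(pt, msg, sizeof msg) == 0) ? 0 : -1;
}
#endif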