/* * Copyright (C) 2021 Southern Storm Software, Pty Ltd. * * Permission is hereby granted, free of charge, to any person obtaining a * copy of this software and associated documentation files (the "Software"), * to deal in the Software without restriction, including without limitation * the rights to use, copy, modify, merge, publish, distribute, sublicense, * and/or sell copies of the Software, and to permit persons to whom the * Software is furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included * in all copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER * DEALINGS IN THE SOFTWARE. */ #include "internal-keccakp-200.h" /* Determine if Keccak-p[200] should be accelerated with assembly code */ #if defined(__AVR__) #define KECCAKP_200_ASM 1 #elif defined(__ARM_ARCH_ISA_THUMB) && __ARM_ARCH == 7 #define KECCAKP_200_ASM 1 #else #define KECCAKP_200_ASM 0 #endif #if !KECCAKP_200_ASM #if !defined(LW_UTIL_CPU_IS_64BIT) /* Define to 1 to select the optimised 32-bit version of Keccak-p[200] */ #define KECCAKP_200_OPT32 1 /* Define to 1 to select the optimised 64-bit version of Keccak-p[200] */ #define KECCAKP_200_OPT64 0 #else #define KECCAKP_200_OPT32 0 #define KECCAKP_200_OPT64 1 #endif #if KECCAKP_200_OPT32 /* * Optimized version for 32-bit platforms, inspired by the ARMv7 code at: * * https://github.com/XKCP/XKCP/blob/master/lib/low/KeccakP-200/ARM/KeccakP-200-armv7m-le-gcc.s * * Multiple 8-bit lanes are loaded up into 32-bit registers and processed * in parallel. */ void keccakp_200_permute(keccakp_200_state_t *state) { static uint8_t const RC[18] = { 0x01, 0x82, 0x8A, 0x00, 0x8B, 0x01, 0x81, 0x09, 0x8A, 0x88, 0x09, 0x0A, 0x8B, 0x8B, 0x89, 0x03, 0x02, 0x80 }; uint32_t r0_l, r0_r; /* Left and right halves of row 0 */ uint32_t r1_l, r1_r; /* Left and right halves of row 1 */ uint32_t r2_l, r2_r; /* Left and right halves of row 2 */ uint32_t r3_l, r3_r; /* Left and right halves of row 3 */ uint32_t r4_l, r4_r; /* Left and right halves of row 4 */ uint32_t C_l, C_r; unsigned round; /* Load the state into the row vectors */ r0_l = le_load_word32(&(state->A[0][0])); r1_l = le_load_word32(&(state->A[1][0])); r2_l = le_load_word32(&(state->A[2][0])); r3_l = le_load_word32(&(state->A[3][0])); r4_l = le_load_word32(&(state->A[4][0])); r0_r = state->A[0][4]; r1_r = state->A[1][4]; r2_r = state->A[2][4]; r3_r = state->A[3][4]; r4_r = state->A[4][4]; /* Perform all rounds */ for (round = 0; round < 18; ++round) { /* Step mapping theta */ /* * C[index] = state->A[0][index] ^ state->A[1][index] ^ * state->A[2][index] ^ state->A[3][index] ^ * state->A[4][index]; */ C_l = r0_l ^ r1_l ^ r2_l ^ r3_l ^ r4_l; C_r = r0_r ^ r1_r ^ r2_r ^ r3_r ^ r4_r; /* * D = C[(index + 4) % 5] ^ leftRotate1_8(C[(index + 1) % 5]) */ C_r = (((C_l & 0x7F7F7F7FUL) >> 7) | ((C_l & 0x80808080UL) >> 15) | ((C_r & 0x7FUL) << 25) | ((C_r & 0x80UL) << 17)) ^ ((C_l << 8) | C_r); C_l = (((C_l & 0x7FL) << 1) | ((C_l & 0x80UL) >> 7)) ^ (C_l >> 24); /* * Apply D to all rows. The left word of D is in the right word of C. */ r0_l ^= C_r; r1_l ^= C_r; r2_l ^= C_r; r3_l ^= C_r; r4_l ^= C_r; r0_r ^= C_l; r1_r ^= C_l; r2_r ^= C_l; r3_r ^= C_l; r4_r ^= C_l; /* Step mapping rho and pi combined into a single step. * Rotate all lanes by a specific offset and rearrange */ #define MASK_OFF(col) (~(0xFFUL << (((col) & 3) * 8))) #define RHO_PI(dest, destcol, src, srccol, rot) \ do { \ C_l = ((src) >> (((srccol) & 3) * 8)) & 0xFFUL; \ C_l = ((C_l << (rot)) | (C_l >> (8 - (rot)))) & 0xFFUL; \ (dest) = ((dest) & MASK_OFF((destcol))) | \ (C_l << (((destcol) & 3) * 8)); \ } while (0) #define RHO_PI_COPY(dest, destcol, src, srccol) \ do { \ C_l = ((src) >> (((srccol) & 3) * 8)) & 0xFFUL; \ (dest) = ((dest) & MASK_OFF((destcol))) | \ (C_l << (((destcol) & 3) * 8)); \ } while (0) /* D = state->A[0][1]; */ C_r = (r0_l >> 8) & 0xFFUL; /* state->A[0][1] = leftRotate4_8(state->A[1][1]); */ RHO_PI(r0_l, 1, r1_l, 1, 4); /* state->A[1][1] = leftRotate4_8(state->A[1][4]); */ RHO_PI(r1_l, 1, r1_r, 4, 4); /* state->A[1][4] = leftRotate5_8(state->A[4][2]); */ RHO_PI(r1_r, 4, r4_l, 2, 5); /* state->A[4][2] = leftRotate7_8(state->A[2][4]); */ RHO_PI(r4_l, 2, r2_r, 4, 7); /* state->A[2][4] = leftRotate2_8(state->A[4][0]); */ RHO_PI(r2_r, 4, r4_l, 0, 2); /* state->A[4][0] = leftRotate6_8(state->A[0][2]); */ RHO_PI(r4_l, 0, r0_l, 2, 6); /* state->A[0][2] = leftRotate3_8(state->A[2][2]); */ RHO_PI(r0_l, 2, r2_l, 2, 3); /* state->A[2][2] = leftRotate1_8(state->A[2][3]); */ RHO_PI(r2_l, 2, r2_l, 3, 1); /* state->A[2][3] = state->A[3][4]; */ RHO_PI_COPY(r2_l, 3, r3_r, 4); /* state->A[3][4] = state->A[4][3]; */ RHO_PI_COPY(r3_r, 4, r4_l, 3); /* state->A[4][3] = leftRotate1_8(state->A[3][0]); */ RHO_PI(r4_l, 3, r3_l, 0, 1); /* state->A[3][0] = leftRotate3_8(state->A[0][4]); */ RHO_PI(r3_l, 0, r0_r, 4, 3); /* state->A[0][4] = leftRotate6_8(state->A[4][4]); */ RHO_PI(r0_r, 4, r4_r, 4, 6); /* state->A[4][4] = leftRotate2_8(state->A[4][1]); */ RHO_PI(r4_r, 4, r4_l, 1, 2); /* state->A[4][1] = leftRotate7_8(state->A[1][3]); */ RHO_PI(r4_l, 1, r1_l, 3, 7); /* state->A[1][3] = leftRotate5_8(state->A[3][1]); */ RHO_PI(r1_l, 3, r3_l, 1, 5); /* state->A[3][1] = leftRotate4_8(state->A[1][0]); */ RHO_PI(r3_l, 1, r1_l, 0, 4); /* state->A[1][0] = leftRotate4_8(state->A[0][3]); */ RHO_PI(r1_l, 0, r0_l, 3, 4); /* state->A[0][3] = leftRotate5_8(state->A[3][3]); */ RHO_PI(r0_l, 3, r3_l, 3, 5); /* state->A[3][3] = leftRotate7_8(state->A[3][2]); */ RHO_PI(r3_l, 3, r3_l, 2, 7); /* state->A[3][2] = leftRotate2_8(state->A[2][1]); */ RHO_PI(r3_l, 2, r2_l, 1, 2); /* state->A[2][1] = leftRotate6_8(state->A[1][2]); */ RHO_PI(r2_l, 1, r1_l, 2, 6); /* state->A[1][2] = leftRotate3_8(state->A[2][0]); */ RHO_PI(r1_l, 2, r2_l, 0, 3); /* state->A[2][0] = leftRotate1_8(D); */ r2_l = (r2_l & 0xFFFFFF00UL) | (((C_r << 1) | (C_r >> 7)) & 0xFFUL); /* Step mapping chi. Combine each lane with two others in its row */ /* * for (index = 0; index < 5; ++index) { * C[0] = state->A[index][0]; * C[1] = state->A[index][1]; * C[2] = state->A[index][2]; * C[3] = state->A[index][3]; * C[4] = state->A[index][4]; * for (index2 = 0; index2 < 5; ++index2) { * state->A[index][index2] = * C[index2] ^ * ((~C[addMod5(index2, 1)]) & C[addMod5(index2, 2)]); * } * } */ #define CHI(rl, rr) \ do { \ C_l = (~(((rl) >> 8) | ((rr) << 24))) & \ (((rl) >> 16) | ((rl) << 24) | ((rr) << 16)); \ C_r = ((~(rl)) & ((rl) >> 8)) & 0xFFUL; \ (rl) ^= C_l; \ (rr) ^= C_r; \ } while (0) CHI(r0_l, r0_r); CHI(r1_l, r1_r); CHI(r2_l, r2_r); CHI(r3_l, r3_r); CHI(r4_l, r4_r); /* Step mapping iota. XOR A[0][0] with the round constant */ r0_l ^= RC[round]; } /* Write the row vectors back to the state */ le_store_word32(&(state->A[0][0]), r0_l); le_store_word32(&(state->A[1][0]), r1_l); le_store_word32(&(state->A[2][0]), r2_l); le_store_word32(&(state->A[3][0]), r3_l); le_store_word32(&(state->A[4][0]), r4_l); state->A[0][4] = (unsigned char)r0_r; state->A[1][4] = (unsigned char)r1_r; state->A[2][4] = (unsigned char)r2_r; state->A[3][4] = (unsigned char)r3_r; state->A[4][4] = (unsigned char)r4_r; } #elif KECCAKP_200_OPT64 /* * Optimized version for 64-bit platforms, inspired by the ARMv7 code at: * * https://github.com/XKCP/XKCP/blob/master/lib/low/KeccakP-200/ARM/KeccakP-200-armv7m-le-gcc.s * * Multiple 8-bit lanes are loaded up into 64-bit registers and processed * in parallel. */ /* Load a little-endian 40-bit word from a byte buffer */ #define le_load_word40(ptr) \ ((((uint64_t)((ptr)[4])) << 32) | \ (((uint64_t)((ptr)[3])) << 24) | \ (((uint64_t)((ptr)[2])) << 16) | \ (((uint64_t)((ptr)[1])) << 8) | \ ((uint64_t)((ptr)[0]))) /* Store a little-endian 40-bit word into a byte buffer */ #define le_store_word40(ptr, x) \ do { \ (ptr)[0] = (uint8_t)(x); \ (ptr)[1] = (uint8_t)((x) >> 8); \ (ptr)[2] = (uint8_t)((x) >> 16); \ (ptr)[3] = (uint8_t)((x) >> 24); \ (ptr)[4] = (uint8_t)((x) >> 32); \ } while (0) void keccakp_200_permute(keccakp_200_state_t *state) { static uint8_t const RC[18] = { 0x01, 0x82, 0x8A, 0x00, 0x8B, 0x01, 0x81, 0x09, 0x8A, 0x88, 0x09, 0x0A, 0x8B, 0x8B, 0x89, 0x03, 0x02, 0x80 }; uint64_t r0, r1, r2, r3, r4; uint64_t C, D; unsigned round; /* Load the state into the row vectors */ r0 = le_load_word40(&(state->A[0][0])); r1 = le_load_word40(&(state->A[1][0])); r2 = le_load_word40(&(state->A[2][0])); r3 = le_load_word40(&(state->A[3][0])); r4 = le_load_word40(&(state->A[4][0])); /* Perform all rounds */ for (round = 0; round < 18; ++round) { /* Step mapping theta */ /* * C[index] = state->A[0][index] ^ state->A[1][index] ^ * state->A[2][index] ^ state->A[3][index] ^ * state->A[4][index]; */ C = r0 ^ r1 ^ r2 ^ r3 ^ r4; /* * D = C[(index + 4) % 5] ^ leftRotate1_8(C[(index + 1) % 5]) */ D = ((C & 0x7F7F7F7F7FULL) << 1) | ((C & 0x8080808080ULL) >> 7); D = (D >> 8) | (D << 32); D ^= (C << 8) ^ (C >> 32); D &= 0xFFFFFFFFFFULL; /* * Apply D to all rows. */ r0 ^= D; r1 ^= D; r2 ^= D; r3 ^= D; r4 ^= D; /* Step mapping rho and pi combined into a single step. * Rotate all lanes by a specific offset and rearrange */ #define MASK_OFF(col) (~(0xFFULL << ((col) * 8))) #define RHO_PI(dest, destcol, src, srccol, rot) \ do { \ C = ((src) >> ((srccol) * 8)) & 0xFFULL; \ C = ((C << (rot)) | (C >> (8 - (rot)))) & 0xFFULL; \ (dest) = ((dest) & MASK_OFF((destcol))) | \ (C << ((destcol) * 8)); \ } while (0) #define RHO_PI_COPY(dest, destcol, src, srccol) \ do { \ C = ((src) >> ((srccol) * 8)) & 0xFFULL; \ (dest) = ((dest) & MASK_OFF((destcol))) | \ (C << ((destcol) * 8)); \ } while (0) /* D = state->A[0][1]; */ D = (r0 >> 8) & 0xFFULL; /* state->A[0][1] = leftRotate4_8(state->A[1][1]); */ RHO_PI(r0, 1, r1, 1, 4); /* state->A[1][1] = leftRotate4_8(state->A[1][4]); */ RHO_PI(r1, 1, r1, 4, 4); /* state->A[1][4] = leftRotate5_8(state->A[4][2]); */ RHO_PI(r1, 4, r4, 2, 5); /* state->A[4][2] = leftRotate7_8(state->A[2][4]); */ RHO_PI(r4, 2, r2, 4, 7); /* state->A[2][4] = leftRotate2_8(state->A[4][0]); */ RHO_PI(r2, 4, r4, 0, 2); /* state->A[4][0] = leftRotate6_8(state->A[0][2]); */ RHO_PI(r4, 0, r0, 2, 6); /* state->A[0][2] = leftRotate3_8(state->A[2][2]); */ RHO_PI(r0, 2, r2, 2, 3); /* state->A[2][2] = leftRotate1_8(state->A[2][3]); */ RHO_PI(r2, 2, r2, 3, 1); /* state->A[2][3] = state->A[3][4]; */ RHO_PI_COPY(r2, 3, r3, 4); /* state->A[3][4] = state->A[4][3]; */ RHO_PI_COPY(r3, 4, r4, 3); /* state->A[4][3] = leftRotate1_8(state->A[3][0]); */ RHO_PI(r4, 3, r3, 0, 1); /* state->A[3][0] = leftRotate3_8(state->A[0][4]); */ RHO_PI(r3, 0, r0, 4, 3); /* state->A[0][4] = leftRotate6_8(state->A[4][4]); */ RHO_PI(r0, 4, r4, 4, 6); /* state->A[4][4] = leftRotate2_8(state->A[4][1]); */ RHO_PI(r4, 4, r4, 1, 2); /* state->A[4][1] = leftRotate7_8(state->A[1][3]); */ RHO_PI(r4, 1, r1, 3, 7); /* state->A[1][3] = leftRotate5_8(state->A[3][1]); */ RHO_PI(r1, 3, r3, 1, 5); /* state->A[3][1] = leftRotate4_8(state->A[1][0]); */ RHO_PI(r3, 1, r1, 0, 4); /* state->A[1][0] = leftRotate4_8(state->A[0][3]); */ RHO_PI(r1, 0, r0, 3, 4); /* state->A[0][3] = leftRotate5_8(state->A[3][3]); */ RHO_PI(r0, 3, r3, 3, 5); /* state->A[3][3] = leftRotate7_8(state->A[3][2]); */ RHO_PI(r3, 3, r3, 2, 7); /* state->A[3][2] = leftRotate2_8(state->A[2][1]); */ RHO_PI(r3, 2, r2, 1, 2); /* state->A[2][1] = leftRotate6_8(state->A[1][2]); */ RHO_PI(r2, 1, r1, 2, 6); /* state->A[1][2] = leftRotate3_8(state->A[2][0]); */ RHO_PI(r1, 2, r2, 0, 3); /* state->A[2][0] = leftRotate1_8(D); */ r2 = (r2 & MASK_OFF(0)) | (((D << 1) | (D >> 7)) & 0xFFULL); /* Step mapping chi. Combine each lane with two others in its row */ /* * for (index = 0; index < 5; ++index) { * C[0] = state->A[index][0]; * C[1] = state->A[index][1]; * C[2] = state->A[index][2]; * C[3] = state->A[index][3]; * C[4] = state->A[index][4]; * for (index2 = 0; index2 < 5; ++index2) { * state->A[index][index2] = * C[index2] ^ * ((~C[addMod5(index2, 1)]) & C[addMod5(index2, 2)]); * } * } */ r0 = r0 ^ ((~((r0 >> 8) | (r0 << 32))) & ((r0 >> 16) | (r0 << 24))); r1 = r1 ^ ((~((r1 >> 8) | (r1 << 32))) & ((r1 >> 16) | (r1 << 24))); r2 = r2 ^ ((~((r2 >> 8) | (r2 << 32))) & ((r2 >> 16) | (r2 << 24))); r3 = r3 ^ ((~((r3 >> 8) | (r3 << 32))) & ((r3 >> 16) | (r3 << 24))); r4 = r4 ^ ((~((r4 >> 8) | (r4 << 32))) & ((r4 >> 16) | (r4 << 24))); /* Step mapping iota. XOR A[0][0] with the round constant * and restrict the 64-bit registers back to 40-bit values */ r0 = (r0 & 0xFFFFFFFFFFULL) ^ RC[round]; r1 &= 0xFFFFFFFFFFULL; r2 &= 0xFFFFFFFFFFULL; r3 &= 0xFFFFFFFFFFULL; r4 &= 0xFFFFFFFFFFULL; } /* Write the row vectors back to the state */ le_store_word40(&(state->A[0][0]), r0); le_store_word40(&(state->A[1][0]), r1); le_store_word40(&(state->A[2][0]), r2); le_store_word40(&(state->A[3][0]), r3); le_store_word40(&(state->A[4][0]), r4); } #else /* !KECCAKP_200_OPT32 && !KECCAKP_200_OPT64 */ /* Faster method to compute ((x + y) % 5) that avoids the division */ static unsigned char const addMod5Table[9] = { 0, 1, 2, 3, 4, 0, 1, 2, 3 }; #define addMod5(x, y) (addMod5Table[(x) + (y)]) void keccakp_200_permute(keccakp_200_state_t *state) { static uint8_t const RC[18] = { 0x01, 0x82, 0x8A, 0x00, 0x8B, 0x01, 0x81, 0x09, 0x8A, 0x88, 0x09, 0x0A, 0x8B, 0x8B, 0x89, 0x03, 0x02, 0x80 }; uint8_t C[5]; uint8_t D; unsigned round; unsigned index, index2; for (round = 0; round < 18; ++round) { /* Step mapping theta. The specification mentions two temporary * arrays of size 5 called C and D. Compute D on the fly */ for (index = 0; index < 5; ++index) { C[index] = state->A[0][index] ^ state->A[1][index] ^ state->A[2][index] ^ state->A[3][index] ^ state->A[4][index]; } for (index = 0; index < 5; ++index) { D = C[addMod5(index, 4)] ^ leftRotate1_8(C[addMod5(index, 1)]); for (index2 = 0; index2 < 5; ++index2) state->A[index2][index] ^= D; } /* Step mapping rho and pi combined into a single step. * Rotate all lanes by a specific offset and rearrange */ D = state->A[0][1]; state->A[0][1] = leftRotate4_8(state->A[1][1]); state->A[1][1] = leftRotate4_8(state->A[1][4]); state->A[1][4] = leftRotate5_8(state->A[4][2]); state->A[4][2] = leftRotate7_8(state->A[2][4]); state->A[2][4] = leftRotate2_8(state->A[4][0]); state->A[4][0] = leftRotate6_8(state->A[0][2]); state->A[0][2] = leftRotate3_8(state->A[2][2]); state->A[2][2] = leftRotate1_8(state->A[2][3]); state->A[2][3] = state->A[3][4]; state->A[3][4] = state->A[4][3]; state->A[4][3] = leftRotate1_8(state->A[3][0]); state->A[3][0] = leftRotate3_8(state->A[0][4]); state->A[0][4] = leftRotate6_8(state->A[4][4]); state->A[4][4] = leftRotate2_8(state->A[4][1]); state->A[4][1] = leftRotate7_8(state->A[1][3]); state->A[1][3] = leftRotate5_8(state->A[3][1]); state->A[3][1] = leftRotate4_8(state->A[1][0]); state->A[1][0] = leftRotate4_8(state->A[0][3]); state->A[0][3] = leftRotate5_8(state->A[3][3]); state->A[3][3] = leftRotate7_8(state->A[3][2]); state->A[3][2] = leftRotate2_8(state->A[2][1]); state->A[2][1] = leftRotate6_8(state->A[1][2]); state->A[1][2] = leftRotate3_8(state->A[2][0]); state->A[2][0] = leftRotate1_8(D); /* Step mapping chi. Combine each lane with two others in its row */ for (index = 0; index < 5; ++index) { C[0] = state->A[index][0]; C[1] = state->A[index][1]; C[2] = state->A[index][2]; C[3] = state->A[index][3]; C[4] = state->A[index][4]; for (index2 = 0; index2 < 5; ++index2) { state->A[index][index2] = C[index2] ^ ((~C[addMod5(index2, 1)]) & C[addMod5(index2, 2)]); } } /* Step mapping iota. XOR A[0][0] with the round constant */ state->A[0][0] ^= RC[round]; } } #endif /* !KECCAKP_200_OPT32 && !KECCAKP_200_OPT64 */ #endif /* !KECCAKP_200_ASM */