From a6544c4fc2a5d180b6d12e7b28ac8432194a3f41 Mon Sep 17 00:00:00 2001
From: Alexandre Adomnicai
Date: Tue, 2 Jun 2020 11:09:34 +0000
Subject: [PATCH] romulus opt32

---
 romulus/Implementations/crypto_aead/romulusm1+/opt32/skinny128.c   |  12 ++----------
 romulus/Implementations/crypto_aead/romulusm1+/opt32/tk_schedule.c | 100 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++---------------
 romulus/Implementations/crypto_aead/romulusm1/opt32/skinny128.c    |  16 ++--------------
 romulus/Implementations/crypto_aead/romulusm1/opt32/tk_schedule.c  | 102 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++----------------
 romulus/Implementations/crypto_aead/romulusn1+/opt32/skinny128.c   |  12 ++----------
 romulus/Implementations/crypto_aead/romulusn1+/opt32/tk_schedule.c | 100 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++---------------
 romulus/Implementations/crypto_aead/romulusn1/opt32/skinny128.c    |  16 ++--------------
 romulus/Implementations/crypto_aead/romulusn1/opt32/tk_schedule.c  | 102 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++----------------
 8 files changed, 350 insertions(+), 110 deletions(-)

diff --git a/romulus/Implementations/crypto_aead/romulusm1+/opt32/skinny128.c b/romulus/Implementations/crypto_aead/romulusm1+/opt32/skinny128.c
index a1061d5..5902d16 100644
--- a/romulus/Implementations/crypto_aead/romulusm1+/opt32/skinny128.c
+++ b/romulus/Implementations/crypto_aead/romulusm1+/opt32/skinny128.c
@@ -93,15 +93,7 @@ void skinny128_384_plus(u8* ctext, const u8* ptext, const u32* rtk1,
     u32 tmp;                                // used in SWAPMOVE macro
     u32 state[4];                           // 128-bit state
     packing(state, ptext);                  // from byte to bitsliced representation
-    QUADRUPLE_ROUND(state, rtk1, rtk2_3);
-    QUADRUPLE_ROUND(state, rtk1+16, rtk2_3+16);
-    QUADRUPLE_ROUND(state, rtk1+32, rtk2_3+32);
-    QUADRUPLE_ROUND(state, rtk1+48, rtk2_3+48);
-    QUADRUPLE_ROUND(state, rtk1, rtk2_3+64);
-    QUADRUPLE_ROUND(state, rtk1+16, rtk2_3+80);
-    QUADRUPLE_ROUND(state, rtk1+32, rtk2_3+96);
-    QUADRUPLE_ROUND(state, rtk1+48, rtk2_3+112);
-    QUADRUPLE_ROUND(state, rtk1, rtk2_3+128);
-    QUADRUPLE_ROUND(state, rtk1+16, rtk2_3+144);
+    for(int i = 0; i < 10; i++)
+        QUADRUPLE_ROUND(state, rtk1 + (i%4)*16, rtk2_3 + i*16);
     unpacking(ctext, state);                // from bitsliced to byte representation
 }
\ No newline at end of file
diff --git a/romulus/Implementations/crypto_aead/romulusm1+/opt32/tk_schedule.c b/romulus/Implementations/crypto_aead/romulusm1+/opt32/tk_schedule.c
index 1da4277..5a20b37 100644
--- a/romulus/Implementations/crypto_aead/romulusm1+/opt32/tk_schedule.c
+++ b/romulus/Implementations/crypto_aead/romulusm1+/opt32/tk_schedule.c
@@ -260,6 +260,70 @@ void precompute_lfsr_tk3(u32* tk, const u8* key, const int rounds) {
 }
 
 /******************************************************************************
+* Precompute LFSR2(TK2) ^ LFSR3(TK3) for all round tweakeys.
+* It is equivalent to the following two function calls:
+* - precompute_lfsr_tk2(tk, t2, SKINNY128_384_ROUNDS);
+* - precompute_lfsr_tk3(tk, t3, SKINNY128_384_ROUNDS);
+* However, 'precompute_lfsr_tk2_3' can save cycles on some platforms.
+* On ARMv7, one should observe a gain of ~1k cycles per function call, which is
+* explained by the fact that fewer memory accesses to 'tk' are performed.
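+* As a rough illustration of the saving (placeholder names, not identifiers
+* from this file): the two separate calls touch every round tweakey word in
+* 'tk' twice,
+*     tk[i] ^= lfsr2_word;    // 1st pass, precompute_lfsr_tk2
+*     tk[i] ^= lfsr3_word;    // 2nd pass, precompute_lfsr_tk3 (extra load/store)
+* whereas the fused loop below writes each word once:
+*     tk[i] = lfsr2_word ^ lfsr3_word;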
+*
+* To save some code size, the loop can be replaced by the following one:
+* for(int i = 0 ; i < rounds; i+=2) {
+*     lfsr2_bs(tk2);
+*     lfsr3_bs(tk3);
+*     tk[i*4+4] = tk2[0] ^ tk3[0];
+*     tk[i*4+5] = tk2[1] ^ tk3[1];
+*     tk[i*4+6] = tk2[2] ^ tk3[2];
+*     tk[i*4+7] = tk2[3] ^ tk3[3];
+* }
+* at the cost of some cycles (~260 on ARM Cortex-M).
+******************************************************************************/
+void precompute_lfsr_tk2_3(u32* tk, const u8* t2, const u8* t3, const int rounds) {
+    u32 tmp, tk2[4], tk3[4];
+    packing(tk2, t2);
+    packing(tk3, t3);
+    tk[0] = tk2[0] ^ tk3[0];
+    tk[1] = tk2[1] ^ tk3[1];
+    tk[2] = tk2[2] ^ tk3[2];
+    tk[3] = tk2[3] ^ tk3[3];
+    for(int i = 0 ; i < rounds; i+=8) {
+        tk2[0] ^= (tk2[2] & 0xaaaaaaaa);
+        tk2[0] = ((tk2[0] & 0xaaaaaaaa) >> 1) | ((tk2[0] << 1) & 0xaaaaaaaa);
+        tk3[3] ^= ((tk3[1] & 0xaaaaaaaa) >> 1);
+        tk3[3] = ((tk3[3] & 0xaaaaaaaa) >> 1) | ((tk3[3] << 1) & 0xaaaaaaaa);
+        tk[i*4+4] = tk2[1] ^ tk3[3];
+        tk[i*4+5] = tk2[2] ^ tk3[0];
+        tk[i*4+6] = tk2[3] ^ tk3[1];
+        tk[i*4+7] = tk2[0] ^ tk3[2];
+        tk2[1] ^= (tk2[3] & 0xaaaaaaaa);
+        tk2[1] = ((tk2[1] & 0xaaaaaaaa) >> 1) | ((tk2[1] << 1) & 0xaaaaaaaa);
+        tk3[2] ^= ((tk3[0] & 0xaaaaaaaa) >> 1);
+        tk3[2] = ((tk3[2] & 0xaaaaaaaa) >> 1) | ((tk3[2] << 1) & 0xaaaaaaaa);
+        tk[i*4+12] = tk2[2] ^ tk3[2];
+        tk[i*4+13] = tk2[3] ^ tk3[3];
+        tk[i*4+14] = tk2[0] ^ tk3[0];
+        tk[i*4+15] = tk2[1] ^ tk3[1];
+        tk2[2] ^= (tk2[0] & 0xaaaaaaaa);
+        tk2[2] = ((tk2[2] & 0xaaaaaaaa) >> 1) | ((tk2[2] << 1) & 0xaaaaaaaa);
+        tk3[1] ^= ((tk3[3] & 0xaaaaaaaa) >> 1);
+        tk3[1] = ((tk3[1] & 0xaaaaaaaa) >> 1) | ((tk3[1] << 1) & 0xaaaaaaaa);
+        tk[i*4+20] = tk2[3] ^ tk3[1];
+        tk[i*4+21] = tk2[0] ^ tk3[2];
+        tk[i*4+22] = tk2[1] ^ tk3[3];
+        tk[i*4+23] = tk2[2] ^ tk3[0];
+        tk2[3] ^= (tk2[1] & 0xaaaaaaaa);
+        tk2[3] = ((tk2[3] & 0xaaaaaaaa) >> 1) | ((tk2[3] << 1) & 0xaaaaaaaa);
+        tk3[0] ^= ((tk3[2] & 0xaaaaaaaa) >> 1);
+        tk3[0] = ((tk3[0] & 0xaaaaaaaa) >> 1) | ((tk3[0] << 1) & 0xaaaaaaaa);
+        tk[i*4+28] = tk2[0] ^ tk3[0];
+        tk[i*4+29] = tk2[1] ^ tk3[1];
+        tk[i*4+30] = tk2[2] ^ tk3[2];
+        tk[i*4+31] = tk2[3] ^ tk3[3];
+    }
+}
+
+/******************************************************************************
 * XOR TK with TK1 before applying the permutations.
 * The key is then rearranged to match the barrel shiftrows representation.
 ******************************************************************************/
@@ -267,19 +331,20 @@ void permute_tk(u32* tk, const u8* key, const int rounds) {
     u32 test;
     u32 tk1[4], tmp[4];
     packing(tk1, key);
-    memcpy(tmp, tk, 16);
-    tmp[0] ^= tk1[0];
-    tmp[1] ^= tk1[1];
-    tmp[2] ^= tk1[2];
-    tmp[3] ^= tk1[3];
+    tmp[0] = tk[0] ^ tk1[0];
+    tmp[1] = tk[1] ^ tk1[1];
+    tmp[2] = tk[2] ^ tk1[2];
+    tmp[3] = tk[3] ^ tk1[3];
     for(int i = 0 ; i < rounds; i += 8) {
         test = (i % 16 < 8) ? 1 : 0; //to apply the right power of P
         tk[i*4] = tmp[2] & 0xf0f0f0f0;
         tk[i*4+1] = tmp[3] & 0xf0f0f0f0;
         tk[i*4+2] = tmp[0] & 0xf0f0f0f0;
         tk[i*4+3] = tmp[1] & 0xf0f0f0f0;
-        memcpy(tmp, tk+i*4+4, 16);
-        XOR_BLOCKS(tmp, tk1);
+        tmp[0] = tk[i*4+4] ^ tk1[0];
+        tmp[1] = tk[i*4+5] ^ tk1[1];
+        tmp[2] = tk[i*4+6] ^ tk1[2];
+        tmp[3] = tk[i*4+7] ^ tk1[3];
         if (test)
             permute_tk_2(tmp); // applies P^2
         else
@@ -296,8 +361,10 @@ void permute_tk(u32* tk, const u8* key, const int rounds) {
         tk[i*4+10] |= ROR(tmp[0],12) & 0x0c0c0c0c;
         tk[i*4+11] = ROR(tmp[1],28) & 0x03030303;
         tk[i*4+11] |= ROR(tmp[1],12) & 0x0c0c0c0c;
-        memcpy(tmp, tk+i*4+12, 16);
-        XOR_BLOCKS(tmp, tk1);
+        tmp[0] = tk[i*4+12] ^ tk1[0];
+        tmp[1] = tk[i*4+13] ^ tk1[1];
+        tmp[2] = tk[i*4+14] ^ tk1[2];
+        tmp[3] = tk[i*4+15] ^ tk1[3];
         if (test)
             permute_tk_4(tmp); // applies P^4
         else
@@ -310,8 +377,10 @@ void permute_tk(u32* tk, const u8* key, const int rounds) {
         tk[i*4+17] = ROR(tmp[3], 16) & 0xf0f0f0f0;
         tk[i*4+18] = ROR(tmp[0], 16) & 0xf0f0f0f0;
         tk[i*4+19] = ROR(tmp[1], 16) & 0xf0f0f0f0;
-        memcpy(tmp, tk+i*4+20, 16);
-        XOR_BLOCKS(tmp, tk1);
+        tmp[0] = tk[i*4+20] ^ tk1[0];
+        tmp[1] = tk[i*4+21] ^ tk1[1];
+        tmp[2] = tk[i*4+22] ^ tk1[2];
+        tmp[3] = tk[i*4+23] ^ tk1[3];
         if (test)
             permute_tk_6(tmp); // applies P^6
         else
@@ -328,8 +397,10 @@ void permute_tk(u32* tk, const u8* key, const int rounds) {
         tk[i*4+26] |= ROR(tmp[0],28) & 0x0c0c0c0c;
         tk[i*4+27] = ROR(tmp[1],12) & 0x03030303;
         tk[i*4+27] |= ROR(tmp[1],28) & 0x0c0c0c0c;
-        memcpy(tmp, tk+i*4+28, 16);
-        XOR_BLOCKS(tmp, tk1);
+        tmp[0] = tk[i*4+28] ^ tk1[0];
+        tmp[1] = tk[i*4+29] ^ tk1[1];
+        tmp[2] = tk[i*4+30] ^ tk1[2];
+        tmp[3] = tk[i*4+31] ^ tk1[3];
         if (test)
             permute_tk_8(tmp); // applies P^8
         for(int j = 0; j < 4; j++) {
@@ -350,8 +421,7 @@ void permute_tk(u32* tk, const u8* key, const int rounds) {
 ******************************************************************************/
 void precompute_rtk2_3(u32* rtk, const u8* tk2, const u8 * tk3) {
     memset(rtk, 0x00, 16*SKINNY128_384_ROUNDS);
-    precompute_lfsr_tk2(rtk, tk2, SKINNY128_384_ROUNDS);
-    precompute_lfsr_tk3(rtk, tk3, SKINNY128_384_ROUNDS);
+    precompute_lfsr_tk2_3(rtk, tk2, tk3, SKINNY128_384_ROUNDS);
     permute_tk(rtk, (u8*)(rtk+8), SKINNY128_384_ROUNDS); // rtk+8 is NULL
     for(int i = 0; i < SKINNY128_384_ROUNDS; i++) { // add rconsts
         for(int j = 0; j < 4; j++)
diff --git a/romulus/Implementations/crypto_aead/romulusm1/opt32/skinny128.c b/romulus/Implementations/crypto_aead/romulusm1/opt32/skinny128.c
index 87718e3..326d617 100644
--- a/romulus/Implementations/crypto_aead/romulusm1/opt32/skinny128.c
+++ b/romulus/Implementations/crypto_aead/romulusm1/opt32/skinny128.c
@@ -92,19 +92,7 @@ void skinny128_384(u8* ctext, const u8* ptext, const u32* rtk1, const u32* rtk2
     u32 tmp;                                // used in SWAPMOVE macro
     u32 state[4];                           // 128-bit state
     packing(state, ptext);                  // from byte to bitsliced representation
-    QUADRUPLE_ROUND(state, rtk1, rtk2_3);
-    QUADRUPLE_ROUND(state, rtk1+16, rtk2_3+16);
-    QUADRUPLE_ROUND(state, rtk1+32, rtk2_3+32);
-    QUADRUPLE_ROUND(state, rtk1+48, rtk2_3+48);
-    QUADRUPLE_ROUND(state, rtk1, rtk2_3+64);
-    QUADRUPLE_ROUND(state, rtk1+16, rtk2_3+80);
-    QUADRUPLE_ROUND(state, rtk1+32, rtk2_3+96);
-    QUADRUPLE_ROUND(state, rtk1+48, rtk2_3+112);
-    QUADRUPLE_ROUND(state, rtk1, rtk2_3+128);
-    QUADRUPLE_ROUND(state, rtk1+16, rtk2_3+144);
-    QUADRUPLE_ROUND(state, rtk1+32, rtk2_3+160);
-    QUADRUPLE_ROUND(state, rtk1+48, rtk2_3+176);
-    QUADRUPLE_ROUND(state, rtk1, rtk2_3+192);
-    QUADRUPLE_ROUND(state, rtk1+16, rtk2_3+208);
+    for(int i = 0; i < 14; i++)
+        QUADRUPLE_ROUND(state, rtk1 + (i%4)*16, rtk2_3 + i*16);
     unpacking(ctext, state);                // from bitsliced to byte representation
 }
\ No newline at end of file
diff --git a/romulus/Implementations/crypto_aead/romulusm1/opt32/tk_schedule.c b/romulus/Implementations/crypto_aead/romulusm1/opt32/tk_schedule.c
index c818cf2..0c364f3 100644
--- a/romulus/Implementations/crypto_aead/romulusm1/opt32/tk_schedule.c
+++ b/romulus/Implementations/crypto_aead/romulusm1/opt32/tk_schedule.c
@@ -271,6 +271,70 @@ void precompute_lfsr_tk3(u32* tk, const u8* key, const int rounds) {
 }
 
 /******************************************************************************
+* Precompute LFSR2(TK2) ^ LFSR3(TK3) for all round tweakeys.
+* It is equivalent to the following two function calls:
+* - precompute_lfsr_tk2(tk, t2, SKINNY128_384_ROUNDS);
+* - precompute_lfsr_tk3(tk, t3, SKINNY128_384_ROUNDS);
+* However, 'precompute_lfsr_tk2_3' can save cycles on some platforms.
+* On ARMv7, one should observe a gain of ~1k cycles per function call, which is
+* explained by the fact that fewer memory accesses to 'tk' are performed.
+*
+* To save some code size, the loop can be replaced by the following one:
+* for(int i = 0 ; i < rounds; i+=2) {
+*     lfsr2_bs(tk2);
+*     lfsr3_bs(tk3);
+*     tk[i*4+4] = tk2[0] ^ tk3[0];
+*     tk[i*4+5] = tk2[1] ^ tk3[1];
+*     tk[i*4+6] = tk2[2] ^ tk3[2];
+*     tk[i*4+7] = tk2[3] ^ tk3[3];
+* }
+* at the cost of some cycles (~260 on ARM Cortex-M).
+******************************************************************************/
+void precompute_lfsr_tk2_3(u32* tk, const u8* t2, const u8* t3, const int rounds) {
+    u32 tmp, tk2[4], tk3[4];
+    packing(tk2, t2);
+    packing(tk3, t3);
+    tk[0] = tk2[0] ^ tk3[0];
+    tk[1] = tk2[1] ^ tk3[1];
+    tk[2] = tk2[2] ^ tk3[2];
+    tk[3] = tk2[3] ^ tk3[3];
+    for(int i = 0 ; i < rounds; i+=8) {
+        tk2[0] ^= (tk2[2] & 0xaaaaaaaa);
+        tk2[0] = ((tk2[0] & 0xaaaaaaaa) >> 1) | ((tk2[0] << 1) & 0xaaaaaaaa);
+        tk3[3] ^= ((tk3[1] & 0xaaaaaaaa) >> 1);
+        tk3[3] = ((tk3[3] & 0xaaaaaaaa) >> 1) | ((tk3[3] << 1) & 0xaaaaaaaa);
+        tk[i*4+4] = tk2[1] ^ tk3[3];
+        tk[i*4+5] = tk2[2] ^ tk3[0];
+        tk[i*4+6] = tk2[3] ^ tk3[1];
+        tk[i*4+7] = tk2[0] ^ tk3[2];
+        tk2[1] ^= (tk2[3] & 0xaaaaaaaa);
+        tk2[1] = ((tk2[1] & 0xaaaaaaaa) >> 1) | ((tk2[1] << 1) & 0xaaaaaaaa);
+        tk3[2] ^= ((tk3[0] & 0xaaaaaaaa) >> 1);
+        tk3[2] = ((tk3[2] & 0xaaaaaaaa) >> 1) | ((tk3[2] << 1) & 0xaaaaaaaa);
+        tk[i*4+12] = tk2[2] ^ tk3[2];
+        tk[i*4+13] = tk2[3] ^ tk3[3];
+        tk[i*4+14] = tk2[0] ^ tk3[0];
+        tk[i*4+15] = tk2[1] ^ tk3[1];
+        tk2[2] ^= (tk2[0] & 0xaaaaaaaa);
+        tk2[2] = ((tk2[2] & 0xaaaaaaaa) >> 1) | ((tk2[2] << 1) & 0xaaaaaaaa);
+        tk3[1] ^= ((tk3[3] & 0xaaaaaaaa) >> 1);
+        tk3[1] = ((tk3[1] & 0xaaaaaaaa) >> 1) | ((tk3[1] << 1) & 0xaaaaaaaa);
+        tk[i*4+20] = tk2[3] ^ tk3[1];
+        tk[i*4+21] = tk2[0] ^ tk3[2];
+        tk[i*4+22] = tk2[1] ^ tk3[3];
+        tk[i*4+23] = tk2[2] ^ tk3[0];
+        tk2[3] ^= (tk2[1] & 0xaaaaaaaa);
+        tk2[3] = ((tk2[3] & 0xaaaaaaaa) >> 1) | ((tk2[3] << 1) & 0xaaaaaaaa);
+        tk3[0] ^= ((tk3[2] & 0xaaaaaaaa) >> 1);
+        tk3[0] = ((tk3[0] & 0xaaaaaaaa) >> 1) | ((tk3[0] << 1) & 0xaaaaaaaa);
+        tk[i*4+28] = tk2[0] ^ tk3[0];
+        tk[i*4+29] = tk2[1] ^ tk3[1];
+        tk[i*4+30] = tk2[2] ^ tk3[2];
+        tk[i*4+31] = tk2[3] ^ tk3[3];
+    }
+}
+
+/******************************************************************************
 * XOR TK with TK1 before applying the permutations.
 * The key is then rearranged to match the barrel shiftrows representation.
 ******************************************************************************/
@@ -278,19 +342,20 @@ void permute_tk(u32* tk, const u8* key, const int rounds) {
     u32 test;
     u32 tk1[4], tmp[4];
     packing(tk1, key);
-    memcpy(tmp, tk, 16);
-    tmp[0] ^= tk1[0];
-    tmp[1] ^= tk1[1];
-    tmp[2] ^= tk1[2];
-    tmp[3] ^= tk1[3];
+    tmp[0] = tk[0] ^ tk1[0];
+    tmp[1] = tk[1] ^ tk1[1];
+    tmp[2] = tk[2] ^ tk1[2];
+    tmp[3] = tk[3] ^ tk1[3];
     for(int i = 0 ; i < rounds; i += 8) {
         test = (i % 16 < 8) ? 1 : 0; //to apply the right power of P
         tk[i*4] = tmp[2] & 0xf0f0f0f0;
         tk[i*4+1] = tmp[3] & 0xf0f0f0f0;
         tk[i*4+2] = tmp[0] & 0xf0f0f0f0;
         tk[i*4+3] = tmp[1] & 0xf0f0f0f0;
-        memcpy(tmp, tk+i*4+4, 16);
-        XOR_BLOCKS(tmp, tk1);
+        tmp[0] = tk[i*4+4] ^ tk1[0];
+        tmp[1] = tk[i*4+5] ^ tk1[1];
+        tmp[2] = tk[i*4+6] ^ tk1[2];
+        tmp[3] = tk[i*4+7] ^ tk1[3];
         if (test)
             permute_tk_2(tmp); // applies P^2
         else
@@ -307,8 +372,10 @@ void permute_tk(u32* tk, const u8* key, const int rounds) {
         tk[i*4+10] |= ROR(tmp[0],12) & 0x0c0c0c0c;
         tk[i*4+11] = ROR(tmp[1],28) & 0x03030303;
         tk[i*4+11] |= ROR(tmp[1],12) & 0x0c0c0c0c;
-        memcpy(tmp, tk+i*4+12, 16);
-        XOR_BLOCKS(tmp, tk1);
+        tmp[0] = tk[i*4+12] ^ tk1[0];
+        tmp[1] = tk[i*4+13] ^ tk1[1];
+        tmp[2] = tk[i*4+14] ^ tk1[2];
+        tmp[3] = tk[i*4+15] ^ tk1[3];
         if (test)
             permute_tk_4(tmp); // applies P^4
         else
@@ -321,8 +388,10 @@ void permute_tk(u32* tk, const u8* key, const int rounds) {
         tk[i*4+17] = ROR(tmp[3], 16) & 0xf0f0f0f0;
         tk[i*4+18] = ROR(tmp[0], 16) & 0xf0f0f0f0;
         tk[i*4+19] = ROR(tmp[1], 16) & 0xf0f0f0f0;
-        memcpy(tmp, tk+i*4+20, 16);
-        XOR_BLOCKS(tmp, tk1);
+        tmp[0] = tk[i*4+20] ^ tk1[0];
+        tmp[1] = tk[i*4+21] ^ tk1[1];
+        tmp[2] = tk[i*4+22] ^ tk1[2];
+        tmp[3] = tk[i*4+23] ^ tk1[3];
         if (test)
             permute_tk_6(tmp); // applies P^6
         else
@@ -339,8 +408,10 @@ void permute_tk(u32* tk, const u8* key, const int rounds) {
         tk[i*4+26] |= ROR(tmp[0],28) & 0x0c0c0c0c;
         tk[i*4+27] = ROR(tmp[1],12) & 0x03030303;
         tk[i*4+27] |= ROR(tmp[1],28) & 0x0c0c0c0c;
-        memcpy(tmp, tk+i*4+28, 16);
-        XOR_BLOCKS(tmp, tk1);
+        tmp[0] = tk[i*4+28] ^ tk1[0];
+        tmp[1] = tk[i*4+29] ^ tk1[1];
+        tmp[2] = tk[i*4+30] ^ tk1[2];
+        tmp[3] = tk[i*4+31] ^ tk1[3];
         if (test)
             permute_tk_8(tmp); // applies P^8
         for(int j = 0; j < 4; j++) {
@@ -361,8 +432,7 @@ void permute_tk(u32* tk, const u8* key, const int rounds) {
 ******************************************************************************/
 void precompute_rtk2_3(u32* rtk, const u8* tk2, const u8 * tk3) {
     memset(rtk, 0x00, 16*SKINNY128_384_ROUNDS);
-    precompute_lfsr_tk2(rtk, tk2, SKINNY128_384_ROUNDS);
-    precompute_lfsr_tk3(rtk, tk3, SKINNY128_384_ROUNDS);
+    precompute_lfsr_tk2_3(rtk, tk2, tk3, SKINNY128_384_ROUNDS);
     permute_tk(rtk, (u8*)(rtk+8), SKINNY128_384_ROUNDS); // rtk+8 is NULL
     for(int i = 0; i < SKINNY128_384_ROUNDS; i++) { // add rconsts
         for(int j = 0; j < 4; j++)
@@ -376,4 +446,4 @@ void precompute_rtk2_3(u32* rtk, const u8* tk2, const u8 * tk3) {
 void precompute_rtk1(u32* rtk1, const u8* tk1) {
     memset(rtk1, 0x00, 16*16);
     permute_tk(rtk1, tk1, 16);
-}
\ No newline at end of file
+}
diff --git a/romulus/Implementations/crypto_aead/romulusn1+/opt32/skinny128.c b/romulus/Implementations/crypto_aead/romulusn1+/opt32/skinny128.c
index a1061d5..5902d16 100644
--- a/romulus/Implementations/crypto_aead/romulusn1+/opt32/skinny128.c
+++ b/romulus/Implementations/crypto_aead/romulusn1+/opt32/skinny128.c
@@ -93,15 +93,7 @@ void skinny128_384_plus(u8* ctext, const u8* ptext, const u32* rtk1,
     u32 tmp;                                // used in SWAPMOVE macro
     u32 state[4];                           // 128-bit state
     packing(state, ptext);                  // from byte to bitsliced representation
-    QUADRUPLE_ROUND(state, rtk1, rtk2_3);
-    QUADRUPLE_ROUND(state, rtk1+16, rtk2_3+16);
-    QUADRUPLE_ROUND(state, rtk1+32, rtk2_3+32);
-    QUADRUPLE_ROUND(state, rtk1+48, rtk2_3+48);
-    QUADRUPLE_ROUND(state, rtk1, rtk2_3+64);
-    QUADRUPLE_ROUND(state, rtk1+16, rtk2_3+80);
-    QUADRUPLE_ROUND(state, rtk1+32, rtk2_3+96);
-    QUADRUPLE_ROUND(state, rtk1+48, rtk2_3+112);
-    QUADRUPLE_ROUND(state, rtk1, rtk2_3+128);
-    QUADRUPLE_ROUND(state, rtk1+16, rtk2_3+144);
+    for(int i = 0; i < 10; i++)
+        QUADRUPLE_ROUND(state, rtk1 + (i%4)*16, rtk2_3 + i*16);
     unpacking(ctext, state);                // from bitsliced to byte representation
 }
\ No newline at end of file
diff --git a/romulus/Implementations/crypto_aead/romulusn1+/opt32/tk_schedule.c b/romulus/Implementations/crypto_aead/romulusn1+/opt32/tk_schedule.c
index 1da4277..5a20b37 100644
--- a/romulus/Implementations/crypto_aead/romulusn1+/opt32/tk_schedule.c
+++ b/romulus/Implementations/crypto_aead/romulusn1+/opt32/tk_schedule.c
@@ -260,6 +260,70 @@ void precompute_lfsr_tk3(u32* tk, const u8* key, const int rounds) {
 }
 
 /******************************************************************************
+* Precompute LFSR2(TK2) ^ LFSR3(TK3) for all round tweakeys.
+* It is equivalent to the following two function calls:
+* - precompute_lfsr_tk2(tk, t2, SKINNY128_384_ROUNDS);
+* - precompute_lfsr_tk3(tk, t3, SKINNY128_384_ROUNDS);
+* However, 'precompute_lfsr_tk2_3' can save cycles on some platforms.
+* On ARMv7, one should observe a gain of ~1k cycles per function call, which is
+* explained by the fact that fewer memory accesses to 'tk' are performed.
+*
+* To save some code size, the loop can be replaced by the following one:
+* for(int i = 0 ; i < rounds; i+=2) {
+*     lfsr2_bs(tk2);
+*     lfsr3_bs(tk3);
+*     tk[i*4+4] = tk2[0] ^ tk3[0];
+*     tk[i*4+5] = tk2[1] ^ tk3[1];
+*     tk[i*4+6] = tk2[2] ^ tk3[2];
+*     tk[i*4+7] = tk2[3] ^ tk3[3];
+* }
+* at the cost of some cycles (~260 on ARM Cortex-M).
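+*
+* In the unrolled loop below, each pair of lines that updates a tk2[] (resp.
+* tk3[]) word plays the role of one lfsr2_bs() (resp. lfsr3_bs()) call from
+* the compact loop above, e.g. for TK2:
+*     tk2[0] ^= (tk2[2] & 0xaaaaaaaa);
+*     tk2[0] = ((tk2[0] & 0xaaaaaaaa) >> 1) | ((tk2[0] << 1) & 0xaaaaaaaa);
+* while the rotation of word roles across rounds is handled by the indices
+* used when writing into 'tk' rather than by moving data.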
+******************************************************************************/
+void precompute_lfsr_tk2_3(u32* tk, const u8* t2, const u8* t3, const int rounds) {
+    u32 tmp, tk2[4], tk3[4];
+    packing(tk2, t2);
+    packing(tk3, t3);
+    tk[0] = tk2[0] ^ tk3[0];
+    tk[1] = tk2[1] ^ tk3[1];
+    tk[2] = tk2[2] ^ tk3[2];
+    tk[3] = tk2[3] ^ tk3[3];
+    for(int i = 0 ; i < rounds; i+=8) {
+        tk2[0] ^= (tk2[2] & 0xaaaaaaaa);
+        tk2[0] = ((tk2[0] & 0xaaaaaaaa) >> 1) | ((tk2[0] << 1) & 0xaaaaaaaa);
+        tk3[3] ^= ((tk3[1] & 0xaaaaaaaa) >> 1);
+        tk3[3] = ((tk3[3] & 0xaaaaaaaa) >> 1) | ((tk3[3] << 1) & 0xaaaaaaaa);
+        tk[i*4+4] = tk2[1] ^ tk3[3];
+        tk[i*4+5] = tk2[2] ^ tk3[0];
+        tk[i*4+6] = tk2[3] ^ tk3[1];
+        tk[i*4+7] = tk2[0] ^ tk3[2];
+        tk2[1] ^= (tk2[3] & 0xaaaaaaaa);
+        tk2[1] = ((tk2[1] & 0xaaaaaaaa) >> 1) | ((tk2[1] << 1) & 0xaaaaaaaa);
+        tk3[2] ^= ((tk3[0] & 0xaaaaaaaa) >> 1);
+        tk3[2] = ((tk3[2] & 0xaaaaaaaa) >> 1) | ((tk3[2] << 1) & 0xaaaaaaaa);
+        tk[i*4+12] = tk2[2] ^ tk3[2];
+        tk[i*4+13] = tk2[3] ^ tk3[3];
+        tk[i*4+14] = tk2[0] ^ tk3[0];
+        tk[i*4+15] = tk2[1] ^ tk3[1];
+        tk2[2] ^= (tk2[0] & 0xaaaaaaaa);
+        tk2[2] = ((tk2[2] & 0xaaaaaaaa) >> 1) | ((tk2[2] << 1) & 0xaaaaaaaa);
+        tk3[1] ^= ((tk3[3] & 0xaaaaaaaa) >> 1);
+        tk3[1] = ((tk3[1] & 0xaaaaaaaa) >> 1) | ((tk3[1] << 1) & 0xaaaaaaaa);
+        tk[i*4+20] = tk2[3] ^ tk3[1];
+        tk[i*4+21] = tk2[0] ^ tk3[2];
+        tk[i*4+22] = tk2[1] ^ tk3[3];
+        tk[i*4+23] = tk2[2] ^ tk3[0];
+        tk2[3] ^= (tk2[1] & 0xaaaaaaaa);
+        tk2[3] = ((tk2[3] & 0xaaaaaaaa) >> 1) | ((tk2[3] << 1) & 0xaaaaaaaa);
+        tk3[0] ^= ((tk3[2] & 0xaaaaaaaa) >> 1);
+        tk3[0] = ((tk3[0] & 0xaaaaaaaa) >> 1) | ((tk3[0] << 1) & 0xaaaaaaaa);
+        tk[i*4+28] = tk2[0] ^ tk3[0];
+        tk[i*4+29] = tk2[1] ^ tk3[1];
+        tk[i*4+30] = tk2[2] ^ tk3[2];
+        tk[i*4+31] = tk2[3] ^ tk3[3];
+    }
+}
+
+/******************************************************************************
 * XOR TK with TK1 before applying the permutations.
 * The key is then rearranged to match the barrel shiftrows representation.
 ******************************************************************************/
@@ -267,19 +331,20 @@ void permute_tk(u32* tk, const u8* key, const int rounds) {
     u32 test;
     u32 tk1[4], tmp[4];
     packing(tk1, key);
-    memcpy(tmp, tk, 16);
-    tmp[0] ^= tk1[0];
-    tmp[1] ^= tk1[1];
-    tmp[2] ^= tk1[2];
-    tmp[3] ^= tk1[3];
+    tmp[0] = tk[0] ^ tk1[0];
+    tmp[1] = tk[1] ^ tk1[1];
+    tmp[2] = tk[2] ^ tk1[2];
+    tmp[3] = tk[3] ^ tk1[3];
     for(int i = 0 ; i < rounds; i += 8) {
         test = (i % 16 < 8) ? 1 : 0; //to apply the right power of P
         tk[i*4] = tmp[2] & 0xf0f0f0f0;
         tk[i*4+1] = tmp[3] & 0xf0f0f0f0;
         tk[i*4+2] = tmp[0] & 0xf0f0f0f0;
         tk[i*4+3] = tmp[1] & 0xf0f0f0f0;
-        memcpy(tmp, tk+i*4+4, 16);
-        XOR_BLOCKS(tmp, tk1);
+        tmp[0] = tk[i*4+4] ^ tk1[0];
+        tmp[1] = tk[i*4+5] ^ tk1[1];
+        tmp[2] = tk[i*4+6] ^ tk1[2];
+        tmp[3] = tk[i*4+7] ^ tk1[3];
         if (test)
             permute_tk_2(tmp); // applies P^2
         else
@@ -296,8 +361,10 @@ void permute_tk(u32* tk, const u8* key, const int rounds) {
         tk[i*4+10] |= ROR(tmp[0],12) & 0x0c0c0c0c;
         tk[i*4+11] = ROR(tmp[1],28) & 0x03030303;
         tk[i*4+11] |= ROR(tmp[1],12) & 0x0c0c0c0c;
-        memcpy(tmp, tk+i*4+12, 16);
-        XOR_BLOCKS(tmp, tk1);
+        tmp[0] = tk[i*4+12] ^ tk1[0];
+        tmp[1] = tk[i*4+13] ^ tk1[1];
+        tmp[2] = tk[i*4+14] ^ tk1[2];
+        tmp[3] = tk[i*4+15] ^ tk1[3];
         if (test)
             permute_tk_4(tmp); // applies P^4
         else
@@ -310,8 +377,10 @@ void permute_tk(u32* tk, const u8* key, const int rounds) {
         tk[i*4+17] = ROR(tmp[3], 16) & 0xf0f0f0f0;
         tk[i*4+18] = ROR(tmp[0], 16) & 0xf0f0f0f0;
         tk[i*4+19] = ROR(tmp[1], 16) & 0xf0f0f0f0;
-        memcpy(tmp, tk+i*4+20, 16);
-        XOR_BLOCKS(tmp, tk1);
+        tmp[0] = tk[i*4+20] ^ tk1[0];
+        tmp[1] = tk[i*4+21] ^ tk1[1];
+        tmp[2] = tk[i*4+22] ^ tk1[2];
+        tmp[3] = tk[i*4+23] ^ tk1[3];
         if (test)
             permute_tk_6(tmp); // applies P^6
         else
@@ -328,8 +397,10 @@ void permute_tk(u32* tk, const u8* key, const int rounds) {
         tk[i*4+26] |= ROR(tmp[0],28) & 0x0c0c0c0c;
         tk[i*4+27] = ROR(tmp[1],12) & 0x03030303;
         tk[i*4+27] |= ROR(tmp[1],28) & 0x0c0c0c0c;
-        memcpy(tmp, tk+i*4+28, 16);
-        XOR_BLOCKS(tmp, tk1);
+        tmp[0] = tk[i*4+28] ^ tk1[0];
+        tmp[1] = tk[i*4+29] ^ tk1[1];
+        tmp[2] = tk[i*4+30] ^ tk1[2];
+        tmp[3] = tk[i*4+31] ^ tk1[3];
         if (test)
             permute_tk_8(tmp); // applies P^8
         for(int j = 0; j < 4; j++) {
@@ -350,8 +421,7 @@ void permute_tk(u32* tk, const u8* key, const int rounds) {
 ******************************************************************************/
 void precompute_rtk2_3(u32* rtk, const u8* tk2, const u8 * tk3) {
     memset(rtk, 0x00, 16*SKINNY128_384_ROUNDS);
-    precompute_lfsr_tk2(rtk, tk2, SKINNY128_384_ROUNDS);
-    precompute_lfsr_tk3(rtk, tk3, SKINNY128_384_ROUNDS);
+    precompute_lfsr_tk2_3(rtk, tk2, tk3, SKINNY128_384_ROUNDS);
     permute_tk(rtk, (u8*)(rtk+8), SKINNY128_384_ROUNDS); // rtk+8 is NULL
     for(int i = 0; i < SKINNY128_384_ROUNDS; i++) { // add rconsts
         for(int j = 0; j < 4; j++)
diff --git a/romulus/Implementations/crypto_aead/romulusn1/opt32/skinny128.c b/romulus/Implementations/crypto_aead/romulusn1/opt32/skinny128.c
index 87718e3..326d617 100644
--- a/romulus/Implementations/crypto_aead/romulusn1/opt32/skinny128.c
+++ b/romulus/Implementations/crypto_aead/romulusn1/opt32/skinny128.c
@@ -92,19 +92,7 @@ void skinny128_384(u8* ctext, const u8* ptext, const u32* rtk1, const u32* rtk2
     u32 tmp;                                // used in SWAPMOVE macro
     u32 state[4];                           // 128-bit state
     packing(state, ptext);                  // from byte to bitsliced representation
-    QUADRUPLE_ROUND(state, rtk1, rtk2_3);
-    QUADRUPLE_ROUND(state, rtk1+16, rtk2_3+16);
-    QUADRUPLE_ROUND(state, rtk1+32, rtk2_3+32);
-    QUADRUPLE_ROUND(state, rtk1+48, rtk2_3+48);
-    QUADRUPLE_ROUND(state, rtk1, rtk2_3+64);
-    QUADRUPLE_ROUND(state, rtk1+16, rtk2_3+80);
-    QUADRUPLE_ROUND(state, rtk1+32, rtk2_3+96);
-    QUADRUPLE_ROUND(state, rtk1+48, rtk2_3+112);
-    QUADRUPLE_ROUND(state, rtk1, rtk2_3+128);
-    QUADRUPLE_ROUND(state, rtk1+16, rtk2_3+144);
-    QUADRUPLE_ROUND(state, rtk1+32, rtk2_3+160);
-    QUADRUPLE_ROUND(state, rtk1+48, rtk2_3+176);
-    QUADRUPLE_ROUND(state, rtk1, rtk2_3+192);
-    QUADRUPLE_ROUND(state, rtk1+16, rtk2_3+208);
+    for(int i = 0; i < 14; i++)
+        QUADRUPLE_ROUND(state, rtk1 + (i%4)*16, rtk2_3 + i*16);
     unpacking(ctext, state);                // from bitsliced to byte representation
 }
\ No newline at end of file
diff --git a/romulus/Implementations/crypto_aead/romulusn1/opt32/tk_schedule.c b/romulus/Implementations/crypto_aead/romulusn1/opt32/tk_schedule.c
index c818cf2..0c364f3 100644
--- a/romulus/Implementations/crypto_aead/romulusn1/opt32/tk_schedule.c
+++ b/romulus/Implementations/crypto_aead/romulusn1/opt32/tk_schedule.c
@@ -271,6 +271,70 @@ void precompute_lfsr_tk3(u32* tk, const u8* key, const int rounds) {
 }
 
 /******************************************************************************
+* Precompute LFSR2(TK2) ^ LFSR3(TK3) for all round tweakeys.
+* It is equivalent to the following two function calls:
+* - precompute_lfsr_tk2(tk, t2, SKINNY128_384_ROUNDS);
+* - precompute_lfsr_tk3(tk, t3, SKINNY128_384_ROUNDS);
+* However, 'precompute_lfsr_tk2_3' can save cycles on some platforms.
+* On ARMv7, one should observe a gain of ~1k cycles per function call, which is
+* explained by the fact that fewer memory accesses to 'tk' are performed.
+*
+* To save some code size, the loop can be replaced by the following one:
+* for(int i = 0 ; i < rounds; i+=2) {
+*     lfsr2_bs(tk2);
+*     lfsr3_bs(tk3);
+*     tk[i*4+4] = tk2[0] ^ tk3[0];
+*     tk[i*4+5] = tk2[1] ^ tk3[1];
+*     tk[i*4+6] = tk2[2] ^ tk3[2];
+*     tk[i*4+7] = tk2[3] ^ tk3[3];
+* }
+* at the cost of some cycles (~260 on ARM Cortex-M).
+******************************************************************************/
+void precompute_lfsr_tk2_3(u32* tk, const u8* t2, const u8* t3, const int rounds) {
+    u32 tmp, tk2[4], tk3[4];
+    packing(tk2, t2);
+    packing(tk3, t3);
+    tk[0] = tk2[0] ^ tk3[0];
+    tk[1] = tk2[1] ^ tk3[1];
+    tk[2] = tk2[2] ^ tk3[2];
+    tk[3] = tk2[3] ^ tk3[3];
+    for(int i = 0 ; i < rounds; i+=8) {
+        tk2[0] ^= (tk2[2] & 0xaaaaaaaa);
+        tk2[0] = ((tk2[0] & 0xaaaaaaaa) >> 1) | ((tk2[0] << 1) & 0xaaaaaaaa);
+        tk3[3] ^= ((tk3[1] & 0xaaaaaaaa) >> 1);
+        tk3[3] = ((tk3[3] & 0xaaaaaaaa) >> 1) | ((tk3[3] << 1) & 0xaaaaaaaa);
+        tk[i*4+4] = tk2[1] ^ tk3[3];
+        tk[i*4+5] = tk2[2] ^ tk3[0];
+        tk[i*4+6] = tk2[3] ^ tk3[1];
+        tk[i*4+7] = tk2[0] ^ tk3[2];
+        tk2[1] ^= (tk2[3] & 0xaaaaaaaa);
+        tk2[1] = ((tk2[1] & 0xaaaaaaaa) >> 1) | ((tk2[1] << 1) & 0xaaaaaaaa);
+        tk3[2] ^= ((tk3[0] & 0xaaaaaaaa) >> 1);
+        tk3[2] = ((tk3[2] & 0xaaaaaaaa) >> 1) | ((tk3[2] << 1) & 0xaaaaaaaa);
+        tk[i*4+12] = tk2[2] ^ tk3[2];
+        tk[i*4+13] = tk2[3] ^ tk3[3];
+        tk[i*4+14] = tk2[0] ^ tk3[0];
+        tk[i*4+15] = tk2[1] ^ tk3[1];
+        tk2[2] ^= (tk2[0] & 0xaaaaaaaa);
+        tk2[2] = ((tk2[2] & 0xaaaaaaaa) >> 1) | ((tk2[2] << 1) & 0xaaaaaaaa);
+        tk3[1] ^= ((tk3[3] & 0xaaaaaaaa) >> 1);
+        tk3[1] = ((tk3[1] & 0xaaaaaaaa) >> 1) | ((tk3[1] << 1) & 0xaaaaaaaa);
+        tk[i*4+20] = tk2[3] ^ tk3[1];
+        tk[i*4+21] = tk2[0] ^ tk3[2];
+        tk[i*4+22] = tk2[1] ^ tk3[3];
+        tk[i*4+23] = tk2[2] ^ tk3[0];
+        tk2[3] ^= (tk2[1] & 0xaaaaaaaa);
+        tk2[3] = ((tk2[3] & 0xaaaaaaaa) >> 1) | ((tk2[3] << 1) & 0xaaaaaaaa);
+        tk3[0] ^= ((tk3[2] & 0xaaaaaaaa) >> 1);
+        tk3[0] = ((tk3[0] & 0xaaaaaaaa) >> 1) | ((tk3[0] << 1) & 0xaaaaaaaa);
+        tk[i*4+28] = tk2[0] ^ tk3[0];
+        tk[i*4+29] = tk2[1] ^ tk3[1];
+        tk[i*4+30] = tk2[2] ^ tk3[2];
+        tk[i*4+31] = tk2[3] ^ tk3[3];
+    }
+}
+
+/******************************************************************************
 * XOR TK with TK1 before applying the permutations.
 * The key is then rearranged to match the barrel shiftrows representation.
 ******************************************************************************/
@@ -278,19 +342,20 @@ void permute_tk(u32* tk, const u8* key, const int rounds) {
     u32 test;
     u32 tk1[4], tmp[4];
     packing(tk1, key);
-    memcpy(tmp, tk, 16);
-    tmp[0] ^= tk1[0];
-    tmp[1] ^= tk1[1];
-    tmp[2] ^= tk1[2];
-    tmp[3] ^= tk1[3];
+    tmp[0] = tk[0] ^ tk1[0];
+    tmp[1] = tk[1] ^ tk1[1];
+    tmp[2] = tk[2] ^ tk1[2];
+    tmp[3] = tk[3] ^ tk1[3];
     for(int i = 0 ; i < rounds; i += 8) {
         test = (i % 16 < 8) ? 1 : 0; //to apply the right power of P
         tk[i*4] = tmp[2] & 0xf0f0f0f0;
         tk[i*4+1] = tmp[3] & 0xf0f0f0f0;
         tk[i*4+2] = tmp[0] & 0xf0f0f0f0;
         tk[i*4+3] = tmp[1] & 0xf0f0f0f0;
-        memcpy(tmp, tk+i*4+4, 16);
-        XOR_BLOCKS(tmp, tk1);
+        tmp[0] = tk[i*4+4] ^ tk1[0];
+        tmp[1] = tk[i*4+5] ^ tk1[1];
+        tmp[2] = tk[i*4+6] ^ tk1[2];
+        tmp[3] = tk[i*4+7] ^ tk1[3];
         if (test)
             permute_tk_2(tmp); // applies P^2
         else
@@ -307,8 +372,10 @@ void permute_tk(u32* tk, const u8* key, const int rounds) {
         tk[i*4+10] |= ROR(tmp[0],12) & 0x0c0c0c0c;
         tk[i*4+11] = ROR(tmp[1],28) & 0x03030303;
         tk[i*4+11] |= ROR(tmp[1],12) & 0x0c0c0c0c;
-        memcpy(tmp, tk+i*4+12, 16);
-        XOR_BLOCKS(tmp, tk1);
+        tmp[0] = tk[i*4+12] ^ tk1[0];
+        tmp[1] = tk[i*4+13] ^ tk1[1];
+        tmp[2] = tk[i*4+14] ^ tk1[2];
+        tmp[3] = tk[i*4+15] ^ tk1[3];
         if (test)
             permute_tk_4(tmp); // applies P^4
         else
@@ -321,8 +388,10 @@ void permute_tk(u32* tk, const u8* key, const int rounds) {
         tk[i*4+17] = ROR(tmp[3], 16) & 0xf0f0f0f0;
         tk[i*4+18] = ROR(tmp[0], 16) & 0xf0f0f0f0;
         tk[i*4+19] = ROR(tmp[1], 16) & 0xf0f0f0f0;
-        memcpy(tmp, tk+i*4+20, 16);
-        XOR_BLOCKS(tmp, tk1);
+        tmp[0] = tk[i*4+20] ^ tk1[0];
+        tmp[1] = tk[i*4+21] ^ tk1[1];
+        tmp[2] = tk[i*4+22] ^ tk1[2];
+        tmp[3] = tk[i*4+23] ^ tk1[3];
         if (test)
             permute_tk_6(tmp); // applies P^6
         else
@@ -339,8 +408,10 @@ void permute_tk(u32* tk, const u8* key, const int rounds) {
         tk[i*4+26] |= ROR(tmp[0],28) & 0x0c0c0c0c;
         tk[i*4+27] = ROR(tmp[1],12) & 0x03030303;
         tk[i*4+27] |= ROR(tmp[1],28) & 0x0c0c0c0c;
-        memcpy(tmp, tk+i*4+28, 16);
-        XOR_BLOCKS(tmp, tk1);
+        tmp[0] = tk[i*4+28] ^ tk1[0];
+        tmp[1] = tk[i*4+29] ^ tk1[1];
+        tmp[2] = tk[i*4+30] ^ tk1[2];
+        tmp[3] = tk[i*4+31] ^ tk1[3];
         if (test)
             permute_tk_8(tmp); // applies P^8
         for(int j = 0; j < 4; j++) {
@@ -361,8 +432,7 @@ void permute_tk(u32* tk, const u8* key, const int rounds) {
 ******************************************************************************/
 void precompute_rtk2_3(u32* rtk, const u8* tk2, const u8 * tk3) {
     memset(rtk, 0x00, 16*SKINNY128_384_ROUNDS);
-    precompute_lfsr_tk2(rtk, tk2, SKINNY128_384_ROUNDS);
-    precompute_lfsr_tk3(rtk, tk3, SKINNY128_384_ROUNDS);
+    precompute_lfsr_tk2_3(rtk, tk2, tk3, SKINNY128_384_ROUNDS);
     permute_tk(rtk, (u8*)(rtk+8), SKINNY128_384_ROUNDS); // rtk+8 is NULL
     for(int i = 0; i < SKINNY128_384_ROUNDS; i++) { // add rconsts
         for(int j = 0; j < 4; j++)
@@ -376,4 +446,4 @@ void precompute_rtk2_3(u32* rtk, const u8* tk2, const u8 * tk3) {
 void precompute_rtk1(u32* rtk1, const u8* tk1) {
     memset(rtk1, 0x00, 16*16);
     permute_tk(rtk1, tk1, 16);
-}
\ No newline at end of file
+}
-- 
libgit2 0.26.0
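
Note: as long as 'precompute_lfsr_tk2' and 'precompute_lfsr_tk3' are still compiled in,
the fused schedule can be sanity-checked against the pair of calls it replaces. A minimal
check along these lines (a sketch only; the header name and the u8/u32 typedefs are
assumed to be provided by the existing sources):

    #include <string.h>
    #include "tk_schedule.h"  // assumed to declare u8, u32 and the functions used below

    // Returns 1 if the fused LFSR2/LFSR3 precomputation matches the two-pass version.
    int check_lfsr_tk2_3(const u8 t2[16], const u8 t3[16]) {
        u32 ref[4*SKINNY128_384_ROUNDS];   // 16 bytes per round, as in precompute_rtk2_3
        u32 fused[4*SKINNY128_384_ROUNDS];
        memset(ref, 0x00, sizeof(ref));
        memset(fused, 0x00, sizeof(fused));
        precompute_lfsr_tk2(ref, t2, SKINNY128_384_ROUNDS);          // reference: two passes
        precompute_lfsr_tk3(ref, t3, SKINNY128_384_ROUNDS);
        precompute_lfsr_tk2_3(fused, t2, t3, SKINNY128_384_ROUNDS);  // candidate: one pass
        return memcmp(ref, fused, sizeof(ref)) == 0;
    }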