diff --git a/skinny/Implementations/crypto_aead/skinnyaeadtk3128128+v1/opt32_1/encrypt.c b/skinny/Implementations/crypto_aead/skinnyaeadtk3128128+v1/opt32_1/encrypt.c index 4a1b26e..fbe0318 100644 --- a/skinny/Implementations/crypto_aead/skinnyaeadtk3128128+v1/opt32_1/encrypt.c +++ b/skinny/Implementations/crypto_aead/skinnyaeadtk3128128+v1/opt32_1/encrypt.c @@ -8,12 +8,10 @@ * @author Alexandre Adomnicai, Nanyang Technological University, * alexandre.adomnicai@ntu.edu.sg * -* @date May 2020 +* @date June 2020 ******************************************************************************/ -#include "skinny128.h" #include "skinnyaead.h" #include -#include /****************************************************************************** * x ^= y where x, y are 128-bit blocks (16 bytes array). diff --git a/skinny/Implementations/crypto_aead/skinnyaeadtk3128128+v1/opt32_1/skinny128.c b/skinny/Implementations/crypto_aead/skinnyaeadtk3128128+v1/opt32_1/skinny128.c index 2082889..d37c68f 100644 --- a/skinny/Implementations/crypto_aead/skinnyaeadtk3128128+v1/opt32_1/skinny128.c +++ b/skinny/Implementations/crypto_aead/skinnyaeadtk3128128+v1/opt32_1/skinny128.c @@ -16,12 +16,9 @@ * @author Alexandre Adomnicai, Nanyang Technological University, * alexandre.adomnicai@ntu.edu.sg * -* @date May 2020 +* @date June 2020 ******************************************************************************/ -#include -#include #include "skinny128.h" -#include "tk_schedule.h" /****************************************************************************** * The MixColumns computation for rounds i such that (i % 4) == 0 @@ -153,16 +150,8 @@ void skinny128_384_plus_encrypt(u8* ctext, const u8* ptext, const u32* rtk1, u32 tmp; // used in SWAPMOVE macro u32 state[4]; // 128-bit state packing(state, ptext); // from byte to bitsliced representation - QUADRUPLE_ROUND(state, rtk1, rtk2_3); - QUADRUPLE_ROUND(state, rtk1+16, rtk2_3+16); - QUADRUPLE_ROUND(state, rtk1+32, rtk2_3+32); - QUADRUPLE_ROUND(state, rtk1+48, rtk2_3+48); - QUADRUPLE_ROUND(state, rtk1, rtk2_3+64); - QUADRUPLE_ROUND(state, rtk1+16, rtk2_3+80); - QUADRUPLE_ROUND(state, rtk1+32, rtk2_3+96); - QUADRUPLE_ROUND(state, rtk1+48, rtk2_3+112); - QUADRUPLE_ROUND(state, rtk1, rtk2_3+128); - QUADRUPLE_ROUND(state, rtk1+16, rtk2_3+144); + for(int i = 0; i < 10; i++) + QUADRUPLE_ROUND(state, rtk1 + (i%4)*16, rtk2_3 + i*16); unpacking(ctext, state); // from bitsliced to byte representation } @@ -176,15 +165,7 @@ void skinny128_384_plus_decrypt(u8* ctext, const u8* ptext, const u32* rtk1, u32 tmp; // used in SWAPMOVE macro u32 state[4]; // 128-bit state packing(state, ptext); // from byte to bitsliced representation - INV_QUADRUPLE_ROUND(state, rtk1+16, rtk2_3+144); - INV_QUADRUPLE_ROUND(state, rtk1, rtk2_3+128); - INV_QUADRUPLE_ROUND(state, rtk1+48, rtk2_3+112); - INV_QUADRUPLE_ROUND(state, rtk1+32, rtk2_3+96); - INV_QUADRUPLE_ROUND(state, rtk1+16, rtk2_3+80); - INV_QUADRUPLE_ROUND(state, rtk1, rtk2_3+64); - INV_QUADRUPLE_ROUND(state, rtk1+48, rtk2_3+48); - INV_QUADRUPLE_ROUND(state, rtk1+32, rtk2_3+32); - INV_QUADRUPLE_ROUND(state, rtk1+16, rtk2_3+16); - INV_QUADRUPLE_ROUND(state, rtk1, rtk2_3); + for(int i = 9; i >= 0; i--) + INV_QUADRUPLE_ROUND(state, rtk1 + (i%4)*16, rtk2_3 + i*16); unpacking(ctext, state); // from bitsliced to byte representation } \ No newline at end of file diff --git a/skinny/Implementations/crypto_aead/skinnyaeadtk3128128+v1/opt32_1/skinnyaead.h b/skinny/Implementations/crypto_aead/skinnyaeadtk3128128+v1/opt32_1/skinnyaead.h index 5500af8..b6ffcf4 100644 --- a/skinny/Implementations/crypto_aead/skinnyaeadtk3128128+v1/opt32_1/skinnyaead.h +++ b/skinny/Implementations/crypto_aead/skinnyaeadtk3128128+v1/opt32_1/skinnyaead.h @@ -3,9 +3,7 @@ #include "skinny128.h" -typedef unsigned char u8; -typedef unsigned int u32; -typedef unsigned long long u64; +typedef uint64_t u64; #define TAGBYTES 16 #define KEYBYTES 16 diff --git a/skinny/Implementations/crypto_aead/skinnyaeadtk3128128+v1/opt32_1/tk_schedule.c b/skinny/Implementations/crypto_aead/skinnyaeadtk3128128+v1/opt32_1/tk_schedule.c index 1da4277..7a1111b 100644 --- a/skinny/Implementations/crypto_aead/skinnyaeadtk3128128+v1/opt32_1/tk_schedule.c +++ b/skinny/Implementations/crypto_aead/skinnyaeadtk3128128+v1/opt32_1/tk_schedule.c @@ -4,16 +4,11 @@ * @author Alexandre Adomnicai, Nanyang Technological University, * alexandre.adomnicai@ntu.edu.sg * -* @date May 2020 +* @date June 2020 ******************************************************************************/ -#include -#include //for memcmp -#include "tk_schedule.h" +#include #include "skinny128.h" -typedef unsigned char u8; -typedef unsigned int u32; - /****************************************************************************** * The round constants according to the new representation. ******************************************************************************/ @@ -260,6 +255,70 @@ void precompute_lfsr_tk3(u32* tk, const u8* key, const int rounds) { } /****************************************************************************** +* Precompute LFSR2(TK2) ^ LFSR3(TK3) for all round tweakeys. +* It is equivalent to the following 2 function calls: +* - precompute_lfsr_tk2(tk, t2, SKINNY128_384_ROUNDS); +* - precompute_lfsr_tk3(tk, t3, SKINNY128_384_ROUNDS); +* However 'precompute_lfsr_tk2_3' can allow to save cycles on some platform. +* On ARMv7 one should observe a gain of ~1k cycles per function call. It can be +* explained by the fact that less memory accesses to 'tk' are computed. +* +* To save some code size, the loop can be replaced by the following one: +* for(int i = 0 ; i < rounds; i+=2) { +* lfsr2_bs(tk2); +* lfsr3_bs(tk3); +* tk[i*4+4] = tk2[0] ^ tk3[0]; +* tk[i*4+5] = tk2[1] ^ tk3[1]; +* tk[i*4+6] = tk2[2] ^ tk3[2]; +* tk[i*4+7] = tk2[3] ^ tk3[3]; +* } +* at the cost of some cycles (~260 on ARM Cortex-M). +******************************************************************************/ +void precompute_lfsr_tk2_3(u32* tk, const u8* t2, const u8* t3, const int rounds) { + u32 tk2[4], tk3[4]; + packing(tk2, t2); + packing(tk3, t3); + tk[0] = tk2[0] ^ tk3[0]; + tk[1] = tk2[1] ^ tk3[1]; + tk[2] = tk2[2] ^ tk3[2]; + tk[3] = tk2[3] ^ tk3[3]; + for(int i = 0 ; i < rounds; i+=8) { + tk2[0] ^= (tk2[2] & 0xaaaaaaaa); + tk2[0] = ((tk2[0] & 0xaaaaaaaa) >> 1) | ((tk2[0] << 1) & 0xaaaaaaaa); + tk3[3] ^= ((tk3[1] & 0xaaaaaaaa) >> 1); + tk3[3] = ((tk3[3] & 0xaaaaaaaa) >> 1) | ((tk3[3] << 1) & 0xaaaaaaaa); + tk[i*4+4] = tk2[1] ^ tk3[3]; + tk[i*4+5] = tk2[2] ^ tk3[0]; + tk[i*4+6] = tk2[3] ^ tk3[1]; + tk[i*4+7] = tk2[0] ^ tk3[2]; + tk2[1] ^= (tk2[3] & 0xaaaaaaaa); + tk2[1] = ((tk2[1] & 0xaaaaaaaa) >> 1) | ((tk2[1] << 1) & 0xaaaaaaaa); + tk3[2] ^= ((tk3[0] & 0xaaaaaaaa) >> 1); + tk3[2] = ((tk3[2] & 0xaaaaaaaa) >> 1) | ((tk3[2] << 1) & 0xaaaaaaaa); + tk[i*4+12] = tk2[2] ^ tk3[2]; + tk[i*4+13] = tk2[3] ^ tk3[3]; + tk[i*4+14] = tk2[0] ^ tk3[0]; + tk[i*4+15] = tk2[1] ^ tk3[1]; + tk2[2] ^= (tk2[0] & 0xaaaaaaaa); + tk2[2] = ((tk2[2] & 0xaaaaaaaa) >> 1) | ((tk2[2] << 1) & 0xaaaaaaaa); + tk3[1] ^= ((tk3[3] & 0xaaaaaaaa) >> 1); + tk3[1] = ((tk3[1] & 0xaaaaaaaa) >> 1) | ((tk3[1] << 1) & 0xaaaaaaaa); + tk[i*4+20] = tk2[3] ^ tk3[1]; + tk[i*4+21] = tk2[0] ^ tk3[2]; + tk[i*4+22] = tk2[1] ^ tk3[3]; + tk[i*4+23] = tk2[2] ^ tk3[0]; + tk2[3] ^= (tk2[1] & 0xaaaaaaaa); + tk2[3] = ((tk2[3] & 0xaaaaaaaa) >> 1) | ((tk2[3] << 1) & 0xaaaaaaaa); + tk3[0] ^= ((tk3[2] & 0xaaaaaaaa) >> 1); + tk3[0] = ((tk3[0] & 0xaaaaaaaa) >> 1) | ((tk3[0] << 1) & 0xaaaaaaaa); + tk[i*4+28] = tk2[0] ^ tk3[0]; + tk[i*4+29] = tk2[1] ^ tk3[1]; + tk[i*4+30] = tk2[2] ^ tk3[2]; + tk[i*4+31] = tk2[3] ^ tk3[3]; + } +} + +/****************************************************************************** * XOR TK with TK1 before applying the permutations. * The key is then rearranged to match the barrel shiftrows representation. ******************************************************************************/ @@ -267,19 +326,20 @@ void permute_tk(u32* tk, const u8* key, const int rounds) { u32 test; u32 tk1[4], tmp[4]; packing(tk1, key); - memcpy(tmp, tk, 16); - tmp[0] ^= tk1[0]; - tmp[1] ^= tk1[1]; - tmp[2] ^= tk1[2]; - tmp[3] ^= tk1[3]; + tmp[0] = tk[0] ^ tk1[0]; + tmp[1] = tk[1] ^ tk1[1]; + tmp[2] = tk[2] ^ tk1[2]; + tmp[3] = tk[3] ^ tk1[3]; for(int i = 0 ; i < rounds; i += 8) { test = (i % 16 < 8) ? 1 : 0; //to apply the right power of P tk[i*4] = tmp[2] & 0xf0f0f0f0; tk[i*4+1] = tmp[3] & 0xf0f0f0f0; tk[i*4+2] = tmp[0] & 0xf0f0f0f0; tk[i*4+3] = tmp[1] & 0xf0f0f0f0; - memcpy(tmp, tk+i*4+4, 16); - XOR_BLOCKS(tmp, tk1); + tmp[0] = tk[i*4+4] ^ tk1[0]; + tmp[1] = tk[i*4+5] ^ tk1[1]; + tmp[2] = tk[i*4+6] ^ tk1[2]; + tmp[3] = tk[i*4+7] ^ tk1[3]; if (test) permute_tk_2(tmp); // applies P^2 else @@ -296,8 +356,10 @@ void permute_tk(u32* tk, const u8* key, const int rounds) { tk[i*4+10] |= ROR(tmp[0],12) & 0x0c0c0c0c; tk[i*4+11] = ROR(tmp[1],28) & 0x03030303; tk[i*4+11] |= ROR(tmp[1],12) & 0x0c0c0c0c; - memcpy(tmp, tk+i*4+12, 16); - XOR_BLOCKS(tmp, tk1); + tmp[0] = tk[i*4+12] ^ tk1[0]; + tmp[1] = tk[i*4+13] ^ tk1[1]; + tmp[2] = tk[i*4+14] ^ tk1[2]; + tmp[3] = tk[i*4+15] ^ tk1[3]; if (test) permute_tk_4(tmp); // applies P^4 else @@ -310,8 +372,10 @@ void permute_tk(u32* tk, const u8* key, const int rounds) { tk[i*4+17] = ROR(tmp[3], 16) & 0xf0f0f0f0; tk[i*4+18] = ROR(tmp[0], 16) & 0xf0f0f0f0; tk[i*4+19] = ROR(tmp[1], 16) & 0xf0f0f0f0; - memcpy(tmp, tk+i*4+20, 16); - XOR_BLOCKS(tmp, tk1); + tmp[0] = tk[i*4+20] ^ tk1[0]; + tmp[1] = tk[i*4+21] ^ tk1[1]; + tmp[2] = tk[i*4+22] ^ tk1[2]; + tmp[3] = tk[i*4+23] ^ tk1[3]; if (test) permute_tk_6(tmp); // applies P^6 else @@ -328,8 +392,10 @@ void permute_tk(u32* tk, const u8* key, const int rounds) { tk[i*4+26] |= ROR(tmp[0],28) & 0x0c0c0c0c; tk[i*4+27] = ROR(tmp[1],12) & 0x03030303; tk[i*4+27] |= ROR(tmp[1],28) & 0x0c0c0c0c; - memcpy(tmp, tk+i*4+28, 16); - XOR_BLOCKS(tmp, tk1); + tmp[0] = tk[i*4+28] ^ tk1[0]; + tmp[1] = tk[i*4+29] ^ tk1[1]; + tmp[2] = tk[i*4+30] ^ tk1[2]; + tmp[3] = tk[i*4+31] ^ tk1[3]; if (test) permute_tk_8(tmp); // applies P^8 for(int j = 0; j < 4; j++) { @@ -350,8 +416,7 @@ void permute_tk(u32* tk, const u8* key, const int rounds) { ******************************************************************************/ void precompute_rtk2_3(u32* rtk, const u8* tk2, const u8 * tk3) { memset(rtk, 0x00, 16*SKINNY128_384_ROUNDS); - precompute_lfsr_tk2(rtk, tk2, SKINNY128_384_ROUNDS); - precompute_lfsr_tk3(rtk, tk3, SKINNY128_384_ROUNDS); + precompute_lfsr_tk2_3(rtk, tk2, tk3, SKINNY128_384_ROUNDS); permute_tk(rtk, (u8*)(rtk+8), SKINNY128_384_ROUNDS); // rtk+8 is NULL for(int i = 0; i < SKINNY128_384_ROUNDS; i++) { // add rconsts for(int j = 0; j < 4; j++) diff --git a/skinny/Implementations/crypto_aead/skinnyaeadtk3128128+v1/opt32_1/tk_schedule.h b/skinny/Implementations/crypto_aead/skinnyaeadtk3128128+v1/opt32_1/tk_schedule.h index 5615cbd..d5acc39 100644 --- a/skinny/Implementations/crypto_aead/skinnyaeadtk3128128+v1/opt32_1/tk_schedule.h +++ b/skinny/Implementations/crypto_aead/skinnyaeadtk3128128+v1/opt32_1/tk_schedule.h @@ -1,22 +1,16 @@ #ifndef TK_SCHEDULE_H_ #define TK_SCHEDULE_H_ -typedef unsigned char u8; -typedef unsigned int u32; +#include +typedef uint8_t u8; +typedef uint32_t u32; void packing(u32* out, const u8* in); void unpacking(u8* out, u32 *in); void precompute_rtk2_3(u32* rtk, const u8* tk2, const u8* tk3); void precompute_rtk1(u32* rtk1, const u8* tk1); -#define ROR(x,y) (((x) >> (y)) | ((x) << (32 - (y)))) - -#define XOR_BLOCKS(x,y) ({ \ - (x)[0] ^= (y)[0]; \ - (x)[1] ^= (y)[1]; \ - (x)[2] ^= (y)[2]; \ - (x)[3] ^= (y)[3]; \ -}) +#define ROR(x,y) (((x) >> (y)) | ((x) << (32 - (y)))) #define SWAPMOVE(a, b, mask, n) ({ \ tmp = (b ^ (a >> n)) & mask; \ diff --git a/skinny/Implementations/crypto_aead/skinnyaeadtk3128128+v1/opt32_2/encrypt.c b/skinny/Implementations/crypto_aead/skinnyaeadtk3128128+v1/opt32_2/encrypt.c index fa46817..edf906c 100644 --- a/skinny/Implementations/crypto_aead/skinnyaeadtk3128128+v1/opt32_2/encrypt.c +++ b/skinny/Implementations/crypto_aead/skinnyaeadtk3128128+v1/opt32_2/encrypt.c @@ -8,12 +8,10 @@ * @author Alexandre Adomnicai, Nanyang Technological University, * alexandre.adomnicai@ntu.edu.sg * -* @date May 2020 +* @date June 2020 ******************************************************************************/ -#include "skinny128.h" #include "skinnyaead.h" #include -#include /****************************************************************************** * x ^= y where x, y are 128-bit blocks (16 bytes array). @@ -33,13 +31,13 @@ static void skinny_aead_m1_auth(u8* auth, u8* c, u8* tag, tweakey* tk, u8 feedback; u8 tmp[2*BLOCKBYTES]; memset(tmp, 0x00, 2*BLOCKBYTES); - memset(auth, 0x00, BLOCKBYTES); SET_DOMAIN(tmp, 0x02); + SET_DOMAIN(tmp + BLOCKBYTES, 0x02); + memset(auth, 0x00, BLOCKBYTES); while (adlen >= 2*BLOCKBYTES) { LE_STR_64(tmp, lfsr); UPDATE_LFSR(lfsr); LE_STR_64(tmp + BLOCKBYTES, lfsr); - SET_DOMAIN(tmp + BLOCKBYTES, 0x02); precompute_rtk1(tk->rtk1, tmp, tmp+BLOCKBYTES); skinny128_384_plus_encrypt(tmp, tmp+BLOCKBYTES, ad, ad+BLOCKBYTES, *tk); xor_block(auth, tmp); @@ -47,6 +45,9 @@ static void skinny_aead_m1_auth(u8* auth, u8* c, u8* tag, tweakey* tk, adlen -= 2*BLOCKBYTES; ad += 2*BLOCKBYTES; UPDATE_LFSR(lfsr); + memset(tmp, 0x00, 2*BLOCKBYTES); // to save 32 bytes of RAM + SET_DOMAIN(tmp, 0x02); + SET_DOMAIN(tmp + BLOCKBYTES, 0x02); } if (adlen > BLOCKBYTES) { // pad and process 2 blocs in // LE_STR_64(tmp, lfsr); @@ -65,11 +66,12 @@ static void skinny_aead_m1_auth(u8* auth, u8* c, u8* tag, tweakey* tk, LE_STR_64(tmp, lfsr); if (mlen == 0) { // if tag has *NOT* been calculated yet precompute_rtk1(tk->rtk1, tmp, tag); // compute the tag - skinny128_384_plus_encrypt(auth, c, ad, c, *tk); + skinny128_384_plus_encrypt(tmp, c, ad, c, *tk); } else { // if tag has been calculated yet precompute_rtk1(tk->rtk1, tmp, tmp); // process last ad block - skinny128_384_plus_encrypt(auth, auth, ad, ad, *tk); + skinny128_384_plus_encrypt(tmp, tmp, ad, ad, *tk); } + xor_block(auth, tmp); } else if (adlen > 0) { LE_STR_64(tmp, lfsr); SET_DOMAIN(tmp, 0x03); // domain for padding ad @@ -78,11 +80,12 @@ static void skinny_aead_m1_auth(u8* auth, u8* c, u8* tag, tweakey* tk, tmp[BLOCKBYTES + adlen] ^= 0x80; // padding if (mlen == 0) { // if tag has *NOT* been calculated yet precompute_rtk1(tk->rtk1, tmp, tag); // compute the tag - skinny128_384_plus_encrypt(auth, c, tmp + BLOCKBYTES, c, *tk); + skinny128_384_plus_encrypt(tmp, c, tmp + BLOCKBYTES, c, *tk); } else { // if tag has been calculated yet precompute_rtk1(tk->rtk1, tmp, tmp); // process last ad block - skinny128_384_plus_encrypt(auth, auth, tmp + BLOCKBYTES, tmp + BLOCKBYTES, *tk); + skinny128_384_plus_encrypt(tmp, tmp, tmp + BLOCKBYTES, tmp + BLOCKBYTES, *tk); } + xor_block(auth, tmp); } } @@ -290,4 +293,4 @@ int crypto_aead_decrypt (unsigned char *m, unsigned long long *mlen, feedback |= sum[i] ^ c[i]; // constant-time tag verification return feedback; // ----------------- Process the associated data ----------------- -} \ No newline at end of file +} diff --git a/skinny/Implementations/crypto_aead/skinnyaeadtk3128128+v1/opt32_2/skinny128.c b/skinny/Implementations/crypto_aead/skinnyaeadtk3128128+v1/opt32_2/skinny128.c index ed1e619..01d9f61 100644 --- a/skinny/Implementations/crypto_aead/skinnyaeadtk3128128+v1/opt32_2/skinny128.c +++ b/skinny/Implementations/crypto_aead/skinnyaeadtk3128128+v1/opt32_2/skinny128.c @@ -16,12 +16,9 @@ * @author Alexandre Adomnicai, Nanyang Technological University, * alexandre.adomnicai@ntu.edu.sg * -* @date May 2020 +* @date June 2020 ******************************************************************************/ -#include -#include #include "skinny128.h" -#include "tk_schedule.h" /**************************************************************************** * The MixColumns operation for rounds i such that (i % 4) == 0. @@ -84,7 +81,7 @@ void mixcolumns_3(u32* state) { } /**************************************************************************** -* The inverse MixColumns oepration for rounds i such that (i % 4) == 0 +* The inverse MixColumns operation for rounds i such that (i % 4) == 0 ****************************************************************************/ void inv_mixcolumns_0(u32* state) { u32 tmp; @@ -99,7 +96,7 @@ void inv_mixcolumns_0(u32* state) { } /**************************************************************************** -* The inverse MixColumns oepration for rounds i such that (i % 4) == 1 +* The inverse MixColumns operation for rounds i such that (i % 4) == 1 ****************************************************************************/ void inv_mixcolumns_1(u32* state) { u32 tmp; @@ -114,7 +111,7 @@ void inv_mixcolumns_1(u32* state) { } /**************************************************************************** -* The inverse MixColumns oepration for rounds i such that (i % 4) == 2 +* The inverse MixColumns operation for rounds i such that (i % 4) == 2 ****************************************************************************/ void inv_mixcolumns_2(u32* state) { u32 tmp; @@ -129,7 +126,7 @@ void inv_mixcolumns_2(u32* state) { } /**************************************************************************** -* The inverse MixColumns oepration for rounds i such that (i % 4) == 3 +* The inverse MixColumns operation for rounds i such that (i % 4) == 3 ****************************************************************************/ void inv_mixcolumns_3(u32* state) { u32 tmp; @@ -166,16 +163,8 @@ void skinny128_384_plus_encrypt(u8* ctext, u8* ctext_bis, const u8* ptext, const u8* ptext_bis, const tweakey tk) { u32 state[8]; packing(state, ptext, ptext_bis); - QUADRUPLE_ROUND(state, tk.rtk1, tk.rtk2_3); - QUADRUPLE_ROUND(state, tk.rtk1+32, tk.rtk2_3+32); - QUADRUPLE_ROUND(state, tk.rtk1+64, tk.rtk2_3+64); - QUADRUPLE_ROUND(state, tk.rtk1+96, tk.rtk2_3+96); - QUADRUPLE_ROUND(state, tk.rtk1, tk.rtk2_3+128); - QUADRUPLE_ROUND(state, tk.rtk1+32, tk.rtk2_3+160); - QUADRUPLE_ROUND(state, tk.rtk1+64, tk.rtk2_3+192); - QUADRUPLE_ROUND(state, tk.rtk1+96, tk.rtk2_3+224); - QUADRUPLE_ROUND(state, tk.rtk1, tk.rtk2_3+256); - QUADRUPLE_ROUND(state, tk.rtk1+32, tk.rtk2_3+288); + for(int i = 0; i < 10; i++) + QUADRUPLE_ROUND(state, tk.rtk1 + (i%4)*32, tk.rtk2_3 + i*32); unpacking(ctext, ctext_bis, state); } @@ -188,15 +177,7 @@ void skinny128_384_plus_decrypt(u8* ptext, u8* ptext_bis, const u8* ctext, const u8* ctext_bis, const tweakey tk) { u32 state[8]; packing(state, ctext, ctext_bis); - INV_QUADRUPLE_ROUND(state, tk.rtk1+32, tk.rtk2_3+288); - INV_QUADRUPLE_ROUND(state, tk.rtk1, tk.rtk2_3+256); - INV_QUADRUPLE_ROUND(state, tk.rtk1+96, tk.rtk2_3+224); - INV_QUADRUPLE_ROUND(state, tk.rtk1+64, tk.rtk2_3+192); - INV_QUADRUPLE_ROUND(state, tk.rtk1+32, tk.rtk2_3+160); - INV_QUADRUPLE_ROUND(state, tk.rtk1, tk.rtk2_3+128); - INV_QUADRUPLE_ROUND(state, tk.rtk1+96, tk.rtk2_3+96); - INV_QUADRUPLE_ROUND(state, tk.rtk1+64, tk.rtk2_3+64); - INV_QUADRUPLE_ROUND(state, tk.rtk1+32, tk.rtk2_3+32); - INV_QUADRUPLE_ROUND(state, tk.rtk1, tk.rtk2_3); + for(int i = 9; i >= 0; i--) + INV_QUADRUPLE_ROUND(state, tk.rtk1 + (i%4)*32, tk.rtk2_3 + i*32); unpacking(ptext, ptext_bis, state); -} \ No newline at end of file +} diff --git a/skinny/Implementations/crypto_aead/skinnyaeadtk3128128+v1/opt32_2/skinny128.h b/skinny/Implementations/crypto_aead/skinnyaeadtk3128128+v1/opt32_2/skinny128.h index 01dd271..3be91ec 100644 --- a/skinny/Implementations/crypto_aead/skinnyaeadtk3128128+v1/opt32_2/skinny128.h +++ b/skinny/Implementations/crypto_aead/skinnyaeadtk3128128+v1/opt32_2/skinny128.h @@ -1,5 +1,6 @@ #ifndef SKINNY128_H_ #define SKINNY128_H_ + #include "tk_schedule.h" void skinny128_384_plus_encrypt(u8* ctext, u8* ctext_bis, const u8* ptext, diff --git a/skinny/Implementations/crypto_aead/skinnyaeadtk3128128+v1/opt32_2/skinnyaead.h b/skinny/Implementations/crypto_aead/skinnyaeadtk3128128+v1/opt32_2/skinnyaead.h index 5500af8..b6ffcf4 100644 --- a/skinny/Implementations/crypto_aead/skinnyaeadtk3128128+v1/opt32_2/skinnyaead.h +++ b/skinny/Implementations/crypto_aead/skinnyaeadtk3128128+v1/opt32_2/skinnyaead.h @@ -3,9 +3,7 @@ #include "skinny128.h" -typedef unsigned char u8; -typedef unsigned int u32; -typedef unsigned long long u64; +typedef uint64_t u64; #define TAGBYTES 16 #define KEYBYTES 16 diff --git a/skinny/Implementations/crypto_aead/skinnyaeadtk3128128+v1/opt32_2/tk_schedule.c b/skinny/Implementations/crypto_aead/skinnyaeadtk3128128+v1/opt32_2/tk_schedule.c index ae7a820..7dbe1c3 100644 --- a/skinny/Implementations/crypto_aead/skinnyaeadtk3128128+v1/opt32_2/tk_schedule.c +++ b/skinny/Implementations/crypto_aead/skinnyaeadtk3128128+v1/opt32_2/tk_schedule.c @@ -7,15 +7,11 @@ * @author Alexandre Adomnicai, Nanyang Technological University, * alexandre.adomnicai@ntu.edu.sg * -* @date May 2020 +* @date June 2020 *******************************************************************************/ -#include #include #include "tk_schedule.h" -typedef unsigned char u8; -typedef unsigned int u32; - /**************************************************************************** * The round constants according to the fixsliced representation. ****************************************************************************/ diff --git a/skinny/Implementations/crypto_aead/skinnyaeadtk3128128+v1/opt32_2/tk_schedule.h b/skinny/Implementations/crypto_aead/skinnyaeadtk3128128+v1/opt32_2/tk_schedule.h index 29a2ddb..3779f90 100644 --- a/skinny/Implementations/crypto_aead/skinnyaeadtk3128128+v1/opt32_2/tk_schedule.h +++ b/skinny/Implementations/crypto_aead/skinnyaeadtk3128128+v1/opt32_2/tk_schedule.h @@ -1,8 +1,10 @@ #ifndef TK_SCHEDULE_BS_H_ #define TK_SCHEDULE_BS_H_ -typedef unsigned char u8; -typedef unsigned int u32; +#include + +typedef uint8_t u8; +typedef uint32_t u32; typedef struct { u32 rtk1[8*16]; diff --git a/skinny/Implementations/crypto_aead/skinnyaeadtk3128128v1/opt32_1/encrypt.c b/skinny/Implementations/crypto_aead/skinnyaeadtk3128128v1/opt32_1/encrypt.c index 838c830..fc9af45 100644 --- a/skinny/Implementations/crypto_aead/skinnyaeadtk3128128v1/opt32_1/encrypt.c +++ b/skinny/Implementations/crypto_aead/skinnyaeadtk3128128v1/opt32_1/encrypt.c @@ -1,19 +1,15 @@ /****************************************************************************** -* Constant-time implementation of SKINNY-AEAD-M1 (v1.1). -* -* Two blocks are treated in parallel with SKINNY-128-384 whenever possible. +* Constant-time implementation of SKINNY-AEAD-M1(v1). * * For more details, see the paper at: https:// * * @author Alexandre Adomnicai, Nanyang Technological University, * alexandre.adomnicai@ntu.edu.sg * -* @date May 2020 +* @date June 2020 ******************************************************************************/ -#include "skinny128.h" #include "skinnyaead.h" #include -#include /****************************************************************************** * x ^= y where x, y are 128-bit blocks (16 bytes array). @@ -75,12 +71,6 @@ int crypto_aead_encrypt (unsigned char *c, unsigned long long *clen, } LE_STR_64(tmp, lfsr); // lfsr for tag computation precompute_rtk1(rtk1, tmp); - for(int i = 0; i < 16; i++) { - printf("%08x %08x %08x %08x\n",rtk1[i*4], rtk1[i*4+1],rtk1[i*4+2],rtk1[i*4+3]); - } - for(int i = 0; i < 56; i++) { - printf("%08x %08x %08x %08x\n",rtk2_3[i*4], rtk2_3[i*4+1],rtk2_3[i*4+2],rtk2_3[i*4+3]); - } skinny128_384_encrypt(c, c, rtk1, rtk2_3); // compute the tag // ----------------- Process the plaintext ----------------- @@ -200,4 +190,4 @@ int crypto_aead_decrypt (unsigned char *m, unsigned long long *mlen, feedback |= sum[i] ^ c[i]; // constant-time tag verification return feedback; // ----------------- Process the associated data ----------------- -} \ No newline at end of file +} diff --git a/skinny/Implementations/crypto_aead/skinnyaeadtk3128128v1/opt32_1/skinny128.c b/skinny/Implementations/crypto_aead/skinnyaeadtk3128128v1/opt32_1/skinny128.c index e6177a2..f0e11c9 100644 --- a/skinny/Implementations/crypto_aead/skinnyaeadtk3128128v1/opt32_1/skinny128.c +++ b/skinny/Implementations/crypto_aead/skinnyaeadtk3128128v1/opt32_1/skinny128.c @@ -16,12 +16,9 @@ * @author Alexandre Adomnicai, Nanyang Technological University, * alexandre.adomnicai@ntu.edu.sg * -* @date May 2020 +* @date June 2020 ******************************************************************************/ -#include -#include #include "skinny128.h" -#include "tk_schedule.h" /****************************************************************************** * The MixColumns computation for rounds i such that (i % 4) == 0 @@ -153,20 +150,8 @@ void skinny128_384_encrypt(u8* ctext, const u8* ptext, const u32* rtk1, u32 tmp; // used in SWAPMOVE macro u32 state[4]; // 128-bit state packing(state, ptext); // from byte to bitsliced representation - QUADRUPLE_ROUND(state, rtk1, rtk2_3); - QUADRUPLE_ROUND(state, rtk1+16, rtk2_3+16); - QUADRUPLE_ROUND(state, rtk1+32, rtk2_3+32); - QUADRUPLE_ROUND(state, rtk1+48, rtk2_3+48); - QUADRUPLE_ROUND(state, rtk1, rtk2_3+64); - QUADRUPLE_ROUND(state, rtk1+16, rtk2_3+80); - QUADRUPLE_ROUND(state, rtk1+32, rtk2_3+96); - QUADRUPLE_ROUND(state, rtk1+48, rtk2_3+112); - QUADRUPLE_ROUND(state, rtk1, rtk2_3+128); - QUADRUPLE_ROUND(state, rtk1+16, rtk2_3+144); - QUADRUPLE_ROUND(state, rtk1+32, rtk2_3+160); - QUADRUPLE_ROUND(state, rtk1+48, rtk2_3+176); - QUADRUPLE_ROUND(state, rtk1, rtk2_3+192); - QUADRUPLE_ROUND(state, rtk1+16, rtk2_3+208); + for(int i = 0; i < 14; i++) + QUADRUPLE_ROUND(state, rtk1 + (i%4)*16, rtk2_3 + i*16); unpacking(ctext, state); // from bitsliced to byte representation } @@ -180,19 +165,7 @@ void skinny128_384_decrypt(u8* ctext, const u8* ptext, const u32* rtk1, u32 tmp; // used in SWAPMOVE macro u32 state[4]; // 128-bit state packing(state, ptext); // from byte to bitsliced representation - INV_QUADRUPLE_ROUND(state, rtk1+16, rtk2_3+208); - INV_QUADRUPLE_ROUND(state, rtk1, rtk2_3+192); - INV_QUADRUPLE_ROUND(state, rtk1+48, rtk2_3+176); - INV_QUADRUPLE_ROUND(state, rtk1+32, rtk2_3+160); - INV_QUADRUPLE_ROUND(state, rtk1+16, rtk2_3+144); - INV_QUADRUPLE_ROUND(state, rtk1, rtk2_3+128); - INV_QUADRUPLE_ROUND(state, rtk1+48, rtk2_3+112); - INV_QUADRUPLE_ROUND(state, rtk1+32, rtk2_3+96); - INV_QUADRUPLE_ROUND(state, rtk1+16, rtk2_3+80); - INV_QUADRUPLE_ROUND(state, rtk1, rtk2_3+64); - INV_QUADRUPLE_ROUND(state, rtk1+48, rtk2_3+48); - INV_QUADRUPLE_ROUND(state, rtk1+32, rtk2_3+32); - INV_QUADRUPLE_ROUND(state, rtk1+16, rtk2_3+16); - INV_QUADRUPLE_ROUND(state, rtk1, rtk2_3); + for(int i = 13; i >= 0; i--) + INV_QUADRUPLE_ROUND(state, rtk1 + (i%4)*16, rtk2_3 + i*16); unpacking(ctext, state); // from bitsliced to byte representation -} \ No newline at end of file +} diff --git a/skinny/Implementations/crypto_aead/skinnyaeadtk3128128v1/opt32_1/skinnyaead.h b/skinny/Implementations/crypto_aead/skinnyaeadtk3128128v1/opt32_1/skinnyaead.h index 5500af8..b6ffcf4 100644 --- a/skinny/Implementations/crypto_aead/skinnyaeadtk3128128v1/opt32_1/skinnyaead.h +++ b/skinny/Implementations/crypto_aead/skinnyaeadtk3128128v1/opt32_1/skinnyaead.h @@ -3,9 +3,7 @@ #include "skinny128.h" -typedef unsigned char u8; -typedef unsigned int u32; -typedef unsigned long long u64; +typedef uint64_t u64; #define TAGBYTES 16 #define KEYBYTES 16 diff --git a/skinny/Implementations/crypto_aead/skinnyaeadtk3128128v1/opt32_1/tk_schedule.c b/skinny/Implementations/crypto_aead/skinnyaeadtk3128128v1/opt32_1/tk_schedule.c index c818cf2..b09a0b2 100644 --- a/skinny/Implementations/crypto_aead/skinnyaeadtk3128128v1/opt32_1/tk_schedule.c +++ b/skinny/Implementations/crypto_aead/skinnyaeadtk3128128v1/opt32_1/tk_schedule.c @@ -4,16 +4,11 @@ * @author Alexandre Adomnicai, Nanyang Technological University, * alexandre.adomnicai@ntu.edu.sg * -* @date May 2020 +* @date June 2020 ******************************************************************************/ -#include -#include //for memcmp -#include "tk_schedule.h" +#include #include "skinny128.h" -typedef unsigned char u8; -typedef unsigned int u32; - /****************************************************************************** * The round constants according to the new representation. ******************************************************************************/ @@ -271,6 +266,70 @@ void precompute_lfsr_tk3(u32* tk, const u8* key, const int rounds) { } /****************************************************************************** +* Precompute LFSR2(TK2) ^ LFSR3(TK3) for all round tweakeys. +* It is equivalent to the following 2 function calls: +* - precompute_lfsr_tk2(tk, t2, SKINNY128_384_ROUNDS); +* - precompute_lfsr_tk3(tk, t3, SKINNY128_384_ROUNDS); +* However 'precompute_lfsr_tk2_3' can allow to save cycles on some platform. +* On ARMv7 one should observe a gain of ~1k cycles per function call. It can be +* explained by the fact that less memory accesses to 'tk' are computed. +* +* To save some code size, the loop can be replaced by the following one: +* for(int i = 0 ; i < rounds; i+=2) { +* lfsr2_bs(tk2); +* lfsr3_bs(tk3); +* tk[i*4+4] = tk2[0] ^ tk3[0]; +* tk[i*4+5] = tk2[1] ^ tk3[1]; +* tk[i*4+6] = tk2[2] ^ tk3[2]; +* tk[i*4+7] = tk2[3] ^ tk3[3]; +* } +* at the cost of some cycles (~260 on ARM Cortex-M). +******************************************************************************/ +void precompute_lfsr_tk2_3(u32* tk, const u8* t2, const u8* t3, const int rounds) { + u32 tk2[4], tk3[4]; + packing(tk2, t2); + packing(tk3, t3); + tk[0] = tk2[0] ^ tk3[0]; + tk[1] = tk2[1] ^ tk3[1]; + tk[2] = tk2[2] ^ tk3[2]; + tk[3] = tk2[3] ^ tk3[3]; + for(int i = 0 ; i < rounds; i+=8) { + tk2[0] ^= (tk2[2] & 0xaaaaaaaa); + tk2[0] = ((tk2[0] & 0xaaaaaaaa) >> 1) | ((tk2[0] << 1) & 0xaaaaaaaa); + tk3[3] ^= ((tk3[1] & 0xaaaaaaaa) >> 1); + tk3[3] = ((tk3[3] & 0xaaaaaaaa) >> 1) | ((tk3[3] << 1) & 0xaaaaaaaa); + tk[i*4+4] = tk2[1] ^ tk3[3]; + tk[i*4+5] = tk2[2] ^ tk3[0]; + tk[i*4+6] = tk2[3] ^ tk3[1]; + tk[i*4+7] = tk2[0] ^ tk3[2]; + tk2[1] ^= (tk2[3] & 0xaaaaaaaa); + tk2[1] = ((tk2[1] & 0xaaaaaaaa) >> 1) | ((tk2[1] << 1) & 0xaaaaaaaa); + tk3[2] ^= ((tk3[0] & 0xaaaaaaaa) >> 1); + tk3[2] = ((tk3[2] & 0xaaaaaaaa) >> 1) | ((tk3[2] << 1) & 0xaaaaaaaa); + tk[i*4+12] = tk2[2] ^ tk3[2]; + tk[i*4+13] = tk2[3] ^ tk3[3]; + tk[i*4+14] = tk2[0] ^ tk3[0]; + tk[i*4+15] = tk2[1] ^ tk3[1]; + tk2[2] ^= (tk2[0] & 0xaaaaaaaa); + tk2[2] = ((tk2[2] & 0xaaaaaaaa) >> 1) | ((tk2[2] << 1) & 0xaaaaaaaa); + tk3[1] ^= ((tk3[3] & 0xaaaaaaaa) >> 1); + tk3[1] = ((tk3[1] & 0xaaaaaaaa) >> 1) | ((tk3[1] << 1) & 0xaaaaaaaa); + tk[i*4+20] = tk2[3] ^ tk3[1]; + tk[i*4+21] = tk2[0] ^ tk3[2]; + tk[i*4+22] = tk2[1] ^ tk3[3]; + tk[i*4+23] = tk2[2] ^ tk3[0]; + tk2[3] ^= (tk2[1] & 0xaaaaaaaa); + tk2[3] = ((tk2[3] & 0xaaaaaaaa) >> 1) | ((tk2[3] << 1) & 0xaaaaaaaa); + tk3[0] ^= ((tk3[2] & 0xaaaaaaaa) >> 1); + tk3[0] = ((tk3[0] & 0xaaaaaaaa) >> 1) | ((tk3[0] << 1) & 0xaaaaaaaa); + tk[i*4+28] = tk2[0] ^ tk3[0]; + tk[i*4+29] = tk2[1] ^ tk3[1]; + tk[i*4+30] = tk2[2] ^ tk3[2]; + tk[i*4+31] = tk2[3] ^ tk3[3]; + } +} + +/****************************************************************************** * XOR TK with TK1 before applying the permutations. * The key is then rearranged to match the barrel shiftrows representation. ******************************************************************************/ @@ -278,19 +337,20 @@ void permute_tk(u32* tk, const u8* key, const int rounds) { u32 test; u32 tk1[4], tmp[4]; packing(tk1, key); - memcpy(tmp, tk, 16); - tmp[0] ^= tk1[0]; - tmp[1] ^= tk1[1]; - tmp[2] ^= tk1[2]; - tmp[3] ^= tk1[3]; + tmp[0] = tk[0] ^ tk1[0]; + tmp[1] = tk[1] ^ tk1[1]; + tmp[2] = tk[2] ^ tk1[2]; + tmp[3] = tk[3] ^ tk1[3]; for(int i = 0 ; i < rounds; i += 8) { test = (i % 16 < 8) ? 1 : 0; //to apply the right power of P tk[i*4] = tmp[2] & 0xf0f0f0f0; tk[i*4+1] = tmp[3] & 0xf0f0f0f0; tk[i*4+2] = tmp[0] & 0xf0f0f0f0; tk[i*4+3] = tmp[1] & 0xf0f0f0f0; - memcpy(tmp, tk+i*4+4, 16); - XOR_BLOCKS(tmp, tk1); + tmp[0] = tk[i*4+4] ^ tk1[0]; + tmp[1] = tk[i*4+5] ^ tk1[1]; + tmp[2] = tk[i*4+6] ^ tk1[2]; + tmp[3] = tk[i*4+7] ^ tk1[3]; if (test) permute_tk_2(tmp); // applies P^2 else @@ -307,8 +367,10 @@ void permute_tk(u32* tk, const u8* key, const int rounds) { tk[i*4+10] |= ROR(tmp[0],12) & 0x0c0c0c0c; tk[i*4+11] = ROR(tmp[1],28) & 0x03030303; tk[i*4+11] |= ROR(tmp[1],12) & 0x0c0c0c0c; - memcpy(tmp, tk+i*4+12, 16); - XOR_BLOCKS(tmp, tk1); + tmp[0] = tk[i*4+12] ^ tk1[0]; + tmp[1] = tk[i*4+13] ^ tk1[1]; + tmp[2] = tk[i*4+14] ^ tk1[2]; + tmp[3] = tk[i*4+15] ^ tk1[3]; if (test) permute_tk_4(tmp); // applies P^4 else @@ -321,8 +383,10 @@ void permute_tk(u32* tk, const u8* key, const int rounds) { tk[i*4+17] = ROR(tmp[3], 16) & 0xf0f0f0f0; tk[i*4+18] = ROR(tmp[0], 16) & 0xf0f0f0f0; tk[i*4+19] = ROR(tmp[1], 16) & 0xf0f0f0f0; - memcpy(tmp, tk+i*4+20, 16); - XOR_BLOCKS(tmp, tk1); + tmp[0] = tk[i*4+20] ^ tk1[0]; + tmp[1] = tk[i*4+21] ^ tk1[1]; + tmp[2] = tk[i*4+22] ^ tk1[2]; + tmp[3] = tk[i*4+23] ^ tk1[3]; if (test) permute_tk_6(tmp); // applies P^6 else @@ -339,8 +403,10 @@ void permute_tk(u32* tk, const u8* key, const int rounds) { tk[i*4+26] |= ROR(tmp[0],28) & 0x0c0c0c0c; tk[i*4+27] = ROR(tmp[1],12) & 0x03030303; tk[i*4+27] |= ROR(tmp[1],28) & 0x0c0c0c0c; - memcpy(tmp, tk+i*4+28, 16); - XOR_BLOCKS(tmp, tk1); + tmp[0] = tk[i*4+28] ^ tk1[0]; + tmp[1] = tk[i*4+29] ^ tk1[1]; + tmp[2] = tk[i*4+30] ^ tk1[2]; + tmp[3] = tk[i*4+31] ^ tk1[3]; if (test) permute_tk_8(tmp); // applies P^8 for(int j = 0; j < 4; j++) { @@ -361,8 +427,7 @@ void permute_tk(u32* tk, const u8* key, const int rounds) { ******************************************************************************/ void precompute_rtk2_3(u32* rtk, const u8* tk2, const u8 * tk3) { memset(rtk, 0x00, 16*SKINNY128_384_ROUNDS); - precompute_lfsr_tk2(rtk, tk2, SKINNY128_384_ROUNDS); - precompute_lfsr_tk3(rtk, tk3, SKINNY128_384_ROUNDS); + precompute_lfsr_tk2_3(rtk, tk2, tk3, SKINNY128_384_ROUNDS); permute_tk(rtk, (u8*)(rtk+8), SKINNY128_384_ROUNDS); // rtk+8 is NULL for(int i = 0; i < SKINNY128_384_ROUNDS; i++) { // add rconsts for(int j = 0; j < 4; j++) @@ -376,4 +441,4 @@ void precompute_rtk2_3(u32* rtk, const u8* tk2, const u8 * tk3) { void precompute_rtk1(u32* rtk1, const u8* tk1) { memset(rtk1, 0x00, 16*16); permute_tk(rtk1, tk1, 16); -} \ No newline at end of file +} diff --git a/skinny/Implementations/crypto_aead/skinnyaeadtk3128128v1/opt32_1/tk_schedule.h b/skinny/Implementations/crypto_aead/skinnyaeadtk3128128v1/opt32_1/tk_schedule.h index 5615cbd..81dcbef 100644 --- a/skinny/Implementations/crypto_aead/skinnyaeadtk3128128v1/opt32_1/tk_schedule.h +++ b/skinny/Implementations/crypto_aead/skinnyaeadtk3128128v1/opt32_1/tk_schedule.h @@ -1,22 +1,17 @@ #ifndef TK_SCHEDULE_H_ #define TK_SCHEDULE_H_ -typedef unsigned char u8; -typedef unsigned int u32; +#include + +typedef uint8_t u8; +typedef uint32_t u32; void packing(u32* out, const u8* in); void unpacking(u8* out, u32 *in); void precompute_rtk2_3(u32* rtk, const u8* tk2, const u8* tk3); void precompute_rtk1(u32* rtk1, const u8* tk1); -#define ROR(x,y) (((x) >> (y)) | ((x) << (32 - (y)))) - -#define XOR_BLOCKS(x,y) ({ \ - (x)[0] ^= (y)[0]; \ - (x)[1] ^= (y)[1]; \ - (x)[2] ^= (y)[2]; \ - (x)[3] ^= (y)[3]; \ -}) +#define ROR(x,y) (((x) >> (y)) | ((x) << (32 - (y)))) #define SWAPMOVE(a, b, mask, n) ({ \ tmp = (b ^ (a >> n)) & mask; \ diff --git a/skinny/Implementations/crypto_aead/skinnyaeadtk3128128v1/opt32_2/encrypt.c b/skinny/Implementations/crypto_aead/skinnyaeadtk3128128v1/opt32_2/encrypt.c index 640910e..61cf8b0 100644 --- a/skinny/Implementations/crypto_aead/skinnyaeadtk3128128v1/opt32_2/encrypt.c +++ b/skinny/Implementations/crypto_aead/skinnyaeadtk3128128v1/opt32_2/encrypt.c @@ -8,12 +8,10 @@ * @author Alexandre Adomnicai, Nanyang Technological University, * alexandre.adomnicai@ntu.edu.sg * -* @date May 2020 +* @date June 2020 ******************************************************************************/ -#include "skinny128.h" #include "skinnyaead.h" #include -#include /****************************************************************************** * x ^= y where x, y are 128-bit blocks (16 bytes array). @@ -33,13 +31,13 @@ static void skinny_aead_m1_auth(u8* auth, u8* c, u8* tag, tweakey* tk, u8 feedback; u8 tmp[2*BLOCKBYTES]; memset(tmp, 0x00, 2*BLOCKBYTES); - memset(auth, 0x00, BLOCKBYTES); SET_DOMAIN(tmp, 0x02); + SET_DOMAIN(tmp + BLOCKBYTES, 0x02); + memset(auth, 0x00, BLOCKBYTES); while (adlen >= 2*BLOCKBYTES) { LE_STR_64(tmp, lfsr); UPDATE_LFSR(lfsr); LE_STR_64(tmp + BLOCKBYTES, lfsr); - SET_DOMAIN(tmp + BLOCKBYTES, 0x02); precompute_rtk1(tk->rtk1, tmp, tmp+BLOCKBYTES); skinny128_384_encrypt(tmp, tmp+BLOCKBYTES, ad, ad+BLOCKBYTES, *tk); xor_block(auth, tmp); @@ -47,6 +45,9 @@ static void skinny_aead_m1_auth(u8* auth, u8* c, u8* tag, tweakey* tk, adlen -= 2*BLOCKBYTES; ad += 2*BLOCKBYTES; UPDATE_LFSR(lfsr); + memset(tmp, 0x00, 2*BLOCKBYTES); // to save 32 bytes of RAM + SET_DOMAIN(tmp, 0x02); + SET_DOMAIN(tmp + BLOCKBYTES, 0x02); } if (adlen > BLOCKBYTES) { // pad and process 2 blocs in // LE_STR_64(tmp, lfsr); @@ -65,11 +66,12 @@ static void skinny_aead_m1_auth(u8* auth, u8* c, u8* tag, tweakey* tk, LE_STR_64(tmp, lfsr); if (mlen == 0) { // if tag has *NOT* been calculated yet precompute_rtk1(tk->rtk1, tmp, tag); // compute the tag - skinny128_384_encrypt(auth, c, ad, c, *tk); + skinny128_384_encrypt(tmp, c, ad, c, *tk); } else { // if tag has been calculated yet precompute_rtk1(tk->rtk1, tmp, tmp); // process last ad block - skinny128_384_encrypt(auth, auth, ad, ad, *tk); + skinny128_384_encrypt(tmp, tmp, ad, ad, *tk); } + xor_block(auth, tmp); } else if (adlen > 0) { LE_STR_64(tmp, lfsr); SET_DOMAIN(tmp, 0x03); // domain for padding ad @@ -78,11 +80,12 @@ static void skinny_aead_m1_auth(u8* auth, u8* c, u8* tag, tweakey* tk, tmp[BLOCKBYTES + adlen] ^= 0x80; // padding if (mlen == 0) { // if tag has *NOT* been calculated yet precompute_rtk1(tk->rtk1, tmp, tag); // compute the tag - skinny128_384_encrypt(auth, c, tmp + BLOCKBYTES, c, *tk); + skinny128_384_encrypt(tmp, c, tmp + BLOCKBYTES, c, *tk); } else { // if tag has been calculated yet precompute_rtk1(tk->rtk1, tmp, tmp); // process last ad block - skinny128_384_encrypt(auth, auth, tmp + BLOCKBYTES, tmp + BLOCKBYTES, *tk); + skinny128_384_encrypt(tmp, tmp, tmp + BLOCKBYTES, tmp + BLOCKBYTES, *tk); } + xor_block(auth, tmp); } } @@ -290,4 +293,4 @@ int crypto_aead_decrypt (unsigned char *m, unsigned long long *mlen, feedback |= sum[i] ^ c[i]; // constant-time tag verification return feedback; // ----------------- Process the associated data ----------------- -} \ No newline at end of file +} diff --git a/skinny/Implementations/crypto_aead/skinnyaeadtk3128128v1/opt32_2/skinny128.c b/skinny/Implementations/crypto_aead/skinnyaeadtk3128128v1/opt32_2/skinny128.c index 2e1e9c3..304b899 100644 --- a/skinny/Implementations/crypto_aead/skinnyaeadtk3128128v1/opt32_2/skinny128.c +++ b/skinny/Implementations/crypto_aead/skinnyaeadtk3128128v1/opt32_2/skinny128.c @@ -16,12 +16,9 @@ * @author Alexandre Adomnicai, Nanyang Technological University, * alexandre.adomnicai@ntu.edu.sg * -* @date May 2020 +* @date June 2020 ******************************************************************************/ -#include -#include #include "skinny128.h" -#include "tk_schedule.h" /**************************************************************************** * The MixColumns operation for rounds i such that (i % 4) == 0. @@ -84,7 +81,7 @@ void mixcolumns_3(u32* state) { } /**************************************************************************** -* The inverse MixColumns oepration for rounds i such that (i % 4) == 0 +* The inverse MixColumns operation for rounds i such that (i % 4) == 0 ****************************************************************************/ void inv_mixcolumns_0(u32* state) { u32 tmp; @@ -99,7 +96,7 @@ void inv_mixcolumns_0(u32* state) { } /**************************************************************************** -* The inverse MixColumns oepration for rounds i such that (i % 4) == 1 +* The inverse MixColumns operation for rounds i such that (i % 4) == 1 ****************************************************************************/ void inv_mixcolumns_1(u32* state) { u32 tmp; @@ -114,7 +111,7 @@ void inv_mixcolumns_1(u32* state) { } /**************************************************************************** -* The inverse MixColumns oepration for rounds i such that (i % 4) == 2 +* The inverse MixColumns operation for rounds i such that (i % 4) == 2 ****************************************************************************/ void inv_mixcolumns_2(u32* state) { u32 tmp; @@ -129,7 +126,7 @@ void inv_mixcolumns_2(u32* state) { } /**************************************************************************** -* The inverse MixColumns oepration for rounds i such that (i % 4) == 3 +* The inverse MixColumns operation for rounds i such that (i % 4) == 3 ****************************************************************************/ void inv_mixcolumns_3(u32* state) { u32 tmp; @@ -166,20 +163,8 @@ void skinny128_384_encrypt(u8* ctext, u8* ctext_bis, const u8* ptext, const u8* ptext_bis, const tweakey tk) { u32 state[8]; packing(state, ptext, ptext_bis); - QUADRUPLE_ROUND(state, tk.rtk1, tk.rtk2_3); - QUADRUPLE_ROUND(state, tk.rtk1+32, tk.rtk2_3+32); - QUADRUPLE_ROUND(state, tk.rtk1+64, tk.rtk2_3+64); - QUADRUPLE_ROUND(state, tk.rtk1+96, tk.rtk2_3+96); - QUADRUPLE_ROUND(state, tk.rtk1, tk.rtk2_3+128); - QUADRUPLE_ROUND(state, tk.rtk1+32, tk.rtk2_3+160); - QUADRUPLE_ROUND(state, tk.rtk1+64, tk.rtk2_3+192); - QUADRUPLE_ROUND(state, tk.rtk1+96, tk.rtk2_3+224); - QUADRUPLE_ROUND(state, tk.rtk1, tk.rtk2_3+256); - QUADRUPLE_ROUND(state, tk.rtk1+32, tk.rtk2_3+288); - QUADRUPLE_ROUND(state, tk.rtk1+64, tk.rtk2_3+320); - QUADRUPLE_ROUND(state, tk.rtk1+96, tk.rtk2_3+352); - QUADRUPLE_ROUND(state, tk.rtk1, tk.rtk2_3+384); - QUADRUPLE_ROUND(state, tk.rtk1+32, tk.rtk2_3+416); + for(int i = 0; i < 14; i++) + QUADRUPLE_ROUND(state, tk.rtk1 + (i%4)*32, tk.rtk2_3 + i*32); unpacking(ctext, ctext_bis, state); } @@ -192,19 +177,7 @@ void skinny128_384_decrypt(u8* ptext, u8* ptext_bis, const u8* ctext, const u8* ctext_bis, const tweakey tk) { u32 state[8]; packing(state, ctext, ctext_bis); - INV_QUADRUPLE_ROUND(state, tk.rtk1+32, tk.rtk2_3+416); - INV_QUADRUPLE_ROUND(state, tk.rtk1, tk.rtk2_3+384); - INV_QUADRUPLE_ROUND(state, tk.rtk1+96, tk.rtk2_3+352); - INV_QUADRUPLE_ROUND(state, tk.rtk1+64, tk.rtk2_3+320); - INV_QUADRUPLE_ROUND(state, tk.rtk1+32, tk.rtk2_3+288); - INV_QUADRUPLE_ROUND(state, tk.rtk1, tk.rtk2_3+256); - INV_QUADRUPLE_ROUND(state, tk.rtk1+96, tk.rtk2_3+224); - INV_QUADRUPLE_ROUND(state, tk.rtk1+64, tk.rtk2_3+192); - INV_QUADRUPLE_ROUND(state, tk.rtk1+32, tk.rtk2_3+160); - INV_QUADRUPLE_ROUND(state, tk.rtk1, tk.rtk2_3+128); - INV_QUADRUPLE_ROUND(state, tk.rtk1+96, tk.rtk2_3+96); - INV_QUADRUPLE_ROUND(state, tk.rtk1+64, tk.rtk2_3+64); - INV_QUADRUPLE_ROUND(state, tk.rtk1+32, tk.rtk2_3+32); - INV_QUADRUPLE_ROUND(state, tk.rtk1, tk.rtk2_3); + for(int i = 13; i >= 0; i--) + INV_QUADRUPLE_ROUND(state, tk.rtk1 + (i%4)*32, tk.rtk2_3 + i*32); unpacking(ptext, ptext_bis, state); } \ No newline at end of file diff --git a/skinny/Implementations/crypto_aead/skinnyaeadtk3128128v1/opt32_2/skinnyaead.h b/skinny/Implementations/crypto_aead/skinnyaeadtk3128128v1/opt32_2/skinnyaead.h index 5500af8..b6ffcf4 100644 --- a/skinny/Implementations/crypto_aead/skinnyaeadtk3128128v1/opt32_2/skinnyaead.h +++ b/skinny/Implementations/crypto_aead/skinnyaeadtk3128128v1/opt32_2/skinnyaead.h @@ -3,9 +3,7 @@ #include "skinny128.h" -typedef unsigned char u8; -typedef unsigned int u32; -typedef unsigned long long u64; +typedef uint64_t u64; #define TAGBYTES 16 #define KEYBYTES 16 diff --git a/skinny/Implementations/crypto_aead/skinnyaeadtk3128128v1/opt32_2/tk_schedule.c b/skinny/Implementations/crypto_aead/skinnyaeadtk3128128v1/opt32_2/tk_schedule.c index 3897777..528d0eb 100644 --- a/skinny/Implementations/crypto_aead/skinnyaeadtk3128128v1/opt32_2/tk_schedule.c +++ b/skinny/Implementations/crypto_aead/skinnyaeadtk3128128v1/opt32_2/tk_schedule.c @@ -9,13 +9,9 @@ * * @date May 2020 *******************************************************************************/ -#include #include #include "tk_schedule.h" -typedef unsigned char u8; -typedef unsigned int u32; - /**************************************************************************** * The round constants according to the fixsliced representation. ****************************************************************************/ diff --git a/skinny/Implementations/crypto_aead/skinnyaeadtk3128128v1/opt32_2/tk_schedule.h b/skinny/Implementations/crypto_aead/skinnyaeadtk3128128v1/opt32_2/tk_schedule.h index c6d03ce..7b17342 100644 --- a/skinny/Implementations/crypto_aead/skinnyaeadtk3128128v1/opt32_2/tk_schedule.h +++ b/skinny/Implementations/crypto_aead/skinnyaeadtk3128128v1/opt32_2/tk_schedule.h @@ -1,8 +1,10 @@ #ifndef TK_SCHEDULE_BS_H_ #define TK_SCHEDULE_BS_H_ -typedef unsigned char u8; -typedef unsigned int u32; +#include + +typedef uint8_t u8; +typedef uint32_t u32; typedef struct { u32 rtk1[8*16];