Commit 86ea9196 by Enrico Pozzobon

Merge branch 'master' into patched-for-tester

parents 8931c307 cad26506
@@ -4,8 +4,11 @@ Sebastien Riou, May 27th 2020
 Implementation optimized for ARM-Cortex-M0 (Size and Speed)
 */
+//define __DRYGASCON_ARM_SELECTOR_V6M__ or add drygascon128_arm_selector.h to includes
+#ifndef __DRYGASCON_ARM_SELECTOR_V6M__
 #include "drygascon128_arm_selector.h"
+#endif
 #if defined(__DRYGASCON_ARM_SELECTOR_V6M__)
 .cpu cortex-m0
 .syntax unified
...
@@ -3,41 +3,73 @@
 //Optional file to select the best implementation for each chip
 #ifdef STM32H743xx
-#define __DRYGASCON_ARM_SELECTOR_V7M__
-#define __DRYGASCON_ARM_SELECTOR_FPU__
+#define __DRYGASCON_ARM_SELECTOR_V7M_FPU__
+#define __DRYGASCON_ARM_SELECTOR_FOUND__
 #endif
 #ifdef STM32F746xx
-#define __DRYGASCON_ARM_SELECTOR_V7M__
-#define __DRYGASCON_ARM_SELECTOR_FPU__
+#define __DRYGASCON_ARM_SELECTOR_V7M_FPU__
+#define __DRYGASCON_ARM_SELECTOR_FOUND__
+#endif
+#ifdef STM32F411xx
+#define __DRYGASCON_ARM_SELECTOR_V7M_FPU__
+#define __DRYGASCON_ARM_SELECTOR_FOUND__
+#endif
+#ifdef STM32L552xx //technically it is V8M but we don't have a specific code for that one
+#define __DRYGASCON_ARM_SELECTOR_V7M__
+#define __DRYGASCON_ARM_SELECTOR_FOUND__
 #endif
 #ifdef STM32F103xx
 #define __DRYGASCON_ARM_SELECTOR_V7M__
+#define __DRYGASCON_ARM_SELECTOR_FOUND__
 #endif
 #ifdef STM32L011xx
 #define __DRYGASCON_ARM_SELECTOR_V6M__
+#define __DRYGASCON_ARM_SELECTOR_FOUND__
 #endif
 #ifdef __SAM3X8E__
 #define __DRYGASCON_ARM_SELECTOR_V7M__
+#define __DRYGASCON_ARM_SELECTOR_FOUND__
 #endif
 //TODO: add more chips here
-#ifdef __DRYGASCON_ARM_SELECTOR_V7M__
-#ifdef __DRYGASCON_ARM_SELECTOR_FPU__
-#define DRYGASCON_G_OPT drygascon128_g_v7m_fpu
-#define DRYGASCON_F_OPT drygascon128_f_v7m_fpu
-#define DRYGASCON_G0_OPT drygascon128_g0_v7m_fpu
-#else
-#define DRYGASCON_G_OPT drygascon128_g_v7m
-#define DRYGASCON_F_OPT drygascon128_f_v7m
-#define DRYGASCON_G0_OPT drygascon128_g0_v7m
-#endif
-#endif
+#ifndef __DRYGASCON_ARM_SELECTOR_FOUND__
+//more generic defines catching whole families
+#if defined(STM32F4xx) || defined(STM32F7xx) || defined(STM32H7xx)
+#define __DRYGASCON_ARM_SELECTOR_V7M_FPU__
+#define __DRYGASCON_ARM_SELECTOR_FOUND__
+#endif
+
+#if defined(STM32F1xx)
+#define __DRYGASCON_ARM_SELECTOR_V7M__
+#define __DRYGASCON_ARM_SELECTOR_FOUND__
+#endif
+#endif
+#ifdef __DRYGASCON_ARM_SELECTOR_V7M_FPU__
+#define DRYGASCON_G_OPT drygascon128_g_v7m_fpu
+#define DRYGASCON_F_OPT drygascon128_f_v7m_fpu
+#define DRYGASCON_G0_OPT drygascon128_g0_v7m_fpu
+#endif
+#ifdef __DRYGASCON_ARM_SELECTOR_V7M_FPU_X__
+#define DRYGASCON_G_OPT drygascon128_g_v7m_fpu_x
+#define DRYGASCON_F_OPT drygascon128_f_v7m_fpu_x
+#define DRYGASCON_G0_OPT drygascon128_g0_v7m_fpu_x
+#endif
+#ifdef __DRYGASCON_ARM_SELECTOR_V7M__
+#define DRYGASCON_G_OPT drygascon128_g_v7m
+#define DRYGASCON_F_OPT drygascon128_f_v7m
+#define DRYGASCON_G0_OPT drygascon128_g0_v7m
+#endif
 #ifdef __DRYGASCON_ARM_SELECTOR_V6M__
 #define DRYGASCON_G_OPT drygascon128_g_v6m
 #define DRYGASCON_F_OPT drygascon128_f_v6m
...
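How the selector is meant to be consumed, as an editor's sketch: the prototype below is an assumption for illustration only, since the real DRYGASCON_* entry points are assembly routines whose signatures are not shown in this diff.

/* --- editor's sketch, not part of the commit --- */
/* Select a core either via a chip define, e.g. -DSTM32F103xx on the compiler
 * command line, or by defining __DRYGASCON_ARM_SELECTOR_V6M__ directly, as
 * the assembly file above does. */
#include "drygascon128_arm_selector.h"

#ifndef DRYGASCON_G_OPT
#error "no optimized DryGASCON permutation selected for this target"
#endif

/* Assumed, illustrative prototype; the actual asm signature may differ. */
extern void DRYGASCON_G_OPT(void* state, unsigned rounds);

static inline void drygascon128_g(void* state, unsigned rounds) {
    DRYGASCON_G_OPT(state, rounds); /* binding resolved at compile time */
}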
@@ -245,7 +245,7 @@ typedef union
 */
 typedef struct
 {
     gascon128_state_t c;   /**< GASCON-128 state for the capacity */
     uint32_t domain;       /**< Domain value to mix on next F call */
     uint32_t rounds;       /**< Number of rounds for next G call */
     drysponge128_rate_t r; /**< Buffer for a rate block of data */
...
@@ -93,15 +93,7 @@ void skinny128_384_plus(u8* ctext, const u8* ptext, const u32* rtk1,
     u32 tmp;               // used in SWAPMOVE macro
     u32 state[4];          // 128-bit state
     packing(state, ptext); // from byte to bitsliced representation
-    QUADRUPLE_ROUND(state, rtk1, rtk2_3);
-    QUADRUPLE_ROUND(state, rtk1+16, rtk2_3+16);
-    QUADRUPLE_ROUND(state, rtk1+32, rtk2_3+32);
-    QUADRUPLE_ROUND(state, rtk1+48, rtk2_3+48);
-    QUADRUPLE_ROUND(state, rtk1, rtk2_3+64);
-    QUADRUPLE_ROUND(state, rtk1+16, rtk2_3+80);
-    QUADRUPLE_ROUND(state, rtk1+32, rtk2_3+96);
-    QUADRUPLE_ROUND(state, rtk1+48, rtk2_3+112);
-    QUADRUPLE_ROUND(state, rtk1, rtk2_3+128);
-    QUADRUPLE_ROUND(state, rtk1+16, rtk2_3+144);
+    for(int i = 0; i < 10; i++)
+        QUADRUPLE_ROUND(state, rtk1 + (i%4)*16, rtk2_3 + i*16);
     unpacking(ctext, state); // from bitsliced to byte representation
 }
\ No newline at end of file
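The rolled loop is offset-equivalent to the deleted unrolled sequence: rtk1 cycles through offsets 0/16/32/48 (its schedule repeats every four quadruple rounds) while rtk2_3 advances by 16 words per quadruple round. A stand-alone check:

/* --- editor's sketch, not part of the commit --- */
#include <stdio.h>

int main(void) {
    /* Prints the (rtk1, rtk2_3) offsets visited by the rolled loop; the
       output matches the former unrolled QUADRUPLE_ROUND sequence. */
    for (int i = 0; i < 10; i++)
        printf("QUADRUPLE_ROUND(state, rtk1+%d, rtk2_3+%d)\n",
               (i % 4) * 16, i * 16);
    return 0;
}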
@@ -260,6 +260,70 @@ void precompute_lfsr_tk3(u32* tk, const u8* key, const int rounds) {
 }
+/******************************************************************************
+* Precompute LFSR2(TK2) ^ LFSR3(TK3) for all round tweakeys.
+* It is equivalent to the following 2 function calls:
+* - precompute_lfsr_tk2(tk, t2, SKINNY128_384_ROUNDS);
+* - precompute_lfsr_tk3(tk, t3, SKINNY128_384_ROUNDS);
+* However 'precompute_lfsr_tk2_3' can save cycles on some platforms: on ARMv7
+* one should observe a gain of ~1k cycles per function call, explained by the
+* fact that fewer memory accesses to 'tk' are performed.
+*
+* To save some code size, the loop can be replaced by the following one:
+*     for(int i = 0 ; i < rounds; i+=2) {
+*         lfsr2_bs(tk2);
+*         lfsr3_bs(tk3);
+*         tk[i*4+4] = tk2[0] ^ tk3[0];
+*         tk[i*4+5] = tk2[1] ^ tk3[1];
+*         tk[i*4+6] = tk2[2] ^ tk3[2];
+*         tk[i*4+7] = tk2[3] ^ tk3[3];
+*     }
+* at the cost of some cycles (~260 on ARM Cortex-M).
+******************************************************************************/
+void precompute_lfsr_tk2_3(u32* tk, const u8* t2, const u8* t3, const int rounds) {
+    u32 tmp, tk2[4], tk3[4];
+    packing(tk2, t2);
+    packing(tk3, t3);
+    tk[0] = tk2[0] ^ tk3[0];
+    tk[1] = tk2[1] ^ tk3[1];
+    tk[2] = tk2[2] ^ tk3[2];
+    tk[3] = tk2[3] ^ tk3[3];
+    for(int i = 0 ; i < rounds; i+=8) {
+        tk2[0] ^= (tk2[2] & 0xaaaaaaaa);
+        tk2[0] = ((tk2[0] & 0xaaaaaaaa) >> 1) | ((tk2[0] << 1) & 0xaaaaaaaa);
+        tk3[3] ^= ((tk3[1] & 0xaaaaaaaa) >> 1);
+        tk3[3] = ((tk3[3] & 0xaaaaaaaa) >> 1) | ((tk3[3] << 1) & 0xaaaaaaaa);
+        tk[i*4+4] = tk2[1] ^ tk3[3];
+        tk[i*4+5] = tk2[2] ^ tk3[0];
+        tk[i*4+6] = tk2[3] ^ tk3[1];
+        tk[i*4+7] = tk2[0] ^ tk3[2];
+        tk2[1] ^= (tk2[3] & 0xaaaaaaaa);
+        tk2[1] = ((tk2[1] & 0xaaaaaaaa) >> 1) | ((tk2[1] << 1) & 0xaaaaaaaa);
+        tk3[2] ^= ((tk3[0] & 0xaaaaaaaa) >> 1);
+        tk3[2] = ((tk3[2] & 0xaaaaaaaa) >> 1) | ((tk3[2] << 1) & 0xaaaaaaaa);
+        tk[i*4+12] = tk2[2] ^ tk3[2];
+        tk[i*4+13] = tk2[3] ^ tk3[3];
+        tk[i*4+14] = tk2[0] ^ tk3[0];
+        tk[i*4+15] = tk2[1] ^ tk3[1];
+        tk2[2] ^= (tk2[0] & 0xaaaaaaaa);
+        tk2[2] = ((tk2[2] & 0xaaaaaaaa) >> 1) | ((tk2[2] << 1) & 0xaaaaaaaa);
+        tk3[1] ^= ((tk3[3] & 0xaaaaaaaa) >> 1);
+        tk3[1] = ((tk3[1] & 0xaaaaaaaa) >> 1) | ((tk3[1] << 1) & 0xaaaaaaaa);
+        tk[i*4+20] = tk2[3] ^ tk3[1];
+        tk[i*4+21] = tk2[0] ^ tk3[2];
+        tk[i*4+22] = tk2[1] ^ tk3[3];
+        tk[i*4+23] = tk2[2] ^ tk3[0];
+        tk2[3] ^= (tk2[1] & 0xaaaaaaaa);
+        tk2[3] = ((tk2[3] & 0xaaaaaaaa) >> 1) | ((tk2[3] << 1) & 0xaaaaaaaa);
+        tk3[0] ^= ((tk3[2] & 0xaaaaaaaa) >> 1);
+        tk3[0] = ((tk3[0] & 0xaaaaaaaa) >> 1) | ((tk3[0] << 1) & 0xaaaaaaaa);
+        tk[i*4+28] = tk2[0] ^ tk3[0];
+        tk[i*4+29] = tk2[1] ^ tk3[1];
+        tk[i*4+30] = tk2[2] ^ tk3[2];
+        tk[i*4+31] = tk2[3] ^ tk3[3];
+    }
+}
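The 0xaaaaaaaa masks above implement the SKINNY tweakey LFSRs on the bitsliced representation. Per the SKINNY specification, the byte-level recurrences are LFSR2: (x7..x0) -> (x6..x0, x7 xor x5) and LFSR3: (x7..x0) -> (x0 xor x6, x7..x1). A byte-wise reference for cross-checking against test vectors (an editor's sketch, not part of the commit):

/* --- editor's sketch, not part of the commit --- */
#include <stdint.h>

/* Byte-wise SKINNY tweakey LFSRs, per the SKINNY specification. */
static uint8_t lfsr2(uint8_t x) { /* (x7..x0) -> (x6..x0, x7^x5) */
    return (uint8_t)((x << 1) | (((x >> 7) ^ (x >> 5)) & 1));
}

static uint8_t lfsr3(uint8_t x) { /* (x7..x0) -> (x0^x6, x7..x1) */
    return (uint8_t)((x >> 1) | ((((x >> 6) ^ x) & 1) << 7));
}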
 /******************************************************************************
 * XOR TK with TK1 before applying the permutations.
 * The key is then rearranged to match the barrel shiftrows representation.
 ******************************************************************************/
@@ -267,19 +331,20 @@ void permute_tk(u32* tk, const u8* key, const int rounds) {
     u32 test;
     u32 tk1[4], tmp[4];
     packing(tk1, key);
-    memcpy(tmp, tk, 16);
-    tmp[0] ^= tk1[0];
-    tmp[1] ^= tk1[1];
-    tmp[2] ^= tk1[2];
-    tmp[3] ^= tk1[3];
+    tmp[0] = tk[0] ^ tk1[0];
+    tmp[1] = tk[1] ^ tk1[1];
+    tmp[2] = tk[2] ^ tk1[2];
+    tmp[3] = tk[3] ^ tk1[3];
     for(int i = 0 ; i < rounds; i += 8) {
         test = (i % 16 < 8) ? 1 : 0; //to apply the right power of P
         tk[i*4] = tmp[2] & 0xf0f0f0f0;
         tk[i*4+1] = tmp[3] & 0xf0f0f0f0;
         tk[i*4+2] = tmp[0] & 0xf0f0f0f0;
         tk[i*4+3] = tmp[1] & 0xf0f0f0f0;
-        memcpy(tmp, tk+i*4+4, 16);
-        XOR_BLOCKS(tmp, tk1);
+        tmp[0] = tk[i*4+4] ^ tk1[0];
+        tmp[1] = tk[i*4+5] ^ tk1[1];
+        tmp[2] = tk[i*4+6] ^ tk1[2];
+        tmp[3] = tk[i*4+7] ^ tk1[3];
         if (test)
             permute_tk_2(tmp); // applies P^2
         else
@@ -296,8 +361,10 @@ void permute_tk(u32* tk, const u8* key, const int rounds) {
         tk[i*4+10] |= ROR(tmp[0],12) & 0x0c0c0c0c;
         tk[i*4+11] = ROR(tmp[1],28) & 0x03030303;
         tk[i*4+11] |= ROR(tmp[1],12) & 0x0c0c0c0c;
-        memcpy(tmp, tk+i*4+12, 16);
-        XOR_BLOCKS(tmp, tk1);
+        tmp[0] = tk[i*4+12] ^ tk1[0];
+        tmp[1] = tk[i*4+13] ^ tk1[1];
+        tmp[2] = tk[i*4+14] ^ tk1[2];
+        tmp[3] = tk[i*4+15] ^ tk1[3];
         if (test)
             permute_tk_4(tmp); // applies P^4
         else
@@ -310,8 +377,10 @@ void permute_tk(u32* tk, const u8* key, const int rounds) {
         tk[i*4+17] = ROR(tmp[3], 16) & 0xf0f0f0f0;
         tk[i*4+18] = ROR(tmp[0], 16) & 0xf0f0f0f0;
         tk[i*4+19] = ROR(tmp[1], 16) & 0xf0f0f0f0;
-        memcpy(tmp, tk+i*4+20, 16);
-        XOR_BLOCKS(tmp, tk1);
+        tmp[0] = tk[i*4+20] ^ tk1[0];
+        tmp[1] = tk[i*4+21] ^ tk1[1];
+        tmp[2] = tk[i*4+22] ^ tk1[2];
+        tmp[3] = tk[i*4+23] ^ tk1[3];
         if (test)
             permute_tk_6(tmp); // applies P^6
         else
@@ -328,8 +397,10 @@ void permute_tk(u32* tk, const u8* key, const int rounds) {
         tk[i*4+26] |= ROR(tmp[0],28) & 0x0c0c0c0c;
         tk[i*4+27] = ROR(tmp[1],12) & 0x03030303;
         tk[i*4+27] |= ROR(tmp[1],28) & 0x0c0c0c0c;
-        memcpy(tmp, tk+i*4+28, 16);
-        XOR_BLOCKS(tmp, tk1);
+        tmp[0] = tk[i*4+28] ^ tk1[0];
+        tmp[1] = tk[i*4+29] ^ tk1[1];
+        tmp[2] = tk[i*4+30] ^ tk1[2];
+        tmp[3] = tk[i*4+31] ^ tk1[3];
         if (test)
             permute_tk_8(tmp); // applies P^8
         for(int j = 0; j < 4; j++) {
@@ -350,8 +421,7 @@ void permute_tk(u32* tk, const u8* key, const int rounds) {
 ******************************************************************************/
 void precompute_rtk2_3(u32* rtk, const u8* tk2, const u8 * tk3) {
     memset(rtk, 0x00, 16*SKINNY128_384_ROUNDS);
-    precompute_lfsr_tk2(rtk, tk2, SKINNY128_384_ROUNDS);
-    precompute_lfsr_tk3(rtk, tk3, SKINNY128_384_ROUNDS);
+    precompute_lfsr_tk2_3(rtk, tk2, tk3, SKINNY128_384_ROUNDS);
     permute_tk(rtk, (u8*)(rtk+8), SKINNY128_384_ROUNDS); // rtk+8 is NULL
     for(int i = 0; i < SKINNY128_384_ROUNDS; i++) { // add rconsts
         for(int j = 0; j < 4; j++)
...
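Since the comment on precompute_lfsr_tk2_3 states that the fused routine equals the two separate passes, that claim is easy to spot-check on a host machine. A sketch, assuming the two legacy routines are still linkable somewhere in the tree (they are not declared in tk_schedule.h) and that 40 rounds is the right count for the variant under test; neither assumption is guaranteed by this patch:

/* --- editor's sketch, not part of the commit --- */
#include <string.h>
#include "tk_schedule.h"

/* Legacy routines, assumed still linkable; declared here by hand. */
extern void precompute_lfsr_tk2(u32* tk, const u8* key, const int rounds);
extern void precompute_lfsr_tk3(u32* tk, const u8* key, const int rounds);
extern void precompute_lfsr_tk2_3(u32* tk, const u8* t2, const u8* t3, const int rounds);

#define ROUNDS 40 /* SKINNY-128-384+; use 56 for full SKINNY-128-384 */

int tk2_3_equivalent(const u8 t2[16], const u8 t3[16]) {
    u32 a[4*ROUNDS] = {0}, b[4*ROUNDS] = {0};
    precompute_lfsr_tk2(a, t2, ROUNDS);
    precompute_lfsr_tk3(a, t3, ROUNDS); /* composes by XOR, per the patch comment */
    precompute_lfsr_tk2_3(b, t2, t3, ROUNDS);
    return memcmp(a, b, sizeof a) == 0; /* 1 iff byte-identical */
}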
@@ -92,19 +92,7 @@ void skinny128_384(u8* ctext, const u8* ptext, const u32* rtk1, const u32* rtk2
     u32 tmp;               // used in SWAPMOVE macro
     u32 state[4];          // 128-bit state
     packing(state, ptext); // from byte to bitsliced representation
-    QUADRUPLE_ROUND(state, rtk1, rtk2_3);
-    QUADRUPLE_ROUND(state, rtk1+16, rtk2_3+16);
-    QUADRUPLE_ROUND(state, rtk1+32, rtk2_3+32);
-    QUADRUPLE_ROUND(state, rtk1+48, rtk2_3+48);
-    QUADRUPLE_ROUND(state, rtk1, rtk2_3+64);
-    QUADRUPLE_ROUND(state, rtk1+16, rtk2_3+80);
-    QUADRUPLE_ROUND(state, rtk1+32, rtk2_3+96);
-    QUADRUPLE_ROUND(state, rtk1+48, rtk2_3+112);
-    QUADRUPLE_ROUND(state, rtk1, rtk2_3+128);
-    QUADRUPLE_ROUND(state, rtk1+16, rtk2_3+144);
-    QUADRUPLE_ROUND(state, rtk1+32, rtk2_3+160);
-    QUADRUPLE_ROUND(state, rtk1+48, rtk2_3+176);
-    QUADRUPLE_ROUND(state, rtk1, rtk2_3+192);
-    QUADRUPLE_ROUND(state, rtk1+16, rtk2_3+208);
+    for(int i = 0; i < 14; i++)
+        QUADRUPLE_ROUND(state, rtk1 + (i%4)*16, rtk2_3 + i*16);
     unpacking(ctext, state); // from bitsliced to byte representation
 }
\ No newline at end of file
@@ -271,6 +271,70 @@ void precompute_lfsr_tk3(u32* tk, const u8* key, const int rounds) {
 }
+/******************************************************************************
+* Precompute LFSR2(TK2) ^ LFSR3(TK3) for all round tweakeys.
+* It is equivalent to the following 2 function calls:
+* - precompute_lfsr_tk2(tk, t2, SKINNY128_384_ROUNDS);
+* - precompute_lfsr_tk3(tk, t3, SKINNY128_384_ROUNDS);
+* However 'precompute_lfsr_tk2_3' can save cycles on some platforms: on ARMv7
+* one should observe a gain of ~1k cycles per function call, explained by the
+* fact that fewer memory accesses to 'tk' are performed.
+*
+* To save some code size, the loop can be replaced by the following one:
+*     for(int i = 0 ; i < rounds; i+=2) {
+*         lfsr2_bs(tk2);
+*         lfsr3_bs(tk3);
+*         tk[i*4+4] = tk2[0] ^ tk3[0];
+*         tk[i*4+5] = tk2[1] ^ tk3[1];
+*         tk[i*4+6] = tk2[2] ^ tk3[2];
+*         tk[i*4+7] = tk2[3] ^ tk3[3];
+*     }
+* at the cost of some cycles (~260 on ARM Cortex-M).
+******************************************************************************/
+void precompute_lfsr_tk2_3(u32* tk, const u8* t2, const u8* t3, const int rounds) {
+    u32 tmp, tk2[4], tk3[4];
+    packing(tk2, t2);
+    packing(tk3, t3);
+    tk[0] = tk2[0] ^ tk3[0];
+    tk[1] = tk2[1] ^ tk3[1];
+    tk[2] = tk2[2] ^ tk3[2];
+    tk[3] = tk2[3] ^ tk3[3];
+    for(int i = 0 ; i < rounds; i+=8) {
+        tk2[0] ^= (tk2[2] & 0xaaaaaaaa);
+        tk2[0] = ((tk2[0] & 0xaaaaaaaa) >> 1) | ((tk2[0] << 1) & 0xaaaaaaaa);
+        tk3[3] ^= ((tk3[1] & 0xaaaaaaaa) >> 1);
+        tk3[3] = ((tk3[3] & 0xaaaaaaaa) >> 1) | ((tk3[3] << 1) & 0xaaaaaaaa);
+        tk[i*4+4] = tk2[1] ^ tk3[3];
+        tk[i*4+5] = tk2[2] ^ tk3[0];
+        tk[i*4+6] = tk2[3] ^ tk3[1];
+        tk[i*4+7] = tk2[0] ^ tk3[2];
+        tk2[1] ^= (tk2[3] & 0xaaaaaaaa);
+        tk2[1] = ((tk2[1] & 0xaaaaaaaa) >> 1) | ((tk2[1] << 1) & 0xaaaaaaaa);
+        tk3[2] ^= ((tk3[0] & 0xaaaaaaaa) >> 1);
+        tk3[2] = ((tk3[2] & 0xaaaaaaaa) >> 1) | ((tk3[2] << 1) & 0xaaaaaaaa);
+        tk[i*4+12] = tk2[2] ^ tk3[2];
+        tk[i*4+13] = tk2[3] ^ tk3[3];
+        tk[i*4+14] = tk2[0] ^ tk3[0];
+        tk[i*4+15] = tk2[1] ^ tk3[1];
+        tk2[2] ^= (tk2[0] & 0xaaaaaaaa);
+        tk2[2] = ((tk2[2] & 0xaaaaaaaa) >> 1) | ((tk2[2] << 1) & 0xaaaaaaaa);
+        tk3[1] ^= ((tk3[3] & 0xaaaaaaaa) >> 1);
+        tk3[1] = ((tk3[1] & 0xaaaaaaaa) >> 1) | ((tk3[1] << 1) & 0xaaaaaaaa);
+        tk[i*4+20] = tk2[3] ^ tk3[1];
+        tk[i*4+21] = tk2[0] ^ tk3[2];
+        tk[i*4+22] = tk2[1] ^ tk3[3];
+        tk[i*4+23] = tk2[2] ^ tk3[0];
+        tk2[3] ^= (tk2[1] & 0xaaaaaaaa);
+        tk2[3] = ((tk2[3] & 0xaaaaaaaa) >> 1) | ((tk2[3] << 1) & 0xaaaaaaaa);
+        tk3[0] ^= ((tk3[2] & 0xaaaaaaaa) >> 1);
+        tk3[0] = ((tk3[0] & 0xaaaaaaaa) >> 1) | ((tk3[0] << 1) & 0xaaaaaaaa);
+        tk[i*4+28] = tk2[0] ^ tk3[0];
+        tk[i*4+29] = tk2[1] ^ tk3[1];
+        tk[i*4+30] = tk2[2] ^ tk3[2];
+        tk[i*4+31] = tk2[3] ^ tk3[3];
+    }
+}
 /******************************************************************************
 * XOR TK with TK1 before applying the permutations.
 * The key is then rearranged to match the barrel shiftrows representation.
 ******************************************************************************/
@@ -278,19 +342,20 @@ void permute_tk(u32* tk, const u8* key, const int rounds) {
     u32 test;
     u32 tk1[4], tmp[4];
     packing(tk1, key);
-    memcpy(tmp, tk, 16);
-    tmp[0] ^= tk1[0];
-    tmp[1] ^= tk1[1];
-    tmp[2] ^= tk1[2];
-    tmp[3] ^= tk1[3];
+    tmp[0] = tk[0] ^ tk1[0];
+    tmp[1] = tk[1] ^ tk1[1];
+    tmp[2] = tk[2] ^ tk1[2];
+    tmp[3] = tk[3] ^ tk1[3];
     for(int i = 0 ; i < rounds; i += 8) {
         test = (i % 16 < 8) ? 1 : 0; //to apply the right power of P
         tk[i*4] = tmp[2] & 0xf0f0f0f0;
         tk[i*4+1] = tmp[3] & 0xf0f0f0f0;
         tk[i*4+2] = tmp[0] & 0xf0f0f0f0;
         tk[i*4+3] = tmp[1] & 0xf0f0f0f0;
-        memcpy(tmp, tk+i*4+4, 16);
-        XOR_BLOCKS(tmp, tk1);
+        tmp[0] = tk[i*4+4] ^ tk1[0];
+        tmp[1] = tk[i*4+5] ^ tk1[1];
+        tmp[2] = tk[i*4+6] ^ tk1[2];
+        tmp[3] = tk[i*4+7] ^ tk1[3];
         if (test)
             permute_tk_2(tmp); // applies P^2
         else
@@ -307,8 +372,10 @@ void permute_tk(u32* tk, const u8* key, const int rounds) {
         tk[i*4+10] |= ROR(tmp[0],12) & 0x0c0c0c0c;
         tk[i*4+11] = ROR(tmp[1],28) & 0x03030303;
         tk[i*4+11] |= ROR(tmp[1],12) & 0x0c0c0c0c;
-        memcpy(tmp, tk+i*4+12, 16);
-        XOR_BLOCKS(tmp, tk1);
+        tmp[0] = tk[i*4+12] ^ tk1[0];
+        tmp[1] = tk[i*4+13] ^ tk1[1];
+        tmp[2] = tk[i*4+14] ^ tk1[2];
+        tmp[3] = tk[i*4+15] ^ tk1[3];
         if (test)
             permute_tk_4(tmp); // applies P^4
         else
@@ -321,8 +388,10 @@ void permute_tk(u32* tk, const u8* key, const int rounds) {
         tk[i*4+17] = ROR(tmp[3], 16) & 0xf0f0f0f0;
         tk[i*4+18] = ROR(tmp[0], 16) & 0xf0f0f0f0;
         tk[i*4+19] = ROR(tmp[1], 16) & 0xf0f0f0f0;
-        memcpy(tmp, tk+i*4+20, 16);
-        XOR_BLOCKS(tmp, tk1);
+        tmp[0] = tk[i*4+20] ^ tk1[0];
+        tmp[1] = tk[i*4+21] ^ tk1[1];
+        tmp[2] = tk[i*4+22] ^ tk1[2];
+        tmp[3] = tk[i*4+23] ^ tk1[3];
         if (test)
             permute_tk_6(tmp); // applies P^6
         else
@@ -339,8 +408,10 @@ void permute_tk(u32* tk, const u8* key, const int rounds) {
         tk[i*4+26] |= ROR(tmp[0],28) & 0x0c0c0c0c;
         tk[i*4+27] = ROR(tmp[1],12) & 0x03030303;
         tk[i*4+27] |= ROR(tmp[1],28) & 0x0c0c0c0c;
-        memcpy(tmp, tk+i*4+28, 16);
-        XOR_BLOCKS(tmp, tk1);
+        tmp[0] = tk[i*4+28] ^ tk1[0];
+        tmp[1] = tk[i*4+29] ^ tk1[1];
+        tmp[2] = tk[i*4+30] ^ tk1[2];
+        tmp[3] = tk[i*4+31] ^ tk1[3];
         if (test)
             permute_tk_8(tmp); // applies P^8
         for(int j = 0; j < 4; j++) {
@@ -361,8 +432,7 @@ void permute_tk(u32* tk, const u8* key, const int rounds) {
 ******************************************************************************/
 void precompute_rtk2_3(u32* rtk, const u8* tk2, const u8 * tk3) {
     memset(rtk, 0x00, 16*SKINNY128_384_ROUNDS);
-    precompute_lfsr_tk2(rtk, tk2, SKINNY128_384_ROUNDS);
-    precompute_lfsr_tk3(rtk, tk3, SKINNY128_384_ROUNDS);
+    precompute_lfsr_tk2_3(rtk, tk2, tk3, SKINNY128_384_ROUNDS);
     permute_tk(rtk, (u8*)(rtk+8), SKINNY128_384_ROUNDS); // rtk+8 is NULL
     for(int i = 0; i < SKINNY128_384_ROUNDS; i++) { // add rconsts
         for(int j = 0; j < 4; j++)
@@ -376,4 +446,4 @@ void precompute_rtk2_3(u32* rtk, const u8* tk2, const u8 * tk3) {
 void precompute_rtk1(u32* rtk1, const u8* tk1) {
     memset(rtk1, 0x00, 16*16);
     permute_tk(rtk1, tk1, 16);
 }
\ No newline at end of file
@@ -93,15 +93,7 @@ void skinny128_384_plus(u8* ctext, const u8* ptext, const u32* rtk1,
     u32 tmp;               // used in SWAPMOVE macro
     u32 state[4];          // 128-bit state
     packing(state, ptext); // from byte to bitsliced representation
-    QUADRUPLE_ROUND(state, rtk1, rtk2_3);
-    QUADRUPLE_ROUND(state, rtk1+16, rtk2_3+16);
-    QUADRUPLE_ROUND(state, rtk1+32, rtk2_3+32);
-    QUADRUPLE_ROUND(state, rtk1+48, rtk2_3+48);
-    QUADRUPLE_ROUND(state, rtk1, rtk2_3+64);
-    QUADRUPLE_ROUND(state, rtk1+16, rtk2_3+80);
-    QUADRUPLE_ROUND(state, rtk1+32, rtk2_3+96);
-    QUADRUPLE_ROUND(state, rtk1+48, rtk2_3+112);
-    QUADRUPLE_ROUND(state, rtk1, rtk2_3+128);
-    QUADRUPLE_ROUND(state, rtk1+16, rtk2_3+144);
+    for(int i = 0; i < 10; i++)
+        QUADRUPLE_ROUND(state, rtk1 + (i%4)*16, rtk2_3 + i*16);
     unpacking(ctext, state); // from bitsliced to byte representation
 }
\ No newline at end of file
@@ -260,6 +260,70 @@ void precompute_lfsr_tk3(u32* tk, const u8* key, const int rounds) {
 }
+/******************************************************************************
+* Precompute LFSR2(TK2) ^ LFSR3(TK3) for all round tweakeys.
+* It is equivalent to the following 2 function calls:
+* - precompute_lfsr_tk2(tk, t2, SKINNY128_384_ROUNDS);
+* - precompute_lfsr_tk3(tk, t3, SKINNY128_384_ROUNDS);
+* However 'precompute_lfsr_tk2_3' can save cycles on some platforms: on ARMv7
+* one should observe a gain of ~1k cycles per function call, explained by the
+* fact that fewer memory accesses to 'tk' are performed.
+*
+* To save some code size, the loop can be replaced by the following one:
+*     for(int i = 0 ; i < rounds; i+=2) {
+*         lfsr2_bs(tk2);
+*         lfsr3_bs(tk3);
+*         tk[i*4+4] = tk2[0] ^ tk3[0];
+*         tk[i*4+5] = tk2[1] ^ tk3[1];
+*         tk[i*4+6] = tk2[2] ^ tk3[2];
+*         tk[i*4+7] = tk2[3] ^ tk3[3];
+*     }
+* at the cost of some cycles (~260 on ARM Cortex-M).
+******************************************************************************/
+void precompute_lfsr_tk2_3(u32* tk, const u8* t2, const u8* t3, const int rounds) {
+    u32 tmp, tk2[4], tk3[4];
+    packing(tk2, t2);
+    packing(tk3, t3);
+    tk[0] = tk2[0] ^ tk3[0];
+    tk[1] = tk2[1] ^ tk3[1];
+    tk[2] = tk2[2] ^ tk3[2];
+    tk[3] = tk2[3] ^ tk3[3];
+    for(int i = 0 ; i < rounds; i+=8) {
+        tk2[0] ^= (tk2[2] & 0xaaaaaaaa);
+        tk2[0] = ((tk2[0] & 0xaaaaaaaa) >> 1) | ((tk2[0] << 1) & 0xaaaaaaaa);
+        tk3[3] ^= ((tk3[1] & 0xaaaaaaaa) >> 1);
+        tk3[3] = ((tk3[3] & 0xaaaaaaaa) >> 1) | ((tk3[3] << 1) & 0xaaaaaaaa);
+        tk[i*4+4] = tk2[1] ^ tk3[3];
+        tk[i*4+5] = tk2[2] ^ tk3[0];
+        tk[i*4+6] = tk2[3] ^ tk3[1];
+        tk[i*4+7] = tk2[0] ^ tk3[2];
+        tk2[1] ^= (tk2[3] & 0xaaaaaaaa);
+        tk2[1] = ((tk2[1] & 0xaaaaaaaa) >> 1) | ((tk2[1] << 1) & 0xaaaaaaaa);
+        tk3[2] ^= ((tk3[0] & 0xaaaaaaaa) >> 1);
+        tk3[2] = ((tk3[2] & 0xaaaaaaaa) >> 1) | ((tk3[2] << 1) & 0xaaaaaaaa);
+        tk[i*4+12] = tk2[2] ^ tk3[2];
+        tk[i*4+13] = tk2[3] ^ tk3[3];
+        tk[i*4+14] = tk2[0] ^ tk3[0];
+        tk[i*4+15] = tk2[1] ^ tk3[1];
+        tk2[2] ^= (tk2[0] & 0xaaaaaaaa);
+        tk2[2] = ((tk2[2] & 0xaaaaaaaa) >> 1) | ((tk2[2] << 1) & 0xaaaaaaaa);
+        tk3[1] ^= ((tk3[3] & 0xaaaaaaaa) >> 1);
+        tk3[1] = ((tk3[1] & 0xaaaaaaaa) >> 1) | ((tk3[1] << 1) & 0xaaaaaaaa);
+        tk[i*4+20] = tk2[3] ^ tk3[1];
+        tk[i*4+21] = tk2[0] ^ tk3[2];
+        tk[i*4+22] = tk2[1] ^ tk3[3];
+        tk[i*4+23] = tk2[2] ^ tk3[0];
+        tk2[3] ^= (tk2[1] & 0xaaaaaaaa);
+        tk2[3] = ((tk2[3] & 0xaaaaaaaa) >> 1) | ((tk2[3] << 1) & 0xaaaaaaaa);
+        tk3[0] ^= ((tk3[2] & 0xaaaaaaaa) >> 1);
+        tk3[0] = ((tk3[0] & 0xaaaaaaaa) >> 1) | ((tk3[0] << 1) & 0xaaaaaaaa);
+        tk[i*4+28] = tk2[0] ^ tk3[0];
+        tk[i*4+29] = tk2[1] ^ tk3[1];
+        tk[i*4+30] = tk2[2] ^ tk3[2];
+        tk[i*4+31] = tk2[3] ^ tk3[3];
+    }
+}
 /******************************************************************************
 * XOR TK with TK1 before applying the permutations.
 * The key is then rearranged to match the barrel shiftrows representation.
 ******************************************************************************/
@@ -267,19 +331,20 @@ void permute_tk(u32* tk, const u8* key, const int rounds) {
     u32 test;
     u32 tk1[4], tmp[4];
     packing(tk1, key);
-    memcpy(tmp, tk, 16);
-    tmp[0] ^= tk1[0];
-    tmp[1] ^= tk1[1];
-    tmp[2] ^= tk1[2];
-    tmp[3] ^= tk1[3];
+    tmp[0] = tk[0] ^ tk1[0];
+    tmp[1] = tk[1] ^ tk1[1];
+    tmp[2] = tk[2] ^ tk1[2];
+    tmp[3] = tk[3] ^ tk1[3];
     for(int i = 0 ; i < rounds; i += 8) {
         test = (i % 16 < 8) ? 1 : 0; //to apply the right power of P
         tk[i*4] = tmp[2] & 0xf0f0f0f0;
         tk[i*4+1] = tmp[3] & 0xf0f0f0f0;
         tk[i*4+2] = tmp[0] & 0xf0f0f0f0;
         tk[i*4+3] = tmp[1] & 0xf0f0f0f0;
-        memcpy(tmp, tk+i*4+4, 16);
-        XOR_BLOCKS(tmp, tk1);
+        tmp[0] = tk[i*4+4] ^ tk1[0];
+        tmp[1] = tk[i*4+5] ^ tk1[1];
+        tmp[2] = tk[i*4+6] ^ tk1[2];
+        tmp[3] = tk[i*4+7] ^ tk1[3];
         if (test)
             permute_tk_2(tmp); // applies P^2
         else
@@ -296,8 +361,10 @@ void permute_tk(u32* tk, const u8* key, const int rounds) {
         tk[i*4+10] |= ROR(tmp[0],12) & 0x0c0c0c0c;
         tk[i*4+11] = ROR(tmp[1],28) & 0x03030303;
         tk[i*4+11] |= ROR(tmp[1],12) & 0x0c0c0c0c;
-        memcpy(tmp, tk+i*4+12, 16);
-        XOR_BLOCKS(tmp, tk1);
+        tmp[0] = tk[i*4+12] ^ tk1[0];
+        tmp[1] = tk[i*4+13] ^ tk1[1];
+        tmp[2] = tk[i*4+14] ^ tk1[2];
+        tmp[3] = tk[i*4+15] ^ tk1[3];
         if (test)
             permute_tk_4(tmp); // applies P^4
         else
@@ -310,8 +377,10 @@ void permute_tk(u32* tk, const u8* key, const int rounds) {
         tk[i*4+17] = ROR(tmp[3], 16) & 0xf0f0f0f0;
         tk[i*4+18] = ROR(tmp[0], 16) & 0xf0f0f0f0;
         tk[i*4+19] = ROR(tmp[1], 16) & 0xf0f0f0f0;
-        memcpy(tmp, tk+i*4+20, 16);
-        XOR_BLOCKS(tmp, tk1);
+        tmp[0] = tk[i*4+20] ^ tk1[0];
+        tmp[1] = tk[i*4+21] ^ tk1[1];
+        tmp[2] = tk[i*4+22] ^ tk1[2];
+        tmp[3] = tk[i*4+23] ^ tk1[3];
         if (test)
             permute_tk_6(tmp); // applies P^6
         else
@@ -328,8 +397,10 @@ void permute_tk(u32* tk, const u8* key, const int rounds) {
         tk[i*4+26] |= ROR(tmp[0],28) & 0x0c0c0c0c;
         tk[i*4+27] = ROR(tmp[1],12) & 0x03030303;
         tk[i*4+27] |= ROR(tmp[1],28) & 0x0c0c0c0c;
-        memcpy(tmp, tk+i*4+28, 16);
-        XOR_BLOCKS(tmp, tk1);
+        tmp[0] = tk[i*4+28] ^ tk1[0];
+        tmp[1] = tk[i*4+29] ^ tk1[1];
+        tmp[2] = tk[i*4+30] ^ tk1[2];
+        tmp[3] = tk[i*4+31] ^ tk1[3];
         if (test)
             permute_tk_8(tmp); // applies P^8
         for(int j = 0; j < 4; j++) {
@@ -350,8 +421,7 @@ void permute_tk(u32* tk, const u8* key, const int rounds) {
 ******************************************************************************/
 void precompute_rtk2_3(u32* rtk, const u8* tk2, const u8 * tk3) {
     memset(rtk, 0x00, 16*SKINNY128_384_ROUNDS);
-    precompute_lfsr_tk2(rtk, tk2, SKINNY128_384_ROUNDS);
-    precompute_lfsr_tk3(rtk, tk3, SKINNY128_384_ROUNDS);
+    precompute_lfsr_tk2_3(rtk, tk2, tk3, SKINNY128_384_ROUNDS);
     permute_tk(rtk, (u8*)(rtk+8), SKINNY128_384_ROUNDS); // rtk+8 is NULL
     for(int i = 0; i < SKINNY128_384_ROUNDS; i++) { // add rconsts
         for(int j = 0; j < 4; j++)
...
@@ -92,19 +92,7 @@ void skinny128_384(u8* ctext, const u8* ptext, const u32* rtk1, const u32* rtk2
     u32 tmp;               // used in SWAPMOVE macro
     u32 state[4];          // 128-bit state
     packing(state, ptext); // from byte to bitsliced representation
-    QUADRUPLE_ROUND(state, rtk1, rtk2_3);
-    QUADRUPLE_ROUND(state, rtk1+16, rtk2_3+16);
-    QUADRUPLE_ROUND(state, rtk1+32, rtk2_3+32);
-    QUADRUPLE_ROUND(state, rtk1+48, rtk2_3+48);
-    QUADRUPLE_ROUND(state, rtk1, rtk2_3+64);
-    QUADRUPLE_ROUND(state, rtk1+16, rtk2_3+80);
-    QUADRUPLE_ROUND(state, rtk1+32, rtk2_3+96);
-    QUADRUPLE_ROUND(state, rtk1+48, rtk2_3+112);
-    QUADRUPLE_ROUND(state, rtk1, rtk2_3+128);
-    QUADRUPLE_ROUND(state, rtk1+16, rtk2_3+144);
-    QUADRUPLE_ROUND(state, rtk1+32, rtk2_3+160);
-    QUADRUPLE_ROUND(state, rtk1+48, rtk2_3+176);
-    QUADRUPLE_ROUND(state, rtk1, rtk2_3+192);
-    QUADRUPLE_ROUND(state, rtk1+16, rtk2_3+208);
+    for(int i = 0; i < 14; i++)
+        QUADRUPLE_ROUND(state, rtk1 + (i%4)*16, rtk2_3 + i*16);
     unpacking(ctext, state); // from bitsliced to byte representation
 }
\ No newline at end of file
@@ -271,6 +271,70 @@ void precompute_lfsr_tk3(u32* tk, const u8* key, const int rounds) {
 }
+/******************************************************************************
+* Precompute LFSR2(TK2) ^ LFSR3(TK3) for all round tweakeys.
+* It is equivalent to the following 2 function calls:
+* - precompute_lfsr_tk2(tk, t2, SKINNY128_384_ROUNDS);
+* - precompute_lfsr_tk3(tk, t3, SKINNY128_384_ROUNDS);
+* However 'precompute_lfsr_tk2_3' can save cycles on some platforms: on ARMv7
+* one should observe a gain of ~1k cycles per function call, explained by the
+* fact that fewer memory accesses to 'tk' are performed.
+*
+* To save some code size, the loop can be replaced by the following one:
+*     for(int i = 0 ; i < rounds; i+=2) {
+*         lfsr2_bs(tk2);
+*         lfsr3_bs(tk3);
+*         tk[i*4+4] = tk2[0] ^ tk3[0];
+*         tk[i*4+5] = tk2[1] ^ tk3[1];
+*         tk[i*4+6] = tk2[2] ^ tk3[2];
+*         tk[i*4+7] = tk2[3] ^ tk3[3];
+*     }
+* at the cost of some cycles (~260 on ARM Cortex-M).
+******************************************************************************/
+void precompute_lfsr_tk2_3(u32* tk, const u8* t2, const u8* t3, const int rounds) {
+    u32 tmp, tk2[4], tk3[4];
+    packing(tk2, t2);
+    packing(tk3, t3);
+    tk[0] = tk2[0] ^ tk3[0];
+    tk[1] = tk2[1] ^ tk3[1];
+    tk[2] = tk2[2] ^ tk3[2];
+    tk[3] = tk2[3] ^ tk3[3];
+    for(int i = 0 ; i < rounds; i+=8) {
+        tk2[0] ^= (tk2[2] & 0xaaaaaaaa);
+        tk2[0] = ((tk2[0] & 0xaaaaaaaa) >> 1) | ((tk2[0] << 1) & 0xaaaaaaaa);
+        tk3[3] ^= ((tk3[1] & 0xaaaaaaaa) >> 1);
+        tk3[3] = ((tk3[3] & 0xaaaaaaaa) >> 1) | ((tk3[3] << 1) & 0xaaaaaaaa);
+        tk[i*4+4] = tk2[1] ^ tk3[3];
+        tk[i*4+5] = tk2[2] ^ tk3[0];
+        tk[i*4+6] = tk2[3] ^ tk3[1];
+        tk[i*4+7] = tk2[0] ^ tk3[2];
+        tk2[1] ^= (tk2[3] & 0xaaaaaaaa);
+        tk2[1] = ((tk2[1] & 0xaaaaaaaa) >> 1) | ((tk2[1] << 1) & 0xaaaaaaaa);
+        tk3[2] ^= ((tk3[0] & 0xaaaaaaaa) >> 1);
+        tk3[2] = ((tk3[2] & 0xaaaaaaaa) >> 1) | ((tk3[2] << 1) & 0xaaaaaaaa);
+        tk[i*4+12] = tk2[2] ^ tk3[2];
+        tk[i*4+13] = tk2[3] ^ tk3[3];
+        tk[i*4+14] = tk2[0] ^ tk3[0];
+        tk[i*4+15] = tk2[1] ^ tk3[1];
+        tk2[2] ^= (tk2[0] & 0xaaaaaaaa);
+        tk2[2] = ((tk2[2] & 0xaaaaaaaa) >> 1) | ((tk2[2] << 1) & 0xaaaaaaaa);
+        tk3[1] ^= ((tk3[3] & 0xaaaaaaaa) >> 1);
+        tk3[1] = ((tk3[1] & 0xaaaaaaaa) >> 1) | ((tk3[1] << 1) & 0xaaaaaaaa);
+        tk[i*4+20] = tk2[3] ^ tk3[1];
+        tk[i*4+21] = tk2[0] ^ tk3[2];
+        tk[i*4+22] = tk2[1] ^ tk3[3];
+        tk[i*4+23] = tk2[2] ^ tk3[0];
+        tk2[3] ^= (tk2[1] & 0xaaaaaaaa);
+        tk2[3] = ((tk2[3] & 0xaaaaaaaa) >> 1) | ((tk2[3] << 1) & 0xaaaaaaaa);
+        tk3[0] ^= ((tk3[2] & 0xaaaaaaaa) >> 1);
+        tk3[0] = ((tk3[0] & 0xaaaaaaaa) >> 1) | ((tk3[0] << 1) & 0xaaaaaaaa);
+        tk[i*4+28] = tk2[0] ^ tk3[0];
+        tk[i*4+29] = tk2[1] ^ tk3[1];
+        tk[i*4+30] = tk2[2] ^ tk3[2];
+        tk[i*4+31] = tk2[3] ^ tk3[3];
+    }
+}
 /******************************************************************************
 * XOR TK with TK1 before applying the permutations.
 * The key is then rearranged to match the barrel shiftrows representation.
 ******************************************************************************/
@@ -278,19 +342,20 @@ void permute_tk(u32* tk, const u8* key, const int rounds) {
     u32 test;
     u32 tk1[4], tmp[4];
     packing(tk1, key);
-    memcpy(tmp, tk, 16);
-    tmp[0] ^= tk1[0];
-    tmp[1] ^= tk1[1];
-    tmp[2] ^= tk1[2];
-    tmp[3] ^= tk1[3];
+    tmp[0] = tk[0] ^ tk1[0];
+    tmp[1] = tk[1] ^ tk1[1];
+    tmp[2] = tk[2] ^ tk1[2];
+    tmp[3] = tk[3] ^ tk1[3];
     for(int i = 0 ; i < rounds; i += 8) {
         test = (i % 16 < 8) ? 1 : 0; //to apply the right power of P
         tk[i*4] = tmp[2] & 0xf0f0f0f0;
         tk[i*4+1] = tmp[3] & 0xf0f0f0f0;
         tk[i*4+2] = tmp[0] & 0xf0f0f0f0;
         tk[i*4+3] = tmp[1] & 0xf0f0f0f0;
-        memcpy(tmp, tk+i*4+4, 16);
-        XOR_BLOCKS(tmp, tk1);
+        tmp[0] = tk[i*4+4] ^ tk1[0];
+        tmp[1] = tk[i*4+5] ^ tk1[1];
+        tmp[2] = tk[i*4+6] ^ tk1[2];
+        tmp[3] = tk[i*4+7] ^ tk1[3];
         if (test)
             permute_tk_2(tmp); // applies P^2
         else
@@ -307,8 +372,10 @@ void permute_tk(u32* tk, const u8* key, const int rounds) {
         tk[i*4+10] |= ROR(tmp[0],12) & 0x0c0c0c0c;
         tk[i*4+11] = ROR(tmp[1],28) & 0x03030303;
         tk[i*4+11] |= ROR(tmp[1],12) & 0x0c0c0c0c;
-        memcpy(tmp, tk+i*4+12, 16);
-        XOR_BLOCKS(tmp, tk1);
+        tmp[0] = tk[i*4+12] ^ tk1[0];
+        tmp[1] = tk[i*4+13] ^ tk1[1];
+        tmp[2] = tk[i*4+14] ^ tk1[2];
+        tmp[3] = tk[i*4+15] ^ tk1[3];
         if (test)
             permute_tk_4(tmp); // applies P^4
         else
@@ -321,8 +388,10 @@ void permute_tk(u32* tk, const u8* key, const int rounds) {
         tk[i*4+17] = ROR(tmp[3], 16) & 0xf0f0f0f0;
         tk[i*4+18] = ROR(tmp[0], 16) & 0xf0f0f0f0;
         tk[i*4+19] = ROR(tmp[1], 16) & 0xf0f0f0f0;
-        memcpy(tmp, tk+i*4+20, 16);
-        XOR_BLOCKS(tmp, tk1);
+        tmp[0] = tk[i*4+20] ^ tk1[0];
+        tmp[1] = tk[i*4+21] ^ tk1[1];
+        tmp[2] = tk[i*4+22] ^ tk1[2];
+        tmp[3] = tk[i*4+23] ^ tk1[3];
         if (test)
             permute_tk_6(tmp); // applies P^6
         else
@@ -339,8 +408,10 @@ void permute_tk(u32* tk, const u8* key, const int rounds) {
         tk[i*4+26] |= ROR(tmp[0],28) & 0x0c0c0c0c;
         tk[i*4+27] = ROR(tmp[1],12) & 0x03030303;
         tk[i*4+27] |= ROR(tmp[1],28) & 0x0c0c0c0c;
-        memcpy(tmp, tk+i*4+28, 16);
-        XOR_BLOCKS(tmp, tk1);
+        tmp[0] = tk[i*4+28] ^ tk1[0];
+        tmp[1] = tk[i*4+29] ^ tk1[1];
+        tmp[2] = tk[i*4+30] ^ tk1[2];
+        tmp[3] = tk[i*4+31] ^ tk1[3];
         if (test)
             permute_tk_8(tmp); // applies P^8
         for(int j = 0; j < 4; j++) {
@@ -361,8 +432,7 @@ void permute_tk(u32* tk, const u8* key, const int rounds) {
 ******************************************************************************/
 void precompute_rtk2_3(u32* rtk, const u8* tk2, const u8 * tk3) {
     memset(rtk, 0x00, 16*SKINNY128_384_ROUNDS);
-    precompute_lfsr_tk2(rtk, tk2, SKINNY128_384_ROUNDS);
-    precompute_lfsr_tk3(rtk, tk3, SKINNY128_384_ROUNDS);
+    precompute_lfsr_tk2_3(rtk, tk2, tk3, SKINNY128_384_ROUNDS);
     permute_tk(rtk, (u8*)(rtk+8), SKINNY128_384_ROUNDS); // rtk+8 is NULL
     for(int i = 0; i < SKINNY128_384_ROUNDS; i++) { // add rconsts
         for(int j = 0; j < 4; j++)
@@ -376,4 +446,4 @@ void precompute_rtk2_3(u32* rtk, const u8* tk2, const u8 * tk3) {
 void precompute_rtk1(u32* rtk1, const u8* tk1) {
     memset(rtk1, 0x00, 16*16);
     permute_tk(rtk1, tk1, 16);
 }
\ No newline at end of file
@@ -8,12 +8,10 @@
  * @author Alexandre Adomnicai, Nanyang Technological University,
  *         alexandre.adomnicai@ntu.edu.sg
  *
- * @date May 2020
+ * @date June 2020
  ******************************************************************************/
-#include "skinny128.h"
 #include "skinnyaead.h"
 #include <string.h>
-#include <stdio.h>
 /******************************************************************************
 * x ^= y where x, y are 128-bit blocks (16 bytes array).
...
@@ -16,12 +16,9 @@
  * @author Alexandre Adomnicai, Nanyang Technological University,
  *         alexandre.adomnicai@ntu.edu.sg
  *
- * @date May 2020
+ * @date June 2020
  ******************************************************************************/
-#include <stdio.h>
-#include <string.h>
 #include "skinny128.h"
-#include "tk_schedule.h"
 /******************************************************************************
 * The MixColumns computation for rounds i such that (i % 4) == 0
@@ -153,16 +150,8 @@ void skinny128_384_plus_encrypt(u8* ctext, const u8* ptext, const u32* rtk1,
     u32 tmp;               // used in SWAPMOVE macro
     u32 state[4];          // 128-bit state
     packing(state, ptext); // from byte to bitsliced representation
-    QUADRUPLE_ROUND(state, rtk1, rtk2_3);
-    QUADRUPLE_ROUND(state, rtk1+16, rtk2_3+16);
-    QUADRUPLE_ROUND(state, rtk1+32, rtk2_3+32);
-    QUADRUPLE_ROUND(state, rtk1+48, rtk2_3+48);
-    QUADRUPLE_ROUND(state, rtk1, rtk2_3+64);
-    QUADRUPLE_ROUND(state, rtk1+16, rtk2_3+80);
-    QUADRUPLE_ROUND(state, rtk1+32, rtk2_3+96);
-    QUADRUPLE_ROUND(state, rtk1+48, rtk2_3+112);
-    QUADRUPLE_ROUND(state, rtk1, rtk2_3+128);
-    QUADRUPLE_ROUND(state, rtk1+16, rtk2_3+144);
+    for(int i = 0; i < 10; i++)
+        QUADRUPLE_ROUND(state, rtk1 + (i%4)*16, rtk2_3 + i*16);
     unpacking(ctext, state); // from bitsliced to byte representation
 }
@@ -176,15 +165,7 @@ void skinny128_384_plus_decrypt(u8* ctext, const u8* ptext, const u32* rtk1,
     u32 tmp;               // used in SWAPMOVE macro
     u32 state[4];          // 128-bit state
     packing(state, ptext); // from byte to bitsliced representation
-    INV_QUADRUPLE_ROUND(state, rtk1+16, rtk2_3+144);
-    INV_QUADRUPLE_ROUND(state, rtk1, rtk2_3+128);
-    INV_QUADRUPLE_ROUND(state, rtk1+48, rtk2_3+112);
-    INV_QUADRUPLE_ROUND(state, rtk1+32, rtk2_3+96);
-    INV_QUADRUPLE_ROUND(state, rtk1+16, rtk2_3+80);
-    INV_QUADRUPLE_ROUND(state, rtk1, rtk2_3+64);
-    INV_QUADRUPLE_ROUND(state, rtk1+48, rtk2_3+48);
-    INV_QUADRUPLE_ROUND(state, rtk1+32, rtk2_3+32);
-    INV_QUADRUPLE_ROUND(state, rtk1+16, rtk2_3+16);
-    INV_QUADRUPLE_ROUND(state, rtk1, rtk2_3);
+    for(int i = 9; i >= 0; i--)
+        INV_QUADRUPLE_ROUND(state, rtk1 + (i%4)*16, rtk2_3 + i*16);
     unpacking(ctext, state); // from bitsliced to byte representation
 }
\ No newline at end of file
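With both directions rolled into loops, a host-side round-trip test is a cheap sanity check. Sketch only: the 4-argument encrypt/decrypt prototypes are inferred from the hunk headers above, and the buffer sizes assume SKINNY-128-384+ (40 rounds, 16 round-tweakey bytes per round); none of this is guaranteed by the patch itself.

/* --- editor's sketch, not part of the commit --- */
#include <string.h>
#include "skinny128.h"
#include "tk_schedule.h"

int skinny128_384_plus_roundtrip(const u8 pt[16], const u8 tk1[16],
                                 const u8 tk2[16], const u8 tk3[16]) {
    static u32 rtk1[4*16];   /* 16 bytes x 16 rounds, as in precompute_rtk1 */
    static u32 rtk2_3[4*40]; /* 16 bytes x 40 rounds (SKINNY-128-384+) */
    u8 ct[16], back[16];
    precompute_rtk1(rtk1, tk1);
    precompute_rtk2_3(rtk2_3, tk2, tk3);
    skinny128_384_plus_encrypt(ct, pt, rtk1, rtk2_3);
    skinny128_384_plus_decrypt(back, ct, rtk1, rtk2_3);
    return memcmp(back, pt, 16) == 0; /* expect 1 */
}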
@@ -3,9 +3,7 @@
 #include "skinny128.h"
-typedef unsigned char u8;
-typedef unsigned int u32;
-typedef unsigned long long u64;
+typedef uint64_t u64;
 #define TAGBYTES 16
 #define KEYBYTES 16
...
@@ -4,16 +4,11 @@
  * @author Alexandre Adomnicai, Nanyang Technological University,
  *         alexandre.adomnicai@ntu.edu.sg
  *
- * @date May 2020
+ * @date June 2020
  ******************************************************************************/
-#include <stdio.h>
-#include <string.h> //for memcmp
-#include "tk_schedule.h"
+#include <string.h>
 #include "skinny128.h"
-typedef unsigned char u8;
-typedef unsigned int u32;
 /******************************************************************************
 * The round constants according to the new representation.
 ******************************************************************************/
@@ -260,6 +255,70 @@ void precompute_lfsr_tk3(u32* tk, const u8* key, const int rounds) {
 }
+/******************************************************************************
+* Precompute LFSR2(TK2) ^ LFSR3(TK3) for all round tweakeys.
+* It is equivalent to the following 2 function calls:
+* - precompute_lfsr_tk2(tk, t2, SKINNY128_384_ROUNDS);
+* - precompute_lfsr_tk3(tk, t3, SKINNY128_384_ROUNDS);
+* However 'precompute_lfsr_tk2_3' can save cycles on some platforms: on ARMv7
+* one should observe a gain of ~1k cycles per function call, explained by the
+* fact that fewer memory accesses to 'tk' are performed.
+*
+* To save some code size, the loop can be replaced by the following one:
+*     for(int i = 0 ; i < rounds; i+=2) {
+*         lfsr2_bs(tk2);
+*         lfsr3_bs(tk3);
+*         tk[i*4+4] = tk2[0] ^ tk3[0];
+*         tk[i*4+5] = tk2[1] ^ tk3[1];
+*         tk[i*4+6] = tk2[2] ^ tk3[2];
+*         tk[i*4+7] = tk2[3] ^ tk3[3];
+*     }
+* at the cost of some cycles (~260 on ARM Cortex-M).
+******************************************************************************/
+void precompute_lfsr_tk2_3(u32* tk, const u8* t2, const u8* t3, const int rounds) {
+    u32 tk2[4], tk3[4];
+    packing(tk2, t2);
+    packing(tk3, t3);
+    tk[0] = tk2[0] ^ tk3[0];
+    tk[1] = tk2[1] ^ tk3[1];
+    tk[2] = tk2[2] ^ tk3[2];
+    tk[3] = tk2[3] ^ tk3[3];
+    for(int i = 0 ; i < rounds; i+=8) {
+        tk2[0] ^= (tk2[2] & 0xaaaaaaaa);
+        tk2[0] = ((tk2[0] & 0xaaaaaaaa) >> 1) | ((tk2[0] << 1) & 0xaaaaaaaa);
+        tk3[3] ^= ((tk3[1] & 0xaaaaaaaa) >> 1);
+        tk3[3] = ((tk3[3] & 0xaaaaaaaa) >> 1) | ((tk3[3] << 1) & 0xaaaaaaaa);
+        tk[i*4+4] = tk2[1] ^ tk3[3];
+        tk[i*4+5] = tk2[2] ^ tk3[0];
+        tk[i*4+6] = tk2[3] ^ tk3[1];
+        tk[i*4+7] = tk2[0] ^ tk3[2];
+        tk2[1] ^= (tk2[3] & 0xaaaaaaaa);
+        tk2[1] = ((tk2[1] & 0xaaaaaaaa) >> 1) | ((tk2[1] << 1) & 0xaaaaaaaa);
+        tk3[2] ^= ((tk3[0] & 0xaaaaaaaa) >> 1);
+        tk3[2] = ((tk3[2] & 0xaaaaaaaa) >> 1) | ((tk3[2] << 1) & 0xaaaaaaaa);
+        tk[i*4+12] = tk2[2] ^ tk3[2];
+        tk[i*4+13] = tk2[3] ^ tk3[3];
+        tk[i*4+14] = tk2[0] ^ tk3[0];
+        tk[i*4+15] = tk2[1] ^ tk3[1];
+        tk2[2] ^= (tk2[0] & 0xaaaaaaaa);
+        tk2[2] = ((tk2[2] & 0xaaaaaaaa) >> 1) | ((tk2[2] << 1) & 0xaaaaaaaa);
+        tk3[1] ^= ((tk3[3] & 0xaaaaaaaa) >> 1);
+        tk3[1] = ((tk3[1] & 0xaaaaaaaa) >> 1) | ((tk3[1] << 1) & 0xaaaaaaaa);
+        tk[i*4+20] = tk2[3] ^ tk3[1];
+        tk[i*4+21] = tk2[0] ^ tk3[2];
+        tk[i*4+22] = tk2[1] ^ tk3[3];
+        tk[i*4+23] = tk2[2] ^ tk3[0];
+        tk2[3] ^= (tk2[1] & 0xaaaaaaaa);
+        tk2[3] = ((tk2[3] & 0xaaaaaaaa) >> 1) | ((tk2[3] << 1) & 0xaaaaaaaa);
+        tk3[0] ^= ((tk3[2] & 0xaaaaaaaa) >> 1);
+        tk3[0] = ((tk3[0] & 0xaaaaaaaa) >> 1) | ((tk3[0] << 1) & 0xaaaaaaaa);
+        tk[i*4+28] = tk2[0] ^ tk3[0];
+        tk[i*4+29] = tk2[1] ^ tk3[1];
+        tk[i*4+30] = tk2[2] ^ tk3[2];
+        tk[i*4+31] = tk2[3] ^ tk3[3];
+    }
+}
 /******************************************************************************
 * XOR TK with TK1 before applying the permutations.
 * The key is then rearranged to match the barrel shiftrows representation.
 ******************************************************************************/
@@ -267,19 +326,20 @@ void permute_tk(u32* tk, const u8* key, const int rounds) {
     u32 test;
     u32 tk1[4], tmp[4];
     packing(tk1, key);
-    memcpy(tmp, tk, 16);
-    tmp[0] ^= tk1[0];
-    tmp[1] ^= tk1[1];
-    tmp[2] ^= tk1[2];
-    tmp[3] ^= tk1[3];
+    tmp[0] = tk[0] ^ tk1[0];
+    tmp[1] = tk[1] ^ tk1[1];
+    tmp[2] = tk[2] ^ tk1[2];
+    tmp[3] = tk[3] ^ tk1[3];
     for(int i = 0 ; i < rounds; i += 8) {
         test = (i % 16 < 8) ? 1 : 0; //to apply the right power of P
         tk[i*4] = tmp[2] & 0xf0f0f0f0;
         tk[i*4+1] = tmp[3] & 0xf0f0f0f0;
         tk[i*4+2] = tmp[0] & 0xf0f0f0f0;
         tk[i*4+3] = tmp[1] & 0xf0f0f0f0;
-        memcpy(tmp, tk+i*4+4, 16);
-        XOR_BLOCKS(tmp, tk1);
+        tmp[0] = tk[i*4+4] ^ tk1[0];
+        tmp[1] = tk[i*4+5] ^ tk1[1];
+        tmp[2] = tk[i*4+6] ^ tk1[2];
+        tmp[3] = tk[i*4+7] ^ tk1[3];
         if (test)
             permute_tk_2(tmp); // applies P^2
         else
@@ -296,8 +356,10 @@ void permute_tk(u32* tk, const u8* key, const int rounds) {
         tk[i*4+10] |= ROR(tmp[0],12) & 0x0c0c0c0c;
         tk[i*4+11] = ROR(tmp[1],28) & 0x03030303;
         tk[i*4+11] |= ROR(tmp[1],12) & 0x0c0c0c0c;
-        memcpy(tmp, tk+i*4+12, 16);
-        XOR_BLOCKS(tmp, tk1);
+        tmp[0] = tk[i*4+12] ^ tk1[0];
+        tmp[1] = tk[i*4+13] ^ tk1[1];
+        tmp[2] = tk[i*4+14] ^ tk1[2];
+        tmp[3] = tk[i*4+15] ^ tk1[3];
         if (test)
             permute_tk_4(tmp); // applies P^4
         else
@@ -310,8 +372,10 @@ void permute_tk(u32* tk, const u8* key, const int rounds) {
         tk[i*4+17] = ROR(tmp[3], 16) & 0xf0f0f0f0;
         tk[i*4+18] = ROR(tmp[0], 16) & 0xf0f0f0f0;
         tk[i*4+19] = ROR(tmp[1], 16) & 0xf0f0f0f0;
-        memcpy(tmp, tk+i*4+20, 16);
-        XOR_BLOCKS(tmp, tk1);
+        tmp[0] = tk[i*4+20] ^ tk1[0];
+        tmp[1] = tk[i*4+21] ^ tk1[1];
+        tmp[2] = tk[i*4+22] ^ tk1[2];
+        tmp[3] = tk[i*4+23] ^ tk1[3];
         if (test)
             permute_tk_6(tmp); // applies P^6
         else
@@ -328,8 +392,10 @@ void permute_tk(u32* tk, const u8* key, const int rounds) {
         tk[i*4+26] |= ROR(tmp[0],28) & 0x0c0c0c0c;
         tk[i*4+27] = ROR(tmp[1],12) & 0x03030303;
         tk[i*4+27] |= ROR(tmp[1],28) & 0x0c0c0c0c;
-        memcpy(tmp, tk+i*4+28, 16);
-        XOR_BLOCKS(tmp, tk1);
+        tmp[0] = tk[i*4+28] ^ tk1[0];
+        tmp[1] = tk[i*4+29] ^ tk1[1];
+        tmp[2] = tk[i*4+30] ^ tk1[2];
+        tmp[3] = tk[i*4+31] ^ tk1[3];
         if (test)
             permute_tk_8(tmp); // applies P^8
         for(int j = 0; j < 4; j++) {
@@ -350,8 +416,7 @@ void permute_tk(u32* tk, const u8* key, const int rounds) {
 ******************************************************************************/
 void precompute_rtk2_3(u32* rtk, const u8* tk2, const u8 * tk3) {
     memset(rtk, 0x00, 16*SKINNY128_384_ROUNDS);
-    precompute_lfsr_tk2(rtk, tk2, SKINNY128_384_ROUNDS);
-    precompute_lfsr_tk3(rtk, tk3, SKINNY128_384_ROUNDS);
+    precompute_lfsr_tk2_3(rtk, tk2, tk3, SKINNY128_384_ROUNDS);
     permute_tk(rtk, (u8*)(rtk+8), SKINNY128_384_ROUNDS); // rtk+8 is NULL
     for(int i = 0; i < SKINNY128_384_ROUNDS; i++) { // add rconsts
         for(int j = 0; j < 4; j++)
...
 #ifndef TK_SCHEDULE_H_
 #define TK_SCHEDULE_H_
-typedef unsigned char u8;
-typedef unsigned int u32;
+#include <stdint.h>
+typedef uint8_t u8;
+typedef uint32_t u32;
 void packing(u32* out, const u8* in);
 void unpacking(u8* out, u32 *in);
 void precompute_rtk2_3(u32* rtk, const u8* tk2, const u8* tk3);
 void precompute_rtk1(u32* rtk1, const u8* tk1);
 #define ROR(x,y) (((x) >> (y)) | ((x) << (32 - (y))))
-#define XOR_BLOCKS(x,y) ({ \
-    (x)[0] ^= (y)[0]; \
-    (x)[1] ^= (y)[1]; \
-    (x)[2] ^= (y)[2]; \
-    (x)[3] ^= (y)[3]; \
-})
 #define SWAPMOVE(a, b, mask, n) ({ \
     tmp = (b ^ (a >> n)) & mask; \
...
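The switch to <stdint.h> matters because ROR hard-codes a 32-bit rotate; `unsigned int` merely happens to be 32 bits on these targets, while `uint32_t` guarantees it. A minimal compile-time guard one could add (editor's sketch, not part of the commit):

/* --- editor's sketch, not part of the commit --- */
#include <stdint.h>
typedef uint32_t u32;

#define ROR(x,y) (((x) >> (y)) | ((x) << (32 - (y))))

/* C11 guard making the width assumption explicit. */
_Static_assert(sizeof(u32) * 8 == 32, "ROR hard-codes a 32-bit rotate");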
@@ -8,12 +8,10 @@
  * @author Alexandre Adomnicai, Nanyang Technological University,
  *         alexandre.adomnicai@ntu.edu.sg
  *
- * @date May 2020
+ * @date June 2020
  ******************************************************************************/
-#include "skinny128.h"
 #include "skinnyaead.h"
 #include <string.h>
-#include <stdio.h>
 /******************************************************************************
 * x ^= y where x, y are 128-bit blocks (16 bytes array).
...@@ -33,13 +31,13 @@ static void skinny_aead_m1_auth(u8* auth, u8* c, u8* tag, tweakey* tk, ...@@ -33,13 +31,13 @@ static void skinny_aead_m1_auth(u8* auth, u8* c, u8* tag, tweakey* tk,
u8 feedback; u8 feedback;
u8 tmp[2*BLOCKBYTES]; u8 tmp[2*BLOCKBYTES];
memset(tmp, 0x00, 2*BLOCKBYTES); memset(tmp, 0x00, 2*BLOCKBYTES);
memset(auth, 0x00, BLOCKBYTES);
SET_DOMAIN(tmp, 0x02); SET_DOMAIN(tmp, 0x02);
SET_DOMAIN(tmp + BLOCKBYTES, 0x02);
memset(auth, 0x00, BLOCKBYTES);
while (adlen >= 2*BLOCKBYTES) { while (adlen >= 2*BLOCKBYTES) {
LE_STR_64(tmp, lfsr); LE_STR_64(tmp, lfsr);
UPDATE_LFSR(lfsr); UPDATE_LFSR(lfsr);
LE_STR_64(tmp + BLOCKBYTES, lfsr); LE_STR_64(tmp + BLOCKBYTES, lfsr);
SET_DOMAIN(tmp + BLOCKBYTES, 0x02);
precompute_rtk1(tk->rtk1, tmp, tmp+BLOCKBYTES); precompute_rtk1(tk->rtk1, tmp, tmp+BLOCKBYTES);
skinny128_384_plus_encrypt(tmp, tmp+BLOCKBYTES, ad, ad+BLOCKBYTES, *tk); skinny128_384_plus_encrypt(tmp, tmp+BLOCKBYTES, ad, ad+BLOCKBYTES, *tk);
xor_block(auth, tmp); xor_block(auth, tmp);
...@@ -47,6 +45,9 @@ static void skinny_aead_m1_auth(u8* auth, u8* c, u8* tag, tweakey* tk, ...@@ -47,6 +45,9 @@ static void skinny_aead_m1_auth(u8* auth, u8* c, u8* tag, tweakey* tk,
adlen -= 2*BLOCKBYTES; adlen -= 2*BLOCKBYTES;
ad += 2*BLOCKBYTES; ad += 2*BLOCKBYTES;
UPDATE_LFSR(lfsr); UPDATE_LFSR(lfsr);
memset(tmp, 0x00, 2*BLOCKBYTES); // to save 32 bytes of RAM
SET_DOMAIN(tmp, 0x02);
SET_DOMAIN(tmp + BLOCKBYTES, 0x02);
} }
if (adlen > BLOCKBYTES) { // pad and process 2 blocks in parallel if (adlen > BLOCKBYTES) { // pad and process 2 blocks in parallel
LE_STR_64(tmp, lfsr); LE_STR_64(tmp, lfsr);
...@@ -65,11 +66,12 @@ static void skinny_aead_m1_auth(u8* auth, u8* c, u8* tag, tweakey* tk, ...@@ -65,11 +66,12 @@ static void skinny_aead_m1_auth(u8* auth, u8* c, u8* tag, tweakey* tk,
LE_STR_64(tmp, lfsr); LE_STR_64(tmp, lfsr);
if (mlen == 0) { // if tag has *NOT* been calculated yet if (mlen == 0) { // if tag has *NOT* been calculated yet
precompute_rtk1(tk->rtk1, tmp, tag); // compute the tag precompute_rtk1(tk->rtk1, tmp, tag); // compute the tag
skinny128_384_plus_encrypt(auth, c, ad, c, *tk); skinny128_384_plus_encrypt(tmp, c, ad, c, *tk);
} else { // if tag has already been calculated } else { // if tag has already been calculated
precompute_rtk1(tk->rtk1, tmp, tmp); // process last ad block precompute_rtk1(tk->rtk1, tmp, tmp); // process last ad block
skinny128_384_plus_encrypt(auth, auth, ad, ad, *tk); skinny128_384_plus_encrypt(tmp, tmp, ad, ad, *tk);
} }
xor_block(auth, tmp);
} else if (adlen > 0) { } else if (adlen > 0) {
LE_STR_64(tmp, lfsr); LE_STR_64(tmp, lfsr);
SET_DOMAIN(tmp, 0x03); // domain for padding ad SET_DOMAIN(tmp, 0x03); // domain for padding ad
...@@ -78,11 +80,12 @@ static void skinny_aead_m1_auth(u8* auth, u8* c, u8* tag, tweakey* tk, ...@@ -78,11 +80,12 @@ static void skinny_aead_m1_auth(u8* auth, u8* c, u8* tag, tweakey* tk,
tmp[BLOCKBYTES + adlen] ^= 0x80; // padding tmp[BLOCKBYTES + adlen] ^= 0x80; // padding
if (mlen == 0) { // if tag has *NOT* been calculated yet if (mlen == 0) { // if tag has *NOT* been calculated yet
precompute_rtk1(tk->rtk1, tmp, tag); // compute the tag precompute_rtk1(tk->rtk1, tmp, tag); // compute the tag
skinny128_384_plus_encrypt(auth, c, tmp + BLOCKBYTES, c, *tk); skinny128_384_plus_encrypt(tmp, c, tmp + BLOCKBYTES, c, *tk);
} else { // if tag has already been calculated } else { // if tag has already been calculated
precompute_rtk1(tk->rtk1, tmp, tmp); // process last ad block precompute_rtk1(tk->rtk1, tmp, tmp); // process last ad block
skinny128_384_plus_encrypt(auth, auth, tmp + BLOCKBYTES, tmp + BLOCKBYTES, *tk); skinny128_384_plus_encrypt(tmp, tmp, tmp + BLOCKBYTES, tmp + BLOCKBYTES, *tk);
} }
xor_block(auth, tmp);
} }
} }
...@@ -290,4 +293,4 @@ int crypto_aead_decrypt (unsigned char *m, unsigned long long *mlen, ...@@ -290,4 +293,4 @@ int crypto_aead_decrypt (unsigned char *m, unsigned long long *mlen,
feedback |= sum[i] ^ c[i]; // constant-time tag verification feedback |= sum[i] ^ c[i]; // constant-time tag verification
return feedback; return feedback;
// ----------------- Process the associated data ----------------- // ----------------- Process the associated data -----------------
} }
\ No newline at end of file
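The tag check above ORs every byte difference into 'feedback' before returning, so execution time does not depend on where a mismatch occurs. A minimal standalone sketch of the same pattern (verify_tag is a hypothetical harness; TAGBYTES is 16 as in skinnyaead.h):

#include <stdint.h>
// Returns 0 iff the two 16-byte tags match. Constant time: no early exit
// on the first differing byte, so timing leaks nothing about the tag.
static int verify_tag(const uint8_t sum[16], const uint8_t c[16]) {
    uint8_t feedback = 0;
    for (int i = 0; i < 16; i++)
        feedback |= sum[i] ^ c[i]; // accumulate all byte differences
    return feedback;               // nonzero on any mismatch
}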
...@@ -16,12 +16,9 @@ ...@@ -16,12 +16,9 @@
* @author Alexandre Adomnicai, Nanyang Technological University, * @author Alexandre Adomnicai, Nanyang Technological University,
* alexandre.adomnicai@ntu.edu.sg * alexandre.adomnicai@ntu.edu.sg
* *
* @date May 2020 * @date June 2020
******************************************************************************/ ******************************************************************************/
#include <stdio.h>
#include <string.h>
#include "skinny128.h" #include "skinny128.h"
#include "tk_schedule.h"
/**************************************************************************** /****************************************************************************
* The MixColumns operation for rounds i such that (i % 4) == 0. * The MixColumns operation for rounds i such that (i % 4) == 0.
...@@ -84,7 +81,7 @@ void mixcolumns_3(u32* state) { ...@@ -84,7 +81,7 @@ void mixcolumns_3(u32* state) {
} }
/**************************************************************************** /****************************************************************************
* The inverse MixColumns oepration for rounds i such that (i % 4) == 0 * The inverse MixColumns operation for rounds i such that (i % 4) == 0
****************************************************************************/ ****************************************************************************/
void inv_mixcolumns_0(u32* state) { void inv_mixcolumns_0(u32* state) {
u32 tmp; u32 tmp;
...@@ -99,7 +96,7 @@ void inv_mixcolumns_0(u32* state) { ...@@ -99,7 +96,7 @@ void inv_mixcolumns_0(u32* state) {
} }
/**************************************************************************** /****************************************************************************
* The inverse MixColumns oepration for rounds i such that (i % 4) == 1 * The inverse MixColumns operation for rounds i such that (i % 4) == 1
****************************************************************************/ ****************************************************************************/
void inv_mixcolumns_1(u32* state) { void inv_mixcolumns_1(u32* state) {
u32 tmp; u32 tmp;
...@@ -114,7 +111,7 @@ void inv_mixcolumns_1(u32* state) { ...@@ -114,7 +111,7 @@ void inv_mixcolumns_1(u32* state) {
} }
/**************************************************************************** /****************************************************************************
* The inverse MixColumns oepration for rounds i such that (i % 4) == 2 * The inverse MixColumns operation for rounds i such that (i % 4) == 2
****************************************************************************/ ****************************************************************************/
void inv_mixcolumns_2(u32* state) { void inv_mixcolumns_2(u32* state) {
u32 tmp; u32 tmp;
...@@ -129,7 +126,7 @@ void inv_mixcolumns_2(u32* state) { ...@@ -129,7 +126,7 @@ void inv_mixcolumns_2(u32* state) {
} }
/**************************************************************************** /****************************************************************************
* The inverse MixColumns oepration for rounds i such that (i % 4) == 3 * The inverse MixColumns operation for rounds i such that (i % 4) == 3
****************************************************************************/ ****************************************************************************/
void inv_mixcolumns_3(u32* state) { void inv_mixcolumns_3(u32* state) {
u32 tmp; u32 tmp;
...@@ -166,16 +163,8 @@ void skinny128_384_plus_encrypt(u8* ctext, u8* ctext_bis, const u8* ptext, ...@@ -166,16 +163,8 @@ void skinny128_384_plus_encrypt(u8* ctext, u8* ctext_bis, const u8* ptext,
const u8* ptext_bis, const tweakey tk) { const u8* ptext_bis, const tweakey tk) {
u32 state[8]; u32 state[8];
packing(state, ptext, ptext_bis); packing(state, ptext, ptext_bis);
QUADRUPLE_ROUND(state, tk.rtk1, tk.rtk2_3); for(int i = 0; i < 10; i++)
QUADRUPLE_ROUND(state, tk.rtk1+32, tk.rtk2_3+32); QUADRUPLE_ROUND(state, tk.rtk1 + (i%4)*32, tk.rtk2_3 + i*32);
QUADRUPLE_ROUND(state, tk.rtk1+64, tk.rtk2_3+64);
QUADRUPLE_ROUND(state, tk.rtk1+96, tk.rtk2_3+96);
QUADRUPLE_ROUND(state, tk.rtk1, tk.rtk2_3+128);
QUADRUPLE_ROUND(state, tk.rtk1+32, tk.rtk2_3+160);
QUADRUPLE_ROUND(state, tk.rtk1+64, tk.rtk2_3+192);
QUADRUPLE_ROUND(state, tk.rtk1+96, tk.rtk2_3+224);
QUADRUPLE_ROUND(state, tk.rtk1, tk.rtk2_3+256);
QUADRUPLE_ROUND(state, tk.rtk1+32, tk.rtk2_3+288);
unpacking(ctext, ctext_bis, state); unpacking(ctext, ctext_bis, state);
} }
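This rolled-up loop relies on the round-tweakey layout: rtk2_3 stores material for all 40 rounds, whereas rtk1 stores only 16 rounds' worth (TK1 is never LFSR-updated and the tweakey permutation has order 16, cf. precompute_rtk1), so rtk1 is reused cyclically every four quadruple rounds; the decryption loop below mirrors this in reverse. An annotated restatement of the indexing (the locals r1/r23 are illustrative):

for (int i = 0; i < 10; i++) {                 // 10 quadruple rounds = 40 rounds
    const u32 *r1  = tk.rtk1   + (i % 4) * 32; // rtk1 cycles with period 4
    const u32 *r23 = tk.rtk2_3 + i * 32;       // rtk2_3 advances linearly
    QUADRUPLE_ROUND(state, r1, r23);
}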
...@@ -188,15 +177,7 @@ void skinny128_384_plus_decrypt(u8* ptext, u8* ptext_bis, const u8* ctext, ...@@ -188,15 +177,7 @@ void skinny128_384_plus_decrypt(u8* ptext, u8* ptext_bis, const u8* ctext,
const u8* ctext_bis, const tweakey tk) { const u8* ctext_bis, const tweakey tk) {
u32 state[8]; u32 state[8];
packing(state, ctext, ctext_bis); packing(state, ctext, ctext_bis);
INV_QUADRUPLE_ROUND(state, tk.rtk1+32, tk.rtk2_3+288); for(int i = 9; i >= 0; i--)
INV_QUADRUPLE_ROUND(state, tk.rtk1, tk.rtk2_3+256); INV_QUADRUPLE_ROUND(state, tk.rtk1 + (i%4)*32, tk.rtk2_3 + i*32);
INV_QUADRUPLE_ROUND(state, tk.rtk1+96, tk.rtk2_3+224);
INV_QUADRUPLE_ROUND(state, tk.rtk1+64, tk.rtk2_3+192);
INV_QUADRUPLE_ROUND(state, tk.rtk1+32, tk.rtk2_3+160);
INV_QUADRUPLE_ROUND(state, tk.rtk1, tk.rtk2_3+128);
INV_QUADRUPLE_ROUND(state, tk.rtk1+96, tk.rtk2_3+96);
INV_QUADRUPLE_ROUND(state, tk.rtk1+64, tk.rtk2_3+64);
INV_QUADRUPLE_ROUND(state, tk.rtk1+32, tk.rtk2_3+32);
INV_QUADRUPLE_ROUND(state, tk.rtk1, tk.rtk2_3);
unpacking(ptext, ptext_bis, state); unpacking(ptext, ptext_bis, state);
} }
\ No newline at end of file
#ifndef SKINNY128_H_ #ifndef SKINNY128_H_
#define SKINNY128_H_ #define SKINNY128_H_
#include "tk_schedule.h" #include "tk_schedule.h"
void skinny128_384_plus_encrypt(u8* ctext, u8* ctext_bis, const u8* ptext, void skinny128_384_plus_encrypt(u8* ctext, u8* ctext_bis, const u8* ptext,
......
...@@ -3,9 +3,7 @@ ...@@ -3,9 +3,7 @@
#include "skinny128.h" #include "skinny128.h"
typedef unsigned char u8; typedef uint64_t u64;
typedef unsigned int u32;
typedef unsigned long long u64;
#define TAGBYTES 16 #define TAGBYTES 16
#define KEYBYTES 16 #define KEYBYTES 16
......
...@@ -7,15 +7,11 @@ ...@@ -7,15 +7,11 @@
* @author Alexandre Adomnicai, Nanyang Technological University, * @author Alexandre Adomnicai, Nanyang Technological University,
* alexandre.adomnicai@ntu.edu.sg * alexandre.adomnicai@ntu.edu.sg
* *
* @date May 2020 * @date June 2020
*******************************************************************************/ *******************************************************************************/
#include <stdio.h>
#include <string.h> #include <string.h>
#include "tk_schedule.h" #include "tk_schedule.h"
typedef unsigned char u8;
typedef unsigned int u32;
/**************************************************************************** /****************************************************************************
* The round constants according to the fixsliced representation. * The round constants according to the fixsliced representation.
****************************************************************************/ ****************************************************************************/
......
#ifndef TK_SCHEDULE_BS_H_ #ifndef TK_SCHEDULE_BS_H_
#define TK_SCHEDULE_BS_H_ #define TK_SCHEDULE_BS_H_
typedef unsigned char u8; #include <stdint.h>
typedef unsigned int u32;
typedef uint8_t u8;
typedef uint32_t u32;
typedef struct { typedef struct {
u32 rtk1[8*16]; u32 rtk1[8*16];
......
/****************************************************************************** /******************************************************************************
* Constant-time implementation of SKINNY-AEAD-M1 (v1.1). * Constant-time implementation of SKINNY-AEAD-M1 (v1).
*
* Two blocks are treated in parallel with SKINNY-128-384 whenever possible.
* *
* For more details, see the paper at: https:// * For more details, see the paper at: https://
* *
* @author Alexandre Adomnicai, Nanyang Technological University, * @author Alexandre Adomnicai, Nanyang Technological University,
* alexandre.adomnicai@ntu.edu.sg * alexandre.adomnicai@ntu.edu.sg
* *
* @date May 2020 * @date June 2020
******************************************************************************/ ******************************************************************************/
#include "skinny128.h"
#include "skinnyaead.h" #include "skinnyaead.h"
#include <string.h> #include <string.h>
#include <stdio.h>
/****************************************************************************** /******************************************************************************
* x ^= y where x, y are 128-bit blocks (16 bytes array). * x ^= y where x, y are 128-bit blocks (16 bytes array).
...@@ -75,12 +71,6 @@ int crypto_aead_encrypt (unsigned char *c, unsigned long long *clen, ...@@ -75,12 +71,6 @@ int crypto_aead_encrypt (unsigned char *c, unsigned long long *clen,
} }
LE_STR_64(tmp, lfsr); // lfsr for tag computation LE_STR_64(tmp, lfsr); // lfsr for tag computation
precompute_rtk1(rtk1, tmp); precompute_rtk1(rtk1, tmp);
for(int i = 0; i < 16; i++) {
printf("%08x %08x %08x %08x\n",rtk1[i*4], rtk1[i*4+1],rtk1[i*4+2],rtk1[i*4+3]);
}
for(int i = 0; i < 56; i++) {
printf("%08x %08x %08x %08x\n",rtk2_3[i*4], rtk2_3[i*4+1],rtk2_3[i*4+2],rtk2_3[i*4+3]);
}
skinny128_384_encrypt(c, c, rtk1, rtk2_3); // compute the tag skinny128_384_encrypt(c, c, rtk1, rtk2_3); // compute the tag
// ----------------- Process the plaintext ----------------- // ----------------- Process the plaintext -----------------
...@@ -200,4 +190,4 @@ int crypto_aead_decrypt (unsigned char *m, unsigned long long *mlen, ...@@ -200,4 +190,4 @@ int crypto_aead_decrypt (unsigned char *m, unsigned long long *mlen,
feedback |= sum[i] ^ c[i]; // constant-time tag verification feedback |= sum[i] ^ c[i]; // constant-time tag verification
return feedback; return feedback;
// ----------------- Process the associated data ----------------- // ----------------- Process the associated data -----------------
} }
\ No newline at end of file
...@@ -16,12 +16,9 @@ ...@@ -16,12 +16,9 @@
* @author Alexandre Adomnicai, Nanyang Technological University, * @author Alexandre Adomnicai, Nanyang Technological University,
* alexandre.adomnicai@ntu.edu.sg * alexandre.adomnicai@ntu.edu.sg
* *
* @date May 2020 * @date June 2020
******************************************************************************/ ******************************************************************************/
#include <stdio.h>
#include <string.h>
#include "skinny128.h" #include "skinny128.h"
#include "tk_schedule.h"
/****************************************************************************** /******************************************************************************
* The MixColumns computation for rounds i such that (i % 4) == 0 * The MixColumns computation for rounds i such that (i % 4) == 0
...@@ -153,20 +150,8 @@ void skinny128_384_encrypt(u8* ctext, const u8* ptext, const u32* rtk1, ...@@ -153,20 +150,8 @@ void skinny128_384_encrypt(u8* ctext, const u8* ptext, const u32* rtk1,
u32 tmp; // used in SWAPMOVE macro u32 tmp; // used in SWAPMOVE macro
u32 state[4]; // 128-bit state u32 state[4]; // 128-bit state
packing(state, ptext); // from byte to bitsliced representation packing(state, ptext); // from byte to bitsliced representation
QUADRUPLE_ROUND(state, rtk1, rtk2_3); for(int i = 0; i < 14; i++)
QUADRUPLE_ROUND(state, rtk1+16, rtk2_3+16); QUADRUPLE_ROUND(state, rtk1 + (i%4)*16, rtk2_3 + i*16);
QUADRUPLE_ROUND(state, rtk1+32, rtk2_3+32);
QUADRUPLE_ROUND(state, rtk1+48, rtk2_3+48);
QUADRUPLE_ROUND(state, rtk1, rtk2_3+64);
QUADRUPLE_ROUND(state, rtk1+16, rtk2_3+80);
QUADRUPLE_ROUND(state, rtk1+32, rtk2_3+96);
QUADRUPLE_ROUND(state, rtk1+48, rtk2_3+112);
QUADRUPLE_ROUND(state, rtk1, rtk2_3+128);
QUADRUPLE_ROUND(state, rtk1+16, rtk2_3+144);
QUADRUPLE_ROUND(state, rtk1+32, rtk2_3+160);
QUADRUPLE_ROUND(state, rtk1+48, rtk2_3+176);
QUADRUPLE_ROUND(state, rtk1, rtk2_3+192);
QUADRUPLE_ROUND(state, rtk1+16, rtk2_3+208);
unpacking(ctext, state); // from bitsliced to byte representation unpacking(ctext, state); // from bitsliced to byte representation
} }
...@@ -180,19 +165,7 @@ void skinny128_384_decrypt(u8* ctext, const u8* ptext, const u32* rtk1, ...@@ -180,19 +165,7 @@ void skinny128_384_decrypt(u8* ctext, const u8* ptext, const u32* rtk1,
u32 tmp; // used in SWAPMOVE macro u32 tmp; // used in SWAPMOVE macro
u32 state[4]; // 128-bit state u32 state[4]; // 128-bit state
packing(state, ptext); // from byte to bitsliced representation packing(state, ptext); // from byte to bitsliced representation
INV_QUADRUPLE_ROUND(state, rtk1+16, rtk2_3+208); for(int i = 13; i >= 0; i--)
INV_QUADRUPLE_ROUND(state, rtk1, rtk2_3+192); INV_QUADRUPLE_ROUND(state, rtk1 + (i%4)*16, rtk2_3 + i*16);
INV_QUADRUPLE_ROUND(state, rtk1+48, rtk2_3+176);
INV_QUADRUPLE_ROUND(state, rtk1+32, rtk2_3+160);
INV_QUADRUPLE_ROUND(state, rtk1+16, rtk2_3+144);
INV_QUADRUPLE_ROUND(state, rtk1, rtk2_3+128);
INV_QUADRUPLE_ROUND(state, rtk1+48, rtk2_3+112);
INV_QUADRUPLE_ROUND(state, rtk1+32, rtk2_3+96);
INV_QUADRUPLE_ROUND(state, rtk1+16, rtk2_3+80);
INV_QUADRUPLE_ROUND(state, rtk1, rtk2_3+64);
INV_QUADRUPLE_ROUND(state, rtk1+48, rtk2_3+48);
INV_QUADRUPLE_ROUND(state, rtk1+32, rtk2_3+32);
INV_QUADRUPLE_ROUND(state, rtk1+16, rtk2_3+16);
INV_QUADRUPLE_ROUND(state, rtk1, rtk2_3);
unpacking(ctext, state); // from bitsliced to byte representation unpacking(ctext, state); // from bitsliced to byte representation
} }
\ No newline at end of file
...@@ -3,9 +3,7 @@ ...@@ -3,9 +3,7 @@
#include "skinny128.h" #include "skinny128.h"
typedef unsigned char u8; typedef uint64_t u64;
typedef unsigned int u32;
typedef unsigned long long u64;
#define TAGBYTES 16 #define TAGBYTES 16
#define KEYBYTES 16 #define KEYBYTES 16
......
...@@ -4,16 +4,11 @@ ...@@ -4,16 +4,11 @@
* @author Alexandre Adomnicai, Nanyang Technological University, * @author Alexandre Adomnicai, Nanyang Technological University,
* alexandre.adomnicai@ntu.edu.sg * alexandre.adomnicai@ntu.edu.sg
* *
* @date May 2020 * @date June 2020
******************************************************************************/ ******************************************************************************/
#include <stdio.h> #include <string.h>
#include <string.h> //for memcmp
#include "tk_schedule.h"
#include "skinny128.h" #include "skinny128.h"
typedef unsigned char u8;
typedef unsigned int u32;
/****************************************************************************** /******************************************************************************
* The round constants according to the new representation. * The round constants according to the new representation.
******************************************************************************/ ******************************************************************************/
...@@ -271,6 +266,70 @@ void precompute_lfsr_tk3(u32* tk, const u8* key, const int rounds) { ...@@ -271,6 +266,70 @@ void precompute_lfsr_tk3(u32* tk, const u8* key, const int rounds) {
} }
/****************************************************************************** /******************************************************************************
* Precompute LFSR2(TK2) ^ LFSR3(TK3) for all round tweakeys.
* It is equivalent to the following 2 function calls:
* - precompute_lfsr_tk2(tk, t2, SKINNY128_384_ROUNDS);
* - precompute_lfsr_tk3(tk, t3, SKINNY128_384_ROUNDS);
* However 'precompute_lfsr_tk2_3' can save cycles on some platforms.
* On ARMv7, one should observe a gain of ~1k cycles per function call,
* which is explained by the fact that fewer memory accesses to 'tk' are
* performed.
*
* To save some code size, the loop can be replaced by the following one:
* for(int i = 0 ; i < rounds; i+=2) {
* lfsr2_bs(tk2);
* lfsr3_bs(tk3);
* tk[i*4+4] = tk2[0] ^ tk3[0];
* tk[i*4+5] = tk2[1] ^ tk3[1];
* tk[i*4+6] = tk2[2] ^ tk3[2];
* tk[i*4+7] = tk2[3] ^ tk3[3];
* }
* at the cost of some cycles (~260 on ARM Cortex-M).
******************************************************************************/
void precompute_lfsr_tk2_3(u32* tk, const u8* t2, const u8* t3, const int rounds) {
u32 tk2[4], tk3[4];
packing(tk2, t2);
packing(tk3, t3);
tk[0] = tk2[0] ^ tk3[0];
tk[1] = tk2[1] ^ tk3[1];
tk[2] = tk2[2] ^ tk3[2];
tk[3] = tk2[3] ^ tk3[3];
for(int i = 0 ; i < rounds; i+=8) {
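/* One iteration covers 8 rounds: each unrolled step applies a bitsliced
 * LFSR2 update to one tk2 word and a bitsliced LFSR3 update to one tk3
 * word (cf. lfsr2_bs/lfsr3_bs in the comment above), then stores four
 * words of LFSR2(TK2) ^ LFSR3(TK3); the rotated word indices absorb the
 * implicit state rotation instead of recomputing it. */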
tk2[0] ^= (tk2[2] & 0xaaaaaaaa);
tk2[0] = ((tk2[0] & 0xaaaaaaaa) >> 1) | ((tk2[0] << 1) & 0xaaaaaaaa);
tk3[3] ^= ((tk3[1] & 0xaaaaaaaa) >> 1);
tk3[3] = ((tk3[3] & 0xaaaaaaaa) >> 1) | ((tk3[3] << 1) & 0xaaaaaaaa);
tk[i*4+4] = tk2[1] ^ tk3[3];
tk[i*4+5] = tk2[2] ^ tk3[0];
tk[i*4+6] = tk2[3] ^ tk3[1];
tk[i*4+7] = tk2[0] ^ tk3[2];
tk2[1] ^= (tk2[3] & 0xaaaaaaaa);
tk2[1] = ((tk2[1] & 0xaaaaaaaa) >> 1) | ((tk2[1] << 1) & 0xaaaaaaaa);
tk3[2] ^= ((tk3[0] & 0xaaaaaaaa) >> 1);
tk3[2] = ((tk3[2] & 0xaaaaaaaa) >> 1) | ((tk3[2] << 1) & 0xaaaaaaaa);
tk[i*4+12] = tk2[2] ^ tk3[2];
tk[i*4+13] = tk2[3] ^ tk3[3];
tk[i*4+14] = tk2[0] ^ tk3[0];
tk[i*4+15] = tk2[1] ^ tk3[1];
tk2[2] ^= (tk2[0] & 0xaaaaaaaa);
tk2[2] = ((tk2[2] & 0xaaaaaaaa) >> 1) | ((tk2[2] << 1) & 0xaaaaaaaa);
tk3[1] ^= ((tk3[3] & 0xaaaaaaaa) >> 1);
tk3[1] = ((tk3[1] & 0xaaaaaaaa) >> 1) | ((tk3[1] << 1) & 0xaaaaaaaa);
tk[i*4+20] = tk2[3] ^ tk3[1];
tk[i*4+21] = tk2[0] ^ tk3[2];
tk[i*4+22] = tk2[1] ^ tk3[3];
tk[i*4+23] = tk2[2] ^ tk3[0];
tk2[3] ^= (tk2[1] & 0xaaaaaaaa);
tk2[3] = ((tk2[3] & 0xaaaaaaaa) >> 1) | ((tk2[3] << 1) & 0xaaaaaaaa);
tk3[0] ^= ((tk3[2] & 0xaaaaaaaa) >> 1);
tk3[0] = ((tk3[0] & 0xaaaaaaaa) >> 1) | ((tk3[0] << 1) & 0xaaaaaaaa);
tk[i*4+28] = tk2[0] ^ tk3[0];
tk[i*4+29] = tk2[1] ^ tk3[1];
tk[i*4+30] = tk2[2] ^ tk3[2];
tk[i*4+31] = tk2[3] ^ tk3[3];
}
}
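A quick way to validate the fused routine against the two separate precomputations it replaces (test sketch, not part of the diff; 'ref'/'fused' are hypothetical buffers, t2/t3 as in the signature above):

u32 ref[4*SKINNY128_384_ROUNDS], fused[4*SKINNY128_384_ROUNDS];
memset(ref, 0x00, sizeof(ref));    // both routines expect zeroed buffers
memset(fused, 0x00, sizeof(fused));
precompute_lfsr_tk2(ref, t2, SKINNY128_384_ROUNDS);  // reference: two passes
precompute_lfsr_tk3(ref, t3, SKINNY128_384_ROUNDS);
precompute_lfsr_tk2_3(fused, t2, t3, SKINNY128_384_ROUNDS); // fused: one pass
// expect memcmp(ref, fused, sizeof(ref)) == 0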
/******************************************************************************
* XOR TK with TK1 before applying the permutations. * XOR TK with TK1 before applying the permutations.
* The key is then rearranged to match the barrel shiftrows representation. * The key is then rearranged to match the barrel shiftrows representation.
******************************************************************************/ ******************************************************************************/
...@@ -278,19 +337,20 @@ void permute_tk(u32* tk, const u8* key, const int rounds) { ...@@ -278,19 +337,20 @@ void permute_tk(u32* tk, const u8* key, const int rounds) {
u32 test; u32 test;
u32 tk1[4], tmp[4]; u32 tk1[4], tmp[4];
packing(tk1, key); packing(tk1, key);
memcpy(tmp, tk, 16); tmp[0] = tk[0] ^ tk1[0];
tmp[0] ^= tk1[0]; tmp[1] = tk[1] ^ tk1[1];
tmp[1] ^= tk1[1]; tmp[2] = tk[2] ^ tk1[2];
tmp[2] ^= tk1[2]; tmp[3] = tk[3] ^ tk1[3];
tmp[3] ^= tk1[3];
for(int i = 0 ; i < rounds; i += 8) { for(int i = 0 ; i < rounds; i += 8) {
test = (i % 16 < 8) ? 1 : 0; //to apply the right power of P test = (i % 16 < 8) ? 1 : 0; //to apply the right power of P
tk[i*4] = tmp[2] & 0xf0f0f0f0; tk[i*4] = tmp[2] & 0xf0f0f0f0;
tk[i*4+1] = tmp[3] & 0xf0f0f0f0; tk[i*4+1] = tmp[3] & 0xf0f0f0f0;
tk[i*4+2] = tmp[0] & 0xf0f0f0f0; tk[i*4+2] = tmp[0] & 0xf0f0f0f0;
tk[i*4+3] = tmp[1] & 0xf0f0f0f0; tk[i*4+3] = tmp[1] & 0xf0f0f0f0;
memcpy(tmp, tk+i*4+4, 16); tmp[0] = tk[i*4+4] ^ tk1[0];
XOR_BLOCKS(tmp, tk1); tmp[1] = tk[i*4+5] ^ tk1[1];
tmp[2] = tk[i*4+6] ^ tk1[2];
tmp[3] = tk[i*4+7] ^ tk1[3];
if (test) if (test)
permute_tk_2(tmp); // applies P^2 permute_tk_2(tmp); // applies P^2
else else
...@@ -307,8 +367,10 @@ void permute_tk(u32* tk, const u8* key, const int rounds) { ...@@ -307,8 +367,10 @@ void permute_tk(u32* tk, const u8* key, const int rounds) {
tk[i*4+10] |= ROR(tmp[0],12) & 0x0c0c0c0c; tk[i*4+10] |= ROR(tmp[0],12) & 0x0c0c0c0c;
tk[i*4+11] = ROR(tmp[1],28) & 0x03030303; tk[i*4+11] = ROR(tmp[1],28) & 0x03030303;
tk[i*4+11] |= ROR(tmp[1],12) & 0x0c0c0c0c; tk[i*4+11] |= ROR(tmp[1],12) & 0x0c0c0c0c;
memcpy(tmp, tk+i*4+12, 16); tmp[0] = tk[i*4+12] ^ tk1[0];
XOR_BLOCKS(tmp, tk1); tmp[1] = tk[i*4+13] ^ tk1[1];
tmp[2] = tk[i*4+14] ^ tk1[2];
tmp[3] = tk[i*4+15] ^ tk1[3];
if (test) if (test)
permute_tk_4(tmp); // applies P^4 permute_tk_4(tmp); // applies P^4
else else
...@@ -321,8 +383,10 @@ void permute_tk(u32* tk, const u8* key, const int rounds) { ...@@ -321,8 +383,10 @@ void permute_tk(u32* tk, const u8* key, const int rounds) {
tk[i*4+17] = ROR(tmp[3], 16) & 0xf0f0f0f0; tk[i*4+17] = ROR(tmp[3], 16) & 0xf0f0f0f0;
tk[i*4+18] = ROR(tmp[0], 16) & 0xf0f0f0f0; tk[i*4+18] = ROR(tmp[0], 16) & 0xf0f0f0f0;
tk[i*4+19] = ROR(tmp[1], 16) & 0xf0f0f0f0; tk[i*4+19] = ROR(tmp[1], 16) & 0xf0f0f0f0;
memcpy(tmp, tk+i*4+20, 16); tmp[0] = tk[i*4+20] ^ tk1[0];
XOR_BLOCKS(tmp, tk1); tmp[1] = tk[i*4+21] ^ tk1[1];
tmp[2] = tk[i*4+22] ^ tk1[2];
tmp[3] = tk[i*4+23] ^ tk1[3];
if (test) if (test)
permute_tk_6(tmp); // applies P^6 permute_tk_6(tmp); // applies P^6
else else
...@@ -339,8 +403,10 @@ void permute_tk(u32* tk, const u8* key, const int rounds) { ...@@ -339,8 +403,10 @@ void permute_tk(u32* tk, const u8* key, const int rounds) {
tk[i*4+26] |= ROR(tmp[0],28) & 0x0c0c0c0c; tk[i*4+26] |= ROR(tmp[0],28) & 0x0c0c0c0c;
tk[i*4+27] = ROR(tmp[1],12) & 0x03030303; tk[i*4+27] = ROR(tmp[1],12) & 0x03030303;
tk[i*4+27] |= ROR(tmp[1],28) & 0x0c0c0c0c; tk[i*4+27] |= ROR(tmp[1],28) & 0x0c0c0c0c;
memcpy(tmp, tk+i*4+28, 16); tmp[0] = tk[i*4+28] ^ tk1[0];
XOR_BLOCKS(tmp, tk1); tmp[1] = tk[i*4+29] ^ tk1[1];
tmp[2] = tk[i*4+30] ^ tk1[2];
tmp[3] = tk[i*4+31] ^ tk1[3];
if (test) if (test)
permute_tk_8(tmp); // applies P^8 permute_tk_8(tmp); // applies P^8
for(int j = 0; j < 4; j++) { for(int j = 0; j < 4; j++) {
...@@ -361,8 +427,7 @@ void permute_tk(u32* tk, const u8* key, const int rounds) { ...@@ -361,8 +427,7 @@ void permute_tk(u32* tk, const u8* key, const int rounds) {
******************************************************************************/ ******************************************************************************/
void precompute_rtk2_3(u32* rtk, const u8* tk2, const u8 * tk3) { void precompute_rtk2_3(u32* rtk, const u8* tk2, const u8 * tk3) {
memset(rtk, 0x00, 16*SKINNY128_384_ROUNDS); memset(rtk, 0x00, 16*SKINNY128_384_ROUNDS);
precompute_lfsr_tk2(rtk, tk2, SKINNY128_384_ROUNDS); precompute_lfsr_tk2_3(rtk, tk2, tk3, SKINNY128_384_ROUNDS);
precompute_lfsr_tk3(rtk, tk3, SKINNY128_384_ROUNDS);
permute_tk(rtk, (u8*)(rtk+8), SKINNY128_384_ROUNDS); // rtk+8 is zero-filled permute_tk(rtk, (u8*)(rtk+8), SKINNY128_384_ROUNDS); // rtk+8 is zero-filled
for(int i = 0; i < SKINNY128_384_ROUNDS; i++) { // add rconsts for(int i = 0; i < SKINNY128_384_ROUNDS; i++) { // add rconsts
for(int j = 0; j < 4; j++) for(int j = 0; j < 4; j++)
...@@ -376,4 +441,4 @@ void precompute_rtk2_3(u32* rtk, const u8* tk2, const u8 * tk3) { ...@@ -376,4 +441,4 @@ void precompute_rtk2_3(u32* rtk, const u8* tk2, const u8 * tk3) {
void precompute_rtk1(u32* rtk1, const u8* tk1) { void precompute_rtk1(u32* rtk1, const u8* tk1) {
memset(rtk1, 0x00, 16*16); memset(rtk1, 0x00, 16*16);
permute_tk(rtk1, tk1, 16); permute_tk(rtk1, tk1, 16);
} }
\ No newline at end of file
#ifndef TK_SCHEDULE_H_ #ifndef TK_SCHEDULE_H_
#define TK_SCHEDULE_H_ #define TK_SCHEDULE_H_
typedef unsigned char u8; #include <stdint.h>
typedef unsigned int u32;
typedef uint8_t u8;
typedef uint32_t u32;
void packing(u32* out, const u8* in); void packing(u32* out, const u8* in);
void unpacking(u8* out, u32 *in); void unpacking(u8* out, u32 *in);
void precompute_rtk2_3(u32* rtk, const u8* tk2, const u8* tk3); void precompute_rtk2_3(u32* rtk, const u8* tk2, const u8* tk3);
void precompute_rtk1(u32* rtk1, const u8* tk1); void precompute_rtk1(u32* rtk1, const u8* tk1);
#define ROR(x,y) (((x) >> (y)) | ((x) << (32 - (y)))) #define ROR(x,y) (((x) >> (y)) | ((x) << (32 - (y))))
#define XOR_BLOCKS(x,y) ({ \
(x)[0] ^= (y)[0]; \
(x)[1] ^= (y)[1]; \
(x)[2] ^= (y)[2]; \
(x)[3] ^= (y)[3]; \
})
#define SWAPMOVE(a, b, mask, n) ({ \ #define SWAPMOVE(a, b, mask, n) ({ \
tmp = (b ^ (a >> n)) & mask; \ tmp = (b ^ (a >> n)) & mask; \
......
...@@ -8,12 +8,10 @@ ...@@ -8,12 +8,10 @@
* @author Alexandre Adomnicai, Nanyang Technological University, * @author Alexandre Adomnicai, Nanyang Technological University,
* alexandre.adomnicai@ntu.edu.sg * alexandre.adomnicai@ntu.edu.sg
* *
* @date May 2020 * @date June 2020
******************************************************************************/ ******************************************************************************/
#include "skinny128.h"
#include "skinnyaead.h" #include "skinnyaead.h"
#include <string.h> #include <string.h>
#include <stdio.h>
/****************************************************************************** /******************************************************************************
* x ^= y where x, y are 128-bit blocks (16 bytes array). * x ^= y where x, y are 128-bit blocks (16 bytes array).
...@@ -33,13 +31,13 @@ static void skinny_aead_m1_auth(u8* auth, u8* c, u8* tag, tweakey* tk, ...@@ -33,13 +31,13 @@ static void skinny_aead_m1_auth(u8* auth, u8* c, u8* tag, tweakey* tk,
u8 feedback; u8 feedback;
u8 tmp[2*BLOCKBYTES]; u8 tmp[2*BLOCKBYTES];
memset(tmp, 0x00, 2*BLOCKBYTES); memset(tmp, 0x00, 2*BLOCKBYTES);
memset(auth, 0x00, BLOCKBYTES);
SET_DOMAIN(tmp, 0x02); SET_DOMAIN(tmp, 0x02);
SET_DOMAIN(tmp + BLOCKBYTES, 0x02);
memset(auth, 0x00, BLOCKBYTES);
while (adlen >= 2*BLOCKBYTES) { while (adlen >= 2*BLOCKBYTES) {
LE_STR_64(tmp, lfsr); LE_STR_64(tmp, lfsr);
UPDATE_LFSR(lfsr); UPDATE_LFSR(lfsr);
LE_STR_64(tmp + BLOCKBYTES, lfsr); LE_STR_64(tmp + BLOCKBYTES, lfsr);
SET_DOMAIN(tmp + BLOCKBYTES, 0x02);
precompute_rtk1(tk->rtk1, tmp, tmp+BLOCKBYTES); precompute_rtk1(tk->rtk1, tmp, tmp+BLOCKBYTES);
skinny128_384_encrypt(tmp, tmp+BLOCKBYTES, ad, ad+BLOCKBYTES, *tk); skinny128_384_encrypt(tmp, tmp+BLOCKBYTES, ad, ad+BLOCKBYTES, *tk);
xor_block(auth, tmp); xor_block(auth, tmp);
...@@ -47,6 +45,9 @@ static void skinny_aead_m1_auth(u8* auth, u8* c, u8* tag, tweakey* tk, ...@@ -47,6 +45,9 @@ static void skinny_aead_m1_auth(u8* auth, u8* c, u8* tag, tweakey* tk,
adlen -= 2*BLOCKBYTES; adlen -= 2*BLOCKBYTES;
ad += 2*BLOCKBYTES; ad += 2*BLOCKBYTES;
UPDATE_LFSR(lfsr); UPDATE_LFSR(lfsr);
memset(tmp, 0x00, 2*BLOCKBYTES); // to save 32 bytes of RAM
SET_DOMAIN(tmp, 0x02);
SET_DOMAIN(tmp + BLOCKBYTES, 0x02);
} }
if (adlen > BLOCKBYTES) { // pad and process 2 blocks in parallel if (adlen > BLOCKBYTES) { // pad and process 2 blocks in parallel
LE_STR_64(tmp, lfsr); LE_STR_64(tmp, lfsr);
...@@ -65,11 +66,12 @@ static void skinny_aead_m1_auth(u8* auth, u8* c, u8* tag, tweakey* tk, ...@@ -65,11 +66,12 @@ static void skinny_aead_m1_auth(u8* auth, u8* c, u8* tag, tweakey* tk,
LE_STR_64(tmp, lfsr); LE_STR_64(tmp, lfsr);
if (mlen == 0) { // if tag has *NOT* been calculated yet if (mlen == 0) { // if tag has *NOT* been calculated yet
precompute_rtk1(tk->rtk1, tmp, tag); // compute the tag precompute_rtk1(tk->rtk1, tmp, tag); // compute the tag
skinny128_384_encrypt(auth, c, ad, c, *tk); skinny128_384_encrypt(tmp, c, ad, c, *tk);
} else { // if tag has already been calculated } else { // if tag has already been calculated
precompute_rtk1(tk->rtk1, tmp, tmp); // process last ad block precompute_rtk1(tk->rtk1, tmp, tmp); // process last ad block
skinny128_384_encrypt(auth, auth, ad, ad, *tk); skinny128_384_encrypt(tmp, tmp, ad, ad, *tk);
} }
xor_block(auth, tmp);
} else if (adlen > 0) { } else if (adlen > 0) {
LE_STR_64(tmp, lfsr); LE_STR_64(tmp, lfsr);
SET_DOMAIN(tmp, 0x03); // domain for padding ad SET_DOMAIN(tmp, 0x03); // domain for padding ad
...@@ -78,11 +80,12 @@ static void skinny_aead_m1_auth(u8* auth, u8* c, u8* tag, tweakey* tk, ...@@ -78,11 +80,12 @@ static void skinny_aead_m1_auth(u8* auth, u8* c, u8* tag, tweakey* tk,
tmp[BLOCKBYTES + adlen] ^= 0x80; // padding tmp[BLOCKBYTES + adlen] ^= 0x80; // padding
if (mlen == 0) { // if tag has *NOT* been calculated yet if (mlen == 0) { // if tag has *NOT* been calculated yet
precompute_rtk1(tk->rtk1, tmp, tag); // compute the tag precompute_rtk1(tk->rtk1, tmp, tag); // compute the tag
skinny128_384_encrypt(auth, c, tmp + BLOCKBYTES, c, *tk); skinny128_384_encrypt(tmp, c, tmp + BLOCKBYTES, c, *tk);
} else { // if tag has already been calculated } else { // if tag has already been calculated
precompute_rtk1(tk->rtk1, tmp, tmp); // process last ad block precompute_rtk1(tk->rtk1, tmp, tmp); // process last ad block
skinny128_384_encrypt(auth, auth, tmp + BLOCKBYTES, tmp + BLOCKBYTES, *tk); skinny128_384_encrypt(tmp, tmp, tmp + BLOCKBYTES, tmp + BLOCKBYTES, *tk);
} }
xor_block(auth, tmp);
} }
} }
...@@ -290,4 +293,4 @@ int crypto_aead_decrypt (unsigned char *m, unsigned long long *mlen, ...@@ -290,4 +293,4 @@ int crypto_aead_decrypt (unsigned char *m, unsigned long long *mlen,
feedback |= sum[i] ^ c[i]; // constant-time tag verification feedback |= sum[i] ^ c[i]; // constant-time tag verification
return feedback; return feedback;
// ----------------- Process the associated data ----------------- // ----------------- Process the associated data -----------------
} }
\ No newline at end of file
...@@ -16,12 +16,9 @@ ...@@ -16,12 +16,9 @@
* @author Alexandre Adomnicai, Nanyang Technological University, * @author Alexandre Adomnicai, Nanyang Technological University,
* alexandre.adomnicai@ntu.edu.sg * alexandre.adomnicai@ntu.edu.sg
* *
* @date May 2020 * @date June 2020
******************************************************************************/ ******************************************************************************/
#include <stdio.h>
#include <string.h>
#include "skinny128.h" #include "skinny128.h"
#include "tk_schedule.h"
/**************************************************************************** /****************************************************************************
* The MixColumns operation for rounds i such that (i % 4) == 0. * The MixColumns operation for rounds i such that (i % 4) == 0.
...@@ -84,7 +81,7 @@ void mixcolumns_3(u32* state) { ...@@ -84,7 +81,7 @@ void mixcolumns_3(u32* state) {
} }
/**************************************************************************** /****************************************************************************
* The inverse MixColumns oepration for rounds i such that (i % 4) == 0 * The inverse MixColumns operation for rounds i such that (i % 4) == 0
****************************************************************************/ ****************************************************************************/
void inv_mixcolumns_0(u32* state) { void inv_mixcolumns_0(u32* state) {
u32 tmp; u32 tmp;
...@@ -99,7 +96,7 @@ void inv_mixcolumns_0(u32* state) { ...@@ -99,7 +96,7 @@ void inv_mixcolumns_0(u32* state) {
} }
/**************************************************************************** /****************************************************************************
* The inverse MixColumns oepration for rounds i such that (i % 4) == 1 * The inverse MixColumns operation for rounds i such that (i % 4) == 1
****************************************************************************/ ****************************************************************************/
void inv_mixcolumns_1(u32* state) { void inv_mixcolumns_1(u32* state) {
u32 tmp; u32 tmp;
...@@ -114,7 +111,7 @@ void inv_mixcolumns_1(u32* state) { ...@@ -114,7 +111,7 @@ void inv_mixcolumns_1(u32* state) {
} }
/**************************************************************************** /****************************************************************************
* The inverse MixColumns oepration for rounds i such that (i % 4) == 2 * The inverse MixColumns operation for rounds i such that (i % 4) == 2
****************************************************************************/ ****************************************************************************/
void inv_mixcolumns_2(u32* state) { void inv_mixcolumns_2(u32* state) {
u32 tmp; u32 tmp;
...@@ -129,7 +126,7 @@ void inv_mixcolumns_2(u32* state) { ...@@ -129,7 +126,7 @@ void inv_mixcolumns_2(u32* state) {
} }
/**************************************************************************** /****************************************************************************
* The inverse MixColumns oepration for rounds i such that (i % 4) == 3 * The inverse MixColumns operation for rounds i such that (i % 4) == 3
****************************************************************************/ ****************************************************************************/
void inv_mixcolumns_3(u32* state) { void inv_mixcolumns_3(u32* state) {
u32 tmp; u32 tmp;
...@@ -166,20 +163,8 @@ void skinny128_384_encrypt(u8* ctext, u8* ctext_bis, const u8* ptext, ...@@ -166,20 +163,8 @@ void skinny128_384_encrypt(u8* ctext, u8* ctext_bis, const u8* ptext,
const u8* ptext_bis, const tweakey tk) { const u8* ptext_bis, const tweakey tk) {
u32 state[8]; u32 state[8];
packing(state, ptext, ptext_bis); packing(state, ptext, ptext_bis);
QUADRUPLE_ROUND(state, tk.rtk1, tk.rtk2_3); for(int i = 0; i < 14; i++)
QUADRUPLE_ROUND(state, tk.rtk1+32, tk.rtk2_3+32); QUADRUPLE_ROUND(state, tk.rtk1 + (i%4)*32, tk.rtk2_3 + i*32);
QUADRUPLE_ROUND(state, tk.rtk1+64, tk.rtk2_3+64);
QUADRUPLE_ROUND(state, tk.rtk1+96, tk.rtk2_3+96);
QUADRUPLE_ROUND(state, tk.rtk1, tk.rtk2_3+128);
QUADRUPLE_ROUND(state, tk.rtk1+32, tk.rtk2_3+160);
QUADRUPLE_ROUND(state, tk.rtk1+64, tk.rtk2_3+192);
QUADRUPLE_ROUND(state, tk.rtk1+96, tk.rtk2_3+224);
QUADRUPLE_ROUND(state, tk.rtk1, tk.rtk2_3+256);
QUADRUPLE_ROUND(state, tk.rtk1+32, tk.rtk2_3+288);
QUADRUPLE_ROUND(state, tk.rtk1+64, tk.rtk2_3+320);
QUADRUPLE_ROUND(state, tk.rtk1+96, tk.rtk2_3+352);
QUADRUPLE_ROUND(state, tk.rtk1, tk.rtk2_3+384);
QUADRUPLE_ROUND(state, tk.rtk1+32, tk.rtk2_3+416);
unpacking(ctext, ctext_bis, state); unpacking(ctext, ctext_bis, state);
} }
...@@ -192,19 +177,7 @@ void skinny128_384_decrypt(u8* ptext, u8* ptext_bis, const u8* ctext, ...@@ -192,19 +177,7 @@ void skinny128_384_decrypt(u8* ptext, u8* ptext_bis, const u8* ctext,
const u8* ctext_bis, const tweakey tk) { const u8* ctext_bis, const tweakey tk) {
u32 state[8]; u32 state[8];
packing(state, ctext, ctext_bis); packing(state, ctext, ctext_bis);
INV_QUADRUPLE_ROUND(state, tk.rtk1+32, tk.rtk2_3+416); for(int i = 13; i >= 0; i--)
INV_QUADRUPLE_ROUND(state, tk.rtk1, tk.rtk2_3+384); INV_QUADRUPLE_ROUND(state, tk.rtk1 + (i%4)*32, tk.rtk2_3 + i*32);
INV_QUADRUPLE_ROUND(state, tk.rtk1+96, tk.rtk2_3+352);
INV_QUADRUPLE_ROUND(state, tk.rtk1+64, tk.rtk2_3+320);
INV_QUADRUPLE_ROUND(state, tk.rtk1+32, tk.rtk2_3+288);
INV_QUADRUPLE_ROUND(state, tk.rtk1, tk.rtk2_3+256);
INV_QUADRUPLE_ROUND(state, tk.rtk1+96, tk.rtk2_3+224);
INV_QUADRUPLE_ROUND(state, tk.rtk1+64, tk.rtk2_3+192);
INV_QUADRUPLE_ROUND(state, tk.rtk1+32, tk.rtk2_3+160);
INV_QUADRUPLE_ROUND(state, tk.rtk1, tk.rtk2_3+128);
INV_QUADRUPLE_ROUND(state, tk.rtk1+96, tk.rtk2_3+96);
INV_QUADRUPLE_ROUND(state, tk.rtk1+64, tk.rtk2_3+64);
INV_QUADRUPLE_ROUND(state, tk.rtk1+32, tk.rtk2_3+32);
INV_QUADRUPLE_ROUND(state, tk.rtk1, tk.rtk2_3);
unpacking(ptext, ptext_bis, state); unpacking(ptext, ptext_bis, state);
} }
\ No newline at end of file
...@@ -3,9 +3,7 @@ ...@@ -3,9 +3,7 @@
#include "skinny128.h" #include "skinny128.h"
typedef unsigned char u8; typedef uint64_t u64;
typedef unsigned int u32;
typedef unsigned long long u64;
#define TAGBYTES 16 #define TAGBYTES 16
#define KEYBYTES 16 #define KEYBYTES 16
......
...@@ -9,13 +9,9 @@ ...@@ -9,13 +9,9 @@
* *
* @date May 2020 * @date May 2020
*******************************************************************************/ *******************************************************************************/
#include <stdio.h>
#include <string.h> #include <string.h>
#include "tk_schedule.h" #include "tk_schedule.h"
typedef unsigned char u8;
typedef unsigned int u32;
/**************************************************************************** /****************************************************************************
* The round constants according to the fixsliced representation. * The round constants according to the fixsliced representation.
****************************************************************************/ ****************************************************************************/
......
#ifndef TK_SCHEDULE_BS_H_ #ifndef TK_SCHEDULE_BS_H_
#define TK_SCHEDULE_BS_H_ #define TK_SCHEDULE_BS_H_
typedef unsigned char u8; #include <stdint.h>
typedef unsigned int u32;
typedef uint8_t u8;
typedef uint32_t u32;
typedef struct { typedef struct {
u32 rtk1[8*16]; u32 rtk1[8*16];
......