Commit 86ea9196 by Enrico Pozzobon

Merge branch 'master' into patched-for-tester

parents 8931c307 cad26506
......@@ -4,8 +4,11 @@ Sebastien Riou, May 27th 2020
Implementation optimized for ARM-Cortex-M0 (Size and Speed)
*/
//define __DRYGASCON_ARM_SELECTOR_V6M__ or add drygascon128_arm_selector.h to includes
#ifndef __DRYGASCON_ARM_SELECTOR_V6M__
#include "drygascon128_arm_selector.h"
#endif
#if defined(__DRYGASCON_ARM_SELECTOR_V6M__)
.cpu cortex-m0
.syntax unified
......
......@@ -3,39 +3,71 @@
//Optional file to select the best implementation for each chip
#ifdef STM32H743xx
#define __DRYGASCON_ARM_SELECTOR_V7M__
#define __DRYGASCON_ARM_SELECTOR_FPU__
#define __DRYGASCON_ARM_SELECTOR_V7M_FPU__
#define __DRYGASCON_ARM_SELECTOR_FOUND__
#endif
#ifdef STM32F746xx
#define __DRYGASCON_ARM_SELECTOR_V7M_FPU__
#define __DRYGASCON_ARM_SELECTOR_FOUND__
#endif
#ifdef STM32F411xx
#define __DRYGASCON_ARM_SELECTOR_V7M_FPU__
#define __DRYGASCON_ARM_SELECTOR_FOUND__
#endif
#ifdef STM32L552xx //technically it is V8M but we don't have a specific code for that one
#define __DRYGASCON_ARM_SELECTOR_V7M__
#define __DRYGASCON_ARM_SELECTOR_FPU__
#define __DRYGASCON_ARM_SELECTOR_FOUND__
#endif
#ifdef STM32F103xx
#define __DRYGASCON_ARM_SELECTOR_V7M__
#define __DRYGASCON_ARM_SELECTOR_FOUND__
#endif
#ifdef STM32L011xx
#define __DRYGASCON_ARM_SELECTOR_V6M__
#define __DRYGASCON_ARM_SELECTOR_FOUND__
#endif
#ifdef __SAM3X8E__
#define __DRYGASCON_ARM_SELECTOR_V7M__
#define __DRYGASCON_ARM_SELECTOR_FOUND__
#endif
//TODO: add more chips here
#ifdef __DRYGASCON_ARM_SELECTOR_V7M__
#ifdef __DRYGASCON_ARM_SELECTOR_FPU__
#ifndef __DRYGASCON_ARM_SELECTOR_FOUND__
//more generic defines catching whole families
#if defined(STM32F4xx) || defined(STM32F7xx) || defined(STM32H7xx)
#define __DRYGASCON_ARM_SELECTOR_V7M_FPU__
#define __DRYGASCON_ARM_SELECTOR_FOUND__
#endif
#if defined(STM32F1xx)
#define __DRYGASCON_ARM_SELECTOR_V7M__
#define __DRYGASCON_ARM_SELECTOR_FOUND__
#endif
#endif
#ifdef __DRYGASCON_ARM_SELECTOR_V7M_FPU__
#define DRYGASCON_G_OPT drygascon128_g_v7m_fpu
#define DRYGASCON_F_OPT drygascon128_f_v7m_fpu
#define DRYGASCON_G0_OPT drygascon128_g0_v7m_fpu
#else
#endif
#ifdef __DRYGASCON_ARM_SELECTOR_V7M_FPU_X__
#define DRYGASCON_G_OPT drygascon128_g_v7m_fpu_x
#define DRYGASCON_F_OPT drygascon128_f_v7m_fpu_x
#define DRYGASCON_G0_OPT drygascon128_g0_v7m_fpu_x
#endif
#ifdef __DRYGASCON_ARM_SELECTOR_V7M__
#define DRYGASCON_G_OPT drygascon128_g_v7m
#define DRYGASCON_F_OPT drygascon128_f_v7m
#define DRYGASCON_G0_OPT drygascon128_g0_v7m
#endif
#endif
#ifdef __DRYGASCON_ARM_SELECTOR_V6M__
......
......@@ -93,15 +93,7 @@ void skinny128_384_plus(u8* ctext, const u8* ptext, const u32* rtk1,
u32 tmp; // used in SWAPMOVE macro
u32 state[4]; // 128-bit state
packing(state, ptext); // from byte to bitsliced representation
QUADRUPLE_ROUND(state, rtk1, rtk2_3);
QUADRUPLE_ROUND(state, rtk1+16, rtk2_3+16);
QUADRUPLE_ROUND(state, rtk1+32, rtk2_3+32);
QUADRUPLE_ROUND(state, rtk1+48, rtk2_3+48);
QUADRUPLE_ROUND(state, rtk1, rtk2_3+64);
QUADRUPLE_ROUND(state, rtk1+16, rtk2_3+80);
QUADRUPLE_ROUND(state, rtk1+32, rtk2_3+96);
QUADRUPLE_ROUND(state, rtk1+48, rtk2_3+112);
QUADRUPLE_ROUND(state, rtk1, rtk2_3+128);
QUADRUPLE_ROUND(state, rtk1+16, rtk2_3+144);
for(int i = 0; i < 10; i++)
QUADRUPLE_ROUND(state, rtk1 + (i%4)*16, rtk2_3 + i*16);
unpacking(ctext, state); // from bitsliced to byte representation
}
\ No newline at end of file
......@@ -260,6 +260,70 @@ void precompute_lfsr_tk3(u32* tk, const u8* key, const int rounds) {
}
/******************************************************************************
* Precompute LFSR2(TK2) ^ LFSR3(TK3) for all round tweakeys.
* It is equivalent to the following 2 function calls:
* - precompute_lfsr_tk2(tk, t2, SKINNY128_384_ROUNDS);
* - precompute_lfsr_tk3(tk, t3, SKINNY128_384_ROUNDS);
* However, 'precompute_lfsr_tk2_3' can save cycles on some platforms.
* On ARMv7 one should observe a gain of ~1k cycles per function call, which is
* explained by the fact that fewer memory accesses to 'tk' are performed.
*
* To save some code size, the loop can be replaced by the following one:
* for(int i = 0 ; i < rounds; i+=2) {
* lfsr2_bs(tk2);
* lfsr3_bs(tk3);
* tk[i*4+4] = tk2[0] ^ tk3[0];
* tk[i*4+5] = tk2[1] ^ tk3[1];
* tk[i*4+6] = tk2[2] ^ tk3[2];
* tk[i*4+7] = tk2[3] ^ tk3[3];
* }
* at the cost of some cycles (~260 on ARM Cortex-M).
******************************************************************************/
void precompute_lfsr_tk2_3(u32* tk, const u8* t2, const u8* t3, const int rounds) {
    // tk     : output buffer. Words tk[0..3] are written first; then, for every
    //          i = 0, 8, 16, ... < rounds, the 4-word slots starting at
    //          tk[i*4+4], tk[i*4+12], tk[i*4+20] and tk[i*4+28] are written.
    //          When 'rounds' is a positive multiple of 8 the highest index
    //          touched is 4*rounds-1. The slots starting at tk[i*4+8],
    //          tk[i*4+16] and tk[i*4+24] are never written by this function.
    // t2, t3 : byte inputs handed to packing() (presumably the 16-byte TK2 and
    //          TK3 words -- confirm against packing()).
    // rounds : loop bound; each pass covers the window tk[i*4 .. i*4+31].
    u32 tk2[4], tk3[4];
    packing(tk2, t2);
    packing(tk3, t3);
    // Slot 0: XOR of the two packed inputs, before any update is applied.
    tk[0] = tk2[0] ^ tk3[0];
    tk[1] = tk2[1] ^ tk3[1];
    tk[2] = tk2[2] ^ tk3[2];
    tk[3] = tk2[3] ^ tk3[3];
    for(int i = 0 ; i < rounds; i+=8) {
        // Each step below refreshes exactly one word of tk2 and one word of
        // tk3: a masked XOR with another word of the same state, followed by
        // a swap of the two bits in every adjacent (even, odd) bit pair.
        // Step 1: refresh tk2[0] / tk3[3]; the store reads the words in
        // rotated order (tk2: 1,2,3,0 and tk3: 3,0,1,2).
        tk2[0] ^= (tk2[2] & 0xaaaaaaaa);
        tk2[0] = ((tk2[0] & 0xaaaaaaaa) >> 1) | ((tk2[0] << 1) & 0xaaaaaaaa);
        tk3[3] ^= ((tk3[1] & 0xaaaaaaaa) >> 1);
        tk3[3] = ((tk3[3] & 0xaaaaaaaa) >> 1) | ((tk3[3] << 1) & 0xaaaaaaaa);
        tk[i*4+4] = tk2[1] ^ tk3[3];
        tk[i*4+5] = tk2[2] ^ tk3[0];
        tk[i*4+6] = tk2[3] ^ tk3[1];
        tk[i*4+7] = tk2[0] ^ tk3[2];
        // Step 2: refresh tk2[1] / tk3[2]; store order is tk2: 2,3,0,1 and
        // tk3: 2,3,0,1.
        tk2[1] ^= (tk2[3] & 0xaaaaaaaa);
        tk2[1] = ((tk2[1] & 0xaaaaaaaa) >> 1) | ((tk2[1] << 1) & 0xaaaaaaaa);
        tk3[2] ^= ((tk3[0] & 0xaaaaaaaa) >> 1);
        tk3[2] = ((tk3[2] & 0xaaaaaaaa) >> 1) | ((tk3[2] << 1) & 0xaaaaaaaa);
        tk[i*4+12] = tk2[2] ^ tk3[2];
        tk[i*4+13] = tk2[3] ^ tk3[3];
        tk[i*4+14] = tk2[0] ^ tk3[0];
        tk[i*4+15] = tk2[1] ^ tk3[1];
        // Step 3: refresh tk2[2] / tk3[1]; store order is tk2: 3,0,1,2 and
        // tk3: 1,2,3,0.
        tk2[2] ^= (tk2[0] & 0xaaaaaaaa);
        tk2[2] = ((tk2[2] & 0xaaaaaaaa) >> 1) | ((tk2[2] << 1) & 0xaaaaaaaa);
        tk3[1] ^= ((tk3[3] & 0xaaaaaaaa) >> 1);
        tk3[1] = ((tk3[1] & 0xaaaaaaaa) >> 1) | ((tk3[1] << 1) & 0xaaaaaaaa);
        tk[i*4+20] = tk2[3] ^ tk3[1];
        tk[i*4+21] = tk2[0] ^ tk3[2];
        tk[i*4+22] = tk2[1] ^ tk3[3];
        tk[i*4+23] = tk2[2] ^ tk3[0];
        // Step 4: refresh tk2[3] / tk3[0]; both states are back in natural
        // word order (0,1,2,3) for the store and for the next pass.
        tk2[3] ^= (tk2[1] & 0xaaaaaaaa);
        tk2[3] = ((tk2[3] & 0xaaaaaaaa) >> 1) | ((tk2[3] << 1) & 0xaaaaaaaa);
        tk3[0] ^= ((tk3[2] & 0xaaaaaaaa) >> 1);
        tk3[0] = ((tk3[0] & 0xaaaaaaaa) >> 1) | ((tk3[0] << 1) & 0xaaaaaaaa);
        tk[i*4+28] = tk2[0] ^ tk3[0];
        tk[i*4+29] = tk2[1] ^ tk3[1];
        tk[i*4+30] = tk2[2] ^ tk3[2];
        tk[i*4+31] = tk2[3] ^ tk3[3];
    }
}
/******************************************************************************
* XOR TK with TK1 before applying the permutations.
* The key is then rearranged to match the barrel shiftrows representation.
******************************************************************************/
......@@ -267,19 +331,20 @@ void permute_tk(u32* tk, const u8* key, const int rounds) {
u32 test;
u32 tk1[4], tmp[4];
packing(tk1, key);
memcpy(tmp, tk, 16);
tmp[0] ^= tk1[0];
tmp[1] ^= tk1[1];
tmp[2] ^= tk1[2];
tmp[3] ^= tk1[3];
tmp[0] = tk[0] ^ tk1[0];
tmp[1] = tk[1] ^ tk1[1];
tmp[2] = tk[2] ^ tk1[2];
tmp[3] = tk[3] ^ tk1[3];
for(int i = 0 ; i < rounds; i += 8) {
test = (i % 16 < 8) ? 1 : 0; //to apply the right power of P
tk[i*4] = tmp[2] & 0xf0f0f0f0;
tk[i*4+1] = tmp[3] & 0xf0f0f0f0;
tk[i*4+2] = tmp[0] & 0xf0f0f0f0;
tk[i*4+3] = tmp[1] & 0xf0f0f0f0;
memcpy(tmp, tk+i*4+4, 16);
XOR_BLOCKS(tmp, tk1);
tmp[0] = tk[i*4+4] ^ tk1[0];
tmp[1] = tk[i*4+5] ^ tk1[1];
tmp[2] = tk[i*4+6] ^ tk1[2];
tmp[3] = tk[i*4+7] ^ tk1[3];
if (test)
permute_tk_2(tmp); // applies P^2
else
......@@ -296,8 +361,10 @@ void permute_tk(u32* tk, const u8* key, const int rounds) {
tk[i*4+10] |= ROR(tmp[0],12) & 0x0c0c0c0c;
tk[i*4+11] = ROR(tmp[1],28) & 0x03030303;
tk[i*4+11] |= ROR(tmp[1],12) & 0x0c0c0c0c;
memcpy(tmp, tk+i*4+12, 16);
XOR_BLOCKS(tmp, tk1);
tmp[0] = tk[i*4+12] ^ tk1[0];
tmp[1] = tk[i*4+13] ^ tk1[1];
tmp[2] = tk[i*4+14] ^ tk1[2];
tmp[3] = tk[i*4+15] ^ tk1[3];
if (test)
permute_tk_4(tmp); // applies P^4
else
......@@ -310,8 +377,10 @@ void permute_tk(u32* tk, const u8* key, const int rounds) {
tk[i*4+17] = ROR(tmp[3], 16) & 0xf0f0f0f0;
tk[i*4+18] = ROR(tmp[0], 16) & 0xf0f0f0f0;
tk[i*4+19] = ROR(tmp[1], 16) & 0xf0f0f0f0;
memcpy(tmp, tk+i*4+20, 16);
XOR_BLOCKS(tmp, tk1);
tmp[0] = tk[i*4+20] ^ tk1[0];
tmp[1] = tk[i*4+21] ^ tk1[1];
tmp[2] = tk[i*4+22] ^ tk1[2];
tmp[3] = tk[i*4+23] ^ tk1[3];
if (test)
permute_tk_6(tmp); // applies P^6
else
......@@ -328,8 +397,10 @@ void permute_tk(u32* tk, const u8* key, const int rounds) {
tk[i*4+26] |= ROR(tmp[0],28) & 0x0c0c0c0c;
tk[i*4+27] = ROR(tmp[1],12) & 0x03030303;
tk[i*4+27] |= ROR(tmp[1],28) & 0x0c0c0c0c;
memcpy(tmp, tk+i*4+28, 16);
XOR_BLOCKS(tmp, tk1);
tmp[0] = tk[i*4+28] ^ tk1[0];
tmp[1] = tk[i*4+29] ^ tk1[1];
tmp[2] = tk[i*4+30] ^ tk1[2];
tmp[3] = tk[i*4+31] ^ tk1[3];
if (test)
permute_tk_8(tmp); // applies P^8
for(int j = 0; j < 4; j++) {
......@@ -350,8 +421,7 @@ void permute_tk(u32* tk, const u8* key, const int rounds) {
******************************************************************************/
void precompute_rtk2_3(u32* rtk, const u8* tk2, const u8 * tk3) {
memset(rtk, 0x00, 16*SKINNY128_384_ROUNDS);
precompute_lfsr_tk2(rtk, tk2, SKINNY128_384_ROUNDS);
precompute_lfsr_tk3(rtk, tk3, SKINNY128_384_ROUNDS);
precompute_lfsr_tk2_3(rtk, tk2, tk3, SKINNY128_384_ROUNDS);
permute_tk(rtk, (u8*)(rtk+8), SKINNY128_384_ROUNDS); // rtk+8 is NULL
for(int i = 0; i < SKINNY128_384_ROUNDS; i++) { // add rconsts
for(int j = 0; j < 4; j++)
......
......@@ -92,19 +92,7 @@ void skinny128_384(u8* ctext, const u8* ptext, const u32* rtk1, const u32* rtk2
u32 tmp; // used in SWAPMOVE macro
u32 state[4]; // 128-bit state
packing(state, ptext); // from byte to bitsliced representation
QUADRUPLE_ROUND(state, rtk1, rtk2_3);
QUADRUPLE_ROUND(state, rtk1+16, rtk2_3+16);
QUADRUPLE_ROUND(state, rtk1+32, rtk2_3+32);
QUADRUPLE_ROUND(state, rtk1+48, rtk2_3+48);
QUADRUPLE_ROUND(state, rtk1, rtk2_3+64);
QUADRUPLE_ROUND(state, rtk1+16, rtk2_3+80);
QUADRUPLE_ROUND(state, rtk1+32, rtk2_3+96);
QUADRUPLE_ROUND(state, rtk1+48, rtk2_3+112);
QUADRUPLE_ROUND(state, rtk1, rtk2_3+128);
QUADRUPLE_ROUND(state, rtk1+16, rtk2_3+144);
QUADRUPLE_ROUND(state, rtk1+32, rtk2_3+160);
QUADRUPLE_ROUND(state, rtk1+48, rtk2_3+176);
QUADRUPLE_ROUND(state, rtk1, rtk2_3+192);
QUADRUPLE_ROUND(state, rtk1+16, rtk2_3+208);
for(int i = 0; i < 14; i++)
QUADRUPLE_ROUND(state, rtk1 + (i%4)*16, rtk2_3 + i*16);
unpacking(ctext, state); // from bitsliced to byte representation
}
\ No newline at end of file
......@@ -271,6 +271,70 @@ void precompute_lfsr_tk3(u32* tk, const u8* key, const int rounds) {
}
/******************************************************************************
* Precompute LFSR2(TK2) ^ LFSR3(TK3) for all round tweakeys.
* It is equivalent to the following 2 function calls:
* - precompute_lfsr_tk2(tk, t2, SKINNY128_384_ROUNDS);
* - precompute_lfsr_tk3(tk, t3, SKINNY128_384_ROUNDS);
* However, 'precompute_lfsr_tk2_3' can save cycles on some platforms.
* On ARMv7 one should observe a gain of ~1k cycles per function call, which is
* explained by the fact that fewer memory accesses to 'tk' are performed.
*
* To save some code size, the loop can be replaced by the following one:
* for(int i = 0 ; i < rounds; i+=2) {
* lfsr2_bs(tk2);
* lfsr3_bs(tk3);
* tk[i*4+4] = tk2[0] ^ tk3[0];
* tk[i*4+5] = tk2[1] ^ tk3[1];
* tk[i*4+6] = tk2[2] ^ tk3[2];
* tk[i*4+7] = tk2[3] ^ tk3[3];
* }
* at the cost of some cycles (~260 on ARM Cortex-M).
******************************************************************************/
void precompute_lfsr_tk2_3(u32* tk, const u8* t2, const u8* t3, const int rounds) {
    // tk     : output buffer. Words tk[0..3] are written first; then, for every
    //          i = 0, 8, 16, ... < rounds, the 4-word slots starting at
    //          tk[i*4+4], tk[i*4+12], tk[i*4+20] and tk[i*4+28] are written.
    //          When 'rounds' is a positive multiple of 8 the highest index
    //          touched is 4*rounds-1. The slots starting at tk[i*4+8],
    //          tk[i*4+16] and tk[i*4+24] are never written by this function.
    // t2, t3 : byte inputs handed to packing() (presumably the 16-byte TK2 and
    //          TK3 words -- confirm against packing()).
    // rounds : loop bound; each pass covers the window tk[i*4 .. i*4+31].
    u32 tk2[4], tk3[4];
    packing(tk2, t2);
    packing(tk3, t3);
    // Slot 0: XOR of the two packed inputs, before any update is applied.
    tk[0] = tk2[0] ^ tk3[0];
    tk[1] = tk2[1] ^ tk3[1];
    tk[2] = tk2[2] ^ tk3[2];
    tk[3] = tk2[3] ^ tk3[3];
    for(int i = 0 ; i < rounds; i+=8) {
        // Each step below refreshes exactly one word of tk2 and one word of
        // tk3: a masked XOR with another word of the same state, followed by
        // a swap of the two bits in every adjacent (even, odd) bit pair.
        // Step 1: refresh tk2[0] / tk3[3]; the store reads the words in
        // rotated order (tk2: 1,2,3,0 and tk3: 3,0,1,2).
        tk2[0] ^= (tk2[2] & 0xaaaaaaaa);
        tk2[0] = ((tk2[0] & 0xaaaaaaaa) >> 1) | ((tk2[0] << 1) & 0xaaaaaaaa);
        tk3[3] ^= ((tk3[1] & 0xaaaaaaaa) >> 1);
        tk3[3] = ((tk3[3] & 0xaaaaaaaa) >> 1) | ((tk3[3] << 1) & 0xaaaaaaaa);
        tk[i*4+4] = tk2[1] ^ tk3[3];
        tk[i*4+5] = tk2[2] ^ tk3[0];
        tk[i*4+6] = tk2[3] ^ tk3[1];
        tk[i*4+7] = tk2[0] ^ tk3[2];
        // Step 2: refresh tk2[1] / tk3[2]; store order is tk2: 2,3,0,1 and
        // tk3: 2,3,0,1.
        tk2[1] ^= (tk2[3] & 0xaaaaaaaa);
        tk2[1] = ((tk2[1] & 0xaaaaaaaa) >> 1) | ((tk2[1] << 1) & 0xaaaaaaaa);
        tk3[2] ^= ((tk3[0] & 0xaaaaaaaa) >> 1);
        tk3[2] = ((tk3[2] & 0xaaaaaaaa) >> 1) | ((tk3[2] << 1) & 0xaaaaaaaa);
        tk[i*4+12] = tk2[2] ^ tk3[2];
        tk[i*4+13] = tk2[3] ^ tk3[3];
        tk[i*4+14] = tk2[0] ^ tk3[0];
        tk[i*4+15] = tk2[1] ^ tk3[1];
        // Step 3: refresh tk2[2] / tk3[1]; store order is tk2: 3,0,1,2 and
        // tk3: 1,2,3,0.
        tk2[2] ^= (tk2[0] & 0xaaaaaaaa);
        tk2[2] = ((tk2[2] & 0xaaaaaaaa) >> 1) | ((tk2[2] << 1) & 0xaaaaaaaa);
        tk3[1] ^= ((tk3[3] & 0xaaaaaaaa) >> 1);
        tk3[1] = ((tk3[1] & 0xaaaaaaaa) >> 1) | ((tk3[1] << 1) & 0xaaaaaaaa);
        tk[i*4+20] = tk2[3] ^ tk3[1];
        tk[i*4+21] = tk2[0] ^ tk3[2];
        tk[i*4+22] = tk2[1] ^ tk3[3];
        tk[i*4+23] = tk2[2] ^ tk3[0];
        // Step 4: refresh tk2[3] / tk3[0]; both states are back in natural
        // word order (0,1,2,3) for the store and for the next pass.
        tk2[3] ^= (tk2[1] & 0xaaaaaaaa);
        tk2[3] = ((tk2[3] & 0xaaaaaaaa) >> 1) | ((tk2[3] << 1) & 0xaaaaaaaa);
        tk3[0] ^= ((tk3[2] & 0xaaaaaaaa) >> 1);
        tk3[0] = ((tk3[0] & 0xaaaaaaaa) >> 1) | ((tk3[0] << 1) & 0xaaaaaaaa);
        tk[i*4+28] = tk2[0] ^ tk3[0];
        tk[i*4+29] = tk2[1] ^ tk3[1];
        tk[i*4+30] = tk2[2] ^ tk3[2];
        tk[i*4+31] = tk2[3] ^ tk3[3];
    }
}
/******************************************************************************
* XOR TK with TK1 before applying the permutations.
* The key is then rearranged to match the barrel shiftrows representation.
******************************************************************************/
......@@ -278,19 +342,20 @@ void permute_tk(u32* tk, const u8* key, const int rounds) {
u32 test;
u32 tk1[4], tmp[4];
packing(tk1, key);
memcpy(tmp, tk, 16);
tmp[0] ^= tk1[0];
tmp[1] ^= tk1[1];
tmp[2] ^= tk1[2];
tmp[3] ^= tk1[3];
tmp[0] = tk[0] ^ tk1[0];
tmp[1] = tk[1] ^ tk1[1];
tmp[2] = tk[2] ^ tk1[2];
tmp[3] = tk[3] ^ tk1[3];
for(int i = 0 ; i < rounds; i += 8) {
test = (i % 16 < 8) ? 1 : 0; //to apply the right power of P
tk[i*4] = tmp[2] & 0xf0f0f0f0;
tk[i*4+1] = tmp[3] & 0xf0f0f0f0;
tk[i*4+2] = tmp[0] & 0xf0f0f0f0;
tk[i*4+3] = tmp[1] & 0xf0f0f0f0;
memcpy(tmp, tk+i*4+4, 16);
XOR_BLOCKS(tmp, tk1);
tmp[0] = tk[i*4+4] ^ tk1[0];
tmp[1] = tk[i*4+5] ^ tk1[1];
tmp[2] = tk[i*4+6] ^ tk1[2];
tmp[3] = tk[i*4+7] ^ tk1[3];
if (test)
permute_tk_2(tmp); // applies P^2
else
......@@ -307,8 +372,10 @@ void permute_tk(u32* tk, const u8* key, const int rounds) {
tk[i*4+10] |= ROR(tmp[0],12) & 0x0c0c0c0c;
tk[i*4+11] = ROR(tmp[1],28) & 0x03030303;
tk[i*4+11] |= ROR(tmp[1],12) & 0x0c0c0c0c;
memcpy(tmp, tk+i*4+12, 16);
XOR_BLOCKS(tmp, tk1);
tmp[0] = tk[i*4+12] ^ tk1[0];
tmp[1] = tk[i*4+13] ^ tk1[1];
tmp[2] = tk[i*4+14] ^ tk1[2];
tmp[3] = tk[i*4+15] ^ tk1[3];
if (test)
permute_tk_4(tmp); // applies P^4
else
......@@ -321,8 +388,10 @@ void permute_tk(u32* tk, const u8* key, const int rounds) {
tk[i*4+17] = ROR(tmp[3], 16) & 0xf0f0f0f0;
tk[i*4+18] = ROR(tmp[0], 16) & 0xf0f0f0f0;
tk[i*4+19] = ROR(tmp[1], 16) & 0xf0f0f0f0;
memcpy(tmp, tk+i*4+20, 16);
XOR_BLOCKS(tmp, tk1);
tmp[0] = tk[i*4+20] ^ tk1[0];
tmp[1] = tk[i*4+21] ^ tk1[1];
tmp[2] = tk[i*4+22] ^ tk1[2];
tmp[3] = tk[i*4+23] ^ tk1[3];
if (test)
permute_tk_6(tmp); // applies P^6
else
......@@ -339,8 +408,10 @@ void permute_tk(u32* tk, const u8* key, const int rounds) {
tk[i*4+26] |= ROR(tmp[0],28) & 0x0c0c0c0c;
tk[i*4+27] = ROR(tmp[1],12) & 0x03030303;
tk[i*4+27] |= ROR(tmp[1],28) & 0x0c0c0c0c;
memcpy(tmp, tk+i*4+28, 16);
XOR_BLOCKS(tmp, tk1);
tmp[0] = tk[i*4+28] ^ tk1[0];
tmp[1] = tk[i*4+29] ^ tk1[1];
tmp[2] = tk[i*4+30] ^ tk1[2];
tmp[3] = tk[i*4+31] ^ tk1[3];
if (test)
permute_tk_8(tmp); // applies P^8
for(int j = 0; j < 4; j++) {
......@@ -361,8 +432,7 @@ void permute_tk(u32* tk, const u8* key, const int rounds) {
******************************************************************************/
void precompute_rtk2_3(u32* rtk, const u8* tk2, const u8 * tk3) {
memset(rtk, 0x00, 16*SKINNY128_384_ROUNDS);
precompute_lfsr_tk2(rtk, tk2, SKINNY128_384_ROUNDS);
precompute_lfsr_tk3(rtk, tk3, SKINNY128_384_ROUNDS);
precompute_lfsr_tk2_3(rtk, tk2, tk3, SKINNY128_384_ROUNDS);
permute_tk(rtk, (u8*)(rtk+8), SKINNY128_384_ROUNDS); // rtk+8 is NULL
for(int i = 0; i < SKINNY128_384_ROUNDS; i++) { // add rconsts
for(int j = 0; j < 4; j++)
......
......@@ -93,15 +93,7 @@ void skinny128_384_plus(u8* ctext, const u8* ptext, const u32* rtk1,
u32 tmp; // used in SWAPMOVE macro
u32 state[4]; // 128-bit state
packing(state, ptext); // from byte to bitsliced representation
QUADRUPLE_ROUND(state, rtk1, rtk2_3);
QUADRUPLE_ROUND(state, rtk1+16, rtk2_3+16);
QUADRUPLE_ROUND(state, rtk1+32, rtk2_3+32);
QUADRUPLE_ROUND(state, rtk1+48, rtk2_3+48);
QUADRUPLE_ROUND(state, rtk1, rtk2_3+64);
QUADRUPLE_ROUND(state, rtk1+16, rtk2_3+80);
QUADRUPLE_ROUND(state, rtk1+32, rtk2_3+96);
QUADRUPLE_ROUND(state, rtk1+48, rtk2_3+112);
QUADRUPLE_ROUND(state, rtk1, rtk2_3+128);
QUADRUPLE_ROUND(state, rtk1+16, rtk2_3+144);
for(int i = 0; i < 10; i++)
QUADRUPLE_ROUND(state, rtk1 + (i%4)*16, rtk2_3 + i*16);
unpacking(ctext, state); // from bitsliced to byte representation
}
\ No newline at end of file
......@@ -260,6 +260,70 @@ void precompute_lfsr_tk3(u32* tk, const u8* key, const int rounds) {
}
/******************************************************************************
* Precompute LFSR2(TK2) ^ LFSR3(TK3) for all round tweakeys.
* It is equivalent to the following 2 function calls:
* - precompute_lfsr_tk2(tk, t2, SKINNY128_384_ROUNDS);
* - precompute_lfsr_tk3(tk, t3, SKINNY128_384_ROUNDS);
* However, 'precompute_lfsr_tk2_3' can save cycles on some platforms.
* On ARMv7 one should observe a gain of ~1k cycles per function call, which is
* explained by the fact that fewer memory accesses to 'tk' are performed.
*
* To save some code size, the loop can be replaced by the following one:
* for(int i = 0 ; i < rounds; i+=2) {
* lfsr2_bs(tk2);
* lfsr3_bs(tk3);
* tk[i*4+4] = tk2[0] ^ tk3[0];
* tk[i*4+5] = tk2[1] ^ tk3[1];
* tk[i*4+6] = tk2[2] ^ tk3[2];
* tk[i*4+7] = tk2[3] ^ tk3[3];
* }
* at the cost of some cycles (~260 on ARM Cortex-M).
******************************************************************************/
void precompute_lfsr_tk2_3(u32* tk, const u8* t2, const u8* t3, const int rounds) {
    // tk     : output buffer. Words tk[0..3] are written first; then, for every
    //          i = 0, 8, 16, ... < rounds, the 4-word slots starting at
    //          tk[i*4+4], tk[i*4+12], tk[i*4+20] and tk[i*4+28] are written.
    //          When 'rounds' is a positive multiple of 8 the highest index
    //          touched is 4*rounds-1. The slots starting at tk[i*4+8],
    //          tk[i*4+16] and tk[i*4+24] are never written by this function.
    // t2, t3 : byte inputs handed to packing() (presumably the 16-byte TK2 and
    //          TK3 words -- confirm against packing()).
    // rounds : loop bound; each pass covers the window tk[i*4 .. i*4+31].
    u32 tk2[4], tk3[4];
    packing(tk2, t2);
    packing(tk3, t3);
    // Slot 0: XOR of the two packed inputs, before any update is applied.
    tk[0] = tk2[0] ^ tk3[0];
    tk[1] = tk2[1] ^ tk3[1];
    tk[2] = tk2[2] ^ tk3[2];
    tk[3] = tk2[3] ^ tk3[3];
    for(int i = 0 ; i < rounds; i+=8) {
        // Each step below refreshes exactly one word of tk2 and one word of
        // tk3: a masked XOR with another word of the same state, followed by
        // a swap of the two bits in every adjacent (even, odd) bit pair.
        // Step 1: refresh tk2[0] / tk3[3]; the store reads the words in
        // rotated order (tk2: 1,2,3,0 and tk3: 3,0,1,2).
        tk2[0] ^= (tk2[2] & 0xaaaaaaaa);
        tk2[0] = ((tk2[0] & 0xaaaaaaaa) >> 1) | ((tk2[0] << 1) & 0xaaaaaaaa);
        tk3[3] ^= ((tk3[1] & 0xaaaaaaaa) >> 1);
        tk3[3] = ((tk3[3] & 0xaaaaaaaa) >> 1) | ((tk3[3] << 1) & 0xaaaaaaaa);
        tk[i*4+4] = tk2[1] ^ tk3[3];
        tk[i*4+5] = tk2[2] ^ tk3[0];
        tk[i*4+6] = tk2[3] ^ tk3[1];
        tk[i*4+7] = tk2[0] ^ tk3[2];
        // Step 2: refresh tk2[1] / tk3[2]; store order is tk2: 2,3,0,1 and
        // tk3: 2,3,0,1.
        tk2[1] ^= (tk2[3] & 0xaaaaaaaa);
        tk2[1] = ((tk2[1] & 0xaaaaaaaa) >> 1) | ((tk2[1] << 1) & 0xaaaaaaaa);
        tk3[2] ^= ((tk3[0] & 0xaaaaaaaa) >> 1);
        tk3[2] = ((tk3[2] & 0xaaaaaaaa) >> 1) | ((tk3[2] << 1) & 0xaaaaaaaa);
        tk[i*4+12] = tk2[2] ^ tk3[2];
        tk[i*4+13] = tk2[3] ^ tk3[3];
        tk[i*4+14] = tk2[0] ^ tk3[0];
        tk[i*4+15] = tk2[1] ^ tk3[1];
        // Step 3: refresh tk2[2] / tk3[1]; store order is tk2: 3,0,1,2 and
        // tk3: 1,2,3,0.
        tk2[2] ^= (tk2[0] & 0xaaaaaaaa);
        tk2[2] = ((tk2[2] & 0xaaaaaaaa) >> 1) | ((tk2[2] << 1) & 0xaaaaaaaa);
        tk3[1] ^= ((tk3[3] & 0xaaaaaaaa) >> 1);
        tk3[1] = ((tk3[1] & 0xaaaaaaaa) >> 1) | ((tk3[1] << 1) & 0xaaaaaaaa);
        tk[i*4+20] = tk2[3] ^ tk3[1];
        tk[i*4+21] = tk2[0] ^ tk3[2];
        tk[i*4+22] = tk2[1] ^ tk3[3];
        tk[i*4+23] = tk2[2] ^ tk3[0];
        // Step 4: refresh tk2[3] / tk3[0]; both states are back in natural
        // word order (0,1,2,3) for the store and for the next pass.
        tk2[3] ^= (tk2[1] & 0xaaaaaaaa);
        tk2[3] = ((tk2[3] & 0xaaaaaaaa) >> 1) | ((tk2[3] << 1) & 0xaaaaaaaa);
        tk3[0] ^= ((tk3[2] & 0xaaaaaaaa) >> 1);
        tk3[0] = ((tk3[0] & 0xaaaaaaaa) >> 1) | ((tk3[0] << 1) & 0xaaaaaaaa);
        tk[i*4+28] = tk2[0] ^ tk3[0];
        tk[i*4+29] = tk2[1] ^ tk3[1];
        tk[i*4+30] = tk2[2] ^ tk3[2];
        tk[i*4+31] = tk2[3] ^ tk3[3];
    }
}
/******************************************************************************
* XOR TK with TK1 before applying the permutations.
* The key is then rearranged to match the barrel shiftrows representation.
******************************************************************************/
......@@ -267,19 +331,20 @@ void permute_tk(u32* tk, const u8* key, const int rounds) {
u32 test;
u32 tk1[4], tmp[4];
packing(tk1, key);
memcpy(tmp, tk, 16);
tmp[0] ^= tk1[0];
tmp[1] ^= tk1[1];
tmp[2] ^= tk1[2];
tmp[3] ^= tk1[3];
tmp[0] = tk[0] ^ tk1[0];
tmp[1] = tk[1] ^ tk1[1];
tmp[2] = tk[2] ^ tk1[2];
tmp[3] = tk[3] ^ tk1[3];
for(int i = 0 ; i < rounds; i += 8) {
test = (i % 16 < 8) ? 1 : 0; //to apply the right power of P
tk[i*4] = tmp[2] & 0xf0f0f0f0;
tk[i*4+1] = tmp[3] & 0xf0f0f0f0;
tk[i*4+2] = tmp[0] & 0xf0f0f0f0;
tk[i*4+3] = tmp[1] & 0xf0f0f0f0;
memcpy(tmp, tk+i*4+4, 16);
XOR_BLOCKS(tmp, tk1);
tmp[0] = tk[i*4+4] ^ tk1[0];
tmp[1] = tk[i*4+5] ^ tk1[1];
tmp[2] = tk[i*4+6] ^ tk1[2];
tmp[3] = tk[i*4+7] ^ tk1[3];
if (test)
permute_tk_2(tmp); // applies P^2
else
......@@ -296,8 +361,10 @@ void permute_tk(u32* tk, const u8* key, const int rounds) {
tk[i*4+10] |= ROR(tmp[0],12) & 0x0c0c0c0c;
tk[i*4+11] = ROR(tmp[1],28) & 0x03030303;
tk[i*4+11] |= ROR(tmp[1],12) & 0x0c0c0c0c;
memcpy(tmp, tk+i*4+12, 16);
XOR_BLOCKS(tmp, tk1);
tmp[0] = tk[i*4+12] ^ tk1[0];
tmp[1] = tk[i*4+13] ^ tk1[1];
tmp[2] = tk[i*4+14] ^ tk1[2];
tmp[3] = tk[i*4+15] ^ tk1[3];
if (test)
permute_tk_4(tmp); // applies P^4
else
......@@ -310,8 +377,10 @@ void permute_tk(u32* tk, const u8* key, const int rounds) {
tk[i*4+17] = ROR(tmp[3], 16) & 0xf0f0f0f0;
tk[i*4+18] = ROR(tmp[0], 16) & 0xf0f0f0f0;
tk[i*4+19] = ROR(tmp[1], 16) & 0xf0f0f0f0;
memcpy(tmp, tk+i*4+20, 16);
XOR_BLOCKS(tmp, tk1);
tmp[0] = tk[i*4+20] ^ tk1[0];
tmp[1] = tk[i*4+21] ^ tk1[1];
tmp[2] = tk[i*4+22] ^ tk1[2];
tmp[3] = tk[i*4+23] ^ tk1[3];
if (test)
permute_tk_6(tmp); // applies P^6
else
......@@ -328,8 +397,10 @@ void permute_tk(u32* tk, const u8* key, const int rounds) {
tk[i*4+26] |= ROR(tmp[0],28) & 0x0c0c0c0c;
tk[i*4+27] = ROR(tmp[1],12) & 0x03030303;
tk[i*4+27] |= ROR(tmp[1],28) & 0x0c0c0c0c;
memcpy(tmp, tk+i*4+28, 16);
XOR_BLOCKS(tmp, tk1);
tmp[0] = tk[i*4+28] ^ tk1[0];
tmp[1] = tk[i*4+29] ^ tk1[1];
tmp[2] = tk[i*4+30] ^ tk1[2];
tmp[3] = tk[i*4+31] ^ tk1[3];
if (test)
permute_tk_8(tmp); // applies P^8
for(int j = 0; j < 4; j++) {
......@@ -350,8 +421,7 @@ void permute_tk(u32* tk, const u8* key, const int rounds) {
******************************************************************************/
void precompute_rtk2_3(u32* rtk, const u8* tk2, const u8 * tk3) {
memset(rtk, 0x00, 16*SKINNY128_384_ROUNDS);
precompute_lfsr_tk2(rtk, tk2, SKINNY128_384_ROUNDS);
precompute_lfsr_tk3(rtk, tk3, SKINNY128_384_ROUNDS);
precompute_lfsr_tk2_3(rtk, tk2, tk3, SKINNY128_384_ROUNDS);
permute_tk(rtk, (u8*)(rtk+8), SKINNY128_384_ROUNDS); // rtk+8 is NULL
for(int i = 0; i < SKINNY128_384_ROUNDS; i++) { // add rconsts
for(int j = 0; j < 4; j++)
......
......@@ -92,19 +92,7 @@ void skinny128_384(u8* ctext, const u8* ptext, const u32* rtk1, const u32* rtk2
u32 tmp; // used in SWAPMOVE macro
u32 state[4]; // 128-bit state
packing(state, ptext); // from byte to bitsliced representation
QUADRUPLE_ROUND(state, rtk1, rtk2_3);
QUADRUPLE_ROUND(state, rtk1+16, rtk2_3+16);
QUADRUPLE_ROUND(state, rtk1+32, rtk2_3+32);
QUADRUPLE_ROUND(state, rtk1+48, rtk2_3+48);
QUADRUPLE_ROUND(state, rtk1, rtk2_3+64);
QUADRUPLE_ROUND(state, rtk1+16, rtk2_3+80);
QUADRUPLE_ROUND(state, rtk1+32, rtk2_3+96);
QUADRUPLE_ROUND(state, rtk1+48, rtk2_3+112);
QUADRUPLE_ROUND(state, rtk1, rtk2_3+128);
QUADRUPLE_ROUND(state, rtk1+16, rtk2_3+144);
QUADRUPLE_ROUND(state, rtk1+32, rtk2_3+160);
QUADRUPLE_ROUND(state, rtk1+48, rtk2_3+176);
QUADRUPLE_ROUND(state, rtk1, rtk2_3+192);
QUADRUPLE_ROUND(state, rtk1+16, rtk2_3+208);
for(int i = 0; i < 14; i++)
QUADRUPLE_ROUND(state, rtk1 + (i%4)*16, rtk2_3 + i*16);
unpacking(ctext, state); // from bitsliced to byte representation
}
\ No newline at end of file
......@@ -271,6 +271,70 @@ void precompute_lfsr_tk3(u32* tk, const u8* key, const int rounds) {
}
/******************************************************************************
* Precompute LFSR2(TK2) ^ LFSR3(TK3) for all round tweakeys.
* It is equivalent to the following 2 function calls:
* - precompute_lfsr_tk2(tk, t2, SKINNY128_384_ROUNDS);
* - precompute_lfsr_tk3(tk, t3, SKINNY128_384_ROUNDS);
* However, 'precompute_lfsr_tk2_3' can save cycles on some platforms.
* On ARMv7 one should observe a gain of ~1k cycles per function call, which is
* explained by the fact that fewer memory accesses to 'tk' are performed.
*
* To save some code size, the loop can be replaced by the following one:
* for(int i = 0 ; i < rounds; i+=2) {
* lfsr2_bs(tk2);
* lfsr3_bs(tk3);
* tk[i*4+4] = tk2[0] ^ tk3[0];
* tk[i*4+5] = tk2[1] ^ tk3[1];
* tk[i*4+6] = tk2[2] ^ tk3[2];
* tk[i*4+7] = tk2[3] ^ tk3[3];
* }
* at the cost of some cycles (~260 on ARM Cortex-M).
******************************************************************************/
void precompute_lfsr_tk2_3(u32* tk, const u8* t2, const u8* t3, const int rounds) {
    // tk     : output buffer. Words tk[0..3] are written first; then, for every
    //          i = 0, 8, 16, ... < rounds, the 4-word slots starting at
    //          tk[i*4+4], tk[i*4+12], tk[i*4+20] and tk[i*4+28] are written.
    //          When 'rounds' is a positive multiple of 8 the highest index
    //          touched is 4*rounds-1. The slots starting at tk[i*4+8],
    //          tk[i*4+16] and tk[i*4+24] are never written by this function.
    // t2, t3 : byte inputs handed to packing() (presumably the 16-byte TK2 and
    //          TK3 words -- confirm against packing()).
    // rounds : loop bound; each pass covers the window tk[i*4 .. i*4+31].
    u32 tk2[4], tk3[4];
    packing(tk2, t2);
    packing(tk3, t3);
    // Slot 0: XOR of the two packed inputs, before any update is applied.
    tk[0] = tk2[0] ^ tk3[0];
    tk[1] = tk2[1] ^ tk3[1];
    tk[2] = tk2[2] ^ tk3[2];
    tk[3] = tk2[3] ^ tk3[3];
    for(int i = 0 ; i < rounds; i+=8) {
        // Each step below refreshes exactly one word of tk2 and one word of
        // tk3: a masked XOR with another word of the same state, followed by
        // a swap of the two bits in every adjacent (even, odd) bit pair.
        // Step 1: refresh tk2[0] / tk3[3]; the store reads the words in
        // rotated order (tk2: 1,2,3,0 and tk3: 3,0,1,2).
        tk2[0] ^= (tk2[2] & 0xaaaaaaaa);
        tk2[0] = ((tk2[0] & 0xaaaaaaaa) >> 1) | ((tk2[0] << 1) & 0xaaaaaaaa);
        tk3[3] ^= ((tk3[1] & 0xaaaaaaaa) >> 1);
        tk3[3] = ((tk3[3] & 0xaaaaaaaa) >> 1) | ((tk3[3] << 1) & 0xaaaaaaaa);
        tk[i*4+4] = tk2[1] ^ tk3[3];
        tk[i*4+5] = tk2[2] ^ tk3[0];
        tk[i*4+6] = tk2[3] ^ tk3[1];
        tk[i*4+7] = tk2[0] ^ tk3[2];
        // Step 2: refresh tk2[1] / tk3[2]; store order is tk2: 2,3,0,1 and
        // tk3: 2,3,0,1.
        tk2[1] ^= (tk2[3] & 0xaaaaaaaa);
        tk2[1] = ((tk2[1] & 0xaaaaaaaa) >> 1) | ((tk2[1] << 1) & 0xaaaaaaaa);
        tk3[2] ^= ((tk3[0] & 0xaaaaaaaa) >> 1);
        tk3[2] = ((tk3[2] & 0xaaaaaaaa) >> 1) | ((tk3[2] << 1) & 0xaaaaaaaa);
        tk[i*4+12] = tk2[2] ^ tk3[2];
        tk[i*4+13] = tk2[3] ^ tk3[3];
        tk[i*4+14] = tk2[0] ^ tk3[0];
        tk[i*4+15] = tk2[1] ^ tk3[1];
        // Step 3: refresh tk2[2] / tk3[1]; store order is tk2: 3,0,1,2 and
        // tk3: 1,2,3,0.
        tk2[2] ^= (tk2[0] & 0xaaaaaaaa);
        tk2[2] = ((tk2[2] & 0xaaaaaaaa) >> 1) | ((tk2[2] << 1) & 0xaaaaaaaa);
        tk3[1] ^= ((tk3[3] & 0xaaaaaaaa) >> 1);
        tk3[1] = ((tk3[1] & 0xaaaaaaaa) >> 1) | ((tk3[1] << 1) & 0xaaaaaaaa);
        tk[i*4+20] = tk2[3] ^ tk3[1];
        tk[i*4+21] = tk2[0] ^ tk3[2];
        tk[i*4+22] = tk2[1] ^ tk3[3];
        tk[i*4+23] = tk2[2] ^ tk3[0];
        // Step 4: refresh tk2[3] / tk3[0]; both states are back in natural
        // word order (0,1,2,3) for the store and for the next pass.
        tk2[3] ^= (tk2[1] & 0xaaaaaaaa);
        tk2[3] = ((tk2[3] & 0xaaaaaaaa) >> 1) | ((tk2[3] << 1) & 0xaaaaaaaa);
        tk3[0] ^= ((tk3[2] & 0xaaaaaaaa) >> 1);
        tk3[0] = ((tk3[0] & 0xaaaaaaaa) >> 1) | ((tk3[0] << 1) & 0xaaaaaaaa);
        tk[i*4+28] = tk2[0] ^ tk3[0];
        tk[i*4+29] = tk2[1] ^ tk3[1];
        tk[i*4+30] = tk2[2] ^ tk3[2];
        tk[i*4+31] = tk2[3] ^ tk3[3];
    }
}
/******************************************************************************
* XOR TK with TK1 before applying the permutations.
* The key is then rearranged to match the barrel shiftrows representation.
******************************************************************************/
......@@ -278,19 +342,20 @@ void permute_tk(u32* tk, const u8* key, const int rounds) {
u32 test;
u32 tk1[4], tmp[4];
packing(tk1, key);
memcpy(tmp, tk, 16);
tmp[0] ^= tk1[0];
tmp[1] ^= tk1[1];
tmp[2] ^= tk1[2];
tmp[3] ^= tk1[3];
tmp[0] = tk[0] ^ tk1[0];
tmp[1] = tk[1] ^ tk1[1];
tmp[2] = tk[2] ^ tk1[2];
tmp[3] = tk[3] ^ tk1[3];
for(int i = 0 ; i < rounds; i += 8) {
test = (i % 16 < 8) ? 1 : 0; //to apply the right power of P
tk[i*4] = tmp[2] & 0xf0f0f0f0;
tk[i*4+1] = tmp[3] & 0xf0f0f0f0;
tk[i*4+2] = tmp[0] & 0xf0f0f0f0;
tk[i*4+3] = tmp[1] & 0xf0f0f0f0;
memcpy(tmp, tk+i*4+4, 16);
XOR_BLOCKS(tmp, tk1);
tmp[0] = tk[i*4+4] ^ tk1[0];
tmp[1] = tk[i*4+5] ^ tk1[1];
tmp[2] = tk[i*4+6] ^ tk1[2];
tmp[3] = tk[i*4+7] ^ tk1[3];
if (test)
permute_tk_2(tmp); // applies P^2
else
......@@ -307,8 +372,10 @@ void permute_tk(u32* tk, const u8* key, const int rounds) {
tk[i*4+10] |= ROR(tmp[0],12) & 0x0c0c0c0c;
tk[i*4+11] = ROR(tmp[1],28) & 0x03030303;
tk[i*4+11] |= ROR(tmp[1],12) & 0x0c0c0c0c;
memcpy(tmp, tk+i*4+12, 16);
XOR_BLOCKS(tmp, tk1);
tmp[0] = tk[i*4+12] ^ tk1[0];
tmp[1] = tk[i*4+13] ^ tk1[1];
tmp[2] = tk[i*4+14] ^ tk1[2];
tmp[3] = tk[i*4+15] ^ tk1[3];
if (test)
permute_tk_4(tmp); // applies P^4
else
......@@ -321,8 +388,10 @@ void permute_tk(u32* tk, const u8* key, const int rounds) {
tk[i*4+17] = ROR(tmp[3], 16) & 0xf0f0f0f0;
tk[i*4+18] = ROR(tmp[0], 16) & 0xf0f0f0f0;
tk[i*4+19] = ROR(tmp[1], 16) & 0xf0f0f0f0;
memcpy(tmp, tk+i*4+20, 16);
XOR_BLOCKS(tmp, tk1);
tmp[0] = tk[i*4+20] ^ tk1[0];
tmp[1] = tk[i*4+21] ^ tk1[1];
tmp[2] = tk[i*4+22] ^ tk1[2];
tmp[3] = tk[i*4+23] ^ tk1[3];
if (test)
permute_tk_6(tmp); // applies P^6
else
......@@ -339,8 +408,10 @@ void permute_tk(u32* tk, const u8* key, const int rounds) {
tk[i*4+26] |= ROR(tmp[0],28) & 0x0c0c0c0c;
tk[i*4+27] = ROR(tmp[1],12) & 0x03030303;
tk[i*4+27] |= ROR(tmp[1],28) & 0x0c0c0c0c;
memcpy(tmp, tk+i*4+28, 16);
XOR_BLOCKS(tmp, tk1);
tmp[0] = tk[i*4+28] ^ tk1[0];
tmp[1] = tk[i*4+29] ^ tk1[1];
tmp[2] = tk[i*4+30] ^ tk1[2];
tmp[3] = tk[i*4+31] ^ tk1[3];
if (test)
permute_tk_8(tmp); // applies P^8
for(int j = 0; j < 4; j++) {
......@@ -361,8 +432,7 @@ void permute_tk(u32* tk, const u8* key, const int rounds) {
******************************************************************************/
void precompute_rtk2_3(u32* rtk, const u8* tk2, const u8 * tk3) {
memset(rtk, 0x00, 16*SKINNY128_384_ROUNDS);
precompute_lfsr_tk2(rtk, tk2, SKINNY128_384_ROUNDS);
precompute_lfsr_tk3(rtk, tk3, SKINNY128_384_ROUNDS);
precompute_lfsr_tk2_3(rtk, tk2, tk3, SKINNY128_384_ROUNDS);
permute_tk(rtk, (u8*)(rtk+8), SKINNY128_384_ROUNDS); // rtk+8 is NULL
for(int i = 0; i < SKINNY128_384_ROUNDS; i++) { // add rconsts
for(int j = 0; j < 4; j++)
......
......@@ -8,12 +8,10 @@
* @author Alexandre Adomnicai, Nanyang Technological University,
* alexandre.adomnicai@ntu.edu.sg
*
* @date May 2020
* @date June 2020
******************************************************************************/
#include "skinny128.h"
#include "skinnyaead.h"
#include <string.h>
#include <stdio.h>
/******************************************************************************
* x ^= y where x, y are 128-bit blocks (16 bytes array).
......
......@@ -16,12 +16,9 @@
* @author Alexandre Adomnicai, Nanyang Technological University,
* alexandre.adomnicai@ntu.edu.sg
*
* @date May 2020
* @date June 2020
******************************************************************************/
#include <stdio.h>
#include <string.h>
#include "skinny128.h"
#include "tk_schedule.h"
/******************************************************************************
* The MixColumns computation for rounds i such that (i % 4) == 0
......@@ -153,16 +150,8 @@ void skinny128_384_plus_encrypt(u8* ctext, const u8* ptext, const u32* rtk1,
u32 tmp; // used in SWAPMOVE macro
u32 state[4]; // 128-bit state
packing(state, ptext); // from byte to bitsliced representation
QUADRUPLE_ROUND(state, rtk1, rtk2_3);
QUADRUPLE_ROUND(state, rtk1+16, rtk2_3+16);
QUADRUPLE_ROUND(state, rtk1+32, rtk2_3+32);
QUADRUPLE_ROUND(state, rtk1+48, rtk2_3+48);
QUADRUPLE_ROUND(state, rtk1, rtk2_3+64);
QUADRUPLE_ROUND(state, rtk1+16, rtk2_3+80);
QUADRUPLE_ROUND(state, rtk1+32, rtk2_3+96);
QUADRUPLE_ROUND(state, rtk1+48, rtk2_3+112);
QUADRUPLE_ROUND(state, rtk1, rtk2_3+128);
QUADRUPLE_ROUND(state, rtk1+16, rtk2_3+144);
for(int i = 0; i < 10; i++)
QUADRUPLE_ROUND(state, rtk1 + (i%4)*16, rtk2_3 + i*16);
unpacking(ctext, state); // from bitsliced to byte representation
}
......@@ -176,15 +165,7 @@ void skinny128_384_plus_decrypt(u8* ctext, const u8* ptext, const u32* rtk1,
u32 tmp; // used in SWAPMOVE macro
u32 state[4]; // 128-bit state
packing(state, ptext); // from byte to bitsliced representation
INV_QUADRUPLE_ROUND(state, rtk1+16, rtk2_3+144);
INV_QUADRUPLE_ROUND(state, rtk1, rtk2_3+128);
INV_QUADRUPLE_ROUND(state, rtk1+48, rtk2_3+112);
INV_QUADRUPLE_ROUND(state, rtk1+32, rtk2_3+96);
INV_QUADRUPLE_ROUND(state, rtk1+16, rtk2_3+80);
INV_QUADRUPLE_ROUND(state, rtk1, rtk2_3+64);
INV_QUADRUPLE_ROUND(state, rtk1+48, rtk2_3+48);
INV_QUADRUPLE_ROUND(state, rtk1+32, rtk2_3+32);
INV_QUADRUPLE_ROUND(state, rtk1+16, rtk2_3+16);
INV_QUADRUPLE_ROUND(state, rtk1, rtk2_3);
for(int i = 9; i >= 0; i--)
INV_QUADRUPLE_ROUND(state, rtk1 + (i%4)*16, rtk2_3 + i*16);
unpacking(ctext, state); // from bitsliced to byte representation
}
\ No newline at end of file
......@@ -3,9 +3,7 @@
#include "skinny128.h"
typedef unsigned char u8;
typedef unsigned int u32;
typedef unsigned long long u64;
typedef uint64_t u64;
#define TAGBYTES 16
#define KEYBYTES 16
......
......@@ -4,16 +4,11 @@
* @author Alexandre Adomnicai, Nanyang Technological University,
* alexandre.adomnicai@ntu.edu.sg
*
* @date May 2020
* @date June 2020
******************************************************************************/
#include <stdio.h>
#include <string.h> //for memcmp
#include "tk_schedule.h"
#include <string.h>
#include "skinny128.h"
typedef unsigned char u8;
typedef unsigned int u32;
/******************************************************************************
* The round constants according to the new representation.
******************************************************************************/
......@@ -260,6 +255,70 @@ void precompute_lfsr_tk3(u32* tk, const u8* key, const int rounds) {
}
/******************************************************************************
* Precompute LFSR2(TK2) ^ LFSR3(TK3) for all round tweakeys in a single pass.
*
* It is equivalent to the following 2 function calls:
* 	- precompute_lfsr_tk2(tk, t2, SKINNY128_384_ROUNDS);
* 	- precompute_lfsr_tk3(tk, t3, SKINNY128_384_ROUNDS);
* but 'precompute_lfsr_tk2_3' can save cycles on some platforms. On ARMv7 one
* should observe a gain of ~1k cycles per function call, which can be explained
* by the fact that fewer memory accesses to 'tk' are performed.
*
* Layout of the output:
* 	- tk[0..3] receives packing(t2) ^ packing(t3);
* 	- each loop iteration covers 8 rounds and performs 4 LFSR steps, storing
* 	  4 words at offsets +4, +12, +20 and +28 from tk[i*4];
* 	- the 4-word slots in between (+8, +16, +24, +32) are NOT written here.
* The last iteration writes up to tk[i*4+31], so when 'rounds' is a multiple
* of 8 the buffer must hold at least 4*rounds words.
*
* Each step updates a single word of each state in place; the stores then read
* the words in a rotated order, which is back to the natural order after the
* fourth step.
*
* To save some code size, the loop can be replaced by the following one:
* 	for(int i = 0 ; i < rounds; i+=2) {
* 		lfsr2_bs(tk2);
* 		lfsr3_bs(tk3);
* 		tk[i*4+4] = tk2[0] ^ tk3[0];
* 		tk[i*4+5] = tk2[1] ^ tk3[1];
* 		tk[i*4+6] = tk2[2] ^ tk3[2];
* 		tk[i*4+7] = tk2[3] ^ tk3[3];
* 	}
* at the cost of some cycles (~260 on ARM Cortex-M).
******************************************************************************/
// Mask selecting the odd-position bits (1, 3, 5, ...) of a 32-bit word.
#define TK23_ODD_BITS 0xaaaaaaaa
// Swap every odd-position bit of w with the even-position bit just below it.
#define TK23_SWAP_BITS(w) \
	((((w) & TK23_ODD_BITS) >> 1) | (((w) << 1) & TK23_ODD_BITS))
// Write the four words (a0^b0, a1^b1, a2^b2, a3^b3) to dst[0..3].
#define TK23_STORE(dst, a0, a1, a2, a3, b0, b1, b2, b3) do { \
	(dst)[0] = (a0) ^ (b0); \
	(dst)[1] = (a1) ^ (b1); \
	(dst)[2] = (a2) ^ (b2); \
	(dst)[3] = (a3) ^ (b3); \
} while (0)

void precompute_lfsr_tk2_3(u32* tk, const u8* t2, const u8* t3, const int rounds) {
	u32 s2[4], s3[4];	// packed TK2 / TK3 states

	packing(s2, t2);
	packing(s3, t3);

	// first round tweakey: the states before any LFSR step
	TK23_STORE(tk, s2[0], s2[1], s2[2], s2[3], s3[0], s3[1], s3[2], s3[3]);

	for (int r = 0; r < rounds; r += 8) {
		u32* out = tk + r*4;	// base of the current group of 8 rounds

		// step 1: update s2[0] and s3[3]
		s2[0] ^= s2[2] & TK23_ODD_BITS;
		s2[0]  = TK23_SWAP_BITS(s2[0]);
		s3[3] ^= (s3[1] & TK23_ODD_BITS) >> 1;
		s3[3]  = TK23_SWAP_BITS(s3[3]);
		TK23_STORE(out + 4,
			s2[1], s2[2], s2[3], s2[0],
			s3[3], s3[0], s3[1], s3[2]);

		// step 2: update s2[1] and s3[2]
		s2[1] ^= s2[3] & TK23_ODD_BITS;
		s2[1]  = TK23_SWAP_BITS(s2[1]);
		s3[2] ^= (s3[0] & TK23_ODD_BITS) >> 1;
		s3[2]  = TK23_SWAP_BITS(s3[2]);
		TK23_STORE(out + 12,
			s2[2], s2[3], s2[0], s2[1],
			s3[2], s3[3], s3[0], s3[1]);

		// step 3: update s2[2] and s3[1]
		s2[2] ^= s2[0] & TK23_ODD_BITS;
		s2[2]  = TK23_SWAP_BITS(s2[2]);
		s3[1] ^= (s3[3] & TK23_ODD_BITS) >> 1;
		s3[1]  = TK23_SWAP_BITS(s3[1]);
		TK23_STORE(out + 20,
			s2[3], s2[0], s2[1], s2[2],
			s3[1], s3[2], s3[3], s3[0]);

		// step 4: update s2[3] and s3[0]; words are back in natural order
		s2[3] ^= s2[1] & TK23_ODD_BITS;
		s2[3]  = TK23_SWAP_BITS(s2[3]);
		s3[0] ^= (s3[2] & TK23_ODD_BITS) >> 1;
		s3[0]  = TK23_SWAP_BITS(s3[0]);
		TK23_STORE(out + 28,
			s2[0], s2[1], s2[2], s2[3],
			s3[0], s3[1], s3[2], s3[3]);
	}
}

#undef TK23_STORE
#undef TK23_SWAP_BITS
#undef TK23_ODD_BITS
/******************************************************************************
* XOR TK with TK1 before applying the permutations.
* The key is then rearranged to match the barrel shiftrows representation.
******************************************************************************/
......@@ -267,19 +326,20 @@ void permute_tk(u32* tk, const u8* key, const int rounds) {
u32 test;
u32 tk1[4], tmp[4];
packing(tk1, key);
memcpy(tmp, tk, 16);
tmp[0] ^= tk1[0];
tmp[1] ^= tk1[1];
tmp[2] ^= tk1[2];
tmp[3] ^= tk1[3];
tmp[0] = tk[0] ^ tk1[0];
tmp[1] = tk[1] ^ tk1[1];
tmp[2] = tk[2] ^ tk1[2];
tmp[3] = tk[3] ^ tk1[3];
for(int i = 0 ; i < rounds; i += 8) {
test = (i % 16 < 8) ? 1 : 0; //to apply the right power of P
tk[i*4] = tmp[2] & 0xf0f0f0f0;
tk[i*4+1] = tmp[3] & 0xf0f0f0f0;
tk[i*4+2] = tmp[0] & 0xf0f0f0f0;
tk[i*4+3] = tmp[1] & 0xf0f0f0f0;
memcpy(tmp, tk+i*4+4, 16);
XOR_BLOCKS(tmp, tk1);
tmp[0] = tk[i*4+4] ^ tk1[0];
tmp[1] = tk[i*4+5] ^ tk1[1];
tmp[2] = tk[i*4+6] ^ tk1[2];
tmp[3] = tk[i*4+7] ^ tk1[3];
if (test)
permute_tk_2(tmp); // applies P^2
else
......@@ -296,8 +356,10 @@ void permute_tk(u32* tk, const u8* key, const int rounds) {
tk[i*4+10] |= ROR(tmp[0],12) & 0x0c0c0c0c;
tk[i*4+11] = ROR(tmp[1],28) & 0x03030303;
tk[i*4+11] |= ROR(tmp[1],12) & 0x0c0c0c0c;
memcpy(tmp, tk+i*4+12, 16);
XOR_BLOCKS(tmp, tk1);
tmp[0] = tk[i*4+12] ^ tk1[0];
tmp[1] = tk[i*4+13] ^ tk1[1];
tmp[2] = tk[i*4+14] ^ tk1[2];
tmp[3] = tk[i*4+15] ^ tk1[3];
if (test)
permute_tk_4(tmp); // applies P^4
else
......@@ -310,8 +372,10 @@ void permute_tk(u32* tk, const u8* key, const int rounds) {
tk[i*4+17] = ROR(tmp[3], 16) & 0xf0f0f0f0;
tk[i*4+18] = ROR(tmp[0], 16) & 0xf0f0f0f0;
tk[i*4+19] = ROR(tmp[1], 16) & 0xf0f0f0f0;
memcpy(tmp, tk+i*4+20, 16);
XOR_BLOCKS(tmp, tk1);
tmp[0] = tk[i*4+20] ^ tk1[0];
tmp[1] = tk[i*4+21] ^ tk1[1];
tmp[2] = tk[i*4+22] ^ tk1[2];
tmp[3] = tk[i*4+23] ^ tk1[3];
if (test)
permute_tk_6(tmp); // applies P^6
else
......@@ -328,8 +392,10 @@ void permute_tk(u32* tk, const u8* key, const int rounds) {
tk[i*4+26] |= ROR(tmp[0],28) & 0x0c0c0c0c;
tk[i*4+27] = ROR(tmp[1],12) & 0x03030303;
tk[i*4+27] |= ROR(tmp[1],28) & 0x0c0c0c0c;
memcpy(tmp, tk+i*4+28, 16);
XOR_BLOCKS(tmp, tk1);
tmp[0] = tk[i*4+28] ^ tk1[0];
tmp[1] = tk[i*4+29] ^ tk1[1];
tmp[2] = tk[i*4+30] ^ tk1[2];
tmp[3] = tk[i*4+31] ^ tk1[3];
if (test)
permute_tk_8(tmp); // applies P^8
for(int j = 0; j < 4; j++) {
......@@ -350,8 +416,7 @@ void permute_tk(u32* tk, const u8* key, const int rounds) {
******************************************************************************/
void precompute_rtk2_3(u32* rtk, const u8* tk2, const u8 * tk3) {
memset(rtk, 0x00, 16*SKINNY128_384_ROUNDS);
precompute_lfsr_tk2(rtk, tk2, SKINNY128_384_ROUNDS);
precompute_lfsr_tk3(rtk, tk3, SKINNY128_384_ROUNDS);
precompute_lfsr_tk2_3(rtk, tk2, tk3, SKINNY128_384_ROUNDS);
permute_tk(rtk, (u8*)(rtk+8), SKINNY128_384_ROUNDS); // rtk+8 is NULL
for(int i = 0; i < SKINNY128_384_ROUNDS; i++) { // add rconsts
for(int j = 0; j < 4; j++)
......
#ifndef TK_SCHEDULE_H_
#define TK_SCHEDULE_H_
typedef unsigned char u8;
typedef unsigned int u32;
#include <stdint.h>
typedef uint8_t u8;
typedef uint32_t u32;
void packing(u32* out, const u8* in);
void unpacking(u8* out, u32 *in);
......@@ -11,13 +12,6 @@ void precompute_rtk1(u32* rtk1, const u8* tk1);
#define ROR(x,y) (((x) >> (y)) | ((x) << (32 - (y))))
#define XOR_BLOCKS(x,y) ({ \
(x)[0] ^= (y)[0]; \
(x)[1] ^= (y)[1]; \
(x)[2] ^= (y)[2]; \
(x)[3] ^= (y)[3]; \
})
#define SWAPMOVE(a, b, mask, n) ({ \
tmp = (b ^ (a >> n)) & mask; \
b ^= tmp; \
......
......@@ -8,12 +8,10 @@
* @author Alexandre Adomnicai, Nanyang Technological University,
* alexandre.adomnicai@ntu.edu.sg
*
* @date May 2020
* @date June 2020
******************************************************************************/
#include "skinny128.h"
#include "skinnyaead.h"
#include <string.h>
#include <stdio.h>
/******************************************************************************
* x ^= y where x, y are 128-bit blocks (16 bytes array).
......@@ -33,13 +31,13 @@ static void skinny_aead_m1_auth(u8* auth, u8* c, u8* tag, tweakey* tk,
u8 feedback;
u8 tmp[2*BLOCKBYTES];
memset(tmp, 0x00, 2*BLOCKBYTES);
memset(auth, 0x00, BLOCKBYTES);
SET_DOMAIN(tmp, 0x02);
SET_DOMAIN(tmp + BLOCKBYTES, 0x02);
memset(auth, 0x00, BLOCKBYTES);
while (adlen >= 2*BLOCKBYTES) {
LE_STR_64(tmp, lfsr);
UPDATE_LFSR(lfsr);
LE_STR_64(tmp + BLOCKBYTES, lfsr);
SET_DOMAIN(tmp + BLOCKBYTES, 0x02);
precompute_rtk1(tk->rtk1, tmp, tmp+BLOCKBYTES);
skinny128_384_plus_encrypt(tmp, tmp+BLOCKBYTES, ad, ad+BLOCKBYTES, *tk);
xor_block(auth, tmp);
......@@ -47,6 +45,9 @@ static void skinny_aead_m1_auth(u8* auth, u8* c, u8* tag, tweakey* tk,
adlen -= 2*BLOCKBYTES;
ad += 2*BLOCKBYTES;
UPDATE_LFSR(lfsr);
memset(tmp, 0x00, 2*BLOCKBYTES); // to save 32 bytes of RAM
SET_DOMAIN(tmp, 0x02);
SET_DOMAIN(tmp + BLOCKBYTES, 0x02);
}
if (adlen > BLOCKBYTES) { // pad and process 2 blocs in //
LE_STR_64(tmp, lfsr);
......@@ -65,11 +66,12 @@ static void skinny_aead_m1_auth(u8* auth, u8* c, u8* tag, tweakey* tk,
LE_STR_64(tmp, lfsr);
if (mlen == 0) { // if tag has *NOT* been calculated yet
precompute_rtk1(tk->rtk1, tmp, tag); // compute the tag
skinny128_384_plus_encrypt(auth, c, ad, c, *tk);
skinny128_384_plus_encrypt(tmp, c, ad, c, *tk);
} else { // if tag has been calculated yet
precompute_rtk1(tk->rtk1, tmp, tmp); // process last ad block
skinny128_384_plus_encrypt(auth, auth, ad, ad, *tk);
skinny128_384_plus_encrypt(tmp, tmp, ad, ad, *tk);
}
xor_block(auth, tmp);
} else if (adlen > 0) {
LE_STR_64(tmp, lfsr);
SET_DOMAIN(tmp, 0x03); // domain for padding ad
......@@ -78,11 +80,12 @@ static void skinny_aead_m1_auth(u8* auth, u8* c, u8* tag, tweakey* tk,
tmp[BLOCKBYTES + adlen] ^= 0x80; // padding
if (mlen == 0) { // if tag has *NOT* been calculated yet
precompute_rtk1(tk->rtk1, tmp, tag); // compute the tag
skinny128_384_plus_encrypt(auth, c, tmp + BLOCKBYTES, c, *tk);
skinny128_384_plus_encrypt(tmp, c, tmp + BLOCKBYTES, c, *tk);
} else { // if tag has been calculated yet
precompute_rtk1(tk->rtk1, tmp, tmp); // process last ad block
skinny128_384_plus_encrypt(auth, auth, tmp + BLOCKBYTES, tmp + BLOCKBYTES, *tk);
skinny128_384_plus_encrypt(tmp, tmp, tmp + BLOCKBYTES, tmp + BLOCKBYTES, *tk);
}
xor_block(auth, tmp);
}
}
......
......@@ -16,12 +16,9 @@
* @author Alexandre Adomnicai, Nanyang Technological University,
* alexandre.adomnicai@ntu.edu.sg
*
* @date May 2020
* @date June 2020
******************************************************************************/
#include <stdio.h>
#include <string.h>
#include "skinny128.h"
#include "tk_schedule.h"
/****************************************************************************
* The MixColumns operation for rounds i such that (i % 4) == 0.
......@@ -84,7 +81,7 @@ void mixcolumns_3(u32* state) {
}
/****************************************************************************
* The inverse MixColumns oepration for rounds i such that (i % 4) == 0
* The inverse MixColumns operation for rounds i such that (i % 4) == 0
****************************************************************************/
void inv_mixcolumns_0(u32* state) {
u32 tmp;
......@@ -99,7 +96,7 @@ void inv_mixcolumns_0(u32* state) {
}
/****************************************************************************
* The inverse MixColumns oepration for rounds i such that (i % 4) == 1
* The inverse MixColumns operation for rounds i such that (i % 4) == 1
****************************************************************************/
void inv_mixcolumns_1(u32* state) {
u32 tmp;
......@@ -114,7 +111,7 @@ void inv_mixcolumns_1(u32* state) {
}
/****************************************************************************
* The inverse MixColumns oepration for rounds i such that (i % 4) == 2
* The inverse MixColumns operation for rounds i such that (i % 4) == 2
****************************************************************************/
void inv_mixcolumns_2(u32* state) {
u32 tmp;
......@@ -129,7 +126,7 @@ void inv_mixcolumns_2(u32* state) {
}
/****************************************************************************
* The inverse MixColumns oepration for rounds i such that (i % 4) == 3
* The inverse MixColumns operation for rounds i such that (i % 4) == 3
****************************************************************************/
void inv_mixcolumns_3(u32* state) {
u32 tmp;
......@@ -166,16 +163,8 @@ void skinny128_384_plus_encrypt(u8* ctext, u8* ctext_bis, const u8* ptext,
const u8* ptext_bis, const tweakey tk) {
u32 state[8];
packing(state, ptext, ptext_bis);
QUADRUPLE_ROUND(state, tk.rtk1, tk.rtk2_3);
QUADRUPLE_ROUND(state, tk.rtk1+32, tk.rtk2_3+32);
QUADRUPLE_ROUND(state, tk.rtk1+64, tk.rtk2_3+64);
QUADRUPLE_ROUND(state, tk.rtk1+96, tk.rtk2_3+96);
QUADRUPLE_ROUND(state, tk.rtk1, tk.rtk2_3+128);
QUADRUPLE_ROUND(state, tk.rtk1+32, tk.rtk2_3+160);
QUADRUPLE_ROUND(state, tk.rtk1+64, tk.rtk2_3+192);
QUADRUPLE_ROUND(state, tk.rtk1+96, tk.rtk2_3+224);
QUADRUPLE_ROUND(state, tk.rtk1, tk.rtk2_3+256);
QUADRUPLE_ROUND(state, tk.rtk1+32, tk.rtk2_3+288);
for(int i = 0; i < 10; i++)
QUADRUPLE_ROUND(state, tk.rtk1 + (i%4)*32, tk.rtk2_3 + i*32);
unpacking(ctext, ctext_bis, state);
}
......@@ -188,15 +177,7 @@ void skinny128_384_plus_decrypt(u8* ptext, u8* ptext_bis, const u8* ctext,
const u8* ctext_bis, const tweakey tk) {
u32 state[8];
packing(state, ctext, ctext_bis);
INV_QUADRUPLE_ROUND(state, tk.rtk1+32, tk.rtk2_3+288);
INV_QUADRUPLE_ROUND(state, tk.rtk1, tk.rtk2_3+256);
INV_QUADRUPLE_ROUND(state, tk.rtk1+96, tk.rtk2_3+224);
INV_QUADRUPLE_ROUND(state, tk.rtk1+64, tk.rtk2_3+192);
INV_QUADRUPLE_ROUND(state, tk.rtk1+32, tk.rtk2_3+160);
INV_QUADRUPLE_ROUND(state, tk.rtk1, tk.rtk2_3+128);
INV_QUADRUPLE_ROUND(state, tk.rtk1+96, tk.rtk2_3+96);
INV_QUADRUPLE_ROUND(state, tk.rtk1+64, tk.rtk2_3+64);
INV_QUADRUPLE_ROUND(state, tk.rtk1+32, tk.rtk2_3+32);
INV_QUADRUPLE_ROUND(state, tk.rtk1, tk.rtk2_3);
for(int i = 9; i >= 0; i--)
INV_QUADRUPLE_ROUND(state, tk.rtk1 + (i%4)*32, tk.rtk2_3 + i*32);
unpacking(ptext, ptext_bis, state);
}
#ifndef SKINNY128_H_
#define SKINNY128_H_
#include "tk_schedule.h"
void skinny128_384_plus_encrypt(u8* ctext, u8* ctext_bis, const u8* ptext,
......
......@@ -3,9 +3,7 @@
#include "skinny128.h"
typedef unsigned char u8;
typedef unsigned int u32;
typedef unsigned long long u64;
typedef uint64_t u64;
#define TAGBYTES 16
#define KEYBYTES 16
......
......@@ -7,15 +7,11 @@
* @author Alexandre Adomnicai, Nanyang Technological University,
* alexandre.adomnicai@ntu.edu.sg
*
* @date May 2020
* @date June 2020
*******************************************************************************/
#include <stdio.h>
#include <string.h>
#include "tk_schedule.h"
typedef unsigned char u8;
typedef unsigned int u32;
/****************************************************************************
* The round constants according to the fixsliced representation.
****************************************************************************/
......
#ifndef TK_SCHEDULE_BS_H_
#define TK_SCHEDULE_BS_H_
typedef unsigned char u8;
typedef unsigned int u32;
#include <stdint.h>
typedef uint8_t u8;
typedef uint32_t u32;
typedef struct {
u32 rtk1[8*16];
......
/******************************************************************************
* Constant-time implementation of SKINNY-AEAD-M1 (v1.1).
*
* Two blocks are treated in parallel with SKINNY-128-384 whenever possible.
* Constant-time implementation of SKINNY-AEAD-M1(v1).
*
* For more details, see the paper at: https://
*
* @author Alexandre Adomnicai, Nanyang Technological University,
* alexandre.adomnicai@ntu.edu.sg
*
* @date May 2020
* @date June 2020
******************************************************************************/
#include "skinny128.h"
#include "skinnyaead.h"
#include <string.h>
#include <stdio.h>
/******************************************************************************
* x ^= y where x, y are 128-bit blocks (16 bytes array).
......@@ -75,12 +71,6 @@ int crypto_aead_encrypt (unsigned char *c, unsigned long long *clen,
}
LE_STR_64(tmp, lfsr); // lfsr for tag computation
precompute_rtk1(rtk1, tmp);
for(int i = 0; i < 16; i++) {
printf("%08x %08x %08x %08x\n",rtk1[i*4], rtk1[i*4+1],rtk1[i*4+2],rtk1[i*4+3]);
}
for(int i = 0; i < 56; i++) {
printf("%08x %08x %08x %08x\n",rtk2_3[i*4], rtk2_3[i*4+1],rtk2_3[i*4+2],rtk2_3[i*4+3]);
}
skinny128_384_encrypt(c, c, rtk1, rtk2_3); // compute the tag
// ----------------- Process the plaintext -----------------
......
......@@ -16,12 +16,9 @@
* @author Alexandre Adomnicai, Nanyang Technological University,
* alexandre.adomnicai@ntu.edu.sg
*
* @date May 2020
* @date June 2020
******************************************************************************/
#include <stdio.h>
#include <string.h>
#include "skinny128.h"
#include "tk_schedule.h"
/******************************************************************************
* The MixColumns computation for rounds i such that (i % 4) == 0
......@@ -153,20 +150,8 @@ void skinny128_384_encrypt(u8* ctext, const u8* ptext, const u32* rtk1,
u32 tmp; // used in SWAPMOVE macro
u32 state[4]; // 128-bit state
packing(state, ptext); // from byte to bitsliced representation
QUADRUPLE_ROUND(state, rtk1, rtk2_3);
QUADRUPLE_ROUND(state, rtk1+16, rtk2_3+16);
QUADRUPLE_ROUND(state, rtk1+32, rtk2_3+32);
QUADRUPLE_ROUND(state, rtk1+48, rtk2_3+48);
QUADRUPLE_ROUND(state, rtk1, rtk2_3+64);
QUADRUPLE_ROUND(state, rtk1+16, rtk2_3+80);
QUADRUPLE_ROUND(state, rtk1+32, rtk2_3+96);
QUADRUPLE_ROUND(state, rtk1+48, rtk2_3+112);
QUADRUPLE_ROUND(state, rtk1, rtk2_3+128);
QUADRUPLE_ROUND(state, rtk1+16, rtk2_3+144);
QUADRUPLE_ROUND(state, rtk1+32, rtk2_3+160);
QUADRUPLE_ROUND(state, rtk1+48, rtk2_3+176);
QUADRUPLE_ROUND(state, rtk1, rtk2_3+192);
QUADRUPLE_ROUND(state, rtk1+16, rtk2_3+208);
for(int i = 0; i < 14; i++)
QUADRUPLE_ROUND(state, rtk1 + (i%4)*16, rtk2_3 + i*16);
unpacking(ctext, state); // from bitsliced to byte representation
}
......@@ -180,19 +165,7 @@ void skinny128_384_decrypt(u8* ctext, const u8* ptext, const u32* rtk1,
u32 tmp; // used in SWAPMOVE macro
u32 state[4]; // 128-bit state
packing(state, ptext); // from byte to bitsliced representation
INV_QUADRUPLE_ROUND(state, rtk1+16, rtk2_3+208);
INV_QUADRUPLE_ROUND(state, rtk1, rtk2_3+192);
INV_QUADRUPLE_ROUND(state, rtk1+48, rtk2_3+176);
INV_QUADRUPLE_ROUND(state, rtk1+32, rtk2_3+160);
INV_QUADRUPLE_ROUND(state, rtk1+16, rtk2_3+144);
INV_QUADRUPLE_ROUND(state, rtk1, rtk2_3+128);
INV_QUADRUPLE_ROUND(state, rtk1+48, rtk2_3+112);
INV_QUADRUPLE_ROUND(state, rtk1+32, rtk2_3+96);
INV_QUADRUPLE_ROUND(state, rtk1+16, rtk2_3+80);
INV_QUADRUPLE_ROUND(state, rtk1, rtk2_3+64);
INV_QUADRUPLE_ROUND(state, rtk1+48, rtk2_3+48);
INV_QUADRUPLE_ROUND(state, rtk1+32, rtk2_3+32);
INV_QUADRUPLE_ROUND(state, rtk1+16, rtk2_3+16);
INV_QUADRUPLE_ROUND(state, rtk1, rtk2_3);
for(int i = 13; i >= 0; i--)
INV_QUADRUPLE_ROUND(state, rtk1 + (i%4)*16, rtk2_3 + i*16);
unpacking(ctext, state); // from bitsliced to byte representation
}
......@@ -3,9 +3,7 @@
#include "skinny128.h"
typedef unsigned char u8;
typedef unsigned int u32;
typedef unsigned long long u64;
typedef uint64_t u64;
#define TAGBYTES 16
#define KEYBYTES 16
......
......@@ -4,16 +4,11 @@
* @author Alexandre Adomnicai, Nanyang Technological University,
* alexandre.adomnicai@ntu.edu.sg
*
* @date May 2020
* @date June 2020
******************************************************************************/
#include <stdio.h>
#include <string.h> //for memcmp
#include "tk_schedule.h"
#include <string.h>
#include "skinny128.h"
typedef unsigned char u8;
typedef unsigned int u32;
/******************************************************************************
* The round constants according to the new representation.
******************************************************************************/
......@@ -271,6 +266,70 @@ void precompute_lfsr_tk3(u32* tk, const u8* key, const int rounds) {
}
/******************************************************************************
* Precompute LFSR2(TK2) ^ LFSR3(TK3) for all round tweakeys in a single pass.
*
* It is equivalent to the following 2 function calls:
* 	- precompute_lfsr_tk2(tk, t2, SKINNY128_384_ROUNDS);
* 	- precompute_lfsr_tk3(tk, t3, SKINNY128_384_ROUNDS);
* but 'precompute_lfsr_tk2_3' can save cycles on some platforms. On ARMv7 one
* should observe a gain of ~1k cycles per function call, which can be explained
* by the fact that fewer memory accesses to 'tk' are performed.
*
* Layout of the output:
* 	- tk[0..3] receives packing(t2) ^ packing(t3);
* 	- each loop iteration covers 8 rounds and performs 4 LFSR steps, storing
* 	  4 words at offsets +4, +12, +20 and +28 from tk[i*4];
* 	- the 4-word slots in between (+8, +16, +24, +32) are NOT written here.
* The last iteration writes up to tk[i*4+31], so when 'rounds' is a multiple
* of 8 the buffer must hold at least 4*rounds words.
*
* Each step updates a single word of each state in place; the stores then read
* the words in a rotated order, which is back to the natural order after the
* fourth step.
*
* To save some code size, the loop can be replaced by the following one:
* 	for(int i = 0 ; i < rounds; i+=2) {
* 		lfsr2_bs(tk2);
* 		lfsr3_bs(tk3);
* 		tk[i*4+4] = tk2[0] ^ tk3[0];
* 		tk[i*4+5] = tk2[1] ^ tk3[1];
* 		tk[i*4+6] = tk2[2] ^ tk3[2];
* 		tk[i*4+7] = tk2[3] ^ tk3[3];
* 	}
* at the cost of some cycles (~260 on ARM Cortex-M).
******************************************************************************/
// Mask selecting the odd-position bits (1, 3, 5, ...) of a 32-bit word.
#define TK23_ODD_BITS 0xaaaaaaaa
// Swap every odd-position bit of w with the even-position bit just below it.
#define TK23_SWAP_BITS(w) \
	((((w) & TK23_ODD_BITS) >> 1) | (((w) << 1) & TK23_ODD_BITS))
// Write the four words (a0^b0, a1^b1, a2^b2, a3^b3) to dst[0..3].
#define TK23_STORE(dst, a0, a1, a2, a3, b0, b1, b2, b3) do { \
	(dst)[0] = (a0) ^ (b0); \
	(dst)[1] = (a1) ^ (b1); \
	(dst)[2] = (a2) ^ (b2); \
	(dst)[3] = (a3) ^ (b3); \
} while (0)

void precompute_lfsr_tk2_3(u32* tk, const u8* t2, const u8* t3, const int rounds) {
	u32 s2[4], s3[4];	// packed TK2 / TK3 states

	packing(s2, t2);
	packing(s3, t3);

	// first round tweakey: the states before any LFSR step
	TK23_STORE(tk, s2[0], s2[1], s2[2], s2[3], s3[0], s3[1], s3[2], s3[3]);

	for (int r = 0; r < rounds; r += 8) {
		u32* out = tk + r*4;	// base of the current group of 8 rounds

		// step 1: update s2[0] and s3[3]
		s2[0] ^= s2[2] & TK23_ODD_BITS;
		s2[0]  = TK23_SWAP_BITS(s2[0]);
		s3[3] ^= (s3[1] & TK23_ODD_BITS) >> 1;
		s3[3]  = TK23_SWAP_BITS(s3[3]);
		TK23_STORE(out + 4,
			s2[1], s2[2], s2[3], s2[0],
			s3[3], s3[0], s3[1], s3[2]);

		// step 2: update s2[1] and s3[2]
		s2[1] ^= s2[3] & TK23_ODD_BITS;
		s2[1]  = TK23_SWAP_BITS(s2[1]);
		s3[2] ^= (s3[0] & TK23_ODD_BITS) >> 1;
		s3[2]  = TK23_SWAP_BITS(s3[2]);
		TK23_STORE(out + 12,
			s2[2], s2[3], s2[0], s2[1],
			s3[2], s3[3], s3[0], s3[1]);

		// step 3: update s2[2] and s3[1]
		s2[2] ^= s2[0] & TK23_ODD_BITS;
		s2[2]  = TK23_SWAP_BITS(s2[2]);
		s3[1] ^= (s3[3] & TK23_ODD_BITS) >> 1;
		s3[1]  = TK23_SWAP_BITS(s3[1]);
		TK23_STORE(out + 20,
			s2[3], s2[0], s2[1], s2[2],
			s3[1], s3[2], s3[3], s3[0]);

		// step 4: update s2[3] and s3[0]; words are back in natural order
		s2[3] ^= s2[1] & TK23_ODD_BITS;
		s2[3]  = TK23_SWAP_BITS(s2[3]);
		s3[0] ^= (s3[2] & TK23_ODD_BITS) >> 1;
		s3[0]  = TK23_SWAP_BITS(s3[0]);
		TK23_STORE(out + 28,
			s2[0], s2[1], s2[2], s2[3],
			s3[0], s3[1], s3[2], s3[3]);
	}
}

#undef TK23_STORE
#undef TK23_SWAP_BITS
#undef TK23_ODD_BITS
/******************************************************************************
* XOR TK with TK1 before applying the permutations.
* The key is then rearranged to match the barrel shiftrows representation.
******************************************************************************/
......@@ -278,19 +337,20 @@ void permute_tk(u32* tk, const u8* key, const int rounds) {
u32 test;
u32 tk1[4], tmp[4];
packing(tk1, key);
memcpy(tmp, tk, 16);
tmp[0] ^= tk1[0];
tmp[1] ^= tk1[1];
tmp[2] ^= tk1[2];
tmp[3] ^= tk1[3];
tmp[0] = tk[0] ^ tk1[0];
tmp[1] = tk[1] ^ tk1[1];
tmp[2] = tk[2] ^ tk1[2];
tmp[3] = tk[3] ^ tk1[3];
for(int i = 0 ; i < rounds; i += 8) {
test = (i % 16 < 8) ? 1 : 0; //to apply the right power of P
tk[i*4] = tmp[2] & 0xf0f0f0f0;
tk[i*4+1] = tmp[3] & 0xf0f0f0f0;
tk[i*4+2] = tmp[0] & 0xf0f0f0f0;
tk[i*4+3] = tmp[1] & 0xf0f0f0f0;
memcpy(tmp, tk+i*4+4, 16);
XOR_BLOCKS(tmp, tk1);
tmp[0] = tk[i*4+4] ^ tk1[0];
tmp[1] = tk[i*4+5] ^ tk1[1];
tmp[2] = tk[i*4+6] ^ tk1[2];
tmp[3] = tk[i*4+7] ^ tk1[3];
if (test)
permute_tk_2(tmp); // applies P^2
else
......@@ -307,8 +367,10 @@ void permute_tk(u32* tk, const u8* key, const int rounds) {
tk[i*4+10] |= ROR(tmp[0],12) & 0x0c0c0c0c;
tk[i*4+11] = ROR(tmp[1],28) & 0x03030303;
tk[i*4+11] |= ROR(tmp[1],12) & 0x0c0c0c0c;
memcpy(tmp, tk+i*4+12, 16);
XOR_BLOCKS(tmp, tk1);
tmp[0] = tk[i*4+12] ^ tk1[0];
tmp[1] = tk[i*4+13] ^ tk1[1];
tmp[2] = tk[i*4+14] ^ tk1[2];
tmp[3] = tk[i*4+15] ^ tk1[3];
if (test)
permute_tk_4(tmp); // applies P^4
else
......@@ -321,8 +383,10 @@ void permute_tk(u32* tk, const u8* key, const int rounds) {
tk[i*4+17] = ROR(tmp[3], 16) & 0xf0f0f0f0;
tk[i*4+18] = ROR(tmp[0], 16) & 0xf0f0f0f0;
tk[i*4+19] = ROR(tmp[1], 16) & 0xf0f0f0f0;
memcpy(tmp, tk+i*4+20, 16);
XOR_BLOCKS(tmp, tk1);
tmp[0] = tk[i*4+20] ^ tk1[0];
tmp[1] = tk[i*4+21] ^ tk1[1];
tmp[2] = tk[i*4+22] ^ tk1[2];
tmp[3] = tk[i*4+23] ^ tk1[3];
if (test)
permute_tk_6(tmp); // applies P^6
else
......@@ -339,8 +403,10 @@ void permute_tk(u32* tk, const u8* key, const int rounds) {
tk[i*4+26] |= ROR(tmp[0],28) & 0x0c0c0c0c;
tk[i*4+27] = ROR(tmp[1],12) & 0x03030303;
tk[i*4+27] |= ROR(tmp[1],28) & 0x0c0c0c0c;
memcpy(tmp, tk+i*4+28, 16);
XOR_BLOCKS(tmp, tk1);
tmp[0] = tk[i*4+28] ^ tk1[0];
tmp[1] = tk[i*4+29] ^ tk1[1];
tmp[2] = tk[i*4+30] ^ tk1[2];
tmp[3] = tk[i*4+31] ^ tk1[3];
if (test)
permute_tk_8(tmp); // applies P^8
for(int j = 0; j < 4; j++) {
......@@ -361,8 +427,7 @@ void permute_tk(u32* tk, const u8* key, const int rounds) {
******************************************************************************/
void precompute_rtk2_3(u32* rtk, const u8* tk2, const u8 * tk3) {
memset(rtk, 0x00, 16*SKINNY128_384_ROUNDS);
precompute_lfsr_tk2(rtk, tk2, SKINNY128_384_ROUNDS);
precompute_lfsr_tk3(rtk, tk3, SKINNY128_384_ROUNDS);
precompute_lfsr_tk2_3(rtk, tk2, tk3, SKINNY128_384_ROUNDS);
permute_tk(rtk, (u8*)(rtk+8), SKINNY128_384_ROUNDS); // rtk+8 is NULL
for(int i = 0; i < SKINNY128_384_ROUNDS; i++) { // add rconsts
for(int j = 0; j < 4; j++)
......
#ifndef TK_SCHEDULE_H_
#define TK_SCHEDULE_H_
typedef unsigned char u8;
typedef unsigned int u32;
#include <stdint.h>
typedef uint8_t u8;
typedef uint32_t u32;
void packing(u32* out, const u8* in);
void unpacking(u8* out, u32 *in);
......@@ -11,13 +13,6 @@ void precompute_rtk1(u32* rtk1, const u8* tk1);
#define ROR(x,y) (((x) >> (y)) | ((x) << (32 - (y))))
#define XOR_BLOCKS(x,y) ({ \
(x)[0] ^= (y)[0]; \
(x)[1] ^= (y)[1]; \
(x)[2] ^= (y)[2]; \
(x)[3] ^= (y)[3]; \
})
#define SWAPMOVE(a, b, mask, n) ({ \
tmp = (b ^ (a >> n)) & mask; \
b ^= tmp; \
......
......@@ -8,12 +8,10 @@
* @author Alexandre Adomnicai, Nanyang Technological University,
* alexandre.adomnicai@ntu.edu.sg
*
* @date May 2020
* @date June 2020
******************************************************************************/
#include "skinny128.h"
#include "skinnyaead.h"
#include <string.h>
#include <stdio.h>
/******************************************************************************
* x ^= y where x, y are 128-bit blocks (16 bytes array).
......@@ -33,13 +31,13 @@ static void skinny_aead_m1_auth(u8* auth, u8* c, u8* tag, tweakey* tk,
u8 feedback;
u8 tmp[2*BLOCKBYTES];
memset(tmp, 0x00, 2*BLOCKBYTES);
memset(auth, 0x00, BLOCKBYTES);
SET_DOMAIN(tmp, 0x02);
SET_DOMAIN(tmp + BLOCKBYTES, 0x02);
memset(auth, 0x00, BLOCKBYTES);
while (adlen >= 2*BLOCKBYTES) {
LE_STR_64(tmp, lfsr);
UPDATE_LFSR(lfsr);
LE_STR_64(tmp + BLOCKBYTES, lfsr);
SET_DOMAIN(tmp + BLOCKBYTES, 0x02);
precompute_rtk1(tk->rtk1, tmp, tmp+BLOCKBYTES);
skinny128_384_encrypt(tmp, tmp+BLOCKBYTES, ad, ad+BLOCKBYTES, *tk);
xor_block(auth, tmp);
......@@ -47,6 +45,9 @@ static void skinny_aead_m1_auth(u8* auth, u8* c, u8* tag, tweakey* tk,
adlen -= 2*BLOCKBYTES;
ad += 2*BLOCKBYTES;
UPDATE_LFSR(lfsr);
memset(tmp, 0x00, 2*BLOCKBYTES); // to save 32 bytes of RAM
SET_DOMAIN(tmp, 0x02);
SET_DOMAIN(tmp + BLOCKBYTES, 0x02);
}
if (adlen > BLOCKBYTES) { // pad and process 2 blocs in //
LE_STR_64(tmp, lfsr);
......@@ -65,11 +66,12 @@ static void skinny_aead_m1_auth(u8* auth, u8* c, u8* tag, tweakey* tk,
LE_STR_64(tmp, lfsr);
if (mlen == 0) { // if tag has *NOT* been calculated yet
precompute_rtk1(tk->rtk1, tmp, tag); // compute the tag
skinny128_384_encrypt(auth, c, ad, c, *tk);
skinny128_384_encrypt(tmp, c, ad, c, *tk);
} else { // if tag has been calculated yet
precompute_rtk1(tk->rtk1, tmp, tmp); // process last ad block
skinny128_384_encrypt(auth, auth, ad, ad, *tk);
skinny128_384_encrypt(tmp, tmp, ad, ad, *tk);
}
xor_block(auth, tmp);
} else if (adlen > 0) {
LE_STR_64(tmp, lfsr);
SET_DOMAIN(tmp, 0x03); // domain for padding ad
......@@ -78,11 +80,12 @@ static void skinny_aead_m1_auth(u8* auth, u8* c, u8* tag, tweakey* tk,
tmp[BLOCKBYTES + adlen] ^= 0x80; // padding
if (mlen == 0) { // if tag has *NOT* been calculated yet
precompute_rtk1(tk->rtk1, tmp, tag); // compute the tag
skinny128_384_encrypt(auth, c, tmp + BLOCKBYTES, c, *tk);
skinny128_384_encrypt(tmp, c, tmp + BLOCKBYTES, c, *tk);
} else { // if tag has been calculated yet
precompute_rtk1(tk->rtk1, tmp, tmp); // process last ad block
skinny128_384_encrypt(auth, auth, tmp + BLOCKBYTES, tmp + BLOCKBYTES, *tk);
skinny128_384_encrypt(tmp, tmp, tmp + BLOCKBYTES, tmp + BLOCKBYTES, *tk);
}
xor_block(auth, tmp);
}
}
......
......@@ -16,12 +16,9 @@
* @author Alexandre Adomnicai, Nanyang Technological University,
* alexandre.adomnicai@ntu.edu.sg
*
* @date May 2020
* @date June 2020
******************************************************************************/
#include <stdio.h>
#include <string.h>
#include "skinny128.h"
#include "tk_schedule.h"
/****************************************************************************
* The MixColumns operation for rounds i such that (i % 4) == 0.
......@@ -84,7 +81,7 @@ void mixcolumns_3(u32* state) {
}
/****************************************************************************
* The inverse MixColumns oepration for rounds i such that (i % 4) == 0
* The inverse MixColumns operation for rounds i such that (i % 4) == 0
****************************************************************************/
void inv_mixcolumns_0(u32* state) {
u32 tmp;
......@@ -99,7 +96,7 @@ void inv_mixcolumns_0(u32* state) {
}
/****************************************************************************
* The inverse MixColumns oepration for rounds i such that (i % 4) == 1
* The inverse MixColumns operation for rounds i such that (i % 4) == 1
****************************************************************************/
void inv_mixcolumns_1(u32* state) {
u32 tmp;
......@@ -114,7 +111,7 @@ void inv_mixcolumns_1(u32* state) {
}
/****************************************************************************
* The inverse MixColumns oepration for rounds i such that (i % 4) == 2
* The inverse MixColumns operation for rounds i such that (i % 4) == 2
****************************************************************************/
void inv_mixcolumns_2(u32* state) {
u32 tmp;
......@@ -129,7 +126,7 @@ void inv_mixcolumns_2(u32* state) {
}
/****************************************************************************
* The inverse MixColumns oepration for rounds i such that (i % 4) == 3
* The inverse MixColumns operation for rounds i such that (i % 4) == 3
****************************************************************************/
void inv_mixcolumns_3(u32* state) {
u32 tmp;
......@@ -166,20 +163,8 @@ void skinny128_384_encrypt(u8* ctext, u8* ctext_bis, const u8* ptext,
const u8* ptext_bis, const tweakey tk) {
u32 state[8];
packing(state, ptext, ptext_bis);
QUADRUPLE_ROUND(state, tk.rtk1, tk.rtk2_3);
QUADRUPLE_ROUND(state, tk.rtk1+32, tk.rtk2_3+32);
QUADRUPLE_ROUND(state, tk.rtk1+64, tk.rtk2_3+64);
QUADRUPLE_ROUND(state, tk.rtk1+96, tk.rtk2_3+96);
QUADRUPLE_ROUND(state, tk.rtk1, tk.rtk2_3+128);
QUADRUPLE_ROUND(state, tk.rtk1+32, tk.rtk2_3+160);
QUADRUPLE_ROUND(state, tk.rtk1+64, tk.rtk2_3+192);
QUADRUPLE_ROUND(state, tk.rtk1+96, tk.rtk2_3+224);
QUADRUPLE_ROUND(state, tk.rtk1, tk.rtk2_3+256);
QUADRUPLE_ROUND(state, tk.rtk1+32, tk.rtk2_3+288);
QUADRUPLE_ROUND(state, tk.rtk1+64, tk.rtk2_3+320);
QUADRUPLE_ROUND(state, tk.rtk1+96, tk.rtk2_3+352);
QUADRUPLE_ROUND(state, tk.rtk1, tk.rtk2_3+384);
QUADRUPLE_ROUND(state, tk.rtk1+32, tk.rtk2_3+416);
for(int i = 0; i < 14; i++)
QUADRUPLE_ROUND(state, tk.rtk1 + (i%4)*32, tk.rtk2_3 + i*32);
unpacking(ctext, ctext_bis, state);
}
......@@ -192,19 +177,7 @@ void skinny128_384_decrypt(u8* ptext, u8* ptext_bis, const u8* ctext,
const u8* ctext_bis, const tweakey tk) {
u32 state[8];
packing(state, ctext, ctext_bis);
INV_QUADRUPLE_ROUND(state, tk.rtk1+32, tk.rtk2_3+416);
INV_QUADRUPLE_ROUND(state, tk.rtk1, tk.rtk2_3+384);
INV_QUADRUPLE_ROUND(state, tk.rtk1+96, tk.rtk2_3+352);
INV_QUADRUPLE_ROUND(state, tk.rtk1+64, tk.rtk2_3+320);
INV_QUADRUPLE_ROUND(state, tk.rtk1+32, tk.rtk2_3+288);
INV_QUADRUPLE_ROUND(state, tk.rtk1, tk.rtk2_3+256);
INV_QUADRUPLE_ROUND(state, tk.rtk1+96, tk.rtk2_3+224);
INV_QUADRUPLE_ROUND(state, tk.rtk1+64, tk.rtk2_3+192);
INV_QUADRUPLE_ROUND(state, tk.rtk1+32, tk.rtk2_3+160);
INV_QUADRUPLE_ROUND(state, tk.rtk1, tk.rtk2_3+128);
INV_QUADRUPLE_ROUND(state, tk.rtk1+96, tk.rtk2_3+96);
INV_QUADRUPLE_ROUND(state, tk.rtk1+64, tk.rtk2_3+64);
INV_QUADRUPLE_ROUND(state, tk.rtk1+32, tk.rtk2_3+32);
INV_QUADRUPLE_ROUND(state, tk.rtk1, tk.rtk2_3);
for(int i = 13; i >= 0; i--)
INV_QUADRUPLE_ROUND(state, tk.rtk1 + (i%4)*32, tk.rtk2_3 + i*32);
unpacking(ptext, ptext_bis, state);
}
\ No newline at end of file
......@@ -3,9 +3,7 @@
#include "skinny128.h"
typedef unsigned char u8;
typedef unsigned int u32;
typedef unsigned long long u64;
typedef uint64_t u64;
#define TAGBYTES 16
#define KEYBYTES 16
......
......@@ -9,13 +9,9 @@
*
* @date May 2020
*******************************************************************************/
#include <stdio.h>
#include <string.h>
#include "tk_schedule.h"
typedef unsigned char u8;
typedef unsigned int u32;
/****************************************************************************
* The round constants according to the fixsliced representation.
****************************************************************************/
......
#ifndef TK_SCHEDULE_BS_H_
#define TK_SCHEDULE_BS_H_
typedef unsigned char u8;
typedef unsigned int u32;
#include <stdint.h>
typedef uint8_t u8;
typedef uint32_t u32;
typedef struct {
u32 rtk1[8*16];
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment