Commit a6544c4f by Alexandre Adomnicai Committed by Enrico Pozzobon

romulus opt32

parent b9419420
......@@ -93,15 +93,7 @@ void skinny128_384_plus(u8* ctext, const u8* ptext, const u32* rtk1,
u32 tmp; // used in SWAPMOVE macro
u32 state[4]; // 128-bit state
packing(state, ptext); // from byte to bitsliced representation
QUADRUPLE_ROUND(state, rtk1, rtk2_3);
QUADRUPLE_ROUND(state, rtk1+16, rtk2_3+16);
QUADRUPLE_ROUND(state, rtk1+32, rtk2_3+32);
QUADRUPLE_ROUND(state, rtk1+48, rtk2_3+48);
QUADRUPLE_ROUND(state, rtk1, rtk2_3+64);
QUADRUPLE_ROUND(state, rtk1+16, rtk2_3+80);
QUADRUPLE_ROUND(state, rtk1+32, rtk2_3+96);
QUADRUPLE_ROUND(state, rtk1+48, rtk2_3+112);
QUADRUPLE_ROUND(state, rtk1, rtk2_3+128);
QUADRUPLE_ROUND(state, rtk1+16, rtk2_3+144);
for(int i = 0; i < 10; i++)
QUADRUPLE_ROUND(state, rtk1 + (i%4)*16, rtk2_3 + i*16);
unpacking(ctext, state); // from bitsliced to byte representation
}
\ No newline at end of file
......@@ -260,6 +260,70 @@ void precompute_lfsr_tk3(u32* tk, const u8* key, const int rounds) {
}
/******************************************************************************
* Precompute LFSR2(TK2) ^ LFSR3(TK3) for all round tweakeys in a single pass.
*
* Parameters:
* - tk     : output buffer of 32-bit words. Words 0..3 receive the XOR of the
*            two packed inputs. Each loop iteration then writes the four
*            4-word groups located at offsets 4, 12, 20 and 28 from tk + 4*i.
*            The words in between are not written by this function.
* - t2, t3 : byte inputs handed to 'packing' to build the TK2 and TK3 words.
* - rounds : loop bound. The loop advances 'i' by 8 and writes up to
*            tk[4*i + 31], so 'tk' must be large enough for that index.
*
* It is equivalent to the following 2 function calls:
* - precompute_lfsr_tk2(tk, t2, SKINNY128_384_ROUNDS);
* - precompute_lfsr_tk3(tk, t3, SKINNY128_384_ROUNDS);
* However, 'precompute_lfsr_tk2_3' can save cycles on some platforms.
* On ARMv7 one should observe a gain of ~1k cycles per function call, which
* can be explained by the smaller number of memory accesses to 'tk'.
*
* To save some code size, the loop can be replaced by the following one:
* for(int i = 0 ; i < rounds; i+=2) {
* lfsr2_bs(tk2);
* lfsr3_bs(tk3);
* tk[i*4+4] = tk2[0] ^ tk3[0];
* tk[i*4+5] = tk2[1] ^ tk3[1];
* tk[i*4+6] = tk2[2] ^ tk3[2];
* tk[i*4+7] = tk2[3] ^ tk3[3];
* }
* at the cost of some cycles (~260 on ARM Cortex-M).
******************************************************************************/
void precompute_lfsr_tk2_3(u32* tk, const u8* t2, const u8* t3, const int rounds) {
	const u32 odd = 0xaaaaaaaa;	// selects bit positions 1, 3, 5, ..., 31
	u32 tmp, tk2[4], tk3[4];
	packing(tk2, t2);
	packing(tk3, t3);
	// First group: plain XOR of the two packed inputs.
	tk[0] = tk2[0] ^ tk3[0];
	tk[1] = tk2[1] ^ tk3[1];
	tk[2] = tk2[2] ^ tk3[2];
	tk[3] = tk2[3] ^ tk3[3];
	for(int i = 0; i < rounds; i += 8) {
		u32* out = tk + i*4;	// base of the 32 words handled by this pass
		// Each step updates one word of tk2 and one word of tk3: a masked
		// XOR followed by a swap of every adjacent bit pair. The rotation
		// of the word order is expressed through the indices used in the
		// stores instead of moving the words around.

		// Step 1: update tk2[0] and tk3[3], store at offsets 4..7.
		tmp = tk2[0] ^ (tk2[2] & odd);
		tk2[0] = ((tmp & odd) >> 1) | ((tmp << 1) & odd);
		tmp = tk3[3] ^ ((tk3[1] & odd) >> 1);
		tk3[3] = ((tmp & odd) >> 1) | ((tmp << 1) & odd);
		out[4] = tk2[1] ^ tk3[3];
		out[5] = tk2[2] ^ tk3[0];
		out[6] = tk2[3] ^ tk3[1];
		out[7] = tk2[0] ^ tk3[2];

		// Step 2: update tk2[1] and tk3[2], store at offsets 12..15.
		tmp = tk2[1] ^ (tk2[3] & odd);
		tk2[1] = ((tmp & odd) >> 1) | ((tmp << 1) & odd);
		tmp = tk3[2] ^ ((tk3[0] & odd) >> 1);
		tk3[2] = ((tmp & odd) >> 1) | ((tmp << 1) & odd);
		out[12] = tk2[2] ^ tk3[2];
		out[13] = tk2[3] ^ tk3[3];
		out[14] = tk2[0] ^ tk3[0];
		out[15] = tk2[1] ^ tk3[1];

		// Step 3: update tk2[2] and tk3[1], store at offsets 20..23.
		tmp = tk2[2] ^ (tk2[0] & odd);
		tk2[2] = ((tmp & odd) >> 1) | ((tmp << 1) & odd);
		tmp = tk3[1] ^ ((tk3[3] & odd) >> 1);
		tk3[1] = ((tmp & odd) >> 1) | ((tmp << 1) & odd);
		out[20] = tk2[3] ^ tk3[1];
		out[21] = tk2[0] ^ tk3[2];
		out[22] = tk2[1] ^ tk3[3];
		out[23] = tk2[2] ^ tk3[0];

		// Step 4: update tk2[3] and tk3[0], store at offsets 28..31.
		// After this step the word order is back to its starting position.
		tmp = tk2[3] ^ (tk2[1] & odd);
		tk2[3] = ((tmp & odd) >> 1) | ((tmp << 1) & odd);
		tmp = tk3[0] ^ ((tk3[2] & odd) >> 1);
		tk3[0] = ((tmp & odd) >> 1) | ((tmp << 1) & odd);
		out[28] = tk2[0] ^ tk3[0];
		out[29] = tk2[1] ^ tk3[1];
		out[30] = tk2[2] ^ tk3[2];
		out[31] = tk2[3] ^ tk3[3];
	}
}
/******************************************************************************
* XOR TK with TK1 before applying the permutations.
* The key is then rearranged to match the barrel shiftrows representation.
******************************************************************************/
......@@ -267,19 +331,20 @@ void permute_tk(u32* tk, const u8* key, const int rounds) {
u32 test;
u32 tk1[4], tmp[4];
packing(tk1, key);
memcpy(tmp, tk, 16);
tmp[0] ^= tk1[0];
tmp[1] ^= tk1[1];
tmp[2] ^= tk1[2];
tmp[3] ^= tk1[3];
tmp[0] = tk[0] ^ tk1[0];
tmp[1] = tk[1] ^ tk1[1];
tmp[2] = tk[2] ^ tk1[2];
tmp[3] = tk[3] ^ tk1[3];
for(int i = 0 ; i < rounds; i += 8) {
test = (i % 16 < 8) ? 1 : 0; //to apply the right power of P
tk[i*4] = tmp[2] & 0xf0f0f0f0;
tk[i*4+1] = tmp[3] & 0xf0f0f0f0;
tk[i*4+2] = tmp[0] & 0xf0f0f0f0;
tk[i*4+3] = tmp[1] & 0xf0f0f0f0;
memcpy(tmp, tk+i*4+4, 16);
XOR_BLOCKS(tmp, tk1);
tmp[0] = tk[i*4+4] ^ tk1[0];
tmp[1] = tk[i*4+5] ^ tk1[1];
tmp[2] = tk[i*4+6] ^ tk1[2];
tmp[3] = tk[i*4+7] ^ tk1[3];
if (test)
permute_tk_2(tmp); // applies P^2
else
......@@ -296,8 +361,10 @@ void permute_tk(u32* tk, const u8* key, const int rounds) {
tk[i*4+10] |= ROR(tmp[0],12) & 0x0c0c0c0c;
tk[i*4+11] = ROR(tmp[1],28) & 0x03030303;
tk[i*4+11] |= ROR(tmp[1],12) & 0x0c0c0c0c;
memcpy(tmp, tk+i*4+12, 16);
XOR_BLOCKS(tmp, tk1);
tmp[0] = tk[i*4+12] ^ tk1[0];
tmp[1] = tk[i*4+13] ^ tk1[1];
tmp[2] = tk[i*4+14] ^ tk1[2];
tmp[3] = tk[i*4+15] ^ tk1[3];
if (test)
permute_tk_4(tmp); // applies P^4
else
......@@ -310,8 +377,10 @@ void permute_tk(u32* tk, const u8* key, const int rounds) {
tk[i*4+17] = ROR(tmp[3], 16) & 0xf0f0f0f0;
tk[i*4+18] = ROR(tmp[0], 16) & 0xf0f0f0f0;
tk[i*4+19] = ROR(tmp[1], 16) & 0xf0f0f0f0;
memcpy(tmp, tk+i*4+20, 16);
XOR_BLOCKS(tmp, tk1);
tmp[0] = tk[i*4+20] ^ tk1[0];
tmp[1] = tk[i*4+21] ^ tk1[1];
tmp[2] = tk[i*4+22] ^ tk1[2];
tmp[3] = tk[i*4+23] ^ tk1[3];
if (test)
permute_tk_6(tmp); // applies P^6
else
......@@ -328,8 +397,10 @@ void permute_tk(u32* tk, const u8* key, const int rounds) {
tk[i*4+26] |= ROR(tmp[0],28) & 0x0c0c0c0c;
tk[i*4+27] = ROR(tmp[1],12) & 0x03030303;
tk[i*4+27] |= ROR(tmp[1],28) & 0x0c0c0c0c;
memcpy(tmp, tk+i*4+28, 16);
XOR_BLOCKS(tmp, tk1);
tmp[0] = tk[i*4+28] ^ tk1[0];
tmp[1] = tk[i*4+29] ^ tk1[1];
tmp[2] = tk[i*4+30] ^ tk1[2];
tmp[3] = tk[i*4+31] ^ tk1[3];
if (test)
permute_tk_8(tmp); // applies P^8
for(int j = 0; j < 4; j++) {
......@@ -350,8 +421,7 @@ void permute_tk(u32* tk, const u8* key, const int rounds) {
******************************************************************************/
void precompute_rtk2_3(u32* rtk, const u8* tk2, const u8 * tk3) {
memset(rtk, 0x00, 16*SKINNY128_384_ROUNDS);
precompute_lfsr_tk2(rtk, tk2, SKINNY128_384_ROUNDS);
precompute_lfsr_tk3(rtk, tk3, SKINNY128_384_ROUNDS);
precompute_lfsr_tk2_3(rtk, tk2, tk3, SKINNY128_384_ROUNDS);
permute_tk(rtk, (u8*)(rtk+8), SKINNY128_384_ROUNDS); // rtk+8 is NULL
for(int i = 0; i < SKINNY128_384_ROUNDS; i++) { // add rconsts
for(int j = 0; j < 4; j++)
......
......@@ -92,19 +92,7 @@ void skinny128_384(u8* ctext, const u8* ptext, const u32* rtk1, const u32* rtk2
u32 tmp; // used in SWAPMOVE macro
u32 state[4]; // 128-bit state
packing(state, ptext); // from byte to bitsliced representation
QUADRUPLE_ROUND(state, rtk1, rtk2_3);
QUADRUPLE_ROUND(state, rtk1+16, rtk2_3+16);
QUADRUPLE_ROUND(state, rtk1+32, rtk2_3+32);
QUADRUPLE_ROUND(state, rtk1+48, rtk2_3+48);
QUADRUPLE_ROUND(state, rtk1, rtk2_3+64);
QUADRUPLE_ROUND(state, rtk1+16, rtk2_3+80);
QUADRUPLE_ROUND(state, rtk1+32, rtk2_3+96);
QUADRUPLE_ROUND(state, rtk1+48, rtk2_3+112);
QUADRUPLE_ROUND(state, rtk1, rtk2_3+128);
QUADRUPLE_ROUND(state, rtk1+16, rtk2_3+144);
QUADRUPLE_ROUND(state, rtk1+32, rtk2_3+160);
QUADRUPLE_ROUND(state, rtk1+48, rtk2_3+176);
QUADRUPLE_ROUND(state, rtk1, rtk2_3+192);
QUADRUPLE_ROUND(state, rtk1+16, rtk2_3+208);
for(int i = 0; i < 14; i++)
QUADRUPLE_ROUND(state, rtk1 + (i%4)*16, rtk2_3 + i*16);
unpacking(ctext, state); // from bitsliced to byte representation
}
\ No newline at end of file
......@@ -271,6 +271,70 @@ void precompute_lfsr_tk3(u32* tk, const u8* key, const int rounds) {
}
/******************************************************************************
* Precompute LFSR2(TK2) ^ LFSR3(TK3) for all round tweakeys in a single pass.
*
* Parameters:
* - tk     : output buffer of 32-bit words. Words 0..3 receive the XOR of the
*            two packed inputs. Each loop iteration then writes the four
*            4-word groups located at offsets 4, 12, 20 and 28 from tk + 4*i.
*            The words in between are not written by this function.
* - t2, t3 : byte inputs handed to 'packing' to build the TK2 and TK3 words.
* - rounds : loop bound. The loop advances 'i' by 8 and writes up to
*            tk[4*i + 31], so 'tk' must be large enough for that index.
*
* It is equivalent to the following 2 function calls:
* - precompute_lfsr_tk2(tk, t2, SKINNY128_384_ROUNDS);
* - precompute_lfsr_tk3(tk, t3, SKINNY128_384_ROUNDS);
* However, 'precompute_lfsr_tk2_3' can save cycles on some platforms.
* On ARMv7 one should observe a gain of ~1k cycles per function call, which
* can be explained by the smaller number of memory accesses to 'tk'.
*
* To save some code size, the loop can be replaced by the following one:
* for(int i = 0 ; i < rounds; i+=2) {
* lfsr2_bs(tk2);
* lfsr3_bs(tk3);
* tk[i*4+4] = tk2[0] ^ tk3[0];
* tk[i*4+5] = tk2[1] ^ tk3[1];
* tk[i*4+6] = tk2[2] ^ tk3[2];
* tk[i*4+7] = tk2[3] ^ tk3[3];
* }
* at the cost of some cycles (~260 on ARM Cortex-M).
******************************************************************************/
void precompute_lfsr_tk2_3(u32* tk, const u8* t2, const u8* t3, const int rounds) {
	const u32 odd = 0xaaaaaaaa;	// selects bit positions 1, 3, 5, ..., 31
	u32 tmp, tk2[4], tk3[4];
	packing(tk2, t2);
	packing(tk3, t3);
	// First group: plain XOR of the two packed inputs.
	tk[0] = tk2[0] ^ tk3[0];
	tk[1] = tk2[1] ^ tk3[1];
	tk[2] = tk2[2] ^ tk3[2];
	tk[3] = tk2[3] ^ tk3[3];
	for(int i = 0; i < rounds; i += 8) {
		u32* out = tk + i*4;	// base of the 32 words handled by this pass
		// Each step updates one word of tk2 and one word of tk3: a masked
		// XOR followed by a swap of every adjacent bit pair. The rotation
		// of the word order is expressed through the indices used in the
		// stores instead of moving the words around.

		// Step 1: update tk2[0] and tk3[3], store at offsets 4..7.
		tmp = tk2[0] ^ (tk2[2] & odd);
		tk2[0] = ((tmp & odd) >> 1) | ((tmp << 1) & odd);
		tmp = tk3[3] ^ ((tk3[1] & odd) >> 1);
		tk3[3] = ((tmp & odd) >> 1) | ((tmp << 1) & odd);
		out[4] = tk2[1] ^ tk3[3];
		out[5] = tk2[2] ^ tk3[0];
		out[6] = tk2[3] ^ tk3[1];
		out[7] = tk2[0] ^ tk3[2];

		// Step 2: update tk2[1] and tk3[2], store at offsets 12..15.
		tmp = tk2[1] ^ (tk2[3] & odd);
		tk2[1] = ((tmp & odd) >> 1) | ((tmp << 1) & odd);
		tmp = tk3[2] ^ ((tk3[0] & odd) >> 1);
		tk3[2] = ((tmp & odd) >> 1) | ((tmp << 1) & odd);
		out[12] = tk2[2] ^ tk3[2];
		out[13] = tk2[3] ^ tk3[3];
		out[14] = tk2[0] ^ tk3[0];
		out[15] = tk2[1] ^ tk3[1];

		// Step 3: update tk2[2] and tk3[1], store at offsets 20..23.
		tmp = tk2[2] ^ (tk2[0] & odd);
		tk2[2] = ((tmp & odd) >> 1) | ((tmp << 1) & odd);
		tmp = tk3[1] ^ ((tk3[3] & odd) >> 1);
		tk3[1] = ((tmp & odd) >> 1) | ((tmp << 1) & odd);
		out[20] = tk2[3] ^ tk3[1];
		out[21] = tk2[0] ^ tk3[2];
		out[22] = tk2[1] ^ tk3[3];
		out[23] = tk2[2] ^ tk3[0];

		// Step 4: update tk2[3] and tk3[0], store at offsets 28..31.
		// After this step the word order is back to its starting position.
		tmp = tk2[3] ^ (tk2[1] & odd);
		tk2[3] = ((tmp & odd) >> 1) | ((tmp << 1) & odd);
		tmp = tk3[0] ^ ((tk3[2] & odd) >> 1);
		tk3[0] = ((tmp & odd) >> 1) | ((tmp << 1) & odd);
		out[28] = tk2[0] ^ tk3[0];
		out[29] = tk2[1] ^ tk3[1];
		out[30] = tk2[2] ^ tk3[2];
		out[31] = tk2[3] ^ tk3[3];
	}
}
/******************************************************************************
* XOR TK with TK1 before applying the permutations.
* The key is then rearranged to match the barrel shiftrows representation.
******************************************************************************/
......@@ -278,19 +342,20 @@ void permute_tk(u32* tk, const u8* key, const int rounds) {
u32 test;
u32 tk1[4], tmp[4];
packing(tk1, key);
memcpy(tmp, tk, 16);
tmp[0] ^= tk1[0];
tmp[1] ^= tk1[1];
tmp[2] ^= tk1[2];
tmp[3] ^= tk1[3];
tmp[0] = tk[0] ^ tk1[0];
tmp[1] = tk[1] ^ tk1[1];
tmp[2] = tk[2] ^ tk1[2];
tmp[3] = tk[3] ^ tk1[3];
for(int i = 0 ; i < rounds; i += 8) {
test = (i % 16 < 8) ? 1 : 0; //to apply the right power of P
tk[i*4] = tmp[2] & 0xf0f0f0f0;
tk[i*4+1] = tmp[3] & 0xf0f0f0f0;
tk[i*4+2] = tmp[0] & 0xf0f0f0f0;
tk[i*4+3] = tmp[1] & 0xf0f0f0f0;
memcpy(tmp, tk+i*4+4, 16);
XOR_BLOCKS(tmp, tk1);
tmp[0] = tk[i*4+4] ^ tk1[0];
tmp[1] = tk[i*4+5] ^ tk1[1];
tmp[2] = tk[i*4+6] ^ tk1[2];
tmp[3] = tk[i*4+7] ^ tk1[3];
if (test)
permute_tk_2(tmp); // applies P^2
else
......@@ -307,8 +372,10 @@ void permute_tk(u32* tk, const u8* key, const int rounds) {
tk[i*4+10] |= ROR(tmp[0],12) & 0x0c0c0c0c;
tk[i*4+11] = ROR(tmp[1],28) & 0x03030303;
tk[i*4+11] |= ROR(tmp[1],12) & 0x0c0c0c0c;
memcpy(tmp, tk+i*4+12, 16);
XOR_BLOCKS(tmp, tk1);
tmp[0] = tk[i*4+12] ^ tk1[0];
tmp[1] = tk[i*4+13] ^ tk1[1];
tmp[2] = tk[i*4+14] ^ tk1[2];
tmp[3] = tk[i*4+15] ^ tk1[3];
if (test)
permute_tk_4(tmp); // applies P^4
else
......@@ -321,8 +388,10 @@ void permute_tk(u32* tk, const u8* key, const int rounds) {
tk[i*4+17] = ROR(tmp[3], 16) & 0xf0f0f0f0;
tk[i*4+18] = ROR(tmp[0], 16) & 0xf0f0f0f0;
tk[i*4+19] = ROR(tmp[1], 16) & 0xf0f0f0f0;
memcpy(tmp, tk+i*4+20, 16);
XOR_BLOCKS(tmp, tk1);
tmp[0] = tk[i*4+20] ^ tk1[0];
tmp[1] = tk[i*4+21] ^ tk1[1];
tmp[2] = tk[i*4+22] ^ tk1[2];
tmp[3] = tk[i*4+23] ^ tk1[3];
if (test)
permute_tk_6(tmp); // applies P^6
else
......@@ -339,8 +408,10 @@ void permute_tk(u32* tk, const u8* key, const int rounds) {
tk[i*4+26] |= ROR(tmp[0],28) & 0x0c0c0c0c;
tk[i*4+27] = ROR(tmp[1],12) & 0x03030303;
tk[i*4+27] |= ROR(tmp[1],28) & 0x0c0c0c0c;
memcpy(tmp, tk+i*4+28, 16);
XOR_BLOCKS(tmp, tk1);
tmp[0] = tk[i*4+28] ^ tk1[0];
tmp[1] = tk[i*4+29] ^ tk1[1];
tmp[2] = tk[i*4+30] ^ tk1[2];
tmp[3] = tk[i*4+31] ^ tk1[3];
if (test)
permute_tk_8(tmp); // applies P^8
for(int j = 0; j < 4; j++) {
......@@ -361,8 +432,7 @@ void permute_tk(u32* tk, const u8* key, const int rounds) {
******************************************************************************/
void precompute_rtk2_3(u32* rtk, const u8* tk2, const u8 * tk3) {
memset(rtk, 0x00, 16*SKINNY128_384_ROUNDS);
precompute_lfsr_tk2(rtk, tk2, SKINNY128_384_ROUNDS);
precompute_lfsr_tk3(rtk, tk3, SKINNY128_384_ROUNDS);
precompute_lfsr_tk2_3(rtk, tk2, tk3, SKINNY128_384_ROUNDS);
permute_tk(rtk, (u8*)(rtk+8), SKINNY128_384_ROUNDS); // rtk+8 is NULL
for(int i = 0; i < SKINNY128_384_ROUNDS; i++) { // add rconsts
for(int j = 0; j < 4; j++)
......
......@@ -93,15 +93,7 @@ void skinny128_384_plus(u8* ctext, const u8* ptext, const u32* rtk1,
u32 tmp; // used in SWAPMOVE macro
u32 state[4]; // 128-bit state
packing(state, ptext); // from byte to bitsliced representation
QUADRUPLE_ROUND(state, rtk1, rtk2_3);
QUADRUPLE_ROUND(state, rtk1+16, rtk2_3+16);
QUADRUPLE_ROUND(state, rtk1+32, rtk2_3+32);
QUADRUPLE_ROUND(state, rtk1+48, rtk2_3+48);
QUADRUPLE_ROUND(state, rtk1, rtk2_3+64);
QUADRUPLE_ROUND(state, rtk1+16, rtk2_3+80);
QUADRUPLE_ROUND(state, rtk1+32, rtk2_3+96);
QUADRUPLE_ROUND(state, rtk1+48, rtk2_3+112);
QUADRUPLE_ROUND(state, rtk1, rtk2_3+128);
QUADRUPLE_ROUND(state, rtk1+16, rtk2_3+144);
for(int i = 0; i < 10; i++)
QUADRUPLE_ROUND(state, rtk1 + (i%4)*16, rtk2_3 + i*16);
unpacking(ctext, state); // from bitsliced to byte representation
}
\ No newline at end of file
......@@ -260,6 +260,70 @@ void precompute_lfsr_tk3(u32* tk, const u8* key, const int rounds) {
}
/******************************************************************************
* Precompute LFSR2(TK2) ^ LFSR3(TK3) for all round tweakeys in a single pass.
*
* Parameters:
* - tk     : output buffer of 32-bit words. Words 0..3 receive the XOR of the
*            two packed inputs. Each loop iteration then writes the four
*            4-word groups located at offsets 4, 12, 20 and 28 from tk + 4*i.
*            The words in between are not written by this function.
* - t2, t3 : byte inputs handed to 'packing' to build the TK2 and TK3 words.
* - rounds : loop bound. The loop advances 'i' by 8 and writes up to
*            tk[4*i + 31], so 'tk' must be large enough for that index.
*
* It is equivalent to the following 2 function calls:
* - precompute_lfsr_tk2(tk, t2, SKINNY128_384_ROUNDS);
* - precompute_lfsr_tk3(tk, t3, SKINNY128_384_ROUNDS);
* However, 'precompute_lfsr_tk2_3' can save cycles on some platforms.
* On ARMv7 one should observe a gain of ~1k cycles per function call, which
* can be explained by the smaller number of memory accesses to 'tk'.
*
* To save some code size, the loop can be replaced by the following one:
* for(int i = 0 ; i < rounds; i+=2) {
* lfsr2_bs(tk2);
* lfsr3_bs(tk3);
* tk[i*4+4] = tk2[0] ^ tk3[0];
* tk[i*4+5] = tk2[1] ^ tk3[1];
* tk[i*4+6] = tk2[2] ^ tk3[2];
* tk[i*4+7] = tk2[3] ^ tk3[3];
* }
* at the cost of some cycles (~260 on ARM Cortex-M).
******************************************************************************/
void precompute_lfsr_tk2_3(u32* tk, const u8* t2, const u8* t3, const int rounds) {
	const u32 odd = 0xaaaaaaaa;	// selects bit positions 1, 3, 5, ..., 31
	u32 tmp, tk2[4], tk3[4];
	packing(tk2, t2);
	packing(tk3, t3);
	// First group: plain XOR of the two packed inputs.
	tk[0] = tk2[0] ^ tk3[0];
	tk[1] = tk2[1] ^ tk3[1];
	tk[2] = tk2[2] ^ tk3[2];
	tk[3] = tk2[3] ^ tk3[3];
	for(int i = 0; i < rounds; i += 8) {
		u32* out = tk + i*4;	// base of the 32 words handled by this pass
		// Each step updates one word of tk2 and one word of tk3: a masked
		// XOR followed by a swap of every adjacent bit pair. The rotation
		// of the word order is expressed through the indices used in the
		// stores instead of moving the words around.

		// Step 1: update tk2[0] and tk3[3], store at offsets 4..7.
		tmp = tk2[0] ^ (tk2[2] & odd);
		tk2[0] = ((tmp & odd) >> 1) | ((tmp << 1) & odd);
		tmp = tk3[3] ^ ((tk3[1] & odd) >> 1);
		tk3[3] = ((tmp & odd) >> 1) | ((tmp << 1) & odd);
		out[4] = tk2[1] ^ tk3[3];
		out[5] = tk2[2] ^ tk3[0];
		out[6] = tk2[3] ^ tk3[1];
		out[7] = tk2[0] ^ tk3[2];

		// Step 2: update tk2[1] and tk3[2], store at offsets 12..15.
		tmp = tk2[1] ^ (tk2[3] & odd);
		tk2[1] = ((tmp & odd) >> 1) | ((tmp << 1) & odd);
		tmp = tk3[2] ^ ((tk3[0] & odd) >> 1);
		tk3[2] = ((tmp & odd) >> 1) | ((tmp << 1) & odd);
		out[12] = tk2[2] ^ tk3[2];
		out[13] = tk2[3] ^ tk3[3];
		out[14] = tk2[0] ^ tk3[0];
		out[15] = tk2[1] ^ tk3[1];

		// Step 3: update tk2[2] and tk3[1], store at offsets 20..23.
		tmp = tk2[2] ^ (tk2[0] & odd);
		tk2[2] = ((tmp & odd) >> 1) | ((tmp << 1) & odd);
		tmp = tk3[1] ^ ((tk3[3] & odd) >> 1);
		tk3[1] = ((tmp & odd) >> 1) | ((tmp << 1) & odd);
		out[20] = tk2[3] ^ tk3[1];
		out[21] = tk2[0] ^ tk3[2];
		out[22] = tk2[1] ^ tk3[3];
		out[23] = tk2[2] ^ tk3[0];

		// Step 4: update tk2[3] and tk3[0], store at offsets 28..31.
		// After this step the word order is back to its starting position.
		tmp = tk2[3] ^ (tk2[1] & odd);
		tk2[3] = ((tmp & odd) >> 1) | ((tmp << 1) & odd);
		tmp = tk3[0] ^ ((tk3[2] & odd) >> 1);
		tk3[0] = ((tmp & odd) >> 1) | ((tmp << 1) & odd);
		out[28] = tk2[0] ^ tk3[0];
		out[29] = tk2[1] ^ tk3[1];
		out[30] = tk2[2] ^ tk3[2];
		out[31] = tk2[3] ^ tk3[3];
	}
}
/******************************************************************************
* XOR TK with TK1 before applying the permutations.
* The key is then rearranged to match the barrel shiftrows representation.
******************************************************************************/
......@@ -267,19 +331,20 @@ void permute_tk(u32* tk, const u8* key, const int rounds) {
u32 test;
u32 tk1[4], tmp[4];
packing(tk1, key);
memcpy(tmp, tk, 16);
tmp[0] ^= tk1[0];
tmp[1] ^= tk1[1];
tmp[2] ^= tk1[2];
tmp[3] ^= tk1[3];
tmp[0] = tk[0] ^ tk1[0];
tmp[1] = tk[1] ^ tk1[1];
tmp[2] = tk[2] ^ tk1[2];
tmp[3] = tk[3] ^ tk1[3];
for(int i = 0 ; i < rounds; i += 8) {
test = (i % 16 < 8) ? 1 : 0; //to apply the right power of P
tk[i*4] = tmp[2] & 0xf0f0f0f0;
tk[i*4+1] = tmp[3] & 0xf0f0f0f0;
tk[i*4+2] = tmp[0] & 0xf0f0f0f0;
tk[i*4+3] = tmp[1] & 0xf0f0f0f0;
memcpy(tmp, tk+i*4+4, 16);
XOR_BLOCKS(tmp, tk1);
tmp[0] = tk[i*4+4] ^ tk1[0];
tmp[1] = tk[i*4+5] ^ tk1[1];
tmp[2] = tk[i*4+6] ^ tk1[2];
tmp[3] = tk[i*4+7] ^ tk1[3];
if (test)
permute_tk_2(tmp); // applies P^2
else
......@@ -296,8 +361,10 @@ void permute_tk(u32* tk, const u8* key, const int rounds) {
tk[i*4+10] |= ROR(tmp[0],12) & 0x0c0c0c0c;
tk[i*4+11] = ROR(tmp[1],28) & 0x03030303;
tk[i*4+11] |= ROR(tmp[1],12) & 0x0c0c0c0c;
memcpy(tmp, tk+i*4+12, 16);
XOR_BLOCKS(tmp, tk1);
tmp[0] = tk[i*4+12] ^ tk1[0];
tmp[1] = tk[i*4+13] ^ tk1[1];
tmp[2] = tk[i*4+14] ^ tk1[2];
tmp[3] = tk[i*4+15] ^ tk1[3];
if (test)
permute_tk_4(tmp); // applies P^4
else
......@@ -310,8 +377,10 @@ void permute_tk(u32* tk, const u8* key, const int rounds) {
tk[i*4+17] = ROR(tmp[3], 16) & 0xf0f0f0f0;
tk[i*4+18] = ROR(tmp[0], 16) & 0xf0f0f0f0;
tk[i*4+19] = ROR(tmp[1], 16) & 0xf0f0f0f0;
memcpy(tmp, tk+i*4+20, 16);
XOR_BLOCKS(tmp, tk1);
tmp[0] = tk[i*4+20] ^ tk1[0];
tmp[1] = tk[i*4+21] ^ tk1[1];
tmp[2] = tk[i*4+22] ^ tk1[2];
tmp[3] = tk[i*4+23] ^ tk1[3];
if (test)
permute_tk_6(tmp); // applies P^6
else
......@@ -328,8 +397,10 @@ void permute_tk(u32* tk, const u8* key, const int rounds) {
tk[i*4+26] |= ROR(tmp[0],28) & 0x0c0c0c0c;
tk[i*4+27] = ROR(tmp[1],12) & 0x03030303;
tk[i*4+27] |= ROR(tmp[1],28) & 0x0c0c0c0c;
memcpy(tmp, tk+i*4+28, 16);
XOR_BLOCKS(tmp, tk1);
tmp[0] = tk[i*4+28] ^ tk1[0];
tmp[1] = tk[i*4+29] ^ tk1[1];
tmp[2] = tk[i*4+30] ^ tk1[2];
tmp[3] = tk[i*4+31] ^ tk1[3];
if (test)
permute_tk_8(tmp); // applies P^8
for(int j = 0; j < 4; j++) {
......@@ -350,8 +421,7 @@ void permute_tk(u32* tk, const u8* key, const int rounds) {
******************************************************************************/
void precompute_rtk2_3(u32* rtk, const u8* tk2, const u8 * tk3) {
memset(rtk, 0x00, 16*SKINNY128_384_ROUNDS);
precompute_lfsr_tk2(rtk, tk2, SKINNY128_384_ROUNDS);
precompute_lfsr_tk3(rtk, tk3, SKINNY128_384_ROUNDS);
precompute_lfsr_tk2_3(rtk, tk2, tk3, SKINNY128_384_ROUNDS);
permute_tk(rtk, (u8*)(rtk+8), SKINNY128_384_ROUNDS); // rtk+8 is NULL
for(int i = 0; i < SKINNY128_384_ROUNDS; i++) { // add rconsts
for(int j = 0; j < 4; j++)
......
......@@ -92,19 +92,7 @@ void skinny128_384(u8* ctext, const u8* ptext, const u32* rtk1, const u32* rtk2
u32 tmp; // used in SWAPMOVE macro
u32 state[4]; // 128-bit state
packing(state, ptext); // from byte to bitsliced representation
QUADRUPLE_ROUND(state, rtk1, rtk2_3);
QUADRUPLE_ROUND(state, rtk1+16, rtk2_3+16);
QUADRUPLE_ROUND(state, rtk1+32, rtk2_3+32);
QUADRUPLE_ROUND(state, rtk1+48, rtk2_3+48);
QUADRUPLE_ROUND(state, rtk1, rtk2_3+64);
QUADRUPLE_ROUND(state, rtk1+16, rtk2_3+80);
QUADRUPLE_ROUND(state, rtk1+32, rtk2_3+96);
QUADRUPLE_ROUND(state, rtk1+48, rtk2_3+112);
QUADRUPLE_ROUND(state, rtk1, rtk2_3+128);
QUADRUPLE_ROUND(state, rtk1+16, rtk2_3+144);
QUADRUPLE_ROUND(state, rtk1+32, rtk2_3+160);
QUADRUPLE_ROUND(state, rtk1+48, rtk2_3+176);
QUADRUPLE_ROUND(state, rtk1, rtk2_3+192);
QUADRUPLE_ROUND(state, rtk1+16, rtk2_3+208);
for(int i = 0; i < 14; i++)
QUADRUPLE_ROUND(state, rtk1 + (i%4)*16, rtk2_3 + i*16);
unpacking(ctext, state); // from bitsliced to byte representation
}
\ No newline at end of file
......@@ -271,6 +271,70 @@ void precompute_lfsr_tk3(u32* tk, const u8* key, const int rounds) {
}
/******************************************************************************
* Precompute LFSR2(TK2) ^ LFSR3(TK3) for all round tweakeys in a single pass.
*
* Parameters:
* - tk     : output buffer of 32-bit words. Words 0..3 receive the XOR of the
*            two packed inputs. Each loop iteration then writes the four
*            4-word groups located at offsets 4, 12, 20 and 28 from tk + 4*i.
*            The words in between are not written by this function.
* - t2, t3 : byte inputs handed to 'packing' to build the TK2 and TK3 words.
* - rounds : loop bound. The loop advances 'i' by 8 and writes up to
*            tk[4*i + 31], so 'tk' must be large enough for that index.
*
* It is equivalent to the following 2 function calls:
* - precompute_lfsr_tk2(tk, t2, SKINNY128_384_ROUNDS);
* - precompute_lfsr_tk3(tk, t3, SKINNY128_384_ROUNDS);
* However, 'precompute_lfsr_tk2_3' can save cycles on some platforms.
* On ARMv7 one should observe a gain of ~1k cycles per function call, which
* can be explained by the smaller number of memory accesses to 'tk'.
*
* To save some code size, the loop can be replaced by the following one:
* for(int i = 0 ; i < rounds; i+=2) {
* lfsr2_bs(tk2);
* lfsr3_bs(tk3);
* tk[i*4+4] = tk2[0] ^ tk3[0];
* tk[i*4+5] = tk2[1] ^ tk3[1];
* tk[i*4+6] = tk2[2] ^ tk3[2];
* tk[i*4+7] = tk2[3] ^ tk3[3];
* }
* at the cost of some cycles (~260 on ARM Cortex-M).
******************************************************************************/
void precompute_lfsr_tk2_3(u32* tk, const u8* t2, const u8* t3, const int rounds) {
	const u32 odd = 0xaaaaaaaa;	// selects bit positions 1, 3, 5, ..., 31
	u32 tmp, tk2[4], tk3[4];
	packing(tk2, t2);
	packing(tk3, t3);
	// First group: plain XOR of the two packed inputs.
	tk[0] = tk2[0] ^ tk3[0];
	tk[1] = tk2[1] ^ tk3[1];
	tk[2] = tk2[2] ^ tk3[2];
	tk[3] = tk2[3] ^ tk3[3];
	for(int i = 0; i < rounds; i += 8) {
		u32* out = tk + i*4;	// base of the 32 words handled by this pass
		// Each step updates one word of tk2 and one word of tk3: a masked
		// XOR followed by a swap of every adjacent bit pair. The rotation
		// of the word order is expressed through the indices used in the
		// stores instead of moving the words around.

		// Step 1: update tk2[0] and tk3[3], store at offsets 4..7.
		tmp = tk2[0] ^ (tk2[2] & odd);
		tk2[0] = ((tmp & odd) >> 1) | ((tmp << 1) & odd);
		tmp = tk3[3] ^ ((tk3[1] & odd) >> 1);
		tk3[3] = ((tmp & odd) >> 1) | ((tmp << 1) & odd);
		out[4] = tk2[1] ^ tk3[3];
		out[5] = tk2[2] ^ tk3[0];
		out[6] = tk2[3] ^ tk3[1];
		out[7] = tk2[0] ^ tk3[2];

		// Step 2: update tk2[1] and tk3[2], store at offsets 12..15.
		tmp = tk2[1] ^ (tk2[3] & odd);
		tk2[1] = ((tmp & odd) >> 1) | ((tmp << 1) & odd);
		tmp = tk3[2] ^ ((tk3[0] & odd) >> 1);
		tk3[2] = ((tmp & odd) >> 1) | ((tmp << 1) & odd);
		out[12] = tk2[2] ^ tk3[2];
		out[13] = tk2[3] ^ tk3[3];
		out[14] = tk2[0] ^ tk3[0];
		out[15] = tk2[1] ^ tk3[1];

		// Step 3: update tk2[2] and tk3[1], store at offsets 20..23.
		tmp = tk2[2] ^ (tk2[0] & odd);
		tk2[2] = ((tmp & odd) >> 1) | ((tmp << 1) & odd);
		tmp = tk3[1] ^ ((tk3[3] & odd) >> 1);
		tk3[1] = ((tmp & odd) >> 1) | ((tmp << 1) & odd);
		out[20] = tk2[3] ^ tk3[1];
		out[21] = tk2[0] ^ tk3[2];
		out[22] = tk2[1] ^ tk3[3];
		out[23] = tk2[2] ^ tk3[0];

		// Step 4: update tk2[3] and tk3[0], store at offsets 28..31.
		// After this step the word order is back to its starting position.
		tmp = tk2[3] ^ (tk2[1] & odd);
		tk2[3] = ((tmp & odd) >> 1) | ((tmp << 1) & odd);
		tmp = tk3[0] ^ ((tk3[2] & odd) >> 1);
		tk3[0] = ((tmp & odd) >> 1) | ((tmp << 1) & odd);
		out[28] = tk2[0] ^ tk3[0];
		out[29] = tk2[1] ^ tk3[1];
		out[30] = tk2[2] ^ tk3[2];
		out[31] = tk2[3] ^ tk3[3];
	}
}
/******************************************************************************
* XOR TK with TK1 before applying the permutations.
* The key is then rearranged to match the barrel shiftrows representation.
******************************************************************************/
......@@ -278,19 +342,20 @@ void permute_tk(u32* tk, const u8* key, const int rounds) {
u32 test;
u32 tk1[4], tmp[4];
packing(tk1, key);
memcpy(tmp, tk, 16);
tmp[0] ^= tk1[0];
tmp[1] ^= tk1[1];
tmp[2] ^= tk1[2];
tmp[3] ^= tk1[3];
tmp[0] = tk[0] ^ tk1[0];
tmp[1] = tk[1] ^ tk1[1];
tmp[2] = tk[2] ^ tk1[2];
tmp[3] = tk[3] ^ tk1[3];
for(int i = 0 ; i < rounds; i += 8) {
test = (i % 16 < 8) ? 1 : 0; //to apply the right power of P
tk[i*4] = tmp[2] & 0xf0f0f0f0;
tk[i*4+1] = tmp[3] & 0xf0f0f0f0;
tk[i*4+2] = tmp[0] & 0xf0f0f0f0;
tk[i*4+3] = tmp[1] & 0xf0f0f0f0;
memcpy(tmp, tk+i*4+4, 16);
XOR_BLOCKS(tmp, tk1);
tmp[0] = tk[i*4+4] ^ tk1[0];
tmp[1] = tk[i*4+5] ^ tk1[1];
tmp[2] = tk[i*4+6] ^ tk1[2];
tmp[3] = tk[i*4+7] ^ tk1[3];
if (test)
permute_tk_2(tmp); // applies P^2
else
......@@ -307,8 +372,10 @@ void permute_tk(u32* tk, const u8* key, const int rounds) {
tk[i*4+10] |= ROR(tmp[0],12) & 0x0c0c0c0c;
tk[i*4+11] = ROR(tmp[1],28) & 0x03030303;
tk[i*4+11] |= ROR(tmp[1],12) & 0x0c0c0c0c;
memcpy(tmp, tk+i*4+12, 16);
XOR_BLOCKS(tmp, tk1);
tmp[0] = tk[i*4+12] ^ tk1[0];
tmp[1] = tk[i*4+13] ^ tk1[1];
tmp[2] = tk[i*4+14] ^ tk1[2];
tmp[3] = tk[i*4+15] ^ tk1[3];
if (test)
permute_tk_4(tmp); // applies P^4
else
......@@ -321,8 +388,10 @@ void permute_tk(u32* tk, const u8* key, const int rounds) {
tk[i*4+17] = ROR(tmp[3], 16) & 0xf0f0f0f0;
tk[i*4+18] = ROR(tmp[0], 16) & 0xf0f0f0f0;
tk[i*4+19] = ROR(tmp[1], 16) & 0xf0f0f0f0;
memcpy(tmp, tk+i*4+20, 16);
XOR_BLOCKS(tmp, tk1);
tmp[0] = tk[i*4+20] ^ tk1[0];
tmp[1] = tk[i*4+21] ^ tk1[1];
tmp[2] = tk[i*4+22] ^ tk1[2];
tmp[3] = tk[i*4+23] ^ tk1[3];
if (test)
permute_tk_6(tmp); // applies P^6
else
......@@ -339,8 +408,10 @@ void permute_tk(u32* tk, const u8* key, const int rounds) {
tk[i*4+26] |= ROR(tmp[0],28) & 0x0c0c0c0c;
tk[i*4+27] = ROR(tmp[1],12) & 0x03030303;
tk[i*4+27] |= ROR(tmp[1],28) & 0x0c0c0c0c;
memcpy(tmp, tk+i*4+28, 16);
XOR_BLOCKS(tmp, tk1);
tmp[0] = tk[i*4+28] ^ tk1[0];
tmp[1] = tk[i*4+29] ^ tk1[1];
tmp[2] = tk[i*4+30] ^ tk1[2];
tmp[3] = tk[i*4+31] ^ tk1[3];
if (test)
permute_tk_8(tmp); // applies P^8
for(int j = 0; j < 4; j++) {
......@@ -361,8 +432,7 @@ void permute_tk(u32* tk, const u8* key, const int rounds) {
******************************************************************************/
void precompute_rtk2_3(u32* rtk, const u8* tk2, const u8 * tk3) {
memset(rtk, 0x00, 16*SKINNY128_384_ROUNDS);
precompute_lfsr_tk2(rtk, tk2, SKINNY128_384_ROUNDS);
precompute_lfsr_tk3(rtk, tk3, SKINNY128_384_ROUNDS);
precompute_lfsr_tk2_3(rtk, tk2, tk3, SKINNY128_384_ROUNDS);
permute_tk(rtk, (u8*)(rtk+8), SKINNY128_384_ROUNDS); // rtk+8 is NULL
for(int i = 0; i < SKINNY128_384_ROUNDS; i++) { // add rconsts
for(int j = 0; j < 4; j++)
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment