romulus opt32

a6544c4f · Alexandre Adomnicai · Enrico Pozzobon · b9419420 · a6544c4f · a6544c4f
Commit a6544c4f authored Jun 02, 2020 by Alexandre Adomnicai Committed by Enrico Pozzobon Jun 03, 2020
8 changed files
--- a/romulus/Implementations/crypto_aead/romulusm1+/opt32/skinny128.c
+++ b/romulus/Implementations/crypto_aead/romulusm1+/opt32/skinny128.c
@@ -93,15 +93,7 @@ void skinny128_384_plus(u8* ctext, const u8* ptext, const u32* rtk1,
 	u32 tmp; 					// used in SWAPMOVE macro
 	u32 state[4]; 				// 128-bit state
 	packing(state, ptext); 		// from byte to bitsliced representation
-	QUADRUPLE_ROUND(state, rtk1, 	rtk2_3);
-	QUADRUPLE_ROUND(state, rtk1+16, rtk2_3+16);
-	QUADRUPLE_ROUND(state, rtk1+32, rtk2_3+32);
-	QUADRUPLE_ROUND(state, rtk1+48, rtk2_3+48);
-	QUADRUPLE_ROUND(state, rtk1, 	rtk2_3+64);
-	QUADRUPLE_ROUND(state, rtk1+16, rtk2_3+80);
-	QUADRUPLE_ROUND(state, rtk1+32, rtk2_3+96);
-	QUADRUPLE_ROUND(state, rtk1+48, rtk2_3+112);
-	QUADRUPLE_ROUND(state, rtk1, 	rtk2_3+128);
-	QUADRUPLE_ROUND(state, rtk1+16, rtk2_3+144);
+	for(int i = 0; i < 10; i++)
+		QUADRUPLE_ROUND(state, rtk1 + (i%4)*16, rtk2_3 + i*16);
 	unpacking(ctext, state);	// from bitsliced to byte representation
 }
\ No newline at end of file
--- a/romulus/Implementations/crypto_aead/romulusm1+/opt32/tk_schedule.c
+++ b/romulus/Implementations/crypto_aead/romulusm1+/opt32/tk_schedule.c
@@ -260,6 +260,70 @@ void precompute_lfsr_tk3(u32* tk, const u8* key, const int rounds) {
 }

 /******************************************************************************
+* Precompute LFSR2(TK2) ^ LFSR3(TK3) for all round tweakeys.
+* It is equivalent to the following 2 function calls:
+* - precompute_lfsr_tk2(tk, t2, SKINNY128_384_ROUNDS);
+* - precompute_lfsr_tk3(tk, t3, SKINNY128_384_ROUNDS);
+* However, 'precompute_lfsr_tk2_3' can save cycles on some platforms.
+* On ARMv7 one should observe a gain of ~1k cycles per function call. This is
+* explained by the fact that fewer memory accesses to 'tk' are performed.
+* 
+* To save some code size, the loop can be replaced by the following one:
+*	for(int i = 0 ; i < rounds; i+=2) {
+*		lfsr2_bs(tk2);
+*		lfsr3_bs(tk3);
+*		tk[i*4+4] = tk2[0] ^ tk3[0];
+*		tk[i*4+5] = tk2[1] ^ tk3[1];
+*		tk[i*4+6] = tk2[2] ^ tk3[2];
+*		tk[i*4+7] = tk2[3] ^ tk3[3];
+*	}
+* at the cost of some cycles (~260 on ARM Cortex-M).
+******************************************************************************/
+void precompute_lfsr_tk2_3(u32* tk, const u8* t2, const u8* t3, const int rounds) { // writes tk2 ^ tk3 into every other 4-word slot of tk
+	u32 tmp, tk2[4], tk3[4]; // NOTE(review): 'tmp' is never named below; presumably scratch for a macro -- confirm
+	packing(tk2, t2); // pack t2 into the four words of tk2
+	packing(tk3, t3); // pack t3 into the four words of tk3
+	tk[0] = tk2[0] ^ tk3[0]; // slot 0: tk2 ^ tk3 before any update
+	tk[1] = tk2[1] ^ tk3[1];
+	tk[2] = tk2[2] ^ tk3[2];
+	tk[3] = tk2[3] ^ tk3[3];
+	for(int i = 0 ; i < rounds; i+=8) { // each pass fills the slots at +4, +12, +20, +28; +8, +16, +24 are not written here
+		tk2[0] ^= (tk2[2] & 0xaaaaaaaa); // update #1 on tk2: only word 0 is recomputed
+		tk2[0] = ((tk2[0] & 0xaaaaaaaa) >> 1) | ((tk2[0] << 1) & 0xaaaaaaaa); // swap each odd/even bit pair
+		tk3[3] ^= ((tk3[1] & 0xaaaaaaaa) >> 1); // update #1 on tk3: only word 3 is recomputed
+		tk3[3] = ((tk3[3] & 0xaaaaaaaa) >> 1) | ((tk3[3] << 1) & 0xaaaaaaaa); // swap each odd/even bit pair
+		tk[i*4+4] = tk2[1] ^ tk3[3]; // word order is rotated through indexing (tk2 one way, tk3 the other), no data is moved
+		tk[i*4+5] = tk2[2] ^ tk3[0];
+		tk[i*4+6] = tk2[3] ^ tk3[1];
+		tk[i*4+7] = tk2[0] ^ tk3[2];
+		tk2[1] ^= (tk2[3] & 0xaaaaaaaa); // update #2 on tk2: word 1
+		tk2[1] = ((tk2[1] & 0xaaaaaaaa) >> 1) | ((tk2[1] << 1) & 0xaaaaaaaa);
+		tk3[2] ^= ((tk3[0] & 0xaaaaaaaa) >> 1); // update #2 on tk3: word 2
+		tk3[2] = ((tk3[2] & 0xaaaaaaaa) >> 1) | ((tk3[2] << 1) & 0xaaaaaaaa);
+		tk[i*4+12] = tk2[2] ^ tk3[2]; // both word orders are now rotated by two positions
+		tk[i*4+13] = tk2[3] ^ tk3[3];
+		tk[i*4+14] = tk2[0] ^ tk3[0];
+		tk[i*4+15] = tk2[1] ^ tk3[1];
+		tk2[2] ^= (tk2[0] & 0xaaaaaaaa); // update #3 on tk2: word 2
+		tk2[2] = ((tk2[2] & 0xaaaaaaaa) >> 1) | ((tk2[2] << 1) & 0xaaaaaaaa);
+		tk3[1] ^= ((tk3[3] & 0xaaaaaaaa) >> 1); // update #3 on tk3: word 1
+		tk3[1] = ((tk3[1] & 0xaaaaaaaa) >> 1) | ((tk3[1] << 1) & 0xaaaaaaaa);
+		tk[i*4+20] = tk2[3] ^ tk3[1]; // word orders rotated by three positions
+		tk[i*4+21] = tk2[0] ^ tk3[2];
+		tk[i*4+22] = tk2[1] ^ tk3[3];
+		tk[i*4+23] = tk2[2] ^ tk3[0];
+		tk2[3] ^= (tk2[1] & 0xaaaaaaaa); // update #4 on tk2: word 3
+		tk2[3] = ((tk2[3] & 0xaaaaaaaa) >> 1) | ((tk2[3] << 1) & 0xaaaaaaaa);
+		tk3[0] ^= ((tk3[2] & 0xaaaaaaaa) >> 1); // update #4 on tk3: word 0
+		tk3[0] = ((tk3[0] & 0xaaaaaaaa) >> 1) | ((tk3[0] << 1) & 0xaaaaaaaa);
+		tk[i*4+28] = tk2[0] ^ tk3[0]; // after four updates the word order is back to natural
+		tk[i*4+29] = tk2[1] ^ tk3[1];
+		tk[i*4+30] = tk2[2] ^ tk3[2];
+		tk[i*4+31] = tk2[3] ^ tk3[3];
+	}
+}
+
+/******************************************************************************
 * XOR TK with TK1 before applying the permutations.
 * The key is then rearranged to match the barrel shiftrows representation.
 ******************************************************************************/
@@ -267,19 +331,20 @@ void permute_tk(u32* tk, const u8* key, const int rounds) {
 	u32 test;
 	u32 tk1[4], tmp[4];
 	packing(tk1, key);
-	memcpy(tmp, tk, 16);
-	tmp[0] ^= tk1[0];
-	tmp[1] ^= tk1[1];
-	tmp[2] ^= tk1[2];
-	tmp[3] ^= tk1[3];
+	tmp[0] = tk[0] ^ tk1[0];
+	tmp[1] = tk[1] ^ tk1[1];
+	tmp[2] = tk[2] ^ tk1[2];
+	tmp[3] = tk[3] ^ tk1[3];
 	for(int i = 0 ; i < rounds; i += 8) {
 		test = (i % 16 < 8) ? 1 : 0; 			//to apply the right power of P
 		tk[i*4] = tmp[2] & 0xf0f0f0f0;
 		tk[i*4+1] = tmp[3] & 0xf0f0f0f0;
 		tk[i*4+2] = tmp[0] & 0xf0f0f0f0;
 		tk[i*4+3] = tmp[1] & 0xf0f0f0f0;
-		memcpy(tmp, tk+i*4+4, 16);
-		XOR_BLOCKS(tmp, tk1);
+		tmp[0] = tk[i*4+4] ^ tk1[0];
+		tmp[1] = tk[i*4+5] ^ tk1[1];
+		tmp[2] = tk[i*4+6] ^ tk1[2];
+		tmp[3] = tk[i*4+7] ^ tk1[3];
 		if (test)
 			permute_tk_2(tmp); 					// applies P^2
 		else
@@ -296,8 +361,10 @@ void permute_tk(u32* tk, const u8* key, const int rounds) {
 		tk[i*4+10] |= ROR(tmp[0],12) & 0x0c0c0c0c;
 		tk[i*4+11] = ROR(tmp[1],28) & 0x03030303;
 		tk[i*4+11] |= ROR(tmp[1],12) & 0x0c0c0c0c;
-		memcpy(tmp, tk+i*4+12, 16);
-		XOR_BLOCKS(tmp, tk1);
+		tmp[0] = tk[i*4+12] ^ tk1[0];
+		tmp[1] = tk[i*4+13] ^ tk1[1];
+		tmp[2] = tk[i*4+14] ^ tk1[2];
+		tmp[3] = tk[i*4+15] ^ tk1[3];
 		if (test)
 			permute_tk_4(tmp); 					// applies P^4
 		else
@@ -310,8 +377,10 @@ void permute_tk(u32* tk, const u8* key, const int rounds) {
 		tk[i*4+17] = ROR(tmp[3], 16) & 0xf0f0f0f0;
 		tk[i*4+18] = ROR(tmp[0], 16) & 0xf0f0f0f0;
 		tk[i*4+19] = ROR(tmp[1], 16) & 0xf0f0f0f0;
-		memcpy(tmp, tk+i*4+20, 16);
-		XOR_BLOCKS(tmp, tk1);
+		tmp[0] = tk[i*4+20] ^ tk1[0];
+		tmp[1] = tk[i*4+21] ^ tk1[1];
+		tmp[2] = tk[i*4+22] ^ tk1[2];
+		tmp[3] = tk[i*4+23] ^ tk1[3];
 		if (test)
 			permute_tk_6(tmp); 					//	applies P^6
 		else
@@ -328,8 +397,10 @@ void permute_tk(u32* tk, const u8* key, const int rounds) {
 		tk[i*4+26] |= ROR(tmp[0],28) & 0x0c0c0c0c;
 		tk[i*4+27] = ROR(tmp[1],12) & 0x03030303;
 		tk[i*4+27] |= ROR(tmp[1],28) & 0x0c0c0c0c;
-		memcpy(tmp, tk+i*4+28, 16);
-		XOR_BLOCKS(tmp, tk1);
+		tmp[0] = tk[i*4+28] ^ tk1[0];
+		tmp[1] = tk[i*4+29] ^ tk1[1];
+		tmp[2] = tk[i*4+30] ^ tk1[2];
+		tmp[3] = tk[i*4+31] ^ tk1[3];
 		if (test)
 			permute_tk_8(tmp); 					// applies P^8
 		for(int j = 0; j < 4; j++) {
@@ -350,8 +421,7 @@ void permute_tk(u32* tk, const u8* key, const int rounds) {
 ******************************************************************************/
 void precompute_rtk2_3(u32* rtk, const u8* tk2, const u8 * tk3) {
 	memset(rtk, 0x00, 16*SKINNY128_384_ROUNDS);
-	precompute_lfsr_tk2(rtk, tk2, SKINNY128_384_ROUNDS);
-	precompute_lfsr_tk3(rtk, tk3, SKINNY128_384_ROUNDS);
+	precompute_lfsr_tk2_3(rtk, tk2, tk3, SKINNY128_384_ROUNDS);
 	permute_tk(rtk, (u8*)(rtk+8), SKINNY128_384_ROUNDS);	// rtk+8 is NULL
 	for(int i = 0; i < SKINNY128_384_ROUNDS; i++) {			// add rconsts
 		for(int j = 0; j < 4; j++)

--- a/romulus/Implementations/crypto_aead/romulusm1/opt32/skinny128.c
+++ b/romulus/Implementations/crypto_aead/romulusm1/opt32/skinny128.c
@@ -92,19 +92,7 @@ void skinny128_384(u8* ctext, const u8* ptext, const u32* rtk1,  const u32* rtk2
 	u32 tmp; 					// used in SWAPMOVE macro
 	u32 state[4]; 				// 128-bit state
 	packing(state, ptext); 		// from byte to bitsliced representation
-	QUADRUPLE_ROUND(state, rtk1, 	rtk2_3);
-	QUADRUPLE_ROUND(state, rtk1+16, rtk2_3+16);
-	QUADRUPLE_ROUND(state, rtk1+32, rtk2_3+32);
-	QUADRUPLE_ROUND(state, rtk1+48, rtk2_3+48);
-	QUADRUPLE_ROUND(state, rtk1, 	rtk2_3+64);
-	QUADRUPLE_ROUND(state, rtk1+16, rtk2_3+80);
-	QUADRUPLE_ROUND(state, rtk1+32, rtk2_3+96);
-	QUADRUPLE_ROUND(state, rtk1+48, rtk2_3+112);
-	QUADRUPLE_ROUND(state, rtk1, 	rtk2_3+128);
-	QUADRUPLE_ROUND(state, rtk1+16, rtk2_3+144);
-	QUADRUPLE_ROUND(state, rtk1+32, rtk2_3+160);
-	QUADRUPLE_ROUND(state, rtk1+48, rtk2_3+176);
-	QUADRUPLE_ROUND(state, rtk1, 	rtk2_3+192);
-	QUADRUPLE_ROUND(state, rtk1+16, rtk2_3+208);
+	for(int i = 0; i < 14; i++)
+		QUADRUPLE_ROUND(state, rtk1 + (i%4)*16, rtk2_3 + i*16);
 	unpacking(ctext, state);	// from bitsliced to byte representation
 }
\ No newline at end of file
--- a/romulus/Implementations/crypto_aead/romulusm1/opt32/tk_schedule.c
+++ b/romulus/Implementations/crypto_aead/romulusm1/opt32/tk_schedule.c
@@ -271,6 +271,70 @@ void precompute_lfsr_tk3(u32* tk, const u8* key, const int rounds) {
 }

 /******************************************************************************
+* Precompute LFSR2(TK2) ^ LFSR3(TK3) for all round tweakeys.
+* It is equivalent to the following 2 function calls:
+* - precompute_lfsr_tk2(tk, t2, SKINNY128_384_ROUNDS);
+* - precompute_lfsr_tk3(tk, t3, SKINNY128_384_ROUNDS);
+* However, 'precompute_lfsr_tk2_3' can save cycles on some platforms.
+* On ARMv7 one should observe a gain of ~1k cycles per function call. This is
+* explained by the fact that fewer memory accesses to 'tk' are performed.
+* 
+* To save some code size, the loop can be replaced by the following one:
+*	for(int i = 0 ; i < rounds; i+=2) {
+*		lfsr2_bs(tk2);
+*		lfsr3_bs(tk3);
+*		tk[i*4+4] = tk2[0] ^ tk3[0];
+*		tk[i*4+5] = tk2[1] ^ tk3[1];
+*		tk[i*4+6] = tk2[2] ^ tk3[2];
+*		tk[i*4+7] = tk2[3] ^ tk3[3];
+*	}
+* at the cost of some cycles (~260 on ARM Cortex-M).
+******************************************************************************/
+void precompute_lfsr_tk2_3(u32* tk, const u8* t2, const u8* t3, const int rounds) { // writes tk2 ^ tk3 into every other 4-word slot of tk
+	u32 tmp, tk2[4], tk3[4]; // NOTE(review): 'tmp' is never named below; presumably scratch for a macro -- confirm
+	packing(tk2, t2); // pack t2 into the four words of tk2
+	packing(tk3, t3); // pack t3 into the four words of tk3
+	tk[0] = tk2[0] ^ tk3[0]; // slot 0: tk2 ^ tk3 before any update
+	tk[1] = tk2[1] ^ tk3[1];
+	tk[2] = tk2[2] ^ tk3[2];
+	tk[3] = tk2[3] ^ tk3[3];
+	for(int i = 0 ; i < rounds; i+=8) { // each pass fills the slots at +4, +12, +20, +28; +8, +16, +24 are not written here
+		tk2[0] ^= (tk2[2] & 0xaaaaaaaa); // update #1 on tk2: only word 0 is recomputed
+		tk2[0] = ((tk2[0] & 0xaaaaaaaa) >> 1) | ((tk2[0] << 1) & 0xaaaaaaaa); // swap each odd/even bit pair
+		tk3[3] ^= ((tk3[1] & 0xaaaaaaaa) >> 1); // update #1 on tk3: only word 3 is recomputed
+		tk3[3] = ((tk3[3] & 0xaaaaaaaa) >> 1) | ((tk3[3] << 1) & 0xaaaaaaaa); // swap each odd/even bit pair
+		tk[i*4+4] = tk2[1] ^ tk3[3]; // word order is rotated through indexing (tk2 one way, tk3 the other), no data is moved
+		tk[i*4+5] = tk2[2] ^ tk3[0];
+		tk[i*4+6] = tk2[3] ^ tk3[1];
+		tk[i*4+7] = tk2[0] ^ tk3[2];
+		tk2[1] ^= (tk2[3] & 0xaaaaaaaa); // update #2 on tk2: word 1
+		tk2[1] = ((tk2[1] & 0xaaaaaaaa) >> 1) | ((tk2[1] << 1) & 0xaaaaaaaa);
+		tk3[2] ^= ((tk3[0] & 0xaaaaaaaa) >> 1); // update #2 on tk3: word 2
+		tk3[2] = ((tk3[2] & 0xaaaaaaaa) >> 1) | ((tk3[2] << 1) & 0xaaaaaaaa);
+		tk[i*4+12] = tk2[2] ^ tk3[2]; // both word orders are now rotated by two positions
+		tk[i*4+13] = tk2[3] ^ tk3[3];
+		tk[i*4+14] = tk2[0] ^ tk3[0];
+		tk[i*4+15] = tk2[1] ^ tk3[1];
+		tk2[2] ^= (tk2[0] & 0xaaaaaaaa); // update #3 on tk2: word 2
+		tk2[2] = ((tk2[2] & 0xaaaaaaaa) >> 1) | ((tk2[2] << 1) & 0xaaaaaaaa);
+		tk3[1] ^= ((tk3[3] & 0xaaaaaaaa) >> 1); // update #3 on tk3: word 1
+		tk3[1] = ((tk3[1] & 0xaaaaaaaa) >> 1) | ((tk3[1] << 1) & 0xaaaaaaaa);
+		tk[i*4+20] = tk2[3] ^ tk3[1]; // word orders rotated by three positions
+		tk[i*4+21] = tk2[0] ^ tk3[2];
+		tk[i*4+22] = tk2[1] ^ tk3[3];
+		tk[i*4+23] = tk2[2] ^ tk3[0];
+		tk2[3] ^= (tk2[1] & 0xaaaaaaaa); // update #4 on tk2: word 3
+		tk2[3] = ((tk2[3] & 0xaaaaaaaa) >> 1) | ((tk2[3] << 1) & 0xaaaaaaaa);
+		tk3[0] ^= ((tk3[2] & 0xaaaaaaaa) >> 1); // update #4 on tk3: word 0
+		tk3[0] = ((tk3[0] & 0xaaaaaaaa) >> 1) | ((tk3[0] << 1) & 0xaaaaaaaa);
+		tk[i*4+28] = tk2[0] ^ tk3[0]; // after four updates the word order is back to natural
+		tk[i*4+29] = tk2[1] ^ tk3[1];
+		tk[i*4+30] = tk2[2] ^ tk3[2];
+		tk[i*4+31] = tk2[3] ^ tk3[3];
+	}
+}
+
+/******************************************************************************
 * XOR TK with TK1 before applying the permutations.
 * The key is then rearranged to match the barrel shiftrows representation.
 ******************************************************************************/
@@ -278,19 +342,20 @@ void permute_tk(u32* tk, const u8* key, const int rounds) {
 	u32 test;
 	u32 tk1[4], tmp[4];
 	packing(tk1, key);
-	memcpy(tmp, tk, 16);
-	tmp[0] ^= tk1[0];
-	tmp[1] ^= tk1[1];
-	tmp[2] ^= tk1[2];
-	tmp[3] ^= tk1[3];
+	tmp[0] = tk[0] ^ tk1[0];
+	tmp[1] = tk[1] ^ tk1[1];
+	tmp[2] = tk[2] ^ tk1[2];
+	tmp[3] = tk[3] ^ tk1[3];
 	for(int i = 0 ; i < rounds; i += 8) {
 		test = (i % 16 < 8) ? 1 : 0; 			//to apply the right power of P
 		tk[i*4] = tmp[2] & 0xf0f0f0f0;
 		tk[i*4+1] = tmp[3] & 0xf0f0f0f0;
 		tk[i*4+2] = tmp[0] & 0xf0f0f0f0;
 		tk[i*4+3] = tmp[1] & 0xf0f0f0f0;
-		memcpy(tmp, tk+i*4+4, 16);
-		XOR_BLOCKS(tmp, tk1);
+		tmp[0] = tk[i*4+4] ^ tk1[0];
+		tmp[1] = tk[i*4+5] ^ tk1[1];
+		tmp[2] = tk[i*4+6] ^ tk1[2];
+		tmp[3] = tk[i*4+7] ^ tk1[3];
 		if (test)
 			permute_tk_2(tmp); 					// applies P^2
 		else
@@ -307,8 +372,10 @@ void permute_tk(u32* tk, const u8* key, const int rounds) {
 		tk[i*4+10] |= ROR(tmp[0],12) & 0x0c0c0c0c;
 		tk[i*4+11] = ROR(tmp[1],28) & 0x03030303;
 		tk[i*4+11] |= ROR(tmp[1],12) & 0x0c0c0c0c;
-		memcpy(tmp, tk+i*4+12, 16);
-		XOR_BLOCKS(tmp, tk1);
+		tmp[0] = tk[i*4+12] ^ tk1[0];
+		tmp[1] = tk[i*4+13] ^ tk1[1];
+		tmp[2] = tk[i*4+14] ^ tk1[2];
+		tmp[3] = tk[i*4+15] ^ tk1[3];
 		if (test)
 			permute_tk_4(tmp); 					// applies P^4
 		else
@@ -321,8 +388,10 @@ void permute_tk(u32* tk, const u8* key, const int rounds) {
 		tk[i*4+17] = ROR(tmp[3], 16) & 0xf0f0f0f0;
 		tk[i*4+18] = ROR(tmp[0], 16) & 0xf0f0f0f0;
 		tk[i*4+19] = ROR(tmp[1], 16) & 0xf0f0f0f0;
-		memcpy(tmp, tk+i*4+20, 16);
-		XOR_BLOCKS(tmp, tk1);
+		tmp[0] = tk[i*4+20] ^ tk1[0];
+		tmp[1] = tk[i*4+21] ^ tk1[1];
+		tmp[2] = tk[i*4+22] ^ tk1[2];
+		tmp[3] = tk[i*4+23] ^ tk1[3];
 		if (test)
 			permute_tk_6(tmp); 					//	applies P^6
 		else
@@ -339,8 +408,10 @@ void permute_tk(u32* tk, const u8* key, const int rounds) {
 		tk[i*4+26] |= ROR(tmp[0],28) & 0x0c0c0c0c;
 		tk[i*4+27] = ROR(tmp[1],12) & 0x03030303;
 		tk[i*4+27] |= ROR(tmp[1],28) & 0x0c0c0c0c;
-		memcpy(tmp, tk+i*4+28, 16);
-		XOR_BLOCKS(tmp, tk1);
+		tmp[0] = tk[i*4+28] ^ tk1[0];
+		tmp[1] = tk[i*4+29] ^ tk1[1];
+		tmp[2] = tk[i*4+30] ^ tk1[2];
+		tmp[3] = tk[i*4+31] ^ tk1[3];
 		if (test)
 			permute_tk_8(tmp); 					// applies P^8
 		for(int j = 0; j < 4; j++) {
@@ -361,8 +432,7 @@ void permute_tk(u32* tk, const u8* key, const int rounds) {
 ******************************************************************************/
 void precompute_rtk2_3(u32* rtk, const u8* tk2, const u8 * tk3) {
 	memset(rtk, 0x00, 16*SKINNY128_384_ROUNDS);
-	precompute_lfsr_tk2(rtk, tk2, SKINNY128_384_ROUNDS);
-	precompute_lfsr_tk3(rtk, tk3, SKINNY128_384_ROUNDS);
+	precompute_lfsr_tk2_3(rtk, tk2, tk3, SKINNY128_384_ROUNDS);
 	permute_tk(rtk, (u8*)(rtk+8), SKINNY128_384_ROUNDS);	// rtk+8 is NULL
 	for(int i = 0; i < SKINNY128_384_ROUNDS; i++) {			// add rconsts
 		for(int j = 0; j < 4; j++)
@@ -376,4 +446,4 @@ void precompute_rtk2_3(u32* rtk, const u8* tk2, const u8 * tk3) {
 void precompute_rtk1(u32* rtk1, const u8* tk1) {
 	memset(rtk1, 0x00, 16*16);
 	permute_tk(rtk1, tk1, 16);
-}
\ No newline at end of file
+}
--- a/romulus/Implementations/crypto_aead/romulusn1+/opt32/skinny128.c
+++ b/romulus/Implementations/crypto_aead/romulusn1+/opt32/skinny128.c
@@ -93,15 +93,7 @@ void skinny128_384_plus(u8* ctext, const u8* ptext, const u32* rtk1,
 	u32 tmp; 					// used in SWAPMOVE macro
 	u32 state[4]; 				// 128-bit state
 	packing(state, ptext); 		// from byte to bitsliced representation
-	QUADRUPLE_ROUND(state, rtk1, 	rtk2_3);
-	QUADRUPLE_ROUND(state, rtk1+16, rtk2_3+16);
-	QUADRUPLE_ROUND(state, rtk1+32, rtk2_3+32);
-	QUADRUPLE_ROUND(state, rtk1+48, rtk2_3+48);
-	QUADRUPLE_ROUND(state, rtk1, 	rtk2_3+64);
-	QUADRUPLE_ROUND(state, rtk1+16, rtk2_3+80);
-	QUADRUPLE_ROUND(state, rtk1+32, rtk2_3+96);
-	QUADRUPLE_ROUND(state, rtk1+48, rtk2_3+112);
-	QUADRUPLE_ROUND(state, rtk1, 	rtk2_3+128);
-	QUADRUPLE_ROUND(state, rtk1+16, rtk2_3+144);
+	for(int i = 0; i < 10; i++)
+		QUADRUPLE_ROUND(state, rtk1 + (i%4)*16, rtk2_3 + i*16);
 	unpacking(ctext, state);	// from bitsliced to byte representation
 }
\ No newline at end of file
--- a/romulus/Implementations/crypto_aead/romulusn1+/opt32/tk_schedule.c
+++ b/romulus/Implementations/crypto_aead/romulusn1+/opt32/tk_schedule.c
@@ -260,6 +260,70 @@ void precompute_lfsr_tk3(u32* tk, const u8* key, const int rounds) {
 }

 /******************************************************************************
+* Precompute LFSR2(TK2) ^ LFSR3(TK3) for all round tweakeys.
+* It is equivalent to the following 2 function calls:
+* - precompute_lfsr_tk2(tk, t2, SKINNY128_384_ROUNDS);
+* - precompute_lfsr_tk3(tk, t3, SKINNY128_384_ROUNDS);
+* However, 'precompute_lfsr_tk2_3' can save cycles on some platforms.
+* On ARMv7 one should observe a gain of ~1k cycles per function call. This is
+* explained by the fact that fewer memory accesses to 'tk' are performed.
+* 
+* To save some code size, the loop can be replaced by the following one:
+*	for(int i = 0 ; i < rounds; i+=2) {
+*		lfsr2_bs(tk2);
+*		lfsr3_bs(tk3);
+*		tk[i*4+4] = tk2[0] ^ tk3[0];
+*		tk[i*4+5] = tk2[1] ^ tk3[1];
+*		tk[i*4+6] = tk2[2] ^ tk3[2];
+*		tk[i*4+7] = tk2[3] ^ tk3[3];
+*	}
+* at the cost of some cycles (~260 on ARM Cortex-M).
+******************************************************************************/
+void precompute_lfsr_tk2_3(u32* tk, const u8* t2, const u8* t3, const int rounds) { // writes tk2 ^ tk3 into every other 4-word slot of tk
+	u32 tmp, tk2[4], tk3[4]; // NOTE(review): 'tmp' is never named below; presumably scratch for a macro -- confirm
+	packing(tk2, t2); // pack t2 into the four words of tk2
+	packing(tk3, t3); // pack t3 into the four words of tk3
+	tk[0] = tk2[0] ^ tk3[0]; // slot 0: tk2 ^ tk3 before any update
+	tk[1] = tk2[1] ^ tk3[1];
+	tk[2] = tk2[2] ^ tk3[2];
+	tk[3] = tk2[3] ^ tk3[3];
+	for(int i = 0 ; i < rounds; i+=8) { // each pass fills the slots at +4, +12, +20, +28; +8, +16, +24 are not written here
+		tk2[0] ^= (tk2[2] & 0xaaaaaaaa); // update #1 on tk2: only word 0 is recomputed
+		tk2[0] = ((tk2[0] & 0xaaaaaaaa) >> 1) | ((tk2[0] << 1) & 0xaaaaaaaa); // swap each odd/even bit pair
+		tk3[3] ^= ((tk3[1] & 0xaaaaaaaa) >> 1); // update #1 on tk3: only word 3 is recomputed
+		tk3[3] = ((tk3[3] & 0xaaaaaaaa) >> 1) | ((tk3[3] << 1) & 0xaaaaaaaa); // swap each odd/even bit pair
+		tk[i*4+4] = tk2[1] ^ tk3[3]; // word order is rotated through indexing (tk2 one way, tk3 the other), no data is moved
+		tk[i*4+5] = tk2[2] ^ tk3[0];
+		tk[i*4+6] = tk2[3] ^ tk3[1];
+		tk[i*4+7] = tk2[0] ^ tk3[2];
+		tk2[1] ^= (tk2[3] & 0xaaaaaaaa); // update #2 on tk2: word 1
+		tk2[1] = ((tk2[1] & 0xaaaaaaaa) >> 1) | ((tk2[1] << 1) & 0xaaaaaaaa);
+		tk3[2] ^= ((tk3[0] & 0xaaaaaaaa) >> 1); // update #2 on tk3: word 2
+		tk3[2] = ((tk3[2] & 0xaaaaaaaa) >> 1) | ((tk3[2] << 1) & 0xaaaaaaaa);
+		tk[i*4+12] = tk2[2] ^ tk3[2]; // both word orders are now rotated by two positions
+		tk[i*4+13] = tk2[3] ^ tk3[3];
+		tk[i*4+14] = tk2[0] ^ tk3[0];
+		tk[i*4+15] = tk2[1] ^ tk3[1];
+		tk2[2] ^= (tk2[0] & 0xaaaaaaaa); // update #3 on tk2: word 2
+		tk2[2] = ((tk2[2] & 0xaaaaaaaa) >> 1) | ((tk2[2] << 1) & 0xaaaaaaaa);
+		tk3[1] ^= ((tk3[3] & 0xaaaaaaaa) >> 1); // update #3 on tk3: word 1
+		tk3[1] = ((tk3[1] & 0xaaaaaaaa) >> 1) | ((tk3[1] << 1) & 0xaaaaaaaa);
+		tk[i*4+20] = tk2[3] ^ tk3[1]; // word orders rotated by three positions
+		tk[i*4+21] = tk2[0] ^ tk3[2];
+		tk[i*4+22] = tk2[1] ^ tk3[3];
+		tk[i*4+23] = tk2[2] ^ tk3[0];
+		tk2[3] ^= (tk2[1] & 0xaaaaaaaa); // update #4 on tk2: word 3
+		tk2[3] = ((tk2[3] & 0xaaaaaaaa) >> 1) | ((tk2[3] << 1) & 0xaaaaaaaa);
+		tk3[0] ^= ((tk3[2] & 0xaaaaaaaa) >> 1); // update #4 on tk3: word 0
+		tk3[0] = ((tk3[0] & 0xaaaaaaaa) >> 1) | ((tk3[0] << 1) & 0xaaaaaaaa);
+		tk[i*4+28] = tk2[0] ^ tk3[0]; // after four updates the word order is back to natural
+		tk[i*4+29] = tk2[1] ^ tk3[1];
+		tk[i*4+30] = tk2[2] ^ tk3[2];
+		tk[i*4+31] = tk2[3] ^ tk3[3];
+	}
+}
+
+/******************************************************************************
 * XOR TK with TK1 before applying the permutations.
 * The key is then rearranged to match the barrel shiftrows representation.
 ******************************************************************************/
@@ -267,19 +331,20 @@ void permute_tk(u32* tk, const u8* key, const int rounds) {
 	u32 test;
 	u32 tk1[4], tmp[4];
 	packing(tk1, key);
-	memcpy(tmp, tk, 16);
-	tmp[0] ^= tk1[0];
-	tmp[1] ^= tk1[1];
-	tmp[2] ^= tk1[2];
-	tmp[3] ^= tk1[3];
+	tmp[0] = tk[0] ^ tk1[0];
+	tmp[1] = tk[1] ^ tk1[1];
+	tmp[2] = tk[2] ^ tk1[2];
+	tmp[3] = tk[3] ^ tk1[3];
 	for(int i = 0 ; i < rounds; i += 8) {
 		test = (i % 16 < 8) ? 1 : 0; 			//to apply the right power of P
 		tk[i*4] = tmp[2] & 0xf0f0f0f0;
 		tk[i*4+1] = tmp[3] & 0xf0f0f0f0;
 		tk[i*4+2] = tmp[0] & 0xf0f0f0f0;
 		tk[i*4+3] = tmp[1] & 0xf0f0f0f0;
-		memcpy(tmp, tk+i*4+4, 16);
-		XOR_BLOCKS(tmp, tk1);
+		tmp[0] = tk[i*4+4] ^ tk1[0];
+		tmp[1] = tk[i*4+5] ^ tk1[1];
+		tmp[2] = tk[i*4+6] ^ tk1[2];
+		tmp[3] = tk[i*4+7] ^ tk1[3];
 		if (test)
 			permute_tk_2(tmp); 					// applies P^2
 		else
@@ -296,8 +361,10 @@ void permute_tk(u32* tk, const u8* key, const int rounds) {
 		tk[i*4+10] |= ROR(tmp[0],12) & 0x0c0c0c0c;
 		tk[i*4+11] = ROR(tmp[1],28) & 0x03030303;
 		tk[i*4+11] |= ROR(tmp[1],12) & 0x0c0c0c0c;
-		memcpy(tmp, tk+i*4+12, 16);
-		XOR_BLOCKS(tmp, tk1);
+		tmp[0] = tk[i*4+12] ^ tk1[0];
+		tmp[1] = tk[i*4+13] ^ tk1[1];
+		tmp[2] = tk[i*4+14] ^ tk1[2];
+		tmp[3] = tk[i*4+15] ^ tk1[3];
 		if (test)
 			permute_tk_4(tmp); 					// applies P^4
 		else
@@ -310,8 +377,10 @@ void permute_tk(u32* tk, const u8* key, const int rounds) {
 		tk[i*4+17] = ROR(tmp[3], 16) & 0xf0f0f0f0;
 		tk[i*4+18] = ROR(tmp[0], 16) & 0xf0f0f0f0;
 		tk[i*4+19] = ROR(tmp[1], 16) & 0xf0f0f0f0;
-		memcpy(tmp, tk+i*4+20, 16);
-		XOR_BLOCKS(tmp, tk1);
+		tmp[0] = tk[i*4+20] ^ tk1[0];
+		tmp[1] = tk[i*4+21] ^ tk1[1];
+		tmp[2] = tk[i*4+22] ^ tk1[2];
+		tmp[3] = tk[i*4+23] ^ tk1[3];
 		if (test)
 			permute_tk_6(tmp); 					//	applies P^6
 		else
@@ -328,8 +397,10 @@ void permute_tk(u32* tk, const u8* key, const int rounds) {
 		tk[i*4+26] |= ROR(tmp[0],28) & 0x0c0c0c0c;
 		tk[i*4+27] = ROR(tmp[1],12) & 0x03030303;
 		tk[i*4+27] |= ROR(tmp[1],28) & 0x0c0c0c0c;
-		memcpy(tmp, tk+i*4+28, 16);
-		XOR_BLOCKS(tmp, tk1);
+		tmp[0] = tk[i*4+28] ^ tk1[0];
+		tmp[1] = tk[i*4+29] ^ tk1[1];
+		tmp[2] = tk[i*4+30] ^ tk1[2];
+		tmp[3] = tk[i*4+31] ^ tk1[3];
 		if (test)
 			permute_tk_8(tmp); 					// applies P^8
 		for(int j = 0; j < 4; j++) {
@@ -350,8 +421,7 @@ void permute_tk(u32* tk, const u8* key, const int rounds) {
 ******************************************************************************/
 void precompute_rtk2_3(u32* rtk, const u8* tk2, const u8 * tk3) {
 	memset(rtk, 0x00, 16*SKINNY128_384_ROUNDS);
-	precompute_lfsr_tk2(rtk, tk2, SKINNY128_384_ROUNDS);
-	precompute_lfsr_tk3(rtk, tk3, SKINNY128_384_ROUNDS);
+	precompute_lfsr_tk2_3(rtk, tk2, tk3, SKINNY128_384_ROUNDS);
 	permute_tk(rtk, (u8*)(rtk+8), SKINNY128_384_ROUNDS);	// rtk+8 is NULL
 	for(int i = 0; i < SKINNY128_384_ROUNDS; i++) {			// add rconsts
 		for(int j = 0; j < 4; j++)

--- a/romulus/Implementations/crypto_aead/romulusn1/opt32/skinny128.c
+++ b/romulus/Implementations/crypto_aead/romulusn1/opt32/skinny128.c
@@ -92,19 +92,7 @@ void skinny128_384(u8* ctext, const u8* ptext, const u32* rtk1,  const u32* rtk2
 	u32 tmp; 					// used in SWAPMOVE macro
 	u32 state[4]; 				// 128-bit state
 	packing(state, ptext); 		// from byte to bitsliced representation
-	QUADRUPLE_ROUND(state, rtk1, 	rtk2_3);
-	QUADRUPLE_ROUND(state, rtk1+16, rtk2_3+16);
-	QUADRUPLE_ROUND(state, rtk1+32, rtk2_3+32);
-	QUADRUPLE_ROUND(state, rtk1+48, rtk2_3+48);
-	QUADRUPLE_ROUND(state, rtk1, 	rtk2_3+64);
-	QUADRUPLE_ROUND(state, rtk1+16, rtk2_3+80);
-	QUADRUPLE_ROUND(state, rtk1+32, rtk2_3+96);
-	QUADRUPLE_ROUND(state, rtk1+48, rtk2_3+112);
-	QUADRUPLE_ROUND(state, rtk1, 	rtk2_3+128);
-	QUADRUPLE_ROUND(state, rtk1+16, rtk2_3+144);
-	QUADRUPLE_ROUND(state, rtk1+32, rtk2_3+160);
-	QUADRUPLE_ROUND(state, rtk1+48, rtk2_3+176);
-	QUADRUPLE_ROUND(state, rtk1, 	rtk2_3+192);
-	QUADRUPLE_ROUND(state, rtk1+16, rtk2_3+208);
+	for(int i = 0; i < 14; i++)
+		QUADRUPLE_ROUND(state, rtk1 + (i%4)*16, rtk2_3 + i*16);
 	unpacking(ctext, state);	// from bitsliced to byte representation
 }
\ No newline at end of file
--- a/romulus/Implementations/crypto_aead/romulusn1/opt32/tk_schedule.c
+++ b/romulus/Implementations/crypto_aead/romulusn1/opt32/tk_schedule.c
@@ -271,6 +271,70 @@ void precompute_lfsr_tk3(u32* tk, const u8* key, const int rounds) {
 }

 /******************************************************************************
+* Precompute LFSR2(TK2) ^ LFSR3(TK3) for all round tweakeys.
+* It is equivalent to the following 2 function calls:
+* - precompute_lfsr_tk2(tk, t2, SKINNY128_384_ROUNDS);
+* - precompute_lfsr_tk3(tk, t3, SKINNY128_384_ROUNDS);
+* However, 'precompute_lfsr_tk2_3' can save cycles on some platforms.
+* On ARMv7 one should observe a gain of ~1k cycles per function call. This is
+* explained by the fact that fewer memory accesses to 'tk' are performed.
+* 
+* To save some code size, the loop can be replaced by the following one:
+*	for(int i = 0 ; i < rounds; i+=2) {
+*		lfsr2_bs(tk2);
+*		lfsr3_bs(tk3);
+*		tk[i*4+4] = tk2[0] ^ tk3[0];
+*		tk[i*4+5] = tk2[1] ^ tk3[1];
+*		tk[i*4+6] = tk2[2] ^ tk3[2];
+*		tk[i*4+7] = tk2[3] ^ tk3[3];
+*	}
+* at the cost of some cycles (~260 on ARM Cortex-M).
+******************************************************************************/
+void precompute_lfsr_tk2_3(u32* tk, const u8* t2, const u8* t3, const int rounds) { // writes tk2 ^ tk3 into every other 4-word slot of tk
+	u32 tmp, tk2[4], tk3[4]; // NOTE(review): 'tmp' is never named below; presumably scratch for a macro -- confirm
+	packing(tk2, t2); // pack t2 into the four words of tk2
+	packing(tk3, t3); // pack t3 into the four words of tk3
+	tk[0] = tk2[0] ^ tk3[0]; // slot 0: tk2 ^ tk3 before any update
+	tk[1] = tk2[1] ^ tk3[1];
+	tk[2] = tk2[2] ^ tk3[2];
+	tk[3] = tk2[3] ^ tk3[3];
+	for(int i = 0 ; i < rounds; i+=8) { // each pass fills the slots at +4, +12, +20, +28; +8, +16, +24 are not written here
+		tk2[0] ^= (tk2[2] & 0xaaaaaaaa); // update #1 on tk2: only word 0 is recomputed
+		tk2[0] = ((tk2[0] & 0xaaaaaaaa) >> 1) | ((tk2[0] << 1) & 0xaaaaaaaa); // swap each odd/even bit pair
+		tk3[3] ^= ((tk3[1] & 0xaaaaaaaa) >> 1); // update #1 on tk3: only word 3 is recomputed
+		tk3[3] = ((tk3[3] & 0xaaaaaaaa) >> 1) | ((tk3[3] << 1) & 0xaaaaaaaa); // swap each odd/even bit pair
+		tk[i*4+4] = tk2[1] ^ tk3[3]; // word order is rotated through indexing (tk2 one way, tk3 the other), no data is moved
+		tk[i*4+5] = tk2[2] ^ tk3[0];
+		tk[i*4+6] = tk2[3] ^ tk3[1];
+		tk[i*4+7] = tk2[0] ^ tk3[2];
+		tk2[1] ^= (tk2[3] & 0xaaaaaaaa); // update #2 on tk2: word 1
+		tk2[1] = ((tk2[1] & 0xaaaaaaaa) >> 1) | ((tk2[1] << 1) & 0xaaaaaaaa);
+		tk3[2] ^= ((tk3[0] & 0xaaaaaaaa) >> 1); // update #2 on tk3: word 2
+		tk3[2] = ((tk3[2] & 0xaaaaaaaa) >> 1) | ((tk3[2] << 1) & 0xaaaaaaaa);
+		tk[i*4+12] = tk2[2] ^ tk3[2]; // both word orders are now rotated by two positions
+		tk[i*4+13] = tk2[3] ^ tk3[3];
+		tk[i*4+14] = tk2[0] ^ tk3[0];
+		tk[i*4+15] = tk2[1] ^ tk3[1];
+		tk2[2] ^= (tk2[0] & 0xaaaaaaaa); // update #3 on tk2: word 2
+		tk2[2] = ((tk2[2] & 0xaaaaaaaa) >> 1) | ((tk2[2] << 1) & 0xaaaaaaaa);
+		tk3[1] ^= ((tk3[3] & 0xaaaaaaaa) >> 1); // update #3 on tk3: word 1
+		tk3[1] = ((tk3[1] & 0xaaaaaaaa) >> 1) | ((tk3[1] << 1) & 0xaaaaaaaa);
+		tk[i*4+20] = tk2[3] ^ tk3[1]; // word orders rotated by three positions
+		tk[i*4+21] = tk2[0] ^ tk3[2];
+		tk[i*4+22] = tk2[1] ^ tk3[3];
+		tk[i*4+23] = tk2[2] ^ tk3[0];
+		tk2[3] ^= (tk2[1] & 0xaaaaaaaa); // update #4 on tk2: word 3
+		tk2[3] = ((tk2[3] & 0xaaaaaaaa) >> 1) | ((tk2[3] << 1) & 0xaaaaaaaa);
+		tk3[0] ^= ((tk3[2] & 0xaaaaaaaa) >> 1); // update #4 on tk3: word 0
+		tk3[0] = ((tk3[0] & 0xaaaaaaaa) >> 1) | ((tk3[0] << 1) & 0xaaaaaaaa);
+		tk[i*4+28] = tk2[0] ^ tk3[0]; // after four updates the word order is back to natural
+		tk[i*4+29] = tk2[1] ^ tk3[1];
+		tk[i*4+30] = tk2[2] ^ tk3[2];
+		tk[i*4+31] = tk2[3] ^ tk3[3];
+	}
+}
+
+/******************************************************************************
 * XOR TK with TK1 before applying the permutations.
 * The key is then rearranged to match the barrel shiftrows representation.
 ******************************************************************************/
@@ -278,19 +342,20 @@ void permute_tk(u32* tk, const u8* key, const int rounds) {
 	u32 test;
 	u32 tk1[4], tmp[4];
 	packing(tk1, key);
-	memcpy(tmp, tk, 16);
-	tmp[0] ^= tk1[0];
-	tmp[1] ^= tk1[1];
-	tmp[2] ^= tk1[2];
-	tmp[3] ^= tk1[3];
+	tmp[0] = tk[0] ^ tk1[0];
+	tmp[1] = tk[1] ^ tk1[1];
+	tmp[2] = tk[2] ^ tk1[2];
+	tmp[3] = tk[3] ^ tk1[3];
 	for(int i = 0 ; i < rounds; i += 8) {
 		test = (i % 16 < 8) ? 1 : 0; 			//to apply the right power of P
 		tk[i*4] = tmp[2] & 0xf0f0f0f0;
 		tk[i*4+1] = tmp[3] & 0xf0f0f0f0;
 		tk[i*4+2] = tmp[0] & 0xf0f0f0f0;
 		tk[i*4+3] = tmp[1] & 0xf0f0f0f0;
-		memcpy(tmp, tk+i*4+4, 16);
-		XOR_BLOCKS(tmp, tk1);
+		tmp[0] = tk[i*4+4] ^ tk1[0];
+		tmp[1] = tk[i*4+5] ^ tk1[1];
+		tmp[2] = tk[i*4+6] ^ tk1[2];
+		tmp[3] = tk[i*4+7] ^ tk1[3];
 		if (test)
 			permute_tk_2(tmp); 					// applies P^2
 		else
@@ -307,8 +372,10 @@ void permute_tk(u32* tk, const u8* key, const int rounds) {
 		tk[i*4+10] |= ROR(tmp[0],12) & 0x0c0c0c0c;
 		tk[i*4+11] = ROR(tmp[1],28) & 0x03030303;
 		tk[i*4+11] |= ROR(tmp[1],12) & 0x0c0c0c0c;
-		memcpy(tmp, tk+i*4+12, 16);
-		XOR_BLOCKS(tmp, tk1);
+		tmp[0] = tk[i*4+12] ^ tk1[0];
+		tmp[1] = tk[i*4+13] ^ tk1[1];
+		tmp[2] = tk[i*4+14] ^ tk1[2];
+		tmp[3] = tk[i*4+15] ^ tk1[3];
 		if (test)
 			permute_tk_4(tmp); 					// applies P^4
 		else
@@ -321,8 +388,10 @@ void permute_tk(u32* tk, const u8* key, const int rounds) {
 		tk[i*4+17] = ROR(tmp[3], 16) & 0xf0f0f0f0;
 		tk[i*4+18] = ROR(tmp[0], 16) & 0xf0f0f0f0;
 		tk[i*4+19] = ROR(tmp[1], 16) & 0xf0f0f0f0;
-		memcpy(tmp, tk+i*4+20, 16);
-		XOR_BLOCKS(tmp, tk1);
+		tmp[0] = tk[i*4+20] ^ tk1[0];
+		tmp[1] = tk[i*4+21] ^ tk1[1];
+		tmp[2] = tk[i*4+22] ^ tk1[2];
+		tmp[3] = tk[i*4+23] ^ tk1[3];
 		if (test)
 			permute_tk_6(tmp); 					//	applies P^6
 		else
@@ -339,8 +408,10 @@ void permute_tk(u32* tk, const u8* key, const int rounds) {
 		tk[i*4+26] |= ROR(tmp[0],28) & 0x0c0c0c0c;
 		tk[i*4+27] = ROR(tmp[1],12) & 0x03030303;
 		tk[i*4+27] |= ROR(tmp[1],28) & 0x0c0c0c0c;
-		memcpy(tmp, tk+i*4+28, 16);
-		XOR_BLOCKS(tmp, tk1);
+		tmp[0] = tk[i*4+28] ^ tk1[0];
+		tmp[1] = tk[i*4+29] ^ tk1[1];
+		tmp[2] = tk[i*4+30] ^ tk1[2];
+		tmp[3] = tk[i*4+31] ^ tk1[3];
 		if (test)
 			permute_tk_8(tmp); 					// applies P^8
 		for(int j = 0; j < 4; j++) {
@@ -361,8 +432,7 @@ void permute_tk(u32* tk, const u8* key, const int rounds) {
 ******************************************************************************/
 void precompute_rtk2_3(u32* rtk, const u8* tk2, const u8 * tk3) {
 	memset(rtk, 0x00, 16*SKINNY128_384_ROUNDS);
-	precompute_lfsr_tk2(rtk, tk2, SKINNY128_384_ROUNDS);
-	precompute_lfsr_tk3(rtk, tk3, SKINNY128_384_ROUNDS);
+	precompute_lfsr_tk2_3(rtk, tk2, tk3, SKINNY128_384_ROUNDS);
 	permute_tk(rtk, (u8*)(rtk+8), SKINNY128_384_ROUNDS);	// rtk+8 is NULL
 	for(int i = 0; i < SKINNY128_384_ROUNDS; i++) {			// add rconsts
 		for(int j = 0; j < 4; j++)
@@ -376,4 +446,4 @@ void precompute_rtk2_3(u32* rtk, const u8* tk2, const u8 * tk3) {
 void precompute_rtk1(u32* rtk1, const u8* tk1) {
 	memset(rtk1, 0x00, 16*16);
 	permute_tk(rtk1, tk1, 16);
-}
\ No newline at end of file
+}