Merge branch 'email-submissions'

cad26506 · Enrico Pozzobon · 121de979 · a3a77713 · cad26506 · cad26506
Commit cad26506 authored 5 years ago by Enrico Pozzobon
36 changed files
--- a/drygascon/Implementations/crypto_aead/drygascon128/add_arm_cortex-m/drygascon128_arm-v6m.S
+++ b/drygascon/Implementations/crypto_aead/drygascon128/add_arm_cortex-m/drygascon128_arm-v6m.S
@@ -4,8 +4,12 @@ Sebastien Riou, May 27th 2020

 Implementation optimized for ARM-Cortex-M0 (Size and Speed)
 */
+//define __DRYGASCON_ARM_SELECTOR_V6M__ or add drygascon128_arm_selector.h to includes

-#if defined(__DRYGASCON_ARM_SELECTOR_H__)
+#ifndef __DRYGASCON_ARM_SELECTOR_V6M__
+#include "drygascon128_arm_selector.h"
+#endif
+#if defined(__DRYGASCON_ARM_SELECTOR_V6M__)
 .cpu cortex-m0
 .syntax unified
 .code	16

--- a/drygascon/Implementations/crypto_aead/drygascon128/add_arm_cortex-m/drygascon128_arm-v7m.S
+++ b/drygascon/Implementations/crypto_aead/drygascon128/add_arm_cortex-m/drygascon128_arm-v7m.S
--- a/drygascon/Implementations/crypto_aead/drygascon128/add_arm_cortex-m/drygascon128_arm-v7m_fpu.S
+++ b/drygascon/Implementations/crypto_aead/drygascon128/add_arm_cortex-m/drygascon128_arm-v7m_fpu.S
--- a/drygascon/Implementations/crypto_aead/drygascon128/add_arm_cortex-m/drygascon128_arm-v7m_fpu_x.S
+++ b/drygascon/Implementations/crypto_aead/drygascon128/add_arm_cortex-m/drygascon128_arm-v7m_fpu_x.S
--- a/drygascon/Implementations/crypto_aead/drygascon128/add_arm_cortex-m/drygascon128_arm_selector.h
+++ b/drygascon/Implementations/crypto_aead/drygascon128/add_arm_cortex-m/drygascon128_arm_selector.h
@@ -3,39 +3,71 @@
 //Optional file to select the best implementation for each chip

 #ifdef STM32H743xx
-    #define __DRYGASCON_ARM_SELECTOR_V7M__
-    #define __DRYGASCON_ARM_SELECTOR_FPU__
+    #define __DRYGASCON_ARM_SELECTOR_V7M_FPU__
+    #define __DRYGASCON_ARM_SELECTOR_FOUND__
 #endif

 #ifdef STM32F746xx
+    #define __DRYGASCON_ARM_SELECTOR_V7M_FPU__
+    #define __DRYGASCON_ARM_SELECTOR_FOUND__
+#endif
+
+#ifdef STM32F411xx
+    #define __DRYGASCON_ARM_SELECTOR_V7M_FPU__
+    #define __DRYGASCON_ARM_SELECTOR_FOUND__
+#endif
+
+#ifdef STM32L552xx //technically it is V8M, but there is no dedicated code for it, so the V7M code is used
    #define __DRYGASCON_ARM_SELECTOR_V7M__
-    #define __DRYGASCON_ARM_SELECTOR_FPU__
+    #define __DRYGASCON_ARM_SELECTOR_FOUND__
 #endif

 #ifdef STM32F103xx
    #define __DRYGASCON_ARM_SELECTOR_V7M__
+    #define __DRYGASCON_ARM_SELECTOR_FOUND__
 #endif

 #ifdef STM32L011xx
    #define __DRYGASCON_ARM_SELECTOR_V6M__
+    #define __DRYGASCON_ARM_SELECTOR_FOUND__
 #endif

 #ifdef __SAM3X8E__
    #define __DRYGASCON_ARM_SELECTOR_V7M__
+    #define __DRYGASCON_ARM_SELECTOR_FOUND__
 #endif

 //TODO: add more chips here

-#ifdef __DRYGASCON_ARM_SELECTOR_V7M__
-    #ifdef __DRYGASCON_ARM_SELECTOR_FPU__
+#ifndef __DRYGASCON_ARM_SELECTOR_FOUND__
+    //more generic defines catching whole families
+    #if defined(STM32F4xx) || defined(STM32F7xx) || defined(STM32H7xx)
+        #define __DRYGASCON_ARM_SELECTOR_V7M_FPU__
+        #define __DRYGASCON_ARM_SELECTOR_FOUND__
+    #endif
+
+    #if defined(STM32F1xx)
+        #define __DRYGASCON_ARM_SELECTOR_V7M__
+        #define __DRYGASCON_ARM_SELECTOR_FOUND__
+    #endif
+#endif
+
+#ifdef __DRYGASCON_ARM_SELECTOR_V7M_FPU__
    #define DRYGASCON_G_OPT   drygascon128_g_v7m_fpu
    #define DRYGASCON_F_OPT   drygascon128_f_v7m_fpu
    #define DRYGASCON_G0_OPT  drygascon128_g0_v7m_fpu
-    #else
+#endif
+
+#ifdef __DRYGASCON_ARM_SELECTOR_V7M_FPU_X__
+    #define DRYGASCON_G_OPT   drygascon128_g_v7m_fpu_x
+    #define DRYGASCON_F_OPT   drygascon128_f_v7m_fpu_x
+    #define DRYGASCON_G0_OPT  drygascon128_g0_v7m_fpu_x
+#endif
+
+#ifdef __DRYGASCON_ARM_SELECTOR_V7M__
    #define DRYGASCON_G_OPT   drygascon128_g_v7m
    #define DRYGASCON_F_OPT   drygascon128_f_v7m
    #define DRYGASCON_G0_OPT  drygascon128_g0_v7m
-    #endif
 #endif

 #ifdef __DRYGASCON_ARM_SELECTOR_V6M__

--- a/drygascon/Implementations/crypto_aead/drygascon128/add_arm_cortex-m/encrypt.c
+++ b/drygascon/Implementations/crypto_aead/drygascon128/add_arm_cortex-m/encrypt.c
@@ -8,7 +8,7 @@ int crypto_aead_encrypt
     const unsigned char *npub,
     const unsigned char *k)
 {
-    return drygascon128_aead_encrypt
+    return drygascon128k16_aead_encrypt
        (c, clen, m, mlen, ad, adlen, nsec, npub, k);
 }

@@ -20,6 +20,6 @@ int crypto_aead_decrypt
     const unsigned char *npub,
     const unsigned char *k)
 {
-    return drygascon128_aead_decrypt
+    return drygascon128k16_aead_decrypt
        (m, mlen, nsec, c, clen, ad, adlen, npub, k);
 }
--- a/drygascon/Implementations/crypto_aead/drygascon128/add_arm_cortex-m/internal-drysponge.h
+++ b/drygascon/Implementations/crypto_aead/drygascon128/add_arm_cortex-m/internal-drysponge.h
--- a/romulus/Implementations/crypto_aead/romulusm1+/opt32/skinny128.c
+++ b/romulus/Implementations/crypto_aead/romulusm1+/opt32/skinny128.c
@@ -93,15 +93,7 @@ void skinny128_384_plus(u8* ctext, const u8* ptext, const u32* rtk1,
 	u32 tmp; 					// used in SWAPMOVE macro
 	u32 state[4]; 				// 128-bit state
 	packing(state, ptext); 		// from byte to bitsliced representation
-	QUADRUPLE_ROUND(state, rtk1, 	rtk2_3);
-	QUADRUPLE_ROUND(state, rtk1+16, rtk2_3+16);
-	QUADRUPLE_ROUND(state, rtk1+32, rtk2_3+32);
-	QUADRUPLE_ROUND(state, rtk1+48, rtk2_3+48);
-	QUADRUPLE_ROUND(state, rtk1, 	rtk2_3+64);
-	QUADRUPLE_ROUND(state, rtk1+16, rtk2_3+80);
-	QUADRUPLE_ROUND(state, rtk1+32, rtk2_3+96);
-	QUADRUPLE_ROUND(state, rtk1+48, rtk2_3+112);
-	QUADRUPLE_ROUND(state, rtk1, 	rtk2_3+128);
-	QUADRUPLE_ROUND(state, rtk1+16, rtk2_3+144);
+	for(int i = 0; i < 10; i++)
+		QUADRUPLE_ROUND(state, rtk1 + (i%4)*16, rtk2_3 + i*16);
 	unpacking(ctext, state);	// from bitsliced to byte representation
 }
\ No newline at end of file
--- a/romulus/Implementations/crypto_aead/romulusm1+/opt32/tk_schedule.c
+++ b/romulus/Implementations/crypto_aead/romulusm1+/opt32/tk_schedule.c
@@ -260,6 +260,70 @@ void precompute_lfsr_tk3(u32* tk, const u8* key, const int rounds) {
 }

 /******************************************************************************
+* Precompute LFSR2(TK2) ^ LFSR3(TK3) for all round tweakeys.
+* It is equivalent to the following 2 function calls:
+* - precompute_lfsr_tk2(tk, t2, SKINNY128_384_ROUNDS);
+* - precompute_lfsr_tk3(tk, t3, SKINNY128_384_ROUNDS);
+* However, 'precompute_lfsr_tk2_3' can save cycles on some platforms.
+* On ARMv7 one should observe a gain of ~1k cycles per function call, which is
+* explained by the fact that fewer memory accesses to 'tk' are performed.
+* 
+* To save some code size, the loop can be replaced by the following one:
+*	for(int i = 0 ; i < rounds; i+=2) {
+*		lfsr2_bs(tk2);
+*		lfsr3_bs(tk3);
+*		tk[i*4+4] = tk2[0] ^ tk3[0];
+*		tk[i*4+5] = tk2[1] ^ tk3[1];
+*		tk[i*4+6] = tk2[2] ^ tk3[2];
+*		tk[i*4+7] = tk2[3] ^ tk3[3];
+*	}
+* at the cost of some cycles (~260 on ARM Cortex-M).
+******************************************************************************/
+void precompute_lfsr_tk2_3(u32* tk, const u8* t2, const u8* t3, const int rounds) {
+	u32 tmp, tk2[4], tk3[4];	// NOTE(review): tmp is not referenced directly here; presumably used inside packing() - confirm
+	packing(tk2, t2);	// tk2 <- packed form of t2
+	packing(tk3, t3);	// tk3 <- packed form of t3
+	tk[0] = tk2[0] ^ tk3[0];	// words 0..3: XOR of the packed inputs, before any update step
+	tk[1] = tk2[1] ^ tk3[1];
+	tk[2] = tk2[2] ^ tk3[2];
+	tk[3] = tk2[3] ^ tk3[3];
+	for(int i = 0 ; i < rounds; i+=8) {	// each pass stores four 4-word groups at i*4+4, +12, +20 and +28
+		tk2[0] ^= (tk2[2] & 0xaaaaaaaa);	// 1st update step: only tk2[0] and tk3[3] are modified
+		tk2[0] = ((tk2[0] & 0xaaaaaaaa) >> 1) | ((tk2[0] << 1) & 0xaaaaaaaa);	// swap adjacent even/odd bits of tk2[0]
+		tk3[3] ^= ((tk3[1] & 0xaaaaaaaa) >> 1);
+		tk3[3] = ((tk3[3] & 0xaaaaaaaa) >> 1) | ((tk3[3] << 1) & 0xaaaaaaaa);	// swap adjacent even/odd bits of tk3[3]
+		tk[i*4+4] = tk2[1] ^ tk3[3];	// stored word j = tk2[(j+1)%4] ^ tk3[(j+3)%4]
+		tk[i*4+5] = tk2[2] ^ tk3[0];
+		tk[i*4+6] = tk2[3] ^ tk3[1];
+		tk[i*4+7] = tk2[0] ^ tk3[2];
+		tk2[1] ^= (tk2[3] & 0xaaaaaaaa);	// 2nd update step: only tk2[1] and tk3[2] are modified
+		tk2[1] = ((tk2[1] & 0xaaaaaaaa) >> 1) | ((tk2[1] << 1) & 0xaaaaaaaa);
+		tk3[2] ^= ((tk3[0] & 0xaaaaaaaa) >> 1);
+		tk3[2] = ((tk3[2] & 0xaaaaaaaa) >> 1) | ((tk3[2] << 1) & 0xaaaaaaaa);
+		tk[i*4+12] = tk2[2] ^ tk3[2];	// stored word j = tk2[(j+2)%4] ^ tk3[(j+2)%4]; words i*4+8..11 are not written here
+		tk[i*4+13] = tk2[3] ^ tk3[3];
+		tk[i*4+14] = tk2[0] ^ tk3[0];
+		tk[i*4+15] = tk2[1] ^ tk3[1];
+		tk2[2] ^= (tk2[0] & 0xaaaaaaaa);	// 3rd update step: only tk2[2] and tk3[1] are modified
+		tk2[2] = ((tk2[2] & 0xaaaaaaaa) >> 1) | ((tk2[2] << 1) & 0xaaaaaaaa);
+		tk3[1] ^= ((tk3[3] & 0xaaaaaaaa) >> 1);
+		tk3[1] = ((tk3[1] & 0xaaaaaaaa) >> 1) | ((tk3[1] << 1) & 0xaaaaaaaa);
+		tk[i*4+20] = tk2[3] ^ tk3[1];	// stored word j = tk2[(j+3)%4] ^ tk3[(j+1)%4]; words i*4+16..19 are not written here
+		tk[i*4+21] = tk2[0] ^ tk3[2];
+		tk[i*4+22] = tk2[1] ^ tk3[3];
+		tk[i*4+23] = tk2[2] ^ tk3[0];
+		tk2[3] ^= (tk2[1] & 0xaaaaaaaa);	// 4th update step: only tk2[3] and tk3[0] are modified
+		tk2[3] = ((tk2[3] & 0xaaaaaaaa) >> 1) | ((tk2[3] << 1) & 0xaaaaaaaa);
+		tk3[0] ^= ((tk3[2] & 0xaaaaaaaa) >> 1);
+		tk3[0] = ((tk3[0] & 0xaaaaaaaa) >> 1) | ((tk3[0] << 1) & 0xaaaaaaaa);
+		tk[i*4+28] = tk2[0] ^ tk3[0];	// stored word j = tk2[j] ^ tk3[j]; words i*4+24..27 are not written here
+		tk[i*4+29] = tk2[1] ^ tk3[1];
+		tk[i*4+30] = tk2[2] ^ tk3[2];
+		tk[i*4+31] = tk2[3] ^ tk3[3];
+	}
+}
+
+/******************************************************************************
 * XOR TK with TK1 before applying the permutations.
 * The key is then rearranged to match the barrel shiftrows representation.
 ******************************************************************************/
@@ -267,19 +331,20 @@ void permute_tk(u32* tk, const u8* key, const int rounds) {
 	u32 test;
 	u32 tk1[4], tmp[4];
 	packing(tk1, key);
-	memcpy(tmp, tk, 16);
-	tmp[0] ^= tk1[0];
-	tmp[1] ^= tk1[1];
-	tmp[2] ^= tk1[2];
-	tmp[3] ^= tk1[3];
+	tmp[0] = tk[0] ^ tk1[0];
+	tmp[1] = tk[1] ^ tk1[1];
+	tmp[2] = tk[2] ^ tk1[2];
+	tmp[3] = tk[3] ^ tk1[3];
 	for(int i = 0 ; i < rounds; i += 8) {
 		test = (i % 16 < 8) ? 1 : 0; 			//to apply the right power of P
 		tk[i*4] = tmp[2] & 0xf0f0f0f0;
 		tk[i*4+1] = tmp[3] & 0xf0f0f0f0;
 		tk[i*4+2] = tmp[0] & 0xf0f0f0f0;
 		tk[i*4+3] = tmp[1] & 0xf0f0f0f0;
-		memcpy(tmp, tk+i*4+4, 16);
-		XOR_BLOCKS(tmp, tk1);
+		tmp[0] = tk[i*4+4] ^ tk1[0];
+		tmp[1] = tk[i*4+5] ^ tk1[1];
+		tmp[2] = tk[i*4+6] ^ tk1[2];
+		tmp[3] = tk[i*4+7] ^ tk1[3];
 		if (test)
 			permute_tk_2(tmp); 					// applies P^2
 		else
@@ -296,8 +361,10 @@ void permute_tk(u32* tk, const u8* key, const int rounds) {
 		tk[i*4+10] |= ROR(tmp[0],12) & 0x0c0c0c0c;
 		tk[i*4+11] = ROR(tmp[1],28) & 0x03030303;
 		tk[i*4+11] |= ROR(tmp[1],12) & 0x0c0c0c0c;
-		memcpy(tmp, tk+i*4+12, 16);
-		XOR_BLOCKS(tmp, tk1);
+		tmp[0] = tk[i*4+12] ^ tk1[0];
+		tmp[1] = tk[i*4+13] ^ tk1[1];
+		tmp[2] = tk[i*4+14] ^ tk1[2];
+		tmp[3] = tk[i*4+15] ^ tk1[3];
 		if (test)
 			permute_tk_4(tmp); 					// applies P^4
 		else
@@ -310,8 +377,10 @@ void permute_tk(u32* tk, const u8* key, const int rounds) {
 		tk[i*4+17] = ROR(tmp[3], 16) & 0xf0f0f0f0;
 		tk[i*4+18] = ROR(tmp[0], 16) & 0xf0f0f0f0;
 		tk[i*4+19] = ROR(tmp[1], 16) & 0xf0f0f0f0;
-		memcpy(tmp, tk+i*4+20, 16);
-		XOR_BLOCKS(tmp, tk1);
+		tmp[0] = tk[i*4+20] ^ tk1[0];
+		tmp[1] = tk[i*4+21] ^ tk1[1];
+		tmp[2] = tk[i*4+22] ^ tk1[2];
+		tmp[3] = tk[i*4+23] ^ tk1[3];
 		if (test)
 			permute_tk_6(tmp); 					//	applies P^6
 		else
@@ -328,8 +397,10 @@ void permute_tk(u32* tk, const u8* key, const int rounds) {
 		tk[i*4+26] |= ROR(tmp[0],28) & 0x0c0c0c0c;
 		tk[i*4+27] = ROR(tmp[1],12) & 0x03030303;
 		tk[i*4+27] |= ROR(tmp[1],28) & 0x0c0c0c0c;
-		memcpy(tmp, tk+i*4+28, 16);
-		XOR_BLOCKS(tmp, tk1);
+		tmp[0] = tk[i*4+28] ^ tk1[0];
+		tmp[1] = tk[i*4+29] ^ tk1[1];
+		tmp[2] = tk[i*4+30] ^ tk1[2];
+		tmp[3] = tk[i*4+31] ^ tk1[3];
 		if (test)
 			permute_tk_8(tmp); 					// applies P^8
 		for(int j = 0; j < 4; j++) {
@@ -350,8 +421,7 @@ void permute_tk(u32* tk, const u8* key, const int rounds) {
 ******************************************************************************/
 void precompute_rtk2_3(u32* rtk, const u8* tk2, const u8 * tk3) {
 	memset(rtk, 0x00, 16*SKINNY128_384_ROUNDS);
-	precompute_lfsr_tk2(rtk, tk2, SKINNY128_384_ROUNDS);
-	precompute_lfsr_tk3(rtk, tk3, SKINNY128_384_ROUNDS);
+	precompute_lfsr_tk2_3(rtk, tk2, tk3, SKINNY128_384_ROUNDS);
 	permute_tk(rtk, (u8*)(rtk+8), SKINNY128_384_ROUNDS);	// rtk+8 is NULL
 	for(int i = 0; i < SKINNY128_384_ROUNDS; i++) {			// add rconsts
 		for(int j = 0; j < 4; j++)

--- a/romulus/Implementations/crypto_aead/romulusm1/opt32/skinny128.c
+++ b/romulus/Implementations/crypto_aead/romulusm1/opt32/skinny128.c
@@ -92,19 +92,7 @@ void skinny128_384(u8* ctext, const u8* ptext, const u32* rtk1,  const u32* rtk2
 	u32 tmp; 					// used in SWAPMOVE macro
 	u32 state[4]; 				// 128-bit state
 	packing(state, ptext); 		// from byte to bitsliced representation
-	QUADRUPLE_ROUND(state, rtk1, 	rtk2_3);
-	QUADRUPLE_ROUND(state, rtk1+16, rtk2_3+16);
-	QUADRUPLE_ROUND(state, rtk1+32, rtk2_3+32);
-	QUADRUPLE_ROUND(state, rtk1+48, rtk2_3+48);
-	QUADRUPLE_ROUND(state, rtk1, 	rtk2_3+64);
-	QUADRUPLE_ROUND(state, rtk1+16, rtk2_3+80);
-	QUADRUPLE_ROUND(state, rtk1+32, rtk2_3+96);
-	QUADRUPLE_ROUND(state, rtk1+48, rtk2_3+112);
-	QUADRUPLE_ROUND(state, rtk1, 	rtk2_3+128);
-	QUADRUPLE_ROUND(state, rtk1+16, rtk2_3+144);
-	QUADRUPLE_ROUND(state, rtk1+32, rtk2_3+160);
-	QUADRUPLE_ROUND(state, rtk1+48, rtk2_3+176);
-	QUADRUPLE_ROUND(state, rtk1, 	rtk2_3+192);
-	QUADRUPLE_ROUND(state, rtk1+16, rtk2_3+208);
+	for(int i = 0; i < 14; i++)
+		QUADRUPLE_ROUND(state, rtk1 + (i%4)*16, rtk2_3 + i*16);
 	unpacking(ctext, state);	// from bitsliced to byte representation
 }
\ No newline at end of file
--- a/romulus/Implementations/crypto_aead/romulusm1/opt32/tk_schedule.c
+++ b/romulus/Implementations/crypto_aead/romulusm1/opt32/tk_schedule.c
@@ -271,6 +271,70 @@ void precompute_lfsr_tk3(u32* tk, const u8* key, const int rounds) {
 }

 /******************************************************************************
+* Precompute LFSR2(TK2) ^ LFSR3(TK3) for all round tweakeys.
+* It is equivalent to the following 2 function calls:
+* - precompute_lfsr_tk2(tk, t2, SKINNY128_384_ROUNDS);
+* - precompute_lfsr_tk3(tk, t3, SKINNY128_384_ROUNDS);
+* However, 'precompute_lfsr_tk2_3' can save cycles on some platforms.
+* On ARMv7 one should observe a gain of ~1k cycles per function call, which is
+* explained by the fact that fewer memory accesses to 'tk' are performed.
+* 
+* To save some code size, the loop can be replaced by the following one:
+*	for(int i = 0 ; i < rounds; i+=2) {
+*		lfsr2_bs(tk2);
+*		lfsr3_bs(tk3);
+*		tk[i*4+4] = tk2[0] ^ tk3[0];
+*		tk[i*4+5] = tk2[1] ^ tk3[1];
+*		tk[i*4+6] = tk2[2] ^ tk3[2];
+*		tk[i*4+7] = tk2[3] ^ tk3[3];
+*	}
+* at the cost of some cycles (~260 on ARM Cortex-M).
+******************************************************************************/
+void precompute_lfsr_tk2_3(u32* tk, const u8* t2, const u8* t3, const int rounds) {
+	u32 tmp, tk2[4], tk3[4];	// NOTE(review): tmp is not referenced directly here; presumably used inside packing() - confirm
+	packing(tk2, t2);	// tk2 <- packed form of t2
+	packing(tk3, t3);	// tk3 <- packed form of t3
+	tk[0] = tk2[0] ^ tk3[0];	// words 0..3: XOR of the packed inputs, before any update step
+	tk[1] = tk2[1] ^ tk3[1];
+	tk[2] = tk2[2] ^ tk3[2];
+	tk[3] = tk2[3] ^ tk3[3];
+	for(int i = 0 ; i < rounds; i+=8) {	// each pass stores four 4-word groups at i*4+4, +12, +20 and +28
+		tk2[0] ^= (tk2[2] & 0xaaaaaaaa);	// 1st update step: only tk2[0] and tk3[3] are modified
+		tk2[0] = ((tk2[0] & 0xaaaaaaaa) >> 1) | ((tk2[0] << 1) & 0xaaaaaaaa);	// swap adjacent even/odd bits of tk2[0]
+		tk3[3] ^= ((tk3[1] & 0xaaaaaaaa) >> 1);
+		tk3[3] = ((tk3[3] & 0xaaaaaaaa) >> 1) | ((tk3[3] << 1) & 0xaaaaaaaa);	// swap adjacent even/odd bits of tk3[3]
+		tk[i*4+4] = tk2[1] ^ tk3[3];	// stored word j = tk2[(j+1)%4] ^ tk3[(j+3)%4]
+		tk[i*4+5] = tk2[2] ^ tk3[0];
+		tk[i*4+6] = tk2[3] ^ tk3[1];
+		tk[i*4+7] = tk2[0] ^ tk3[2];
+		tk2[1] ^= (tk2[3] & 0xaaaaaaaa);	// 2nd update step: only tk2[1] and tk3[2] are modified
+		tk2[1] = ((tk2[1] & 0xaaaaaaaa) >> 1) | ((tk2[1] << 1) & 0xaaaaaaaa);
+		tk3[2] ^= ((tk3[0] & 0xaaaaaaaa) >> 1);
+		tk3[2] = ((tk3[2] & 0xaaaaaaaa) >> 1) | ((tk3[2] << 1) & 0xaaaaaaaa);
+		tk[i*4+12] = tk2[2] ^ tk3[2];	// stored word j = tk2[(j+2)%4] ^ tk3[(j+2)%4]; words i*4+8..11 are not written here
+		tk[i*4+13] = tk2[3] ^ tk3[3];
+		tk[i*4+14] = tk2[0] ^ tk3[0];
+		tk[i*4+15] = tk2[1] ^ tk3[1];
+		tk2[2] ^= (tk2[0] & 0xaaaaaaaa);	// 3rd update step: only tk2[2] and tk3[1] are modified
+		tk2[2] = ((tk2[2] & 0xaaaaaaaa) >> 1) | ((tk2[2] << 1) & 0xaaaaaaaa);
+		tk3[1] ^= ((tk3[3] & 0xaaaaaaaa) >> 1);
+		tk3[1] = ((tk3[1] & 0xaaaaaaaa) >> 1) | ((tk3[1] << 1) & 0xaaaaaaaa);
+		tk[i*4+20] = tk2[3] ^ tk3[1];	// stored word j = tk2[(j+3)%4] ^ tk3[(j+1)%4]; words i*4+16..19 are not written here
+		tk[i*4+21] = tk2[0] ^ tk3[2];
+		tk[i*4+22] = tk2[1] ^ tk3[3];
+		tk[i*4+23] = tk2[2] ^ tk3[0];
+		tk2[3] ^= (tk2[1] & 0xaaaaaaaa);	// 4th update step: only tk2[3] and tk3[0] are modified
+		tk2[3] = ((tk2[3] & 0xaaaaaaaa) >> 1) | ((tk2[3] << 1) & 0xaaaaaaaa);
+		tk3[0] ^= ((tk3[2] & 0xaaaaaaaa) >> 1);
+		tk3[0] = ((tk3[0] & 0xaaaaaaaa) >> 1) | ((tk3[0] << 1) & 0xaaaaaaaa);
+		tk[i*4+28] = tk2[0] ^ tk3[0];	// stored word j = tk2[j] ^ tk3[j]; words i*4+24..27 are not written here
+		tk[i*4+29] = tk2[1] ^ tk3[1];
+		tk[i*4+30] = tk2[2] ^ tk3[2];
+		tk[i*4+31] = tk2[3] ^ tk3[3];
+	}
+}
+
+/******************************************************************************
 * XOR TK with TK1 before applying the permutations.
 * The key is then rearranged to match the barrel shiftrows representation.
 ******************************************************************************/
@@ -278,19 +342,20 @@ void permute_tk(u32* tk, const u8* key, const int rounds) {
 	u32 test;
 	u32 tk1[4], tmp[4];
 	packing(tk1, key);
-	memcpy(tmp, tk, 16);
-	tmp[0] ^= tk1[0];
-	tmp[1] ^= tk1[1];
-	tmp[2] ^= tk1[2];
-	tmp[3] ^= tk1[3];
+	tmp[0] = tk[0] ^ tk1[0];
+	tmp[1] = tk[1] ^ tk1[1];
+	tmp[2] = tk[2] ^ tk1[2];
+	tmp[3] = tk[3] ^ tk1[3];
 	for(int i = 0 ; i < rounds; i += 8) {
 		test = (i % 16 < 8) ? 1 : 0; 			//to apply the right power of P
 		tk[i*4] = tmp[2] & 0xf0f0f0f0;
 		tk[i*4+1] = tmp[3] & 0xf0f0f0f0;
 		tk[i*4+2] = tmp[0] & 0xf0f0f0f0;
 		tk[i*4+3] = tmp[1] & 0xf0f0f0f0;
-		memcpy(tmp, tk+i*4+4, 16);
-		XOR_BLOCKS(tmp, tk1);
+		tmp[0] = tk[i*4+4] ^ tk1[0];
+		tmp[1] = tk[i*4+5] ^ tk1[1];
+		tmp[2] = tk[i*4+6] ^ tk1[2];
+		tmp[3] = tk[i*4+7] ^ tk1[3];
 		if (test)
 			permute_tk_2(tmp); 					// applies P^2
 		else
@@ -307,8 +372,10 @@ void permute_tk(u32* tk, const u8* key, const int rounds) {
 		tk[i*4+10] |= ROR(tmp[0],12) & 0x0c0c0c0c;
 		tk[i*4+11] = ROR(tmp[1],28) & 0x03030303;
 		tk[i*4+11] |= ROR(tmp[1],12) & 0x0c0c0c0c;
-		memcpy(tmp, tk+i*4+12, 16);
-		XOR_BLOCKS(tmp, tk1);
+		tmp[0] = tk[i*4+12] ^ tk1[0];
+		tmp[1] = tk[i*4+13] ^ tk1[1];
+		tmp[2] = tk[i*4+14] ^ tk1[2];
+		tmp[3] = tk[i*4+15] ^ tk1[3];
 		if (test)
 			permute_tk_4(tmp); 					// applies P^4
 		else
@@ -321,8 +388,10 @@ void permute_tk(u32* tk, const u8* key, const int rounds) {
 		tk[i*4+17] = ROR(tmp[3], 16) & 0xf0f0f0f0;
 		tk[i*4+18] = ROR(tmp[0], 16) & 0xf0f0f0f0;
 		tk[i*4+19] = ROR(tmp[1], 16) & 0xf0f0f0f0;
-		memcpy(tmp, tk+i*4+20, 16);
-		XOR_BLOCKS(tmp, tk1);
+		tmp[0] = tk[i*4+20] ^ tk1[0];
+		tmp[1] = tk[i*4+21] ^ tk1[1];
+		tmp[2] = tk[i*4+22] ^ tk1[2];
+		tmp[3] = tk[i*4+23] ^ tk1[3];
 		if (test)
 			permute_tk_6(tmp); 					//	applies P^6
 		else
@@ -339,8 +408,10 @@ void permute_tk(u32* tk, const u8* key, const int rounds) {
 		tk[i*4+26] |= ROR(tmp[0],28) & 0x0c0c0c0c;
 		tk[i*4+27] = ROR(tmp[1],12) & 0x03030303;
 		tk[i*4+27] |= ROR(tmp[1],28) & 0x0c0c0c0c;
-		memcpy(tmp, tk+i*4+28, 16);
-		XOR_BLOCKS(tmp, tk1);
+		tmp[0] = tk[i*4+28] ^ tk1[0];
+		tmp[1] = tk[i*4+29] ^ tk1[1];
+		tmp[2] = tk[i*4+30] ^ tk1[2];
+		tmp[3] = tk[i*4+31] ^ tk1[3];
 		if (test)
 			permute_tk_8(tmp); 					// applies P^8
 		for(int j = 0; j < 4; j++) {
@@ -361,8 +432,7 @@ void permute_tk(u32* tk, const u8* key, const int rounds) {
 ******************************************************************************/
 void precompute_rtk2_3(u32* rtk, const u8* tk2, const u8 * tk3) {
 	memset(rtk, 0x00, 16*SKINNY128_384_ROUNDS);
-	precompute_lfsr_tk2(rtk, tk2, SKINNY128_384_ROUNDS);
-	precompute_lfsr_tk3(rtk, tk3, SKINNY128_384_ROUNDS);
+	precompute_lfsr_tk2_3(rtk, tk2, tk3, SKINNY128_384_ROUNDS);
 	permute_tk(rtk, (u8*)(rtk+8), SKINNY128_384_ROUNDS);	// rtk+8 is NULL
 	for(int i = 0; i < SKINNY128_384_ROUNDS; i++) {			// add rconsts
 		for(int j = 0; j < 4; j++)

--- a/romulus/Implementations/crypto_aead/romulusn1+/opt32/skinny128.c
+++ b/romulus/Implementations/crypto_aead/romulusn1+/opt32/skinny128.c
@@ -93,15 +93,7 @@ void skinny128_384_plus(u8* ctext, const u8* ptext, const u32* rtk1,
 	u32 tmp; 					// used in SWAPMOVE macro
 	u32 state[4]; 				// 128-bit state
 	packing(state, ptext); 		// from byte to bitsliced representation
-	QUADRUPLE_ROUND(state, rtk1, 	rtk2_3);
-	QUADRUPLE_ROUND(state, rtk1+16, rtk2_3+16);
-	QUADRUPLE_ROUND(state, rtk1+32, rtk2_3+32);
-	QUADRUPLE_ROUND(state, rtk1+48, rtk2_3+48);
-	QUADRUPLE_ROUND(state, rtk1, 	rtk2_3+64);
-	QUADRUPLE_ROUND(state, rtk1+16, rtk2_3+80);
-	QUADRUPLE_ROUND(state, rtk1+32, rtk2_3+96);
-	QUADRUPLE_ROUND(state, rtk1+48, rtk2_3+112);
-	QUADRUPLE_ROUND(state, rtk1, 	rtk2_3+128);
-	QUADRUPLE_ROUND(state, rtk1+16, rtk2_3+144);
+	for(int i = 0; i < 10; i++)
+		QUADRUPLE_ROUND(state, rtk1 + (i%4)*16, rtk2_3 + i*16);
 	unpacking(ctext, state);	// from bitsliced to byte representation
 }
\ No newline at end of file
--- a/romulus/Implementations/crypto_aead/romulusn1+/opt32/tk_schedule.c
+++ b/romulus/Implementations/crypto_aead/romulusn1+/opt32/tk_schedule.c
@@ -260,6 +260,70 @@ void precompute_lfsr_tk3(u32* tk, const u8* key, const int rounds) {
 }

 /******************************************************************************
+* Precompute LFSR2(TK2) ^ LFSR3(TK3) for all round tweakeys.
+* It is equivalent to the following 2 function calls:
+* - precompute_lfsr_tk2(tk, t2, SKINNY128_384_ROUNDS);
+* - precompute_lfsr_tk3(tk, t3, SKINNY128_384_ROUNDS);
+* However, 'precompute_lfsr_tk2_3' can save cycles on some platforms.
+* On ARMv7 one should observe a gain of ~1k cycles per function call, which is
+* explained by the fact that fewer memory accesses to 'tk' are performed.
+* 
+* To save some code size, the loop can be replaced by the following one:
+*	for(int i = 0 ; i < rounds; i+=2) {
+*		lfsr2_bs(tk2);
+*		lfsr3_bs(tk3);
+*		tk[i*4+4] = tk2[0] ^ tk3[0];
+*		tk[i*4+5] = tk2[1] ^ tk3[1];
+*		tk[i*4+6] = tk2[2] ^ tk3[2];
+*		tk[i*4+7] = tk2[3] ^ tk3[3];
+*	}
+* at the cost of some cycles (~260 on ARM Cortex-M).
+******************************************************************************/
+void precompute_lfsr_tk2_3(u32* tk, const u8* t2, const u8* t3, const int rounds) {
+	u32 tmp, tk2[4], tk3[4];	// NOTE(review): tmp is not referenced directly here; presumably used inside packing() - confirm
+	packing(tk2, t2);	// tk2 <- packed form of t2
+	packing(tk3, t3);	// tk3 <- packed form of t3
+	tk[0] = tk2[0] ^ tk3[0];	// words 0..3: XOR of the packed inputs, before any update step
+	tk[1] = tk2[1] ^ tk3[1];
+	tk[2] = tk2[2] ^ tk3[2];
+	tk[3] = tk2[3] ^ tk3[3];
+	for(int i = 0 ; i < rounds; i+=8) {	// each pass stores four 4-word groups at i*4+4, +12, +20 and +28
+		tk2[0] ^= (tk2[2] & 0xaaaaaaaa);	// 1st update step: only tk2[0] and tk3[3] are modified
+		tk2[0] = ((tk2[0] & 0xaaaaaaaa) >> 1) | ((tk2[0] << 1) & 0xaaaaaaaa);	// swap adjacent even/odd bits of tk2[0]
+		tk3[3] ^= ((tk3[1] & 0xaaaaaaaa) >> 1);
+		tk3[3] = ((tk3[3] & 0xaaaaaaaa) >> 1) | ((tk3[3] << 1) & 0xaaaaaaaa);	// swap adjacent even/odd bits of tk3[3]
+		tk[i*4+4] = tk2[1] ^ tk3[3];	// stored word j = tk2[(j+1)%4] ^ tk3[(j+3)%4]
+		tk[i*4+5] = tk2[2] ^ tk3[0];
+		tk[i*4+6] = tk2[3] ^ tk3[1];
+		tk[i*4+7] = tk2[0] ^ tk3[2];
+		tk2[1] ^= (tk2[3] & 0xaaaaaaaa);	// 2nd update step: only tk2[1] and tk3[2] are modified
+		tk2[1] = ((tk2[1] & 0xaaaaaaaa) >> 1) | ((tk2[1] << 1) & 0xaaaaaaaa);
+		tk3[2] ^= ((tk3[0] & 0xaaaaaaaa) >> 1);
+		tk3[2] = ((tk3[2] & 0xaaaaaaaa) >> 1) | ((tk3[2] << 1) & 0xaaaaaaaa);
+		tk[i*4+12] = tk2[2] ^ tk3[2];	// stored word j = tk2[(j+2)%4] ^ tk3[(j+2)%4]; words i*4+8..11 are not written here
+		tk[i*4+13] = tk2[3] ^ tk3[3];
+		tk[i*4+14] = tk2[0] ^ tk3[0];
+		tk[i*4+15] = tk2[1] ^ tk3[1];
+		tk2[2] ^= (tk2[0] & 0xaaaaaaaa);	// 3rd update step: only tk2[2] and tk3[1] are modified
+		tk2[2] = ((tk2[2] & 0xaaaaaaaa) >> 1) | ((tk2[2] << 1) & 0xaaaaaaaa);
+		tk3[1] ^= ((tk3[3] & 0xaaaaaaaa) >> 1);
+		tk3[1] = ((tk3[1] & 0xaaaaaaaa) >> 1) | ((tk3[1] << 1) & 0xaaaaaaaa);
+		tk[i*4+20] = tk2[3] ^ tk3[1];	// stored word j = tk2[(j+3)%4] ^ tk3[(j+1)%4]; words i*4+16..19 are not written here
+		tk[i*4+21] = tk2[0] ^ tk3[2];
+		tk[i*4+22] = tk2[1] ^ tk3[3];
+		tk[i*4+23] = tk2[2] ^ tk3[0];
+		tk2[3] ^= (tk2[1] & 0xaaaaaaaa);	// 4th update step: only tk2[3] and tk3[0] are modified
+		tk2[3] = ((tk2[3] & 0xaaaaaaaa) >> 1) | ((tk2[3] << 1) & 0xaaaaaaaa);
+		tk3[0] ^= ((tk3[2] & 0xaaaaaaaa) >> 1);
+		tk3[0] = ((tk3[0] & 0xaaaaaaaa) >> 1) | ((tk3[0] << 1) & 0xaaaaaaaa);
+		tk[i*4+28] = tk2[0] ^ tk3[0];	// stored word j = tk2[j] ^ tk3[j]; words i*4+24..27 are not written here
+		tk[i*4+29] = tk2[1] ^ tk3[1];
+		tk[i*4+30] = tk2[2] ^ tk3[2];
+		tk[i*4+31] = tk2[3] ^ tk3[3];
+	}
+}
+
+/******************************************************************************
 * XOR TK with TK1 before applying the permutations.
 * The key is then rearranged to match the barrel shiftrows representation.
 ******************************************************************************/
@@ -267,19 +331,20 @@ void permute_tk(u32* tk, const u8* key, const int rounds) {
 	u32 test;
 	u32 tk1[4], tmp[4];
 	packing(tk1, key);
-	memcpy(tmp, tk, 16);
-	tmp[0] ^= tk1[0];
-	tmp[1] ^= tk1[1];
-	tmp[2] ^= tk1[2];
-	tmp[3] ^= tk1[3];
+	tmp[0] = tk[0] ^ tk1[0];
+	tmp[1] = tk[1] ^ tk1[1];
+	tmp[2] = tk[2] ^ tk1[2];
+	tmp[3] = tk[3] ^ tk1[3];
 	for(int i = 0 ; i < rounds; i += 8) {
 		test = (i % 16 < 8) ? 1 : 0; 			//to apply the right power of P
 		tk[i*4] = tmp[2] & 0xf0f0f0f0;
 		tk[i*4+1] = tmp[3] & 0xf0f0f0f0;
 		tk[i*4+2] = tmp[0] & 0xf0f0f0f0;
 		tk[i*4+3] = tmp[1] & 0xf0f0f0f0;
-		memcpy(tmp, tk+i*4+4, 16);
-		XOR_BLOCKS(tmp, tk1);
+		tmp[0] = tk[i*4+4] ^ tk1[0];
+		tmp[1] = tk[i*4+5] ^ tk1[1];
+		tmp[2] = tk[i*4+6] ^ tk1[2];
+		tmp[3] = tk[i*4+7] ^ tk1[3];
 		if (test)
 			permute_tk_2(tmp); 					// applies P^2
 		else
@@ -296,8 +361,10 @@ void permute_tk(u32* tk, const u8* key, const int rounds) {
 		tk[i*4+10] |= ROR(tmp[0],12) & 0x0c0c0c0c;
 		tk[i*4+11] = ROR(tmp[1],28) & 0x03030303;
 		tk[i*4+11] |= ROR(tmp[1],12) & 0x0c0c0c0c;
-		memcpy(tmp, tk+i*4+12, 16);
-		XOR_BLOCKS(tmp, tk1);
+		tmp[0] = tk[i*4+12] ^ tk1[0];
+		tmp[1] = tk[i*4+13] ^ tk1[1];
+		tmp[2] = tk[i*4+14] ^ tk1[2];
+		tmp[3] = tk[i*4+15] ^ tk1[3];
 		if (test)
 			permute_tk_4(tmp); 					// applies P^4
 		else
@@ -310,8 +377,10 @@ void permute_tk(u32* tk, const u8* key, const int rounds) {
 		tk[i*4+17] = ROR(tmp[3], 16) & 0xf0f0f0f0;
 		tk[i*4+18] = ROR(tmp[0], 16) & 0xf0f0f0f0;
 		tk[i*4+19] = ROR(tmp[1], 16) & 0xf0f0f0f0;
-		memcpy(tmp, tk+i*4+20, 16);
-		XOR_BLOCKS(tmp, tk1);
+		tmp[0] = tk[i*4+20] ^ tk1[0];
+		tmp[1] = tk[i*4+21] ^ tk1[1];
+		tmp[2] = tk[i*4+22] ^ tk1[2];
+		tmp[3] = tk[i*4+23] ^ tk1[3];
 		if (test)
 			permute_tk_6(tmp); 					//	applies P^6
 		else
@@ -328,8 +397,10 @@ void permute_tk(u32* tk, const u8* key, const int rounds) {
 		tk[i*4+26] |= ROR(tmp[0],28) & 0x0c0c0c0c;
 		tk[i*4+27] = ROR(tmp[1],12) & 0x03030303;
 		tk[i*4+27] |= ROR(tmp[1],28) & 0x0c0c0c0c;
-		memcpy(tmp, tk+i*4+28, 16);
-		XOR_BLOCKS(tmp, tk1);
+		tmp[0] = tk[i*4+28] ^ tk1[0];
+		tmp[1] = tk[i*4+29] ^ tk1[1];
+		tmp[2] = tk[i*4+30] ^ tk1[2];
+		tmp[3] = tk[i*4+31] ^ tk1[3];
 		if (test)
 			permute_tk_8(tmp); 					// applies P^8
 		for(int j = 0; j < 4; j++) {
@@ -350,8 +421,7 @@ void permute_tk(u32* tk, const u8* key, const int rounds) {
 ******************************************************************************/
 void precompute_rtk2_3(u32* rtk, const u8* tk2, const u8 * tk3) {
 	memset(rtk, 0x00, 16*SKINNY128_384_ROUNDS);
-	precompute_lfsr_tk2(rtk, tk2, SKINNY128_384_ROUNDS);
-	precompute_lfsr_tk3(rtk, tk3, SKINNY128_384_ROUNDS);
+	precompute_lfsr_tk2_3(rtk, tk2, tk3, SKINNY128_384_ROUNDS);
 	permute_tk(rtk, (u8*)(rtk+8), SKINNY128_384_ROUNDS);	// rtk+8 is NULL
 	for(int i = 0; i < SKINNY128_384_ROUNDS; i++) {			// add rconsts
 		for(int j = 0; j < 4; j++)

--- a/romulus/Implementations/crypto_aead/romulusn1/opt32/skinny128.c
+++ b/romulus/Implementations/crypto_aead/romulusn1/opt32/skinny128.c
@@ -92,19 +92,7 @@ void skinny128_384(u8* ctext, const u8* ptext, const u32* rtk1,  const u32* rtk2
 	u32 tmp; 					// used in SWAPMOVE macro
 	u32 state[4]; 				// 128-bit state
 	packing(state, ptext); 		// from byte to bitsliced representation
-	QUADRUPLE_ROUND(state, rtk1, 	rtk2_3);
-	QUADRUPLE_ROUND(state, rtk1+16, rtk2_3+16);
-	QUADRUPLE_ROUND(state, rtk1+32, rtk2_3+32);
-	QUADRUPLE_ROUND(state, rtk1+48, rtk2_3+48);
-	QUADRUPLE_ROUND(state, rtk1, 	rtk2_3+64);
-	QUADRUPLE_ROUND(state, rtk1+16, rtk2_3+80);
-	QUADRUPLE_ROUND(state, rtk1+32, rtk2_3+96);
-	QUADRUPLE_ROUND(state, rtk1+48, rtk2_3+112);
-	QUADRUPLE_ROUND(state, rtk1, 	rtk2_3+128);
-	QUADRUPLE_ROUND(state, rtk1+16, rtk2_3+144);
-	QUADRUPLE_ROUND(state, rtk1+32, rtk2_3+160);
-	QUADRUPLE_ROUND(state, rtk1+48, rtk2_3+176);
-	QUADRUPLE_ROUND(state, rtk1, 	rtk2_3+192);
-	QUADRUPLE_ROUND(state, rtk1+16, rtk2_3+208);
+	for(int i = 0; i < 14; i++)
+		QUADRUPLE_ROUND(state, rtk1 + (i%4)*16, rtk2_3 + i*16);
 	unpacking(ctext, state);	// from bitsliced to byte representation
 }
\ No newline at end of file
--- a/romulus/Implementations/crypto_aead/romulusn1/opt32/tk_schedule.c
+++ b/romulus/Implementations/crypto_aead/romulusn1/opt32/tk_schedule.c
@@ -271,6 +271,70 @@ void precompute_lfsr_tk3(u32* tk, const u8* key, const int rounds) {
 }

 /******************************************************************************
+* Precompute LFSR2(TK2) ^ LFSR3(TK3) for all round tweakeys.
+* It is equivalent to the following 2 function calls:
+* - precompute_lfsr_tk2(tk, t2, SKINNY128_384_ROUNDS);
+* - precompute_lfsr_tk3(tk, t3, SKINNY128_384_ROUNDS);
+* However, 'precompute_lfsr_tk2_3' can save cycles on some platforms.
+* On ARMv7 one should observe a gain of ~1k cycles per function call, which is
+* explained by the fact that fewer memory accesses to 'tk' are performed.
+* 
+* To save some code size, the loop can be replaced by the following one:
+*	for(int i = 0 ; i < rounds; i+=2) {
+*		lfsr2_bs(tk2);
+*		lfsr3_bs(tk3);
+*		tk[i*4+4] = tk2[0] ^ tk3[0];
+*		tk[i*4+5] = tk2[1] ^ tk3[1];
+*		tk[i*4+6] = tk2[2] ^ tk3[2];
+*		tk[i*4+7] = tk2[3] ^ tk3[3];
+*	}
+* at the cost of some cycles (~260 on ARM Cortex-M).
+******************************************************************************/
+void precompute_lfsr_tk2_3(u32* tk, const u8* t2, const u8* t3, const int rounds) {
+	u32 tmp, tk2[4], tk3[4];	// NOTE(review): tmp is not referenced directly here; presumably used inside packing() - confirm
+	packing(tk2, t2);	// tk2 <- packed form of t2
+	packing(tk3, t3);	// tk3 <- packed form of t3
+	tk[0] = tk2[0] ^ tk3[0];	// words 0..3: XOR of the packed inputs, before any update step
+	tk[1] = tk2[1] ^ tk3[1];
+	tk[2] = tk2[2] ^ tk3[2];
+	tk[3] = tk2[3] ^ tk3[3];
+	for(int i = 0 ; i < rounds; i+=8) {	// each pass stores four 4-word groups at i*4+4, +12, +20 and +28
+		tk2[0] ^= (tk2[2] & 0xaaaaaaaa);	// 1st update step: only tk2[0] and tk3[3] are modified
+		tk2[0] = ((tk2[0] & 0xaaaaaaaa) >> 1) | ((tk2[0] << 1) & 0xaaaaaaaa);	// swap adjacent even/odd bits of tk2[0]
+		tk3[3] ^= ((tk3[1] & 0xaaaaaaaa) >> 1);
+		tk3[3] = ((tk3[3] & 0xaaaaaaaa) >> 1) | ((tk3[3] << 1) & 0xaaaaaaaa);	// swap adjacent even/odd bits of tk3[3]
+		tk[i*4+4] = tk2[1] ^ tk3[3];	// stored word j = tk2[(j+1)%4] ^ tk3[(j+3)%4]
+		tk[i*4+5] = tk2[2] ^ tk3[0];
+		tk[i*4+6] = tk2[3] ^ tk3[1];
+		tk[i*4+7] = tk2[0] ^ tk3[2];
+		tk2[1] ^= (tk2[3] & 0xaaaaaaaa);	// 2nd update step: only tk2[1] and tk3[2] are modified
+		tk2[1] = ((tk2[1] & 0xaaaaaaaa) >> 1) | ((tk2[1] << 1) & 0xaaaaaaaa);
+		tk3[2] ^= ((tk3[0] & 0xaaaaaaaa) >> 1);
+		tk3[2] = ((tk3[2] & 0xaaaaaaaa) >> 1) | ((tk3[2] << 1) & 0xaaaaaaaa);
+		tk[i*4+12] = tk2[2] ^ tk3[2];	// stored word j = tk2[(j+2)%4] ^ tk3[(j+2)%4]; words i*4+8..11 are not written here
+		tk[i*4+13] = tk2[3] ^ tk3[3];
+		tk[i*4+14] = tk2[0] ^ tk3[0];
+		tk[i*4+15] = tk2[1] ^ tk3[1];
+		tk2[2] ^= (tk2[0] & 0xaaaaaaaa);	// 3rd update step: only tk2[2] and tk3[1] are modified
+		tk2[2] = ((tk2[2] & 0xaaaaaaaa) >> 1) | ((tk2[2] << 1) & 0xaaaaaaaa);
+		tk3[1] ^= ((tk3[3] & 0xaaaaaaaa) >> 1);
+		tk3[1] = ((tk3[1] & 0xaaaaaaaa) >> 1) | ((tk3[1] << 1) & 0xaaaaaaaa);
+		tk[i*4+20] = tk2[3] ^ tk3[1];	// stored word j = tk2[(j+3)%4] ^ tk3[(j+1)%4]; words i*4+16..19 are not written here
+		tk[i*4+21] = tk2[0] ^ tk3[2];
+		tk[i*4+22] = tk2[1] ^ tk3[3];
+		tk[i*4+23] = tk2[2] ^ tk3[0];
+		tk2[3] ^= (tk2[1] & 0xaaaaaaaa);	// 4th update step: only tk2[3] and tk3[0] are modified
+		tk2[3] = ((tk2[3] & 0xaaaaaaaa) >> 1) | ((tk2[3] << 1) & 0xaaaaaaaa);
+		tk3[0] ^= ((tk3[2] & 0xaaaaaaaa) >> 1);
+		tk3[0] = ((tk3[0] & 0xaaaaaaaa) >> 1) | ((tk3[0] << 1) & 0xaaaaaaaa);
+		tk[i*4+28] = tk2[0] ^ tk3[0];	// stored word j = tk2[j] ^ tk3[j]; words i*4+24..27 are not written here
+		tk[i*4+29] = tk2[1] ^ tk3[1];
+		tk[i*4+30] = tk2[2] ^ tk3[2];
+		tk[i*4+31] = tk2[3] ^ tk3[3];
+	}
+}
+
+/******************************************************************************
 * XOR TK with TK1 before applying the permutations.
 * The key is then rearranged to match the barrel shiftrows representation.
 ******************************************************************************/
@@ -278,19 +342,20 @@ void permute_tk(u32* tk, const u8* key, const int rounds) {
 	u32 test;
 	u32 tk1[4], tmp[4];
 	packing(tk1, key);
-	memcpy(tmp, tk, 16);
-	tmp[0] ^= tk1[0];
-	tmp[1] ^= tk1[1];
-	tmp[2] ^= tk1[2];
-	tmp[3] ^= tk1[3];
+	tmp[0] = tk[0] ^ tk1[0];
+	tmp[1] = tk[1] ^ tk1[1];
+	tmp[2] = tk[2] ^ tk1[2];
+	tmp[3] = tk[3] ^ tk1[3];
 	for(int i = 0 ; i < rounds; i += 8) {
 		test = (i % 16 < 8) ? 1 : 0; 			//to apply the right power of P
 		tk[i*4] = tmp[2] & 0xf0f0f0f0;
 		tk[i*4+1] = tmp[3] & 0xf0f0f0f0;
 		tk[i*4+2] = tmp[0] & 0xf0f0f0f0;
 		tk[i*4+3] = tmp[1] & 0xf0f0f0f0;
-		memcpy(tmp, tk+i*4+4, 16);
-		XOR_BLOCKS(tmp, tk1);
+		tmp[0] = tk[i*4+4] ^ tk1[0];
+		tmp[1] = tk[i*4+5] ^ tk1[1];
+		tmp[2] = tk[i*4+6] ^ tk1[2];
+		tmp[3] = tk[i*4+7] ^ tk1[3];
 		if (test)
 			permute_tk_2(tmp); 					// applies P^2
 		else
@@ -307,8 +372,10 @@ void permute_tk(u32* tk, const u8* key, const int rounds) {
 		tk[i*4+10] |= ROR(tmp[0],12) & 0x0c0c0c0c;
 		tk[i*4+11] = ROR(tmp[1],28) & 0x03030303;
 		tk[i*4+11] |= ROR(tmp[1],12) & 0x0c0c0c0c;
-		memcpy(tmp, tk+i*4+12, 16);
-		XOR_BLOCKS(tmp, tk1);
+		tmp[0] = tk[i*4+12] ^ tk1[0];
+		tmp[1] = tk[i*4+13] ^ tk1[1];
+		tmp[2] = tk[i*4+14] ^ tk1[2];
+		tmp[3] = tk[i*4+15] ^ tk1[3];
 		if (test)
 			permute_tk_4(tmp); 					// applies P^4
 		else
@@ -321,8 +388,10 @@ void permute_tk(u32* tk, const u8* key, const int rounds) {
 		tk[i*4+17] = ROR(tmp[3], 16) & 0xf0f0f0f0;
 		tk[i*4+18] = ROR(tmp[0], 16) & 0xf0f0f0f0;
 		tk[i*4+19] = ROR(tmp[1], 16) & 0xf0f0f0f0;
-		memcpy(tmp, tk+i*4+20, 16);
-		XOR_BLOCKS(tmp, tk1);
+		tmp[0] = tk[i*4+20] ^ tk1[0];
+		tmp[1] = tk[i*4+21] ^ tk1[1];
+		tmp[2] = tk[i*4+22] ^ tk1[2];
+		tmp[3] = tk[i*4+23] ^ tk1[3];
 		if (test)
 			permute_tk_6(tmp); 					//	applies P^6
 		else
@@ -339,8 +408,10 @@ void permute_tk(u32* tk, const u8* key, const int rounds) {
 		tk[i*4+26] |= ROR(tmp[0],28) & 0x0c0c0c0c;
 		tk[i*4+27] = ROR(tmp[1],12) & 0x03030303;
 		tk[i*4+27] |= ROR(tmp[1],28) & 0x0c0c0c0c;
-		memcpy(tmp, tk+i*4+28, 16);
-		XOR_BLOCKS(tmp, tk1);
+		tmp[0] = tk[i*4+28] ^ tk1[0];
+		tmp[1] = tk[i*4+29] ^ tk1[1];
+		tmp[2] = tk[i*4+30] ^ tk1[2];
+		tmp[3] = tk[i*4+31] ^ tk1[3];
 		if (test)
 			permute_tk_8(tmp); 					// applies P^8
 		for(int j = 0; j < 4; j++) {
@@ -361,8 +432,7 @@ void permute_tk(u32* tk, const u8* key, const int rounds) {
 ******************************************************************************/
 void precompute_rtk2_3(u32* rtk, const u8* tk2, const u8 * tk3) {
 	memset(rtk, 0x00, 16*SKINNY128_384_ROUNDS);
-	precompute_lfsr_tk2(rtk, tk2, SKINNY128_384_ROUNDS);
-	precompute_lfsr_tk3(rtk, tk3, SKINNY128_384_ROUNDS);
+	precompute_lfsr_tk2_3(rtk, tk2, tk3, SKINNY128_384_ROUNDS);
 	permute_tk(rtk, (u8*)(rtk+8), SKINNY128_384_ROUNDS);	// rtk+8 is NULL
 	for(int i = 0; i < SKINNY128_384_ROUNDS; i++) {			// add rconsts
 		for(int j = 0; j < 4; j++)

--- a/skinny/Implementations/crypto_aead/skinnyaeadtk3128128+v1/opt32_1/encrypt.c
+++ b/skinny/Implementations/crypto_aead/skinnyaeadtk3128128+v1/opt32_1/encrypt.c
@@ -8,12 +8,10 @@
 * @author   Alexandre Adomnicai, Nanyang Technological University,
 *           alexandre.adomnicai@ntu.edu.sg
 *
-* @date     May 2020
+* @date     June 2020
 ******************************************************************************/
-#include "skinny128.h"
 #include "skinnyaead.h"
 #include <string.h>
-#include <stdio.h>

 /******************************************************************************
 * x ^= y where x, y are 128-bit blocks (16 bytes array).

--- a/skinny/Implementations/crypto_aead/skinnyaeadtk3128128+v1/opt32_1/skinny128.c
+++ b/skinny/Implementations/crypto_aead/skinnyaeadtk3128128+v1/opt32_1/skinny128.c
@@ -16,12 +16,9 @@
 * @author	Alexandre Adomnicai, Nanyang Technological University,
 *			alexandre.adomnicai@ntu.edu.sg
 *
-* @date		May 2020
+* @date		June 2020
 ******************************************************************************/
-#include <stdio.h>
-#include <string.h>
 #include "skinny128.h"
-#include "tk_schedule.h"

 /******************************************************************************
 * The MixColumns computation for rounds i such that (i % 4) == 0
@@ -153,16 +150,8 @@ void skinny128_384_plus_encrypt(u8* ctext, const u8* ptext, const u32* rtk1,
 	u32 tmp; 					// used in SWAPMOVE macro
 	u32 state[4]; 				// 128-bit state
 	packing(state, ptext); 		// from byte to bitsliced representation
-	QUADRUPLE_ROUND(state, rtk1, 	rtk2_3);
-	QUADRUPLE_ROUND(state, rtk1+16, rtk2_3+16);
-	QUADRUPLE_ROUND(state, rtk1+32, rtk2_3+32);
-	QUADRUPLE_ROUND(state, rtk1+48, rtk2_3+48);
-	QUADRUPLE_ROUND(state, rtk1, 	rtk2_3+64);
-	QUADRUPLE_ROUND(state, rtk1+16, rtk2_3+80);
-	QUADRUPLE_ROUND(state, rtk1+32, rtk2_3+96);
-	QUADRUPLE_ROUND(state, rtk1+48, rtk2_3+112);
-	QUADRUPLE_ROUND(state, rtk1, 	rtk2_3+128);
-	QUADRUPLE_ROUND(state, rtk1+16, rtk2_3+144);
+	for(int i = 0; i < 10; i++)
+		QUADRUPLE_ROUND(state, rtk1 + (i%4)*16, rtk2_3 + i*16);
 	unpacking(ctext, state);	// from bitsliced to byte representation
 }

@@ -176,15 +165,7 @@ void skinny128_384_plus_decrypt(u8* ctext, const u8* ptext, const u32* rtk1,
 	u32 tmp; 					// used in SWAPMOVE macro
 	u32 state[4]; 				// 128-bit state
 	packing(state, ptext); 		// from byte to bitsliced representation
-	INV_QUADRUPLE_ROUND(state, rtk1+16, rtk2_3+144);
-	INV_QUADRUPLE_ROUND(state, rtk1, 	rtk2_3+128);
-	INV_QUADRUPLE_ROUND(state, rtk1+48, rtk2_3+112);
-	INV_QUADRUPLE_ROUND(state, rtk1+32, rtk2_3+96);
-	INV_QUADRUPLE_ROUND(state, rtk1+16, rtk2_3+80);
-	INV_QUADRUPLE_ROUND(state, rtk1, 	rtk2_3+64);
-	INV_QUADRUPLE_ROUND(state, rtk1+48, rtk2_3+48);
-	INV_QUADRUPLE_ROUND(state, rtk1+32, rtk2_3+32);
-	INV_QUADRUPLE_ROUND(state, rtk1+16, rtk2_3+16);
-	INV_QUADRUPLE_ROUND(state, rtk1, 	rtk2_3);
+	for(int i = 9; i >= 0; i--)
+		INV_QUADRUPLE_ROUND(state, rtk1 + (i%4)*16, rtk2_3 + i*16);
 	unpacking(ctext, state);	// from bitsliced to byte representation
 }
\ No newline at end of file
--- a/skinny/Implementations/crypto_aead/skinnyaeadtk3128128+v1/opt32_1/skinnyaead.h
+++ b/skinny/Implementations/crypto_aead/skinnyaeadtk3128128+v1/opt32_1/skinnyaead.h
@@ -3,9 +3,7 @@

 #include "skinny128.h"

-typedef unsigned char u8;
-typedef unsigned int u32;
-typedef unsigned long long u64;
+typedef uint64_t    u64;

 #define TAGBYTES    16
 #define KEYBYTES    16

--- a/skinny/Implementations/crypto_aead/skinnyaeadtk3128128+v1/opt32_1/tk_schedule.c
+++ b/skinny/Implementations/crypto_aead/skinnyaeadtk3128128+v1/opt32_1/tk_schedule.c
@@ -4,16 +4,11 @@
 * @author	Alexandre Adomnicai, Nanyang Technological University,
 *			alexandre.adomnicai@ntu.edu.sg
 *
-* @date		May 2020
+* @date		June 2020
 ******************************************************************************/
-#include <stdio.h>
-#include <string.h> 		//for memcmp
-#include "tk_schedule.h"
+#include <string.h>
 #include "skinny128.h"

-typedef unsigned char u8;
-typedef unsigned int u32;
-
 /******************************************************************************
 * The round constants according to the new representation.
 ******************************************************************************/
@@ -260,6 +255,70 @@ void precompute_lfsr_tk3(u32* tk, const u8* key, const int rounds) {
 }

 /******************************************************************************
+* Precompute LFSR2(TK2) ^ LFSR3(TK3) for all round tweakeys.
+* It is equivalent to the following 2 function calls:
+* - precompute_lfsr_tk2(tk, t2, SKINNY128_384_ROUNDS);
+* - precompute_lfsr_tk3(tk, t3, SKINNY128_384_ROUNDS);
+* However, 'precompute_lfsr_tk2_3' can save cycles on some platforms.
+* On ARMv7 one should observe a gain of ~1k cycles per function call. This is
+* explained by the fact that fewer memory accesses to 'tk' are performed.
+* 
+* To save some code size, the loop can be replaced by the following one:
+*	for(int i = 0 ; i < rounds; i+=2) {
+*		lfsr2_bs(tk2);
+*		lfsr3_bs(tk3);
+*		tk[i*4+4] = tk2[0] ^ tk3[0];
+*		tk[i*4+5] = tk2[1] ^ tk3[1];
+*		tk[i*4+6] = tk2[2] ^ tk3[2];
+*		tk[i*4+7] = tk2[3] ^ tk3[3];
+*	}
+* at the cost of some cycles (~260 on ARM Cortex-M).
+******************************************************************************/
+void precompute_lfsr_tk2_3(u32* tk, const u8* t2, const u8* t3, const int rounds) {
+	u32 tk2[4], tk3[4];
+	packing(tk2, t2);
+	packing(tk3, t3);
+	tk[0] = tk2[0] ^ tk3[0];
+	tk[1] = tk2[1] ^ tk3[1];
+	tk[2] = tk2[2] ^ tk3[2];
+	tk[3] = tk2[3] ^ tk3[3];
+	for(int i = 0 ; i < rounds; i+=8) {
+		tk2[0] ^= (tk2[2] & 0xaaaaaaaa);
+		tk2[0] = ((tk2[0] & 0xaaaaaaaa) >> 1) | ((tk2[0] << 1) & 0xaaaaaaaa);
+		tk3[3] ^= ((tk3[1] & 0xaaaaaaaa) >> 1);
+		tk3[3] = ((tk3[3] & 0xaaaaaaaa) >> 1) | ((tk3[3] << 1) & 0xaaaaaaaa);
+		tk[i*4+4] = tk2[1] ^ tk3[3];
+		tk[i*4+5] = tk2[2] ^ tk3[0];
+		tk[i*4+6] = tk2[3] ^ tk3[1];
+		tk[i*4+7] = tk2[0] ^ tk3[2];
+		tk2[1] ^= (tk2[3] & 0xaaaaaaaa);
+		tk2[1] = ((tk2[1] & 0xaaaaaaaa) >> 1) | ((tk2[1] << 1) & 0xaaaaaaaa);
+		tk3[2] ^= ((tk3[0] & 0xaaaaaaaa) >> 1);
+		tk3[2] = ((tk3[2] & 0xaaaaaaaa) >> 1) | ((tk3[2] << 1) & 0xaaaaaaaa);
+		tk[i*4+12] = tk2[2] ^ tk3[2];
+		tk[i*4+13] = tk2[3] ^ tk3[3];
+		tk[i*4+14] = tk2[0] ^ tk3[0];
+		tk[i*4+15] = tk2[1] ^ tk3[1];
+		tk2[2] ^= (tk2[0] & 0xaaaaaaaa);
+		tk2[2] = ((tk2[2] & 0xaaaaaaaa) >> 1) | ((tk2[2] << 1) & 0xaaaaaaaa);
+		tk3[1] ^= ((tk3[3] & 0xaaaaaaaa) >> 1);
+		tk3[1] = ((tk3[1] & 0xaaaaaaaa) >> 1) | ((tk3[1] << 1) & 0xaaaaaaaa);
+		tk[i*4+20] = tk2[3] ^ tk3[1];
+		tk[i*4+21] = tk2[0] ^ tk3[2];
+		tk[i*4+22] = tk2[1] ^ tk3[3];
+		tk[i*4+23] = tk2[2] ^ tk3[0];
+		tk2[3] ^= (tk2[1] & 0xaaaaaaaa);
+		tk2[3] = ((tk2[3] & 0xaaaaaaaa) >> 1) | ((tk2[3] << 1) & 0xaaaaaaaa);
+		tk3[0] ^= ((tk3[2] & 0xaaaaaaaa) >> 1);
+		tk3[0] = ((tk3[0] & 0xaaaaaaaa) >> 1) | ((tk3[0] << 1) & 0xaaaaaaaa);
+		tk[i*4+28] = tk2[0] ^ tk3[0];
+		tk[i*4+29] = tk2[1] ^ tk3[1];
+		tk[i*4+30] = tk2[2] ^ tk3[2];
+		tk[i*4+31] = tk2[3] ^ tk3[3];
+	}
+}
+
+/******************************************************************************
 * XOR TK with TK1 before applying the permutations.
 * The key is then rearranged to match the barrel shiftrows representation.
 ******************************************************************************/
@@ -267,19 +326,20 @@ void permute_tk(u32* tk, const u8* key, const int rounds) {
 	u32 test;
 	u32 tk1[4], tmp[4];
 	packing(tk1, key);
-	memcpy(tmp, tk, 16);
-	tmp[0] ^= tk1[0];
-	tmp[1] ^= tk1[1];
-	tmp[2] ^= tk1[2];
-	tmp[3] ^= tk1[3];
+	tmp[0] = tk[0] ^ tk1[0];
+	tmp[1] = tk[1] ^ tk1[1];
+	tmp[2] = tk[2] ^ tk1[2];
+	tmp[3] = tk[3] ^ tk1[3];
 	for(int i = 0 ; i < rounds; i += 8) {
 		test = (i % 16 < 8) ? 1 : 0; 			//to apply the right power of P
 		tk[i*4] = tmp[2] & 0xf0f0f0f0;
 		tk[i*4+1] = tmp[3] & 0xf0f0f0f0;
 		tk[i*4+2] = tmp[0] & 0xf0f0f0f0;
 		tk[i*4+3] = tmp[1] & 0xf0f0f0f0;
-		memcpy(tmp, tk+i*4+4, 16);
-		XOR_BLOCKS(tmp, tk1);
+		tmp[0] = tk[i*4+4] ^ tk1[0];
+		tmp[1] = tk[i*4+5] ^ tk1[1];
+		tmp[2] = tk[i*4+6] ^ tk1[2];
+		tmp[3] = tk[i*4+7] ^ tk1[3];
 		if (test)
 			permute_tk_2(tmp); 					// applies P^2
 		else
@@ -296,8 +356,10 @@ void permute_tk(u32* tk, const u8* key, const int rounds) {
 		tk[i*4+10] |= ROR(tmp[0],12) & 0x0c0c0c0c;
 		tk[i*4+11] = ROR(tmp[1],28) & 0x03030303;
 		tk[i*4+11] |= ROR(tmp[1],12) & 0x0c0c0c0c;
-		memcpy(tmp, tk+i*4+12, 16);
-		XOR_BLOCKS(tmp, tk1);
+		tmp[0] = tk[i*4+12] ^ tk1[0];
+		tmp[1] = tk[i*4+13] ^ tk1[1];
+		tmp[2] = tk[i*4+14] ^ tk1[2];
+		tmp[3] = tk[i*4+15] ^ tk1[3];
 		if (test)
 			permute_tk_4(tmp); 					// applies P^4
 		else
@@ -310,8 +372,10 @@ void permute_tk(u32* tk, const u8* key, const int rounds) {
 		tk[i*4+17] = ROR(tmp[3], 16) & 0xf0f0f0f0;
 		tk[i*4+18] = ROR(tmp[0], 16) & 0xf0f0f0f0;
 		tk[i*4+19] = ROR(tmp[1], 16) & 0xf0f0f0f0;
-		memcpy(tmp, tk+i*4+20, 16);
-		XOR_BLOCKS(tmp, tk1);
+		tmp[0] = tk[i*4+20] ^ tk1[0];
+		tmp[1] = tk[i*4+21] ^ tk1[1];
+		tmp[2] = tk[i*4+22] ^ tk1[2];
+		tmp[3] = tk[i*4+23] ^ tk1[3];
 		if (test)
 			permute_tk_6(tmp); 					//	applies P^6
 		else
@@ -328,8 +392,10 @@ void permute_tk(u32* tk, const u8* key, const int rounds) {
 		tk[i*4+26] |= ROR(tmp[0],28) & 0x0c0c0c0c;
 		tk[i*4+27] = ROR(tmp[1],12) & 0x03030303;
 		tk[i*4+27] |= ROR(tmp[1],28) & 0x0c0c0c0c;
-		memcpy(tmp, tk+i*4+28, 16);
-		XOR_BLOCKS(tmp, tk1);
+		tmp[0] = tk[i*4+28] ^ tk1[0];
+		tmp[1] = tk[i*4+29] ^ tk1[1];
+		tmp[2] = tk[i*4+30] ^ tk1[2];
+		tmp[3] = tk[i*4+31] ^ tk1[3];
 		if (test)
 			permute_tk_8(tmp); 					// applies P^8
 		for(int j = 0; j < 4; j++) {
@@ -350,8 +416,7 @@ void permute_tk(u32* tk, const u8* key, const int rounds) {
 ******************************************************************************/
 void precompute_rtk2_3(u32* rtk, const u8* tk2, const u8 * tk3) {
 	memset(rtk, 0x00, 16*SKINNY128_384_ROUNDS);
-	precompute_lfsr_tk2(rtk, tk2, SKINNY128_384_ROUNDS);
-	precompute_lfsr_tk3(rtk, tk3, SKINNY128_384_ROUNDS);
+	precompute_lfsr_tk2_3(rtk, tk2, tk3, SKINNY128_384_ROUNDS);
 	permute_tk(rtk, (u8*)(rtk+8), SKINNY128_384_ROUNDS);	// rtk+8 is NULL
 	for(int i = 0; i < SKINNY128_384_ROUNDS; i++) {			// add rconsts
 		for(int j = 0; j < 4; j++)

--- a/skinny/Implementations/crypto_aead/skinnyaeadtk3128128+v1/opt32_1/tk_schedule.h
+++ b/skinny/Implementations/crypto_aead/skinnyaeadtk3128128+v1/opt32_1/tk_schedule.h
 #ifndef TK_SCHEDULE_H_
 #define TK_SCHEDULE_H_

-typedef unsigned char u8;
-typedef unsigned int u32;
+#include <stdint.h>
+typedef uint8_t 	u8;
+typedef uint32_t 	u32;

 void packing(u32* out, const u8* in);
 void unpacking(u8* out, u32 *in);
@@ -11,13 +12,6 @@ void precompute_rtk1(u32* rtk1, const u8* tk1);

 #define ROR(x,y) 	(((x) >> (y)) | ((x) << (32 - (y))))
 	
-#define XOR_BLOCKS(x,y) ({ 			\
-	(x)[0] ^= (y)[0];				\
-	(x)[1] ^= (y)[1];				\
-	(x)[2] ^= (y)[2];				\
-	(x)[3] ^= (y)[3];				\
-})
-	
 #define SWAPMOVE(a, b, mask, n)	({	\
 	tmp = (b ^ (a >> n)) & mask;	\
 	b ^= tmp;						\

--- a/skinny/Implementations/crypto_aead/skinnyaeadtk3128128+v1/opt32_2/encrypt.c
+++ b/skinny/Implementations/crypto_aead/skinnyaeadtk3128128+v1/opt32_2/encrypt.c
@@ -8,12 +8,10 @@
 * @author   Alexandre Adomnicai, Nanyang Technological University,
 *           alexandre.adomnicai@ntu.edu.sg
 *
-* @date     May 2020
+* @date     June 2020
 ******************************************************************************/
-#include "skinny128.h"
 #include "skinnyaead.h"
 #include <string.h>
-#include <stdio.h>

 /******************************************************************************
 * x ^= y where x, y are 128-bit blocks (16 bytes array).
@@ -33,13 +31,13 @@ static void skinny_aead_m1_auth(u8* auth, u8* c, u8* tag, tweakey* tk,
    u8 feedback;
    u8 tmp[2*BLOCKBYTES];
    memset(tmp, 0x00, 2*BLOCKBYTES);
-    memset(auth, 0x00, BLOCKBYTES);
    SET_DOMAIN(tmp, 0x02);
+    SET_DOMAIN(tmp + BLOCKBYTES, 0x02);
+    memset(auth, 0x00, BLOCKBYTES);
    while (adlen >= 2*BLOCKBYTES) {
        LE_STR_64(tmp, lfsr);
        UPDATE_LFSR(lfsr);
        LE_STR_64(tmp + BLOCKBYTES, lfsr);
-        SET_DOMAIN(tmp + BLOCKBYTES, 0x02);
        precompute_rtk1(tk->rtk1, tmp, tmp+BLOCKBYTES);
        skinny128_384_plus_encrypt(tmp, tmp+BLOCKBYTES, ad, ad+BLOCKBYTES, *tk);
        xor_block(auth, tmp);
@@ -47,6 +45,9 @@ static void skinny_aead_m1_auth(u8* auth, u8* c, u8* tag, tweakey* tk,
        adlen -= 2*BLOCKBYTES;
        ad += 2*BLOCKBYTES;
        UPDATE_LFSR(lfsr);
+        memset(tmp, 0x00, 2*BLOCKBYTES);    // to save 32 bytes of RAM
+        SET_DOMAIN(tmp, 0x02);
+        SET_DOMAIN(tmp + BLOCKBYTES, 0x02);
    }
    if (adlen > BLOCKBYTES) {               // pad and process 2 blocs in //
        LE_STR_64(tmp, lfsr);
@@ -65,11 +66,12 @@ static void skinny_aead_m1_auth(u8* auth, u8* c, u8* tag, tweakey* tk,
        LE_STR_64(tmp, lfsr);
        if (mlen == 0) {    // if tag has *NOT* been calculated yet
            precompute_rtk1(tk->rtk1, tmp, tag);    // compute the tag
-            skinny128_384_plus_encrypt(auth, c, ad, c, *tk); 
+            skinny128_384_plus_encrypt(tmp, c, ad, c, *tk); 
        } else {            // if tag has  been calculated yet
            precompute_rtk1(tk->rtk1, tmp, tmp);    // process last ad block
-            skinny128_384_plus_encrypt(auth, auth, ad, ad, *tk);
+            skinny128_384_plus_encrypt(tmp, tmp, ad, ad, *tk);
        }
+        xor_block(auth, tmp);
    } else if (adlen > 0) {
        LE_STR_64(tmp, lfsr);
        SET_DOMAIN(tmp, 0x03);                      // domain for padding ad
@@ -78,11 +80,12 @@ static void skinny_aead_m1_auth(u8* auth, u8* c, u8* tag, tweakey* tk,
        tmp[BLOCKBYTES + adlen] ^= 0x80;            // padding
        if (mlen == 0) {    // if tag has *NOT* been calculated yet
            precompute_rtk1(tk->rtk1, tmp, tag);    // compute the tag
-            skinny128_384_plus_encrypt(auth, c, tmp + BLOCKBYTES, c, *tk); 
+            skinny128_384_plus_encrypt(tmp, c, tmp + BLOCKBYTES, c, *tk); 
        } else {            // if tag has been calculated yet
            precompute_rtk1(tk->rtk1, tmp,  tmp);   // process last ad block
-            skinny128_384_plus_encrypt(auth, auth, tmp + BLOCKBYTES, tmp + BLOCKBYTES, *tk);
+            skinny128_384_plus_encrypt(tmp, tmp, tmp + BLOCKBYTES, tmp + BLOCKBYTES, *tk);
        }
+        xor_block(auth, tmp);
    }
 }


--- a/skinny/Implementations/crypto_aead/skinnyaeadtk3128128+v1/opt32_2/skinny128.c
+++ b/skinny/Implementations/crypto_aead/skinnyaeadtk3128128+v1/opt32_2/skinny128.c
@@ -16,12 +16,9 @@
 * @author	Alexandre Adomnicai, Nanyang Technological University,
 *			alexandre.adomnicai@ntu.edu.sg
 *
-* @date		May 2020
+* @date		June 2020
 ******************************************************************************/
-#include <stdio.h>
-#include <string.h>
 #include "skinny128.h"
-#include "tk_schedule.h"

 /****************************************************************************
 * The MixColumns operation for rounds i such that (i % 4) == 0.
@@ -84,7 +81,7 @@ void mixcolumns_3(u32* state) {
 }

 /****************************************************************************
-* The inverse MixColumns oepration for rounds i such that (i % 4) == 0
+* The inverse MixColumns operation for rounds i such that (i % 4) == 0
 ****************************************************************************/
 void inv_mixcolumns_0(u32* state) {
 	u32 tmp;
@@ -99,7 +96,7 @@ void inv_mixcolumns_0(u32* state) {
 }

 /****************************************************************************
-* The inverse MixColumns oepration for rounds i such that (i % 4) == 1
+* The inverse MixColumns operation for rounds i such that (i % 4) == 1
 ****************************************************************************/
 void inv_mixcolumns_1(u32* state) {
 	u32 tmp;
@@ -114,7 +111,7 @@ void inv_mixcolumns_1(u32* state) {
 }

 /****************************************************************************
-* The inverse MixColumns oepration for rounds i such that (i % 4) == 2
+* The inverse MixColumns operation for rounds i such that (i % 4) == 2
 ****************************************************************************/
 void inv_mixcolumns_2(u32* state) {
 	u32 tmp;
@@ -129,7 +126,7 @@ void inv_mixcolumns_2(u32* state) {
 }

 /****************************************************************************
-* The inverse MixColumns oepration for rounds i such that (i % 4) == 3
+* The inverse MixColumns operation for rounds i such that (i % 4) == 3
 ****************************************************************************/
 void inv_mixcolumns_3(u32* state) {
 	u32 tmp;
@@ -166,16 +163,8 @@ void skinny128_384_plus_encrypt(u8* ctext, u8* ctext_bis, const u8* ptext,
 					const u8* ptext_bis, const tweakey tk) {
 	u32 state[8];
 	packing(state, ptext, ptext_bis);
-	QUADRUPLE_ROUND(state, tk.rtk1, 	tk.rtk2_3);
-	QUADRUPLE_ROUND(state, tk.rtk1+32, 	tk.rtk2_3+32);
-	QUADRUPLE_ROUND(state, tk.rtk1+64, 	tk.rtk2_3+64);
-	QUADRUPLE_ROUND(state, tk.rtk1+96, 	tk.rtk2_3+96);
-	QUADRUPLE_ROUND(state, tk.rtk1, 	tk.rtk2_3+128);
-	QUADRUPLE_ROUND(state, tk.rtk1+32, 	tk.rtk2_3+160);
-	QUADRUPLE_ROUND(state, tk.rtk1+64, 	tk.rtk2_3+192);
-	QUADRUPLE_ROUND(state, tk.rtk1+96, 	tk.rtk2_3+224);
-	QUADRUPLE_ROUND(state, tk.rtk1, 	tk.rtk2_3+256);
-	QUADRUPLE_ROUND(state, tk.rtk1+32, 	tk.rtk2_3+288);
+	for(int i = 0; i < 10; i++)
+		QUADRUPLE_ROUND(state, tk.rtk1 + (i%4)*32, tk.rtk2_3 + i*32);
 	unpacking(ctext, ctext_bis, state);
 }

@@ -188,15 +177,7 @@ void skinny128_384_plus_decrypt(u8* ptext, u8* ptext_bis, const u8* ctext,
 					const u8* ctext_bis, const tweakey tk) {
 	u32 state[8];
 	packing(state, ctext, ctext_bis);
-	INV_QUADRUPLE_ROUND(state, tk.rtk1+32, 	tk.rtk2_3+288);
-	INV_QUADRUPLE_ROUND(state, tk.rtk1, 	tk.rtk2_3+256);
-	INV_QUADRUPLE_ROUND(state, tk.rtk1+96, 	tk.rtk2_3+224);
-	INV_QUADRUPLE_ROUND(state, tk.rtk1+64, 	tk.rtk2_3+192);
-	INV_QUADRUPLE_ROUND(state, tk.rtk1+32, 	tk.rtk2_3+160);
-	INV_QUADRUPLE_ROUND(state, tk.rtk1, 	tk.rtk2_3+128);
-	INV_QUADRUPLE_ROUND(state, tk.rtk1+96, 	tk.rtk2_3+96);
-	INV_QUADRUPLE_ROUND(state, tk.rtk1+64, 	tk.rtk2_3+64);
-	INV_QUADRUPLE_ROUND(state, tk.rtk1+32, 	tk.rtk2_3+32);
-	INV_QUADRUPLE_ROUND(state, tk.rtk1, 	tk.rtk2_3);
+	for(int i = 9; i >= 0; i--)
+		INV_QUADRUPLE_ROUND(state, tk.rtk1 + (i%4)*32, tk.rtk2_3 + i*32);
 	unpacking(ptext, ptext_bis, state);
 }
--- a/skinny/Implementations/crypto_aead/skinnyaeadtk3128128+v1/opt32_2/skinny128.h
+++ b/skinny/Implementations/crypto_aead/skinnyaeadtk3128128+v1/opt32_2/skinny128.h
 #ifndef SKINNY128_H_
 #define SKINNY128_H_
+
 #include "tk_schedule.h"

 void skinny128_384_plus_encrypt(u8* ctext, u8* ctext_bis, const u8* ptext, 

--- a/skinny/Implementations/crypto_aead/skinnyaeadtk3128128+v1/opt32_2/skinnyaead.h
+++ b/skinny/Implementations/crypto_aead/skinnyaeadtk3128128+v1/opt32_2/skinnyaead.h
@@ -3,9 +3,7 @@

 #include "skinny128.h"

-typedef unsigned char u8;
-typedef unsigned int u32;
-typedef unsigned long long u64;
+typedef uint64_t    u64;

 #define TAGBYTES    16
 #define KEYBYTES    16

--- a/skinny/Implementations/crypto_aead/skinnyaeadtk3128128+v1/opt32_2/tk_schedule.c
+++ b/skinny/Implementations/crypto_aead/skinnyaeadtk3128128+v1/opt32_2/tk_schedule.c
@@ -7,15 +7,11 @@
 * @author	Alexandre Adomnicai, Nanyang Technological University,
 *			alexandre.adomnicai@ntu.edu.sg
 *
-* @date		May 2020
+* @date		June 2020
 *******************************************************************************/
-#include <stdio.h>
 #include <string.h>
 #include "tk_schedule.h"

-typedef unsigned char u8;
-typedef unsigned int u32;
-
 /****************************************************************************
 * The round constants according to the fixsliced representation.
 ****************************************************************************/

--- a/skinny/Implementations/crypto_aead/skinnyaeadtk3128128+v1/opt32_2/tk_schedule.h
+++ b/skinny/Implementations/crypto_aead/skinnyaeadtk3128128+v1/opt32_2/tk_schedule.h
 #ifndef TK_SCHEDULE_BS_H_
 #define TK_SCHEDULE_BS_H_

-typedef unsigned char u8;
-typedef unsigned int u32;
+#include <stdint.h>
+
+typedef uint8_t 	u8;
+typedef uint32_t 	u32;

 typedef struct {
 	u32 rtk1[8*16];

--- a/skinny/Implementations/crypto_aead/skinnyaeadtk3128128v1/opt32_1/encrypt.c
+++ b/skinny/Implementations/crypto_aead/skinnyaeadtk3128128v1/opt32_1/encrypt.c
 /******************************************************************************
-* Constant-time implementation of SKINNY-AEAD-M1 (v1.1).
-*
-* Two blocks are treated in parallel with SKINNY-128-384 whenever possible.
+* Constant-time implementation of SKINNY-AEAD-M1(v1).
 *
 * For more details, see the paper at: https://
 *
 * @author   Alexandre Adomnicai, Nanyang Technological University,
 *           alexandre.adomnicai@ntu.edu.sg
 *
-* @date     May 2020
+* @date     June 2020
 ******************************************************************************/
-#include "skinny128.h"
 #include "skinnyaead.h"
 #include <string.h>
-#include <stdio.h>

 /******************************************************************************
 * x ^= y where x, y are 128-bit blocks (16 bytes array).
@@ -75,12 +71,6 @@ int crypto_aead_encrypt (unsigned char *c, unsigned long long *clen,
    }
    LE_STR_64(tmp, lfsr);               // lfsr for tag computation                                  
    precompute_rtk1(rtk1, tmp);
-    for(int i = 0; i < 16; i++) {
-        printf("%08x %08x %08x %08x\n",rtk1[i*4], rtk1[i*4+1],rtk1[i*4+2],rtk1[i*4+3]);
-    }
-    for(int i = 0; i < 56; i++) {
-        printf("%08x %08x %08x %08x\n",rtk2_3[i*4], rtk2_3[i*4+1],rtk2_3[i*4+2],rtk2_3[i*4+3]);
-    }
    skinny128_384_encrypt(c, c, rtk1, rtk2_3); // compute the tag
    // ----------------- Process the plaintext -----------------


--- a/skinny/Implementations/crypto_aead/skinnyaeadtk3128128v1/opt32_1/skinny128.c
+++ b/skinny/Implementations/crypto_aead/skinnyaeadtk3128128v1/opt32_1/skinny128.c
@@ -16,12 +16,9 @@
 * @author	Alexandre Adomnicai, Nanyang Technological University,
 *			alexandre.adomnicai@ntu.edu.sg
 *
-* @date		May 2020
+* @date		June 2020
 ******************************************************************************/
-#include <stdio.h>
-#include <string.h>
 #include "skinny128.h"
-#include "tk_schedule.h"

 /******************************************************************************
 * The MixColumns computation for rounds i such that (i % 4) == 0
@@ -153,20 +150,8 @@ void skinny128_384_encrypt(u8* ctext, const u8* ptext, const u32* rtk1,
 	u32 tmp; 					// used in SWAPMOVE macro
 	u32 state[4]; 				// 128-bit state
 	packing(state, ptext); 		// from byte to bitsliced representation
-	QUADRUPLE_ROUND(state, rtk1, 	rtk2_3);
-	QUADRUPLE_ROUND(state, rtk1+16, rtk2_3+16);
-	QUADRUPLE_ROUND(state, rtk1+32, rtk2_3+32);
-	QUADRUPLE_ROUND(state, rtk1+48, rtk2_3+48);
-	QUADRUPLE_ROUND(state, rtk1, 	rtk2_3+64);
-	QUADRUPLE_ROUND(state, rtk1+16, rtk2_3+80);
-	QUADRUPLE_ROUND(state, rtk1+32, rtk2_3+96);
-	QUADRUPLE_ROUND(state, rtk1+48, rtk2_3+112);
-	QUADRUPLE_ROUND(state, rtk1, 	rtk2_3+128);
-	QUADRUPLE_ROUND(state, rtk1+16, rtk2_3+144);
-	QUADRUPLE_ROUND(state, rtk1+32, rtk2_3+160);
-	QUADRUPLE_ROUND(state, rtk1+48, rtk2_3+176);
-	QUADRUPLE_ROUND(state, rtk1, 	rtk2_3+192);
-	QUADRUPLE_ROUND(state, rtk1+16, rtk2_3+208);
+	for(int i = 0; i < 14; i++)
+		QUADRUPLE_ROUND(state, rtk1 + (i%4)*16, rtk2_3 + i*16);
 	unpacking(ctext, state);	// from bitsliced to byte representation
 }

@@ -180,19 +165,7 @@ void skinny128_384_decrypt(u8* ctext, const u8* ptext, const u32* rtk1,
 	u32 tmp; 					// used in SWAPMOVE macro
 	u32 state[4]; 				// 128-bit state
 	packing(state, ptext); 		// from byte to bitsliced representation
-	INV_QUADRUPLE_ROUND(state, rtk1+16, rtk2_3+208);
-	INV_QUADRUPLE_ROUND(state, rtk1, 	rtk2_3+192);
-	INV_QUADRUPLE_ROUND(state, rtk1+48, rtk2_3+176);
-	INV_QUADRUPLE_ROUND(state, rtk1+32, rtk2_3+160);
-	INV_QUADRUPLE_ROUND(state, rtk1+16, rtk2_3+144);
-	INV_QUADRUPLE_ROUND(state, rtk1, 	rtk2_3+128);
-	INV_QUADRUPLE_ROUND(state, rtk1+48, rtk2_3+112);
-	INV_QUADRUPLE_ROUND(state, rtk1+32, rtk2_3+96);
-	INV_QUADRUPLE_ROUND(state, rtk1+16, rtk2_3+80);
-	INV_QUADRUPLE_ROUND(state, rtk1, 	rtk2_3+64);
-	INV_QUADRUPLE_ROUND(state, rtk1+48, rtk2_3+48);
-	INV_QUADRUPLE_ROUND(state, rtk1+32, rtk2_3+32);
-	INV_QUADRUPLE_ROUND(state, rtk1+16, rtk2_3+16);
-	INV_QUADRUPLE_ROUND(state, rtk1, 	rtk2_3);
+	for(int i = 13; i >= 0; i--)
+		INV_QUADRUPLE_ROUND(state, rtk1 + (i%4)*16, rtk2_3 + i*16);
 	unpacking(ctext, state);	// from bitsliced to byte representation
 }
--- a/skinny/Implementations/crypto_aead/skinnyaeadtk3128128v1/opt32_1/skinnyaead.h
+++ b/skinny/Implementations/crypto_aead/skinnyaeadtk3128128v1/opt32_1/skinnyaead.h
@@ -3,9 +3,7 @@

 #include "skinny128.h"

-typedef unsigned char u8;
-typedef unsigned int u32;
-typedef unsigned long long u64;
+typedef uint64_t    u64;

 #define TAGBYTES    16
 #define KEYBYTES    16

--- a/skinny/Implementations/crypto_aead/skinnyaeadtk3128128v1/opt32_1/tk_schedule.c
+++ b/skinny/Implementations/crypto_aead/skinnyaeadtk3128128v1/opt32_1/tk_schedule.c
@@ -4,16 +4,11 @@
 * @author	Alexandre Adomnicai, Nanyang Technological University,
 *			alexandre.adomnicai@ntu.edu.sg
 *
-* @date		May 2020
+* @date		June 2020
 ******************************************************************************/
-#include <stdio.h>
-#include <string.h> 		//for memcmp
-#include "tk_schedule.h"
+#include <string.h>
 #include "skinny128.h"

-typedef unsigned char u8;
-typedef unsigned int u32;
-
 /******************************************************************************
 * The round constants according to the new representation.
 ******************************************************************************/
@@ -271,6 +266,70 @@ void precompute_lfsr_tk3(u32* tk, const u8* key, const int rounds) {
 }

 /******************************************************************************
+* Precompute LFSR2(TK2) ^ LFSR3(TK3) for all round tweakeys.
+* It is equivalent to the following 2 function calls:
+* - precompute_lfsr_tk2(tk, t2, SKINNY128_384_ROUNDS);
+* - precompute_lfsr_tk3(tk, t3, SKINNY128_384_ROUNDS);
+* However, 'precompute_lfsr_tk2_3' can save cycles on some platforms.
+* On ARMv7 one should observe a gain of ~1k cycles per function call. This is
+* explained by the fact that fewer memory accesses to 'tk' are performed.
+* 
+* To save some code size, the loop can be replaced by the following one:
+*	for(int i = 0 ; i < rounds; i+=2) {
+*		lfsr2_bs(tk2);
+*		lfsr3_bs(tk3);
+*		tk[i*4+4] = tk2[0] ^ tk3[0];
+*		tk[i*4+5] = tk2[1] ^ tk3[1];
+*		tk[i*4+6] = tk2[2] ^ tk3[2];
+*		tk[i*4+7] = tk2[3] ^ tk3[3];
+*	}
+* at the cost of some cycles (~260 on ARM Cortex-M).
+******************************************************************************/
+void precompute_lfsr_tk2_3(u32* tk, const u8* t2, const u8* t3, const int rounds) {
+	u32 tk2[4], tk3[4];
+	packing(tk2, t2);
+	packing(tk3, t3);
+	tk[0] = tk2[0] ^ tk3[0];
+	tk[1] = tk2[1] ^ tk3[1];
+	tk[2] = tk2[2] ^ tk3[2];
+	tk[3] = tk2[3] ^ tk3[3];
+	for(int i = 0 ; i < rounds; i+=8) {
+		tk2[0] ^= (tk2[2] & 0xaaaaaaaa);
+		tk2[0] = ((tk2[0] & 0xaaaaaaaa) >> 1) | ((tk2[0] << 1) & 0xaaaaaaaa);
+		tk3[3] ^= ((tk3[1] & 0xaaaaaaaa) >> 1);
+		tk3[3] = ((tk3[3] & 0xaaaaaaaa) >> 1) | ((tk3[3] << 1) & 0xaaaaaaaa);
+		tk[i*4+4] = tk2[1] ^ tk3[3];
+		tk[i*4+5] = tk2[2] ^ tk3[0];
+		tk[i*4+6] = tk2[3] ^ tk3[1];
+		tk[i*4+7] = tk2[0] ^ tk3[2];
+		tk2[1] ^= (tk2[3] & 0xaaaaaaaa);
+		tk2[1] = ((tk2[1] & 0xaaaaaaaa) >> 1) | ((tk2[1] << 1) & 0xaaaaaaaa);
+		tk3[2] ^= ((tk3[0] & 0xaaaaaaaa) >> 1);
+		tk3[2] = ((tk3[2] & 0xaaaaaaaa) >> 1) | ((tk3[2] << 1) & 0xaaaaaaaa);
+		tk[i*4+12] = tk2[2] ^ tk3[2];
+		tk[i*4+13] = tk2[3] ^ tk3[3];
+		tk[i*4+14] = tk2[0] ^ tk3[0];
+		tk[i*4+15] = tk2[1] ^ tk3[1];
+		tk2[2] ^= (tk2[0] & 0xaaaaaaaa);
+		tk2[2] = ((tk2[2] & 0xaaaaaaaa) >> 1) | ((tk2[2] << 1) & 0xaaaaaaaa);
+		tk3[1] ^= ((tk3[3] & 0xaaaaaaaa) >> 1);
+		tk3[1] = ((tk3[1] & 0xaaaaaaaa) >> 1) | ((tk3[1] << 1) & 0xaaaaaaaa);
+		tk[i*4+20] = tk2[3] ^ tk3[1];
+		tk[i*4+21] = tk2[0] ^ tk3[2];
+		tk[i*4+22] = tk2[1] ^ tk3[3];
+		tk[i*4+23] = tk2[2] ^ tk3[0];
+		tk2[3] ^= (tk2[1] & 0xaaaaaaaa);
+		tk2[3] = ((tk2[3] & 0xaaaaaaaa) >> 1) | ((tk2[3] << 1) & 0xaaaaaaaa);
+		tk3[0] ^= ((tk3[2] & 0xaaaaaaaa) >> 1);
+		tk3[0] = ((tk3[0] & 0xaaaaaaaa) >> 1) | ((tk3[0] << 1) & 0xaaaaaaaa);
+		tk[i*4+28] = tk2[0] ^ tk3[0];
+		tk[i*4+29] = tk2[1] ^ tk3[1];
+		tk[i*4+30] = tk2[2] ^ tk3[2];
+		tk[i*4+31] = tk2[3] ^ tk3[3];
+	}
+}
+
+/******************************************************************************
 * XOR TK with TK1 before applying the permutations.
 * The key is then rearranged to match the barrel shiftrows representation.
 ******************************************************************************/
@@ -278,19 +337,20 @@ void permute_tk(u32* tk, const u8* key, const int rounds) {
 	u32 test;
 	u32 tk1[4], tmp[4];
 	packing(tk1, key);
-	memcpy(tmp, tk, 16);
-	tmp[0] ^= tk1[0];
-	tmp[1] ^= tk1[1];
-	tmp[2] ^= tk1[2];
-	tmp[3] ^= tk1[3];
+	tmp[0] = tk[0] ^ tk1[0];
+	tmp[1] = tk[1] ^ tk1[1];
+	tmp[2] = tk[2] ^ tk1[2];
+	tmp[3] = tk[3] ^ tk1[3];
 	for(int i = 0 ; i < rounds; i += 8) {
 		test = (i % 16 < 8) ? 1 : 0; 			//to apply the right power of P
 		tk[i*4] = tmp[2] & 0xf0f0f0f0;
 		tk[i*4+1] = tmp[3] & 0xf0f0f0f0;
 		tk[i*4+2] = tmp[0] & 0xf0f0f0f0;
 		tk[i*4+3] = tmp[1] & 0xf0f0f0f0;
-		memcpy(tmp, tk+i*4+4, 16);
-		XOR_BLOCKS(tmp, tk1);
+		tmp[0] = tk[i*4+4] ^ tk1[0];
+		tmp[1] = tk[i*4+5] ^ tk1[1];
+		tmp[2] = tk[i*4+6] ^ tk1[2];
+		tmp[3] = tk[i*4+7] ^ tk1[3];
 		if (test)
 			permute_tk_2(tmp); 					// applies P^2
 		else
@@ -307,8 +367,10 @@ void permute_tk(u32* tk, const u8* key, const int rounds) {
 		tk[i*4+10] |= ROR(tmp[0],12) & 0x0c0c0c0c;
 		tk[i*4+11] = ROR(tmp[1],28) & 0x03030303;
 		tk[i*4+11] |= ROR(tmp[1],12) & 0x0c0c0c0c;
-		memcpy(tmp, tk+i*4+12, 16);
-		XOR_BLOCKS(tmp, tk1);
+		tmp[0] = tk[i*4+12] ^ tk1[0];
+		tmp[1] = tk[i*4+13] ^ tk1[1];
+		tmp[2] = tk[i*4+14] ^ tk1[2];
+		tmp[3] = tk[i*4+15] ^ tk1[3];
 		if (test)
 			permute_tk_4(tmp); 					// applies P^4
 		else
@@ -321,8 +383,10 @@ void permute_tk(u32* tk, const u8* key, const int rounds) {
 		tk[i*4+17] = ROR(tmp[3], 16) & 0xf0f0f0f0;
 		tk[i*4+18] = ROR(tmp[0], 16) & 0xf0f0f0f0;
 		tk[i*4+19] = ROR(tmp[1], 16) & 0xf0f0f0f0;
-		memcpy(tmp, tk+i*4+20, 16);
-		XOR_BLOCKS(tmp, tk1);
+		tmp[0] = tk[i*4+20] ^ tk1[0];
+		tmp[1] = tk[i*4+21] ^ tk1[1];
+		tmp[2] = tk[i*4+22] ^ tk1[2];
+		tmp[3] = tk[i*4+23] ^ tk1[3];
 		if (test)
 			permute_tk_6(tmp); 					//	applies P^6
 		else
@@ -339,8 +403,10 @@ void permute_tk(u32* tk, const u8* key, const int rounds) {
 		tk[i*4+26] |= ROR(tmp[0],28) & 0x0c0c0c0c;
 		tk[i*4+27] = ROR(tmp[1],12) & 0x03030303;
 		tk[i*4+27] |= ROR(tmp[1],28) & 0x0c0c0c0c;
-		memcpy(tmp, tk+i*4+28, 16);
-		XOR_BLOCKS(tmp, tk1);
+		tmp[0] = tk[i*4+28] ^ tk1[0];
+		tmp[1] = tk[i*4+29] ^ tk1[1];
+		tmp[2] = tk[i*4+30] ^ tk1[2];
+		tmp[3] = tk[i*4+31] ^ tk1[3];
 		if (test)
 			permute_tk_8(tmp); 					// applies P^8
 		for(int j = 0; j < 4; j++) {
@@ -361,8 +427,7 @@ void permute_tk(u32* tk, const u8* key, const int rounds) {
 ******************************************************************************/
 void precompute_rtk2_3(u32* rtk, const u8* tk2, const u8 * tk3) {
 	memset(rtk, 0x00, 16*SKINNY128_384_ROUNDS);
-	precompute_lfsr_tk2(rtk, tk2, SKINNY128_384_ROUNDS);
-	precompute_lfsr_tk3(rtk, tk3, SKINNY128_384_ROUNDS);
+	precompute_lfsr_tk2_3(rtk, tk2, tk3, SKINNY128_384_ROUNDS);
 	permute_tk(rtk, (u8*)(rtk+8), SKINNY128_384_ROUNDS);	// rtk+8 is NULL
 	for(int i = 0; i < SKINNY128_384_ROUNDS; i++) {			// add rconsts
 		for(int j = 0; j < 4; j++)

--- a/skinny/Implementations/crypto_aead/skinnyaeadtk3128128v1/opt32_1/tk_schedule.h
+++ b/skinny/Implementations/crypto_aead/skinnyaeadtk3128128v1/opt32_1/tk_schedule.h
 #ifndef TK_SCHEDULE_H_
 #define TK_SCHEDULE_H_

-typedef unsigned char u8;
-typedef unsigned int u32;
+#include <stdint.h>
+
+typedef uint8_t 	u8;
+typedef uint32_t 	u32;

 void packing(u32* out, const u8* in);
 void unpacking(u8* out, u32 *in);
@@ -11,13 +13,6 @@ void precompute_rtk1(u32* rtk1, const u8* tk1);

 #define ROR(x,y) 	(((x) >> (y)) | ((x) << (32 - (y))))
 	
-#define XOR_BLOCKS(x,y) ({ 			\
-	(x)[0] ^= (y)[0];				\
-	(x)[1] ^= (y)[1];				\
-	(x)[2] ^= (y)[2];				\
-	(x)[3] ^= (y)[3];				\
-})
-	
 #define SWAPMOVE(a, b, mask, n)	({	\
 	tmp = (b ^ (a >> n)) & mask;	\
 	b ^= tmp;						\

--- a/skinny/Implementations/crypto_aead/skinnyaeadtk3128128v1/opt32_2/encrypt.c
+++ b/skinny/Implementations/crypto_aead/skinnyaeadtk3128128v1/opt32_2/encrypt.c
@@ -8,12 +8,10 @@
 * @author   Alexandre Adomnicai, Nanyang Technological University,
 *           alexandre.adomnicai@ntu.edu.sg
 *
-* @date     May 2020
+* @date     June 2020
 ******************************************************************************/
-#include "skinny128.h"
 #include "skinnyaead.h"
 #include <string.h>
-#include <stdio.h>

 /******************************************************************************
 * x ^= y where x, y are 128-bit blocks (16 bytes array).
@@ -33,13 +31,13 @@ static void skinny_aead_m1_auth(u8* auth, u8* c, u8* tag, tweakey* tk,
    u8 feedback;
    u8 tmp[2*BLOCKBYTES];
    memset(tmp, 0x00, 2*BLOCKBYTES);
-    memset(auth, 0x00, BLOCKBYTES);
    SET_DOMAIN(tmp, 0x02);
+    SET_DOMAIN(tmp + BLOCKBYTES, 0x02);
+    memset(auth, 0x00, BLOCKBYTES);
    while (adlen >= 2*BLOCKBYTES) {
        LE_STR_64(tmp, lfsr);
        UPDATE_LFSR(lfsr);
        LE_STR_64(tmp + BLOCKBYTES, lfsr);
-        SET_DOMAIN(tmp + BLOCKBYTES, 0x02);
        precompute_rtk1(tk->rtk1, tmp, tmp+BLOCKBYTES);
        skinny128_384_encrypt(tmp, tmp+BLOCKBYTES, ad, ad+BLOCKBYTES, *tk);
        xor_block(auth, tmp);
@@ -47,6 +45,9 @@ static void skinny_aead_m1_auth(u8* auth, u8* c, u8* tag, tweakey* tk,
        adlen -= 2*BLOCKBYTES;
        ad += 2*BLOCKBYTES;
        UPDATE_LFSR(lfsr);
+        memset(tmp, 0x00, 2*BLOCKBYTES);    // to save 32 bytes of RAM
+        SET_DOMAIN(tmp, 0x02);
+        SET_DOMAIN(tmp + BLOCKBYTES, 0x02);
    }
    if (adlen > BLOCKBYTES) {               // pad and process 2 blocks in parallel
        LE_STR_64(tmp, lfsr);
@@ -65,11 +66,12 @@ static void skinny_aead_m1_auth(u8* auth, u8* c, u8* tag, tweakey* tk,
        LE_STR_64(tmp, lfsr);
        if (mlen == 0) {    // if tag has *NOT* been calculated yet
            precompute_rtk1(tk->rtk1, tmp, tag);    // compute the tag
-            skinny128_384_encrypt(auth, c, ad, c, *tk); 
+            skinny128_384_encrypt(tmp, c, ad, c, *tk); 
        } else {            // if tag has already been calculated
            precompute_rtk1(tk->rtk1, tmp, tmp);    // process last ad block
-            skinny128_384_encrypt(auth, auth, ad, ad, *tk);
+            skinny128_384_encrypt(tmp, tmp, ad, ad, *tk);
        }
+        xor_block(auth, tmp);
    } else if (adlen > 0) {
        LE_STR_64(tmp, lfsr);
        SET_DOMAIN(tmp, 0x03);                      // domain for padding ad
@@ -78,11 +80,12 @@ static void skinny_aead_m1_auth(u8* auth, u8* c, u8* tag, tweakey* tk,
        tmp[BLOCKBYTES + adlen] ^= 0x80;            // padding
        if (mlen == 0) {    // if tag has *NOT* been calculated yet
            precompute_rtk1(tk->rtk1, tmp, tag);    // compute the tag
-            skinny128_384_encrypt(auth, c, tmp + BLOCKBYTES, c, *tk); 
+            skinny128_384_encrypt(tmp, c, tmp + BLOCKBYTES, c, *tk); 
        } else {            // if tag has already been calculated
            precompute_rtk1(tk->rtk1, tmp,  tmp);   // process last ad block
-            skinny128_384_encrypt(auth, auth, tmp + BLOCKBYTES, tmp + BLOCKBYTES, *tk);
+            skinny128_384_encrypt(tmp, tmp, tmp + BLOCKBYTES, tmp + BLOCKBYTES, *tk);
        }
+        xor_block(auth, tmp);
    }
 }


--- a/skinny/Implementations/crypto_aead/skinnyaeadtk3128128v1/opt32_2/skinny128.c
+++ b/skinny/Implementations/crypto_aead/skinnyaeadtk3128128v1/opt32_2/skinny128.c
@@ -16,12 +16,9 @@
 * @author	Alexandre Adomnicai, Nanyang Technological University,
 *			alexandre.adomnicai@ntu.edu.sg
 *
-* @date		May 2020
+* @date		June 2020
 ******************************************************************************/
-#include <stdio.h>
-#include <string.h>
 #include "skinny128.h"
-#include "tk_schedule.h"

 /****************************************************************************
 * The MixColumns operation for rounds i such that (i % 4) == 0.
@@ -84,7 +81,7 @@ void mixcolumns_3(u32* state) {
 }

 /****************************************************************************
-* The inverse MixColumns oepration for rounds i such that (i % 4) == 0
+* The inverse MixColumns operation for rounds i such that (i % 4) == 0
 ****************************************************************************/
 void inv_mixcolumns_0(u32* state) {
 	u32 tmp;
@@ -99,7 +96,7 @@ void inv_mixcolumns_0(u32* state) {
 }

 /****************************************************************************
-* The inverse MixColumns oepration for rounds i such that (i % 4) == 1
+* The inverse MixColumns operation for rounds i such that (i % 4) == 1
 ****************************************************************************/
 void inv_mixcolumns_1(u32* state) {
 	u32 tmp;
@@ -114,7 +111,7 @@ void inv_mixcolumns_1(u32* state) {
 }

 /****************************************************************************
-* The inverse MixColumns oepration for rounds i such that (i % 4) == 2
+* The inverse MixColumns operation for rounds i such that (i % 4) == 2
 ****************************************************************************/
 void inv_mixcolumns_2(u32* state) {
 	u32 tmp;
@@ -129,7 +126,7 @@ void inv_mixcolumns_2(u32* state) {
 }

 /****************************************************************************
-* The inverse MixColumns oepration for rounds i such that (i % 4) == 3
+* The inverse MixColumns operation for rounds i such that (i % 4) == 3
 ****************************************************************************/
 void inv_mixcolumns_3(u32* state) {
 	u32 tmp;
@@ -166,20 +163,8 @@ void skinny128_384_encrypt(u8* ctext, u8* ctext_bis, const u8* ptext,
 					const u8* ptext_bis, const tweakey tk) {
 	u32 state[8];
 	packing(state, ptext, ptext_bis);
-	QUADRUPLE_ROUND(state, tk.rtk1, 	tk.rtk2_3);
-	QUADRUPLE_ROUND(state, tk.rtk1+32, 	tk.rtk2_3+32);
-	QUADRUPLE_ROUND(state, tk.rtk1+64, 	tk.rtk2_3+64);
-	QUADRUPLE_ROUND(state, tk.rtk1+96, 	tk.rtk2_3+96);
-	QUADRUPLE_ROUND(state, tk.rtk1, 	tk.rtk2_3+128);
-	QUADRUPLE_ROUND(state, tk.rtk1+32, 	tk.rtk2_3+160);
-	QUADRUPLE_ROUND(state, tk.rtk1+64, 	tk.rtk2_3+192);
-	QUADRUPLE_ROUND(state, tk.rtk1+96, 	tk.rtk2_3+224);
-	QUADRUPLE_ROUND(state, tk.rtk1, 	tk.rtk2_3+256);
-	QUADRUPLE_ROUND(state, tk.rtk1+32, 	tk.rtk2_3+288);
-	QUADRUPLE_ROUND(state, tk.rtk1+64, 	tk.rtk2_3+320);
-	QUADRUPLE_ROUND(state, tk.rtk1+96, 	tk.rtk2_3+352);
-	QUADRUPLE_ROUND(state, tk.rtk1, 	tk.rtk2_3+384);
-	QUADRUPLE_ROUND(state, tk.rtk1+32, 	tk.rtk2_3+416);
+	for(int i = 0; i < 14; i++)
+		QUADRUPLE_ROUND(state, tk.rtk1 + (i%4)*32, tk.rtk2_3 + i*32);
 	unpacking(ctext, ctext_bis, state);
 }

@@ -192,19 +177,7 @@ void skinny128_384_decrypt(u8* ptext, u8* ptext_bis, const u8* ctext,
 					const u8* ctext_bis, const tweakey tk) {
 	u32 state[8];
 	packing(state, ctext, ctext_bis);
-	INV_QUADRUPLE_ROUND(state, tk.rtk1+32, 	tk.rtk2_3+416);
-	INV_QUADRUPLE_ROUND(state, tk.rtk1, 	tk.rtk2_3+384);
-	INV_QUADRUPLE_ROUND(state, tk.rtk1+96, 	tk.rtk2_3+352);
-	INV_QUADRUPLE_ROUND(state, tk.rtk1+64, 	tk.rtk2_3+320);
-	INV_QUADRUPLE_ROUND(state, tk.rtk1+32, 	tk.rtk2_3+288);
-	INV_QUADRUPLE_ROUND(state, tk.rtk1, 	tk.rtk2_3+256);
-	INV_QUADRUPLE_ROUND(state, tk.rtk1+96, 	tk.rtk2_3+224);
-	INV_QUADRUPLE_ROUND(state, tk.rtk1+64, 	tk.rtk2_3+192);
-	INV_QUADRUPLE_ROUND(state, tk.rtk1+32, 	tk.rtk2_3+160);
-	INV_QUADRUPLE_ROUND(state, tk.rtk1, 	tk.rtk2_3+128);
-	INV_QUADRUPLE_ROUND(state, tk.rtk1+96, 	tk.rtk2_3+96);
-	INV_QUADRUPLE_ROUND(state, tk.rtk1+64, 	tk.rtk2_3+64);
-	INV_QUADRUPLE_ROUND(state, tk.rtk1+32, 	tk.rtk2_3+32);
-	INV_QUADRUPLE_ROUND(state, tk.rtk1, 	tk.rtk2_3);
+	for(int i = 13; i >= 0; i--)
+		INV_QUADRUPLE_ROUND(state, tk.rtk1 + (i%4)*32, tk.rtk2_3 + i*32);
 	unpacking(ptext, ptext_bis, state);
 }
\ No newline at end of file
--- a/skinny/Implementations/crypto_aead/skinnyaeadtk3128128v1/opt32_2/skinnyaead.h
+++ b/skinny/Implementations/crypto_aead/skinnyaeadtk3128128v1/opt32_2/skinnyaead.h
@@ -3,9 +3,7 @@

 #include "skinny128.h"

-typedef unsigned char u8;
-typedef unsigned int u32;
-typedef unsigned long long u64;
+typedef uint64_t    u64;

 #define TAGBYTES    16
 #define KEYBYTES    16

--- a/skinny/Implementations/crypto_aead/skinnyaeadtk3128128v1/opt32_2/tk_schedule.c
+++ b/skinny/Implementations/crypto_aead/skinnyaeadtk3128128v1/opt32_2/tk_schedule.c
@@ -9,13 +9,9 @@
 *
 * @date		May 2020
 *******************************************************************************/
-#include <stdio.h>
 #include <string.h>
 #include "tk_schedule.h"

-typedef unsigned char u8;
-typedef unsigned int u32;
-
 /****************************************************************************
 * The round constants according to the fixsliced representation.
 ****************************************************************************/

--- a/skinny/Implementations/crypto_aead/skinnyaeadtk3128128v1/opt32_2/tk_schedule.h
+++ b/skinny/Implementations/crypto_aead/skinnyaeadtk3128128v1/opt32_2/tk_schedule.h
 #ifndef TK_SCHEDULE_BS_H_
 #define TK_SCHEDULE_BS_H_

-typedef unsigned char u8;
-typedef unsigned int u32;
+#include <stdint.h>
+
+typedef uint8_t 	u8;
+typedef uint32_t 	u32;

 typedef struct {
 	u32 rtk1[8*16];