skinny & romulus

40fde2ff · Alexandre Adomnicai · Enrico Pozzobon · 9fb00266 · 40fde2ff · 40fde2ff
Commit 40fde2ff, authored May 13, 2020 by Alexandre Adomnicai; committed by Enrico Pozzobon on May 13, 2020
115 changed files
--- a/romulus/Implementations/crypto_aead/romulusm1+v12/LWC_AEAD_KAT_128_128.txt
+++ b/romulus/Implementations/crypto_aead/romulusm1+v12/LWC_AEAD_KAT_128_128.txt
--- a/romulus/Implementations/crypto_aead/romulusm1+v12/armcortexm/api.h
+++ b/romulus/Implementations/crypto_aead/romulusm1+v12/armcortexm/api.h
+#define CRYPTO_KEYBYTES 16   // key length in bytes
+#define CRYPTO_NSECBYTES 0   // length of the secret message number: zero, i.e. none is used
+#define CRYPTO_NPUBBYTES 16  // public nonce length in bytes
+#define CRYPTO_ABYTES 16     // presumably the ciphertext expansion (tag length) in bytes -- confirm
+#define CRYPTO_NOOVERLAP 1   // presumably declares that input/output buffers must not overlap -- confirm
--- a/romulus/Implementations/crypto_aead/romulusm1+v12/armcortexm/crypto_aead.h
+++ b/romulus/Implementations/crypto_aead/romulusm1+v12/armcortexm/crypto_aead.h
+/* AEAD encryption entry point, declared with the signature required by the
+ * NIST LWC competition API. Only the prototype is visible in this header.
+ * Parameter roles follow the NIST LWC API convention -- confirm against the
+ * implementation:
+ *   c, clen   : ciphertext output buffer and where its length is written
+ *   m, mlen   : message input and its length in bytes
+ *   ad, adlen : associated data and its length in bytes
+ *   nsec      : secret message number (api.h sets CRYPTO_NSECBYTES to 0)
+ *   npub      : public nonce (CRYPTO_NPUBBYTES bytes)
+ *   k         : key (CRYPTO_KEYBYTES bytes)
+ * Presumably returns 0 on success. */
+int crypto_aead_encrypt(unsigned char *c, unsigned long long *clen,
+                        const unsigned char *m, unsigned long long mlen,
+                        const unsigned char *ad, unsigned long long adlen,
+                        const unsigned char *nsec, const unsigned char *npub,
+                        const unsigned char *k);
+
+/* AEAD decryption/verification entry point, declared with the signature
+ * required by the NIST LWC competition API. Only the prototype is visible here.
+ * Parameter roles follow the NIST LWC API convention -- confirm against the
+ * implementation:
+ *   m, outputmlen : recovered message buffer and where its length is written
+ *   nsec          : secret message number (api.h sets CRYPTO_NSECBYTES to 0)
+ *   c, clen       : ciphertext input and its length in bytes
+ *   ad, adlen     : associated data and its length in bytes
+ *   npub          : public nonce (CRYPTO_NPUBBYTES bytes)
+ *   k             : key (CRYPTO_KEYBYTES bytes)
+ * Presumably returns 0 on success and non-zero when verification fails. */
+int crypto_aead_decrypt(unsigned char *m, unsigned long long *outputmlen,
+                        unsigned char *nsec,
+                        const unsigned char *c, unsigned long long clen,
+                        const unsigned char *ad, unsigned long long adlen,
+                        const unsigned char *npub, const unsigned char *k);
--- a/romulus/Implementations/crypto_aead/romulusm1+v12/armcortexm/encrypt.c
+++ b/romulus/Implementations/crypto_aead/romulusm1+v12/armcortexm/encrypt.c
--- a/romulus/Implementations/crypto_aead/romulusm1+v12/armcortexm/romulus.h
+++ b/romulus/Implementations/crypto_aead/romulusm1+v12/armcortexm/romulus.h
+#ifndef ROMULUSN1_H_
+#define ROMULUSN1_H_
+
+#include "skinny128.h"
+
+typedef unsigned char u8;
+typedef unsigned int u32;
+//u64 must really be 64 bits wide: unsigned int is only 32 bits on the usual
+//ABIs, which would truncate an unsigned long long length stored in a u64
+typedef unsigned long long u64;
+typedef struct {
+    u8 tk1[16];                         //raw tk1 bytes, kept to manipulate tk1 byte-wise (SET_DOMAIN writes tk1[7])
+    u32 rtk1[4*16];                     //64 words derived from tk1, cached to avoid tk schedule recomputations
+    u32 rtk[4*SKINNY128_384_ROUNDS];    //all round tweakeys: 4 words for each of the SKINNY128_384_ROUNDS rounds
+} skinny_128_384_tks;
+
+#define TAGBYTES    16  //presumably the authentication tag length in bytes
+#define KEYBYTES    16  //presumably the key length in bytes
+#define BLOCKBYTES  16  //block length in bytes (128-bit blocks)
+//store domain in byte 7 of tk1; tks is a skinny_128_384_tks object, not a pointer
+#define SET_DOMAIN(tks, domain) ((tks).tk1[7] = (domain))
+
+/* G as defined in the Romulus specification, in a 32-bit word-wise manner.
+ * For each byte b of the 16-byte block y, the matching byte of x becomes
+ * (b >> 1) with its top bit set to (bit 7 of b) XOR (bit 0 of b).
+ * x and y are cast to u32* and accessed as four words each.
+ * NOTE(review): this assumes both buffers are suitably aligned -- confirm.
+ * Each word of y is loaded into tmp before the matching word of x is stored,
+ * so x may be the same buffer as y.
+ * Requires a 32-bit unsigned variable named tmp in the calling scope. */
+#define G(x,y) ({                                                                   \
+    tmp = ((u32*)(y))[0];                                                           \
+    ((u32*)(x))[0] = (tmp >> 1 & 0x7f7f7f7f) ^ ((tmp ^ (tmp << 7)) & 0x80808080);   \
+    tmp = ((u32*)(y))[1];                                                           \
+    ((u32*)(x))[1] = (tmp >> 1 & 0x7f7f7f7f) ^ ((tmp ^ (tmp << 7)) & 0x80808080);   \
+    tmp = ((u32*)(y))[2];                                                           \
+    ((u32*)(x))[2] = (tmp >> 1 & 0x7f7f7f7f) ^ ((tmp ^ (tmp << 7)) & 0x80808080);   \
+    tmp = ((u32*)(y))[3];                                                           \
+    ((u32*)(x))[3] = (tmp >> 1 & 0x7f7f7f7f) ^ ((tmp ^ (tmp << 7)) & 0x80808080);   \
+})
+
+/* Update the counter in tk1 in a 32-bit word-wise manner.
+ * Word 0 together with the low 24 bits of word 1 is shifted left by one bit:
+ * bit 31 of word 0 carries into bit 0 of word 1, and the top byte of word 1 is
+ * kept unchanged. If bit 23 of the old word 1 (the bit shifted out) was set,
+ * 0x95 is XORed into word 0.
+ * NOTE(review): tk1 is accessed through a u32* cast, so the preserved "top
+ * byte" is tk1[7] (the byte SET_DOMAIN writes) only on a little-endian
+ * target -- confirm for other targets.
+ * Requires a 32-bit unsigned variable named tmp in the calling scope. */
+#define UPDATE_CTR(tk1) ({                              \
+    tmp = ((u32*)(tk1))[1];                             \
+    ((u32*)(tk1))[1] = (tmp << 1) & 0x00ffffff;         \
+    ((u32*)(tk1))[1] |= (((u32*)(tk1))[0] >> 31);       \
+    ((u32*)(tk1))[1] |= tmp & 0xff000000;               \
+    ((u32*)(tk1))[0] <<= 1;                             \
+    if ((tmp >> 23) & 0x01)                             \
+        ((u32*)(tk1))[0] ^= 0x95;                       \
+})
+
+/* x <- y ^ z for 128-bit blocks, computed as four 32-bit XORs.
+ * All three arguments are cast to u32*. Word i of y and z is read before
+ * word i of x is written, so x may be the same buffer as y and/or z. */
+#define XOR_BLOCK(x,y,z) ({                             \
+    ((u32*)(x))[0] = ((u32*)(y))[0] ^ ((u32*)(z))[0];   \
+    ((u32*)(x))[1] = ((u32*)(y))[1] ^ ((u32*)(z))[1];   \
+    ((u32*)(x))[2] = ((u32*)(y))[2] ^ ((u32*)(z))[2];   \
+    ((u32*)(x))[3] = ((u32*)(y))[3] ^ ((u32*)(z))[3];   \
+})
+
+
+//Rho as defined in the Romulus specification: with pad = G(x) computed from
+//the old x, sets x <- x ^ z and y <- pad ^ z.
+//pad is used as a tmp variable and x is updated BEFORE y is written, so z is
+//read for the last time before y is stored: y may be the same buffer as z.
+//Requires a 16-byte buffer named pad and a u32 named tmp in the calling scope.
+#define RHO(x,y,z) ({       \
+    G(pad,x);               \
+    XOR_BLOCK(x, x, z);     \
+    XOR_BLOCK(y, pad, z);   \
+})
+
+//Rho inverse as defined in the Romulus specification
+/* With pad = G(x) computed from the old x: z <- pad ^ y, then x <- x ^ z using
+ * the freshly written z. pad is used as a tmp variable in case y = z: z is
+ * produced word by word from pad and y, so z may be the same buffer as y.
+ * Requires a 16-byte buffer named pad and a u32 named tmp (used by G) in the
+ * calling scope. */
+#define RHO_INV(x, y, z) ({ \
+    G(pad, x);              \
+    XOR_BLOCK(z, pad, y);   \
+    XOR_BLOCK(x, x, z);     \
+})
+
+#endif  // ROMULUSN1_H_
\ No newline at end of file
--- a/romulus/Implementations/crypto_aead/romulusm1+v12/armcortexm/skinny128.h
+++ b/romulus/Implementations/crypto_aead/romulusm1+v12/armcortexm/skinny128.h
+#ifndef SKINNY128_H_
+#define SKINNY128_H_
+
+typedef unsigned char u8;	// byte
+typedef unsigned int u32;	// word type used for round tweakeys; presumably assumed to be 32 bits wide
+
+#define SKINNY128_384_ROUNDS	40	// number of rounds; romulus.h sizes its round-tweakey array as 4 words per round
+/* Routines declared extern: their definitions are not in this header
+ * (presumably in the accompanying skinny128.s -- confirm).
+ * Roles below are inferred from names and signatures only -- TODO confirm:
+ *   skinny128_384       : encrypts the block ptext into ctext using the
+ *                         round tweakeys tk and the tk1 round words rtk1
+ *   tkschedule_lfsr     : fills rtk from tk2 and tk3 for `rounds` rounds
+ *   tkschedule_perm     : further in-place processing of rtk
+ *   tkschedule_perm_tk1 : fills rtk1 from tk1 */
+extern void skinny128_384(u8* ctext, const u32* tk, const u8* ptext, const u32* rtk1);
+extern void tkschedule_lfsr(u32* rtk, const u8* tk2, const u8* tk3, const int rounds);
+extern void tkschedule_perm(u32* rtk);
+extern void tkschedule_perm_tk1(u32* rtk1, const u8* tk1);
+
+
+#endif  // SKINNY128_H_
\ No newline at end of file
--- a/romulus/Implementations/crypto_aead/romulusm1+v12/armcortexm/skinny128.s
+++ b/romulus/Implementations/crypto_aead/romulusm1+v12/armcortexm/skinny128.s
--- a/romulus/Implementations/crypto_aead/romulusm1+v12/opt32/api.h
+++ b/romulus/Implementations/crypto_aead/romulusm1+v12/opt32/api.h
+#define CRYPTO_KEYBYTES 16   // key length in bytes
+#define CRYPTO_NSECBYTES 0   // length of the secret message number: zero, i.e. none is used
+#define CRYPTO_NPUBBYTES 16  // public nonce length in bytes
+#define CRYPTO_ABYTES 16     // presumably the ciphertext expansion (tag length) in bytes -- confirm
+#define CRYPTO_NOOVERLAP 1   // presumably declares that input/output buffers must not overlap -- confirm
--- a/romulus/Implementations/crypto_aead/romulusm1+v12/opt32/crypto_aead.h
+++ b/romulus/Implementations/crypto_aead/romulusm1+v12/opt32/crypto_aead.h
+/* AEAD encryption entry point; only the prototype is visible in this header.
+ * Parameter roles follow the NIST LWC API convention -- confirm against the
+ * implementation:
+ *   c, clen   : ciphertext output buffer and where its length is written
+ *   m, mlen   : message input and its length in bytes
+ *   ad, adlen : associated data and its length in bytes
+ *   nsec      : secret message number (api.h sets CRYPTO_NSECBYTES to 0)
+ *   npub      : public nonce (CRYPTO_NPUBBYTES bytes)
+ *   k         : key (CRYPTO_KEYBYTES bytes)
+ * Presumably returns 0 on success. */
+int crypto_aead_encrypt(
+	unsigned char *c, unsigned long long *clen,
+	const unsigned char *m, unsigned long long mlen,
+	const unsigned char *ad, unsigned long long adlen,
+	const unsigned char *nsec,
+	const unsigned char *npub,
+	const unsigned char *k
+);
+/* AEAD decryption/verification entry point; only the prototype is visible in
+ * this header. Parameter roles follow the NIST LWC API convention -- confirm
+ * against the implementation:
+ *   m, mlen   : recovered message buffer and where its length is written
+ *   nsec      : secret message number (api.h sets CRYPTO_NSECBYTES to 0)
+ *   c, clen   : ciphertext input and its length in bytes
+ *   ad, adlen : associated data and its length in bytes
+ *   npub      : public nonce (CRYPTO_NPUBBYTES bytes)
+ *   k         : key (CRYPTO_KEYBYTES bytes)
+ * Presumably returns 0 on success and non-zero when verification fails. */
+int crypto_aead_decrypt(
+	unsigned char *m, unsigned long long *mlen,
+	unsigned char *nsec,
+	const unsigned char *c, unsigned long long clen,
+	const unsigned char *ad, unsigned long long adlen,
+	const unsigned char *npub,
+	const unsigned char *k
+);
\ No newline at end of file
--- a/romulus/Implementations/crypto_aead/romulusm1+v12/opt32/encrypt.c
+++ b/romulus/Implementations/crypto_aead/romulusm1+v12/opt32/encrypt.c
--- a/romulus/Implementations/crypto_aead/romulusm1+v12/opt32/romulus.h
+++ b/romulus/Implementations/crypto_aead/romulusm1+v12/opt32/romulus.h
+#ifndef ROMULUSN1_H_
+#define ROMULUSN1_H_
+
+#include "skinny128.h"
+
+typedef unsigned char u8;
+typedef unsigned int u32;
+//u64 must really be 64 bits wide: unsigned int is only 32 bits on the usual
+//ABIs, which would truncate an unsigned long long length stored in a u64
+typedef unsigned long long u64;
+typedef struct {
+    u8 tk1[16];                         //raw tk1 bytes, kept to manipulate tk1 byte-wise (SET_DOMAIN writes tk1[7])
+    u32 rtk1[4*16];                     //64 words derived from tk1, cached to avoid tk schedule recomputations
+    u32 rtk2_3[4*SKINNY128_384_ROUNDS]; //all round tweakeys: 4 words for each of the SKINNY128_384_ROUNDS rounds
+} skinny_128_384_tks;
+
+#define TAGBYTES    16  //presumably the authentication tag length in bytes
+#define KEYBYTES    16  //presumably the key length in bytes
+#define BLOCKBYTES  16  //block length in bytes (128-bit blocks)
+//store domain in byte 7 of tk1; tks is a skinny_128_384_tks object, not a pointer
+#define SET_DOMAIN(tks, domain) ((tks).tk1[7] = (domain))
+
+/* G as defined in the Romulus specification, in a 32-bit word-wise manner.
+ * For each byte b of the 16-byte block y, the matching byte of x becomes
+ * (b >> 1) with its top bit set to (bit 7 of b) XOR (bit 0 of b).
+ * x and y are cast to u32* and accessed as four words each.
+ * NOTE(review): this assumes both buffers are suitably aligned -- confirm.
+ * Each word of y is loaded into tmp before the matching word of x is stored,
+ * so x may be the same buffer as y.
+ * Requires a 32-bit unsigned variable named tmp in the calling scope. */
+#define G(x,y) ({                                                                   \
+    tmp = ((u32*)(y))[0];                                                           \
+    ((u32*)(x))[0] = (tmp >> 1 & 0x7f7f7f7f) ^ ((tmp ^ (tmp << 7)) & 0x80808080);   \
+    tmp = ((u32*)(y))[1];                                                           \
+    ((u32*)(x))[1] = (tmp >> 1 & 0x7f7f7f7f) ^ ((tmp ^ (tmp << 7)) & 0x80808080);   \
+    tmp = ((u32*)(y))[2];                                                           \
+    ((u32*)(x))[2] = (tmp >> 1 & 0x7f7f7f7f) ^ ((tmp ^ (tmp << 7)) & 0x80808080);   \
+    tmp = ((u32*)(y))[3];                                                           \
+    ((u32*)(x))[3] = (tmp >> 1 & 0x7f7f7f7f) ^ ((tmp ^ (tmp << 7)) & 0x80808080);   \
+})
+
+/* Update the counter in tk1 in a 32-bit word-wise manner.
+ * Word 0 together with the low 24 bits of word 1 is shifted left by one bit:
+ * bit 31 of word 0 carries into bit 0 of word 1, and the top byte of word 1 is
+ * kept unchanged. If bit 23 of the old word 1 (the bit shifted out) was set,
+ * 0x95 is XORed into word 0.
+ * NOTE(review): tk1 is accessed through a u32* cast, so the preserved "top
+ * byte" is tk1[7] (the byte SET_DOMAIN writes) only on a little-endian
+ * target -- confirm for other targets.
+ * Requires a 32-bit unsigned variable named tmp in the calling scope. */
+#define UPDATE_CTR(tk1) ({                              \
+    tmp = ((u32*)(tk1))[1];                             \
+    ((u32*)(tk1))[1] = (tmp << 1) & 0x00ffffff;         \
+    ((u32*)(tk1))[1] |= (((u32*)(tk1))[0] >> 31);       \
+    ((u32*)(tk1))[1] |= tmp & 0xff000000;               \
+    ((u32*)(tk1))[0] <<= 1;                             \
+    if ((tmp >> 23) & 0x01)                             \
+        ((u32*)(tk1))[0] ^= 0x95;                       \
+})
+
+/* x <- y ^ z for 128-bit blocks, computed as four 32-bit XORs.
+ * All three arguments are cast to u32*. Word i of y and z is read before
+ * word i of x is written, so x may be the same buffer as y and/or z. */
+#define XOR_BLOCK(x,y,z) ({                             \
+    ((u32*)(x))[0] = ((u32*)(y))[0] ^ ((u32*)(z))[0];   \
+    ((u32*)(x))[1] = ((u32*)(y))[1] ^ ((u32*)(z))[1];   \
+    ((u32*)(x))[2] = ((u32*)(y))[2] ^ ((u32*)(z))[2];   \
+    ((u32*)(x))[3] = ((u32*)(y))[3] ^ ((u32*)(z))[3];   \
+})
+
+
+//Rho as defined in the Romulus specification: with pad = G(x) computed from
+//the old x, sets x <- x ^ z and y <- pad ^ z.
+//pad is used as a tmp variable and x is updated BEFORE y is written, so z is
+//read for the last time before y is stored: y may be the same buffer as z.
+//Requires a 16-byte buffer named pad and a u32 named tmp in the calling scope.
+#define RHO(x,y,z) ({       \
+    G(pad,x);               \
+    XOR_BLOCK(x, x, z);     \
+    XOR_BLOCK(y, pad, z);   \
+})
+
+//Rho inverse as defined in the Romulus specification
+/* With pad = G(x) computed from the old x: z <- pad ^ y, then x <- x ^ z using
+ * the freshly written z. pad is used as a tmp variable in case y = z: z is
+ * produced word by word from pad and y, so z may be the same buffer as y.
+ * Requires a 16-byte buffer named pad and a u32 named tmp (used by G) in the
+ * calling scope. */
+#define RHO_INV(x, y, z) ({ \
+    G(pad, x);              \
+    XOR_BLOCK(z, pad, y);   \
+    XOR_BLOCK(x, x, z);     \
+})
+
+#endif  // ROMULUSN1_H_
\ No newline at end of file
--- a/romulus/Implementations/crypto_aead/romulusm1+v12/opt32/skinny128.c
+++ b/romulus/Implementations/crypto_aead/romulusm1+v12/opt32/skinny128.c
+/******************************************************************************
+* Constant-time implementation of the SKINNY tweakable block ciphers.
+*
+* This implementation doesn't compute the ShiftRows operation. Some masks and
+* shifts are applied during the MixColumns operation so that the proper bits
+* are XORed together. Moreover, the row permutation within the MixColumns 
+* is omitted, as well as the bit permutation at the end of the Sbox. The rows
+* are synchronized with the classical representation after only 4 rounds.
+* Therefore, this implementation relies on a "QUADRUPLE_ROUND" routine.
+*
+* The Sbox computation takes advantage of some symmetry in the 8-bit Sbox to
+* turn it into a 4-bit S-box computation. Although the last bit permutation
+* within the Sbox is not computed, the bit ordering is synchronized with the 
+* classical representation after 2 calls.
+*
+* @author	Alexandre Adomnicai, Nanyang Technological University,
+*			alexandre.adomnicai@ntu.edu.sg
+*
+* @date		May 2020
+******************************************************************************/
+#include <stdio.h>
+#include <string.h>
+#include "skinny128.h"
+#include "tk_schedule.h"
+
+/******************************************************************************
+* MixColumns step used for rounds i with (i % 4) == 0.
+*
+* Every one of the four 32-bit words of state is updated in place, on its own,
+* by three consecutive rotate / mask / rotate / XOR steps.
+******************************************************************************/
+void mixcolumns_0(u32* state) {
+	for(u32* word = state; word != state + 4; word++) {
+		u32 picked = ROR(*word,24) & 0x0c0c0c0c;
+		*word ^= ROR(picked,30);
+		picked = ROR(*word,16) & 0xc0c0c0c0;
+		*word ^= ROR(picked,4);
+		picked = ROR(*word,8) & 0x0c0c0c0c;
+		*word ^= ROR(picked,2);
+	}
+}
+
+/******************************************************************************
+* MixColumns step used for rounds i with (i % 4) == 1.
+*
+* Every one of the four 32-bit words of state is updated in place, on its own,
+* by three consecutive mask / rotate / XOR steps.
+******************************************************************************/
+void mixcolumns_1(u32* state) {
+	for(u32* word = state; word != state + 4; word++) {
+		u32 picked = ROR(*word,16) & 0x30303030;
+		*word ^= ROR(picked,30);
+		picked = *word & 0x03030303;
+		*word ^= ROR(picked,28);
+		picked = ROR(*word,16) & 0x30303030;
+		*word ^= ROR(picked,2);
+	}
+}
+
+/******************************************************************************
+* MixColumns step used for rounds i with (i % 4) == 2.
+*
+* Every one of the four 32-bit words of state is updated in place, on its own,
+* by three consecutive rotate / mask / rotate / XOR steps.
+******************************************************************************/
+void mixcolumns_2(u32* state) {
+	for(u32* word = state; word != state + 4; word++) {
+		u32 picked = ROR(*word,8) & 0xc0c0c0c0;
+		*word ^= ROR(picked,6);
+		picked = ROR(*word,16) & 0x0c0c0c0c;
+		*word ^= ROR(picked,28);
+		picked = ROR(*word,24) & 0xc0c0c0c0;
+		*word ^= ROR(picked,2);
+	}
+}
+
+/******************************************************************************
+* MixColumns step used for rounds i with (i % 4) == 3.
+*
+* Every one of the four 32-bit words of state is updated in place, on its own,
+* by three consecutive mask / rotate / XOR steps.
+******************************************************************************/
+void mixcolumns_3(u32* state) {
+	for(u32* word = state; word != state + 4; word++) {
+		u32 picked = *word & 0x03030303;
+		*word ^= ROR(picked,30);
+		picked = *word & 0x30303030;
+		*word ^= ROR(picked,4);
+		picked = *word & 0x03030303;
+		*word ^= ROR(picked,26);
+	}
+}
+
+/******************************************************************************
+* Single-block encryption with SKINNY-128-384, without any operation mode.
+*
+* ptext is packed into a 4-word bitsliced state, ten QUADRUPLE_ROUNDs are run
+* on it and the result is unpacked into ctext.
+* RTK1 and RTK2_3 are passed separately because TK2 and TK3 stay the same for
+* the whole data encryption/decryption: rtk2_3 advances by 16 words on every
+* quadruple round, whereas the rtk1 offset wraps around after 4 of them
+* (0, 16, 32, 48, 0, ...).
+******************************************************************************/
+void skinny128_384_plus(u8* ctext, const u8* ptext, const u32* rtk1,
+					const u32* rtk2_3) {
+	u32 tmp; 					// scratch word assigned by the SWAPMOVE macro
+	u32 block[4]; 				// 128-bit state in bitsliced form
+	packing(block, ptext); 		// bytes -> bitsliced representation
+	for(int quad = 0; quad < 10; quad++) {
+		QUADRUPLE_ROUND(block, rtk1 + 16*(quad % 4), rtk2_3 + 16*quad);
+	}
+	unpacking(ctext, block);	// bitsliced -> byte representation
+}
\ No newline at end of file
--- a/romulus/Implementations/crypto_aead/romulusm1+v12/opt32/skinny128.h
+++ b/romulus/Implementations/crypto_aead/romulusm1+v12/opt32/skinny128.h
+#ifndef SKINNY128_H_
+#define SKINNY128_H_
+
+typedef unsigned char u8;	// byte
+typedef unsigned int u32;	// state / round-tweakey word; QUADRUPLE_ROUND uses 32-bit masks such as 0x55555555
+
+void skinny128_384_plus(u8* ctext, const u8* ptext, const u32* rtk1, const u32* rtk2_3);	// single-block encryption, defined in skinny128.c
+
+#define SKINNY128_384_ROUNDS	40	// number of rounds; skinny128_384_plus runs 10 QUADRUPLE_ROUNDs
+/* Four consecutive rounds applied in place to state[0..3].
+ * The expansion is a GNU statement expression made of four groups. Each group
+ * runs a fixed sequence of NOR/XOR updates and SWAPMOVEs on the state words,
+ * XORs in four words from rtk1 and four words from rtk2_3, then calls
+ * mixcolumns_0, _1, _2 and _3 respectively. Over the four groups the macro
+ * reads (rtk1)[0..15] and (rtk2_3)[0..15].
+ * Requirements on the expansion site:
+ *   - a variable named tmp must be in scope (SWAPMOVE assigns to it);
+ *   - SWAPMOVE and mixcolumns_0..mixcolumns_3 must be visible;
+ *   - state is pasted unparenthesised and many times, so pass a plain array
+ *     or pointer name without side effects. */
+#define QUADRUPLE_ROUND(state, rtk1, rtk2_3) ({			\
+	state[3] ^= ~(state[0] | state[1]);					\
+	SWAPMOVE(state[2], state[1], 0x55555555, 1);		\
+	SWAPMOVE(state[3], state[2], 0x55555555, 1);		\
+	state[1] ^= ~(state[2] | state[3]);					\
+	SWAPMOVE(state[1], state[0], 0x55555555, 1);		\
+	SWAPMOVE(state[0], state[3], 0x55555555, 1);		\
+	state[3] ^= ~(state[0] | state[1]);					\
+	SWAPMOVE(state[2], state[1], 0x55555555, 1);		\
+	SWAPMOVE(state[3], state[2], 0x55555555, 1);		\
+	state[1] ^= (state[2] | state[3]);					\
+	SWAPMOVE(state[3], state[0], 0x55555555, 0);		\
+	state[0] ^= (rtk1)[0];								\
+	state[1] ^= (rtk1)[1];								\
+	state[2] ^= (rtk1)[2];								\
+	state[3] ^= (rtk1)[3];								\
+	state[0] ^= (rtk2_3)[0];							\
+	state[1] ^= (rtk2_3)[1];							\
+	state[2] ^= (rtk2_3)[2];							\
+	state[3] ^= (rtk2_3)[3];							\
+	mixcolumns_0(state);								\
+	state[1] ^= ~(state[2] | state[3]); 				\
+	SWAPMOVE(state[1], state[0], 0x55555555, 1);		\
+	SWAPMOVE(state[0], state[3], 0x55555555, 1);		\
+	state[3] ^= ~(state[0] | state[1]);					\
+	SWAPMOVE(state[2], state[1], 0x55555555, 1);		\
+	SWAPMOVE(state[3], state[2], 0x55555555, 1);		\
+	state[1] ^= ~(state[2] | state[3]);					\
+	SWAPMOVE(state[1], state[0], 0x55555555, 1);		\
+	SWAPMOVE(state[0], state[3], 0x55555555, 1);		\
+	state[3] ^= (state[0] | state[1]);					\
+	SWAPMOVE(state[1], state[2], 0x55555555, 0);		\
+	state[0] ^= (rtk1)[4];								\
+	state[1] ^= (rtk1)[5];								\
+	state[2] ^= (rtk1)[6];								\
+	state[3] ^= (rtk1)[7];								\
+	state[0] ^= (rtk2_3)[4];							\
+	state[1] ^= (rtk2_3)[5];							\
+	state[2] ^= (rtk2_3)[6];							\
+	state[3] ^= (rtk2_3)[7];							\
+	mixcolumns_1(state);								\
+	state[3] ^= ~(state[0] | state[1]);					\
+	SWAPMOVE(state[2], state[1], 0x55555555, 1);		\
+	SWAPMOVE(state[3], state[2], 0x55555555, 1);		\
+	state[1] ^= ~(state[2] | state[3]);					\
+	SWAPMOVE(state[1], state[0], 0x55555555, 1);		\
+	SWAPMOVE(state[0], state[3], 0x55555555, 1);		\
+	state[3] ^= ~(state[0] | state[1]);					\
+	SWAPMOVE(state[2], state[1], 0x55555555, 1);		\
+	SWAPMOVE(state[3], state[2], 0x55555555, 1);		\
+	state[1] ^= (state[2] | state[3]);					\
+	SWAPMOVE(state[3], state[0], 0x55555555, 0);		\
+	state[0] ^= (rtk1)[8];								\
+	state[1] ^= (rtk1)[9];								\
+	state[2] ^= (rtk1)[10];								\
+	state[3] ^= (rtk1)[11];								\
+	state[0] ^= (rtk2_3)[8];							\
+	state[1] ^= (rtk2_3)[9];							\
+	state[2] ^= (rtk2_3)[10];							\
+	state[3] ^= (rtk2_3)[11];							\
+	mixcolumns_2(state);								\
+	state[1] ^= ~(state[2] | state[3]); 				\
+	SWAPMOVE(state[1], state[0], 0x55555555, 1);		\
+	SWAPMOVE(state[0], state[3], 0x55555555, 1);		\
+	state[3] ^= ~(state[0] | state[1]);					\
+	SWAPMOVE(state[2], state[1], 0x55555555, 1);		\
+	SWAPMOVE(state[3], state[2], 0x55555555, 1);		\
+	state[1] ^= ~(state[2] | state[3]);					\
+	SWAPMOVE(state[1], state[0], 0x55555555, 1);		\
+	SWAPMOVE(state[0], state[3], 0x55555555, 1);		\
+	state[3] ^= (state[0] | state[1]);					\
+	SWAPMOVE(state[1], state[2], 0x55555555, 0);		\
+	state[0] ^= (rtk1)[12];								\
+	state[1] ^= (rtk1)[13];								\
+	state[2] ^= (rtk1)[14];								\
+	state[3] ^= (rtk1)[15];								\
+	state[0] ^= (rtk2_3)[12];							\
+	state[1] ^= (rtk2_3)[13];							\
+	state[2] ^= (rtk2_3)[14];							\
+	state[3] ^= (rtk2_3)[15];							\
+	mixcolumns_3(state);								\
+})
+
+
+#endif  // SKINNY128_H_
\ No newline at end of file
--- a/romulus/Implementations/crypto_aead/romulusm1+v12/opt32/tk_schedule.c
+++ b/romulus/Implementations/crypto_aead/romulusm1+v12/opt32/tk_schedule.c
--- a/romulus/Implementations/crypto_aead/romulusm1+v12/opt32/tk_schedule.h
+++ b/romulus/Implementations/crypto_aead/romulusm1+v12/opt32/tk_schedule.h
+#ifndef TK_SCHEDULE_H_
+#define TK_SCHEDULE_H_
+
+typedef unsigned char u8;	// byte
+typedef unsigned int u32;	// word type; ROR below assumes it is 32 bits wide
+
+void packing(u32* out, const u8* in);	// skinny128.c uses it to go from byte to bitsliced representation
+void unpacking(u8* out, u32 *in);	// skinny128.c uses it to go from bitsliced to byte representation
+void precompute_rtk2_3(u32* rtk, const u8* tk2, const u8* tk3);	// presumably fills rtk from tk2 and tk3 -- body not visible, confirm
+void precompute_rtk1(u32* rtk1, const u8* tk1);	// presumably fills rtk1 from tk1 -- body not visible, confirm
+/* Rotate the 32-bit value x right by y bits.
+ * y must be in 1..31: y == 0 would shift by 32, which is undefined for a
+ * 32-bit operand. x and y are each expanded twice, so avoid side effects. */
+#define ROR(x,y) 		(((x) >> (y)) | ((x) << (32 - (y))))
+/* x[i] ^= y[i] for i = 0..3: XORs the four elements of y into x in place.
+ * Expands to a GNU statement expression; x and y are each expanded four
+ * times, so avoid side effects in the arguments. */
+#define XOR_BLOCKS(x,y) ({ 			\
+	(x)[0] ^= (y)[0];				\
+	(x)[1] ^= (y)[1];				\
+	(x)[2] ^= (y)[2];				\
+	(x)[3] ^= (y)[3];				\
+})
+	
+//Swaps the bits of b selected by mask with the bits of a selected by
+//(mask << n). Every argument is parenthesised in the expansion so that
+//expression arguments (e.g. a mask built with |) keep their meaning.
+//a and b must be lvalues and are expanded more than once; a variable named
+//tmp, wide enough for the operands, must exist in the calling scope.
+#define SWAPMOVE(a, b, mask, n)	({	\
+	tmp = ((b) ^ ((a) >> (n))) & (mask);	\
+	(b) ^= tmp;						\
+	(a) ^= (tmp << (n));				\
+})
+/* Builds a 32-bit word from the four bytes y[0..3], with y[0] as the least
+ * significant byte (little-endian), and stores it through the pointer x.
+ * NOTE(review): the expansion already ends with ';', so "LE_LOAD(a, b);"
+ * yields an extra empty statement -- take care inside an unbraced if/else. */
+#define LE_LOAD(x, y) 				\
+	*(x) = (((u32)(y)[3] << 24) | 	\
+		((u32)(y)[2] << 16) 	| 	\
+		((u32)(y)[1] << 8) 		| 	\
+		(y)[0]);
+/* Writes the value y into the four bytes x[0..3], least significant byte
+ * first (little-endian). y is expanded four times.
+ * NOTE(review): this expands to four separate statements that are not
+ * wrapped in a block, so only the first one would be governed by an unbraced
+ * if/else/loop; the expansion also already ends with ';'. */
+#define LE_STORE(x, y)				\
+	(x)[0] = (y) & 0xff; 			\
+	(x)[1] = ((y) >> 8) & 0xff; 	\
+	(x)[2] = ((y) >> 16) & 0xff; 	\
+	(x)[3] = (y) >> 24;
+
+#endif  // TK_SCHEDULE_H_
\ No newline at end of file
--- a/romulus/Implementations/crypto_aead/romulusm1v12/armcortexm/api.h
+++ b/romulus/Implementations/crypto_aead/romulusm1v12/armcortexm/api.h
+#define CRYPTO_KEYBYTES 16   // key length in bytes
+#define CRYPTO_NSECBYTES 0   // length of the secret message number: zero, i.e. none is used
+#define CRYPTO_NPUBBYTES 16  // public nonce length in bytes
+#define CRYPTO_ABYTES 16     // presumably the ciphertext expansion (tag length) in bytes -- confirm
+#define CRYPTO_NOOVERLAP 1   // presumably declares that input/output buffers must not overlap -- confirm
--- a/romulus/Implementations/crypto_aead/romulusm1v12/armcortexm/crypto_aead.h
+++ b/romulus/Implementations/crypto_aead/romulusm1v12/armcortexm/crypto_aead.h
+/* AEAD encryption entry point, declared with the signature required by the
+ * NIST LWC competition API. Only the prototype is visible in this header.
+ * Parameter roles follow the NIST LWC API convention -- confirm against the
+ * implementation:
+ *   c, clen   : ciphertext output buffer and where its length is written
+ *   m, mlen   : message input and its length in bytes
+ *   ad, adlen : associated data and its length in bytes
+ *   nsec      : secret message number (api.h sets CRYPTO_NSECBYTES to 0)
+ *   npub      : public nonce (CRYPTO_NPUBBYTES bytes)
+ *   k         : key (CRYPTO_KEYBYTES bytes)
+ * Presumably returns 0 on success. */
+int crypto_aead_encrypt(unsigned char *c, unsigned long long *clen,
+                        const unsigned char *m, unsigned long long mlen,
+                        const unsigned char *ad, unsigned long long adlen,
+                        const unsigned char *nsec, const unsigned char *npub,
+                        const unsigned char *k);
+
+/* AEAD decryption/verification entry point, declared with the signature
+ * required by the NIST LWC competition API. Only the prototype is visible here.
+ * Parameter roles follow the NIST LWC API convention -- confirm against the
+ * implementation:
+ *   m, outputmlen : recovered message buffer and where its length is written
+ *   nsec          : secret message number (api.h sets CRYPTO_NSECBYTES to 0)
+ *   c, clen       : ciphertext input and its length in bytes
+ *   ad, adlen     : associated data and its length in bytes
+ *   npub          : public nonce (CRYPTO_NPUBBYTES bytes)
+ *   k             : key (CRYPTO_KEYBYTES bytes)
+ * Presumably returns 0 on success and non-zero when verification fails. */
+int crypto_aead_decrypt(unsigned char *m, unsigned long long *outputmlen,
+                        unsigned char *nsec,
+                        const unsigned char *c, unsigned long long clen,
+                        const unsigned char *ad, unsigned long long adlen,
+                        const unsigned char *npub, const unsigned char *k);
--- a/romulus/Implementations/crypto_aead/romulusm1v12/armcortexm/encrypt.c
+++ b/romulus/Implementations/crypto_aead/romulusm1v12/armcortexm/encrypt.c
--- a/romulus/Implementations/crypto_aead/romulusm1v12/armcortexm/romulus.h
+++ b/romulus/Implementations/crypto_aead/romulusm1v12/armcortexm/romulus.h
+#ifndef ROMULUSN1_H_
+#define ROMULUSN1_H_
+
+#include "skinny128.h"
+
+typedef unsigned char u8;
+typedef unsigned int u32;
+//u64 must really be 64 bits wide: unsigned int is only 32 bits on the usual
+//ABIs, which would truncate an unsigned long long length stored in a u64
+typedef unsigned long long u64;
+typedef struct {
+    u8 tk1[16];                         //raw tk1 bytes, kept to manipulate tk1 byte-wise (SET_DOMAIN writes tk1[7])
+    u32 rtk1[4*16];                     //64 words derived from tk1, cached to avoid tk schedule recomputations
+    u32 rtk[4*SKINNY128_384_ROUNDS];    //all round tweakeys: 4 words for each of the SKINNY128_384_ROUNDS rounds
+} skinny_128_384_tks;
+
+#define TAGBYTES    16  //presumably the authentication tag length in bytes
+#define KEYBYTES    16  //presumably the key length in bytes
+#define BLOCKBYTES  16  //block length in bytes (128-bit blocks)
+//store domain in byte 7 of tk1; tks is a skinny_128_384_tks object, not a pointer
+#define SET_DOMAIN(tks, domain) ((tks).tk1[7] = (domain))
+
+/* G as defined in the Romulus specification, in a 32-bit word-wise manner.
+ * For each byte b of the 16-byte block y, the matching byte of x becomes
+ * (b >> 1) with its top bit set to (bit 7 of b) XOR (bit 0 of b).
+ * x and y are cast to u32* and accessed as four words each.
+ * NOTE(review): this assumes both buffers are suitably aligned -- confirm.
+ * Each word of y is loaded into tmp before the matching word of x is stored,
+ * so x may be the same buffer as y.
+ * Requires a 32-bit unsigned variable named tmp in the calling scope. */
+#define G(x,y) ({                                                                   \
+    tmp = ((u32*)(y))[0];                                                           \
+    ((u32*)(x))[0] = (tmp >> 1 & 0x7f7f7f7f) ^ ((tmp ^ (tmp << 7)) & 0x80808080);   \
+    tmp = ((u32*)(y))[1];                                                           \
+    ((u32*)(x))[1] = (tmp >> 1 & 0x7f7f7f7f) ^ ((tmp ^ (tmp << 7)) & 0x80808080);   \
+    tmp = ((u32*)(y))[2];                                                           \
+    ((u32*)(x))[2] = (tmp >> 1 & 0x7f7f7f7f) ^ ((tmp ^ (tmp << 7)) & 0x80808080);   \
+    tmp = ((u32*)(y))[3];                                                           \
+    ((u32*)(x))[3] = (tmp >> 1 & 0x7f7f7f7f) ^ ((tmp ^ (tmp << 7)) & 0x80808080);   \
+})
+
+/* Update the counter in tk1 in a 32-bit word-wise manner.
+ * Word 0 together with the low 24 bits of word 1 is shifted left by one bit:
+ * bit 31 of word 0 carries into bit 0 of word 1, and the top byte of word 1 is
+ * kept unchanged. If bit 23 of the old word 1 (the bit shifted out) was set,
+ * 0x95 is XORed into word 0.
+ * NOTE(review): tk1 is accessed through a u32* cast, so the preserved "top
+ * byte" is tk1[7] (the byte SET_DOMAIN writes) only on a little-endian
+ * target -- confirm for other targets.
+ * Requires a 32-bit unsigned variable named tmp in the calling scope. */
+#define UPDATE_CTR(tk1) ({                              \
+    tmp = ((u32*)(tk1))[1];                             \
+    ((u32*)(tk1))[1] = (tmp << 1) & 0x00ffffff;         \
+    ((u32*)(tk1))[1] |= (((u32*)(tk1))[0] >> 31);       \
+    ((u32*)(tk1))[1] |= tmp & 0xff000000;               \
+    ((u32*)(tk1))[0] <<= 1;                             \
+    if ((tmp >> 23) & 0x01)                             \
+        ((u32*)(tk1))[0] ^= 0x95;                       \
+})
+
+/* x <- y ^ z for 128-bit blocks, computed as four 32-bit XORs.
+ * All three arguments are cast to u32*. Word i of y and z is read before
+ * word i of x is written, so x may be the same buffer as y and/or z. */
+#define XOR_BLOCK(x,y,z) ({                             \
+    ((u32*)(x))[0] = ((u32*)(y))[0] ^ ((u32*)(z))[0];   \
+    ((u32*)(x))[1] = ((u32*)(y))[1] ^ ((u32*)(z))[1];   \
+    ((u32*)(x))[2] = ((u32*)(y))[2] ^ ((u32*)(z))[2];   \
+    ((u32*)(x))[3] = ((u32*)(y))[3] ^ ((u32*)(z))[3];   \
+})
+
+
+//Rho as defined in the Romulus specification: with pad = G(x) computed from
+//the old x, sets x <- x ^ z and y <- pad ^ z.
+//pad is used as a tmp variable and x is updated BEFORE y is written, so z is
+//read for the last time before y is stored: y may be the same buffer as z.
+//Requires a 16-byte buffer named pad and a u32 named tmp in the calling scope.
+#define RHO(x,y,z) ({       \
+    G(pad,x);               \
+    XOR_BLOCK(x, x, z);     \
+    XOR_BLOCK(y, pad, z);   \
+})
+
+//Rho inverse as defined in the Romulus specification
+/* With pad = G(x) computed from the old x: z <- pad ^ y, then x <- x ^ z using
+ * the freshly written z. pad is used as a tmp variable in case y = z: z is
+ * produced word by word from pad and y, so z may be the same buffer as y.
+ * Requires a 16-byte buffer named pad and a u32 named tmp (used by G) in the
+ * calling scope. */
+#define RHO_INV(x, y, z) ({ \
+    G(pad, x);              \
+    XOR_BLOCK(z, pad, y);   \
+    XOR_BLOCK(x, x, z);     \
+})
+
+#endif  // ROMULUSN1_H_
\ No newline at end of file
--- a/romulus/Implementations/crypto_aead/romulusm1v12/armcortexm/skinny128.h
+++ b/romulus/Implementations/crypto_aead/romulusm1v12/armcortexm/skinny128.h
+#ifndef SKINNY128_H_
+#define SKINNY128_H_
+
+typedef unsigned char u8;	// byte
+typedef unsigned int u32;	// word type used for round tweakeys; presumably assumed to be 32 bits wide
+
+#define SKINNY128_128_ROUNDS	40	// presumably the round count of SKINNY-128-128 (not referenced in this header)
+#define SKINNY128_256_ROUNDS	48	// presumably the round count of SKINNY-128-256 (not referenced in this header)
+#define SKINNY128_384_ROUNDS	56	// number of rounds; romulus.h sizes its round-tweakey array as 4 words per round
+/* Routines declared extern: their definitions are not in this header
+ * (presumably in the accompanying skinny128.s -- confirm).
+ * Roles below are inferred from names and signatures only -- TODO confirm:
+ *   skinny128_384       : encrypts the block ptext into ctext using the
+ *                         round tweakeys tk and the tk1 round words rtk1
+ *   tkschedule_lfsr     : fills rtk from tk2 and tk3 for `rounds` rounds
+ *   tkschedule_perm     : further in-place processing of rtk
+ *   tkschedule_perm_tk1 : fills rtk1 from tk1 */
+extern void skinny128_384(u8* ctext, const u32* tk, const u8* ptext, const u32* rtk1);
+extern void tkschedule_lfsr(u32* rtk, const u8* tk2, const u8* tk3, const int rounds);
+extern void tkschedule_perm(u32* rtk);
+extern void tkschedule_perm_tk1(u32* rtk1, const u8* tk1);
+
+
+#endif  // SKINNY128_H_
\ No newline at end of file
--- a/romulus/Implementations/crypto_aead/romulusm1v12/armcortexm/skinny128.s
+++ b/romulus/Implementations/crypto_aead/romulusm1v12/armcortexm/skinny128.s
--- a/romulus/Implementations/crypto_aead/romulusm1v12/opt32/api.h
+++ b/romulus/Implementations/crypto_aead/romulusm1v12/opt32/api.h
+#define CRYPTO_KEYBYTES 16   // key length in bytes
+#define CRYPTO_NSECBYTES 0   // length of the secret message number: zero, i.e. none is used
+#define CRYPTO_NPUBBYTES 16  // public nonce length in bytes
+#define CRYPTO_ABYTES 16     // presumably the ciphertext expansion (tag length) in bytes -- confirm
+#define CRYPTO_NOOVERLAP 1   // presumably declares that input/output buffers must not overlap -- confirm
--- a/romulus/Implementations/crypto_aead/romulusm1v12/opt32/crypto_aead.h
+++ b/romulus/Implementations/crypto_aead/romulusm1v12/opt32/crypto_aead.h
+/* AEAD encryption entry point; only the prototype is visible in this header.
+ * Parameter roles follow the NIST LWC API convention -- confirm against the
+ * implementation:
+ *   c, clen   : ciphertext output buffer and where its length is written
+ *   m, mlen   : message input and its length in bytes
+ *   ad, adlen : associated data and its length in bytes
+ *   nsec      : secret message number (api.h sets CRYPTO_NSECBYTES to 0)
+ *   npub      : public nonce (CRYPTO_NPUBBYTES bytes)
+ *   k         : key (CRYPTO_KEYBYTES bytes)
+ * Presumably returns 0 on success. */
+int crypto_aead_encrypt(
+	unsigned char *c, unsigned long long *clen,
+	const unsigned char *m, unsigned long long mlen,
+	const unsigned char *ad, unsigned long long adlen,
+	const unsigned char *nsec,
+	const unsigned char *npub,
+	const unsigned char *k
+);
+/* AEAD decryption/verification entry point; only the prototype is visible in
+ * this header. Parameter roles follow the NIST LWC API convention -- confirm
+ * against the implementation:
+ *   m, mlen   : recovered message buffer and where its length is written
+ *   nsec      : secret message number (api.h sets CRYPTO_NSECBYTES to 0)
+ *   c, clen   : ciphertext input and its length in bytes
+ *   ad, adlen : associated data and its length in bytes
+ *   npub      : public nonce (CRYPTO_NPUBBYTES bytes)
+ *   k         : key (CRYPTO_KEYBYTES bytes)
+ * Presumably returns 0 on success and non-zero when verification fails. */
+int crypto_aead_decrypt(
+	unsigned char *m, unsigned long long *mlen,
+	unsigned char *nsec,
+	const unsigned char *c, unsigned long long clen,
+	const unsigned char *ad, unsigned long long adlen,
+	const unsigned char *npub,
+	const unsigned char *k
+);
\ No newline at end of file
--- a/romulus/Implementations/crypto_aead/romulusm1v12/opt32/encrypt.c
+++ b/romulus/Implementations/crypto_aead/romulusm1v12/opt32/encrypt.c
--- a/romulus/Implementations/crypto_aead/romulusm1v12/opt32/romulus.h
+++ b/romulus/Implementations/crypto_aead/romulusm1v12/opt32/romulus.h
+#ifndef ROMULUSN1_H_
+#define ROMULUSN1_H_
+
+#include "skinny128.h"
+
+typedef unsigned char u8;
+typedef unsigned int u32;
+//u64 must really be 64 bits wide: unsigned int is only 32 bits on the usual
+//ABIs, which would truncate an unsigned long long length stored in a u64
+typedef unsigned long long u64;
+typedef struct {
+    u8 tk1[16];                         //raw tk1 bytes, kept to manipulate tk1 byte-wise (SET_DOMAIN writes tk1[7])
+    u32 rtk1[4*16];                     //64 words derived from tk1, cached to avoid tk schedule recomputations
+    u32 rtk2_3[4*SKINNY128_384_ROUNDS]; //all round tweakeys: 4 words for each of the SKINNY128_384_ROUNDS rounds
+} skinny_128_384_tks;
+
+#define TAGBYTES    16  //presumably the authentication tag length in bytes
+#define KEYBYTES    16  //presumably the key length in bytes
+#define BLOCKBYTES  16  //block length in bytes (128-bit blocks)
+//store domain in byte 7 of tk1; tks is a skinny_128_384_tks object, not a pointer
+#define SET_DOMAIN(tks, domain) ((tks).tk1[7] = (domain))
+
+/* G as defined in the Romulus specification, in a 32-bit word-wise manner.
+ * For each byte b of the 16-byte block y, the matching byte of x becomes
+ * (b >> 1) with its top bit set to (bit 7 of b) XOR (bit 0 of b).
+ * x and y are cast to u32* and accessed as four words each.
+ * NOTE(review): this assumes both buffers are suitably aligned -- confirm.
+ * Each word of y is loaded into tmp before the matching word of x is stored,
+ * so x may be the same buffer as y.
+ * Requires a 32-bit unsigned variable named tmp in the calling scope. */
+#define G(x,y) ({                                                                   \
+    tmp = ((u32*)(y))[0];                                                           \
+    ((u32*)(x))[0] = (tmp >> 1 & 0x7f7f7f7f) ^ ((tmp ^ (tmp << 7)) & 0x80808080);   \
+    tmp = ((u32*)(y))[1];                                                           \
+    ((u32*)(x))[1] = (tmp >> 1 & 0x7f7f7f7f) ^ ((tmp ^ (tmp << 7)) & 0x80808080);   \
+    tmp = ((u32*)(y))[2];                                                           \
+    ((u32*)(x))[2] = (tmp >> 1 & 0x7f7f7f7f) ^ ((tmp ^ (tmp << 7)) & 0x80808080);   \
+    tmp = ((u32*)(y))[3];                                                           \
+    ((u32*)(x))[3] = (tmp >> 1 & 0x7f7f7f7f) ^ ((tmp ^ (tmp << 7)) & 0x80808080);   \
+})
+
+/* Update the counter in tk1 in a 32-bit word-wise manner.
+ * Word 0 together with the low 24 bits of word 1 is shifted left by one bit:
+ * bit 31 of word 0 carries into bit 0 of word 1, and the top byte of word 1 is
+ * kept unchanged. If bit 23 of the old word 1 (the bit shifted out) was set,
+ * 0x95 is XORed into word 0.
+ * NOTE(review): tk1 is accessed through a u32* cast, so the preserved "top
+ * byte" is tk1[7] (the byte SET_DOMAIN writes) only on a little-endian
+ * target -- confirm for other targets.
+ * Requires a 32-bit unsigned variable named tmp in the calling scope. */
+#define UPDATE_CTR(tk1) ({                              \
+    tmp = ((u32*)(tk1))[1];                             \
+    ((u32*)(tk1))[1] = (tmp << 1) & 0x00ffffff;         \
+    ((u32*)(tk1))[1] |= (((u32*)(tk1))[0] >> 31);       \
+    ((u32*)(tk1))[1] |= tmp & 0xff000000;               \
+    ((u32*)(tk1))[0] <<= 1;                             \
+    if ((tmp >> 23) & 0x01)                             \
+        ((u32*)(tk1))[0] ^= 0x95;                       \
+})
+
+/* x <- y ^ z for 128-bit blocks, computed as four 32-bit XORs.
+ * All three arguments are cast to u32*. Word i of y and z is read before
+ * word i of x is written, so x may be the same buffer as y and/or z. */
+#define XOR_BLOCK(x,y,z) ({                             \
+    ((u32*)(x))[0] = ((u32*)(y))[0] ^ ((u32*)(z))[0];   \
+    ((u32*)(x))[1] = ((u32*)(y))[1] ^ ((u32*)(z))[1];   \
+    ((u32*)(x))[2] = ((u32*)(y))[2] ^ ((u32*)(z))[2];   \
+    ((u32*)(x))[3] = ((u32*)(y))[3] ^ ((u32*)(z))[3];   \
+})
+
+
+//Rho as defined in the Romulus specification: with pad = G(x) computed from
+//the old x, sets x <- x ^ z and y <- pad ^ z.
+//pad is used as a tmp variable and x is updated BEFORE y is written, so z is
+//read for the last time before y is stored: y may be the same buffer as z.
+//Requires a 16-byte buffer named pad and a u32 named tmp in the calling scope.
+#define RHO(x,y,z) ({       \
+    G(pad,x);               \
+    XOR_BLOCK(x, x, z);     \
+    XOR_BLOCK(y, pad, z);   \
+})
+
+//Rho inverse as defined in the Romulus specification
+/* With pad = G(x) computed from the old x: z <- pad ^ y, then x <- x ^ z using
+ * the freshly written z. pad is used as a tmp variable in case y = z: z is
+ * produced word by word from pad and y, so z may be the same buffer as y.
+ * Requires a 16-byte buffer named pad and a u32 named tmp (used by G) in the
+ * calling scope. */
+#define RHO_INV(x, y, z) ({ \
+    G(pad, x);              \
+    XOR_BLOCK(z, pad, y);   \
+    XOR_BLOCK(x, x, z);     \
+})
+
+#endif  // ROMULUSN1_H_
\ No newline at end of file
--- a/romulus/Implementations/crypto_aead/romulusm1v12/opt32/skinny128.c
+++ b/romulus/Implementations/crypto_aead/romulusm1v12/opt32/skinny128.c
+/******************************************************************************
+* Constant-time implementation of the SKINNY tweakable block ciphers.
+*
+* This implementation doesn't compute the ShiftRows operation. Some masks and
+* shifts are applied during the MixColumns operation so that the proper bits
+* are XORed together. Moreover, the row permutation within the MixColumns 
+* is omitted, as well as the bit permutation at the end of the Sbox. The rows
+* are synchronized with the classical representation after only 4 rounds.
+* Therefore, this implementation relies on a "QUADRUPLE_ROUND" routine.
+*
+* The Sbox computation takes advantage of some symmetry in the 8-bit Sbox to
+* turn it into a 4-bit S-box computation. Although the last bit permutation
+* within the Sbox is not computed, the bit ordering is synchronized with the 
+* classical representation after 2 calls.
+*
+* @author	Alexandre Adomnicai, Nanyang Technological University,
+*			alexandre.adomnicai@ntu.edu.sg
+*
+* @date		May 2020
+******************************************************************************/
+#include <stdio.h>
+#include <string.h>
+#include "skinny128.h"
+#include "tk_schedule.h"
+
+/******************************************************************************
+* MixColumns step used for rounds i with (i % 4) == 0.
+*
+* Every one of the four 32-bit words of state is updated in place, on its own,
+* by three consecutive rotate / mask / rotate / XOR steps.
+******************************************************************************/
+void mixcolumns_0(u32* state) {
+	for(u32* word = state; word != state + 4; word++) {
+		u32 picked = ROR(*word,24) & 0x0c0c0c0c;
+		*word ^= ROR(picked,30);
+		picked = ROR(*word,16) & 0xc0c0c0c0;
+		*word ^= ROR(picked,4);
+		picked = ROR(*word,8) & 0x0c0c0c0c;
+		*word ^= ROR(picked,2);
+	}
+}
+
+/******************************************************************************
+* MixColumns step used for rounds i with (i % 4) == 1.
+*
+* Every one of the four 32-bit words of state is updated in place, on its own,
+* by three consecutive mask / rotate / XOR steps.
+******************************************************************************/
+void mixcolumns_1(u32* state) {
+	for(u32* word = state; word != state + 4; word++) {
+		u32 picked = ROR(*word,16) & 0x30303030;
+		*word ^= ROR(picked,30);
+		picked = *word & 0x03030303;
+		*word ^= ROR(picked,28);
+		picked = ROR(*word,16) & 0x30303030;
+		*word ^= ROR(picked,2);
+	}
+}
+
+/******************************************************************************
+* MixColumns step used for rounds i with (i % 4) == 2.
+*
+* Every one of the four 32-bit words of state is updated in place, on its own,
+* by three consecutive rotate / mask / rotate / XOR steps.
+******************************************************************************/
+void mixcolumns_2(u32* state) {
+	for(u32* word = state; word != state + 4; word++) {
+		u32 picked = ROR(*word,8) & 0xc0c0c0c0;
+		*word ^= ROR(picked,6);
+		picked = ROR(*word,16) & 0x0c0c0c0c;
+		*word ^= ROR(picked,28);
+		picked = ROR(*word,24) & 0xc0c0c0c0;
+		*word ^= ROR(picked,2);
+	}
+}
+
+/******************************************************************************
+* MixColumns step used for rounds i with (i % 4) == 3.
+*
+* Every one of the four 32-bit words of state is updated in place, on its own,
+* by three consecutive mask / rotate / XOR steps.
+******************************************************************************/
+void mixcolumns_3(u32* state) {
+	for(u32* word = state; word != state + 4; word++) {
+		u32 picked = *word & 0x03030303;
+		*word ^= ROR(picked,30);
+		picked = *word & 0x30303030;
+		*word ^= ROR(picked,4);
+		picked = *word & 0x03030303;
+		*word ^= ROR(picked,26);
+	}
+}
+
+/******************************************************************************
+* Single-block encryption with SKINNY-128-384, without any operation mode.
+*
+* ptext is packed into a 4-word bitsliced state, fourteen QUADRUPLE_ROUNDs are
+* run on it and the result is unpacked into ctext.
+* RTK1 and RTK2_3 are passed separately because TK2 and TK3 stay the same for
+* the whole data encryption/decryption: rtk2_3 advances by 16 words on every
+* quadruple round, whereas the rtk1 offset wraps around after 4 of them
+* (0, 16, 32, 48, 0, ...).
+******************************************************************************/
+void skinny128_384(u8* ctext, const u8* ptext, const u32* rtk1,  const u32* rtk2_3) {
+	u32 tmp; 					// scratch word assigned by the SWAPMOVE macro
+	u32 block[4]; 				// 128-bit state in bitsliced form
+	packing(block, ptext); 		// bytes -> bitsliced representation
+	for(int quad = 0; quad < 14; quad++) {
+		QUADRUPLE_ROUND(block, rtk1 + 16*(quad % 4), rtk2_3 + 16*quad);
+	}
+	unpacking(ctext, block);	// bitsliced -> byte representation
+}
\ No newline at end of file
--- a/romulus/Implementations/crypto_aead/romulusm1v12/opt32/skinny128.h
+++ b/romulus/Implementations/crypto_aead/romulusm1v12/opt32/skinny128.h
+#ifndef SKINNY128_H_
+#define SKINNY128_H_
+
+typedef unsigned char u8;
+typedef unsigned int u32;
+
+#define SKINNY128_384_ROUNDS	56
+
+// QUADRUPLE_ROUND(state, rtk1, rtk2_3): applies four consecutive rounds to the
+// 4-word state, in place. Each round r (0..3) consists of:
+//   - a sequence of (N)OR-XOR and SWAPMOVE steps on the state words
+//     (presumably the bitsliced S-box layer -- confirm against skinny128.c),
+//   - XOR of rtk1[4r..4r+3] and rtk2_3[4r..4r+3] into state[0..3],
+//   - a call to mixcolumns_<r>(state).
+// In total 16 words are read from each of rtk1 and rtk2_3.
+// Requirements at the expansion site: a variable named 'tmp' (used by
+// SWAPMOVE), the SWAPMOVE macro, and mixcolumns_0..3 must be visible.
+// 'state' is expanded many times and must be a side-effect-free expression.
+// The body is a GNU statement expression ({ ... }).
+#define QUADRUPLE_ROUND(state, rtk1, rtk2_3) ({			\
+	state[3] ^= ~(state[0] | state[1]);					\
+	SWAPMOVE(state[2], state[1], 0x55555555, 1);		\
+	SWAPMOVE(state[3], state[2], 0x55555555, 1);		\
+	state[1] ^= ~(state[2] | state[3]);					\
+	SWAPMOVE(state[1], state[0], 0x55555555, 1);		\
+	SWAPMOVE(state[0], state[3], 0x55555555, 1);		\
+	state[3] ^= ~(state[0] | state[1]);					\
+	SWAPMOVE(state[2], state[1], 0x55555555, 1);		\
+	SWAPMOVE(state[3], state[2], 0x55555555, 1);		\
+	state[1] ^= (state[2] | state[3]);					\
+	SWAPMOVE(state[3], state[0], 0x55555555, 0);		\
+	state[0] ^= (rtk1)[0];								\
+	state[1] ^= (rtk1)[1];								\
+	state[2] ^= (rtk1)[2];								\
+	state[3] ^= (rtk1)[3];								\
+	state[0] ^= (rtk2_3)[0];							\
+	state[1] ^= (rtk2_3)[1];							\
+	state[2] ^= (rtk2_3)[2];							\
+	state[3] ^= (rtk2_3)[3];							\
+	mixcolumns_0(state);								\
+	state[1] ^= ~(state[2] | state[3]); 				\
+	SWAPMOVE(state[1], state[0], 0x55555555, 1);		\
+	SWAPMOVE(state[0], state[3], 0x55555555, 1);		\
+	state[3] ^= ~(state[0] | state[1]);					\
+	SWAPMOVE(state[2], state[1], 0x55555555, 1);		\
+	SWAPMOVE(state[3], state[2], 0x55555555, 1);		\
+	state[1] ^= ~(state[2] | state[3]);					\
+	SWAPMOVE(state[1], state[0], 0x55555555, 1);		\
+	SWAPMOVE(state[0], state[3], 0x55555555, 1);		\
+	state[3] ^= (state[0] | state[1]);					\
+	SWAPMOVE(state[1], state[2], 0x55555555, 0);		\
+	state[0] ^= (rtk1)[4];								\
+	state[1] ^= (rtk1)[5];								\
+	state[2] ^= (rtk1)[6];								\
+	state[3] ^= (rtk1)[7];								\
+	state[0] ^= (rtk2_3)[4];							\
+	state[1] ^= (rtk2_3)[5];							\
+	state[2] ^= (rtk2_3)[6];							\
+	state[3] ^= (rtk2_3)[7];							\
+	mixcolumns_1(state);								\
+	state[3] ^= ~(state[0] | state[1]);					\
+	SWAPMOVE(state[2], state[1], 0x55555555, 1);		\
+	SWAPMOVE(state[3], state[2], 0x55555555, 1);		\
+	state[1] ^= ~(state[2] | state[3]);					\
+	SWAPMOVE(state[1], state[0], 0x55555555, 1);		\
+	SWAPMOVE(state[0], state[3], 0x55555555, 1);		\
+	state[3] ^= ~(state[0] | state[1]);					\
+	SWAPMOVE(state[2], state[1], 0x55555555, 1);		\
+	SWAPMOVE(state[3], state[2], 0x55555555, 1);		\
+	state[1] ^= (state[2] | state[3]);					\
+	SWAPMOVE(state[3], state[0], 0x55555555, 0);		\
+	state[0] ^= (rtk1)[8];								\
+	state[1] ^= (rtk1)[9];								\
+	state[2] ^= (rtk1)[10];								\
+	state[3] ^= (rtk1)[11];								\
+	state[0] ^= (rtk2_3)[8];							\
+	state[1] ^= (rtk2_3)[9];							\
+	state[2] ^= (rtk2_3)[10];							\
+	state[3] ^= (rtk2_3)[11];							\
+	mixcolumns_2(state);								\
+	state[1] ^= ~(state[2] | state[3]); 				\
+	SWAPMOVE(state[1], state[0], 0x55555555, 1);		\
+	SWAPMOVE(state[0], state[3], 0x55555555, 1);		\
+	state[3] ^= ~(state[0] | state[1]);					\
+	SWAPMOVE(state[2], state[1], 0x55555555, 1);		\
+	SWAPMOVE(state[3], state[2], 0x55555555, 1);		\
+	state[1] ^= ~(state[2] | state[3]);					\
+	SWAPMOVE(state[1], state[0], 0x55555555, 1);		\
+	SWAPMOVE(state[0], state[3], 0x55555555, 1);		\
+	state[3] ^= (state[0] | state[1]);					\
+	SWAPMOVE(state[1], state[2], 0x55555555, 0);		\
+	state[0] ^= (rtk1)[12];								\
+	state[1] ^= (rtk1)[13];								\
+	state[2] ^= (rtk1)[14];								\
+	state[3] ^= (rtk1)[15];								\
+	state[0] ^= (rtk2_3)[12];							\
+	state[1] ^= (rtk2_3)[13];							\
+	state[2] ^= (rtk2_3)[14];							\
+	state[3] ^= (rtk2_3)[15];							\
+	mixcolumns_3(state);								\
+})
+
+void skinny128_384(u8* ctext, const u8* ptext, const u32* rtk1, const u32* rtk2_3);
+
+#endif  // SKINNY128_H_
\ No newline at end of file
--- a/romulus/Implementations/crypto_aead/romulusm1v12/opt32/tk_schedule.c
+++ b/romulus/Implementations/crypto_aead/romulusm1v12/opt32/tk_schedule.c
--- a/romulus/Implementations/crypto_aead/romulusm1v12/opt32/tk_schedule.h
+++ b/romulus/Implementations/crypto_aead/romulusm1v12/opt32/tk_schedule.h
+#ifndef TK_SCHEDULE_H_
+#define TK_SCHEDULE_H_
+
+typedef unsigned char u8;
+typedef unsigned int u32;
+
+void packing(u32* out, const u8* in);
+void unpacking(u8* out, u32 *in);
+void precompute_rtk2_3(u32* rtk, const u8* tk2, const u8* tk3);
+void precompute_rtk1(u32* rtk1, const u8* tk1);
+
+// Rotate the 32-bit word x right by y bits, for 0 <= y < 32.
+// The left-shift count is reduced modulo 32 so that y == 0 yields x instead of
+// shifting by the full word width, which is undefined behaviour in C.
+// x and y are each evaluated twice; pass side-effect-free expressions.
+#define ROR(x,y) 		(((x) >> (y)) | ((x) << ((32 - (y)) & 31)))
+
+// XOR the four elements y[0..3] into x[0..3] in place (x[i] ^= y[i]).
+// x and y are each expanded four times; pass side-effect-free expressions.
+// The body is a GNU statement expression ({ ... }).
+#define XOR_BLOCKS(x,y) ({ 			\
+	(x)[0] ^= (y)[0];				\
+	(x)[1] ^= (y)[1];				\
+	(x)[2] ^= (y)[2];				\
+	(x)[3] ^= (y)[3];				\
+})
+	
+// SWAPMOVE(a, b, mask, n): exchanges the bits of b selected by mask with the
+// bits of a selected by (mask << n). Both a and b are modified in place.
+// A variable named 'tmp' must be declared at the expansion site; it is
+// overwritten. All arguments are parenthesized so that compound expressions
+// expand with the intended precedence; a and b are expanded twice and must be
+// side-effect-free lvalues.
+#define SWAPMOVE(a, b, mask, n)	({	\
+	tmp = ((b) ^ ((a) >> (n))) & (mask);	\
+	(b) ^= tmp;						\
+	(a) ^= (tmp << (n));				\
+})
+
+// Load a 32-bit little-endian value from the four bytes y[0..3] (y[0] is the
+// least significant byte) and store it through the pointer x.
+// The expansion already ends with ';', so a call written as "LE_LOAD(p, q);"
+// produces an extra empty statement.
+#define LE_LOAD(x, y) 				\
+	*(x) = (((u32)(y)[3] << 24) | 	\
+		((u32)(y)[2] << 16) 	| 	\
+		((u32)(y)[1] << 8) 		| 	\
+		(y)[0]);
+
+// Store the 32-bit value y into the four bytes x[0..3] in little-endian order
+// (x[0] receives the least significant byte). y is expanded four times.
+// NOTE(review): this expands to four separate statements that are not wrapped
+// in a block, so it must not be used as the sole body of an unbraced
+// if/for/while -- only the first store would be controlled by it.
+#define LE_STORE(x, y)				\
+	(x)[0] = (y) & 0xff; 			\
+	(x)[1] = ((y) >> 8) & 0xff; 	\
+	(x)[2] = ((y) >> 16) & 0xff; 	\
+	(x)[3] = (y) >> 24;
+
+#endif  // TK_SCHEDULE_H_
\ No newline at end of file
--- a/romulus/Implementations/crypto_aead/romulusn1+v12/LWC_AEAD_KAT_128_128.txt
+++ b/romulus/Implementations/crypto_aead/romulusn1+v12/LWC_AEAD_KAT_128_128.txt
--- a/romulus/Implementations/crypto_aead/romulusn1+v12/armcortexm/api.h
+++ b/romulus/Implementations/crypto_aead/romulusn1+v12/armcortexm/api.h
+#define CRYPTO_KEYBYTES 16
+#define CRYPTO_NSECBYTES 0
+#define CRYPTO_NPUBBYTES 16
+#define CRYPTO_ABYTES 16
+#define CRYPTO_NOOVERLAP 1
--- a/romulus/Implementations/crypto_aead/romulusn1+v12/armcortexm/crypto_aead.h
+++ b/romulus/Implementations/crypto_aead/romulusn1+v12/armcortexm/crypto_aead.h
+//API required by the NIST for the LWC competition
+int crypto_aead_encrypt(unsigned char *c, unsigned long long *clen,
+                        const unsigned char *m, unsigned long long mlen,
+                        const unsigned char *ad, unsigned long long adlen,
+                        const unsigned char *nsec, const unsigned char *npub,
+                        const unsigned char *k);
+
+//API required by the NIST for the LWC competition
+int crypto_aead_decrypt(unsigned char *m, unsigned long long *outputmlen,
+                        unsigned char *nsec,
+                        const unsigned char *c, unsigned long long clen,
+                        const unsigned char *ad, unsigned long long adlen,
+                        const unsigned char *npub, const unsigned char *k);
--- a/romulus/Implementations/crypto_aead/romulusn1+v12/armcortexm/encrypt.c
+++ b/romulus/Implementations/crypto_aead/romulusn1+v12/armcortexm/encrypt.c
--- a/romulus/Implementations/crypto_aead/romulusn1+v12/armcortexm/romulus.h
+++ b/romulus/Implementations/crypto_aead/romulusn1+v12/armcortexm/romulus.h
+#ifndef ROMULUSN1_H_
+#define ROMULUSN1_H_
+
+#include "skinny128.h"
+
+typedef unsigned char u8;
+typedef unsigned int u32;
+// Tweakey material for SKINNY-128-384: the raw TK1 bytes plus two arrays of
+// precomputed round-tweakey words.
+// NOTE(review): rtk1 and rtk are presumably filled by the tkschedule_*
+// routines declared in skinny128.h -- confirm the expected layout there.
+typedef struct {
+    u8 tk1[16];                     //to manipulate tk1 in a byte-wise manner
+    u32 rtk1[32];                   //to avoid recomputation of the tk schedule
+    u32 rtk[4*SKINNY128_384_ROUNDS];//all round tweakeys
+} skinny_128_384_tks;
+
+#define TAGBYTES    16
+#define KEYBYTES    16
+#define BLOCKBYTES  16
+
+//write the domain value into byte 7 of tks.tk1
+//tks is the skinny_128_384_tks object itself, not a pointer to it
+#define SET_DOMAIN(tks, domain) ((tks).tk1[7] = (domain))
+
+//G as defined in the Romulus specification in a 32-bit word-wise manner
+//x <- G(y) on 16 bytes, processed as four 32-bit words: every byte b becomes
+//(b >> 1) with its top bit set to (bit 7 of b) ^ (bit 0 of b)
+//each word of y is read into tmp before the matching word of x is written,
+//so x and y may point to the same buffer
+//requires a u32 variable named tmp at the expansion site
+//NOTE(review): x and y are cast to u32*, which assumes buffers that may be
+//accessed as 32-bit words (alignment) -- confirm for caller-supplied pointers
+#define G(x,y) ({                                                                   \
+    tmp = ((u32*)(y))[0];                                                           \
+    ((u32*)(x))[0] = (tmp >> 1 & 0x7f7f7f7f) ^ ((tmp ^ (tmp << 7)) & 0x80808080);   \
+    tmp = ((u32*)(y))[1];                                                           \
+    ((u32*)(x))[1] = (tmp >> 1 & 0x7f7f7f7f) ^ ((tmp ^ (tmp << 7)) & 0x80808080);   \
+    tmp = ((u32*)(y))[2];                                                           \
+    ((u32*)(x))[2] = (tmp >> 1 & 0x7f7f7f7f) ^ ((tmp ^ (tmp << 7)) & 0x80808080);   \
+    tmp = ((u32*)(y))[3];                                                           \
+    ((u32*)(x))[3] = (tmp >> 1 & 0x7f7f7f7f) ^ ((tmp ^ (tmp << 7)) & 0x80808080);   \
+})
+
+//update the counter in tk1 in a 32-bit word-wise manner
+//word 0 and the low 24 bits of word 1 form a 56-bit value that is shifted left
+//by one bit; when the bit shifted out (bit 23 of the old word 1) is set, 0x95
+//is XORed into word 0
+//the top 8 bits of word 1 (mask 0xff000000) are preserved unchanged
+//requires a u32 variable named tmp at the expansion site
+//NOTE(review): the mapping of these words onto tk1[0..7] (counter in bytes
+//0..6, byte 7 untouched) assumes a little-endian target -- confirm
+#define UPDATE_CTR(tk1) ({                              \
+    tmp = ((u32*)(tk1))[1];                             \
+    ((u32*)(tk1))[1] = (tmp << 1) & 0x00ffffff;         \
+    ((u32*)(tk1))[1] |= (((u32*)(tk1))[0] >> 31);       \
+    ((u32*)(tk1))[1] |= tmp & 0xff000000;               \
+    ((u32*)(tk1))[0] <<= 1;                             \
+    if ((tmp >> 23) & 0x01)                             \
+        ((u32*)(tk1))[0] ^= 0x95;                       \
+})
+
+//x <- y ^ z for 128-bit blocks
+//done as four 32-bit word XORs; each word of y and z is read before the
+//matching word of x is written, so x may be the same buffer as y or z
+//NOTE(review): all three arguments are cast to u32*, which assumes buffers
+//that may be accessed as 32-bit words -- confirm for caller-supplied pointers
+#define XOR_BLOCK(x,y,z) ({                             \
+    ((u32*)(x))[0] = ((u32*)(y))[0] ^ ((u32*)(z))[0];   \
+    ((u32*)(x))[1] = ((u32*)(y))[1] ^ ((u32*)(z))[1];   \
+    ((u32*)(x))[2] = ((u32*)(y))[2] ^ ((u32*)(z))[2];   \
+    ((u32*)(x))[3] = ((u32*)(y))[3] ^ ((u32*)(z))[3];   \
+})
+
+
+//Rho as defined in the Romulus specification
+//computes y <- G(x) ^ z and x <- x ^ z on 128-bit blocks
+//G(x) is kept in pad and z is folded into x BEFORE y is written, so the
+//result is still correct when y and z are the same buffer (previously y was
+//written first, and the state update then read the overwritten z)
+//requires a 16-byte buffer named pad and a u32 named tmp at the expansion
+//site; z must not be pad itself, since pad is overwritten first
+#define RHO(x,y,z) ({       \
+    G(pad,x);               \
+    XOR_BLOCK(x, x, z);     \
+    XOR_BLOCK(y, pad, z);   \
+})
+
+//Rho inverse as defined in the Romulus specification
+//computes z <- G(x) ^ y and then x <- x ^ z on 128-bit blocks
+//use pad as a tmp variable in case y = z: G(x) is held in pad, and the state
+//update reads z only after it has been written
+//requires a 16-byte buffer named pad and a u32 named tmp at the expansion site
+#define RHO_INV(x, y, z) ({ \
+    G(pad, x);              \
+    XOR_BLOCK(z, pad, y);   \
+    XOR_BLOCK(x, x, z);     \
+})
+
+#endif  // ROMULUSN1_H_
\ No newline at end of file
--- a/romulus/Implementations/crypto_aead/romulusn1+v12/armcortexm/skinny128.h
+++ b/romulus/Implementations/crypto_aead/romulusn1+v12/armcortexm/skinny128.h
+#ifndef SKINNY128_H_
+#define SKINNY128_H_
+
+typedef unsigned char u8;
+typedef unsigned int u32;
+
+#define SKINNY128_384_ROUNDS	40
+
+extern void skinny128_384(u8* ctext, const u32* tk, const u8* ptext, const u32* rtk1);
+extern void tkschedule_lfsr(u32* rtk, const u8* tk2, const u8* tk3, const int rounds);
+extern void tkschedule_perm(u32* rtk);
+extern void tkschedule_perm_tk1(u32* rtk1, const u8* tk1);
+
+
+#endif  // SKINNY128_H_
\ No newline at end of file
--- a/romulus/Implementations/crypto_aead/romulusn1+v12/armcortexm/skinny128.s
+++ b/romulus/Implementations/crypto_aead/romulusn1+v12/armcortexm/skinny128.s
--- a/romulus/Implementations/crypto_aead/romulusn1+v12/opt32/api.h
+++ b/romulus/Implementations/crypto_aead/romulusn1+v12/opt32/api.h
+#define CRYPTO_KEYBYTES 16
+#define CRYPTO_NSECBYTES 0
+#define CRYPTO_NPUBBYTES 16
+#define CRYPTO_ABYTES 16
+#define CRYPTO_NOOVERLAP 1
--- a/romulus/Implementations/crypto_aead/romulusn1+v12/opt32/crypto_aead.h
+++ b/romulus/Implementations/crypto_aead/romulusn1+v12/opt32/crypto_aead.h
+
+int crypto_aead_encrypt(
+	unsigned char *c, unsigned long long *clen,
+	const unsigned char *m, unsigned long long mlen,
+	const unsigned char *ad, unsigned long long adlen,
+	const unsigned char *nsec,
+	const unsigned char *npub,
+	const unsigned char *k
+);
+
+int crypto_aead_decrypt(
+	unsigned char *m, unsigned long long *mlen,
+	unsigned char *nsec,
+	const unsigned char *c, unsigned long long clen,
+	const unsigned char *ad, unsigned long long adlen,
+	const unsigned char *npub,
+	const unsigned char *k
+);
\ No newline at end of file
--- a/romulus/Implementations/crypto_aead/romulusn1+v12/opt32/encrypt.c
+++ b/romulus/Implementations/crypto_aead/romulusn1+v12/opt32/encrypt.c
--- a/romulus/Implementations/crypto_aead/romulusn1+v12/opt32/romulus.h
+++ b/romulus/Implementations/crypto_aead/romulusn1+v12/opt32/romulus.h
+#ifndef ROMULUSN1_H_
+#define ROMULUSN1_H_
+
+#include "skinny128.h"
+
+typedef unsigned char u8;
+typedef unsigned int u32;
+// Tweakey material for the block cipher: the raw TK1 bytes plus two arrays of
+// precomputed round-tweakey words (64 words for TK1, 4 words per round for
+// the combined TK2/TK3 part).
+// NOTE(review): rtk1 and rtk2_3 are presumably filled by precompute_rtk1()
+// and precompute_rtk2_3() from tk_schedule.h -- confirm against their bodies.
+typedef struct {
+    u8 tk1[16];                         //to manipulate tk1 byte-wise
+    u32 rtk1[4*16];                     //to avoid tk schedule recomputations
+    u32 rtk2_3[4*SKINNY128_384_ROUNDS]; //all round tweakeys
+} skinny_128_384_tks;
+
+#define TAGBYTES    16
+#define KEYBYTES    16
+#define BLOCKBYTES  16
+
+//write the domain value into byte 7 of tks.tk1
+//tks is the skinny_128_384_tks object itself, not a pointer to it
+#define SET_DOMAIN(tks, domain) ((tks).tk1[7] = (domain))
+
+//G as defined in the Romulus specification in a 32-bit word-wise manner
+//x <- G(y) on 16 bytes, processed as four 32-bit words: every byte b becomes
+//(b >> 1) with its top bit set to (bit 7 of b) ^ (bit 0 of b)
+//each word of y is read into tmp before the matching word of x is written,
+//so x and y may point to the same buffer
+//requires a u32 variable named tmp at the expansion site
+//NOTE(review): x and y are cast to u32*, which assumes buffers that may be
+//accessed as 32-bit words (alignment) -- confirm for caller-supplied pointers
+#define G(x,y) ({                                                                   \
+    tmp = ((u32*)(y))[0];                                                           \
+    ((u32*)(x))[0] = (tmp >> 1 & 0x7f7f7f7f) ^ ((tmp ^ (tmp << 7)) & 0x80808080);   \
+    tmp = ((u32*)(y))[1];                                                           \
+    ((u32*)(x))[1] = (tmp >> 1 & 0x7f7f7f7f) ^ ((tmp ^ (tmp << 7)) & 0x80808080);   \
+    tmp = ((u32*)(y))[2];                                                           \
+    ((u32*)(x))[2] = (tmp >> 1 & 0x7f7f7f7f) ^ ((tmp ^ (tmp << 7)) & 0x80808080);   \
+    tmp = ((u32*)(y))[3];                                                           \
+    ((u32*)(x))[3] = (tmp >> 1 & 0x7f7f7f7f) ^ ((tmp ^ (tmp << 7)) & 0x80808080);   \
+})
+
+//update the counter in tk1 in a 32-bit word-wise manner
+//word 0 and the low 24 bits of word 1 form a 56-bit value that is shifted left
+//by one bit; when the bit shifted out (bit 23 of the old word 1) is set, 0x95
+//is XORed into word 0
+//the top 8 bits of word 1 (mask 0xff000000) are preserved unchanged
+//requires a u32 variable named tmp at the expansion site
+//NOTE(review): the mapping of these words onto tk1[0..7] (counter in bytes
+//0..6, byte 7 untouched) assumes a little-endian target -- confirm
+#define UPDATE_CTR(tk1) ({                              \
+    tmp = ((u32*)(tk1))[1];                             \
+    ((u32*)(tk1))[1] = (tmp << 1) & 0x00ffffff;         \
+    ((u32*)(tk1))[1] |= (((u32*)(tk1))[0] >> 31);       \
+    ((u32*)(tk1))[1] |= tmp & 0xff000000;               \
+    ((u32*)(tk1))[0] <<= 1;                             \
+    if ((tmp >> 23) & 0x01)                             \
+        ((u32*)(tk1))[0] ^= 0x95;                       \
+})
+
+//x <- y ^ z for 128-bit blocks
+//done as four 32-bit word XORs; each word of y and z is read before the
+//matching word of x is written, so x may be the same buffer as y or z
+//NOTE(review): all three arguments are cast to u32*, which assumes buffers
+//that may be accessed as 32-bit words -- confirm for caller-supplied pointers
+#define XOR_BLOCK(x,y,z) ({                             \
+    ((u32*)(x))[0] = ((u32*)(y))[0] ^ ((u32*)(z))[0];   \
+    ((u32*)(x))[1] = ((u32*)(y))[1] ^ ((u32*)(z))[1];   \
+    ((u32*)(x))[2] = ((u32*)(y))[2] ^ ((u32*)(z))[2];   \
+    ((u32*)(x))[3] = ((u32*)(y))[3] ^ ((u32*)(z))[3];   \
+})
+
+
+//Rho as defined in the Romulus specification
+//computes y <- G(x) ^ z and x <- x ^ z on 128-bit blocks
+//G(x) is kept in pad and z is folded into x BEFORE y is written, so the
+//result is still correct when y and z are the same buffer (previously y was
+//written first, and the state update then read the overwritten z)
+//requires a 16-byte buffer named pad and a u32 named tmp at the expansion
+//site; z must not be pad itself, since pad is overwritten first
+#define RHO(x,y,z) ({       \
+    G(pad,x);               \
+    XOR_BLOCK(x, x, z);     \
+    XOR_BLOCK(y, pad, z);   \
+})
+
+//Rho inverse as defined in the Romulus specification
+//computes z <- G(x) ^ y and then x <- x ^ z on 128-bit blocks
+//use pad as a tmp variable in case y = z: G(x) is held in pad, and the state
+//update reads z only after it has been written
+//requires a 16-byte buffer named pad and a u32 named tmp at the expansion site
+#define RHO_INV(x, y, z) ({ \
+    G(pad, x);              \
+    XOR_BLOCK(z, pad, y);   \
+    XOR_BLOCK(x, x, z);     \
+})
+
+#endif  // ROMULUSN1_H_
\ No newline at end of file
--- a/romulus/Implementations/crypto_aead/romulusn1+v12/opt32/skinny128.c
+++ b/romulus/Implementations/crypto_aead/romulusn1+v12/opt32/skinny128.c
+/******************************************************************************
+* Constant-time implementation of the SKINNY tweakable block ciphers.
+*
+* This implementation doesn't compute the ShiftRows operation. Some masks and
+* shifts are applied during the MixColumns operation so that the proper bits
+* are XORed together. Moreover, the row permutation within the MixColumns 
+* is omitted, as well as the bit permutation at the end of the Sbox. The rows
+* are synchronized with the classical representation only after 4 rounds.
+* Therefore, this implementation relies on a "QUADRUPLE_ROUND" routine.
+*
+* The Sbox computation takes advantage of some symmetry in the 8-bit Sbox to
+* turn it into a 4-bit S-box computation. Although the last bit permutation
+* within the Sbox is not computed, the bit ordering is synchronized with the 
+* classical representation after 2 calls.
+*
+* @author	Alexandre Adomnicai, Nanyang Technological University,
+*			alexandre.adomnicai@ntu.edu.sg
+*
+* @date		May 2020
+******************************************************************************/
+#include <stdio.h>
+#include <string.h>
+#include "skinny128.h"
+#include "tk_schedule.h"
+
+/******************************************************************************
+* MixColumns variant for rounds whose index i satisfies (i % 4) == 0.
+* Each of the four state words is updated in place by three successive
+* mask / rotate / XOR steps.
+******************************************************************************/
+void mixcolumns_0(u32* state) {
+	u32* const end = state + 4;
+	for(u32* word = state; word != end; word++) {
+		u32 w = *word;
+		u32 picked = ROR(w,24) & 0x0c0c0c0c;
+		w ^= ROR(picked,30);
+		picked = ROR(w,16) & 0xc0c0c0c0;
+		w ^= ROR(picked,4);
+		picked = ROR(w,8) & 0x0c0c0c0c;
+		w ^= ROR(picked,2);
+		*word = w;
+	}
+}
+
+/******************************************************************************
+* MixColumns variant for rounds whose index i satisfies (i % 4) == 1.
+* Each of the four state words is updated in place by three successive
+* mask / rotate / XOR steps.
+******************************************************************************/
+void mixcolumns_1(u32* state) {
+	u32* const end = state + 4;
+	for(u32* word = state; word != end; word++) {
+		u32 w = *word;
+		u32 picked = ROR(w,16) & 0x30303030;
+		w ^= ROR(picked,30);
+		picked = w & 0x03030303;
+		w ^= ROR(picked,28);
+		picked = ROR(w,16) & 0x30303030;
+		w ^= ROR(picked,2);
+		*word = w;
+	}
+}
+
+/******************************************************************************
+* MixColumns variant for rounds whose index i satisfies (i % 4) == 2.
+* Each of the four state words is updated in place by three successive
+* mask / rotate / XOR steps.
+******************************************************************************/
+void mixcolumns_2(u32* state) {
+	u32* const end = state + 4;
+	for(u32* word = state; word != end; word++) {
+		u32 w = *word;
+		u32 picked = ROR(w,8) & 0xc0c0c0c0;
+		w ^= ROR(picked,6);
+		picked = ROR(w,16) & 0x0c0c0c0c;
+		w ^= ROR(picked,28);
+		picked = ROR(w,24) & 0xc0c0c0c0;
+		w ^= ROR(picked,2);
+		*word = w;
+	}
+}
+
+/******************************************************************************
+* MixColumns variant for rounds whose index i satisfies (i % 4) == 3.
+* Each of the four state words is updated in place by three successive
+* mask / rotate / XOR steps (no pre-rotation is needed in this variant).
+******************************************************************************/
+void mixcolumns_3(u32* state) {
+	u32* const end = state + 4;
+	for(u32* word = state; word != end; word++) {
+		u32 w = *word;
+		u32 picked = w & 0x03030303;
+		w ^= ROR(picked,30);
+		picked = w & 0x30303030;
+		w ^= ROR(picked,4);
+		picked = w & 0x03030303;
+		w ^= ROR(picked,26);
+		*word = w;
+	}
+}
+
+/******************************************************************************
+* Encrypts one 16-byte block with the round-reduced SKINNY-128-384 variant
+* (SKINNY128_384_ROUNDS rounds), without any mode of operation.
+* The round tweakeys derived from TK1 (rtk1) and from TK2/TK3 (rtk2_3) are
+* passed separately because TK2 and TK3 remain the same through the entire
+* data encryption/decryption, so rtk2_3 can be reused across calls.
+*
+* ctext   output block
+* ptext   input block
+* rtk1    TK1 round tweakeys; 64 words are read, reused every 16 rounds
+* rtk2_3  TK2/TK3 round tweakeys; 16 words are read per quadruple round
+******************************************************************************/
+void skinny128_384_plus(u8* ctext, const u8* ptext, const u32* rtk1,
+					const u32* rtk2_3) {
+	u32 state[4];				// 128-bit state in bitsliced form
+	u32 tmp;					// scratch word required by the SWAPMOVE macro
+	packing(state, ptext);		// bytes -> bitsliced representation
+	// One iteration covers 4 rounds. The rtk1 offset cycles 0,16,32,48 while
+	// the rtk2_3 offset keeps advancing by 16 words.
+	for(int quad = 0; quad < SKINNY128_384_ROUNDS / 4; quad++) {
+		const u32* rk1 = rtk1 + 16 * (quad & 3);
+		const u32* rk2_3 = rtk2_3 + 16 * quad;
+		QUADRUPLE_ROUND(state, rk1, rk2_3);
+	}
+	unpacking(ctext, state);	// bitsliced -> byte representation
+}
\ No newline at end of file
--- a/romulus/Implementations/crypto_aead/romulusn1+v12/opt32/skinny128.h
+++ b/romulus/Implementations/crypto_aead/romulusn1+v12/opt32/skinny128.h
+#ifndef SKINNY128_H_
+#define SKINNY128_H_
+
+typedef unsigned char u8;
+typedef unsigned int u32;
+
+void skinny128_384_plus(u8* ctext, const u8* ptext, const u32* rtk1, const u32* rtk2_3);
+
+#define SKINNY128_384_ROUNDS	40
+
+// QUADRUPLE_ROUND(state, rtk1, rtk2_3): applies four consecutive rounds to the
+// 4-word state, in place. Each round r (0..3) consists of:
+//   - a sequence of (N)OR-XOR and SWAPMOVE steps on the state words
+//     (presumably the bitsliced S-box layer -- confirm against skinny128.c),
+//   - XOR of rtk1[4r..4r+3] and rtk2_3[4r..4r+3] into state[0..3],
+//   - a call to mixcolumns_<r>(state).
+// In total 16 words are read from each of rtk1 and rtk2_3.
+// Requirements at the expansion site: a variable named 'tmp' (used by
+// SWAPMOVE), the SWAPMOVE macro, and mixcolumns_0..3 must be visible.
+// 'state' is expanded many times and must be a side-effect-free expression.
+// The body is a GNU statement expression ({ ... }).
+#define QUADRUPLE_ROUND(state, rtk1, rtk2_3) ({			\
+	state[3] ^= ~(state[0] | state[1]);					\
+	SWAPMOVE(state[2], state[1], 0x55555555, 1);		\
+	SWAPMOVE(state[3], state[2], 0x55555555, 1);		\
+	state[1] ^= ~(state[2] | state[3]);					\
+	SWAPMOVE(state[1], state[0], 0x55555555, 1);		\
+	SWAPMOVE(state[0], state[3], 0x55555555, 1);		\
+	state[3] ^= ~(state[0] | state[1]);					\
+	SWAPMOVE(state[2], state[1], 0x55555555, 1);		\
+	SWAPMOVE(state[3], state[2], 0x55555555, 1);		\
+	state[1] ^= (state[2] | state[3]);					\
+	SWAPMOVE(state[3], state[0], 0x55555555, 0);		\
+	state[0] ^= (rtk1)[0];								\
+	state[1] ^= (rtk1)[1];								\
+	state[2] ^= (rtk1)[2];								\
+	state[3] ^= (rtk1)[3];								\
+	state[0] ^= (rtk2_3)[0];							\
+	state[1] ^= (rtk2_3)[1];							\
+	state[2] ^= (rtk2_3)[2];							\
+	state[3] ^= (rtk2_3)[3];							\
+	mixcolumns_0(state);								\
+	state[1] ^= ~(state[2] | state[3]); 				\
+	SWAPMOVE(state[1], state[0], 0x55555555, 1);		\
+	SWAPMOVE(state[0], state[3], 0x55555555, 1);		\
+	state[3] ^= ~(state[0] | state[1]);					\
+	SWAPMOVE(state[2], state[1], 0x55555555, 1);		\
+	SWAPMOVE(state[3], state[2], 0x55555555, 1);		\
+	state[1] ^= ~(state[2] | state[3]);					\
+	SWAPMOVE(state[1], state[0], 0x55555555, 1);		\
+	SWAPMOVE(state[0], state[3], 0x55555555, 1);		\
+	state[3] ^= (state[0] | state[1]);					\
+	SWAPMOVE(state[1], state[2], 0x55555555, 0);		\
+	state[0] ^= (rtk1)[4];								\
+	state[1] ^= (rtk1)[5];								\
+	state[2] ^= (rtk1)[6];								\
+	state[3] ^= (rtk1)[7];								\
+	state[0] ^= (rtk2_3)[4];							\
+	state[1] ^= (rtk2_3)[5];							\
+	state[2] ^= (rtk2_3)[6];							\
+	state[3] ^= (rtk2_3)[7];							\
+	mixcolumns_1(state);								\
+	state[3] ^= ~(state[0] | state[1]);					\
+	SWAPMOVE(state[2], state[1], 0x55555555, 1);		\
+	SWAPMOVE(state[3], state[2], 0x55555555, 1);		\
+	state[1] ^= ~(state[2] | state[3]);					\
+	SWAPMOVE(state[1], state[0], 0x55555555, 1);		\
+	SWAPMOVE(state[0], state[3], 0x55555555, 1);		\
+	state[3] ^= ~(state[0] | state[1]);					\
+	SWAPMOVE(state[2], state[1], 0x55555555, 1);		\
+	SWAPMOVE(state[3], state[2], 0x55555555, 1);		\
+	state[1] ^= (state[2] | state[3]);					\
+	SWAPMOVE(state[3], state[0], 0x55555555, 0);		\
+	state[0] ^= (rtk1)[8];								\
+	state[1] ^= (rtk1)[9];								\
+	state[2] ^= (rtk1)[10];								\
+	state[3] ^= (rtk1)[11];								\
+	state[0] ^= (rtk2_3)[8];							\
+	state[1] ^= (rtk2_3)[9];							\
+	state[2] ^= (rtk2_3)[10];							\
+	state[3] ^= (rtk2_3)[11];							\
+	mixcolumns_2(state);								\
+	state[1] ^= ~(state[2] | state[3]); 				\
+	SWAPMOVE(state[1], state[0], 0x55555555, 1);		\
+	SWAPMOVE(state[0], state[3], 0x55555555, 1);		\
+	state[3] ^= ~(state[0] | state[1]);					\
+	SWAPMOVE(state[2], state[1], 0x55555555, 1);		\
+	SWAPMOVE(state[3], state[2], 0x55555555, 1);		\
+	state[1] ^= ~(state[2] | state[3]);					\
+	SWAPMOVE(state[1], state[0], 0x55555555, 1);		\
+	SWAPMOVE(state[0], state[3], 0x55555555, 1);		\
+	state[3] ^= (state[0] | state[1]);					\
+	SWAPMOVE(state[1], state[2], 0x55555555, 0);		\
+	state[0] ^= (rtk1)[12];								\
+	state[1] ^= (rtk1)[13];								\
+	state[2] ^= (rtk1)[14];								\
+	state[3] ^= (rtk1)[15];								\
+	state[0] ^= (rtk2_3)[12];							\
+	state[1] ^= (rtk2_3)[13];							\
+	state[2] ^= (rtk2_3)[14];							\
+	state[3] ^= (rtk2_3)[15];							\
+	mixcolumns_3(state);								\
+})
+
+
+#endif  // SKINNY128_H_
\ No newline at end of file
--- a/romulus/Implementations/crypto_aead/romulusn1+v12/opt32/tk_schedule.c
+++ b/romulus/Implementations/crypto_aead/romulusn1+v12/opt32/tk_schedule.c
--- a/romulus/Implementations/crypto_aead/romulusn1+v12/opt32/tk_schedule.h
+++ b/romulus/Implementations/crypto_aead/romulusn1+v12/opt32/tk_schedule.h
+#ifndef TK_SCHEDULE_H_
+#define TK_SCHEDULE_H_
+
+typedef unsigned char u8;
+typedef unsigned int u32;
+
+void packing(u32* out, const u8* in);
+void unpacking(u8* out, u32 *in);
+void precompute_rtk2_3(u32* rtk, const u8* tk2, const u8* tk3);
+void precompute_rtk1(u32* rtk1, const u8* tk1);
+
+// Rotate the 32-bit word x right by y bits, for 0 <= y < 32.
+// The left-shift count is reduced modulo 32 so that y == 0 yields x instead of
+// shifting by the full word width, which is undefined behaviour in C.
+// x and y are each evaluated twice; pass side-effect-free expressions.
+#define ROR(x,y) (((x) >> (y)) | ((x) << ((32 - (y)) & 31)))
+
+// XOR the four elements y[0..3] into x[0..3] in place (x[i] ^= y[i]).
+// x and y are each expanded four times; pass side-effect-free expressions.
+// The body is a GNU statement expression ({ ... }).
+#define XOR_BLOCKS(x,y) ({ 			\
+	(x)[0] ^= (y)[0];				\
+	(x)[1] ^= (y)[1];				\
+	(x)[2] ^= (y)[2];				\
+	(x)[3] ^= (y)[3];				\
+})
+	
+// SWAPMOVE(a, b, mask, n): exchanges the bits of b selected by mask with the
+// bits of a selected by (mask << n). Both a and b are modified in place.
+// A variable named 'tmp' must be declared at the expansion site; it is
+// overwritten. All arguments are parenthesized so that compound expressions
+// expand with the intended precedence; a and b are expanded twice and must be
+// side-effect-free lvalues.
+#define SWAPMOVE(a, b, mask, n)	({	\
+	tmp = ((b) ^ ((a) >> (n))) & (mask);	\
+	(b) ^= tmp;						\
+	(a) ^= (tmp << (n));				\
+})
+
+// Load a 32-bit little-endian value from the four bytes y[0..3] (y[0] is the
+// least significant byte) and store it through the pointer x.
+// The expansion already ends with ';', so a call written as "LE_LOAD(p, q);"
+// produces an extra empty statement.
+#define LE_LOAD(x, y) 				\
+	*(x) = (((u32)(y)[3] << 24) | 	\
+		((u32)(y)[2] << 16) 	| 	\
+		((u32)(y)[1] << 8) 		| 	\
+		(y)[0]);
+
+// Store the 32-bit value y into the four bytes x[0..3] in little-endian order
+// (x[0] receives the least significant byte). y is expanded four times.
+// NOTE(review): this expands to four separate statements that are not wrapped
+// in a block, so it must not be used as the sole body of an unbraced
+// if/for/while -- only the first store would be controlled by it.
+#define LE_STORE(x, y)				\
+	(x)[0] = (y) & 0xff; 			\
+	(x)[1] = ((y) >> 8) & 0xff; 	\
+	(x)[2] = ((y) >> 16) & 0xff; 	\
+	(x)[3] = (y) >> 24;
+
+#endif  // TK_SCHEDULE_H_
\ No newline at end of file
--- a/romulus/Implementations/crypto_aead/romulusn1v12/armcortexm/api.h
+++ b/romulus/Implementations/crypto_aead/romulusn1v12/armcortexm/api.h
+#define CRYPTO_KEYBYTES 16
+#define CRYPTO_NSECBYTES 0
+#define CRYPTO_NPUBBYTES 16
+#define CRYPTO_ABYTES 16
+#define CRYPTO_NOOVERLAP 1
--- a/romulus/Implementations/crypto_aead/romulusn1v12/armcortexm/crypto_aead.h
+++ b/romulus/Implementations/crypto_aead/romulusn1v12/armcortexm/crypto_aead.h
+//API required by the NIST for the LWC competition
+int crypto_aead_encrypt(unsigned char *c, unsigned long long *clen,
+                        const unsigned char *m, unsigned long long mlen,
+                        const unsigned char *ad, unsigned long long adlen,
+                        const unsigned char *nsec, const unsigned char *npub,
+                        const unsigned char *k);
+
+//API required by the NIST for the LWC competition
+int crypto_aead_decrypt(unsigned char *m, unsigned long long *outputmlen,
+                        unsigned char *nsec,
+                        const unsigned char *c, unsigned long long clen,
+                        const unsigned char *ad, unsigned long long adlen,
+                        const unsigned char *npub, const unsigned char *k);
--- a/romulus/Implementations/crypto_aead/romulusn1v12/armcortexm/encrypt.c
+++ b/romulus/Implementations/crypto_aead/romulusn1v12/armcortexm/encrypt.c
--- a/romulus/Implementations/crypto_aead/romulusn1v12/armcortexm/romulus.h
+++ b/romulus/Implementations/crypto_aead/romulusn1v12/armcortexm/romulus.h
+#ifndef ROMULUSN1_H_
+#define ROMULUSN1_H_
+
+#include "skinny128.h"
+
+typedef unsigned char u8;
+typedef unsigned int u32;
+// Tweakey material for SKINNY-128-384: the raw TK1 bytes plus two arrays of
+// precomputed round-tweakey words.
+// NOTE(review): rtk1 and rtk are presumably filled by the tkschedule_*
+// routines declared in skinny128.h -- confirm the expected layout there.
+typedef struct {
+    u8 tk1[16];                     //to manipulate tk1 in a byte-wise manner
+    u32 rtk1[32];                   //to avoid recomputation of the tk schedule
+    u32 rtk[4*SKINNY128_384_ROUNDS];//all round tweakeys
+} skinny_128_384_tks;
+
+#define TAGBYTES    16
+#define KEYBYTES    16
+#define BLOCKBYTES  16
+
+//write the domain value into byte 7 of tks.tk1
+//tks is the skinny_128_384_tks object itself, not a pointer to it
+#define SET_DOMAIN(tks, domain) ((tks).tk1[7] = (domain))
+
+//G as defined in the Romulus specification in a 32-bit word-wise manner
+//x <- G(y) on 16 bytes, processed as four 32-bit words: every byte b becomes
+//(b >> 1) with its top bit set to (bit 7 of b) ^ (bit 0 of b)
+//each word of y is read into tmp before the matching word of x is written,
+//so x and y may point to the same buffer
+//requires a u32 variable named tmp at the expansion site
+//NOTE(review): x and y are cast to u32*, which assumes buffers that may be
+//accessed as 32-bit words (alignment) -- confirm for caller-supplied pointers
+#define G(x,y) ({                                                                   \
+    tmp = ((u32*)(y))[0];                                                           \
+    ((u32*)(x))[0] = (tmp >> 1 & 0x7f7f7f7f) ^ ((tmp ^ (tmp << 7)) & 0x80808080);   \
+    tmp = ((u32*)(y))[1];                                                           \
+    ((u32*)(x))[1] = (tmp >> 1 & 0x7f7f7f7f) ^ ((tmp ^ (tmp << 7)) & 0x80808080);   \
+    tmp = ((u32*)(y))[2];                                                           \
+    ((u32*)(x))[2] = (tmp >> 1 & 0x7f7f7f7f) ^ ((tmp ^ (tmp << 7)) & 0x80808080);   \
+    tmp = ((u32*)(y))[3];                                                           \
+    ((u32*)(x))[3] = (tmp >> 1 & 0x7f7f7f7f) ^ ((tmp ^ (tmp << 7)) & 0x80808080);   \
+})
+
+//update the counter in tk1 in a 32-bit word-wise manner
+//word 0 and the low 24 bits of word 1 form a 56-bit value that is shifted left
+//by one bit; when the bit shifted out (bit 23 of the old word 1) is set, 0x95
+//is XORed into word 0
+//the top 8 bits of word 1 (mask 0xff000000) are preserved unchanged
+//requires a u32 variable named tmp at the expansion site
+//NOTE(review): the mapping of these words onto tk1[0..7] (counter in bytes
+//0..6, byte 7 untouched) assumes a little-endian target -- confirm
+#define UPDATE_CTR(tk1) ({                              \
+    tmp = ((u32*)(tk1))[1];                             \
+    ((u32*)(tk1))[1] = (tmp << 1) & 0x00ffffff;         \
+    ((u32*)(tk1))[1] |= (((u32*)(tk1))[0] >> 31);       \
+    ((u32*)(tk1))[1] |= tmp & 0xff000000;               \
+    ((u32*)(tk1))[0] <<= 1;                             \
+    if ((tmp >> 23) & 0x01)                             \
+        ((u32*)(tk1))[0] ^= 0x95;                       \
+})
+
+//x <- y ^ z for 128-bit blocks
+//done as four 32-bit word XORs; each word of y and z is read before the
+//matching word of x is written, so x may be the same buffer as y or z
+//NOTE(review): all three arguments are cast to u32*, which assumes buffers
+//that may be accessed as 32-bit words -- confirm for caller-supplied pointers
+#define XOR_BLOCK(x,y,z) ({                             \
+    ((u32*)(x))[0] = ((u32*)(y))[0] ^ ((u32*)(z))[0];   \
+    ((u32*)(x))[1] = ((u32*)(y))[1] ^ ((u32*)(z))[1];   \
+    ((u32*)(x))[2] = ((u32*)(y))[2] ^ ((u32*)(z))[2];   \
+    ((u32*)(x))[3] = ((u32*)(y))[3] ^ ((u32*)(z))[3];   \
+})
+
+
+//Rho as defined in the Romulus specification
+//computes y <- G(x) ^ z and x <- x ^ z on 128-bit blocks
+//G(x) is kept in pad and z is folded into x BEFORE y is written, so the
+//result is still correct when y and z are the same buffer (previously y was
+//written first, and the state update then read the overwritten z)
+//requires a 16-byte buffer named pad and a u32 named tmp at the expansion
+//site; z must not be pad itself, since pad is overwritten first
+#define RHO(x,y,z) ({       \
+    G(pad,x);               \
+    XOR_BLOCK(x, x, z);     \
+    XOR_BLOCK(y, pad, z);   \
+})
+
+//Rho inverse as defined in the Romulus specification
+//computes z <- G(x) ^ y and then x <- x ^ z on 128-bit blocks
+//use pad as a tmp variable in case y = z: G(x) is held in pad, and the state
+//update reads z only after it has been written
+//requires a 16-byte buffer named pad and a u32 named tmp at the expansion site
+#define RHO_INV(x, y, z) ({ \
+    G(pad, x);              \
+    XOR_BLOCK(z, pad, y);   \
+    XOR_BLOCK(x, x, z);     \
+})
+
+#endif  // ROMULUSN1_H_
\ No newline at end of file
--- a/romulus/Implementations/crypto_aead/romulusn1v12/armcortexm/skinny128.h
+++ b/romulus/Implementations/crypto_aead/romulusn1v12/armcortexm/skinny128.h
+#ifndef SKINNY128_H_
+#define SKINNY128_H_
+
+typedef unsigned char u8;
+typedef unsigned int u32;
+
+#define SKINNY128_128_ROUNDS	40
+#define SKINNY128_256_ROUNDS	48
+#define SKINNY128_384_ROUNDS	56
+
+extern void skinny128_384(u8* ctext, const u32* tk, const u8* ptext, const u32* rtk1);
+extern void tkschedule_lfsr(u32* rtk, const u8* tk2, const u8* tk3, const int rounds);
+extern void tkschedule_perm(u32* rtk);
+extern void tkschedule_perm_tk1(u32* rtk1, const u8* tk1);
+
+
+#endif  // SKINNY128_H_
\ No newline at end of file
--- a/romulus/Implementations/crypto_aead/romulusn1v12/armcortexm/skinny128.s
+++ b/romulus/Implementations/crypto_aead/romulusn1v12/armcortexm/skinny128.s
--- a/romulus/Implementations/crypto_aead/romulusn1v12/opt32/api.h
+++ b/romulus/Implementations/crypto_aead/romulusn1v12/opt32/api.h
+#define CRYPTO_KEYBYTES 16
+#define CRYPTO_NSECBYTES 0
+#define CRYPTO_NPUBBYTES 16
+#define CRYPTO_ABYTES 16
+#define CRYPTO_NOOVERLAP 1
--- a/romulus/Implementations/crypto_aead/romulusn1v12/opt32/crypto_aead.h
+++ b/romulus/Implementations/crypto_aead/romulusn1v12/opt32/crypto_aead.h
+
+int crypto_aead_encrypt(
+	unsigned char *c, unsigned long long *clen,
+	const unsigned char *m, unsigned long long mlen,
+	const unsigned char *ad, unsigned long long adlen,
+	const unsigned char *nsec,
+	const unsigned char *npub,
+	const unsigned char *k
+);
+
+int crypto_aead_decrypt(
+	unsigned char *m, unsigned long long *mlen,
+	unsigned char *nsec,
+	const unsigned char *c, unsigned long long clen,
+	const unsigned char *ad, unsigned long long adlen,
+	const unsigned char *npub,
+	const unsigned char *k
+);
\ No newline at end of file
--- a/romulus/Implementations/crypto_aead/romulusn1v12/opt32/encrypt.c
+++ b/romulus/Implementations/crypto_aead/romulusn1v12/opt32/encrypt.c
+#include "skinny128.h"
+#include "tk_schedule.h"
+#include "romulus.h"
+#include <string.h>
+#include <stdio.h>
+
+/* Encryption and authentication using Romulus-N1.
+ *
+ * c    : output; receives mlen ciphertext bytes followed by a TAGBYTES-byte tag
+ * clen : set to mlen + TAGBYTES
+ * m    : plaintext, mlen bytes
+ * ad   : associated data, adlen bytes
+ * nsec : unused
+ * npub : passed as tk2 to precompute_rtk2_3 for the call that ends the AD phase
+ * k    : passed as tk3 to every precompute_rtk2_3 call
+ *
+ * Always returns 0.
+ *
+ * The macros UPDATE_CTR, G and RHO (romulus.h) assign to the locals 'tmp' and
+ * 'pad' by name, so these two variables must keep their names. */
+int crypto_aead_encrypt
+    (unsigned char *c, unsigned long long *clen,
+     const unsigned char *m, unsigned long long mlen,
+     const unsigned char *ad, unsigned long long adlen,
+     const unsigned char *nsec,
+     const unsigned char *npub,
+     const unsigned char *k) {
+
+    int i;
+    u32 tmp;                                    // scratch word written by UPDATE_CTR and G
+    skinny_128_384_tks tks;
+    u8 state[BLOCKBYTES], pad[BLOCKBYTES];      // 'pad' is also the scratch block of RHO
+    (void)nsec;
+
+    // ----------------- Initialization -----------------
+    *clen = mlen + TAGBYTES;
+    memset(tks.tk1, 0x00, KEYBYTES);
+    memset(state, 0x00, BLOCKBYTES);
+    tks.tk1[0] = 0x01;                          //56-bit LFSR counter
+    // ----------------- Initialization -----------------
+
+    // ----------------- Process the associated data -----------------
+    //Handle the special case of no associated data
+    if (adlen == 0) {
+        UPDATE_CTR(tks.tk1);
+        SET_DOMAIN(tks, 0x1A);
+        precompute_rtk2_3(tks.rtk2_3, npub, k);
+        precompute_rtk1(tks.rtk1, tks.tk1);
+        skinny128_384(state, state, tks.rtk1, tks.rtk2_3); 
+    } else {
+        // Process all double blocks except the last
+        SET_DOMAIN(tks, 0x08);
+        while (adlen > 2*BLOCKBYTES) {          // each pass consumes 32 bytes of ad
+            UPDATE_CTR(tks.tk1);
+            XOR_BLOCK(state, state, ad);        // first 16 bytes are XORed into the state
+            precompute_rtk2_3(tks.rtk2_3, ad + BLOCKBYTES, k);  // next 16 bytes are used as tk2
+            precompute_rtk1(tks.rtk1, tks.tk1);
+            skinny128_384(state, state, tks.rtk1, tks.rtk2_3); 
+            UPDATE_CTR(tks.tk1);
+            ad += 2*BLOCKBYTES;
+            adlen -= 2*BLOCKBYTES;
+        }
+        //Pad and process the left-over blocks 
+        UPDATE_CTR(tks.tk1);
+        if (adlen == 2*BLOCKBYTES) {
+            // Left-over complete double block
+            XOR_BLOCK(state, state, ad);
+            precompute_rtk2_3(tks.rtk2_3, ad + BLOCKBYTES, k);
+            precompute_rtk1(tks.rtk1, tks.tk1);
+            skinny128_384(state, state, tks.rtk1, tks.rtk2_3); 
+            UPDATE_CTR(tks.tk1);
+            SET_DOMAIN(tks, 0x18);
+        } else if (adlen > BLOCKBYTES) {
+            //  Left-over partial double block
+            adlen -= BLOCKBYTES;                // adlen now counts the bytes of the second, partial block
+            XOR_BLOCK(state, state, ad);
+            memcpy(pad, ad + BLOCKBYTES, adlen);
+            memset(pad + adlen, 0x00, 15 - adlen);  // zero-fill up to byte 14
+            pad[15] = adlen;                    // last byte holds the number of real bytes
+            precompute_rtk2_3(tks.rtk2_3, pad, k);
+            precompute_rtk1(tks.rtk1, tks.tk1);
+            skinny128_384(state, state, tks.rtk1, tks.rtk2_3);
+            UPDATE_CTR(tks.tk1);
+            SET_DOMAIN(tks, 0x1A);
+        } else if (adlen == BLOCKBYTES) {
+            //  Left-over complete single block 
+            XOR_BLOCK(state, state, ad);
+            SET_DOMAIN(tks, 0x18);
+        } else {
+            // Left-over partial single block
+            for(i =0; i < (int)adlen; i++)
+                state[i] ^= ad[i];
+            state[15] ^= adlen;                 // length byte of the padding, XORed into the state
+            SET_DOMAIN(tks, 0x1A);
+        }
+        precompute_rtk2_3(tks.rtk2_3, npub, k); // closing call of the AD phase uses npub as tk2
+        precompute_rtk1(tks.rtk1, tks.tk1);
+        skinny128_384(state, state, tks.rtk1, tks.rtk2_3);
+    }
+    // ----------------- Process the associated data -----------------
+
+    // ----------------- Process the plaintext -----------------
+    memset(tks.tk1, 0, KEYBYTES);               // rtk2_3 is left as computed from (npub, k) above
+    tks.tk1[0] = 0x01;          //init the 56-bit LFSR counter
+    if (mlen == 0) {
+        UPDATE_CTR(tks.tk1);
+        SET_DOMAIN(tks, 0x15);
+        precompute_rtk1(tks.rtk1, tks.tk1);
+        skinny128_384(state, state, tks.rtk1, tks.rtk2_3);
+    } else {
+        //process all blocks except the last
+        SET_DOMAIN(tks, 0x04);
+        while (mlen > BLOCKBYTES) {
+            RHO(state,c,m);                     // c = G(state) ^ m, then state ^= m
+            UPDATE_CTR(tks.tk1);
+            precompute_rtk1(tks.rtk1, tks.tk1);
+            skinny128_384(state, state, tks.rtk1, tks.rtk2_3);
+            c += BLOCKBYTES;
+            m += BLOCKBYTES;
+            mlen -= BLOCKBYTES;
+        }
+        //pad and process the last block
+        UPDATE_CTR(tks.tk1);
+        if (mlen < BLOCKBYTES) {
+            for(i = 0; i < (int)mlen; i++) {    // byte-wise equivalent of RHO for a partial block
+                tmp = m[i];         //use of tmp variable just in case 'c = m'
+                c[i] = m[i] ^ (state[i] >> 1) ^ (state[i] & 0x80) ^ (state[i] << 7);
+                state[i] ^= (u8)tmp;
+            }
+            state[15] ^= (u8)mlen; //padding
+            SET_DOMAIN(tks, 0x15);
+        } else {
+            RHO(state,c,m);
+            SET_DOMAIN(tks, 0x14);
+        }
+        precompute_rtk1(tks.rtk1, tks.tk1);
+        skinny128_384(state, state, tks.rtk1, tks.rtk2_3);
+        c += mlen;                              // mlen is the length of the last block here; c now points at the tag slot
+    }
+    // ----------------- Process the plaintext -----------------
+
+    // ----------------- Generate the tag -----------------
+    G(state,state);
+    memcpy(c, state, TAGBYTES);
+    // ----------------- Generate the tag -----------------
+
+    return 0;
+}
+
+
+/* Decryption and tag verification using Romulus-N1.
+ *
+ * m    : output; receives clen - TAGBYTES plaintext bytes
+ * mlen : set to clen - TAGBYTES
+ * nsec : unused
+ * c    : ciphertext followed by a TAGBYTES-byte tag, clen bytes in total
+ * ad   : associated data, adlen bytes
+ * npub : passed as tk2 to precompute_rtk2_3 for the call that ends the AD phase
+ * k    : passed as tk3 to every precompute_rtk2_3 call
+ *
+ * Returns 0 when the recomputed tag equals the received one, and -1 when it
+ * does not or when clen is smaller than TAGBYTES.
+ *
+ * The plaintext is written to m before the tag is checked; callers must
+ * discard it whenever the return value is not 0.
+ *
+ * The macros UPDATE_CTR, G and RHO_INV (romulus.h) assign to the locals 'tmp'
+ * and 'pad' by name, so these two variables must keep their names. */
+int crypto_aead_decrypt
+    (unsigned char *m, unsigned long long *mlen,
+     unsigned char *nsec,
+     const unsigned char *c, unsigned long long clen,
+     const unsigned char *ad, unsigned long long adlen,
+     const unsigned char *npub,
+     const unsigned char *k) {
+
+    int i;
+    u32 tmp;                                    // scratch word written by UPDATE_CTR and G, then tag-difference accumulator
+    skinny_128_384_tks tks;
+    u8 state[BLOCKBYTES], pad[BLOCKBYTES];      // 'pad' is also the scratch block of RHO_INV
+    (void)nsec;
+
+    if (clen < TAGBYTES)
+        return -1;
+
+    // ----------------- Initialization -----------------
+    *mlen = clen - TAGBYTES;
+    memset(tks.tk1, 0x00, KEYBYTES);
+    memset(state, 0x00, BLOCKBYTES);
+    tks.tk1[0] = 0x01;                          //56-bit LFSR counter
+    // ----------------- Initialization -----------------
+
+    // ----------------- Process the associated data -----------------
+    //Handle the special case of no associated data
+    if (adlen == 0) {
+        UPDATE_CTR(tks.tk1);
+        SET_DOMAIN(tks, 0x1A);
+        precompute_rtk2_3(tks.rtk2_3, npub, k);
+        precompute_rtk1(tks.rtk1, tks.tk1);
+        skinny128_384(state, state, tks.rtk1, tks.rtk2_3);
+    } else {
+        // Process all double blocks except the last
+        SET_DOMAIN(tks, 0x08);
+        while (adlen > 2*BLOCKBYTES) {          // each pass consumes 32 bytes of ad
+            UPDATE_CTR(tks.tk1);
+            XOR_BLOCK(state, state, ad);        // first 16 bytes are XORed into the state
+            precompute_rtk2_3(tks.rtk2_3, ad + BLOCKBYTES, k);  // next 16 bytes are used as tk2
+            precompute_rtk1(tks.rtk1, tks.tk1);
+            skinny128_384(state, state, tks.rtk1, tks.rtk2_3);
+            UPDATE_CTR(tks.tk1);
+            ad += 2*BLOCKBYTES;
+            adlen -= 2*BLOCKBYTES;
+        }
+        //Pad and process the left-over blocks
+        UPDATE_CTR(tks.tk1);
+        if (adlen == 2*BLOCKBYTES) {
+            // Left-over complete double block
+            XOR_BLOCK(state, state, ad);
+            precompute_rtk2_3(tks.rtk2_3, ad + BLOCKBYTES, k);
+            precompute_rtk1(tks.rtk1, tks.tk1);
+            skinny128_384(state, state, tks.rtk1, tks.rtk2_3);
+            UPDATE_CTR(tks.tk1);
+            SET_DOMAIN(tks, 0x18);
+        } else if (adlen > BLOCKBYTES) {
+            //  Left-over partial double block
+            adlen -= BLOCKBYTES;                // adlen now counts the bytes of the second, partial block
+            XOR_BLOCK(state, state, ad);
+            memcpy(pad, ad + BLOCKBYTES, adlen);
+            memset(pad + adlen, 0x00, 15 - adlen);  // zero-fill up to byte 14
+            pad[15] = adlen;                    // last byte holds the number of real bytes
+            precompute_rtk2_3(tks.rtk2_3, pad, k);
+            precompute_rtk1(tks.rtk1, tks.tk1);
+            skinny128_384(state, state, tks.rtk1, tks.rtk2_3);
+            UPDATE_CTR(tks.tk1);
+            SET_DOMAIN(tks, 0x1A);
+        } else if (adlen == BLOCKBYTES) {
+            //  Left-over complete single block
+            XOR_BLOCK(state, state, ad);
+            SET_DOMAIN(tks, 0x18);
+        } else {
+            // Left-over partial single block
+            for(i =0; i < (int)adlen; i++)
+                state[i] ^= ad[i];
+            state[15] ^= adlen;                 // length byte of the padding, XORed into the state
+            SET_DOMAIN(tks, 0x1A);
+        }
+        precompute_rtk2_3(tks.rtk2_3, npub, k); // closing call of the AD phase uses npub as tk2
+        precompute_rtk1(tks.rtk1, tks.tk1);
+        skinny128_384(state, state, tks.rtk1, tks.rtk2_3);
+    }
+    // ----------------- Process the associated data -----------------
+
+    // ----------------- Process the ciphertext -----------------
+    clen -= TAGBYTES;
+    memset(tks.tk1, 0, KEYBYTES);               // rtk2_3 is left as computed from (npub, k) above
+    tks.tk1[0] = 0x01;          //init the 56-bit LFSR counter
+    if (clen == 0) {
+        UPDATE_CTR(tks.tk1);
+        SET_DOMAIN(tks, 0x15);
+        precompute_rtk1(tks.rtk1, tks.tk1);
+        skinny128_384(state, state, tks.rtk1, tks.rtk2_3);
+    } else {
+        //process all blocks except the last
+        SET_DOMAIN(tks, 0x04);
+        while (clen > BLOCKBYTES) {
+            RHO_INV(state,c,m);                 // m = G(state) ^ c, then state ^= m
+            UPDATE_CTR(tks.tk1);
+            precompute_rtk1(tks.rtk1, tks.tk1);
+            skinny128_384(state, state, tks.rtk1, tks.rtk2_3);
+            c += BLOCKBYTES;
+            m += BLOCKBYTES;
+            clen -= BLOCKBYTES;
+        }
+        //pad and process the last block
+        UPDATE_CTR(tks.tk1);
+        if (clen < BLOCKBYTES) {
+            for(i = 0; i < (int)clen; i++) {    // byte-wise equivalent of RHO_INV for a partial block
+                m[i] = c[i] ^ (state[i] >> 1) ^ (state[i] & 0x80) ^ (state[i] << 7);
+                state[i] ^= m[i];
+            }
+            state[15] ^= (u8)clen; //padding
+            SET_DOMAIN(tks, 0x15);
+        } else {
+            RHO_INV(state,c,m);
+            SET_DOMAIN(tks, 0x14);
+        }
+        precompute_rtk1(tks.rtk1, tks.tk1);
+        skinny128_384(state, state, tks.rtk1, tks.rtk2_3);
+    }
+    // ----------------- Process the ciphertext -----------------
+
+    // ----------------- Generate and check the tag -----------------
+    G(state,state);
+    tmp = 0;
+    // c still points at the start of the last block and clen is that block's
+    // length (0 for an empty message), so the received tag starts at c[clen].
+    for(i = 0; i < TAGBYTES; i++)
+        tmp |= state[i] ^ c[clen+i];   //constant-time tag comparison
+    // ----------------- Generate and check the tag -----------------
+
+    // Map the accumulated difference to 0 (match) or -1 (mismatch) without a
+    // branch, so that every failure path of this function reports -1.
+    return -(int)((tmp | (0u - tmp)) >> 31);
+}
\ No newline at end of file
--- a/romulus/Implementations/crypto_aead/romulusn1v12/opt32/romulus.h
+++ b/romulus/Implementations/crypto_aead/romulusn1v12/opt32/romulus.h
+#ifndef ROMULUSN1_H_
+#define ROMULUSN1_H_
+
+#include "skinny128.h"
+
+typedef unsigned char u8;
+typedef unsigned int u32;               // NOTE(review): used as a 32-bit word throughout - assumes unsigned int is 32 bits wide
+typedef struct {                        // tweakey material: tk1 feeds precompute_rtk1, rtk1/rtk2_3 are passed to skinny128_384
+    u8 tk1[16];                         //to manipulate tk1 byte-wise
+    u32 rtk1[4*16];                     //to avoid tk schedule recomputations
+    u32 rtk2_3[4*SKINNY128_384_ROUNDS]; //all round tweakeys
+} skinny_128_384_tks;
+
+#define TAGBYTES    16  // number of tag bytes appended after the ciphertext
+#define KEYBYTES    16  // also used as the length when tk1 is cleared with memset
+#define BLOCKBYTES  16  // block size in bytes
+
+#define SET_DOMAIN(tks, domain) ((tks).tk1[7] = (domain))   // writes the domain value to byte 7 of tk1
+
+//G as defined in the Romulus specification, computed on 32-bit words.
+//Every byte b of the 16-byte block y is mapped to
+//(b >> 1) ^ (b & 0x80) ^ ((b << 7) & 0x80) and stored at the same position in x.
+//Expects a u32 variable named 'tmp' in the calling scope; x and y are accessed
+//through u32 pointers.
+#define G(x,y) ({                                                                           \
+    for (int g_word = 0; g_word < 4; g_word++) {                                            \
+        tmp = ((u32*)(y))[g_word];                                                          \
+        ((u32*)(x))[g_word] = (tmp >> 1 & 0x7f7f7f7f) ^ ((tmp ^ (tmp << 7)) & 0x80808080);  \
+    }                                                                                       \
+})
+
+/* update the counter in tk1 in a 32-bit word-wise manner
+ *
+ * Words 0 and 1 of tk1 are shifted left by one bit as a single value: the top
+ * bit of word 0 moves into bit 0 of word 1. The most significant byte of
+ * word 1 is written back unchanged, and 0x95 is XORed into word 0 when bit 23
+ * of word 1 was set before the shift.
+ * Expects a u32 variable named 'tmp' in the calling scope.
+ * NOTE(review): the preserved byte is tk1[7] (the SET_DOMAIN byte) only on a
+ * little-endian target - confirm the supported platforms. */
+#define UPDATE_CTR(tk1) ({                              \
+    tmp = ((u32*)(tk1))[1];                             \
+    ((u32*)(tk1))[1] = (tmp << 1) & 0x00ffffff;         \
+    ((u32*)(tk1))[1] |= (((u32*)(tk1))[0] >> 31);       \
+    ((u32*)(tk1))[1] |= tmp & 0xff000000;               \
+    ((u32*)(tk1))[0] <<= 1;                             \
+    if ((tmp >> 23) & 0x01)                             \
+        ((u32*)(tk1))[0] ^= 0x95;                       \
+})
+
+//x <- y ^ z for 128-bit blocks, one 32-bit word at a time
+//(all three arguments are accessed through u32 pointers; x may be y or z)
+#define XOR_BLOCK(x,y,z) ({                                                     \
+    for (int xb_word = 0; xb_word < 4; xb_word++)                               \
+        ((u32*)(x))[xb_word] = ((u32*)(y))[xb_word] ^ ((u32*)(z))[xb_word];     \
+})
+
+
+//Rho as defined in the Romulus specification
+/* use pad as a tmp variable in case y = z
+ * x: state block, y: output block, z: input block.
+ * Result: y = G(x) ^ z, computed from the old x, and then x = x ^ z.
+ * Expects 'pad' (a 16-byte block) and 'tmp' (u32, used by G) in the calling scope. */
+#define RHO(x,y,z) ({       \
+    G(pad,x);               \
+    XOR_BLOCK(y, pad, z);   \
+    XOR_BLOCK(x, x, z);     \
+})
+
+//Rho inverse as defined in the Romulus specification
+/* use pad as a tmp variable in case y = z
+ * x: state block, y: input block, z: output block.
+ * Result: z = G(x) ^ y, computed from the old x, and then x = x ^ z.
+ * Expects 'pad' (a 16-byte block) and 'tmp' (u32, used by G) in the calling scope. */
+#define RHO_INV(x, y, z) ({ \
+    G(pad, x);              \
+    XOR_BLOCK(z, pad, y);   \
+    XOR_BLOCK(x, x, z);     \
+})
+
+#endif  // ROMULUSN1_H_
\ No newline at end of file
--- a/romulus/Implementations/crypto_aead/romulusn1v12/opt32/skinny128.c
+++ b/romulus/Implementations/crypto_aead/romulusn1v12/opt32/skinny128.c
+/******************************************************************************
+* Constant-time implementation of the SKINNY tweakable block ciphers.
+*
+* This implementation doesn't compute the ShiftRows operation. Some masks and
+* shifts are applied during the MixColumns operation so that the proper bits
+* are XORed together. Moreover, the row permutation within the MixColumns 
+* is omitted, as well as the bit permutation at the end of the Sbox. The rows
+* are synchronized with the classical representation after only 4 rounds.
+* Therefore, this implementation relies on a "QUADRUPLE_ROUND" routine.
+*
+* The Sbox computation takes advantage of some symmetry in the 8-bit Sbox to
+* turn it into a 4-bit S-box computation. Although the last bit permutation
+* within the Sbox is not computed, the bit ordering is synchronized with the 
+* classical representation after 2 calls.
+*
+* @author	Alexandre Adomnicai, Nanyang Technological University,
+*			alexandre.adomnicai@ntu.edu.sg
+*
+* @date		May 2020
+******************************************************************************/
+#include <stdio.h>
+#include <string.h>
+#include "skinny128.h"
+#include "tk_schedule.h"
+
+/******************************************************************************
+* MixColumns variant used for rounds i with (i % 4) == 0.
+* Updates each of the four state words in place with three masked-rotate-XOR
+* steps; every step reads the result of the previous one.
+******************************************************************************/
+void mixcolumns_0(u32* state) {
+	for(u32 *word = state; word != state + 4; word++) {
+		u32 w = *word;
+		w ^= ROR(ROR(w,24) & 0x0c0c0c0c, 30);
+		w ^= ROR(ROR(w,16) & 0xc0c0c0c0, 4);
+		w ^= ROR(ROR(w,8) & 0x0c0c0c0c, 2);
+		*word = w;
+	}
+}
+
+/******************************************************************************
+* MixColumns variant used for rounds i with (i % 4) == 1.
+* Updates each of the four state words in place with three masked-rotate-XOR
+* steps; every step reads the result of the previous one.
+******************************************************************************/
+void mixcolumns_1(u32* state) {
+	for(u32 *word = state; word != state + 4; word++) {
+		u32 w = *word;
+		w ^= ROR(ROR(w,16) & 0x30303030, 30);
+		w ^= ROR(w & 0x03030303, 28);
+		w ^= ROR(ROR(w,16) & 0x30303030, 2);
+		*word = w;
+	}
+}
+
+/******************************************************************************
+* MixColumns variant used for rounds i with (i % 4) == 2.
+* Updates each of the four state words in place with three masked-rotate-XOR
+* steps; every step reads the result of the previous one.
+******************************************************************************/
+void mixcolumns_2(u32* state) {
+	for(u32 *word = state; word != state + 4; word++) {
+		u32 w = *word;
+		w ^= ROR(ROR(w,8) & 0xc0c0c0c0, 6);
+		w ^= ROR(ROR(w,16) & 0x0c0c0c0c, 28);
+		w ^= ROR(ROR(w,24) & 0xc0c0c0c0, 2);
+		*word = w;
+	}
+}
+
+/******************************************************************************
+* MixColumns variant used for rounds i with (i % 4) == 3.
+* Updates each of the four state words in place with three masked-rotate-XOR
+* steps; every step reads the result of the previous one.
+******************************************************************************/
+void mixcolumns_3(u32* state) {
+	for(u32 *word = state; word != state + 4; word++) {
+		u32 w = *word;
+		w ^= ROR(w & 0x03030303, 30);
+		w ^= ROR(w & 0x30303030, 4);
+		w ^= ROR(w & 0x03030303, 26);
+		*word = w;
+	}
+}
+
+/******************************************************************************
+* Encryption of a single block without any operation mode using SKINNY-128-384.
+* RTK1 and RTK2_3 are given separately to take advantage of the fact that
+* TK2 and TK3 remain the same through the entire data encryption/decryption.
+*
+* ctext  : 16-byte output block (may be the same buffer as ptext)
+* ptext  : 16-byte input block
+* rtk1   : 64 words of TK1 round material, reused every 4 quadruple rounds
+* rtk2_3 : 16 words of TK2/TK3 round material per quadruple round
+******************************************************************************/
+void skinny128_384(u8* ctext, const u8* ptext, const u32* rtk1,  const u32* rtk2_3) {
+	u32 tmp; 					// scratch word assigned by the SWAPMOVE macro
+	u32 state[4]; 				// 128-bit internal state
+	packing(state, ptext); 		// from byte to bitsliced representation
+	// One pass per group of four rounds: rtk2_3 advances by 16 words each
+	// time, while rtk1 cycles through its four 16-word groups.
+	for(int qr = 0; qr < SKINNY128_384_ROUNDS/4; qr++)
+		QUADRUPLE_ROUND(state, rtk1 + 16*(qr & 3), rtk2_3 + 16*qr);
+	unpacking(ctext, state);	// from bitsliced to byte representation
+}
\ No newline at end of file
--- a/romulus/Implementations/crypto_aead/romulusn1v12/opt32/skinny128.h
+++ b/romulus/Implementations/crypto_aead/romulusn1v12/opt32/skinny128.h
+#ifndef SKINNY128_H_
+#define SKINNY128_H_
+
+typedef unsigned char u8;
+typedef unsigned int u32;		// NOTE(review): used as a 32-bit word - assumes unsigned int is 32 bits wide
+
+#define SKINNY128_384_ROUNDS	56	// skinny128_384() in skinny128.c applies 14 QUADRUPLE_ROUNDs, i.e. 56 rounds
+/* Four consecutive rounds on the packed state.
+ *
+ * Each round is a sequence of OR/NOT/XOR and SWAPMOVE steps on the four state
+ * words (presumably the S-box layer described in the header of skinny128.c -
+ * confirm), followed by the XOR of four rtk1 words and four rtk2_3 words and
+ * by one of mixcolumns_0..mixcolumns_3.
+ *
+ * state  : u32[4]; expanded without parentheses, so pass a plain identifier
+ * rtk1   : words [0..15] are read
+ * rtk2_3 : words [0..15] are read
+ *
+ * Requirements at the point of use: a u32 variable named 'tmp' (SWAPMOVE
+ * assigns to it), the SWAPMOVE macro from tk_schedule.h, and declarations of
+ * mixcolumns_0..mixcolumns_3. */
+#define QUADRUPLE_ROUND(state, rtk1, rtk2_3) ({			\
+	state[3] ^= ~(state[0] | state[1]);					\
+	SWAPMOVE(state[2], state[1], 0x55555555, 1);		\
+	SWAPMOVE(state[3], state[2], 0x55555555, 1);		\
+	state[1] ^= ~(state[2] | state[3]);					\
+	SWAPMOVE(state[1], state[0], 0x55555555, 1);		\
+	SWAPMOVE(state[0], state[3], 0x55555555, 1);		\
+	state[3] ^= ~(state[0] | state[1]);					\
+	SWAPMOVE(state[2], state[1], 0x55555555, 1);		\
+	SWAPMOVE(state[3], state[2], 0x55555555, 1);		\
+	state[1] ^= (state[2] | state[3]);					\
+	SWAPMOVE(state[3], state[0], 0x55555555, 0);		\
+	state[0] ^= (rtk1)[0];								\
+	state[1] ^= (rtk1)[1];								\
+	state[2] ^= (rtk1)[2];								\
+	state[3] ^= (rtk1)[3];								\
+	state[0] ^= (rtk2_3)[0];							\
+	state[1] ^= (rtk2_3)[1];							\
+	state[2] ^= (rtk2_3)[2];							\
+	state[3] ^= (rtk2_3)[3];							\
+	mixcolumns_0(state);								\
+	state[1] ^= ~(state[2] | state[3]); 				\
+	SWAPMOVE(state[1], state[0], 0x55555555, 1);		\
+	SWAPMOVE(state[0], state[3], 0x55555555, 1);		\
+	state[3] ^= ~(state[0] | state[1]);					\
+	SWAPMOVE(state[2], state[1], 0x55555555, 1);		\
+	SWAPMOVE(state[3], state[2], 0x55555555, 1);		\
+	state[1] ^= ~(state[2] | state[3]);					\
+	SWAPMOVE(state[1], state[0], 0x55555555, 1);		\
+	SWAPMOVE(state[0], state[3], 0x55555555, 1);		\
+	state[3] ^= (state[0] | state[1]);					\
+	SWAPMOVE(state[1], state[2], 0x55555555, 0);		\
+	state[0] ^= (rtk1)[4];								\
+	state[1] ^= (rtk1)[5];								\
+	state[2] ^= (rtk1)[6];								\
+	state[3] ^= (rtk1)[7];								\
+	state[0] ^= (rtk2_3)[4];							\
+	state[1] ^= (rtk2_3)[5];							\
+	state[2] ^= (rtk2_3)[6];							\
+	state[3] ^= (rtk2_3)[7];							\
+	mixcolumns_1(state);								\
+	state[3] ^= ~(state[0] | state[1]);					\
+	SWAPMOVE(state[2], state[1], 0x55555555, 1);		\
+	SWAPMOVE(state[3], state[2], 0x55555555, 1);		\
+	state[1] ^= ~(state[2] | state[3]);					\
+	SWAPMOVE(state[1], state[0], 0x55555555, 1);		\
+	SWAPMOVE(state[0], state[3], 0x55555555, 1);		\
+	state[3] ^= ~(state[0] | state[1]);					\
+	SWAPMOVE(state[2], state[1], 0x55555555, 1);		\
+	SWAPMOVE(state[3], state[2], 0x55555555, 1);		\
+	state[1] ^= (state[2] | state[3]);					\
+	SWAPMOVE(state[3], state[0], 0x55555555, 0);		\
+	state[0] ^= (rtk1)[8];								\
+	state[1] ^= (rtk1)[9];								\
+	state[2] ^= (rtk1)[10];								\
+	state[3] ^= (rtk1)[11];								\
+	state[0] ^= (rtk2_3)[8];							\
+	state[1] ^= (rtk2_3)[9];							\
+	state[2] ^= (rtk2_3)[10];							\
+	state[3] ^= (rtk2_3)[11];							\
+	mixcolumns_2(state);								\
+	state[1] ^= ~(state[2] | state[3]); 				\
+	SWAPMOVE(state[1], state[0], 0x55555555, 1);		\
+	SWAPMOVE(state[0], state[3], 0x55555555, 1);		\
+	state[3] ^= ~(state[0] | state[1]);					\
+	SWAPMOVE(state[2], state[1], 0x55555555, 1);		\
+	SWAPMOVE(state[3], state[2], 0x55555555, 1);		\
+	state[1] ^= ~(state[2] | state[3]);					\
+	SWAPMOVE(state[1], state[0], 0x55555555, 1);		\
+	SWAPMOVE(state[0], state[3], 0x55555555, 1);		\
+	state[3] ^= (state[0] | state[1]);					\
+	SWAPMOVE(state[1], state[2], 0x55555555, 0);		\
+	state[0] ^= (rtk1)[12];								\
+	state[1] ^= (rtk1)[13];								\
+	state[2] ^= (rtk1)[14];								\
+	state[3] ^= (rtk1)[15];								\
+	state[0] ^= (rtk2_3)[12];							\
+	state[1] ^= (rtk2_3)[13];							\
+	state[2] ^= (rtk2_3)[14];							\
+	state[3] ^= (rtk2_3)[15];							\
+	mixcolumns_3(state);								\
+})
+/* Encrypts the 16-byte block ptext into ctext with SKINNY-128-384.
+ * rtk1 and rtk2_3 are the precomputed round tweakeys (see precompute_rtk1 and
+ * precompute_rtk2_3 in tk_schedule.h). Defined in skinny128.c. */
+void skinny128_384(u8* ctext, const u8* ptext, const u32* rtk1, const u32* rtk2_3);
+
+#endif  // SKINNY128_H_
\ No newline at end of file
--- a/romulus/Implementations/crypto_aead/romulusn1v12/opt32/tk_schedule.c
+++ b/romulus/Implementations/crypto_aead/romulusn1v12/opt32/tk_schedule.c
--- a/romulus/Implementations/crypto_aead/romulusn1v12/opt32/tk_schedule.h
+++ b/romulus/Implementations/crypto_aead/romulusn1v12/opt32/tk_schedule.h
+#ifndef TK_SCHEDULE_H_
+#define TK_SCHEDULE_H_
+
+typedef unsigned char u8;
+typedef unsigned int u32;		// NOTE(review): used as a 32-bit word - assumes unsigned int is 32 bits wide
+
+void packing(u32* out, const u8* in);		// byte representation -> bitsliced state (as used by skinny128_384)
+void unpacking(u8* out, u32 *in);			// bitsliced state -> byte representation
+void precompute_rtk2_3(u32* rtk, const u8* tk2, const u8* tk3);	// fills rtk from the 16-byte blocks tk2 and tk3
+void precompute_rtk1(u32* rtk1, const u8* tk1);					// fills rtk1 from the 16-byte block tk1
+
+#define ROR(x,y) 		(((x) >> (y)) | ((x) << (32 - (y))))	// rotate the 32-bit value x right by y; y must be in 1..31 (y == 0 would shift by 32)
+
+//x[i] ^= y[i] for the four elements i = 0..3
+#define XOR_BLOCKS(x,y) ({ 						\
+	for (int xbs_i = 0; xbs_i < 4; xbs_i++)		\
+		(x)[xbs_i] ^= (y)[xbs_i];				\
+})
+	
+//Exchanges the bits of b selected by mask with the bits of a selected by
+//(mask << n). a and b must be lvalues and may name the same object.
+//Expects a u32 variable named 'tmp' in the calling scope.
+//Every argument is parenthesised so that expression arguments expand safely.
+#define SWAPMOVE(a, b, mask, n)	({				\
+	tmp = ((b) ^ ((a) >> (n))) & (mask);		\
+	(b) ^= tmp;									\
+	(a) ^= (tmp << (n));						\
+})
+
+#define LE_LOAD(x, y) 				\
+	*(x) = (((u32)(y)[3] << 24) | 	\
+		((u32)(y)[2] << 16) 	| 	\
+		((u32)(y)[1] << 8) 		| 	\
+		(y)[0]);
+
+#define LE_STORE(x, y)				\
+	(x)[0] = (y) & 0xff; 			\
+	(x)[1] = ((y) >> 8) & 0xff; 	\
+	(x)[2] = ((y) >> 16) & 0xff; 	\
+	(x)[3] = (y) >> 24;
+
+#endif  // TK_SCHEDULE_H_
\ No newline at end of file
--- a/skinny/Implementations/crypto_aead/skinnyaeadm1+v11/LWC_AEAD_KAT_128_128.txt
+++ b/skinny/Implementations/crypto_aead/skinnyaeadm1+v11/LWC_AEAD_KAT_128_128.txt
--- a/skinny/Implementations/crypto_aead/skinnyaeadm1+v11/armcortexm_1/api.h
+++ b/skinny/Implementations/crypto_aead/skinnyaeadm1+v11/armcortexm_1/api.h
+#define CRYPTO_KEYBYTES 16    /* size constants of the crypto_aead API, expressed in bytes */
+#define CRYPTO_NSECBYTES 0    /* 0: encrypt.c in this directory discards its nsec argument */
+#define CRYPTO_NPUBBYTES 16
+#define CRYPTO_ABYTES 16      /* same value as TAGBYTES in skinnyaead.h; encrypt.c sets *clen = mlen + TAGBYTES */
+#define CRYPTO_NOOVERLAP 1    /* NOTE(review): presumably declares that input and output buffers must not overlap - confirm */
--- a/skinny/Implementations/crypto_aead/skinnyaeadm1+v11/armcortexm_1/crypto_aead.h
+++ b/skinny/Implementations/crypto_aead/skinnyaeadm1+v11/armcortexm_1/crypto_aead.h
+/* SKINNY-AEAD-M1 authenticated encryption (defined in encrypt.c of this
+ * directory). Writes mlen ciphertext bytes followed by a 16-byte tag to c and
+ * sets *clen = mlen + 16. nsec is ignored. Always returns 0. */
+int crypto_aead_encrypt(
+	unsigned char *c, unsigned long long *clen,
+	const unsigned char *m, unsigned long long mlen,
+	const unsigned char *ad, unsigned long long adlen,
+	const unsigned char *nsec,
+	const unsigned char *npub,
+	const unsigned char *k
+);
+/* SKINNY-AEAD-M1 decryption and tag verification (defined in encrypt.c of this
+ * directory). Returns -1 when clen is smaller than the 16-byte tag; otherwise
+ * sets *mlen = clen - 16, writes the plaintext to m and returns 0 when the
+ * recomputed tag equals the received one, a non-zero value when it does not.
+ * nsec is ignored. */
+int crypto_aead_decrypt(
+	unsigned char *m, unsigned long long *mlen,
+	unsigned char *nsec,
+	const unsigned char *c, unsigned long long clen,
+	const unsigned char *ad, unsigned long long adlen,
+	const unsigned char *npub,
+	const unsigned char *k
+);
\ No newline at end of file
--- a/skinny/Implementations/crypto_aead/skinnyaeadm1+v11/armcortexm_1/encrypt.c
+++ b/skinny/Implementations/crypto_aead/skinnyaeadm1+v11/armcortexm_1/encrypt.c
+/******************************************************************************
+* Constant-time implementation of SKINNY-AEAD-M1 (v1.1).
+*
+* Two blocks are treated in parallel with SKINNY-128-384 whenever possible.
+*
+* For more details, see the paper at: https://
+*
+* @author   Alexandre Adomnicai, Nanyang Technological University,
+*           alexandre.adomnicai@ntu.edu.sg
+*
+* @date     May 2020
+******************************************************************************/
+#include "skinny128.h"
+#include "skinnyaead.h"
+#include <string.h>
+#include <stdio.h>
+
+/******************************************************************************
+* XORs the 16-byte block y into the 16-byte block x, one byte at a time.
+******************************************************************************/
+static void xor_block(u8 * x, const u8* y) {
+    const u8 *end = y + BLOCKBYTES;
+    while (y != end)
+        *x++ ^= *y++;
+}
+
+/******************************************************************************
+* Encryption and authentication using SKINNY-AEAD-M1
+*
+* c    : output; receives mlen ciphertext bytes followed by a TAGBYTES-byte tag
+* clen : set to mlen + TAGBYTES
+* m    : plaintext, mlen bytes
+* ad   : associated data, adlen bytes
+* nsec : unused
+* npub : passed as tk2 to tkschedule_lfsr
+* k    : passed as tk3 to tkschedule_lfsr
+*
+* Always returns 0.
+*
+* 'tmp' is the 16-byte TK1 block: bytes 0..7 receive the LFSR value
+* (LE_STR_64) and byte 15 the domain (SET_DOMAIN). The message is processed
+* first, then the associated data; the tag is E(sum of plaintext blocks) XORed
+* with the sum of the encrypted associated-data blocks.
+* The UPDATE_LFSR macro assigns to the local 'feedback' by name.
+******************************************************************************/
+int crypto_aead_encrypt (unsigned char *c, unsigned long long *clen,
+                    const unsigned char *m, unsigned long long mlen,
+                    const unsigned char *ad, unsigned long long adlen,
+                    const unsigned char *nsec,
+                    const unsigned char *npub,
+                    const unsigned char *k) {
+    u64 i,lfsr = 1;
+    u8 feedback;                        // written by the UPDATE_LFSR macro
+    u32 rtk1[4*16];
+    u32 rtk2_3[4*SKINNY128_384_ROUNDS];
+    u8 tmp[BLOCKBYTES], auth[BLOCKBYTES], sum[BLOCKBYTES];
+    (void)nsec;
+
+    // ----------------- Initialization -----------------
+    *clen = mlen + TAGBYTES;
+    tkschedule_lfsr(rtk2_3, npub, k, SKINNY128_384_ROUNDS);
+    tkschedule_perm(rtk2_3);
+    memset(tmp, 0x00, BLOCKBYTES);
+    memset(auth, 0x00, BLOCKBYTES);
+    memset(sum, 0x00, BLOCKBYTES);
+    // ----------------- Initialization -----------------
+
+    // ----------------- Process the plaintext -----------------
+    while (mlen >= BLOCKBYTES) {        // while entire blocks to process
+        LE_STR_64(tmp, lfsr);
+        tkschedule_perm_tk1(rtk1, tmp); // precompute RTK1 given the LFSR
+        skinny128_384(c, rtk2_3, m, rtk1);  // argument order here: (ctext, rtk2_3, ptext, rtk1)
+        xor_block(sum, m);              // sum for tag computation
+        mlen -= BLOCKBYTES;
+        c += BLOCKBYTES;
+        m += BLOCKBYTES;
+        UPDATE_LFSR(lfsr);              // update lfsr for next block
+    }
+    SET_DOMAIN(tmp, 0x04);              // domain for tag computation
+    if (mlen > 0) {                     // last block is partial
+        LE_STR_64(tmp, lfsr);           // lfsr for last block
+        SET_DOMAIN(tmp, 0x01);          // domain for padding
+        for(i = 0; i < mlen; i++)
+            sum[i] ^= m[i];             // sum for tag computation
+        sum[i] ^= 0x80;                 // padding
+        tkschedule_perm_tk1(rtk1, tmp);
+        skinny128_384(auth, rtk2_3, auth, rtk1); // encrypt 'auth' = 0^16
+        for(i = 0; i < mlen; i++)
+            c[i] = auth[i] ^ m[i];      // encrypted padded block
+        c += mlen;                      // c now points at the tag slot
+        SET_DOMAIN(tmp, 0x05);          // domain for tag computation
+        UPDATE_LFSR(lfsr);
+    }
+    LE_STR_64(tmp, lfsr);               // lfsr for tag computation                                  
+    tkschedule_perm_tk1(rtk1, tmp);
+    skinny128_384(sum, rtk2_3, sum, rtk1);  // compute the tag
+    memcpy(c, sum, TAGBYTES);
+    // ----------------- Process the plaintext -----------------
+
+    // ----------------- Process the associated data -----------------
+    lfsr = 1;
+    SET_DOMAIN(tmp, 0x02);
+    memset(auth, 0x00, BLOCKBYTES);
+    while (adlen >= BLOCKBYTES) {
+        LE_STR_64(tmp, lfsr);
+        tkschedule_perm_tk1(rtk1, tmp);
+        skinny128_384(sum, rtk2_3, ad, rtk1);   // use 'sum' as tmp array
+        xor_block(auth, sum);
+        adlen -= BLOCKBYTES;
+        ad += BLOCKBYTES;
+        UPDATE_LFSR(lfsr);
+    }
+    if (adlen > 0) {
+        LE_STR_64(tmp, lfsr);
+        SET_DOMAIN(tmp, 0x03);          // domain for padding ad
+        tkschedule_perm_tk1(rtk1, tmp); // rtk1 is derived before 'tmp' is reused as the data block below
+        memset(tmp, 0x00, BLOCKBYTES);  // padding
+        memcpy(tmp, ad, adlen);         // padding
+        tmp[adlen] = 0x80;              // padding
+        skinny128_384(tmp, rtk2_3, tmp, rtk1);
+        xor_block(auth, tmp);
+    }
+    xor_block(c, auth);                 // XOR for tag computation
+    // ----------------- Process the associated data -----------------
+    
+    return 0;
+}
+
+/******************************************************************************
+* Decryption and tag verification using SKINNY-AEAD-M1
+*
+* m    : output; receives clen - TAGBYTES plaintext bytes
+* mlen : set to clen - TAGBYTES
+* nsec : unused
+* c    : ciphertext followed by a TAGBYTES-byte tag, clen bytes in total
+* ad   : associated data, adlen bytes
+* npub : passed as tk2 to tkschedule_lfsr
+* k    : passed as tk3 to tkschedule_lfsr
+*
+* Returns 0 when the recomputed tag equals the received one, and -1 when it
+* does not or when clen is smaller than TAGBYTES.
+*
+* The plaintext is written to m before the tag is checked; callers must
+* discard it whenever the return value is not 0.
+*
+* The first 16 bytes of 'tmp' are the TK1 block: bytes 0..7 receive the LFSR
+* value (LE_STR_64) and byte 15 the domain (SET_DOMAIN). The second 16 bytes
+* are scratch space for the encrypted associated-data blocks.
+* The UPDATE_LFSR macro assigns to the local 'feedback' by name.
+******************************************************************************/
+int crypto_aead_decrypt (unsigned char *m, unsigned long long *mlen,
+                    unsigned char *nsec,
+                    const unsigned char *c, unsigned long long clen,
+                    const unsigned char *ad, unsigned long long adlen,
+                    const unsigned char *npub,
+                    const unsigned char *k) {
+    u64 i,lfsr = 1;
+    u8 feedback;                        // written by UPDATE_LFSR, later the tag-difference accumulator
+    u32 rtk1[4*16];
+    u32 rtk2_3[4*SKINNY128_384_ROUNDS];
+    u8 tmp[2*BLOCKBYTES], auth[BLOCKBYTES], sum[BLOCKBYTES];
+    (void)nsec;
+
+    if (clen < TAGBYTES)
+        return -1;
+
+    // ----------------- Initialization -----------------
+    clen -= TAGBYTES;
+    *mlen = clen;
+    tkschedule_lfsr(rtk2_3, npub, k, SKINNY128_384_ROUNDS);
+    tkschedule_perm(rtk2_3);
+    memset(tmp, 0x00, 2*BLOCKBYTES);
+    memset(auth, 0x00, BLOCKBYTES);
+    memset(sum, 0x00, BLOCKBYTES);
+    // ----------------- Initialization -----------------
+
+    // ----------------- Process the ciphertext -----------------
+    while (clen >= BLOCKBYTES) {        // while entire blocks to process
+        LE_STR_64(tmp, lfsr);
+        tkschedule_perm_tk1(rtk1, tmp); // precompute RTK1 given the LFSR
+        skinny128_384_inv(m, rtk2_3, c, rtk1);  // argument order here: (ptext, rtk2_3, ctext, rtk1)
+        xor_block(sum, m);              // sum for tag computation
+        clen -= BLOCKBYTES;
+        c += BLOCKBYTES;
+        m += BLOCKBYTES;
+        UPDATE_LFSR(lfsr);              // update LFSR for the next block
+    }
+    SET_DOMAIN(tmp, 0x04);              // domain for tag computation
+    if (clen > 0) {                     // last block is partial
+        LE_STR_64(tmp, lfsr);           // lfsr for last block
+        SET_DOMAIN(tmp, 0x01);          // domain for padding
+        tkschedule_perm_tk1(rtk1, tmp);
+        skinny128_384(auth, rtk2_3, auth, rtk1);    // encrypt 'auth' = 0^16
+        for(i = 0; i < clen; i++) {
+            m[i] = auth[i] ^ c[i];      // decrypted partial block
+            sum[i] ^= m[i];             // sum for tag computation
+        }
+        sum[i] ^= 0x80;                 // padding
+        c += clen;                      // c now points at the received tag
+        SET_DOMAIN(tmp, 0x05);          // domain for tag computation
+        UPDATE_LFSR(lfsr);
+    }
+    LE_STR_64(tmp, lfsr);               // lfsr for tag computation
+    tkschedule_perm_tk1(rtk1, tmp);
+    skinny128_384(sum, rtk2_3, sum, rtk1); // compute the tag
+    // ----------------- Process the ciphertext -----------------
+
+    // ----------------- Process the associated data -----------------
+    lfsr = 1;
+    SET_DOMAIN(tmp, 0x02);
+    memset(auth, 0x00, BLOCKBYTES);
+    while (adlen >= BLOCKBYTES) {
+        LE_STR_64(tmp, lfsr);
+        tkschedule_perm_tk1(rtk1, tmp);
+        skinny128_384(tmp + BLOCKBYTES, rtk2_3, ad, rtk1);  // second half of 'tmp' is scratch
+        xor_block(auth, tmp + BLOCKBYTES);
+        adlen -= BLOCKBYTES;
+        ad += BLOCKBYTES;
+        UPDATE_LFSR(lfsr);
+    }
+    if (adlen > 0) {
+        LE_STR_64(tmp, lfsr);
+        SET_DOMAIN(tmp, 0x03);          // domain for padding ad
+        tkschedule_perm_tk1(rtk1, tmp); // rtk1 is derived before 'tmp' is reused as the data block below
+        memset(tmp, 0x00, BLOCKBYTES);  // padding
+        memcpy(tmp, ad, adlen);         // padding
+        tmp[adlen] ^= 0x80;             // padding
+        skinny128_384(tmp, rtk2_3, tmp, rtk1);
+        xor_block(auth, tmp);
+    }
+    xor_block(sum, auth);               // XOR for tag computation
+    // ----------------- Process the associated data -----------------
+
+    // ----------------- Check the tag -----------------
+    feedback = 0;
+    for(i = 0; i < TAGBYTES; i++)
+        feedback |= sum[i] ^ c[i];      // constant-time tag verification
+    // Map the accumulated difference to 0 (match) or -1 (mismatch) without a
+    // branch, so that every failure path of this function reports -1.
+    return -(int)(((u32)feedback | (0u - (u32)feedback)) >> 31);
+    // ----------------- Check the tag -----------------
+}
\ No newline at end of file
--- a/skinny/Implementations/crypto_aead/skinnyaeadm1+v11/armcortexm_1/skinny128.h
+++ b/skinny/Implementations/crypto_aead/skinnyaeadm1+v11/armcortexm_1/skinny128.h
+#ifndef SKINNY128_H_
+#define SKINNY128_H_
+
+// Short aliases for the unsigned integer types used by the prototypes below.
+typedef unsigned char u8;
+typedef unsigned int u32;
+
+// Round count constant (presumably the number of SKINNY-128-384 rounds the
+// tweakey schedule is computed for -- confirm against skinny128.s).
+#define SKINNY128_384_ROUNDS	40
+
+// All routines below are only declared here; their definitions live outside
+// this header (presumably in the accompanying skinny128.s -- confirm).
+
+// Block cipher call: reads one block from ptext, writes one block to ctext,
+// using the precomputed round tweakeys rtk2_3 and rtk1.
+extern void skinny128_384(u8* ctext, const u32* rtk2_3, const u8* ptext, const u32* rtk1);
+// Inverse direction: reads ctext, writes ptext.
+extern void skinny128_384_inv(u8* ptext, const u32* rtk2_3, const u8* ctext, const u32* rtk1);
+// Tweakey-schedule helpers filling rtk2_3 from tk2/tk3 for `rounds` rounds
+// (exact split of work between the _lfsr and _perm steps: see the assembly).
+extern void tkschedule_lfsr(u32* rtk2_3, const u8* tk2, const u8* tk3, const int rounds);
+extern void tkschedule_perm(u32* rtk2_3);
+// Fills rtk1 from the 16-byte tk1 block.
+extern void tkschedule_perm_tk1(u32* rtk1, const u8* tk1);
+
+
+#endif  // SKINNY128_H_
\ No newline at end of file
--- a/skinny/Implementations/crypto_aead/skinnyaeadm1+v11/armcortexm_1/skinny128.s
+++ b/skinny/Implementations/crypto_aead/skinnyaeadm1+v11/armcortexm_1/skinny128.s
--- a/skinny/Implementations/crypto_aead/skinnyaeadm1+v11/armcortexm_1/skinnyaead.h
+++ b/skinny/Implementations/crypto_aead/skinnyaeadm1+v11/armcortexm_1/skinnyaead.h
+#ifndef SKINNYAEADM1_H_
+#define SKINNYAEADM1_H_
+
+#include "skinny128.h"
+
+typedef unsigned char u8;
+typedef unsigned int u32;
+typedef unsigned long long u64;
+
+// Sizes in bytes of the authentication tag, the key and one cipher block.
+#define TAGBYTES    16
+#define KEYBYTES    16
+#define BLOCKBYTES  16
+
+// Writes the domain-separation value into byte 15 of the buffer `ptr`.
+#define SET_DOMAIN(ptr, domain) ((ptr)[15] = (domain))
+
+// Advances the 64-bit LFSR by one step: shift left by one bit and XOR 0x1B
+// into the result when the bit shifted out (bit 63) was set.
+// NOTE: relies on a variable named `feedback` being declared at the expansion
+// site, and on the GNU statement-expression extension `({ ... })`.
+#define UPDATE_LFSR(lfsr) ({                            \
+    feedback = ((lfsr) & (1ULL << 63)) ? 0x1B : 0x00;   \
+    (lfsr) = ((lfsr) << 1) ^ feedback;                  \
+})
+
+// Stores the 64-bit value x into ptr[0..7] in little-endian byte order.
+// x is evaluated eight times, so it must be free of side effects.
+#define LE_STR_64(ptr, x)  ({       \
+    (ptr)[0] = (u8)(x);             \
+    (ptr)[1] = (u8)((x) >> 8);      \
+    (ptr)[2] = (u8)((x) >> 16);     \
+    (ptr)[3] = (u8)((x) >> 24);     \
+    (ptr)[4] = (u8)((x) >> 32);     \
+    (ptr)[5] = (u8)((x) >> 40);     \
+    (ptr)[6] = (u8)((x) >> 48);     \
+    (ptr)[7] = (u8)((x) >> 56);     \
+})
+//x ^= y with x, y 128-bit blocks
+// The operands are accessed through u32* casts, four words each.
+// NOTE(review): this requires both pointers to be valid for 32-bit accesses
+// (alignment / aliasing) -- confirm for every call site.
+#define XOR_BLOCK(x,y) ({               \
+    ((u32*)(x))[0] ^= ((u32*)(y))[0];   \
+    ((u32*)(x))[1] ^= ((u32*)(y))[1];   \
+    ((u32*)(x))[2] ^= ((u32*)(y))[2];   \
+    ((u32*)(x))[3] ^= ((u32*)(y))[3];   \
+})
+
+#endif  // SKINNYAEADM1_H_
\ No newline at end of file
--- a/skinny/Implementations/crypto_aead/skinnyaeadm1+v11/armcortexm_2/api.h
+++ b/skinny/Implementations/crypto_aead/skinnyaeadm1+v11/armcortexm_2/api.h
+// Parameters of this AEAD instance. The macro names follow the crypto_aead
+// API convention used for the NIST LWC submissions; byte counts are assumed
+// from the *BYTES suffix.
+#define CRYPTO_KEYBYTES 16
+// 0: no secret message number is used.
+#define CRYPTO_NSECBYTES 0
+#define CRYPTO_NPUBBYTES 16
+#define CRYPTO_ABYTES 16
+// Presumably signals that input and output buffers must not overlap -- confirm.
+#define CRYPTO_NOOVERLAP 1
--- a/skinny/Implementations/crypto_aead/skinnyaeadm1+v11/armcortexm_2/crypto_aead.h
+++ b/skinny/Implementations/crypto_aead/skinnyaeadm1+v11/armcortexm_2/crypto_aead.h
+//API required by the NIST for the LWC competition
+// Authenticated encryption entry point.
+// Parameter roles as suggested by the names (implementation not in this
+// header -- confirm): c/clen = ciphertext output and its length, m/mlen =
+// plaintext, ad/adlen = associated data, nsec = secret message number,
+// npub = public nonce, k = key.
+int crypto_aead_encrypt(unsigned char *c, unsigned long long *clen,
+                        const unsigned char *m, unsigned long long mlen,
+                        const unsigned char *ad, unsigned long long adlen,
+                        const unsigned char *nsec, const unsigned char *npub,
+                        const unsigned char *k);
+
+//API required by the NIST for the LWC competition
+// Authenticated decryption entry point.
+// Parameter roles as suggested by the names (confirm against encrypt.c):
+// m/outputmlen = recovered plaintext and its length, c/clen = ciphertext
+// input, ad/adlen = associated data, npub = public nonce, k = key.
+int crypto_aead_decrypt(unsigned char *m, unsigned long long *outputmlen,
+                        unsigned char *nsec,
+                        const unsigned char *c, unsigned long long clen,
+                        const unsigned char *ad, unsigned long long adlen,
+                        const unsigned char *npub, const unsigned char *k);
--- a/skinny/Implementations/crypto_aead/skinnyaeadm1+v11/armcortexm_2/encrypt.c
+++ b/skinny/Implementations/crypto_aead/skinnyaeadm1+v11/armcortexm_2/encrypt.c
--- a/skinny/Implementations/crypto_aead/skinnyaeadm1+v11/armcortexm_2/skinny128.h
+++ b/skinny/Implementations/crypto_aead/skinnyaeadm1+v11/armcortexm_2/skinny128.h
+#ifndef SKINNY128_H_
+#define SKINNY128_H_
+
+// Short aliases for the unsigned integer types used by the prototypes below.
+typedef unsigned char u8;
+typedef unsigned int u32;
+
+// Round count constant (presumably the number of SKINNY-128-384 rounds the
+// tweakey schedule is computed for -- confirm against skinny128.s).
+#define SKINNY128_384_ROUNDS	40
+
+// All routines below are only declared here (presumably implemented in
+// skinny128.s -- confirm). Each data or tweakey argument comes with a second
+// "_bis" buffer; presumably two blocks are handled per call -- confirm.
+extern void skinny128_384(u8* ctext, u8* ctext_bis, const u8* ptext, const u8* ptext_bis, const u32* rtk1, const u32* rtk2_3);
+extern void skinny128_384_inv(u8* ptext, u8* ptext_bis, const u8* ctext, const u8* ctext_bis, const u32* rtk1, const u32* rtk2_3);
+extern void tkschedule_lfsr_2(u32* rtk, const u8* tk2, const u8* tk2_bis, const int rounds);
+// NOTE(review): the parameters are named tk2/tk2_bis although the function
+// name refers to TK1 -- looks like a copy-paste of the line above; confirm.
+extern void pack_tk1(u32* rtk, const u8* tk2, const u8* tk2_bis, const int rounds);
+extern void tkschedule_lfsr_3(u32* rtk, const u8* tk3, const u8* tk3_bis, const int rounds);
+extern void tkschedule_perm(u32* rtk);
+extern void tkschedule_perm_tk1(u32* rtk1, const u8* tk1, const u8* tk1_bis);
+
+#endif  // SKINNY128_H_
\ No newline at end of file
--- a/skinny/Implementations/crypto_aead/skinnyaeadm1+v11/armcortexm_2/skinny128.s
+++ b/skinny/Implementations/crypto_aead/skinnyaeadm1+v11/armcortexm_2/skinny128.s
--- a/skinny/Implementations/crypto_aead/skinnyaeadm1+v11/armcortexm_2/skinnyaead.h
+++ b/skinny/Implementations/crypto_aead/skinnyaeadm1+v11/armcortexm_2/skinnyaead.h
+#ifndef SKINNYAEADM1_H_
+#define SKINNYAEADM1_H_
+
+#include "skinny128.h"
+
+typedef unsigned char u8;
+typedef unsigned int u32;
+typedef unsigned long long u64;
+
+// Sizes in bytes of the authentication tag, the key and one cipher block.
+#define TAGBYTES    16
+#define KEYBYTES    16
+#define BLOCKBYTES  16
+
+// Writes the domain-separation value into byte 15 of the buffer `ptr`.
+#define SET_DOMAIN(ptr, domain) ((ptr)[15] = (domain))
+
+// Advances the 64-bit LFSR by one step: shift left by one bit and XOR 0x1B
+// into the result when the bit shifted out (bit 63) was set.
+// NOTE: relies on a variable named `feedback` being declared at the expansion
+// site, and on the GNU statement-expression extension `({ ... })`.
+#define UPDATE_LFSR(lfsr) ({                            \
+    feedback = ((lfsr) & (1ULL << 63)) ? 0x1B : 0x00;   \
+    (lfsr) = ((lfsr) << 1) ^ feedback;                  \
+})
+
+// Stores the 64-bit value x into ptr[0..7] in little-endian byte order.
+// x is evaluated eight times, so it must be free of side effects.
+#define LE_STR_64(ptr, x)  ({       \
+    (ptr)[0] = (u8)(x);             \
+    (ptr)[1] = (u8)((x) >> 8);      \
+    (ptr)[2] = (u8)((x) >> 16);     \
+    (ptr)[3] = (u8)((x) >> 24);     \
+    (ptr)[4] = (u8)((x) >> 32);     \
+    (ptr)[5] = (u8)((x) >> 40);     \
+    (ptr)[6] = (u8)((x) >> 48);     \
+    (ptr)[7] = (u8)((x) >> 56);     \
+})
+
+//x ^= y with x, y 128-bit blocks
+// The operands are accessed through u32* casts, four words each.
+// NOTE(review): this requires both pointers to be valid for 32-bit accesses
+// (alignment / aliasing) -- confirm for every call site.
+#define XOR_BLOCK(x,y) ({               \
+    ((u32*)(x))[0] ^= ((u32*)(y))[0];   \
+    ((u32*)(x))[1] ^= ((u32*)(y))[1];   \
+    ((u32*)(x))[2] ^= ((u32*)(y))[2];   \
+    ((u32*)(x))[3] ^= ((u32*)(y))[3];   \
+})
+
+#endif  // SKINNYAEADM1_H_
\ No newline at end of file
--- a/skinny/Implementations/crypto_aead/skinnyaeadm1+v11/opt32_1/api.h
+++ b/skinny/Implementations/crypto_aead/skinnyaeadm1+v11/opt32_1/api.h
+// Parameters of this AEAD instance. The macro names follow the crypto_aead
+// API convention used for the NIST LWC submissions; byte counts are assumed
+// from the *BYTES suffix.
+#define CRYPTO_KEYBYTES 16
+// 0: no secret message number is used (encrypt.c ignores nsec).
+#define CRYPTO_NSECBYTES 0
+#define CRYPTO_NPUBBYTES 16
+#define CRYPTO_ABYTES 16
+// Presumably signals that input and output buffers must not overlap -- confirm.
+#define CRYPTO_NOOVERLAP 1
--- a/skinny/Implementations/crypto_aead/skinnyaeadm1+v11/opt32_1/crypto_aead.h
+++ b/skinny/Implementations/crypto_aead/skinnyaeadm1+v11/opt32_1/crypto_aead.h
+
+// Authenticated encryption.
+// Writes the ciphertext followed by a 16-byte tag to c and stores
+// mlen + 16 in *clen. nsec is ignored by the implementation in encrypt.c.
+// Returns 0.
+int crypto_aead_encrypt(
+	unsigned char *c, unsigned long long *clen,
+	const unsigned char *m, unsigned long long mlen,
+	const unsigned char *ad, unsigned long long adlen,
+	const unsigned char *nsec,
+	const unsigned char *npub,
+	const unsigned char *k
+);
+
+// Authenticated decryption.
+// c holds the ciphertext followed by a 16-byte tag (clen bytes in total).
+// Writes clen - 16 plaintext bytes to m and stores that length in *mlen.
+// nsec is ignored by the implementation in encrypt.c.
+// Returns 0 when the tag verifies and a nonzero value otherwise (including
+// when clen is shorter than a tag).
+int crypto_aead_decrypt(
+	unsigned char *m, unsigned long long *mlen,
+	unsigned char *nsec,
+	const unsigned char *c, unsigned long long clen,
+	const unsigned char *ad, unsigned long long adlen,
+	const unsigned char *npub,
+	const unsigned char *k
+);
\ No newline at end of file
--- a/skinny/Implementations/crypto_aead/skinnyaeadm1+v11/opt32_1/encrypt.c
+++ b/skinny/Implementations/crypto_aead/skinnyaeadm1+v11/opt32_1/encrypt.c
+/******************************************************************************
+* Constant-time implementation of SKINNY-AEAD-M1 (v1.1).
+*
+* Two blocks are treated in parallel with SKINNY-128-384 whenever possible.
+*
+* For more details, see the paper at: https://
+*
+* @author   Alexandre Adomnicai, Nanyang Technological University,
+*           alexandre.adomnicai@ntu.edu.sg
+*
+* @date     May 2020
+******************************************************************************/
+#include "skinny128.h"
+#include "skinnyaead.h"
+#include <string.h>
+#include <stdio.h>
+
+/******************************************************************************
+* In-place XOR of two 128-bit blocks: every one of the BLOCKBYTES bytes of x
+* is replaced by its XOR with the matching byte of y. y is only read.
+******************************************************************************/
+static void xor_block(u8 * x, const u8* y) {
+    const u8* const stop = x + BLOCKBYTES;
+    while (x != stop)
+        *x++ ^= *y++;
+}
+
+/******************************************************************************
+* Encryption and authentication using SKINNY-AEAD-M1
+*
+* c      : output, receives mlen ciphertext bytes followed by the tag
+* clen   : receives mlen + TAGBYTES
+* m      : plaintext, mlen bytes
+* ad     : associated data, adlen bytes
+* nsec   : unused
+* npub,k : nonce and key, both handed to precompute_rtk2_3
+*
+* Always returns 0.
+******************************************************************************/
+int crypto_aead_encrypt (unsigned char *c, unsigned long long *clen,
+                    const unsigned char *m, unsigned long long mlen,
+                    const unsigned char *ad, unsigned long long adlen,
+                    const unsigned char *nsec,
+                    const unsigned char *npub,
+                    const unsigned char *k) {
+    u64 i,lfsr = 1;
+    u8 feedback;                        // written by the UPDATE_LFSR macro
+    u32 rtk1[4*16];
+    u32 rtk2_3[4*SKINNY128_384_ROUNDS];
+    // tmp[0..15] is the TK1 block (LFSR in bytes 0..7, domain in byte 15);
+    // tmp[16..31] is scratch for encrypted associated-data blocks.
+    u8 tmp[2*BLOCKBYTES], auth[BLOCKBYTES];
+    (void)nsec;
+
+    // ----------------- Initialization -----------------
+    *clen = mlen + TAGBYTES;
+    precompute_rtk2_3(rtk2_3, npub, k);
+    memset(tmp, 0x00, 2*BLOCKBYTES);
+    memset(auth, 0x00, BLOCKBYTES);
+    // The tag slot at the end of c doubles as the running plaintext checksum.
+    memset(c + mlen, 0x00, BLOCKBYTES);
+    // ----------------- Initialization -----------------
+
+    // ----------------- Process the plaintext -----------------
+    while (mlen >= BLOCKBYTES) {        // while entire blocks to process
+        LE_STR_64(tmp, lfsr);
+        precompute_rtk1(rtk1, tmp);     // precompute RTK1 given the LFSR
+        skinny128_384_plus_encrypt(c, m, rtk1, rtk2_3);
+        // c advances while mlen shrinks by the same amount below, so c + mlen
+        // keeps pointing at the tag slot throughout the loop.
+        xor_block(c + mlen, m);         // sum for tag computation
+        mlen -= BLOCKBYTES;
+        c += BLOCKBYTES;
+        m += BLOCKBYTES;
+        UPDATE_LFSR(lfsr);              // update lfsr for next block
+    }
+    SET_DOMAIN(tmp, 0x04);              // domain for tag computation
+    if (mlen > 0) {                     // last block is partial
+        LE_STR_64(tmp, lfsr);           // lfsr for last block
+        SET_DOMAIN(tmp, 0x01);          // domain for padding
+        for(i = 0; i < mlen; i++)
+            c[mlen + i] ^= m[i];        // sum for tag computation
+        c[mlen + i] ^= 0x80;            // padding
+        precompute_rtk1(rtk1, tmp);
+        // auth is still all-zero here: this encrypts the zero block and the
+        // result is XORed onto the remaining plaintext bytes.
+        skinny128_384_plus_encrypt(auth, auth, rtk1, rtk2_3);
+        for(i = 0; i < mlen; i++)
+            c[i] = auth[i] ^ m[i];      // encrypted padded block
+        c += mlen;                      // c now points at the tag slot
+        SET_DOMAIN(tmp, 0x05);          // domain for tag computation
+        UPDATE_LFSR(lfsr);
+    }
+    LE_STR_64(tmp, lfsr);               // lfsr for tag computation
+    precompute_rtk1(rtk1, tmp);
+    skinny128_384_plus_encrypt(c, c, rtk1, rtk2_3); // compute the tag
+    // ----------------- Process the plaintext -----------------
+
+    // ----------------- Process the associated data -----------------
+    lfsr = 1;
+    SET_DOMAIN(tmp, 0x02);
+    memset(auth, 0x00, BLOCKBYTES);
+    while (adlen >= BLOCKBYTES) {
+        LE_STR_64(tmp, lfsr);
+        precompute_rtk1(rtk1, tmp);
+        skinny128_384_plus_encrypt(tmp + BLOCKBYTES, ad, rtk1, rtk2_3);
+        xor_block(auth, tmp + BLOCKBYTES);
+        adlen -= BLOCKBYTES;
+        ad += BLOCKBYTES;
+        UPDATE_LFSR(lfsr);
+    }
+    if (adlen > 0) {
+        LE_STR_64(tmp, lfsr);
+        SET_DOMAIN(tmp, 0x03);          // domain for padding ad
+        precompute_rtk1(rtk1, tmp);
+        // rtk1 has been derived, so tmp can be reused for the padded AD block.
+        memset(tmp, 0x00, BLOCKBYTES);  // padding
+        memcpy(tmp, ad, adlen);         // padding
+        tmp[adlen] ^= 0x80;             // padding
+        skinny128_384_plus_encrypt(tmp, tmp, rtk1, rtk2_3);
+        xor_block(auth, tmp);
+    }
+    xor_block(c, auth);                 // XOR for tag computation
+    // ----------------- Process the associated data -----------------
+    return 0;
+}
+
+/******************************************************************************
+* Decryption and tag verification using SKINNY-AEAD-M1
+*
+* m      : output, receives clen - TAGBYTES plaintext bytes
+* mlen   : receives clen - TAGBYTES (left untouched when clen < TAGBYTES)
+* nsec   : unused
+* c      : ciphertext followed by the TAGBYTES-byte tag, clen bytes in total
+* ad     : associated data, adlen bytes
+* npub,k : nonce and key, both handed to precompute_rtk2_3
+*
+* Returns 0 when the recomputed tag equals the received one and -1 otherwise
+* (also when clen is too short to hold a tag). The plaintext is written to m
+* before the tag is checked, so callers must discard it on a -1 result.
+******************************************************************************/
+int crypto_aead_decrypt (unsigned char *m, unsigned long long *mlen,
+                    unsigned char *nsec,
+                    const unsigned char *c, unsigned long long clen,
+                    const unsigned char *ad, unsigned long long adlen,
+                    const unsigned char *npub,
+                    const unsigned char *k) {
+    u64 i,lfsr = 1;
+    u8 feedback;                        // LFSR feedback, then tag difference
+    u32 rtk1[4*16];
+    u32 rtk2_3[4*SKINNY128_384_ROUNDS];
+    // tmp[0..15] is the TK1 block (LFSR in bytes 0..7, domain in byte 15);
+    // tmp[16..31] is scratch for encrypted associated-data blocks.
+    u8 tmp[2*BLOCKBYTES], auth[BLOCKBYTES], sum[BLOCKBYTES];
+    (void)nsec;
+
+    if (clen < TAGBYTES)
+        return -1;
+
+    // ----------------- Initialization -----------------
+    clen -= TAGBYTES;
+    *mlen = clen;
+    precompute_rtk2_3(rtk2_3, npub, k);
+    memset(tmp, 0x00, 2*BLOCKBYTES);
+    memset(auth, 0x00, BLOCKBYTES);
+    memset(sum, 0x00, BLOCKBYTES);
+    // ----------------- Initialization -----------------
+
+    // ----------------- Process the ciphertext -----------------
+    while (clen >= BLOCKBYTES) {        // while entire blocks to process
+        LE_STR_64(tmp, lfsr);
+        precompute_rtk1(rtk1, tmp);     // precompute RTK1 given the LFSR
+        skinny128_384_plus_decrypt(m, c, rtk1, rtk2_3);
+        xor_block(sum, m);              // sum for tag computation
+        clen -= BLOCKBYTES;
+        c += BLOCKBYTES;
+        m += BLOCKBYTES;
+        UPDATE_LFSR(lfsr);              // update LFSR for the next block
+    }
+    SET_DOMAIN(tmp, 0x04);              // domain for tag computation
+    if (clen > 0) {                     // last block is partial
+        LE_STR_64(tmp, lfsr);           // lfsr for last block
+        SET_DOMAIN(tmp, 0x01);          // domain for padding
+        precompute_rtk1(rtk1, tmp);
+        // auth is still all-zero here: this encrypts the zero block and the
+        // result is XORed onto the remaining ciphertext bytes.
+        skinny128_384_plus_encrypt(auth, auth, rtk1, rtk2_3);
+        for(i = 0; i < clen; i++) {
+            m[i] = auth[i] ^ c[i];      // decrypted partial block
+            sum[i] ^= m[i];             // sum for tag computation
+        }
+        sum[i] ^= 0x80;                 // padding
+        c += clen;                      // c now points at the received tag
+        SET_DOMAIN(tmp, 0x05);          // domain for tag computation
+        UPDATE_LFSR(lfsr);
+    }
+    LE_STR_64(tmp, lfsr);               // lfsr for tag computation
+    precompute_rtk1(rtk1, tmp);
+    skinny128_384_plus_encrypt(sum, sum, rtk1, rtk2_3); // compute the tag
+    // ----------------- Process the ciphertext -----------------
+
+    // ----------------- Process the associated data -----------------
+    lfsr = 1;
+    SET_DOMAIN(tmp, 0x02);
+    memset(auth, 0x00, BLOCKBYTES);
+    while (adlen >= BLOCKBYTES) {
+        LE_STR_64(tmp, lfsr);
+        precompute_rtk1(rtk1, tmp);
+        skinny128_384_plus_encrypt(tmp + BLOCKBYTES, ad, rtk1, rtk2_3);
+        xor_block(auth, tmp + BLOCKBYTES);
+        adlen -= BLOCKBYTES;
+        ad += BLOCKBYTES;
+        UPDATE_LFSR(lfsr);
+    }
+    if (adlen > 0) {
+        LE_STR_64(tmp, lfsr);
+        SET_DOMAIN(tmp, 0x03);          // domain for padding ad
+        precompute_rtk1(rtk1, tmp);
+        // rtk1 has been derived, so tmp can be reused for the padded AD block.
+        memset(tmp, 0x00, BLOCKBYTES);  // padding
+        memcpy(tmp, ad, adlen);         // padding
+        tmp[adlen] ^= 0x80;             // padding
+        skinny128_384_plus_encrypt(tmp, tmp, rtk1, rtk2_3);
+        xor_block(auth, tmp);
+    }
+    xor_block(sum, auth);               // XOR for tag computation
+    // ----------------- Process the associated data -----------------
+
+    // ----------------- Verify the tag -----------------
+    feedback = 0;
+    for(i = 0; i < TAGBYTES; i++)
+        feedback |= sum[i] ^ c[i];      // constant-time tag verification
+    // Branch-free mapping of the accumulated difference to the API result:
+    // feedback == 0 gives 0, any other value (1..255) gives -1. For 0 the
+    // subtraction wraps and bit 8 is set; for 1..255 bit 8 stays clear.
+    return (int)((((u32)feedback - 1) >> 8) & 1) - 1;
+    // ----------------- Verify the tag -----------------
+}
\ No newline at end of file
--- a/skinny/Implementations/crypto_aead/skinnyaeadm1+v11/opt32_1/skinny128.c
+++ b/skinny/Implementations/crypto_aead/skinnyaeadm1+v11/opt32_1/skinny128.c
+/******************************************************************************
+* Constant-time implementation of the SKINNY tweakable block ciphers.
+*
+* This implementation doesn't compute the ShiftRows operation. Some masks and
+* shifts are applied during the MixColumns operation so that the proper bits
+* are XORed together. Moreover, the row permutation within the MixColumns 
+* is omitted, as well as the bit permutation at the end of the Sbox. The rows
+* are synchronized with the classical representation after only 4 rounds. Therefore, this
+* implementation relies on a "QUADRUPLE_ROUND" routine.
+*
+* The Sbox computation takes advantage of some symmetry in the 8-bit Sbox to
+* turn it into a 4-bit S-box computation. Although the last bit permutation
+* within the Sbox is not computed, the bit ordering is synchronized with the 
+* classical representation after 2 calls.
+*
+* @author	Alexandre Adomnicai, Nanyang Technological University,
+*			alexandre.adomnicai@ntu.edu.sg
+*
+* @date		May 2020
+******************************************************************************/
+#include <stdio.h>
+#include <string.h>
+#include "skinny128.h"
+#include "tk_schedule.h"
+
+/******************************************************************************
+* MixColumns variant used for rounds i with (i % 4) == 0.
+* Each of the four state words is updated independently by three
+* rotate / mask / rotate / XOR steps.
+******************************************************************************/
+void mixcolumns_0(u32* state) {
+	for(int row = 0; row < 4; row++) {
+		u32 w = state[row];
+		u32 sel = ROR(w,24) & 0x0c0c0c0c;
+		w ^= ROR(sel,30);
+		sel = ROR(w,16) & 0xc0c0c0c0;
+		w ^= ROR(sel,4);
+		sel = ROR(w,8) & 0x0c0c0c0c;
+		w ^= ROR(sel,2);
+		state[row] = w;
+	}
+}
+
+/******************************************************************************
+* MixColumns variant used for rounds i with (i % 4) == 1.
+* Each of the four state words is updated independently by three
+* mask / rotate / XOR steps.
+******************************************************************************/
+void mixcolumns_1(u32* state) {
+	for(int row = 0; row < 4; row++) {
+		u32 w = state[row];
+		u32 sel = ROR(w,16) & 0x30303030;
+		w ^= ROR(sel,30);
+		sel = w & 0x03030303;
+		w ^= ROR(sel,28);
+		sel = ROR(w,16) & 0x30303030;
+		w ^= ROR(sel,2);
+		state[row] = w;
+	}
+}
+
+/******************************************************************************
+* MixColumns variant used for rounds i with (i % 4) == 2.
+* Each of the four state words is updated independently by three
+* rotate / mask / rotate / XOR steps.
+******************************************************************************/
+void mixcolumns_2(u32* state) {
+	for(int row = 0; row < 4; row++) {
+		u32 w = state[row];
+		u32 sel = ROR(w,8) & 0xc0c0c0c0;
+		w ^= ROR(sel,6);
+		sel = ROR(w,16) & 0x0c0c0c0c;
+		w ^= ROR(sel,28);
+		sel = ROR(w,24) & 0xc0c0c0c0;
+		w ^= ROR(sel,2);
+		state[row] = w;
+	}
+}
+
+/******************************************************************************
+* MixColumns variant used for rounds i with (i % 4) == 3.
+* Each of the four state words is updated independently by three
+* mask / rotate / XOR steps.
+******************************************************************************/
+void mixcolumns_3(u32* state) {
+	for(int row = 0; row < 4; row++) {
+		u32 w = state[row];
+		u32 sel = w & 0x03030303;
+		w ^= ROR(sel,30);
+		sel = w & 0x30303030;
+		w ^= ROR(sel,4);
+		sel = w & 0x03030303;
+		w ^= ROR(sel,26);
+		state[row] = w;
+	}
+}
+
+/******************************************************************************
+* Inverse of mixcolumns_0, i.e. for rounds i with (i % 4) == 0.
+* Applies the three steps of mixcolumns_0 in reverse order to each word.
+******************************************************************************/
+void inv_mixcolumns_0(u32* state) {
+	for(int row = 0; row < 4; row++) {
+		u32 w = state[row];
+		u32 sel = ROR(w,8) & 0x0c0c0c0c;
+		w ^= ROR(sel,2);
+		sel = ROR(w,16) & 0xc0c0c0c0;
+		w ^= ROR(sel,4);
+		sel = ROR(w,24) & 0x0c0c0c0c;
+		w ^= ROR(sel,30);
+		state[row] = w;
+	}
+}
+
+/******************************************************************************
+* Inverse of mixcolumns_1, i.e. for rounds i with (i % 4) == 1.
+* Applies the three steps of mixcolumns_1 in reverse order to each word.
+******************************************************************************/
+void inv_mixcolumns_1(u32* state) {
+	for(int row = 0; row < 4; row++) {
+		u32 w = state[row];
+		u32 sel = ROR(w,16) & 0x30303030;
+		w ^= ROR(sel,2);
+		sel = w & 0x03030303;
+		w ^= ROR(sel,28);
+		sel = ROR(w,16) & 0x30303030;
+		w ^= ROR(sel,30);
+		state[row] = w;
+	}
+}
+
+/******************************************************************************
+* Inverse of mixcolumns_2, i.e. for rounds i with (i % 4) == 2.
+* Applies the three steps of mixcolumns_2 in reverse order to each word.
+******************************************************************************/
+void inv_mixcolumns_2(u32* state) {
+	for(int row = 0; row < 4; row++) {
+		u32 w = state[row];
+		u32 sel = ROR(w,24) & 0xc0c0c0c0;
+		w ^= ROR(sel,2);
+		sel = ROR(w,16) & 0x0c0c0c0c;
+		w ^= ROR(sel,28);
+		sel = ROR(w,8) & 0xc0c0c0c0;
+		w ^= ROR(sel,6);
+		state[row] = w;
+	}
+}
+
+/******************************************************************************
+* Inverse of mixcolumns_3, i.e. for rounds i with (i % 4) == 3.
+* Applies the three steps of mixcolumns_3 in reverse order to each word.
+******************************************************************************/
+void inv_mixcolumns_3(u32* state) {
+	for(int row = 0; row < 4; row++) {
+		u32 w = state[row];
+		u32 sel = w & 0x03030303;
+		w ^= ROR(sel,26);
+		sel = w & 0x30303030;
+		w ^= ROR(sel,4);
+		sel = w & 0x03030303;
+		w ^= ROR(sel,30);
+		state[row] = w;
+	}
+}
+
+/******************************************************************************
+* Single-block encryption (no mode of operation): ptext is packed into the
+* bitsliced state, ten QUADRUPLE_ROUNDs are applied, and the result is
+* unpacked into ctext.
+* The round tweakeys come in two separate arrays so that rtk2_3, which stays
+* the same for the whole message, is not recomputed per block.
+******************************************************************************/
+void skinny128_384_plus_encrypt(u8* ctext, const u8* ptext, const u32* rtk1, 
+				const u32* rtk2_3) {
+	u32 state[4]; 				// 128-bit state
+	u32 tmp; 					// referenced by name inside SWAPMOVE
+	packing(state, ptext); 		// from byte to bitsliced representation
+	// rtk2_3 advances by 16 words per quadruple round; rtk1 cycles through
+	// its four 16-word groups (offsets 0, 16, 32, 48, 0, ...).
+	for(int qr = 0; qr < 10; qr++)
+		QUADRUPLE_ROUND(state, rtk1 + 16*(qr & 3), rtk2_3 + 16*qr);
+	unpacking(ctext, state);	// from bitsliced to byte representation
+}
+
+/******************************************************************************
+* Single-block decryption (no mode of operation): the input block is packed
+* into the bitsliced state, ten INV_QUADRUPLE_ROUNDs are applied in reverse
+* round-tweakey order, and the result is unpacked into the output block.
+* Beware of the parameter names: `ptext` is the block that is read and
+* `ctext` is the block that is written.
+* The round tweakeys come in two separate arrays so that rtk2_3, which stays
+* the same for the whole message, is not recomputed per block.
+******************************************************************************/
+void skinny128_384_plus_decrypt(u8* ctext, const u8* ptext, const u32* rtk1, 
+				const u32* rtk2_3) {
+	u32 state[4]; 				// 128-bit state
+	u32 tmp; 					// referenced by name inside SWAPMOVE
+	packing(state, ptext); 		// from byte to bitsliced representation
+	// Walk the quadruple rounds backwards: rtk2_3 offsets 144 down to 0,
+	// rtk1 offsets 16, 0, 48, 32, 16, 0, ... (the matching 16-word group).
+	for(int qr = 9; qr >= 0; qr--)
+		INV_QUADRUPLE_ROUND(state, rtk1 + 16*(qr & 3), rtk2_3 + 16*qr);
+	unpacking(ctext, state);	// from bitsliced to byte representation
+}
\ No newline at end of file
--- a/skinny/Implementations/crypto_aead/skinnyaeadm1+v11/opt32_1/skinny128.h
+++ b/skinny/Implementations/crypto_aead/skinnyaeadm1+v11/opt32_1/skinny128.h
+#ifndef SKINNY128_H_
+#define SKINNY128_H_
+
+#include "tk_schedule.h"
+
+// Single-block encryption: reads ptext, writes ctext.
+void skinny128_384_plus_encrypt(u8* ctext, const u8* ptext, const u32* rtk1, const u32* rtk2_3);
+// Single-block decryption. Despite the names, the block is read from `ptext`
+// and the result is written to `ctext` (see the definition in skinny128.c).
+void skinny128_384_plus_decrypt(u8* ctext, const u8* ptext, const u32* rtk1, const u32* rtk2_3);
+
+// Number of rounds; ten (INV_)QUADRUPLE_ROUND expansions cover them.
+#define SKINNY128_384_ROUNDS	40
+
+// Four consecutive cipher rounds on the four-word bitsliced `state`.
+// Each round consists of: a block of OR/NOT/XOR and SWAPMOVE operations on the
+// state words, the XOR of four rtk1 words and four rtk2_3 words into the
+// state, and a call to mixcolumns_0 .. mixcolumns_3 (one per round, in order).
+// The expansion reads rtk1[0..15] and rtk2_3[0..15].
+// Requirements at the expansion site:
+//  - a u32 variable named `tmp` must be in scope (used by SWAPMOVE);
+//  - `state` is pasted without parentheses, so pass a plain array/pointer name;
+//  - the GNU statement-expression extension `({ ... })` must be available.
+#define QUADRUPLE_ROUND(state, rtk1, rtk2_3) ({			\
+	state[3] ^= ~(state[0] | state[1]);					\
+	SWAPMOVE(state[2], state[1], 0x55555555, 1);		\
+	SWAPMOVE(state[3], state[2], 0x55555555, 1);		\
+	state[1] ^= ~(state[2] | state[3]);					\
+	SWAPMOVE(state[1], state[0], 0x55555555, 1);		\
+	SWAPMOVE(state[0], state[3], 0x55555555, 1);		\
+	state[3] ^= ~(state[0] | state[1]);					\
+	SWAPMOVE(state[2], state[1], 0x55555555, 1);		\
+	SWAPMOVE(state[3], state[2], 0x55555555, 1);		\
+	state[1] ^= (state[2] | state[3]);					\
+	SWAPMOVE(state[3], state[0], 0x55555555, 0);		\
+	state[0] ^= (rtk1)[0];								\
+	state[1] ^= (rtk1)[1];								\
+	state[2] ^= (rtk1)[2];								\
+	state[3] ^= (rtk1)[3];								\
+	state[0] ^= (rtk2_3)[0];							\
+	state[1] ^= (rtk2_3)[1];							\
+	state[2] ^= (rtk2_3)[2];							\
+	state[3] ^= (rtk2_3)[3];							\
+	mixcolumns_0(state);								\
+	state[1] ^= ~(state[2] | state[3]); 				\
+	SWAPMOVE(state[1], state[0], 0x55555555, 1);		\
+	SWAPMOVE(state[0], state[3], 0x55555555, 1);		\
+	state[3] ^= ~(state[0] | state[1]);					\
+	SWAPMOVE(state[2], state[1], 0x55555555, 1);		\
+	SWAPMOVE(state[3], state[2], 0x55555555, 1);		\
+	state[1] ^= ~(state[2] | state[3]);					\
+	SWAPMOVE(state[1], state[0], 0x55555555, 1);		\
+	SWAPMOVE(state[0], state[3], 0x55555555, 1);		\
+	state[3] ^= (state[0] | state[1]);					\
+	SWAPMOVE(state[1], state[2], 0x55555555, 0);		\
+	state[0] ^= (rtk1)[4];								\
+	state[1] ^= (rtk1)[5];								\
+	state[2] ^= (rtk1)[6];								\
+	state[3] ^= (rtk1)[7];								\
+	state[0] ^= (rtk2_3)[4];							\
+	state[1] ^= (rtk2_3)[5];							\
+	state[2] ^= (rtk2_3)[6];							\
+	state[3] ^= (rtk2_3)[7];							\
+	mixcolumns_1(state);								\
+	state[3] ^= ~(state[0] | state[1]);					\
+	SWAPMOVE(state[2], state[1], 0x55555555, 1);		\
+	SWAPMOVE(state[3], state[2], 0x55555555, 1);		\
+	state[1] ^= ~(state[2] | state[3]);					\
+	SWAPMOVE(state[1], state[0], 0x55555555, 1);		\
+	SWAPMOVE(state[0], state[3], 0x55555555, 1);		\
+	state[3] ^= ~(state[0] | state[1]);					\
+	SWAPMOVE(state[2], state[1], 0x55555555, 1);		\
+	SWAPMOVE(state[3], state[2], 0x55555555, 1);		\
+	state[1] ^= (state[2] | state[3]);					\
+	SWAPMOVE(state[3], state[0], 0x55555555, 0);		\
+	state[0] ^= (rtk1)[8];								\
+	state[1] ^= (rtk1)[9];								\
+	state[2] ^= (rtk1)[10];								\
+	state[3] ^= (rtk1)[11];								\
+	state[0] ^= (rtk2_3)[8];							\
+	state[1] ^= (rtk2_3)[9];							\
+	state[2] ^= (rtk2_3)[10];							\
+	state[3] ^= (rtk2_3)[11];							\
+	mixcolumns_2(state);								\
+	state[1] ^= ~(state[2] | state[3]); 				\
+	SWAPMOVE(state[1], state[0], 0x55555555, 1);		\
+	SWAPMOVE(state[0], state[3], 0x55555555, 1);		\
+	state[3] ^= ~(state[0] | state[1]);					\
+	SWAPMOVE(state[2], state[1], 0x55555555, 1);		\
+	SWAPMOVE(state[3], state[2], 0x55555555, 1);		\
+	state[1] ^= ~(state[2] | state[3]);					\
+	SWAPMOVE(state[1], state[0], 0x55555555, 1);		\
+	SWAPMOVE(state[0], state[3], 0x55555555, 1);		\
+	state[3] ^= (state[0] | state[1]);					\
+	SWAPMOVE(state[1], state[2], 0x55555555, 0);		\
+	state[0] ^= (rtk1)[12];								\
+	state[1] ^= (rtk1)[13];								\
+	state[2] ^= (rtk1)[14];								\
+	state[3] ^= (rtk1)[15];								\
+	state[0] ^= (rtk2_3)[12];							\
+	state[1] ^= (rtk2_3)[13];							\
+	state[2] ^= (rtk2_3)[14];							\
+	state[3] ^= (rtk2_3)[15];							\
+	mixcolumns_3(state);								\
+})
+
+// Inverse of QUADRUPLE_ROUND: undoes four consecutive rounds on the four-word
+// bitsliced `state`, last round first.
+// Each inverse round consists of: a call to inv_mixcolumns_3 .. inv_mixcolumns_0
+// (in that order), the XOR of four rtk1 words and four rtk2_3 words into the
+// state, and a block of SWAPMOVE and OR/NOT/XOR operations on the state words.
+// The expansion reads rtk1[0..15] and rtk2_3[0..15].
+// Requirements at the expansion site:
+//  - a u32 variable named `tmp` must be in scope (used by SWAPMOVE);
+//  - `state` is pasted without parentheses, so pass a plain array/pointer name;
+//  - the GNU statement-expression extension `({ ... })` must be available.
+#define INV_QUADRUPLE_ROUND(state, rtk1, rtk2_3) ({		\
+	inv_mixcolumns_3(state);							\
+	state[0] ^= (rtk1)[12];								\
+	state[1] ^= (rtk1)[13];								\
+	state[2] ^= (rtk1)[14];								\
+	state[3] ^= (rtk1)[15];								\
+	state[0] ^= (rtk2_3)[12];							\
+	state[1] ^= (rtk2_3)[13];							\
+	state[2] ^= (rtk2_3)[14];							\
+	state[3] ^= (rtk2_3)[15];							\
+	SWAPMOVE(state[1], state[2], 0x55555555, 0);		\
+	state[3] ^= (state[0] | state[1]);					\
+	SWAPMOVE(state[0], state[3], 0x55555555, 1);		\
+	SWAPMOVE(state[1], state[0], 0x55555555, 1);		\
+	state[1] ^= ~(state[2] | state[3]);					\
+	SWAPMOVE(state[3], state[2], 0x55555555, 1);		\
+	SWAPMOVE(state[2], state[1], 0x55555555, 1);		\
+	state[3] ^= ~(state[0] | state[1]);					\
+	SWAPMOVE(state[0], state[3], 0x55555555, 1);		\
+	SWAPMOVE(state[1], state[0], 0x55555555, 1);		\
+	state[1] ^= ~(state[2] | state[3]); 				\
+	inv_mixcolumns_2(state); 							\
+	state[0] ^= (rtk1)[8];								\
+	state[1] ^= (rtk1)[9];								\
+	state[2] ^= (rtk1)[10];								\
+	state[3] ^= (rtk1)[11];								\
+	state[0] ^= (rtk2_3)[8];							\
+	state[1] ^= (rtk2_3)[9];							\
+	state[2] ^= (rtk2_3)[10];							\
+	state[3] ^= (rtk2_3)[11];							\
+	SWAPMOVE(state[3], state[0], 0x55555555, 0);		\
+	state[1] ^= (state[2] | state[3]);					\
+	SWAPMOVE(state[3], state[2], 0x55555555, 1);		\
+	SWAPMOVE(state[2], state[1], 0x55555555, 1);		\
+	state[3] ^= ~(state[0] | state[1]);					\
+	SWAPMOVE(state[0], state[3], 0x55555555, 1);		\
+	SWAPMOVE(state[1], state[0], 0x55555555, 1);		\
+	state[1] ^= ~(state[2] | state[3]);					\
+	SWAPMOVE(state[3], state[2], 0x55555555, 1);		\
+	SWAPMOVE(state[2], state[1], 0x55555555, 1);		\
+	state[3] ^= ~(state[0] | state[1]);					\
+	inv_mixcolumns_1(state); 							\
+	state[0] ^= (rtk1)[4];								\
+	state[1] ^= (rtk1)[5];								\
+	state[2] ^= (rtk1)[6];								\
+	state[3] ^= (rtk1)[7];								\
+	state[0] ^= (rtk2_3)[4];							\
+	state[1] ^= (rtk2_3)[5];							\
+	state[2] ^= (rtk2_3)[6];							\
+	state[3] ^= (rtk2_3)[7];							\
+	SWAPMOVE(state[1], state[2], 0x55555555, 0);		\
+	state[3] ^= (state[0] | state[1]);					\
+	SWAPMOVE(state[0], state[3], 0x55555555, 1);		\
+	SWAPMOVE(state[1], state[0], 0x55555555, 1);		\
+	state[1] ^= ~(state[2] | state[3]);					\
+	SWAPMOVE(state[3], state[2], 0x55555555, 1);		\
+	SWAPMOVE(state[2], state[1], 0x55555555, 1);		\
+	state[3] ^= ~(state[0] | state[1]);					\
+	SWAPMOVE(state[0], state[3], 0x55555555, 1);		\
+	SWAPMOVE(state[1], state[0], 0x55555555, 1);		\
+	state[1] ^= ~(state[2] | state[3]); 				\
+	inv_mixcolumns_0(state); 							\
+	state[0] ^= (rtk1)[0];								\
+	state[1] ^= (rtk1)[1];								\
+	state[2] ^= (rtk1)[2];								\
+	state[3] ^= (rtk1)[3];								\
+	state[0] ^= (rtk2_3)[0];							\
+	state[1] ^= (rtk2_3)[1];							\
+	state[2] ^= (rtk2_3)[2];							\
+	state[3] ^= (rtk2_3)[3];							\
+	SWAPMOVE(state[3], state[0], 0x55555555, 0);		\
+	state[1] ^= (state[2] | state[3]);					\
+	SWAPMOVE(state[3], state[2], 0x55555555, 1);		\
+	SWAPMOVE(state[2], state[1], 0x55555555, 1);		\
+	state[3] ^= ~(state[0] | state[1]);					\
+	SWAPMOVE(state[0], state[3], 0x55555555, 1);		\
+	SWAPMOVE(state[1], state[0], 0x55555555, 1);		\
+	state[1] ^= ~(state[2] | state[3]);					\
+	SWAPMOVE(state[3], state[2], 0x55555555, 1);		\
+	SWAPMOVE(state[2], state[1], 0x55555555, 1);		\
+	state[3] ^= ~(state[0] | state[1]);					\
+})
+
+#endif  // SKINNY128_H_
\ No newline at end of file
--- a/skinny/Implementations/crypto_aead/skinnyaeadm1+v11/opt32_1/skinnyaead.h
+++ b/skinny/Implementations/crypto_aead/skinnyaeadm1+v11/opt32_1/skinnyaead.h
+#ifndef SKINNYAEADM1_H_
+#define SKINNYAEADM1_H_
+
+#include "skinny128.h"
+
+typedef unsigned char u8;
+typedef unsigned int u32;
+typedef unsigned long long u64;
+
+// Sizes in bytes of the authentication tag, the key and one cipher block.
+#define TAGBYTES    16
+#define KEYBYTES    16
+#define BLOCKBYTES  16
+
+// Writes the domain-separation value into byte 15 of the buffer `ptr`.
+#define SET_DOMAIN(ptr, domain) ((ptr)[15] = (domain))
+
+// Advances the 64-bit LFSR by one step: shift left by one bit and XOR 0x1B
+// into the result when the bit shifted out (bit 63) was set.
+// NOTE: relies on a variable named `feedback` being declared at the expansion
+// site, and on the GNU statement-expression extension `({ ... })`.
+#define UPDATE_LFSR(lfsr) ({                            \
+    feedback = ((lfsr) & (1ULL << 63)) ? 0x1B : 0x00;   \
+    (lfsr) = ((lfsr) << 1) ^ feedback;                  \
+})
+
+// Stores the 64-bit value x into ptr[0..7] in little-endian byte order.
+// x is evaluated eight times, so it must be free of side effects.
+#define LE_STR_64(ptr, x)  ({       \
+    (ptr)[0] = (u8)(x);             \
+    (ptr)[1] = (u8)((x) >> 8);      \
+    (ptr)[2] = (u8)((x) >> 16);     \
+    (ptr)[3] = (u8)((x) >> 24);     \
+    (ptr)[4] = (u8)((x) >> 32);     \
+    (ptr)[5] = (u8)((x) >> 40);     \
+    (ptr)[6] = (u8)((x) >> 48);     \
+    (ptr)[7] = (u8)((x) >> 56);     \
+})
+
+#endif  // SKINNYAEADM1_H_
\ No newline at end of file
--- a/skinny/Implementations/crypto_aead/skinnyaeadm1+v11/opt32_1/tk_schedule.c
+++ b/skinny/Implementations/crypto_aead/skinnyaeadm1+v11/opt32_1/tk_schedule.c
--- a/skinny/Implementations/crypto_aead/skinnyaeadm1+v11/opt32_1/tk_schedule.h
+++ b/skinny/Implementations/crypto_aead/skinnyaeadm1+v11/opt32_1/tk_schedule.h
+#ifndef TK_SCHEDULE_H_
+#define TK_SCHEDULE_H_
+
+typedef unsigned char u8;
+typedef unsigned int u32;
+
+void packing(u32* out, const u8* in);				// defined in tk_schedule.c (not visible here)
+void unpacking(u8* out, u32 *in);					// presumably the inverse of packing() -- confirm
+void precompute_rtk2_3(u32* rtk, const u8* tk2, const u8* tk3);
+void precompute_rtk1(u32* rtk1, const u8* tk1);
+
+#define ROR(x,y) 		(((x) >> (y)) | ((x) << (32 - (y))))	// 32-bit rotate right; y must be in 1..31 (y == 0 would shift by 32)
+
+#define XOR_BLOCKS(x,y) ({ 			/* x[j] ^= y[j] for j = 0..3 */ \
+	(x)[0] ^= (y)[0];				\
+	(x)[1] ^= (y)[1];				\
+	(x)[2] ^= (y)[2];				\
+	(x)[3] ^= (y)[3];				\
+})
+	
+#define SWAPMOVE(a, b, mask, n)	({	/* swaps the bits of b selected by mask with the bits of a selected by (mask << n) */ \
+	tmp = (b ^ (a >> n)) & mask;	/* uses a variable 'tmp' from the caller's scope; arguments are not parenthesized */ \
+	b ^= tmp;						\
+	a ^= (tmp << n);				\
+})
+
+#define LE_LOAD(x, y) 				/* *x = little-endian 32-bit word built from y[0..3]; expansion already ends with ';' */ \
+	*(x) = (((u32)(y)[3] << 24) | 	\
+		((u32)(y)[2] << 16) 	| 	\
+		((u32)(y)[1] << 8) 		| 	\
+		(y)[0]);
+
+#define LE_STORE(x, y)				/* x[0..3] = bytes of y, least significant first. NOTE(review): expands to four bare statements, so it is unsafe as the body of an unbraced if/else/loop */ \
+	(x)[0] = (y) & 0xff; 			\
+	(x)[1] = ((y) >> 8) & 0xff; 	\
+	(x)[2] = ((y) >> 16) & 0xff; 	\
+	(x)[3] = (y) >> 24;
+
+#endif  // TK_SCHEDULE_H_
\ No newline at end of file
--- a/skinny/Implementations/crypto_aead/skinnyaeadm1+v11/opt32_2/api.h
+++ b/skinny/Implementations/crypto_aead/skinnyaeadm1+v11/opt32_2/api.h
+#define CRYPTO_KEYBYTES 16		// key size in bytes
+#define CRYPTO_NSECBYTES 0		// no secret message number
+#define CRYPTO_NPUBBYTES 16		// public nonce size in bytes
+#define CRYPTO_ABYTES 16		// presumably the tag size added to the ciphertext -- confirm
+#define CRYPTO_NOOVERLAP 1		// NOTE(review): presumably tells the benchmarking framework that buffers must not overlap -- confirm
--- a/skinny/Implementations/crypto_aead/skinnyaeadm1+v11/opt32_2/crypto_aead.h
+++ b/skinny/Implementations/crypto_aead/skinnyaeadm1+v11/opt32_2/crypto_aead.h
+
+int crypto_aead_encrypt(
+	unsigned char *c, unsigned long long *clen,			// out: ciphertext buffer and its length
+	const unsigned char *m, unsigned long long mlen,	// in: message
+	const unsigned char *ad, unsigned long long adlen,	// in: associated data
+	const unsigned char *nsec,							// presumably unused (CRYPTO_NSECBYTES is 0 in api.h) -- confirm
+	const unsigned char *npub,							// in: public nonce
+	const unsigned char *k								// in: key
+);
+
+int crypto_aead_decrypt(
+	unsigned char *m, unsigned long long *mlen,			// out: message buffer and its length
+	unsigned char *nsec,								// presumably unused (CRYPTO_NSECBYTES is 0 in api.h) -- confirm
+	const unsigned char *c, unsigned long long clen,	// in: ciphertext
+	const unsigned char *ad, unsigned long long adlen,	// in: associated data
+	const unsigned char *npub,							// in: public nonce
+	const unsigned char *k								// in: key
+);
\ No newline at end of file
--- a/skinny/Implementations/crypto_aead/skinnyaeadm1+v11/opt32_2/encrypt.c
+++ b/skinny/Implementations/crypto_aead/skinnyaeadm1+v11/opt32_2/encrypt.c
--- a/skinny/Implementations/crypto_aead/skinnyaeadm1+v11/opt32_2/skinny128.c
+++ b/skinny/Implementations/crypto_aead/skinnyaeadm1+v11/opt32_2/skinny128.c
+/******************************************************************************
+* Fixsliced implementation of SKINNY-128-384.
+* Two blocks are processed in parallel.
+*
+* This implementation doesn't compute the ShiftRows operation. Some masks and
+* shifts are applied during the MixColumns operation so that the proper bits
+* are XORed together. Moreover, the row permutation within the MixColumns
+* is omitted, as well as the bit permutation at the end of the Sbox. The rows
+* are synchronized with the classical representation after only 4 rounds.
+* However, the Sbox permutation requires 8 rounds for a synchronization. To
+* limit the impact on code size, we compute the permutation every 4 rounds.
+* Therefore, this implementation relies on a "QUADRUPLE_ROUND" routine.
+*
+* For more details, see the paper at: https://
+*
+* @author	Alexandre Adomnicai, Nanyang Technological University,
+*			alexandre.adomnicai@ntu.edu.sg
+*
+* @date		May 2020
+******************************************************************************/
+#include <stdio.h>
+#include <string.h>
+#include "skinny128.h"
+#include "tk_schedule.h"
+
+/****************************************************************************
+* The MixColumns operation for rounds i such that (i % 4) == 0.
+*
+* Updates each of the 8 words of 'state' in place and independently of the
+* others, through three successive rotate / mask / XOR steps.
+****************************************************************************/
+void mixcolumns_0(u32* state) {
+	u32 tmp;
+	for(int i = 0; i < 8; i++) {
+		tmp = ROR(state[i],24) & 0x0c0c0c0c;
+		state[i] ^= ROR(tmp,30);
+		tmp = ROR(state[i],16) & 0xc0c0c0c0;
+		state[i] ^= ROR(tmp,4);
+		tmp = ROR(state[i],8) & 0x0c0c0c0c;
+		state[i] ^= ROR(tmp,2);
+	}
+}
+
+/****************************************************************************
+* The MixColumns operation for rounds i such that (i % 4) == 1.
+*
+* Updates each of the 8 words of 'state' in place and independently of the
+* others, through three successive rotate / mask / XOR steps.
+****************************************************************************/
+void mixcolumns_1(u32* state) {
+	u32 tmp;
+	for(int i = 0; i < 8; i++) {
+		tmp = ROR(state[i],16) & 0x30303030;
+		state[i] ^= ROR(tmp,30);
+		tmp = state[i] & 0x03030303;
+		state[i] ^= ROR(tmp,28);
+		tmp = ROR(state[i],16) & 0x30303030;
+		state[i] ^= ROR(tmp,2);
+	}
+}
+
+/****************************************************************************
+* The MixColumns operation for rounds i such that (i % 4) == 2.
+*
+* Updates each of the 8 words of 'state' in place and independently of the
+* others, through three successive rotate / mask / XOR steps.
+****************************************************************************/
+void mixcolumns_2(u32* state) {
+	u32 tmp;
+	for(int i = 0; i < 8; i++) {
+		tmp = ROR(state[i],8) & 0xc0c0c0c0;
+		state[i] ^= ROR(tmp,6);
+		tmp = ROR(state[i],16) & 0x0c0c0c0c;
+		state[i] ^= ROR(tmp,28);
+		tmp = ROR(state[i],24) & 0xc0c0c0c0;
+		state[i] ^= ROR(tmp,2);
+	}
+}
+
+/****************************************************************************
+* The MixColumns operation for rounds i such that (i % 4) == 3.
+*
+* Updates each of the 8 words of 'state' in place and independently of the
+* others, through three successive mask / rotate / XOR steps.
+****************************************************************************/
+void mixcolumns_3(u32* state) {
+	u32 tmp;
+	for(int i = 0; i < 8; i++) {
+		tmp = state[i] & 0x03030303;
+		state[i] ^= ROR(tmp,30);
+		tmp = state[i] & 0x30303030;
+		state[i] ^= ROR(tmp,4);
+		tmp = state[i] & 0x03030303;
+		state[i] ^= ROR(tmp,26);
+	}
+}
+
+/****************************************************************************
+* The inverse MixColumns operation for rounds i such that (i % 4) == 0.
+*
+* Applies the same three steps as mixcolumns_0, in the reverse order, to
+* each of the 8 words of 'state' (updated in place).
+****************************************************************************/
+void inv_mixcolumns_0(u32* state) {
+	u32 tmp;
+	for(int i = 0; i < 8; i++) {
+		tmp = ROR(state[i],8) & 0x0c0c0c0c;
+		state[i] ^= ROR(tmp,2);
+		tmp = ROR(state[i],16) & 0xc0c0c0c0;
+		state[i] ^= ROR(tmp,4);
+		tmp = ROR(state[i],24) & 0x0c0c0c0c;
+		state[i] ^= ROR(tmp,30);
+	}
+}
+
+/****************************************************************************
+* The inverse MixColumns operation for rounds i such that (i % 4) == 1.
+*
+* Applies the same three steps as mixcolumns_1, in the reverse order, to
+* each of the 8 words of 'state' (updated in place).
+****************************************************************************/
+void inv_mixcolumns_1(u32* state) {
+	u32 tmp;
+	for(int i = 0; i < 8; i++) {
+		tmp = ROR(state[i],16) & 0x30303030;
+		state[i] ^= ROR(tmp,2);
+		tmp = state[i] & 0x03030303;
+		state[i] ^= ROR(tmp,28);
+		tmp = ROR(state[i],16) & 0x30303030;
+		state[i] ^= ROR(tmp,30);
+	}
+}
+
+/****************************************************************************
+* The inverse MixColumns operation for rounds i such that (i % 4) == 2.
+*
+* Applies the same three steps as mixcolumns_2, in the reverse order, to
+* each of the 8 words of 'state' (updated in place).
+****************************************************************************/
+void inv_mixcolumns_2(u32* state) {
+	u32 tmp;
+	for(int i = 0; i < 8; i++) {
+		tmp = ROR(state[i],24) & 0xc0c0c0c0;
+		state[i] ^= ROR(tmp,2);
+		tmp = ROR(state[i],16) & 0x0c0c0c0c;
+		state[i] ^= ROR(tmp,28);
+		tmp = ROR(state[i],8) & 0xc0c0c0c0;
+		state[i] ^= ROR(tmp,6);
+	}
+}
+
+/****************************************************************************
+* The inverse MixColumns operation for rounds i such that (i % 4) == 3.
+*
+* Applies the same three steps as mixcolumns_3, in the reverse order, to
+* each of the 8 words of 'state' (updated in place).
+****************************************************************************/
+void inv_mixcolumns_3(u32* state) {
+	u32 tmp;
+	for(int i = 0; i < 8; i++) {
+		tmp = state[i] & 0x03030303;
+		state[i] ^= ROR(tmp,26);
+		tmp = state[i] & 0x30303030;
+		state[i] ^= ROR(tmp,4);
+		tmp = state[i] & 0x03030303;
+		state[i] ^= ROR(tmp,30);
+	}
+}
+
+/****************************************************************************
+* XORs one round of tweakey material into the state: for each of the 8
+* state words, state[w] ^= rtk1[w] ^ rtk2_3[w]. The round constants are
+* expected to be already folded into the tweakey words.
+****************************************************************************/
+void add_tweakey(u32* state, const u32* rtk1, const u32* rtk2_3) {
+	for (int w = 0; w < 8; w++) {
+		u32 round_key = rtk1[w] ^ rtk2_3[w];
+		state[w] ^= round_key;
+	}
+}
+
+/****************************************************************************
+* Encryption of 2 blocks in parallel using SKINNY-128-384+ (10 calls to
+* QUADRUPLE_ROUND, i.e. 40 rounds).
+* The round tweakeys are given in 'tk' as two separate arrays, 'tk.rtk1' and
+* 'tk.rtk2_3', to avoid unnecessary recomputations of the entire tk schedule
+* during SKINNY-AEAD-M1.
+* 'tk.rtk1' is read at offsets 0/32/64/96 cyclically (it is reused every 4
+* quadruple rounds) whereas 'tk.rtk2_3' advances by 32 words per call.
+* NOTE(review): 'tk' is passed by value, so the whole struct is copied on
+* each call.
+****************************************************************************/
+void skinny128_384_plus_encrypt(u8* ctext, u8* ctext_bis, const u8* ptext, 
+					const u8* ptext_bis, const tweakey tk) {
+	u32 state[8];
+	packing(state, ptext, ptext_bis);					// load both input blocks into the 8-word state
+	QUADRUPLE_ROUND(state, tk.rtk1, 	tk.rtk2_3);
+	QUADRUPLE_ROUND(state, tk.rtk1+32, 	tk.rtk2_3+32);
+	QUADRUPLE_ROUND(state, tk.rtk1+64, 	tk.rtk2_3+64);
+	QUADRUPLE_ROUND(state, tk.rtk1+96, 	tk.rtk2_3+96);
+	QUADRUPLE_ROUND(state, tk.rtk1, 	tk.rtk2_3+128);	// rtk1 wraps around here
+	QUADRUPLE_ROUND(state, tk.rtk1+32, 	tk.rtk2_3+160);
+	QUADRUPLE_ROUND(state, tk.rtk1+64, 	tk.rtk2_3+192);
+	QUADRUPLE_ROUND(state, tk.rtk1+96, 	tk.rtk2_3+224);
+	QUADRUPLE_ROUND(state, tk.rtk1, 	tk.rtk2_3+256);	// rtk1 wraps around again
+	QUADRUPLE_ROUND(state, tk.rtk1+32, 	tk.rtk2_3+288);
+	unpacking(ctext, ctext_bis, state);					// write the state back out as two blocks
+}
+
+/****************************************************************************
+* Decryption of 2 blocks in parallel using SKINNY-128-384+ (10 calls to
+* INV_QUADRUPLE_ROUND, i.e. 40 rounds).
+* The round tweakeys are given in 'tk' as two separate arrays, 'tk.rtk1' and
+* 'tk.rtk2_3', to avoid unnecessary recomputations of the entire tk schedule
+* during SKINNY-AEAD-M1.
+* The tweakey offsets are the ones used by skinny128_384_plus_encrypt, taken
+* in the reverse order.
+* NOTE(review): 'tk' is passed by value, so the whole struct is copied on
+* each call.
+****************************************************************************/
+void skinny128_384_plus_decrypt(u8* ptext, u8* ptext_bis, const u8* ctext, 
+					const u8* ctext_bis, const tweakey tk) {
+	u32 state[8];
+	packing(state, ctext, ctext_bis);						// load both input blocks into the 8-word state
+	INV_QUADRUPLE_ROUND(state, tk.rtk1+32, 	tk.rtk2_3+288);
+	INV_QUADRUPLE_ROUND(state, tk.rtk1, 	tk.rtk2_3+256);
+	INV_QUADRUPLE_ROUND(state, tk.rtk1+96, 	tk.rtk2_3+224);
+	INV_QUADRUPLE_ROUND(state, tk.rtk1+64, 	tk.rtk2_3+192);
+	INV_QUADRUPLE_ROUND(state, tk.rtk1+32, 	tk.rtk2_3+160);
+	INV_QUADRUPLE_ROUND(state, tk.rtk1, 	tk.rtk2_3+128);
+	INV_QUADRUPLE_ROUND(state, tk.rtk1+96, 	tk.rtk2_3+96);
+	INV_QUADRUPLE_ROUND(state, tk.rtk1+64, 	tk.rtk2_3+64);
+	INV_QUADRUPLE_ROUND(state, tk.rtk1+32, 	tk.rtk2_3+32);
+	INV_QUADRUPLE_ROUND(state, tk.rtk1, 	tk.rtk2_3);
+	unpacking(ptext, ptext_bis, state);						// write the state back out as two blocks
+}
\ No newline at end of file
--- a/skinny/Implementations/crypto_aead/skinnyaeadm1+v11/opt32_2/skinny128.h
+++ b/skinny/Implementations/crypto_aead/skinnyaeadm1+v11/opt32_2/skinny128.h
+#ifndef SKINNY128_H_
+#define SKINNY128_H_
+#include "tk_schedule.h"
+
+// Encrypts two blocks in parallel (ptext -> ctext, ptext_bis -> ctext_bis)
+// with the precomputed round tweakeys held in 'tk'.
+void skinny128_384_plus_encrypt(u8* ctext, u8* ctext_bis, const u8* ptext, 
+					const u8* ptext_bis, const tweakey tk);
+
+// Decrypts two blocks in parallel (ctext -> ptext, ctext_bis -> ptext_bis)
+// with the precomputed round tweakeys held in 'tk'. The parameter names
+// follow the definition in skinny128.c: the two output buffers come first.
+void skinny128_384_plus_decrypt(u8* ptext, u8* ptext_bis, const u8* ctext, 
+					const u8* ctext_bis, const tweakey tk);
+
+#define SKINNY128_384_ROUNDS	40		// number of rounds (10 quadruple rounds in skinny128.c)
+
+#define ROR(x,y) (((x) >> (y)) | ((x) << (32 - (y))))	// 32-bit rotate right; y must be in 1..31 (y == 0 would shift by 32)
+
+#define QUADRUPLE_ROUND(state, rtk1, rtk2_3) ({			/* four consecutive rounds on the 8-word 'state'; reads 32 words from each of rtk1 and rtk2_3 */ \
+	state[3] ^= (state[0] | state[1]);					/* 1st round: presumably the bitsliced S-box layer (8 statements) */ \
+	state[7] ^= (state[4] | state[5]);					\
+	state[1] ^= (state[6] | state[5]);					\
+	state[2] ^= (state[3] & state[7]);					\
+	state[6] ^= (~state[7] | state[4]);					\
+	state[0] ^= (state[2] | ~state[1]);					\
+	state[4] ^= (~state[3] | state[2]);					\
+	state[5] ^= (state[6] & state[0]);					\
+	add_tweakey(state, rtk1, rtk2_3); 					/* tweakey words 0..7 */ \
+	mixcolumns_0(state);								\
+	state[4] ^= (state[2] | state[3]);					/* 2nd round */ \
+	state[5] ^= (state[6] | state[1]);					\
+	state[3] ^= (state[0] | state[1]);					\
+	state[7] ^= (state[4] & state[5]);					\
+	state[0] ^= (~state[5] | state[6]);					\
+	state[2] ^= (state[7] | ~state[3]);					\
+	state[6] ^= (~state[4] | state[7]);					\
+	state[1] ^= (state[0] & state[2]);					\
+	add_tweakey(state, rtk1+8, rtk2_3+8); 				/* tweakey words 8..15 */ \
+	mixcolumns_1(state);								\
+	state[6] ^= (state[7] | state[4]);					/* 3rd round */ \
+	state[1] ^= (state[0] | state[3]);					\
+	state[4] ^= (state[2] | state[3]);					\
+	state[5] ^= (state[6] & state[1]);					\
+	state[2] ^= (~state[1] | state[0]);					\
+	state[7] ^= (state[5] | ~state[4]);					\
+	state[0] ^= (~state[6] | state[5]);					\
+	state[3] ^= (state[2] & state[7]);					\
+	add_tweakey(state, rtk1+16, rtk2_3+16); 			/* tweakey words 16..23 */ \
+	mixcolumns_2(state);								\
+	state[0] ^= (state[5] | state[6]);					/* 4th round */ \
+	state[3] ^= (state[2] | state[4]);					\
+	state[6] ^= (state[7] | state[4]);					\
+	state[1] ^= (state[0] & state[3]);					\
+	state[7] ^= (~state[3] | state[2]);					\
+	state[5] ^= (state[1] | ~state[6]);					\
+	state[2] ^= (~state[0] | state[1]);					\
+	state[4] ^= (state[7] & state[5]);					\
+	add_tweakey(state, rtk1+24, rtk2_3+24); 			/* tweakey words 24..31 */ \
+	mixcolumns_3(state);								\
+	state[0] ^= state[1]; 								/* XOR-swap state[0] <-> state[1] */ \
+	state[1] ^= state[0]; 								\
+	state[0] ^= state[1]; 								\
+	state[2] ^= state[3]; 								/* XOR-swap state[2] <-> state[3] */ \
+	state[3] ^= state[2]; 								\
+	state[2] ^= state[3]; 								\
+	state[4] ^= state[7]; 								/* XOR-swap state[4] <-> state[7] */ \
+	state[7] ^= state[4]; 								\
+	state[4] ^= state[7]; 								\
+	state[5] ^= state[6]; 								/* XOR-swap state[5] <-> state[6] */ \
+	state[6] ^= state[5]; 								\
+	state[5] ^= state[6]; 								\
+})
+
+#define INV_QUADRUPLE_ROUND(state, rtk1, rtk2_3) ({		/* undoes QUADRUPLE_ROUND: same statements in the reverse order, with inv_mixcolumns_* in place of mixcolumns_* */ \
+	state[0] ^= state[1]; 								/* XOR-swap state[0] <-> state[1] */ \
+	state[1] ^= state[0]; 								\
+	state[0] ^= state[1]; 								\
+	state[2] ^= state[3]; 								/* XOR-swap state[2] <-> state[3] */ \
+	state[3] ^= state[2]; 								\
+	state[2] ^= state[3]; 								\
+	state[4] ^= state[7]; 								/* XOR-swap state[4] <-> state[7] */ \
+	state[7] ^= state[4]; 								\
+	state[4] ^= state[7]; 								\
+	state[5] ^= state[6]; 								/* XOR-swap state[5] <-> state[6] */ \
+	state[6] ^= state[5]; 								\
+	state[5] ^= state[6]; 								\
+	inv_mixcolumns_3(state);							/* undo the 4th round */ \
+	add_tweakey(state, rtk1+24, rtk2_3+24); 			/* tweakey words 24..31 */ \
+	state[4] ^= (state[7] & state[5]);					\
+	state[2] ^= (~state[0] | state[1]);					\
+	state[5] ^= (state[1] | ~state[6]);					\
+	state[7] ^= (~state[3] | state[2]);					\
+	state[1] ^= (state[0] & state[3]);					\
+	state[6] ^= (state[7] | state[4]);					\
+	state[3] ^= (state[2] | state[4]);					\
+	state[0] ^= (state[5] | state[6]);					\
+	inv_mixcolumns_2(state);							/* undo the 3rd round */ \
+	add_tweakey(state, rtk1+16, rtk2_3+16); 			/* tweakey words 16..23 */ \
+	state[3] ^= (state[2] & state[7]);					\
+	state[0] ^= (~state[6] | state[5]);					\
+	state[7] ^= (state[5] | ~state[4]);					\
+	state[2] ^= (~state[1] | state[0]);					\
+	state[5] ^= (state[6] & state[1]);					\
+	state[4] ^= (state[2] | state[3]);					\
+	state[1] ^= (state[0] | state[3]);					\
+	state[6] ^= (state[7] | state[4]);					\
+	inv_mixcolumns_1(state);							/* undo the 2nd round */ \
+	add_tweakey(state, rtk1+8, rtk2_3+8); 				/* tweakey words 8..15 */ \
+	state[1] ^= (state[0] & state[2]);					\
+	state[6] ^= (~state[4] | state[7]);					\
+	state[2] ^= (state[7] | ~state[3]);					\
+	state[0] ^= (~state[5] | state[6]);					\
+	state[7] ^= (state[4] & state[5]);					\
+	state[3] ^= (state[0] | state[1]);					\
+	state[5] ^= (state[6] | state[1]);					\
+	state[4] ^= (state[2] | state[3]);					\
+	inv_mixcolumns_0(state); 							/* undo the 1st round */ \
+	add_tweakey(state, rtk1, rtk2_3); 					/* tweakey words 0..7 */ \
+	state[5] ^= (state[6] & state[0]);					\
+	state[4] ^= (~state[3] | state[2]);					\
+	state[0] ^= (state[2] | ~state[1]);					\
+	state[6] ^= (~state[7] | state[4]);					\
+	state[2] ^= (state[3] & state[7]);					\
+	state[1] ^= (state[6] | state[5]);					\
+	state[7] ^= (state[4] | state[5]);					\
+	state[3] ^= (state[0] | state[1]);					\
+})
+
+#endif  // SKINNY128_H_
\ No newline at end of file
--- a/skinny/Implementations/crypto_aead/skinnyaeadm1+v11/opt32_2/skinnyaead.h
+++ b/skinny/Implementations/crypto_aead/skinnyaeadm1+v11/opt32_2/skinnyaead.h
+#ifndef SKINNYAEADM1_H_
+#define SKINNYAEADM1_H_
+
+#include "skinny128.h"
+
+typedef unsigned char u8;
+typedef unsigned int u32;           // NOTE(review): assumed to be 32 bits wide on the target -- confirm
+typedef unsigned long long u64;     // wide enough for the (1ULL << 63) test in UPDATE_LFSR
+
+#define TAGBYTES    16
+#define KEYBYTES    16
+#define BLOCKBYTES  16
+
+#define SET_DOMAIN(ptr, domain) ((ptr)[15] = (domain))   // stores 'domain' in byte 15 of 'ptr'
+
+#define UPDATE_LFSR(lfsr) ({          /* shift 'lfsr' left by 1, XOR 0x1B in if bit 63 was set */ \
+    feedback = ((lfsr) & (1ULL << 63)) ? 0x1B : 0x00;   /* writes a variable 'feedback' that must exist in the caller's scope */ \
+    (lfsr) = ((lfsr) << 1) ^ feedback;                  \
+})
+
+#define LE_STR_64(ptr, x)  ({       /* stores the 8 bytes of 'x' in ptr[0..7], least significant byte first */ \
+    (ptr)[0] = (u8)(x);             \
+    (ptr)[1] = (u8)((x) >> 8);      \
+    (ptr)[2] = (u8)((x) >> 16);     \
+    (ptr)[3] = (u8)((x) >> 24);     \
+    (ptr)[4] = (u8)((x) >> 32);     \
+    (ptr)[5] = (u8)((x) >> 40);     \
+    (ptr)[6] = (u8)((x) >> 48);     \
+    (ptr)[7] = (u8)((x) >> 56);     \
+})
+
+#endif  // SKINNYAEADM1_H_
\ No newline at end of file
--- a/skinny/Implementations/crypto_aead/skinnyaeadm1+v11/opt32_2/tk_schedule.c
+++ b/skinny/Implementations/crypto_aead/skinnyaeadm1+v11/opt32_2/tk_schedule.c
--- a/skinny/Implementations/crypto_aead/skinnyaeadm1+v11/opt32_2/tk_schedule.h
+++ b/skinny/Implementations/crypto_aead/skinnyaeadm1+v11/opt32_2/tk_schedule.h
+#ifndef TK_SCHEDULE_BS_H_
+#define TK_SCHEDULE_BS_H_
+
+typedef unsigned char u8;
+typedef unsigned int u32;
+
+typedef struct {
+	u32 rtk1[8*16];			// 8 words per round, 16 rounds' worth
+	u32 rtk2_3[8*40];		// 8 words per round, 40 rounds' worth
+} tweakey;
+	
+void packing(u32* out, const u8* block0, const u8* block1);		// skinny128.c uses it to fill an 8-word state from two blocks
+void unpacking(u8* out, u8* out_bis, u32 *in);					// skinny128.c uses it to write an 8-word state back as two blocks
+void precompute_rtk2_3(u32* rtk, const u8* tk2, const u8* tk3, int rounds);	// defined in tk_schedule.c (not visible here)
+void precompute_rtk1(u32* rtk1, const u8* tk1, const u8* tk1_bis);			// defined in tk_schedule.c (not visible here)
+
+#define LFSR2(tk) ({				/* moves tk[1..7] down to tk[0..6] and sets tk[7] = old tk[0] ^ old tk[2] */ \
+	tmp = (tk)[0] ^ (tk)[2];		/* uses a variable 'tmp' from the caller's scope */ \
+	(tk)[0] = (tk)[1]; 				\
+	(tk)[1] = (tk)[2];				\
+	(tk)[2] = (tk)[3];				\
+	(tk)[3] = (tk)[4];				\
+	(tk)[4] = (tk)[5];				\
+	(tk)[5] = (tk)[6];				\
+	(tk)[6] = (tk)[7];				\
+	(tk)[7] = tmp;					\
+})
+
+#define LFSR3(tk) ({				/* moves tk[0..6] up to tk[1..7] and sets tk[0] = old tk[7] ^ old tk[1] */ \
+	tmp = (tk)[7] ^ (tk)[1]; 		/* uses a variable 'tmp' from the caller's scope */ \
+	(tk)[7] = (tk)[6];				\
+	(tk)[6] = (tk)[5];				\
+	(tk)[5] = (tk)[4];				\
+	(tk)[4] = (tk)[3];				\
+	(tk)[3] = (tk)[2];				\
+	(tk)[2] = (tk)[1];				\
+	(tk)[1] = (tk)[0];				\
+	(tk)[0] = tmp;					\
+})
+
+#define XOR_BLOCK(x,y) ({ 			/* x[j] ^= y[j] for j = 0..7 */ \
+	(x)[0] ^= (y)[0];				\
+	(x)[1] ^= (y)[1];				\
+	(x)[2] ^= (y)[2];				\
+	(x)[3] ^= (y)[3];				\
+	(x)[4] ^= (y)[4];				\
+	(x)[5] ^= (y)[5];				\
+	(x)[6] ^= (y)[6];				\
+	(x)[7] ^= (y)[7];				\
+})
+
+#define SWAPMOVE(a, b, mask, n)	({	/* swaps the bits of b selected by mask with the bits of a selected by (mask << n) */ \
+	tmp = (b ^ (a >> n)) & mask;	/* uses a variable 'tmp' from the caller's scope; arguments are not parenthesized */ \
+	b ^= tmp;						\
+	a ^= (tmp << n);				\
+})
+
+#define LE_LOAD(x, y) 				/* *x = little-endian 32-bit word built from y[0..3]; expansion already ends with ';' */ \
+	*(x) = (((u32)(y)[3] << 24) | 	\
+		((u32)(y)[2] << 16) 	| 	\
+		((u32)(y)[1] << 8) 		| 	\
+		(y)[0]);
+
+#define LE_STORE(x, y)				/* x[0..3] = bytes of y, least significant first. NOTE(review): expands to four bare statements, so it is unsafe as the body of an unbraced if/else/loop */ \
+	(x)[0] = (y) & 0xff; 			\
+	(x)[1] = ((y) >> 8) & 0xff; 	\
+	(x)[2] = ((y) >> 16) & 0xff; 	\
+	(x)[3] = (y) >> 24; 
+
+#define ROR(x,y) (((x) >> (y)) | ((x) << (32 - (y))))	// 32-bit rotate right; y must be in 1..31 (y == 0 would shift by 32)
+
+#endif  // TK_SCHEDULE_BS_H_
\ No newline at end of file
--- a/skinny/Implementations/crypto_aead/skinnyaeadtk3128128v1/armcortexm_1/api.h
+++ b/skinny/Implementations/crypto_aead/skinnyaeadtk3128128v1/armcortexm_1/api.h
+#define CRYPTO_KEYBYTES 16		// key size in bytes
+#define CRYPTO_NSECBYTES 0		// no secret message number (encrypt.c ignores 'nsec')
+#define CRYPTO_NPUBBYTES 16		// public nonce size in bytes
+#define CRYPTO_ABYTES 16		// ciphertext expansion; matches TAGBYTES used by encrypt.c
+#define CRYPTO_NOOVERLAP 1		// NOTE(review): presumably tells the benchmarking framework that buffers must not overlap -- confirm
--- a/skinny/Implementations/crypto_aead/skinnyaeadtk3128128v1/armcortexm_1/crypto_aead.h
+++ b/skinny/Implementations/crypto_aead/skinnyaeadtk3128128v1/armcortexm_1/crypto_aead.h
+
+int crypto_aead_encrypt(
+	unsigned char *c, unsigned long long *clen,			// out: ciphertext followed by the tag; *clen = mlen + 16
+	const unsigned char *m, unsigned long long mlen,	// in: message
+	const unsigned char *ad, unsigned long long adlen,	// in: associated data
+	const unsigned char *nsec,							// unused by encrypt.c
+	const unsigned char *npub,							// in: public nonce
+	const unsigned char *k								// in: key
+);
+
+int crypto_aead_decrypt(
+	unsigned char *m, unsigned long long *mlen,			// out: message; *mlen = clen - 16
+	unsigned char *nsec,								// unused by encrypt.c
+	const unsigned char *c, unsigned long long clen,	// in: ciphertext followed by the tag
+	const unsigned char *ad, unsigned long long adlen,	// in: associated data
+	const unsigned char *npub,							// in: public nonce
+	const unsigned char *k								// in: key
+);
\ No newline at end of file
--- a/skinny/Implementations/crypto_aead/skinnyaeadtk3128128v1/armcortexm_1/encrypt.c
+++ b/skinny/Implementations/crypto_aead/skinnyaeadtk3128128v1/armcortexm_1/encrypt.c
+/******************************************************************************
+* Constant-time implementation of SKINNY-AEAD-M1 (v1.1).
+*
+* Blocks are processed one at a time with SKINNY-128-384 (this variant does
+* not process two blocks in parallel).
+*
+* For more details, see the paper at: https://
+*
+* @author   Alexandre Adomnicai, Nanyang Technological University,
+*           alexandre.adomnicai@ntu.edu.sg
+*
+* @date     May 2020
+******************************************************************************/
+#include "skinny128.h"
+#include "skinnyaead.h"
+#include <string.h>
+#include <stdio.h>
+
+/******************************************************************************
+* In-place XOR of two 128-bit blocks: every one of the BLOCKBYTES bytes of x
+* is XORed with the byte of y at the same position.
+******************************************************************************/
+static void xor_block(u8 * x, const u8* y) {
+    const u8* const end = y + BLOCKBYTES;
+    while (y != end)
+        *x++ ^= *y++;
+}
+
+/******************************************************************************
+* Encryption and authentication using SKINNY-AEAD-M1
+*
+* Writes the mlen ciphertext bytes followed by the TAGBYTES-byte tag to 'c'
+* and sets *clen = mlen + TAGBYTES. 'nsec' is unused. Always returns 0.
+* The message is processed first (the tag slot then holds the encrypted
+* message checksum), the associated data second; the AD result is finally
+* XORed into the tag.
+******************************************************************************/
+int crypto_aead_encrypt (unsigned char *c, unsigned long long *clen,
+                    const unsigned char *m, unsigned long long mlen,
+                    const unsigned char *ad, unsigned long long adlen,
+                    const unsigned char *nsec,
+                    const unsigned char *npub,
+                    const unsigned char *k) {
+    u64 i,lfsr = 1;
+    u8 feedback;                        // written by the UPDATE_LFSR macro
+    u32 rtk1[4*16];
+    u32 rtk2_3[4*SKINNY128_384_ROUNDS];
+    u8 tmp[BLOCKBYTES], auth[BLOCKBYTES], sum [BLOCKBYTES];
+    (void)nsec;
+
+    // ----------------- Initialization -----------------
+    *clen = mlen + TAGBYTES;
+    tkschedule_lfsr(rtk2_3, npub, k, SKINNY128_384_ROUNDS);
+    tkschedule_perm(rtk2_3);
+    memset(tmp, 0x00, BLOCKBYTES);
+    memset(auth, 0x00, BLOCKBYTES);
+    memset(sum, 0x00, BLOCKBYTES);
+    // ----------------- Initialization -----------------
+
+    // ----------------- Process the plaintext -----------------
+    while (mlen >= BLOCKBYTES) {        // while entire blocks to process
+        LE_STR_64(tmp, lfsr);
+        tkschedule_perm_tk1(rtk1, tmp); // precompute RTK1 given the LFSR
+        skinny128_384(c, rtk2_3, m, rtk1);
+        xor_block(sum, m);              // sum for tag computation
+        mlen -= BLOCKBYTES;
+        c += BLOCKBYTES;
+        m += BLOCKBYTES;
+        UPDATE_LFSR(lfsr);              // update lfsr for next block
+    }
+    SET_DOMAIN(tmp, 0x04);              // domain for tag computation
+    if (mlen > 0) {                     // last block is partial
+        LE_STR_64(tmp, lfsr);           // lfsr for last block
+        SET_DOMAIN(tmp, 0x01);          // domain for padding
+        for(i = 0; i < mlen; i++)
+            sum[i] ^= m[i];             // sum for tag computation
+        sum[i] ^= 0x80;                 // padding
+        tkschedule_perm_tk1(rtk1, tmp);
+        skinny128_384(auth, rtk2_3, auth, rtk1); // encrypt 'auth' = 0^16
+        for(i = 0; i < mlen; i++)
+            c[i] = auth[i] ^ m[i];      // encrypted padded block
+        c += mlen;
+        SET_DOMAIN(tmp, 0x05);          // domain for tag computation
+        UPDATE_LFSR(lfsr);
+    }
+    LE_STR_64(tmp, lfsr);               // lfsr for tag computation                                  
+    tkschedule_perm_tk1(rtk1, tmp);
+    skinny128_384(sum, rtk2_3, sum, rtk1);  // compute the tag
+    memcpy(c, sum, TAGBYTES);
+    // ----------------- Process the plaintext -----------------
+
+    // ----------------- Process the associated data -----------------
+    lfsr = 1;
+    SET_DOMAIN(tmp, 0x02);
+    memset(auth, 0x00, BLOCKBYTES);
+    while (adlen >= BLOCKBYTES) {
+        LE_STR_64(tmp, lfsr);
+        tkschedule_perm_tk1(rtk1, tmp);
+        skinny128_384(sum, rtk2_3, ad, rtk1);   // use 'sum' as tmp array
+        xor_block(auth, sum);
+        adlen -= BLOCKBYTES;
+        ad += BLOCKBYTES;
+        UPDATE_LFSR(lfsr);
+    }
+    if (adlen > 0) {
+        LE_STR_64(tmp, lfsr);
+        SET_DOMAIN(tmp, 0x03);          // domain for padding ad
+        tkschedule_perm_tk1(rtk1, tmp);
+        memset(tmp, 0x00, BLOCKBYTES);  // padding
+        memcpy(tmp, ad, adlen);         // padding
+        tmp[adlen] = 0x80;              // padding
+        skinny128_384(tmp, rtk2_3, tmp, rtk1);
+        xor_block(auth, tmp);
+    }
+    xor_block(c, auth);                 // XOR for tag computation
+    // ----------------- Process the associated data -----------------
+    return 0;
+}
+
+/******************************************************************************
+* Decryption and tag verification using SKINNY-AEAD-M1
+*
+* 'c' holds clen bytes: the ciphertext followed by the TAGBYTES-byte tag.
+* Writes the clen - TAGBYTES plaintext bytes to 'm' and that length to *mlen.
+* 'nsec' is unused.
+* Returns 0 when the recomputed tag matches the received one and -1
+* otherwise; also returns -1, without writing anything, when
+* clen < TAGBYTES.
+* NOTE: the plaintext is written to 'm' before the tag is checked, so the
+* caller must discard it whenever -1 is returned.
+******************************************************************************/
+int crypto_aead_decrypt (unsigned char *m, unsigned long long *mlen,
+                    unsigned char *nsec,
+                    const unsigned char *c, unsigned long long clen,
+                    const unsigned char *ad, unsigned long long adlen,
+                    const unsigned char *npub,
+                    const unsigned char *k) {
+    u64 i,lfsr = 1;
+    u8 feedback;                        // written by UPDATE_LFSR, reused for the tag check
+    u32 rtk1[4*16];
+    u32 rtk2_3[4*SKINNY128_384_ROUNDS];
+    u8 tmp[2*BLOCKBYTES], auth[BLOCKBYTES], sum[BLOCKBYTES];
+    (void)nsec;
+
+    if (clen < TAGBYTES)
+        return -1;
+
+    // ----------------- Initialization -----------------
+    clen -= TAGBYTES;
+    *mlen = clen;
+    tkschedule_lfsr(rtk2_3, npub, k, SKINNY128_384_ROUNDS);
+    tkschedule_perm(rtk2_3);
+    memset(tmp, 0x00, 2*BLOCKBYTES);
+    memset(auth, 0x00, BLOCKBYTES);
+    memset(sum, 0x00, BLOCKBYTES);
+    // ----------------- Initialization -----------------
+
+    // ----------------- Process the ciphertext -----------------
+    while (clen >= BLOCKBYTES) {        // while entire blocks to process
+        LE_STR_64(tmp, lfsr);
+        tkschedule_perm_tk1(rtk1, tmp); // precompute RTK1 given the LFSR
+        skinny128_384_inv(m, rtk2_3, c, rtk1);
+        xor_block(sum, m);              // sum for tag computation
+        clen -= BLOCKBYTES;
+        c += BLOCKBYTES;
+        m += BLOCKBYTES;
+        UPDATE_LFSR(lfsr);              // update LFSR for the next block
+    }
+    SET_DOMAIN(tmp, 0x04);              // domain for tag computation
+    if (clen > 0) {                     // last block is partial
+        LE_STR_64(tmp, lfsr);           // lfsr for last block
+        SET_DOMAIN(tmp, 0x01);          // domain for padding
+        tkschedule_perm_tk1(rtk1, tmp);
+        skinny128_384(auth, rtk2_3, auth, rtk1); // encrypt 'auth' = 0^16
+        for(i = 0; i < clen; i++) {
+            m[i] = auth[i] ^ c[i];      // decrypted partial block
+            sum[i] ^= m[i];             // sum for tag computation
+        }
+        sum[i] ^= 0x80;                 // padding
+        c += clen;                      // 'c' now points at the received tag
+        SET_DOMAIN(tmp, 0x05);          // domain for tag computation
+        UPDATE_LFSR(lfsr);
+    }
+    LE_STR_64(tmp, lfsr);               // lfsr for tag computation
+    tkschedule_perm_tk1(rtk1, tmp);
+    skinny128_384(sum, rtk2_3, sum, rtk1); // compute the tag
+    // ----------------- Process the ciphertext -----------------
+
+    // ----------------- Process the associated data -----------------
+    lfsr = 1;
+    SET_DOMAIN(tmp, 0x02);
+    memset(auth, 0x00, BLOCKBYTES);
+    while (adlen >= BLOCKBYTES) {
+        LE_STR_64(tmp, lfsr);
+        tkschedule_perm_tk1(rtk1, tmp);
+        skinny128_384(tmp + BLOCKBYTES, rtk2_3, ad, rtk1); // upper half of 'tmp' is scratch space
+        xor_block(auth, tmp + BLOCKBYTES);
+        adlen -= BLOCKBYTES;
+        ad += BLOCKBYTES;
+        UPDATE_LFSR(lfsr);
+    }
+    if (adlen > 0) {
+        LE_STR_64(tmp, lfsr);
+        SET_DOMAIN(tmp, 0x03);          // domain for padding ad
+        tkschedule_perm_tk1(rtk1, tmp);
+        memset(tmp, 0x00, BLOCKBYTES);  // padding
+        memcpy(tmp, ad, adlen);         // padding
+        tmp[adlen] ^= 0x80;             // padding
+        skinny128_384(tmp, rtk2_3, tmp, rtk1);
+        xor_block(auth, tmp);
+    }
+    xor_block(sum, auth);               // XOR for tag computation
+    // ----------------- Process the associated data -----------------
+
+    // ----------------- Verify the tag -----------------
+    feedback = 0;
+    for(i = 0; i < TAGBYTES; i++)
+        feedback |= sum[i] ^ c[i];      // constant-time tag comparison
+    // Branch-free mapping: feedback == 0 -> 0, any other value -> -1, so the
+    // caller only learns pass/fail and gets the conventional -1 on failure.
+    return (int)(1 & (((u32)feedback - 1) >> 8)) - 1;
+    // ----------------- Verify the tag -----------------
+}
\ No newline at end of file
--- a/skinny/Implementations/crypto_aead/skinnyaeadtk3128128v1/armcortexm_1/skinny128.h
+++ b/skinny/Implementations/crypto_aead/skinnyaeadtk3128128v1/armcortexm_1/skinny128.h
+#ifndef SKINNY128_H_
+#define SKINNY128_H_
+
+typedef unsigned char u8;
+typedef unsigned int u32;
+
+#define SKINNY128_384_ROUNDS	56
+
+extern void skinny128_384(u8* ctext, const u32* rtk2_3, const u8* ptext, const u32* rtk1);		// one-block encryption; presumably implemented in skinny128.s -- confirm
+extern void skinny128_384_inv(u8* ptext, const u32* rtk2_3, const u8* ctext, const u32* rtk1);	// one-block decryption
+extern void tkschedule_lfsr(u32* rtk2_3, const u8* tk2, const u8* tk3, const int rounds);		// encrypt.c passes the nonce as tk2 and the key as tk3
+extern void tkschedule_perm(u32* rtk2_3);														// encrypt.c calls it right after tkschedule_lfsr on the same array
+extern void tkschedule_perm_tk1(u32* rtk1, const u8* tk1);										// encrypt.c calls it once per block with the counter/domain block as tk1
+
+
+#endif  // SKINNY128_H_
\ No newline at end of file
--- a/skinny/Implementations/crypto_aead/skinnyaeadtk3128128v1/armcortexm_1/skinny128.s
+++ b/skinny/Implementations/crypto_aead/skinnyaeadtk3128128v1/armcortexm_1/skinny128.s
--- a/skinny/Implementations/crypto_aead/skinnyaeadtk3128128v1/armcortexm_1/skinnyaead.h
+++ b/skinny/Implementations/crypto_aead/skinnyaeadtk3128128v1/armcortexm_1/skinnyaead.h
+#ifndef SKINNYAEADM1_H_
+#define SKINNYAEADM1_H_
+
+#include "skinny128.h"
+
+typedef unsigned char u8;
+typedef unsigned int u32;           // NOTE(review): assumed to be 32 bits wide on the target -- confirm
+typedef unsigned long long u64;     // wide enough for the (1ULL << 63) test in UPDATE_LFSR
+
+#define TAGBYTES    16
+#define KEYBYTES    16
+#define BLOCKBYTES  16
+
+#define SET_DOMAIN(ptr, domain) ((ptr)[15] = (domain))   // stores 'domain' in byte 15 of 'ptr'
+
+#define UPDATE_LFSR(lfsr) ({          /* shift 'lfsr' left by 1, XOR 0x1B in if bit 63 was set */ \
+    feedback = ((lfsr) & (1ULL << 63)) ? 0x1B : 0x00;   /* writes a variable 'feedback' that must exist in the caller's scope */ \
+    (lfsr) = ((lfsr) << 1) ^ feedback;                  \
+})
+
+#define LE_STR_64(ptr, x)  ({       /* stores the 8 bytes of 'x' in ptr[0..7], least significant byte first */ \
+    (ptr)[0] = (u8)(x);             \
+    (ptr)[1] = (u8)((x) >> 8);      \
+    (ptr)[2] = (u8)((x) >> 16);     \
+    (ptr)[3] = (u8)((x) >> 24);     \
+    (ptr)[4] = (u8)((x) >> 32);     \
+    (ptr)[5] = (u8)((x) >> 40);     \
+    (ptr)[6] = (u8)((x) >> 48);     \
+    (ptr)[7] = (u8)((x) >> 56);     \
+})
+//x ^= y with x, y 128-bit blocks. NOTE(review): accesses both operands through u32*, so they must be 4-byte aligned and this relies on the compiler tolerating the type pun -- confirm for byte-buffer callers
+#define XOR_BLOCK(x,y) ({               \
+    ((u32*)(x))[0] ^= ((u32*)(y))[0];   \
+    ((u32*)(x))[1] ^= ((u32*)(y))[1];   \
+    ((u32*)(x))[2] ^= ((u32*)(y))[2];   \
+    ((u32*)(x))[3] ^= ((u32*)(y))[3];   \
+})
+
+#endif  // SKINNYAEADM1_H_
\ No newline at end of file
--- a/skinny/Implementations/crypto_aead/skinnyaeadtk3128128v1/armcortexm_2/api.h
+++ b/skinny/Implementations/crypto_aead/skinnyaeadtk3128128v1/armcortexm_2/api.h
+#define CRYPTO_KEYBYTES 16		// key size in bytes
+#define CRYPTO_NSECBYTES 0		// no secret message number
+#define CRYPTO_NPUBBYTES 16		// public nonce size in bytes
+#define CRYPTO_ABYTES 16		// presumably the tag size added to the ciphertext -- confirm
+#define CRYPTO_NOOVERLAP 1		// NOTE(review): presumably tells the benchmarking framework that buffers must not overlap -- confirm
--- a/skinny/Implementations/crypto_aead/skinnyaeadtk3128128v1/armcortexm_2/crypto_aead.h
+++ b/skinny/Implementations/crypto_aead/skinnyaeadtk3128128v1/armcortexm_2/crypto_aead.h
+//API required by the NIST for the LWC competition (authenticated encryption)
+int crypto_aead_encrypt(unsigned char *c, unsigned long long *clen,             // out: ciphertext buffer and its length
+                        const unsigned char *m, unsigned long long mlen,        // in: message
+                        const unsigned char *ad, unsigned long long adlen,      // in: associated data
+                        const unsigned char *nsec, const unsigned char *npub,   // nsec presumably unused (CRYPTO_NSECBYTES is 0); npub: public nonce
+                        const unsigned char *k);                                // in: key
+
+//API required by the NIST for the LWC competition (authenticated decryption)
+int crypto_aead_decrypt(unsigned char *m, unsigned long long *outputmlen,       // out: message buffer and its length
+                        unsigned char *nsec,                                    // presumably unused (CRYPTO_NSECBYTES is 0) -- confirm
+                        const unsigned char *c, unsigned long long clen,        // in: ciphertext
+                        const unsigned char *ad, unsigned long long adlen,      // in: associated data
+                        const unsigned char *npub, const unsigned char *k);     // in: public nonce, key
--- a/skinny/Implementations/crypto_aead/skinnyaeadtk3128128v1/armcortexm_2/encrypt.c
+++ b/skinny/Implementations/crypto_aead/skinnyaeadtk3128128v1/armcortexm_2/encrypt.c
--- a/skinny/Implementations/crypto_aead/skinnyaeadtk3128128v1/armcortexm_2/skinny128.h
+++ b/skinny/Implementations/crypto_aead/skinnyaeadtk3128128v1/armcortexm_2/skinny128.h
+#ifndef SKINNY128_H_
+#define SKINNY128_H_
+
+// Short aliases for a byte and a machine word.
+// NOTE(review): the name u32 implies a 32-bit type, which relies on
+// unsigned int being 32 bits wide on the target; this holds on the Arm
+// Cortex-M targets this directory is named after, but is not guaranteed by C.
+typedef unsigned char u8;
+typedef unsigned int u32;
+
+// Round count for SKINNY-128-384 (presumably the value passed as `rounds`
+// to the tweakey-schedule routines declared below -- confirm in encrypt.c).
+#define SKINNY128_384_ROUNDS	56
+
+// Routines defined outside this header (a skinny128.s file sits in the same
+// directory, so presumably in assembly -- confirm). Every routine takes a
+// primary buffer and a "_bis" companion, which suggests two blocks/tweakeys
+// are handled per call; the exact layout of the rtk* round-tweakey arrays is
+// not visible from here.
+// Encrypts ptext/ptext_bis into ctext/ctext_bis using the round tweakeys.
+extern void skinny128_384(u8* ctext, u8* ctext_bis, const u8* ptext, const u8* ptext_bis, const u32* rtk1, const u32* rtk2_3);
+// Inverse of skinny128_384: decrypts ctext/ctext_bis into ptext/ptext_bis.
+extern void skinny128_384_inv(u8* ptext, u8* ptext_bis, const u8* ctext, const u8* ctext_bis, const u32* rtk1, const u32* rtk2_3);
+// Presumably expands TK2 through its LFSR for `rounds` rounds into rtk.
+extern void tkschedule_lfsr_2(u32* rtk, const u8* tk2, const u8* tk2_bis, const int rounds);
+// NOTE(review): the parameters are named tk2/tk2_bis although the function
+// name refers to TK1 -- looks like a copy-paste naming slip; confirm.
+extern void pack_tk1(u32* rtk, const u8* tk2, const u8* tk2_bis, const int rounds);
+// Presumably expands TK3 through its LFSR for `rounds` rounds into rtk.
+extern void tkschedule_lfsr_3(u32* rtk, const u8* tk3, const u8* tk3_bis, const int rounds);
+// Presumably applies the tweakey permutation in place to rtk.
+extern void tkschedule_perm(u32* rtk);
+// Presumably computes the permuted TK1 round tweakeys from tk1/tk1_bis.
+extern void tkschedule_perm_tk1(u32* rtk1, const u8* tk1, const u8* tk1_bis);
+
+#endif  // SKINNY128_H_
\ No newline at end of file
--- a/skinny/Implementations/crypto_aead/skinnyaeadtk3128128v1/armcortexm_2/skinny128.s
+++ b/skinny/Implementations/crypto_aead/skinnyaeadtk3128128v1/armcortexm_2/skinny128.s
--- a/skinny/Implementations/crypto_aead/skinnyaeadtk3128128v1/armcortexm_2/skinnyaead.h
+++ b/skinny/Implementations/crypto_aead/skinnyaeadtk3128128v1/armcortexm_2/skinnyaead.h
+#ifndef SKINNYAEADM1_H_
+#define SKINNYAEADM1_H_
+
+#include "skinny128.h"
+
+// u8 and u32 are already provided by "skinny128.h" (included above); repeating
+// those typedefs here is a redefinition that pre-C11 compilers may reject.
+// Only the 64-bit type, which skinny128.h does not declare, is defined here.
+typedef unsigned long long u64;
+
+// Sizes in bytes of, as the names indicate, the authentication tag, the key
+// and one cipher block (16 bytes = 128 bits each).
+#define TAGBYTES    16
+#define KEYBYTES    16
+#define BLOCKBYTES  16
+
+// Stores `domain` into byte 15 (the last byte of a 16-byte block) of `ptr`.
+// NOTE(review): presumably the domain-separation byte of the tweak block --
+// confirm against the caller.
+#define SET_DOMAIN(ptr, domain) ((ptr)[15] = (domain))
+
+// Advances `lfsr` by one step: shift left by one bit and, if bit 63 was set
+// before the shift, XOR the constant 0x1B into the low bits.
+// The macro assigns to a variable named `feedback` that it does not declare:
+// the caller must have an integer variable of that name in scope.
+// `lfsr` is expanded several times, so it must be a side-effect-free lvalue
+// (presumably a u64 -- confirm against the caller).
+#define UPDATE_LFSR(lfsr) ({                            \
+    feedback = ((lfsr) & (1ULL << 63)) ? 0x1B : 0x00;   \
+    (lfsr) = ((lfsr) << 1) ^ feedback;                  \
+})
+
+// Writes the 64-bit value `x` to ptr[0..7] in little-endian order (least
+// significant byte at ptr[0]).
+// `x` must have a 64-bit type: the shifts by 32..56 are undefined on a
+// narrower operand. Both arguments are expanded eight times, so they must be
+// free of side effects.
+#define LE_STR_64(ptr, x)  ({       \
+    (ptr)[0] = (u8)(x);             \
+    (ptr)[1] = (u8)((x) >> 8);      \
+    (ptr)[2] = (u8)((x) >> 16);     \
+    (ptr)[3] = (u8)((x) >> 24);     \
+    (ptr)[4] = (u8)((x) >> 32);     \
+    (ptr)[5] = (u8)((x) >> 40);     \
+    (ptr)[6] = (u8)((x) >> 48);     \
+    (ptr)[7] = (u8)((x) >> 56);     \
+})
+
+// x ^= y with x, y 128-bit (16-byte) blocks.
+// Both blocks are accessed one byte at a time through unsigned char pointers,
+// so x and y may have any alignment and any effective type. Casting byte
+// buffers to u32* (the previous approach) violates strict aliasing and can
+// fault on cores that do not support unaligned word accesses.
+// Each argument is evaluated exactly once.
+#define XOR_BLOCK(x,y) ({                                           \
+    unsigned char *xor_dst_ = (unsigned char *)(x);                 \
+    const unsigned char *xor_src_ = (const unsigned char *)(y);     \
+    unsigned int xor_idx_;                                          \
+    for (xor_idx_ = 0; xor_idx_ < 16; xor_idx_++) {                 \
+        xor_dst_[xor_idx_] ^= xor_src_[xor_idx_];                   \
+    }                                                               \
+})
+
+#endif  // SKINNYAEADM1_H_
\ No newline at end of file
--- a/skinny/Implementations/crypto_aead/skinnyaeadtk3128128v1/opt32_1/api.h
+++ b/skinny/Implementations/crypto_aead/skinnyaeadtk3128128v1/opt32_1/api.h
+#define CRYPTO_KEYBYTES 16
+#define CRYPTO_NSECBYTES 0
+#define CRYPTO_NPUBBYTES 16
+#define CRYPTO_ABYTES 16
+#define CRYPTO_NOOVERLAP 1
--- a/skinny/Implementations/crypto_aead/skinnyaeadtk3128128v1/opt32_1/crypto_aead.h
+++ b/skinny/Implementations/crypto_aead/skinnyaeadtk3128128v1/opt32_1/crypto_aead.h
--- a/skinny/Implementations/crypto_aead/skinnyaeadtk3128128v1/opt32_1/encrypt.c
+++ b/skinny/Implementations/crypto_aead/skinnyaeadtk3128128v1/opt32_1/encrypt.c
--- a/skinny/Implementations/crypto_aead/skinnyaeadtk3128128v1/opt32_1/skinny128.c
+++ b/skinny/Implementations/crypto_aead/skinnyaeadtk3128128v1/opt32_1/skinny128.c
--- a/skinny/Implementations/crypto_aead/skinnyaeadtk3128128v1/opt32_1/skinny128.h
+++ b/skinny/Implementations/crypto_aead/skinnyaeadtk3128128v1/opt32_1/skinny128.h
--- a/skinny/Implementations/crypto_aead/skinnyaeadtk3128128v1/opt32_1/skinnyaead.h
+++ b/skinny/Implementations/crypto_aead/skinnyaeadtk3128128v1/opt32_1/skinnyaead.h
--- a/skinny/Implementations/crypto_aead/skinnyaeadtk3128128v1/opt32_1/tk_schedule.c
+++ b/skinny/Implementations/crypto_aead/skinnyaeadtk3128128v1/opt32_1/tk_schedule.c
--- a/skinny/Implementations/crypto_aead/skinnyaeadtk3128128v1/opt32_1/tk_schedule.h
+++ b/skinny/Implementations/crypto_aead/skinnyaeadtk3128128v1/opt32_1/tk_schedule.h
--- a/skinny/Implementations/crypto_aead/skinnyaeadtk3128128v1/opt32_2/api.h
+++ b/skinny/Implementations/crypto_aead/skinnyaeadtk3128128v1/opt32_2/api.h
--- a/skinny/Implementations/crypto_aead/skinnyaeadtk3128128v1/opt32_2/crypto_aead.h
+++ b/skinny/Implementations/crypto_aead/skinnyaeadtk3128128v1/opt32_2/crypto_aead.h
--- a/skinny/Implementations/crypto_aead/skinnyaeadtk3128128v1/opt32_2/encrypt.c
+++ b/skinny/Implementations/crypto_aead/skinnyaeadtk3128128v1/opt32_2/encrypt.c
--- a/skinny/Implementations/crypto_aead/skinnyaeadtk3128128v1/opt32_2/skinny128.c
+++ b/skinny/Implementations/crypto_aead/skinnyaeadtk3128128v1/opt32_2/skinny128.c
--- a/skinny/Implementations/crypto_aead/skinnyaeadtk3128128v1/opt32_2/skinny128.h
+++ b/skinny/Implementations/crypto_aead/skinnyaeadtk3128128v1/opt32_2/skinny128.h
--- a/skinny/Implementations/crypto_aead/skinnyaeadtk3128128v1/opt32_2/skinnyaead.h
+++ b/skinny/Implementations/crypto_aead/skinnyaeadtk3128128v1/opt32_2/skinnyaead.h
--- a/skinny/Implementations/crypto_aead/skinnyaeadtk3128128v1/opt32_2/tk_schedule.c
+++ b/skinny/Implementations/crypto_aead/skinnyaeadtk3128128v1/opt32_2/tk_schedule.c
--- a/skinny/Implementations/crypto_aead/skinnyaeadtk3128128v1/opt32_2/tk_schedule.h
+++ b/skinny/Implementations/crypto_aead/skinnyaeadtk3128128v1/opt32_2/tk_schedule.h