Commit 04be7d51 by Enrico Pozzobon

Merge branch 'email-submissions'

parents c5473c21 6ec7ff92
#define CRYPTO_KEYBYTES 16
#define CRYPTO_NSECBYTES 0
#define CRYPTO_NPUBBYTES 16
#define CRYPTO_ABYTES 16
#define CRYPTO_NOOVERLAP 1
#ifndef __BYTES_UTILS_H__
#define __BYTES_UTILS_H__
#include <stdio.h>
#include <stdint.h>
#include <string.h>
//#ifndef bytes_utiles_printf
//#define bytes_utiles_printf printf
//#endif
#ifndef bytes_utiles_printf
#define bytes_utiles_printf printf
#endif
//replace 0 by .
// Print one byte as two hex nibbles, showing '.' for a zero nibble so
// that differences stand out, then print the separator string `sep`.
static void print_diff_byte(uint8_t d, const char *sep){
    unsigned int hi = (d >> 4) & 0xF;
    unsigned int lo = d & 0xF;
    if (hi) bytes_utiles_printf("%X", hi); else bytes_utiles_printf(".");
    if (lo) bytes_utiles_printf("%X", lo); else bytes_utiles_printf(".");
    bytes_utiles_printf("%s", sep);
}
// Print `msg`, then `size` bytes of `vbuf` in diff notation (zero
// nibbles shown as '.'), separated by `sep`, terminated by `m2`.
static void print_diff_bytes_sep(const char *msg,const void *vbuf, unsigned int size, const char *m2, const char *sep){
    const uint8_t *bytes = (const uint8_t*)vbuf;
    bytes_utiles_printf("%s", msg);
    for (unsigned int i = 0; i < size; i++) {
        // no separator after the final byte
        print_diff_byte(bytes[i], (i + 1 < size) ? sep : "");
    }
    bytes_utiles_printf("%s", m2);
}
// Print `msg`, then `size` bytes of `vbuf` as uppercase hex pairs
// separated by `sep`, terminated by `m2`.
static void print_bytes_sep(const char *msg,const void *vbuf, unsigned int size, const char *m2, const char *sep){
    const uint8_t *bytes = (const uint8_t*)vbuf;
    bytes_utiles_printf("%s", msg);
    for (unsigned int i = 0; i < size; i++) {
        bytes_utiles_printf("%02X", bytes[i]);
        // no separator after the final byte
        if (i + 1 < size) bytes_utiles_printf("%s", sep);
    }
    bytes_utiles_printf("%s", m2);
}
// Shorthand: print bytes with a single space between them.
static void print_bytes(const char *m,const void *buf, unsigned int size, const char *m2){
    print_bytes_sep(m, buf, size, m2, " ");
}
// Shorthand: print bytes followed by a newline.
static void println_bytes(const char *m,const void *buf, unsigned int size){
    print_bytes(m, buf, size, "\n");
}
// Print a 128-bit value as four 4-byte groups separated by '_',
// prefixed by `m` and terminated by `m2`.
static void print_128(const char *m, const uint8_t a[16], const char *m2){
    for (unsigned int group = 0; group < 4; group++) {
        const char *prefix = (group == 0) ? m : "";
        const char *suffix = (group == 3) ? m2 : "_";
        print_bytes_sep(prefix, a + 4 * group, 4, suffix, "");
    }
}
// Shorthand: print a 128-bit value followed by a newline.
static void println_128(const char m[], const uint8_t a[16]){
    print_128(m, a, "\n");
}
// XOR `size` bytes of `s` into `d` in place.
static void xor_bytes( uint8_t *d, const uint8_t *s, size_t size ){
    size_t i = 0;
    while (i < size) {
        d[i] ^= s[i];
        i++;
    }
}
// Map an ASCII hex digit to its value 0..15; return -1 for any other
// character.
static int hexdigit_value(char c){
    if (c >= '0' && c <= '9') return c - '0';
    if (c >= 'a' && c <= 'f') return c - 'a' + 10;
    if (c >= 'A' && c <= 'F') return c - 'A' + 10;
    return -1;
}
// Nonzero when `c` is an ASCII hex digit.
static int is_hexdigit(char c){
    return hexdigit_value(c) != -1;
}
// Convert an even run of hex digits into bytes; the first digit of each
// pair is the high nibble. At most min(dst_size, strlen(hexstr)/2)
// bytes are written. Returns the number of bytes written.
// NOTE(review): hexstr is assumed pre-validated (see cleanup_hexstr);
// a non-hex character would inject garbage nibbles because
// hexdigit_value()'s -1 is truncated to 0xFF.
static size_t hexstr_to_bytes(uint8_t *dst, size_t dst_size, const char *const hexstr){
    size_t len = strlen(hexstr); /* was unsigned int: truncated long inputs */
    if (dst_size > (len / 2))
        dst_size = (len / 2);
    memset(dst, 0, dst_size);
    for (size_t i = 0; i < dst_size * 2; i++) {
        unsigned int shift = 4 - 4 * (i & 1); /* high nibble first */
        char c = hexstr[i];
        uint8_t nibble = (uint8_t)hexdigit_value(c);
        dst[i / 2] |= (uint8_t)(nibble << shift);
    }
    return dst_size;
}
// Render nBytes of `bytes` as uppercase hex into `dst`.
// dst must hold at least 2*nBytes+1 chars; each sprintf call
// NUL-terminates, so the final string is terminated when nBytes > 0.
static void bytes_to_hexstr(char *dst,uint8_t *bytes, unsigned int nBytes){
    char *out = dst;
    for (unsigned int i = 0; i < nBytes; i++, out += 2) {
        sprintf(out, "%02X", bytes[i]);
    }
}
// Copy the hex digits of str[0..str_size) into hexstr, dropping every
// non-hex character and removing "0x"/"0X" prefixes (the leading '0'
// was already copied when the 'x' is seen, so cnt is rewound by one).
// hexstr and str may alias: user_hexstr_to_bytes() calls this in place.
// Returns the digit count; hexstr is always NUL-terminated on return.
// NOTE(review): on overflow this returns -1, which wraps to SIZE_MAX
// because the return type is size_t -- callers must treat values
// >= hexstr_size as errors; user_hexstr_to_bytes() currently does not.
static size_t cleanup_hexstr(char *hexstr, size_t hexstr_size, char *str, size_t str_size){
size_t cnt=0;
int lastIs0=0;
for(unsigned int j = 0;j<str_size;j++){
char c = str[j];
if(is_hexdigit(c)){
if(cnt==hexstr_size-1){//need final char for null.
printf("Too many hex digits. hexstr=%s\n",hexstr);
hexstr[cnt]=0;
return -1;
}
hexstr[cnt++]=c;
} else if(lastIs0) {
// non-digit right after '0': treat as an 0x/0X prefix, undo the '0'
if('x'==c) cnt--;
if('X'==c) cnt--;
}
lastIs0 = '0'==c;
}
hexstr[cnt]=0;
return cnt;
}
// Parse a user-supplied string (which may contain separators and 0x
// prefixes) into bytes. `str` is cleaned up IN PLACE, then converted.
// Returns the number of bytes written (at most out_size).
// NOTE(review): a cleanup_hexstr() overflow returns (size_t)-1, which is
// not detected here -- hexstr_size/2 then clamps to out_size and the
// truncated digits are silently converted; confirm callers size str
// so that overflow cannot happen.
static size_t user_hexstr_to_bytes(uint8_t*out, size_t out_size, char *str, size_t str_size){
size_t hexstr_size = cleanup_hexstr(str,str_size,str,str_size);
size_t conv_size = (hexstr_size/2) < out_size ? hexstr_size/2 : out_size;
return hexstr_to_bytes(out,conv_size,str);
}
// Reference otherwise-unused static helpers so that translation units
// including this header but using only part of it do not trigger
// -Wunused-function. Never called for effect.
static void bytes_utils_remove_unused_warnings(void){
(void)println_bytes;
(void)println_128;
(void)xor_bytes;
(void)bytes_to_hexstr;
(void)user_hexstr_to_bytes;
(void)print_diff_bytes_sep;
}
#endif
/**
DryGascon128
Sebastien Riou, January 27th 2019
c99 ref implementation meant to fit in the supercop framework
*/
#ifndef __DRYGASCON128_H__
#define __DRYGASCON128_H__
#define DRYSPONGE_DBG_EN 0
//#define DRYSPONGE_ACCUMULATE_SAFE_AND_SLOW
#define DRYSPONGE_KEYSIZE 16
#define DRYSPONGE_NONCESIZE 16
#define DRYSPONGE_BLOCKSIZE 16
#define DRYSPONGE_CAPACITYSIZE (5*64/8)
#define DRYSPONGE_XSIZE (4*32/8)
//remove one round because Mix does 1 round merely for processing the upper
//2 bits of the domain separator (because 128+4 mod 10 is 2)
#define DRYSPONGE_INIT_ROUNDS (12-1)
#define DRYSPONGE_ROUNDS (8-1)
#define DRYSPONGE_ACCUMULATE_FACTOR 2
#define DRYSPONGE_MPR_INPUT_WIDTH 10
#include "drygascon_le32.h"
#endif
/**
DryGascon_le32
Sebastien Riou, January 6th 2019
c99 little endian 32 bit implementation meant to fit in the supercop framework
*/
#ifndef __DRYGASCON_H__
#define __DRYGASCON_H__
#include <stdint.h>
typedef uint64_t DRYSPONGE_EXT_t;
#define DRYSPONGE_EXT
#include "drysponge_common.h"
//input width for one round of MixPhaseRound
#define DRYSPONGE_MPR_INPUT_MASK ((((uint64_t)1)<<DRYSPONGE_MPR_INPUT_WIDTH)-1)
#define DRYSPONGE_MPR_ROUNDS DRYSPONGE_DIVUP((DRYSPONGE_BLOCKSIZE*8)+4,DRYSPONGE_MPR_INPUT_WIDTH)
#if (DRYSPONGE_MPR_ROUNDS*DRYSPONGE_MPR_INPUT_WIDTH-4)<(DRYSPONGE_BLOCKSIZE*8)
#error "(DRYSPONGE_MPR_ROUNDS*DRYSPONGE_MPR_INPUT_WIDTH-4)<(DRYSPONGE_BLOCKSIZE*8)"
#endif
#if DRYSPONGE_XSIZE32>16
#error "DRYSPONGE_XSIZE32>16"
#endif
#if DRYSPONGE_XSIZE32 == 4
#define DRYSPONGE_X_IDX_WIDTH 2
#endif
#if DRYSPONGE_MPR_INPUT_WIDTH == 10
#define DRYSPONGE_RANK_BYTES 2
typedef uint32_t permut_rank_t;
#endif
#if DRYSPONGE_MPR_INPUT_WIDTH == 18
#define DRYSPONGE_RANK_BYTES 3
typedef uint32_t permut_rank_t;
#endif
#define DRYSPONGE_X_IDX_MASK ((1<<DRYSPONGE_X_IDX_WIDTH)-1)
#ifndef DRYSPONGE_OPT_F
// Record the domain-separation bits in *ext, pre-shifted so that the
// final DRYSPONGE_MixPhaseRound can XOR them above the last
// (BLOCKSIZE*8)%MPR_INPUT_WIDTH message bits of the block.
DRYSPONGE_FUNC void DRYSPONGE_DomainSeparator(
DRYSPONGE_EXT_t *const ext,
unsigned int dsinfo
){
*ext = dsinfo;
*ext = *ext<<((DRYSPONGE_BLOCKSIZE*8)%DRYSPONGE_MPR_INPUT_WIDTH);
}
// One round of the mix phase: extract DRYSPONGE_MPR_INPUT_WIDTH bits of
// the input block starting at bit `bitidx`, XOR in pending
// domain-separation bits (`ext`), then use the resulting rank to select,
// for each capacity word, which 32-bit word of x is XORed into it.
// NOTE(review): x64 is read through a uint32_t* cast and bytes of r are
// filled through a uint8_t* cast -- both rely on the little-endian
// target this implementation is declared for (see file header).
DRYSPONGE_FUNC void DRYSPONGE_MixPhaseRound(
DRYSPONGE_EXT_t ext,
uint64_t *const c64,
uint64_t *const x64,
const uint8_t *const in,
unsigned int bitidx,
unsigned int insize
){
uint32_t *const x32 = (uint32_t*const)x64;
unsigned int bi = bitidx/8;
unsigned int shift = bitidx%8;
permut_rank_t r=0;
uint8_t *rb = (uint8_t*)&r;
// gather up to DRYSPONGE_RANK_BYTES bytes, stopping at end of input
for(unsigned int i=0;i<DRYSPONGE_RANK_BYTES;i++){
if(bi+i==insize) break;
rb[i]=in[bi+i];
}
r = (r>>shift) & DRYSPONGE_MPR_INPUT_MASK;
r^=ext;
// consume DRYSPONGE_X_IDX_WIDTH bits of the rank per capacity word;
// the XOR affects the low 32 bits of each 64-bit capacity word
for(unsigned int j=0;j<DRYSPONGE_CAPACITYSIZE64;j++){
unsigned int i = r & DRYSPONGE_X_IDX_MASK;
r = r >> DRYSPONGE_X_IDX_WIDTH;
c64[j]^=x32[i];
}
}
#endif
struct DRYSPONGE_struct_t;
typedef struct DRYSPONGE_struct_t DRYSPONGE_t ;
DRYSPONGE_FUNC void DRYSPONGE_MixPhase(
DRYSPONGE_t *const ctx,
const uint8_t *const in
);
DRYSPONGE_FUNC void DRYSPONGE_CoreRound(
DRYSPONGE_t *const ctx,
unsigned int r
);
#include "drysponge_le32.h"
#ifndef DRYSPONGE_OPT_F
// Absorb one padded input block into the capacity ("mix phase").
// The block is consumed DRYSPONGE_MPR_INPUT_WIDTH bits at a time; each
// chunk (combined with pending domain-separation bits when applicable)
// drives one DRYSPONGE_MixPhaseRound followed by one core round. The
// final MixPhaseRound is not followed by a round here: DRYSPONGE_g()
// runs next.
// Fix: debug printfs used %lu for `unsigned int` values (undefined
// behavior per the C format-string rules); now %u with explicit casts
// where the operand is a macro integer expression.
DRYSPONGE_FUNC void DRYSPONGE_MixPhase(
    DRYSPONGE_t *const ctx,
    const uint8_t *const in
){
    unsigned int bitidx = 0;
#if DRYSPONGE_MPR_ROUNDS > 1
    for (unsigned int i = 0; i < DRYSPONGE_MPR_ROUNDS - 1; i++) {
#if DRYSPONGE_DBG_EN >= 4
        printf("Mix phase MixPhaseRound entry %u:\n", i); /* was %lu */
        DRYSPONGE_print_state(ctx);
#endif
        DRYSPONGE_EXT_t ext = 0;
#if ((DRYSPONGE_MPR_ROUNDS-1)*(DRYSPONGE_MPR_INPUT_WIDTH))>(DRYSPONGE_BLOCKSIZE*8)
        if ((ctx->ext) && (i == (DRYSPONGE_MPR_ROUNDS - 2))) {
            // DS info is split across this round and the last one
            ext = ctx->ext;
            ctx->ext = ctx->ext >> ((DRYSPONGE_BLOCKSIZE*8)%DRYSPONGE_MPR_INPUT_WIDTH);
            ctx->ext = ctx->ext >> ((((DRYSPONGE_MPR_ROUNDS-1)*DRYSPONGE_MPR_INPUT_WIDTH))-(DRYSPONGE_BLOCKSIZE*8));
        }
#endif
        DRYSPONGE_MixPhaseRound(ext, ctx->c, ctx->x, in, bitidx, DRYSPONGE_BLOCKSIZE);
        bitidx += DRYSPONGE_MPR_INPUT_WIDTH;
#if DRYSPONGE_DBG_EN >= 4
        printf("Mix phase CoreRound entry %u:\n", i); /* was %lu */
        DRYSPONGE_print_state(ctx);
#endif
        DRYSPONGE_CoreRound(ctx, 0);
    }
#endif
#if DRYSPONGE_DBG_EN >= 4
    printf("Mix phase MixPhaseRound entry %u:\n", (unsigned)(DRYSPONGE_MPR_ROUNDS - 1)); /* was %lu */
    DRYSPONGE_print_state(ctx);
#endif
    DRYSPONGE_MixPhaseRound(ctx->ext, ctx->c, ctx->x, in, bitidx, DRYSPONGE_BLOCKSIZE);
    ctx->ext = 0; // domain-separation bits are consumed exactly once
}
#endif
//#ifndef DRYSPONGE_OPT_G //keep for now, needed for key init
// Bit-sliced S-box layer over nw 64-bit words (Ascon-style chi layer
// generalized to nw words): input mixing of even-indexed words, the
// nonlinear step t[i] = ~x[i] & x[i+1], output mixing, and a final
// inversion of the middle word.
// Scratch array t is sized for the largest supported state.
DRYSPONGE_FUNC void gascon_sboxes(uint64_t * const x, unsigned int nw){
uint64_t t[DRYSPONGE_CAPACITYSIZE64];
const unsigned int mid = nw/2;
// x[2i] ^= x[2i-1 mod nw]
for(unsigned int i=0;i<mid+1;i++){
unsigned int dst = 2*i;
unsigned int src = (nw+dst-1) % nw;
x[dst] ^= x[src];
}
// chi nonlinearity: t[i] = ~x[i] & x[i+1 mod nw]
for(unsigned int i=0;i<nw;i++){
t[i] = (x[i] ^ 0xFFFFFFFFFFFFFFFFull) & x[(i+1)%nw];
}
for(unsigned int i=0;i<nw;i++){
x[i] ^= t[(i+1)%nw];
}
// x[2i+1 mod nw] ^= x[2i]
for(unsigned int i=0;i<mid+1;i++){
unsigned int src = 2*i;
unsigned int dst = (src+1) % nw;
x[dst] ^= x[src];
}
x[mid] ^= 0XFFFFFFFFFFFFFFFFull;
}
// Rotate a 64-bit word right by `shift` when it is stored bit-interleaved
// as two 32-bit halves: an even shift rotates both halves by shift/2; an
// odd shift additionally swaps the halves, rotating one of them by an
// extra position. memcpy is used to avoid aliasing violations.
DRYSPONGE_FUNC uint64_t gascon_rotr64_interleaved(uint64_t in, unsigned int shift){
uint32_t i[2];
memcpy(i,&in,sizeof(i));
unsigned int shift2 = shift/2;
if(shift & 1){
uint32_t tmp = DRYSPONGE_ROTR32(i[1],shift2);
i[1] = DRYSPONGE_ROTR32(i[0],(shift2+1)%32);
i[0] = tmp;
}else{
i[0] = DRYSPONGE_ROTR32(i[0],shift2);
i[1] = DRYSPONGE_ROTR32(i[1],shift2);
}
uint64_t out;
memcpy(&out,i,sizeof(i));
return out;
}
// XOR the round constant ((0xf - r) << 4) | r into the middle capacity
// word (Ascon-style constant schedule).
// Fix: the original local `unsigned int rounds = 12;` made
// `12 - rounds + round` identically `round` -- the dead variable and
// the no-op arithmetic are removed; behavior is unchanged.
DRYSPONGE_FUNC void gascon_add_cst(uint64_t* x, unsigned int round) {
    const unsigned int mid = DRYSPONGE_CAPACITYSIZE64 / 2;
    const unsigned int r = round;
    // addition of round constant
    x[mid] ^= ((0xfull - r) << 4) | r;
}
// Linear diffusion layer: each word is XORed with two rotations of
// itself; the rotation pairs are the Gascon constants and the rotations
// are performed on the bit-interleaved representation.
DRYSPONGE_FUNC void gascon_lin_layer(uint64_t* x) {
// linear diffusion layer
x[0] ^= gascon_rotr64_interleaved(x[0], 19) ^ gascon_rotr64_interleaved(x[0], 28);
x[1] ^= gascon_rotr64_interleaved(x[1], 61) ^ gascon_rotr64_interleaved(x[1], 38);
x[2] ^= gascon_rotr64_interleaved(x[2], 1) ^ gascon_rotr64_interleaved(x[2], 6);
x[3] ^= gascon_rotr64_interleaved(x[3], 10) ^ gascon_rotr64_interleaved(x[3], 17);
x[4] ^= gascon_rotr64_interleaved(x[4], 7) ^ gascon_rotr64_interleaved(x[4], 40);
#if DRYSPONGE_CAPACITYSIZE64 > 5
// wider-state variants diffuse four additional words
x[5] ^= gascon_rotr64_interleaved(x[5], 31) ^ gascon_rotr64_interleaved(x[5], 26);
x[6] ^= gascon_rotr64_interleaved(x[6], 53) ^ gascon_rotr64_interleaved(x[6], 58);
x[7] ^= gascon_rotr64_interleaved(x[7], 9) ^ gascon_rotr64_interleaved(x[7], 46);
x[8] ^= gascon_rotr64_interleaved(x[8], 43) ^ gascon_rotr64_interleaved(x[8], 50);
#endif
}
// One full Gascon permutation round: round-constant addition, S-box
// layer, linear diffusion. The (void) cast references DRYSPONGE_rotr64
// so it does not trigger -Wunused-function in this configuration.
DRYSPONGE_FUNC void gascon_permutation_round(uint64_t* S, unsigned int round) {
(void)DRYSPONGE_rotr64;
// addition of round constant
gascon_add_cst(S, round);
// substitution layer
gascon_sboxes(S,DRYSPONGE_CAPACITYSIZE64);
// linear diffusion layer
gascon_lin_layer(S);
}
// Run permutation round `r` on the capacity (generic fallback used when
// no optimized F routine is configured).
DRYSPONGE_FUNC void DRYSPONGE_CoreRound(
DRYSPONGE_t *const ctx,
unsigned int r
){
gascon_permutation_round(ctx->c, r);
}
#endif
#ifndef __DRYSPONGE_COMMON_H__
#define __DRYSPONGE_COMMON_H__
#ifndef DRYSPONGE_FUNC
#define DRYSPONGE_FUNC inline static
#endif
//convention:
// width means length in bits
// size means length in bytes
#include <stdint.h>
#include <string.h>
#include <assert.h>
#if DRYSPONGE_DBG_EN
#include "bytes_utils.h"
#endif
#define DRYSPONGE_PASS 0
#define DRYSPONGE_DS 2
#define DRYSPONGE_DD 1
#define DRYSPONGE_DA 2
#define DRYSPONGE_DM 3
#define DRYSPONGE_STATESIZE (DRYSPONGE_CAPACITYSIZE+DRYSPONGE_BLOCKSIZE)
#define DRYSPONGE_DIGESTSIZE (DRYSPONGE_KEYSIZE*2)
#define DRYSPONGE_TAGSIZE DRYSPONGE_KEYSIZE
#define DRYSPONGE_KEYMAXSIZE (DRYSPONGE_CAPACITYSIZE+DRYSPONGE_XSIZE)
#define DRYSPONGE_DIVUP(a,b) (((a)+(b)-1)/(b))
#define DRYSPONGE_ROTR32(x,n) (0xFFFFFFFF & (((x)>>(n))|((x)<<(0x1F & (32-(n))))))
#define DRYSPONGE_ROTR64(x,n) (0xFFFFFFFFFFFFFFFF & (((x)>>(n))|((x)<<(0x3F & (64-(n))))))
#define DRYSPONGE_STATESIZE32 DRYSPONGE_DIVUP(DRYSPONGE_STATESIZE,4)
#define DRYSPONGE_CE_SIZE32 DRYSPONGE_DIVUP(DRYSPONGE_CE_SIZE,4)
#define DRYSPONGE_BLOCKSIZE32 DRYSPONGE_DIVUP(DRYSPONGE_BLOCKSIZE,4)
#define DRYSPONGE_CAPACITYSIZE32 DRYSPONGE_DIVUP(DRYSPONGE_CAPACITYSIZE,4)
#define DRYSPONGE_XSIZE32 DRYSPONGE_DIVUP(DRYSPONGE_XSIZE,4)
#define DRYSPONGE_KEYSIZE32 DRYSPONGE_DIVUP(DRYSPONGE_KEYSIZE,4)
#define DRYSPONGE_STATESIZE64 DRYSPONGE_DIVUP(DRYSPONGE_STATESIZE,8)
#define DRYSPONGE_CE_SIZE64 DRYSPONGE_DIVUP(DRYSPONGE_CE_SIZE,8)
#define DRYSPONGE_BLOCKSIZE64 DRYSPONGE_DIVUP(DRYSPONGE_BLOCKSIZE,8)
#define DRYSPONGE_CAPACITYSIZE64 DRYSPONGE_DIVUP(DRYSPONGE_CAPACITYSIZE,8)
#define DRYSPONGE_XSIZE64 DRYSPONGE_DIVUP(DRYSPONGE_XSIZE,8)
#define DRYSPONGE_KEYSIZE64 DRYSPONGE_DIVUP(DRYSPONGE_KEYSIZE,8)
#define DRYSPONGE_TAGSIZE64 DRYSPONGE_DIVUP(DRYSPONGE_TAGSIZE,8)
#define DRYSPONGE_KEYMAXSIZE64 DRYSPONGE_DIVUP(DRYSPONGE_KEYMAXSIZE,8)
#define DRYSPONGE_NONCESIZE64 DRYSPONGE_DIVUP(DRYSPONGE_NONCESIZE,8)
#if DRYSPONGE_NONCESIZE < 12
#error "DRYSPONGE_NONCESIZE < 12"
#endif
#if DRYSPONGE_KEYSIZE < 16
#error "DRYSPONGE_KEYSIZE < 16"
#endif
#if DRYSPONGE_DIGESTSIZE < 2*DRYSPONGE_KEYSIZE
#error "DRYSPONGE_DIGESTSIZE < 2*DRYSPONGE_KEYSIZE"
#endif
#if DRYSPONGE_ACCUMULATE_FACTOR > ((DRYSPONGE_CAPACITYSIZE/4)/DRYSPONGE_BLOCKSIZE32)
#error "DRYSPONGE_ACCUMULATE_FACTOR > ((DRYSPONGE_CAPACITYSIZE/4)/DRYSPONGE_BLOCKSIZE32)"
#endif
#ifdef DRYSPONGE_EXT
#define DRYSPONGE_EXT_ARG (&(ctx->ext))
#else
#define DRYSPONGE_EXT_ARG 0
#endif
// Pack domain-separation info: bit0 = padded flag, bit1 = finalize flag,
// bits 2+ = domain identifier (DRYSPONGE_DD / DA / DM / DS).
// Fix: the debug printf used %d for the unsigned arguments padded and
// finalize (format/argument mismatch); all three now use %u.
DRYSPONGE_FUNC unsigned int DRYSPONGE_DSINFO(unsigned int padded, unsigned int domain, unsigned int finalize){
#if DRYSPONGE_DBG_EN
    bytes_utiles_printf(" Adding DS: padded=%u, domain=%u, finalize=%u\n",padded,domain,finalize);
#endif
    return padded+(finalize<<1)+(domain<<2);
}
// Rotate x right by n bits, n < 32. The underlying macro masks the left
// shift count with 0x1F, so n == 0 is safe (no shift by 32).
DRYSPONGE_FUNC uint32_t DRYSPONGE_rotr32(uint32_t x, unsigned int n){
assert(n<32);
return DRYSPONGE_ROTR32(x,n);
}
// Rotate x right by n bits, n < 64. The underlying macro masks the left
// shift count with 0x3F, so n == 0 is safe (no shift by 64).
DRYSPONGE_FUNC uint64_t DRYSPONGE_rotr64(uint64_t x, unsigned int n){
assert(n<64);
return DRYSPONGE_ROTR64(x,n);
}
// y = a XOR b over exactly one block (DRYSPONGE_BLOCKSIZE bytes),
// byte-wise so no alignment is required.
DRYSPONGE_FUNC void DRYSPONGE_xor(
    const uint8_t *const a,//exactly one block of input
    const uint8_t *const b,
    uint8_t *const y
){
    unsigned int i = 0;
    while (i < DRYSPONGE_BLOCKSIZE) {
        y[i] = a[i] ^ b[i];
        i++;
    }
}
// Little-endian load of 2 bytes into *x.
DRYSPONGE_FUNC void DRYSPONGE_load16(uint16_t* x, const uint8_t*const in) {
    uint16_t lo = in[0];
    uint16_t hi = in[1];
    *x = (uint16_t)(lo | (hi << 8));
}
// Little-endian load of 4 bytes into *x.
DRYSPONGE_FUNC void DRYSPONGE_load32(uint32_t* x, const uint8_t*const in) {
    uint32_t v = 0;
    for (int i = 3; i >= 0; i--) {
        v = (v << 8) | in[i];
    }
    *x = v;
}
// Little-endian store of x into out[0..3].
DRYSPONGE_FUNC void DRYSPONGE_store32(uint8_t* out, uint32_t x) {
    out[0] = (uint8_t)(x);
    out[1] = (uint8_t)(x >> 8);
    out[2] = (uint8_t)(x >> 16);
    out[3] = (uint8_t)(x >> 24);
}
// Little-endian load of 8 bytes into *x.
// Fix: `in` is now const, consistent with DRYSPONGE_load16/load32
// (backward compatible -- callers may still pass non-const buffers).
DRYSPONGE_FUNC void DRYSPONGE_load64(uint64_t* x, const uint8_t* in) {
    *x = 0;
    for (unsigned int i = 0; i < 8; i++) {
        uint64_t b = in[i];
        *x = *x | (b << (8 * i));
    }
}
// Little-endian store of x into out[0..7]. The (void) casts reference
// helpers unused in some configurations to keep -Wunused-function quiet.
DRYSPONGE_FUNC void DRYSPONGE_store64(uint8_t* out, uint64_t x) {
    (void)DRYSPONGE_rotr32;
    (void)DRYSPONGE_load16;
    (void)DRYSPONGE_store32;
    for (unsigned int i = 0; i < 8; i++, x >>= 8) {
        out[i] = (uint8_t)x;
    }
}
#endif
#ifndef __DRYSPONGE_DBG_SUPPORT_H__
#define __DRYSPONGE_DBG_SUPPORT_H__
#define DRYSPONGE_DBG_NONE 0
#define DRYSPONGE_DBG_ALG_IO 1
#define DRYSPONGE_DBG_F_IO 2
#define DRYSPONGE_DBG_ROUND_IO 3
#define DRYSPONGE_DBG_FULL 4
#if DRYSPONGE_DBG_EN
#define DRYSPONGE_DBG(a) a;
#else
#define DRYSPONGE_DBG(a)
#endif
#if DRYSPONGE_DBG_EN
#include <assert.h>
#include <stdio.h>
#include "bytes_utils.h"
// Debug-only helper: dump the capacity C, the x words and the rate R in
// hex, wrapping at `linesize` bytes per printed line.
static void DRYSPONGE_print_state(
DRYSPONGE_t *const ctx
){
// reference unused helpers so -Wunused-function stays quiet
(void)xor_bytes;
(void)println_128;
(void)bytes_utils_remove_unused_warnings;
unsigned int linesize = 32;
if(linesize<DRYSPONGE_BLOCKSIZE) linesize = DRYSPONGE_BLOCKSIZE;
unsigned int remaining = DRYSPONGE_CAPACITYSIZE;
const uint8_t*const c = (const uint8_t*const)ctx->c;
for(unsigned int i=0;i<DRYSPONGE_DIVUP(DRYSPONGE_CAPACITYSIZE,linesize);i++){
bytes_utiles_printf( "  C[%2u] = ",i);
unsigned int len = linesize < remaining ? linesize : remaining;
print_bytes_sep("",c+i*linesize,len,"\n","");
remaining -= len;
}
remaining = DRYSPONGE_XSIZE;
const uint8_t*const x = (const uint8_t*const)ctx->x;
for(unsigned int i=0;i<DRYSPONGE_DIVUP(DRYSPONGE_XSIZE,linesize);i++){
bytes_utiles_printf( "  X[%2u] = ",i);
unsigned int len = linesize < remaining ? linesize : remaining;
print_bytes_sep("",x+i*linesize,len,"\n","");
remaining -= len;
}
print_bytes_sep("  R     = ",ctx->r,DRYSPONGE_BLOCKSIZE,"\n","");
}
#endif
#endif
/**
DrySponge
Sebastien Riou, January 6th 2019
c99 little endian 32 bit implementation meant to fit in the supercop framework
Note: although this is faster than the ref implementation we noticed that it is
still several times slower compared to what can be done with assembly.
*/
#ifndef __DRYSPONGE_H__
#define __DRYSPONGE_H__
#include "drysponge_common.h"
//assume 32-bit alignment is enough to access uint64_t since we target 32-bit CPUs
#define ALIGN64 4
//#define ALIGN64 8
#ifndef DRYSPONGE_DBG_EN
#define DRYSPONGE_DBG_EN 0
#endif
// DrySponge state.
typedef struct DRYSPONGE_struct_t {
uint64_t c[DRYSPONGE_CAPACITYSIZE64]; // capacity (permutation state)
uint64_t r[DRYSPONGE_BLOCKSIZE64]; // rate: accumulated output block
uint64_t x[DRYSPONGE_XSIZE64]; // key-derived words used by the mix phase
uint8_t *obuf; // current output write pointer
uint64_t fcnt; // F/G invocation counter (debug traces only)
#ifdef DRYSPONGE_EXT
DRYSPONGE_EXT_t ext; // pending domain-separation bits
#endif
unsigned int rounds; // permutation rounds per G call
} DRYSPONGE_t;
#include "drysponge_dbg_support.h"
// y = a XOR b over exactly one block, processed as 64-bit words
// (requires suitably aligned buffers).
static void DRYSPONGE_xor64(
    const uint64_t *const a,//exactly one block of input
    const uint64_t *const b,
    uint64_t *const y
){
    unsigned int w = 0;
    while (w < DRYSPONGE_BLOCKSIZE64) {
        y[w] = a[w] ^ b[w];
        w++;
    }
}
//static void DRYSPONGE_xor32(
// const uint32_t *const a,//exactly one block of input
// const uint32_t *const b,
// uint32_t *const y
//){
// for(unsigned int i=0;i<DRYSPONGE_BLOCKSIZE32;i++){
// y[i] = a[i] ^ b[i];
// }
//}
#ifdef DRYSPONGE_OPT_G
// Optimized G: delegate to the external routine bound to
// DRYSPONGE_OPT_G (declared below as drygascon128_g).
void drygascon128_g(uint64_t* x, uint32_t rounds);
static void DRYSPONGE_g(
DRYSPONGE_t *const ctx
){
DRYSPONGE_OPT_G((uint64_t*)&(ctx->c),ctx->rounds);
}
#else
// The round function G: zero the rate r, then run ctx->rounds core
// permutation rounds, after each one accumulating (squeezing) capacity
// material into r: DRYSPONGE_ACCUMULATE_FACTOR block-sized chunks of c,
// each rotated by k 32-bit words before the XOR.
// Fix: debug printfs used %lu for the uint64_t fcnt (wrong on 32-bit and
// LLP64 targets) and %d for the unsigned loop counter j.
static void DRYSPONGE_g(
    DRYSPONGE_t *const ctx
){
#if DRYSPONGE_DBG_EN
    printf("   G entry %llu:\n",(unsigned long long)ctx->fcnt); /* was %lu */
    DRYSPONGE_print_state(ctx);
#endif
    ctx->fcnt++;
    DRYSPONGE_xor64(ctx->r,ctx->r,ctx->r);//r=0
    for(unsigned int j = 0;j<ctx->rounds;j++){
#if DRYSPONGE_DBG_EN >= DRYSPONGE_DBG_ROUND_IO
        printf("     CoreRound entry %u:\n",j); /* was %d */
        DRYSPONGE_print_state(ctx);
#endif
        DRYSPONGE_CoreRound(ctx,j);
        // accumulate rotated block-sized chunks of the capacity into r
        uint32_t r32[DRYSPONGE_BLOCKSIZE32];
        uint32_t cpart[DRYSPONGE_BLOCKSIZE32];
        memcpy(r32,ctx->r,sizeof(r32));
        for(unsigned int k=0;k<DRYSPONGE_ACCUMULATE_FACTOR;k++){
            memcpy(cpart,ctx->c+k*DRYSPONGE_BLOCKSIZE64,sizeof(cpart));
            for(unsigned int i=0;i<DRYSPONGE_BLOCKSIZE32;i++){
                r32[i]^=cpart[(i+k)%DRYSPONGE_BLOCKSIZE32];
            }
        }
        memcpy(ctx->r,r32,sizeof(r32));
    }
}
#endif
#ifdef DRYSPONGE_OPT_F
// OPT_F variant: the optimized F consumes the raw DS bits itself, so no
// pre-shift is applied (contrast with the generic DomainSeparator).
static void DRYSPONGE_DomainSeparator(
DRYSPONGE_EXT_t *const ext,
unsigned int dsinfo
){
*ext = dsinfo;
}
// Optimized F: delegate to the external routine bound to
// DRYSPONGE_OPT_F; pending DS bits are passed (truncated to 32 bits)
// and cleared, matching the consume-once contract of the generic path.
void drygascon128_f(uint64_t* x, uint32_t*in,uint32_t ds,uint32_t rounds);
static void DRYSPONGE_f(
DRYSPONGE_t *const ctx,
const uint8_t *const i
){
DRYSPONGE_OPT_F((uint64_t*)&(ctx->c),(uint32_t*)i,(uint32_t)ctx->ext,ctx->rounds);
ctx->ext=0;
}
#else
// F = MixPhase (absorb one block `i`) followed by G (permute + squeeze).
// Fix: the debug printf used %lu for the uint64_t fcnt, which is wrong
// on 32-bit and LLP64 targets; now %llu with an explicit widening cast.
static void DRYSPONGE_f(
    DRYSPONGE_t *const ctx,
    const uint8_t *const i
){
#if DRYSPONGE_DBG_EN
    printf("  F entry %llu:\n",(unsigned long long)ctx->fcnt); /* was %lu */
    DRYSPONGE_print_state(ctx);
    print_bytes_sep("  I     = ",i,DRYSPONGE_BLOCKSIZE,"\n","");
#endif
    DRYSPONGE_MixPhase(ctx,i);
#if DRYSPONGE_DBG_EN >= DRYSPONGE_DBG_ROUND_IO
    printf("   After mix phase:\n");
    DRYSPONGE_print_state(ctx);
#endif
    DRYSPONGE_g(ctx);
}
#endif
// Load the key into the sponge state. Three key lengths are supported:
// - CAPACITYSIZE+XSIZE: c and x loaded directly from the key;
// - KEYSIZE+XSIZE: c filled by repeating the 16-byte key, x from the
//   tail (the caller guarantees the x words are pairwise distinct);
// - KEYSIZE: c filled by repeating the key, then permutation rounds
//   derive x (rerun until its words are pairwise distinct), and c's
//   first XSIZE bytes are finally restored from the key.
static void DRYSPONGE_set_key(
DRYSPONGE_t *const ctx,
const uint8_t *const key,
const unsigned int keylen
){
assert(DRYSPONGE_KEYSIZE<=keylen);
const unsigned int midkeysize = DRYSPONGE_KEYSIZE+DRYSPONGE_XSIZE;
const unsigned int fullkeysize = DRYSPONGE_CAPACITYSIZE+DRYSPONGE_XSIZE;
if(DRYSPONGE_KEYSIZE!=keylen){//all words for x assumed to be different
if(fullkeysize == keylen){
memcpy(ctx->c,key,DRYSPONGE_CAPACITYSIZE);
memcpy(ctx->x,key+DRYSPONGE_CAPACITYSIZE,DRYSPONGE_XSIZE);
} else {
uint8_t c[DRYSPONGE_CAPACITYSIZE];
uint8_t x[DRYSPONGE_XSIZE];
assert(midkeysize==keylen);
// fill the capacity by repeating the key
for(unsigned int i=0;i<DRYSPONGE_CAPACITYSIZE;i++){
c[i] = key[i%DRYSPONGE_KEYSIZE];
}
for(unsigned int i=0;i<DRYSPONGE_XSIZE;i++){
x[i] = key[DRYSPONGE_KEYSIZE+i];
}
memcpy(ctx->c,c,DRYSPONGE_CAPACITYSIZE);
memcpy(ctx->x,x,DRYSPONGE_XSIZE);
}
}else{
uint8_t c[DRYSPONGE_CAPACITYSIZE];
for(unsigned int i=0;i<DRYSPONGE_CAPACITYSIZE;i++){
c[i] = key[i%DRYSPONGE_KEYSIZE];
}
memcpy(ctx->c,c,DRYSPONGE_CAPACITYSIZE);
DRYSPONGE_CoreRound(ctx,0);
//need to fixup x such that all words are different
unsigned int modified=1;
while(modified){
uint32_t c32[DRYSPONGE_CAPACITYSIZE32];
memcpy(c32,ctx->c,DRYSPONGE_CAPACITYSIZE);
modified=0;
// rerun a round until the first XSIZE32 words are pairwise distinct
for(unsigned int i=0;i<DRYSPONGE_XSIZE32-1;i++){
for(unsigned int j=i+1;j<DRYSPONGE_XSIZE32;j++){
if(c32[i]==c32[j]){
DRYSPONGE_CoreRound(ctx,0);
modified=1;
break;
}
}
if(modified) break;
}
}
memcpy(ctx->x,ctx->c,DRYSPONGE_XSIZE);
memcpy(ctx->c,key,DRYSPONGE_XSIZE);
}
uint32_t x32[DRYSPONGE_XSIZE32];// = (uint32_t *const)ctx->x;
memcpy(x32,ctx->x,DRYSPONGE_XSIZE);
//sanity check: all words in x shall be different
for(unsigned int i=0;i<DRYSPONGE_XSIZE32-1;i++){
for(unsigned int j=i+1;j<DRYSPONGE_XSIZE32;j++){
assert(x32[i]!=x32[j]);
}
}
}
// Copy up to one block from ib into ob; when iblen < BLOCKSIZE, append
// the 0x01 marker and zero-fill the remainder.
// Returns 1 when padding was applied, 0 for a full block.
static unsigned int DRYSPONGE_padding(
    const uint8_t *const ib,//one block of input or less
    uintptr_t iblen,
    uint8_t *const ob//exactly one block
){
    assert(iblen<=DRYSPONGE_BLOCKSIZE);
    memcpy(ob, ib, iblen);
    if (iblen >= DRYSPONGE_BLOCKSIZE) {
        return 0; // full block: nothing to pad
    }
    ob[iblen] = 0x01;
    // zero-length memset is well defined when iblen+1 == BLOCKSIZE
    memset(ob + iblen + 1, 0, DRYSPONGE_BLOCKSIZE - iblen - 1);
    return 1;
}
// Absorb `alen` bytes in domain `ds`: all full blocks through F, then a
// padded final block carrying the domain-separation info. With
// alen == 0 a single padded empty block is still absorbed.
static void DRYSPONGE_absorb_only(
DRYSPONGE_t *const ctx,
const uint8_t *const ad,
size_t alen,
unsigned int ds,
unsigned int finalize
){
const uint8_t *iad = ad;
size_t a = (alen + DRYSPONGE_BLOCKSIZE - 1) / DRYSPONGE_BLOCKSIZE; // block count
if(a){
for(size_t i = 0; i<a-1; i++){//process all blocks except last one
DRYSPONGE_f(ctx,iad);
iad+=DRYSPONGE_BLOCKSIZE;
}
}
uint8_t last_block[DRYSPONGE_BLOCKSIZE];
uintptr_t remaining = ad+alen-iad;
uint8_t apad = DRYSPONGE_padding(iad,remaining,last_block);
DRYSPONGE_DomainSeparator(DRYSPONGE_EXT_ARG,DRYSPONGE_DSINFO(apad,ds,finalize));
DRYSPONGE_f(ctx,last_block);
}
// Copy `remaining` bytes of output from the rate, running G between
// successive blocks but not after the last one.
static void DRYSPONGE_squeez_only(
    DRYSPONGE_t *const ctx,
    uint8_t *out,
    unsigned int remaining
){
    while (remaining) {
        unsigned int chunk = (remaining > DRYSPONGE_BLOCKSIZE)
            ? DRYSPONGE_BLOCKSIZE : remaining;
        memcpy(out, ctx->r, chunk);
        out += chunk;
        remaining -= chunk;
        if (remaining) DRYSPONGE_g(ctx);
    }
}
// Reset the per-message parts of the state: pending DS bits, the F
// counter and the rate. c and x are loaded later by DRYSPONGE_set_key().
static void DRYSPONGE_init_ctx(
DRYSPONGE_t *const ctx
){
#ifdef DRYSPONGE_EXT
memset(DRYSPONGE_EXT_ARG,0,sizeof(DRYSPONGE_EXT_t));
#endif
ctx->fcnt=0;
memset(ctx->r,0x00,DRYSPONGE_BLOCKSIZE);
}
// Hash `mlen` bytes of `message` into a DRYSPONGE_DIGESTSIZE digest:
// key the sponge with the fixed constant CST_H (only the first
// KEYSIZE+XSIZE bytes are consumed), absorb the message in the hash
// domain with finalize set, then squeeze the digest.
// Fix: the debug printf used %lu for the size_t mlen; %zu is the
// correct conversion for size_t.
static void DRYSPONGE_hash(
    const uint8_t *const message,
    const size_t mlen,
    uint8_t *const digest
){
    DRYSPONGE_t ctx_storage;
    DRYSPONGE_t *const ctx = &ctx_storage;
    DRYSPONGE_init_ctx(ctx);
    ctx->rounds=DRYSPONGE_ROUNDS;
#if DRYSPONGE_DBG_EN
    printf("Hashing %zu bytes message: ",mlen); /* was %lu */
    print_bytes_sep("",message,mlen,"\n","");
#endif
    // fixed hashing key material
    const uint64_t CST_H[] = {
        0xd308a385886a3f24,
        0x447370032e8a1913,
        0xd0319f29223809a4,
        0x896c4eec98fa2e08,
        0x7713d038e6212845,
        0x6c0ce934cf6654be,
        0xdd507cc9b729acc0,
        0x170947b5b5d5843f,
        0x1bfb7989d9d51692,
        0xacb5df98a60b31d1,
        0xb7df1ad0db72fd2f,
        0x967e266aedafe1b8,
        0x997f2cf145907cba,
        0xf76c91b34799a124,
        0x16fc8e85e2f20108,
        0x694e5771d8206963,
    };
    DRYSPONGE_set_key(ctx,(const uint8_t*)CST_H,DRYSPONGE_KEYSIZE+DRYSPONGE_XSIZE);
    DRYSPONGE_absorb_only(ctx,message,mlen,DRYSPONGE_DS,1);
    DRYSPONGE_squeez_only(ctx,digest,DRYSPONGE_DIGESTSIZE);
#if DRYSPONGE_DBG_EN
    printf(" Final state:\n");
    DRYSPONGE_print_state(ctx);
    print_bytes_sep(" Digest: ",digest,DRYSPONGE_DIGESTSIZE,"\n","");
#endif
}
// Start an AEAD operation: reset state, load the key, then absorb the
// nonce (domain DD) using the longer INIT round count, and record the
// output buffer pointer. Normal round count is restored at the end.
// NOTE(review): in the small-nonce path the first DomainSeparator call
// below is overwritten by the second before any F consumes it -- it
// appears redundant; confirm against the reference specification.
static void DRYSPONGE_init(
DRYSPONGE_t *const ctx,
const uint8_t *const key,
const unsigned int klen,
const uint8_t *const nonce,
uint8_t *out_buffer,//output buffer
unsigned int finalize
){
DRYSPONGE_init_ctx(ctx);
ctx->rounds=DRYSPONGE_ROUNDS;
DRYSPONGE_set_key(ctx,key,klen);
ctx->obuf = out_buffer;
DRYSPONGE_DomainSeparator(DRYSPONGE_EXT_ARG,DRYSPONGE_DSINFO(0,DRYSPONGE_DD,finalize));
ctx->rounds=DRYSPONGE_INIT_ROUNDS;
#if DRYSPONGE_NONCESIZE>DRYSPONGE_BLOCKSIZE
assert(0==(DRYSPONGE_NONCESIZE%DRYSPONGE_BLOCKSIZE));
unsigned int nloops = DRYSPONGE_DIVUP(DRYSPONGE_NONCESIZE,DRYSPONGE_BLOCKSIZE);
for(unsigned int i=0;i<nloops-1;i++){
DRYSPONGE_f(ctx,nonce+i*DRYSPONGE_BLOCKSIZE);
}
DRYSPONGE_DomainSeparator(DRYSPONGE_EXT_ARG,DRYSPONGE_DSINFO(0,DRYSPONGE_DD,finalize));
DRYSPONGE_f(ctx,nonce+(nloops-1)*DRYSPONGE_BLOCKSIZE);
#else
uint8_t block[DRYSPONGE_BLOCKSIZE] = {0};
memcpy(block,nonce,DRYSPONGE_NONCESIZE);
DRYSPONGE_DomainSeparator(DRYSPONGE_EXT_ARG,DRYSPONGE_DSINFO(0,DRYSPONGE_DD,finalize));
DRYSPONGE_f(ctx,block);
#endif
ctx->rounds=DRYSPONGE_ROUNDS;
}
// Encrypt one full block (alignment-safe path): ciphertext = rate XOR
// plaintext, then absorb the plaintext block and advance obuf.
static void DRYSPONGE_enc_core(
DRYSPONGE_t *const ctx,
const uint64_t *const ib//exactly one block of input
){
DRYSPONGE_xor((uint8_t *)ctx->r,(uint8_t *)ib,ctx->obuf);
DRYSPONGE_f(ctx,(uint8_t *)ib);
ctx->obuf+=DRYSPONGE_BLOCKSIZE;
}
// Encrypt one full block using 64-bit XORs (both buffers aligned).
// NOTE(review): the assert checks 8-byte alignment, but the caller
// (DRYSPONGE_enc_blocks) only guarantees ALIGN64 == 4 -- on a
// 4-but-not-8-aligned obuf this assert can fire in debug builds;
// confirm intended alignment contract.
static void DRYSPONGE_enc_core_aligned(
DRYSPONGE_t *const ctx,
const uint64_t *const ib//exactly one block of input
){
assert((((uintptr_t)ctx->obuf)%8) == 0);
DRYSPONGE_xor64(ctx->r,ib,(uint64_t*const)ctx->obuf);
DRYSPONGE_f(ctx,(uint8_t *)ib);
ctx->obuf+=DRYSPONGE_BLOCKSIZE;
}
// Encrypt m full blocks starting at im; returns the pointer just past
// the consumed input. Takes the 64-bit path when both input and output
// are ALIGN64-aligned, otherwise stages each block through buf64.
static const uint8_t* DRYSPONGE_enc_blocks(
DRYSPONGE_t *const ctx,
const uint8_t *im,//whole message
size_t m
){
// reference unused load/store helpers to silence -Wunused-function
(void)DRYSPONGE_load32;
(void)DRYSPONGE_store32;
(void)DRYSPONGE_load64;
(void)DRYSPONGE_store64;
uint64_t buf64[DRYSPONGE_BLOCKSIZE64];
const uint64_t *ib64;
#if DRYSPONGE_BLOCKSIZE % ALIGN64
unsigned int input_aligned = 0;
unsigned int output_aligned = 0;
#else
unsigned int input_aligned = 0==(((uintptr_t)im)%ALIGN64);
unsigned int output_aligned = 0==(((uintptr_t)ctx->obuf)%ALIGN64);
#endif
if(input_aligned && output_aligned){
for(size_t i = 0; i<m; i++){
ib64 = (const uint64_t*)im;
DRYSPONGE_enc_core_aligned(ctx,ib64);
im+=DRYSPONGE_BLOCKSIZE;
}
}else{
ib64 = buf64;
for(size_t i = 0; i<m; i++){
memcpy(buf64,im,DRYSPONGE_BLOCKSIZE);
DRYSPONGE_enc_core(ctx,ib64);//input is now aligned but output may not
im+=DRYSPONGE_BLOCKSIZE;
}
}
return im;
}
// Decrypt one full block (alignment-safe path): plaintext = rate XOR
// ciphertext, then absorb the recovered plaintext and advance obuf.
static void DRYSPONGE_dec_core(
DRYSPONGE_t *const ctx,
const uint8_t *const ib//exactly one block of input
){
DRYSPONGE_xor((uint8_t *)ctx->r,ib,ctx->obuf);
DRYSPONGE_f(ctx,ctx->obuf);
ctx->obuf+=DRYSPONGE_BLOCKSIZE;
}
// Decrypt one full block using 64-bit XORs (aligned buffers).
// NOTE(review): unlike DRYSPONGE_enc_core_aligned this variant has no
// alignment assert -- presumably intentional, but confirm.
static void DRYSPONGE_dec_core_aligned(
DRYSPONGE_t *const ctx,
const uint64_t *const ib//exactly one block of input
){
DRYSPONGE_xor64(ctx->r,ib,(uint64_t*const)ctx->obuf);
DRYSPONGE_f(ctx,ctx->obuf);
ctx->obuf+=DRYSPONGE_BLOCKSIZE;
}
// Decrypt m full blocks starting at im; returns the pointer just past
// the consumed ciphertext. Aligned buffers take the 64-bit path; the
// fallback works directly on the byte pointers.
static const uint8_t* DRYSPONGE_dec_blocks(
DRYSPONGE_t *const ctx,
const uint8_t *im,//whole message
size_t m
){
const uint64_t *ib64;
#if DRYSPONGE_BLOCKSIZE % ALIGN64
unsigned int input_aligned = 0;
unsigned int output_aligned = 0;
#else
unsigned int input_aligned = 0==(((uintptr_t)im)%ALIGN64);
unsigned int output_aligned = 0==(((uintptr_t)ctx->obuf)%ALIGN64);
#endif
if(input_aligned && output_aligned){
for(size_t i = 0; i<m; i++){
ib64 = (const uint64_t*)im;
DRYSPONGE_dec_core_aligned(ctx,ib64);
im+=DRYSPONGE_BLOCKSIZE;
}
}else{
for(size_t i = 0; i<m; i++){
DRYSPONGE_dec_core(ctx,im);
im+=DRYSPONGE_BLOCKSIZE;
}
}
return im;
}
// AEAD encrypt: init with key/nonce, absorb AD (domain DA), encrypt the
// message block-by-block (domain DM for the padded last block), then
// append the DRYSPONGE_TAGSIZE tag. *clen = mlen + tag size.
static void DRYSPONGE_enc(
const uint8_t *const key,
const unsigned int klen,
const uint8_t *const nonce,
const uint8_t *const message,
const size_t mlen,
const uint8_t * const ad,
const size_t alen,
uint8_t *ciphertext,
size_t *clen
){
const uint8_t *im = message;
DRYSPONGE_t ctx_storage;
DRYSPONGE_t *const ctx = &ctx_storage;
unsigned int finalize = (mlen|alen) ? 0 : 1; // nothing else to absorb?
DRYSPONGE_init(
ctx,
key,
klen,
nonce,
ciphertext,
finalize
);
if(alen){
finalize = mlen ? 0 : 1; // AD block is final only when no message follows
DRYSPONGE_absorb_only(ctx,ad,alen,DRYSPONGE_DA,finalize);
}
if(mlen){
size_t m = (mlen + DRYSPONGE_BLOCKSIZE - 1) / DRYSPONGE_BLOCKSIZE;
im=DRYSPONGE_enc_blocks(ctx,im,m-1);//process all blocks except the last one
uint64_t last_block64[DRYSPONGE_BLOCKSIZE64];
uint8_t*last_block=(uint8_t*)last_block64;
unsigned int remaining = message+mlen-im;
uint8_t mpad = DRYSPONGE_padding(im,remaining,last_block);
DRYSPONGE_DomainSeparator(DRYSPONGE_EXT_ARG,DRYSPONGE_DSINFO(mpad,DRYSPONGE_DM,1));
DRYSPONGE_enc_core(ctx,last_block64);//writing full block is fine since we still have the area reserved for the tag
ctx->obuf = ciphertext + mlen;//fix the size
}
DRYSPONGE_squeez_only(ctx,ctx->obuf,DRYSPONGE_TAGSIZE);
*clen = mlen+DRYSPONGE_TAGSIZE;
#if DRYSPONGE_DBG_EN
printf(" Final state:\n");
DRYSPONGE_print_state(ctx);
print_bytes_sep(" CipherText: ",ciphertext,*clen,"\n","");
#endif
}
//WARNING: this function writes plaintext into "message" before checking the tag.
//It is the responsibility of the caller to ensure that the "message" buffer is
//not accessible by anything else until this function has returned.
// AEAD decrypt: mirrors DRYSPONGE_enc, writing plaintext into `message`
// BEFORE verifying the tag (see warning above); on tag mismatch the
// plaintext is erased and ~DRYSPONGE_PASS is returned.
// Returns DRYSPONGE_PASS (0) on success, nonzero on failure.
static int DRYSPONGE_dec(
const uint8_t *const key,
const unsigned int klen,
const uint8_t *const nonce,
const uint8_t *const ciphertext,
const size_t clen,
const uint8_t * const ad,
const size_t alen,
uint8_t *message
){
if(clen<DRYSPONGE_TAGSIZE) return -1;
size_t mlen = clen - DRYSPONGE_TAGSIZE;
const uint8_t *im = ciphertext;
DRYSPONGE_t ctx_storage;
DRYSPONGE_t *const ctx = &ctx_storage;
unsigned int finalize = (mlen|alen) ? 0 : 1;
DRYSPONGE_init(
ctx,
key,
klen,
nonce,
message,
finalize
);
if(alen){
finalize = mlen ? 0 : 1;
DRYSPONGE_absorb_only(ctx,ad,alen,DRYSPONGE_DA,finalize);
}
if(mlen){
size_t m = (mlen + DRYSPONGE_BLOCKSIZE - 1) / DRYSPONGE_BLOCKSIZE;
im=DRYSPONGE_dec_blocks(ctx,im,m-1);
uint64_t last_block64[DRYSPONGE_BLOCKSIZE64];
uint8_t*last_block=(uint8_t*)last_block64;
unsigned int remaining = ciphertext+mlen-im;
memcpy(last_block,im,remaining);
// recover the (partial) plaintext of the last block: pt = rate ^ ct
DRYSPONGE_xor64(ctx->r,last_block64,last_block64);
// NOTE(review): padding() here copies last_block onto itself
// (memcpy with identical src/dst) -- works in practice but is
// formally undefined; consider memmove or skipping the copy.
uint8_t mpad = DRYSPONGE_padding(last_block,remaining,last_block);
im+=remaining;
DRYSPONGE_DomainSeparator(DRYSPONGE_EXT_ARG,DRYSPONGE_DSINFO(mpad,DRYSPONGE_DM,1));
memcpy(ctx->obuf,last_block,remaining);
DRYSPONGE_f(ctx,last_block);
}
uint64_t tag64[DRYSPONGE_TAGSIZE64];
uint8_t*tag = (uint8_t*)tag64;
DRYSPONGE_squeez_only(ctx,tag,DRYSPONGE_TAGSIZE);
DRYSPONGE_DBG(print_bytes_sep("expected tag=",im,DRYSPONGE_TAGSIZE,"\n",""));
DRYSPONGE_DBG(print_bytes_sep("computed tag=",tag,DRYSPONGE_TAGSIZE,"\n",""));
if(memcmp(tag,im,DRYSPONGE_TAGSIZE)){
memset(message,0,mlen);//erase all output
return ~DRYSPONGE_PASS;
}
return DRYSPONGE_PASS;
}
#endif
#include "crypto_aead.h"
#define DRYSPONGE_OPT_G drygascon128_g
#define DRYSPONGE_OPT_F drygascon128_f
#include "drysponge.h"
/**
generating a ciphertext c[0],c[1],...,c[*clen-1]
from a plaintext m[0],m[1],...,m[mlen-1]
and associated data ad[0],ad[1],...,ad[adlen-1]
and nonce npub[0],npub[1],...
and secret key k[0],k[1],...
the implementation shall not use nsec
*/
// supercop/NIST-LWC AEAD encryption entry point for DryGascon128.
// nsec is unused per the API contract. Always returns 0.
// NOTE(review): mlen/adlen (unsigned long long) are narrowed to size_t
// inside DRYSPONGE_enc on 32-bit targets -- confirm inputs fit.
int crypto_aead_encrypt(
unsigned char *c,unsigned long long *clen,
const unsigned char *m,unsigned long long mlen,
const unsigned char *ad,unsigned long long adlen,
const unsigned char *nsec,
const unsigned char *npub,
const unsigned char *k
){
(void) nsec; //avoid warning
(void) DRYSPONGE_hash; //avoid warning
size_t impl_clen;
DRYSPONGE_enc(k,DRYSPONGE_KEYSIZE,npub,m,mlen,ad,adlen,c,&impl_clen);
*clen = impl_clen; // widen size_t result to the API's unsigned long long
return 0;
}
/**
the code for the AEAD implementation goes here,
generating a plaintext m[0],m[1],...,m[*mlen-1]
and secret message number nsec[0],nsec[1],...
from a ciphertext c[0],c[1],...,c[clen-1]
and associated data ad[0],ad[1],...,ad[adlen-1]
and nonce number npub[0],npub[1],...
and secret key k[0],k[1],...
*/
// supercop/NIST-LWC AEAD decryption entry point for DryGascon128.
// Returns 0 and sets *mlen on success, -1 on authentication failure
// (DRYSPONGE_dec zeroes any partial plaintext in that case).
int crypto_aead_decrypt(
unsigned char *m,unsigned long long *mlen,
unsigned char *nsec,
const unsigned char *c,unsigned long long clen,
const unsigned char *ad,unsigned long long adlen,
const unsigned char *npub,
const unsigned char *k
){
(void) nsec; //avoid warning
if(DRYSPONGE_PASS!=DRYSPONGE_dec(k,DRYSPONGE_KEYSIZE,npub,c,clen,ad,adlen,m))
return -1;
*mlen = clen - DRYSPONGE_TAGSIZE;
return 0;
}
#define CRYPTO_KEYBYTES 16
#define CRYPTO_NSECBYTES 0
#define CRYPTO_NPUBBYTES 16
#define CRYPTO_ABYTES 16
#define CRYPTO_NOOVERLAP 1
#ifndef GIFT_COFB_H_
#define GIFT_COFB_H_
#define TAG_SIZE 16
#define COFB_ENCRYPT 1
#define COFB_DECRYPT 0
#define XOR_BLOCK(x, y, z) ({ \
(x)[0] = (y)[0] ^ (z)[0]; \
(x)[1] = (y)[1] ^ (z)[1]; \
(x)[2] = (y)[2] ^ (z)[2]; \
(x)[3] = (y)[3] ^ (z)[3]; \
})
#define XOR_TOP_BAR_BLOCK(x, y) ({ \
(x)[0] ^= (y)[0]; \
(x)[1] ^= (y)[1]; \
})
#endif // GIFT_COFB_H_
\ No newline at end of file
#include <string.h>
#include "api.h"
#include "cofb.h"
#include "giftb128.h"
/*
 * Write the 16-byte padded block built from the first 'no_of_bytes' bytes
 * of 's' into 'd'. A full block is copied verbatim; a partial (possibly
 * empty) block is 10*-padded: data, then a 0x80 byte, then zeros. The
 * byte layout matches the original word-masking version on the
 * little-endian targets this code assumes.
 *
 * Unlike the word-wise version, this reads exactly 'no_of_bytes' bytes
 * from 's', so no out-of-bounds read happens on the final partial word.
 */
static inline void padding(u32* d, const u32* s, const u32 no_of_bytes){
    if (no_of_bytes >= GIFT128_BLOCK_SIZE) { /* full block: plain copy */
        d[0] = s[0];
        d[1] = s[1];
        d[2] = s[2];
        d[3] = s[3];
    }
    else { /* partial block: data || 0x80 || 0x00... */
        u8 *db = (u8 *)d; /* byte access through char type: no aliasing issue */
        memset(db, 0x00, GIFT128_BLOCK_SIZE);
        memcpy(db, s, no_of_bytes);
        db[no_of_bytes] = 0x80;
    }
}
// Double the 64-bit COFB offset L in place (multiply by x in GF(2^64)).
// Each word holds bytes in the order produced by giftb128_encrypt_block;
// the "(& 0x7f7f7f7f) << 1 | (& 0x80808080) >> 15" pattern shifts the
// whole 64-bit value left one bit across byte boundaries, and the bit
// carried out of the top is folded back in with the constant 27 (0x1b) —
// presumably the GF(2^64) reduction polynomial; see the GIFT-COFB spec.
static inline void double_half_block(u32* x) {
u32 tmp0;
tmp0 = (x)[0];
(x)[0] = (((x)[0] & 0x7f7f7f7f) << 1) | (((x)[0] & 0x80808080) >> 15);
(x)[0] |= ((x)[1] & 0x80808080) << 17;
(x)[1] = (((x)[1] & 0x7f7f7f7f) << 1) | (((x)[1] & 0x80808080) >> 15);
(x)[1] ^= (((tmp0 >> 7) & 1) * 27) << 24;
}
// Triple the 64-bit COFB offset in place: 3*L = 2*L ^ L.
// The first five statements replicate double_half_block(); the final
// two XORs add the saved original value back in.
static inline void triple_half_block(u32* x) {
u32 tmp0, tmp1;
tmp0 = (x)[0];
tmp1 = (x)[1];
(x)[0] = (((x)[0] & 0x7f7f7f7f) << 1) | (((x)[0] & 0x80808080) >> 15);
(x)[0] |= ((x)[1] & 0x80808080) << 17;
(x)[1] = (((x)[1] & 0x7f7f7f7f) << 1) | (((x)[1] & 0x80808080) >> 15);
(x)[1] ^= (((tmp0 >> 7) & 1) * 27) << 24;
(x)[0] ^= tmp0;
(x)[1] ^= tmp1;
}
// COFB feedback function G applied in place to the 128-bit state x:
// the new first half is the old second half, and the new second half is
// the old first half rotated left by one bit (same cross-byte shift
// pattern as double_half_block).
static inline void g(u32 *x) {
u32 tmp0, tmp1;
tmp0 = (x)[0];
tmp1 = (x)[1];
(x)[0] = (x)[2];
(x)[1] = (x)[3];
(x)[2] = ((tmp0 & 0x7f7f7f7f) << 1) | ((tmp0 & 0x80808080) >> 15);
(x)[2] |= ((tmp1 & 0x80808080) << 17);
(x)[3] = ((tmp1 & 0x7f7f7f7f) << 1) | ((tmp1 & 0x80808080) >> 15);
(x)[3] |= ((tmp0 & 0x80808080) << 17);
}
// rho1: d = G(y) ^ pad(m), where m is an n-byte (possibly partial) block.
// NOTE: g() rotates y in place, so the caller's y is updated as a side effect.
static inline void rho1(u32* d, u32* y, u32* m, u32 n) {
g(y);
padding(d,m,n);
XOR_BLOCK(d, d, y);
}
// Encryption feedback: ciphertext c = y ^ m, then next input x = G(y) ^ pad(m).
static inline void rho(u32* y, u32* m, u32* x, u32* c, u32 n) {
XOR_BLOCK(c, y, m);
rho1(x, y, m, n);
}
// Decryption feedback: plaintext m = y ^ c, then next input x = G(y) ^ pad(m).
static inline void rho_prime(u32* y, u32*c, u32* x, u32* m, u32 n) {
XOR_BLOCK(m, y, c);
rho1(x, y, m, n);
}
/****************************************************************************
* Constant-time implementation of the GIFT-COFB authenticated cipher based on
* fixsliced GIFTb-128. Encryption/decryption is handled by the same function,
* depending on the 'mode' parameter (1/0).
****************************************************************************/
int giftcofb_crypt(u8* out, const u8* key, const u8* nonce, const u8* ad,
u32 ad_len, const u8* in, u32 in_len, const int encrypting) {
// Single code path for GIFT-COFB encryption (encrypting != 0) and
// decryption-with-verification (encrypting == 0).
//   Encrypt: out receives in_len ciphertext bytes followed by the
//            TAG_SIZE-byte tag; always returns 0.
//   Decrypt: in holds ciphertext||tag; out receives the plaintext.
//            Returns -1 if in_len < TAG_SIZE, 0 if the tag verifies,
//            and a nonzero OR of tag-byte differences otherwise (the
//            comparison loop has no early exit).
// NOTE(review): ad/in/Y are accessed through u32* casts — assumes a
// little-endian target that tolerates this alignment; confirm elsewhere.
u32 tmp0, tmp1, emptyA, emptyM, offset[2];
u32 input[4], rkey[80];
u8 Y[GIFT128_BLOCK_SIZE];
// For decryption, the trailing TAG_SIZE bytes of 'in' are the tag.
if (!encrypting) {
if (in_len < TAG_SIZE)
return -1;
in_len -= TAG_SIZE;
}
if(ad_len == 0)
emptyA = 1;
else
emptyA = 0;
if(in_len == 0)
emptyM =1;
else
emptyM = 0;
// Initialization: Y = E_K(nonce); the offset is the top half of Y.
gift128_keyschedule(key, rkey);
giftb128_encrypt_block(Y, rkey, nonce);
offset[0] = ((u32*)Y)[0];
offset[1] = ((u32*)Y)[1];
// Process all full associated-data blocks except the last.
while(ad_len > GIFT128_BLOCK_SIZE){
rho1(input, (u32*)Y, (u32*)ad, GIFT128_BLOCK_SIZE);
double_half_block(offset);
XOR_TOP_BAR_BLOCK(input, offset);
giftb128_encrypt_block(Y, rkey, (u8*)input);
ad += GIFT128_BLOCK_SIZE;
ad_len -= GIFT128_BLOCK_SIZE;
}
// Last AD block: offset is tripled once, twice more if AD is partial/empty,
// and twice more again if there is no message at all (domain separation).
triple_half_block(offset);
if((ad_len % GIFT128_BLOCK_SIZE != 0) || (emptyA))
triple_half_block(offset);
if(emptyM) {
triple_half_block(offset);
triple_half_block(offset);
}
rho1(input, (u32*)Y, (u32*)ad, ad_len);
XOR_TOP_BAR_BLOCK(input, offset);
giftb128_encrypt_block(Y, rkey, (u8*)input);
// Process all full message blocks except the last.
while (in_len > GIFT128_BLOCK_SIZE){
double_half_block(offset);
if (encrypting)
rho((u32*)Y, (u32*)in, input, (u32*)out, GIFT128_BLOCK_SIZE);
else
rho_prime((u32*)Y, (u32*)in, input, (u32*)out, GIFT128_BLOCK_SIZE);
XOR_TOP_BAR_BLOCK(input, offset);
giftb128_encrypt_block(Y, rkey, (u8*)input);
in += GIFT128_BLOCK_SIZE;
out += GIFT128_BLOCK_SIZE;
in_len -= GIFT128_BLOCK_SIZE;
}
// Last message block (skipped entirely when the message is empty).
if(!emptyM){
triple_half_block(offset);
if(in_len % GIFT128_BLOCK_SIZE != 0)
triple_half_block(offset);
if (encrypting) {
rho((u32*)Y, (u32*)in, input, (u32*)out, in_len);
out += in_len; // 'out' now points at where the tag goes
}
else {
rho_prime((u32*)Y, (u32*)in, input, (u32*)out, in_len);
in += in_len; // 'in' now points at the received tag
}
XOR_TOP_BAR_BLOCK(input, offset);
giftb128_encrypt_block(Y, rkey, (u8*)input);
}
if (encrypting) { // encryption mode
memcpy(out, Y, TAG_SIZE); // the tag is the first TAG_SIZE bytes of Y
return 0;
}
// decrypting
tmp0 = 0;
for(tmp1 = 0; tmp1 < TAG_SIZE; tmp1++)
tmp0 |= in[tmp1] ^ Y[tmp1]; // constant-time tag comparison
return tmp0;
}
/*
 * NIST LWC AEAD encryption wrapper: c = ciphertext || TAG_SIZE-byte tag,
 * *clen = mlen + TAG_SIZE. giftcofb_crypt() always returns 0 when
 * encrypting, so this cannot fail.
 * NOTE(review): mlen/adlen are narrowed from unsigned long long to u32 by
 * giftcofb_crypt's parameters — inputs of 2^32 bytes or more would be
 * truncated; confirm this is out of scope for the LWC use case.
 */
int crypto_aead_encrypt(unsigned char* c, unsigned long long* clen,
const unsigned char* m, unsigned long long mlen,
const unsigned char* ad, unsigned long long adlen,
const unsigned char* nsec, const unsigned char* npub,
const unsigned char* k) {
(void)nsec; //secret message numbers are unused
*clen = mlen + TAG_SIZE;
return giftcofb_crypt(c, k, npub, ad, adlen, m, mlen, COFB_ENCRYPT);
}
/*
 * NIST LWC AEAD decryption wrapper around giftcofb_crypt().
 * Returns 0 on success, -1 on failure (input shorter than the tag, or
 * tag mismatch). *mlen is written only on success, so it can no longer
 * underflow to a huge value when clen < TAG_SIZE, and it is never set
 * on authentication failure.
 */
int crypto_aead_decrypt(unsigned char* m, unsigned long long *mlen,
    unsigned char* nsec, const unsigned char* c,
    unsigned long long clen, const unsigned char* ad,
    unsigned long long adlen, const unsigned char* npub,
    const unsigned char *k) {
    (void)nsec;
    if (clen < TAG_SIZE)
        return -1; // too short to even contain the tag
    if (giftcofb_crypt(m, k, npub, ad, adlen, c, clen, COFB_DECRYPT) != 0)
        return -1; // authentication failed
    *mlen = clen - TAG_SIZE;
    return 0;
}
\ No newline at end of file
/* giftb128.h: interface to the fixsliced GIFT-128 routines implemented in
 * ARM assembly below. */
#ifndef GIFT128_H_
#define GIFT128_H_
#define KEY_SIZE 16
#define GIFT128_BLOCK_SIZE 16
typedef unsigned char u8;
typedef unsigned int u32;
// Expands the 16-byte key into 80 round-key words (rkey must hold 80 u32).
extern void gift128_keyschedule(const u8* key, u32* rkey);
// Encrypts one 16-byte block with GIFTb-128 using precomputed round keys.
extern void giftb128_encrypt_block(u8* out_block, const u32* rkey, const u8* in_block);
#endif // GIFT128_H_
\ No newline at end of file
/****************************************************************************
* Balanced ARM assembly implementation of the GIFT-128 block cipher. This
* implementation provides efficiency with limited impact on the code size.
* See "Fixslicing: A New GIFT Representation" paper available at
* https://eprint.iacr.org/2020/412 for more details.
****************************************************************************/
.syntax unified
.thumb
/*****************************************************************************
* Round constants look-up table according to the fixsliced representation.
*****************************************************************************/
.align 2
.type rconst,%object
rconst:
.word 0x10000008, 0x80018000, 0x54000002, 0x01010181
.word 0x8000001f, 0x10888880, 0x6001e000, 0x51500002
.word 0x03030180, 0x8000002f, 0x10088880, 0x60016000
.word 0x41500002, 0x03030080, 0x80000027, 0x10008880
.word 0x4001e000, 0x11500002, 0x03020180, 0x8000002b
.word 0x10080880, 0x60014000, 0x01400002, 0x02020080
.word 0x80000021, 0x10000080, 0x0001c000, 0x51000002
.word 0x03010180, 0x8000002e, 0x10088800, 0x60012000
.word 0x40500002, 0x01030080, 0x80000006, 0x10008808
.word 0xc001a000, 0x14500002, 0x01020181, 0x8000001a
.align 2
classical_key_update:
and r2, r10, r7, lsr #12
and r3, r7, r9
orr r2, r2, r3, lsl #4
and r3, r12, r7, lsr #2
orr r2, r2, r3
and r7, r7, #0x00030000
orr r7, r2, r7, lsl #14
str.w r7, [r1, #4] //1st classical key update
str.w r5, [r1], #8 //1st classical key update
and r2, r10, r6, lsr #12
and r3, r6, r9
orr r2, r2, r3, lsl #4
and r3, r12, r6, lsr #2
orr r2, r2, r3
and r6, r6, #0x00030000
orr r6, r2, r6, lsl #14
str.w r6, [r1, #4] //2nd classical key update
str.w r4, [r1], #8 //2nd classical key update
and r2, r10, r5, lsr #12
and r3, r5, r9
orr r2, r2, r3, lsl #4
and r3, r12, r5, lsr #2
orr r2, r2, r3
and r5, r5, #0x00030000
orr r5, r2, r5, lsl #14
str.w r5, [r1, #4] //3rd classical key update
str.w r7, [r1], #8 //3rd classical key update
and r2, r10, r4, lsr #12
and r3, r4, r9
orr r2, r2, r3, lsl #4
and r3, r12, r4, lsr #2
orr r2, r2, r3
and r4, r4, #0x00030000
orr r4, r2, r4, lsl #14
str.w r4, [r1, #4] //4th classical key update
str.w r6, [r1], #8 //4th classical key update
bx lr
.align 2
rearrange_rkey_0:
ldrd r6, r4, [r1]
eor r12, r6, r6, lsr #9
and r12, r12, r3
eor r6, r12
eor r6, r6, r12, lsl #9 //SWAPMOVE(r6, r6, 0x00550055, 9);
eor r12, r4, r4, lsr #9
and r12, r12, r3
eor r4, r12
eor r4, r4, r12, lsl #9 //SWAPMOVE(r4, r4, 0x00550055, 9);
eor r12, r6, r6, lsr #18
and r12, r12, r10
eor r6, r12
eor r6, r6, r12, lsl #18 //SWAPMOVE(r6, r6, 0x3333, 18);
eor r12, r4, r4, lsr #18
and r12, r12, r10
eor r4, r12
eor r4, r4, r12, lsl #18 //SWAPMOVE(r4, r4, 0x3333, 18);
eor r12, r6, r6, lsr #12
and r12, r12, r11
eor r6, r12
eor r6, r6, r12, lsl #12 //SWAPMOVE(r6, r6, 0x000f000f, 12);
eor r12, r4, r4, lsr #12
and r12, r12, r11
eor r4, r12
eor r4, r4, r12, lsl #12 //SWAPMOVE(r4, r4, 0x000f000f, 12);
eor r12, r6, r6, lsr #24
and r12, r12, #0xff
eor r6, r12
eor r6, r6, r12, lsl #24 //SWAPMOVE(r6, r6, 0x000000ff, 24);
eor r12, r4, r4, lsr #24
and r12, r12, #0xff
eor r4, r12
eor r4, r4, r12, lsl #24 //SWAPMOVE(r4, r4, 0x000000ff, 24);
str.w r6, [r1]
str.w r4, [r1, #4]
bx lr
.align 2
rearrange_rkey_1:
ldrd r5, r7, [r1]
eor r8, r7, r7, lsr #3
and r8, r8, r3
eor r7, r8
eor r7, r7, r8, lsl #3 //SWAPMOVE(r7, r7, 0x11111111, 3);
eor r8, r5, r5, lsr #3
and r8, r8, r3
eor r5, r8
eor r5, r5, r8, lsl #3 //SWAPMOVE(r5, r5, 0x11111111, 3);
eor r8, r7, r7, lsr #6
and r8, r8, r10
eor r7, r8
eor r7, r7, r8, lsl #6 //SWAPMOVE(r7, r7, 0x03030303, 6);
eor r8, r5, r5, lsr #6
and r8, r8, r10
eor r5, r8
eor r5, r5, r8, lsl #6 //SWAPMOVE(r5, r5, 0x03030303, 6);
eor r8, r7, r7, lsr #12
and r8, r8, r11
eor r7, r8
eor r7, r7, r8, lsl #12 //SWAPMOVE(r7, r7, 0x000f000f, 12);
eor r8, r5, r5, lsr #12
and r8, r8, r11
eor r5, r8
eor r5, r5, r8, lsl #12 //SWAPMOVE(r5, r5, 0x000f000f, 12);
eor r8, r7, r7, lsr #24
and r8, r8, #0xff
eor r7, r8
eor r7, r7, r8, lsl #24 //SWAPMOVE(r7, r7, 0x000000ff, 24);
eor r8, r5, r5, lsr #24
and r8, r8, #0xff
eor r5, r8
eor r5, r5, r8, lsl #24 //SWAPMOVE(r5, r5, 0x000000ff, 24);
str.w r5, [r1]
str.w r7, [r1, #4]
bx lr
.align 2
@ rearrange_rkey_2: SWAPMOVE-based bit reordering of the round-key word
@ pair at [r1] into the fixsliced representation. Expects the masks
@ r3 = 0x0000aaaa, r10 = 0x00003333, r11 = 0x0000f0f0 (loaded by the
@ caller); clobbers r5, r7, r8; leaves r1 unchanged.
rearrange_rkey_2:
ldrd r5, r7, [r1]
eor r8, r7, r7, lsr #15
and r8, r8, r3
eor r7, r8
eor r7, r7, r8, lsl #15 //SWAPMOVE(r7, r7, 0x0000aaaa, 15);
eor r8, r5, r5, lsr #15
and r8, r8, r3
eor r5, r8
eor r5, r5, r8, lsl #15 //SWAPMOVE(r5, r5, 0x0000aaaa, 15);
eor r8, r7, r7, lsr #18
and r8, r8, r10
eor r7, r8
eor r7, r7, r8, lsl #18 //SWAPMOVE(r7, r7, 0x00003333, 18);
eor r8, r5, r5, lsr #18
and r8, r8, r10
eor r5, r8
eor r5, r5, r8, lsl #18 //SWAPMOVE(r5, r5, 0x00003333, 18);
eor r8, r7, r7, lsr #12
and r8, r8, r11
eor r7, r8
eor r7, r7, r8, lsl #12 //SWAPMOVE(r7, r7, 0x0000f0f0, 12);
eor r8, r5, r5, lsr #12
and r8, r8, r11
eor r5, r8
eor r5, r5, r8, lsl #12 //SWAPMOVE(r5, r5, 0x0000f0f0, 12);
eor r8, r7, r7, lsr #24
and r8, r8, #0xff
eor r7, r8
eor r7, r7, r8, lsl #24 //SWAPMOVE(r7, r7, 0x000000ff, 24);
eor r8, r5, r5, lsr #24
and r8, r8, #0xff
eor r5, r8
eor r5, r5, r8, lsl #24 //SWAPMOVE(r5, r5, 0x000000ff, 24);
str.w r5, [r1]
str.w r7, [r1, #4]
bx lr
.align 2
rearrange_rkey_3:
ldrd r5, r7, [r1]
eor r8, r7, r7, lsr #3
and r8, r8, r3
eor r7, r8
eor r7, r7, r8, lsl #3 //SWAPMOVE(r7, r7, 0x0a0a0a0a, 3);
eor r8, r5, r5, lsr #3
and r8, r8, r3
eor r5, r8
eor r5, r5, r8, lsl #3 //SWAPMOVE(r5, r5, 0x0a0a0a0a, 3);
eor r8, r7, r7, lsr #6
and r8, r8, r10
eor r7, r8
eor r7, r7, r8, lsl #6 //SWAPMOVE(r7, r7, 0x00cc00cc, 6);
eor r8, r5, r5, lsr #6
and r8, r8, r10
eor r5, r8
eor r5, r5, r8, lsl #6 //SWAPMOVE(r5, r5, 0x00cc00cc, 6);
eor r8, r7, r7, lsr #12
and r8, r8, r11
eor r7, r8
eor r7, r7, r8, lsl #12 //SWAPMOVE(r7, r7, 0x000f000f, 12);
eor r8, r5, r5, lsr #12
and r8, r8, r11
eor r5, r8
eor r5, r5, r8, lsl #12 //SWAPMOVE(r5, r5, 0x000f000f, 12);
eor r8, r7, r7, lsr #24
and r8, r8, #0xff
eor r7, r8
eor r7, r7, r8, lsl #24 //SWAPMOVE(r7, r7, 0x000000ff, 24);
eor r8, r5, r5, lsr #24
and r8, r8, #0xff
eor r5, r8
eor r5, r5, r8, lsl #24 //SWAPMOVE(r5, r5, 0x000000ff, 24);
str.w r5, [r1]
str.w r7, [r1, #4]
bx lr
.align 2
key_update_0:
ldrd r4, r5, [r1], #80
and r2, r12, r4, ror #24
and r4, r4, r11
orr r4, r2, r4, ror #16 //KEY_TRIPLE_UPDATE_1(r4)
eor r2, r4, r4, lsr #1
and r2, r2, r8
eor r4, r4, r2
eor r4, r4, r2, lsl #1 //SWAPMOVE(r4, r4, 0x55551100, 1)
eor r2, r5, r5, lsr #16
and r2, r2, r10
eor r5, r5, r2
eor r5, r5, r2, lsl #16 //SWAPMOVE(r5, r5, 0x00003333, 16)
eor r2, r5, r5, lsr #1
and r2, r2, r9
eor r5, r5, r2
eor r5, r5, r2, lsl #1 //SWAPMOVE(r5, r5, 0x555544444, 1)
str.w r4, [r1, #4]
str.w r5, [r1], #80
and r2, r12, r5, ror #24
and r5, r5, r11
orr r5, r2, r5, ror #16 //KEY_TRIPLE_UPDATE_1(r5)
eor r2, r5, r5, lsr #1
and r2, r2, r8
eor r5, r5, r2
eor r5, r5, r2, lsl #1 //SWAPMOVE(r5, r5, 0x55551100, 1)
eor r2, r4, r4, lsr #16
and r2, r2, r10
eor r4, r4, r2
eor r4, r4, r2, lsl #16 //SWAPMOVE(r4, r4, 0x00003333, 16)
eor r2, r4, r4, lsr #1
and r2, r2, r9
eor r4, r4, r2
eor r4, r4, r2, lsl #1 //SWAPMOVE(r4, r4, 0x555544444, 1)
str.w r5, [r1, #4]
str.w r4, [r1], #80
and r2, r12, r4, ror #24
and r4, r4, r11
orr r4, r2, r4, ror #16 //KEY_TRIPLE_UPDATE_1(r4)
eor r2, r4, r4, lsr #1
and r2, r2, r8
eor r4, r4, r2
eor r4, r4, r2, lsl #1 //SWAPMOVE(r4, r4, 0x55551100, 1)
eor r2, r5, r5, lsr #16
and r2, r2, r10
eor r5, r5, r2
eor r5, r5, r2, lsl #16 //SWAPMOVE(r5, r5, 0x00003333, 16)
eor r2, r5, r5, lsr #1
and r2, r2, r9
eor r5, r5, r2
eor r5, r5, r2, lsl #1 //SWAPMOVE(r5, r5, 0x555544444, 1)
str.w r4, [r1, #4]
str.w r5, [r1], #80
bx lr
.align 2
key_update_1:
ldrd r4, r5, [r1], #80
and r2, r9, r4, lsr #6
and r3, r4, r10, lsl #8
orr r2, r2, r3, lsl #2
and r3, r8, r4, lsr #5
orr r2, r2, r3
and r4, r4, r7
orr r4, r2, r4, lsl #3 //KEY_TRIPLE_UPDATE_2(r4)
and r2, r12, r5, lsr #4
and r3, r5, r12
orr r2, r2, r3, lsl #4
and r3, r11, r5, lsr #6
orr r2, r2, r3
and r5, r5, r10
orr r5, r2, r5, lsl #2 //KEY_DOUBLE_UPDATE_2(r5)
str.w r4, [r1, #4]
str.w r5, [r1], #80
and r2, r9, r5, lsr #6
and r3, r5, r10, lsl #8
orr r2, r2, r3, lsl #2
and r3, r8, r5, lsr #5
orr r2, r2, r3
and r5, r5, r7
orr r5, r2, r5, lsl #3 //KEY_TRIPLE_UPDATE_2(r5)
and r2, r12, r4, lsr #4
and r3, r4, r12
orr r2, r2, r3, lsl #4
and r3, r11, r4, lsr #6
orr r2, r2, r3
and r4, r4, r10
orr r4, r2, r4, lsl #2 //KEY_DOUBLE_UPDATE_2(r4)
str.w r5, [r1, #4]
str.w r4, [r1], #80
and r2, r9, r4, lsr #6
and r3, r4, r10, lsl #8
orr r2, r2, r3, lsl #2
and r3, r8, r4, lsr #5
orr r2, r2, r3
and r4, r4, r7
orr r4, r2, r4, lsl #3 //KEY_TRIPLE_UPDATE_2(r4)
and r2, r12, r5, lsr #4
and r3, r5, r12
orr r2, r2, r3, lsl #4
and r3, r11, r5, lsr #6
orr r2, r2, r3
and r5, r5, r10
orr r5, r2, r5, lsl#2 //KEY_DOUBLE_UPDATE_2(r5)
str.w r4, [r1, #4]
str.w r5, [r1], #80
bx lr
.align 2
@ key_update_2: fixsliced key-state update applied to three round-key
@ word pairs stored 80 bytes apart. Expects r12 = 0x55555555 and
@ r11 = 0xaaaaaaaa (loaded by the caller); clobbers r2, r4, r5 and
@ advances r1 by 240 bytes.
key_update_2:
ldrd r4, r5, [r1], #80
and r2, r12, r4, ror #24
and r4, r11, r4, ror #20
orr r4, r4, r2 //KEY_TRIPLE_UPDATE_2(r4)
and r2, r11, r5, ror #24
and r5, r12, r5, ror #16
orr r5, r5, r2 //KEY_DOUBLE_UPDATE_2(r5)
str.w r4, [r1, #4]
str.w r5, [r1], #80
and r2, r12, r5, ror #24
and r5, r11, r5, ror #20
orr r5, r5, r2 //KEY_TRIPLE_UPDATE_2(r5)
and r2, r11, r4, ror #24
and r4, r12, r4, ror #16
orr r4, r4, r2 //KEY_DOUBLE_UPDATE_2(r4)
str.w r5, [r1, #4]
str.w r4, [r1], #80
and r2, r12, r4, ror #24
and r4, r11, r4, ror #20
orr r4, r4, r2 //KEY_TRIPLE_UPDATE_2(r4)
and r2, r11, r5, ror #24
and r5, r12, r5, ror #16
orr r5, r5, r2 //KEY_DOUBLE_UPDATE_2(r5)
str.w r4, [r1, #4]
str.w r5, [r1], #80
bx lr
.align 2
key_update_3:
ldrd r4, r5, [r1], #80
and r2, r10, r4, lsr #18
and r3, r4, r7, lsr #4
orr r2, r2, r3, lsl #3
and r3, r11, r4, lsr #14
orr r2, r2, r3
and r3, r4, r12, lsr #11
orr r2, r2, r3, lsl #15
and r3, r12, r4, lsr #1
orr r2, r2, r3
and r4, r4, r7, lsr #16
orr r4, r2, r4, lsl #19 //KEY_TRIPLE_UPDATE_4(r4)
and r2, r9, r5, lsr #2
and r3, r9, r5
orr r2, r2, r3, lsl #2
and r3, r8, r5, lsr #1
orr r2, r2, r3
and r5, r5, r7
orr r5, r2, r5, lsl #3 //KEY_DOUBLE_UPDATE_4(r5)
str.w r4, [r1, #4]
str.w r5, [r1], #80
and r2, r10, r5, lsr #18
and r3, r5, r7, lsr #4
orr r2, r2, r3, lsl #3
and r3, r11, r5, lsr #14
orr r2, r2, r3
and r3, r5, r12, lsr #11
orr r2, r2, r3, lsl #15
and r3, r12, r5, lsr #1
orr r2, r2, r3
and r5, r5, r7, lsr #16
orr r5, r2, r5, lsl #19 //KEY_TRIPLE_UPDATE_4(r5)
and r2, r9, r4, lsr #2
and r3, r9, r4
orr r2, r2, r3, lsl #2
and r3, r8, r4, lsr #1
orr r2, r2, r3
and r4, r4, r7
orr r4, r2, r4, lsl #3 //KEY_DOUBLE_UPDATE_4(r4)
str.w r5, [r1, #4]
str.w r4, [r1], #80
and r2, r10, r4, lsr #18
and r3, r4, r7, lsr #4
orr r2, r2, r3, lsl #3
and r3, r11, r4, lsr #14
orr r2, r2, r3
and r3, r4, r12, lsr #11
orr r2, r2, r3, lsl #15
and r3, r12, r4, lsr #1
orr r2, r2, r3
and r4, r4, r7, lsr #16
orr r4, r2, r4, lsl #19 //KEY_TRIPLE_UPDATE_4(r4)
and r2, r9, r5, lsr #2
and r3, r9, r5
orr r2, r2, r3, lsl #2
and r3, r8, r5, lsr #1
orr r2, r2, r3
and r5, r5, r7
orr r5, r2, r5, lsl #3 //KEY_DOUBLE_UPDATE_4(r5)
str.w r4, [r1, #4]
str.w r5, [r1], #80
bx lr
.align 2
key_update_4:
ldrd r4, r5, [r1], #80
and r2, r7, r4, lsr #6
and r3, r4, #0x003f0000
orr r2, r2, r3, lsl #10
and r3, r12, r4, lsr #4
orr r2, r2, r3
and r4, r4, #0x000f
orr r4, r2, r4, lsl #12 //KEY_TRIPLE_UPDATE_4(r4)
and r2, r10, r5, lsr #4
and r3, r5, #0x000f0000
orr r2, r2, r3, lsl #12
and r3, r8, r5, lsr #8
orr r2, r2, r3
and r5, r5, r8
orr r5, r2, r5, lsl #8 //KEY_DOUBLE_UPDATE_4(r5)
str.w r4, [r1, #4]
str.w r5, [r1], #80
and r2, r7, r5, lsr #6
and r3, r5, #0x003f0000
orr r2, r2, r3, lsl #10
and r3, r12, r5, lsr #4
orr r2, r2, r3
and r5, r5, #0x000f
orr r5, r2, r5, lsl #12 //KEY_TRIPLE_UPDATE_4(r5)
and r2, r10, r4, lsr #4
and r3, r4, #0x000f0000
orr r2, r2, r3, lsl #12
and r3, r8, r4, lsr #8
orr r2, r2, r3
and r4, r4, r8
orr r4, r2, r4, lsl #8 //KEY_DOUBLE_UPDATE_4(r4)
str.w r5, [r1, #4]
str.w r4, [r1], #80
and r2, r7, r4, lsr #6
and r3, r4, #0x003f0000
orr r2, r2, r3, lsl #10
and r3, r12, r4, lsr #4
orr r2, r2, r3
and r4, r4, #0x000f
orr r4, r2, r4, lsl #12 //KEY_TRIPLE_UPDATE_4(r4)
and r2, r10, r5, lsr #4
and r3, r5, #0x000f0000
orr r2, r2, r3, lsl #12
and r3, r8, r5, lsr #8
orr r2, r2, r3
and r5, r5, r8
orr r5, r2, r5, lsl #8 //KEY_DOUBLE_UPDATE_4(r5)
str.w r4, [r1, #4]
str.w r5, [r1], #80
bx lr
/*****************************************************************************
* Balanced implementation of the GIFT-128 key schedule according to the
* fixsliced representation.
*****************************************************************************/
.align 2
@ void gift128_keyschedule(const u8* key, u32* rkey) {
.global gift128_keyschedule
.type gift128_keyschedule,%function
gift128_keyschedule:
push {r1-r12, r14}
ldm r0, {r4-r7} //load key words
rev r4, r4 //endianness (could be skipped with another representation)
rev r5, r5 //endianness (could be skipped with another representation)
rev r6, r6 //endianness (could be skipped with another representation)
rev r7, r7 //endianness (could be skipped with another representation)
str.w r5, [r1, #4]
str.w r7, [r1], #8 //the first rkeys are not updated
str.w r4, [r1, #4]
str.w r6, [r1], #8 //the first rkeys are not updated
movw r12, #0x3fff
lsl r12, r12, #16 //r12<- 0x3fff0000
movw r10, #0x000f //r10<- 0x0000000f
movw r9, #0x0fff //r9 <- 0x00000fff
bl classical_key_update //keyschedule using classical representation (10 rounds)
bl classical_key_update //keyschedule using classical representation (20 rounds)
sub.w r1, r1, #80
movw r3, #0x0055
movt r3, #0x0055 //r3 <- 0x00550055
movw r10, #0x3333 //r10<- 0x00003333
movw r11, #0x000f
movt r11, #0x000f //r11<- 0x000f000f
bl rearrange_rkey_0 //fixslice the rkeys
add.w r1, r1, #40
bl rearrange_rkey_0 //fixslice the rkeys
sub.w r1, r1, #32
movw r3, #0x1111
movt r3, #0x1111 //r3 <- 0x11111111
movw r10, #0x0303
movt r10, #0x0303 //r10<- 0x03030303
bl rearrange_rkey_1 //fixslice the rkeys
add.w r1, r1, #40
bl rearrange_rkey_1 //fixslice the rkeys
sub.w r1, r1, #32
movw r3, #0xaaaa //r3 <- 0x0000aaaa
movw r10, #0x3333 //r10<- 0x00003333
movw r11, #0xf0f0 //r11<- 0x0000f0f0
bl rearrange_rkey_2 //fixslice the rkeys
add.w r1, r1, #40
bl rearrange_rkey_2 //fixslice the rkeys
sub.w r1, r1, #32
movw r3, #0x0a0a
movt r3, #0x0a0a //r3 <- 0x0a0a0a0a
movw r10, #0x00cc
movt r10, #0x00cc //r10<- 0x00cc00cc
bl rearrange_rkey_3 //fixslice the rkeys
add.w r1, r1, #40
bl rearrange_rkey_3 //fixslice the rkeys
sub.w r1, r1, #64
movw r10, #0x3333 //r10<- 0x00003333
eor r12, r10, r10, lsl #16 //r12<- 0w33333333
mvn r11, r12 //r11<- 0xcccccccc
movw r9, #0x4444
movt r9, #0x5555 //r9 <- 0x55554444
movw r8, #0x1100
movt r8, #0x5555 //r8 <- 0x55551100
bl key_update_0 //keyschedule according to fixslicing
sub.w r1, r1, #280
bl key_update_0 //keyschedule according to fixslicing
sub.w r1, r1, #352
movw r12, #0x0f00
movt r12, #0x0f00 //r12<- 0x0f000f00
movw r11, #0x0003
movt r11, #0x0003 //r11<- 0x00030003
movw r10, #0x003f
movt r10, #0x003f //r10<- 0x003f003f
lsl r9, r11, #8 //r9 <- 0x03000300
and r8, r10, r10, lsr #3 //r8 <- 0x00070007
orr r7, r8, r8, lsl #2 //r7 <- 0x001f001f
bl key_update_1 //keyschedule according to fixslicing
sub.w r1, r1, #280
bl key_update_1 //keyschedule according to fixslicing
sub.w r1, r1, #352
movw r12, #0x5555
movt r12, #0x5555 //r12<- 0x55555555
mvn r11, r12 //r11<- 0xaaaaaaaa
bl key_update_2 //keyschedule according to fixslicing
sub.w r1, r1, #280
bl key_update_2 //keyschedule according to fixslicing
sub.w r1, r1, #352
orr r12, r8, r8, lsl #8 //r12<- 0x07070707
movw r11, #0xc0c0 //r11<- 0x0000c0c0
movw r10, #0x3030 //r10<- 0x00003030
and r9, r12, r12, lsr #1 //r9 <- 0x03030303
lsl r8, r12, #4 //r8 <- 0x70707070
eor r7, r8, r9, lsl #5 //r7 <- 0x10101010
movw r6, #0xf0f0 //r6 <- 0x0000f0f0
bl key_update_3 //keyschedule according to fixslicing
sub.w r1, r1, #280
bl key_update_3 //keyschedule according to fixslicing
sub.w r1, r1, #352
movw r12, #0x0fff
lsl r10, r12, #16
movw r8, #0x00ff //r8 <- 0x000000ff
movw r7, #0x03ff //r7 <- 0x000003ff
lsl r7, r7, #16
bl key_update_4 //keyschedule according to fixslicing
sub.w r1, r1, #280
bl key_update_4 //keyschedule according to fixslicing
pop {r1-r12,r14}
bx lr
.align 2
quintuple_round:
str.w r14, [sp]
ldr.w r5, [r0], #4
ldr.w r6, [r1], #4 //load rkey
ldr.w r7, [r1], #4 //load rkey
and r8, r11, r9 //sbox layer
eor r10, r10, r8
and r8, r10, r12
eor r9, r9, r8
orr r8, r9, r10
eor r11, r11, r8
eor r12, r12, r11
eor r10, r10, r12
and r8, r9, r10
eor r11, r11, r8
mvn r12, r12
and r8, r4, r12, lsr #1 //permutation layer
and r12, r12, r2
orr r12, r8, r12, lsl #3 //r12<- NIBBLE_ROR(r12, 1)
and r8, r4, r11
and r11, r2, r11, lsr #3
orr r11, r11, r8, lsl #1 //r11<- NIBBLE_ROR(r11, 3)
orr r14, r2, r2, lsl #1 //r14 <- 0x33333333
and r8, r14, r10, lsr #2
and r10, r10, r14
orr r10, r8, r10, lsl #2 //r10<- NIBBLE_ROR(r10, 2)
eor r10, r10, r6 //add 1st keyword
eor r11, r11, r7 //add 2nd keyword
eor r9, r9, r5 //add rconst
ldr.w r5, [r0], #4
ldr.w r6, [r1], #4 //load rkey
ldr.w r7, [r1], #4 //load rkey
and r8, r12, r11 //sbox layer
eor r10, r10, r8
and r8, r10, r9
eor r12, r12, r8
orr r8, r12, r10
eor r11, r11, r8
eor r9, r9, r11
eor r10, r10, r9
and r8, r12, r10
eor r11, r11, r8
mvn r9, r9
mvn r14, r3, lsl #12 //r0 <- 0x0fff0fff
and r8, r14, r9, lsr #4
and r9, r9, r3
orr r9, r8, r9, lsl #12 //r9 <- HALF_ROR(r9, 4)
and r8, r3, r11, lsr #12
and r11, r11, r14
orr r11, r8, r11, lsl #4 //r11<- HALF_ROR(r11, 12)
rev16 r10, r10 //r10<- HALF_ROR(r10, 8)
eor r10, r10, r6 //add 1st keyword
eor r11, r11, r7 //add 2nd keyword
eor r12, r12, r5 //add rconst
ldr.w r5, [r0], #4
ldr.w r6, [r1], #4 //load rkey
ldr.w r7, [r1], #4 //load rkey
and r8, r9, r11 //sbox layer
eor r10, r10, r8
and r8, r10, r12
eor r9, r9, r8
orr r8, r9, r10
eor r11, r11, r8
eor r12, r12, r11
eor r10, r10, r12
and r8, r9, r10
eor r11, r11, r8
mvn r12, r12
orr r14, r2, r2, lsl #2 //r14 <- 0x55555555 for SWAPMOVE
eor r8, r10, r10, lsr #1
and r8, r8, r14
eor r10, r10, r8
eor r10, r10, r8, lsl #1 //SWAPMOVE(r10, r10, 0x55555555, 1)
eor r8, r12, r12, lsr #1
and r8, r8, r14, lsr #16
eor r12, r12, r8
eor r12, r12, r8, lsl #1 //SWAPMOVE(r12, r12, 0x55550000, 1)
eor r8, r11, r11, lsr #1
and r8, r8, r14, lsl #16
eor r11, r11, r8
eor r11, r11, r8, lsl #1 //SWAPMOVE(r11, r11, 0x00005555, 1)
eor r10, r10, r6 //add 1st keyword
eor r11, r7, r11, ror #16 //add 2nd keyword
eor r9, r9, r5 //add rconst
ldr.w r5, [r0], #4
ldr.w r6, [r1], #4 //load rkey
ldr.w r7, [r1], #4 //load rkey
and r8, r11, r12, ror #16 //sbox layer
eor r10, r10, r8
and r8, r10, r9
eor r12, r8, r12, ror #16
orr r8, r12, r10
eor r11, r11, r8
eor r9, r9, r11
eor r10, r10, r9
and r8, r12, r10
eor r11, r11, r8
mvn r9, r9
eor r14, r3, r3, lsl #8 //r14 <- 0x0f0f0f0f for BYTE_ROR
and r8, r14, r10, lsr #4
and r10, r10, r14
orr r10, r8, r10, lsl #4 //r10<- BYTE_ROR(r10, 4)
orr r14, r14, r14, lsl #2 //r14 <- 0x3f3f3f3f for BYTE_ROR
mvn r8, r14 //r8 <- 0xc0c0c0c0 for BYTE_ROR
and r8, r8, r11, lsl #6
and r11, r14, r11, lsr #2
orr r11, r11, r8 //r11<- BYTE_ROR(r11, 2)
mvn r8, r14, lsr #6
and r8, r8, r9, lsr #6
and r9, r14, r9
orr r9, r8, r9, lsl #2 //r9 <- BYTE_ROR(r9, 6)
eor r10, r10, r6 //add 1st keyword
eor r11, r11, r7 //add 2nd keyword
eor r12, r12, r5 //add rconst
ldr.w r5, [r0], #4
ldr.w r6, [r1], #4 //load rkey
ldr.w r7, [r1], #4 //load rkey
ldr.w lr, [sp] //restore link register
and r8, r9, r11 //sbox layer
eor r10, r10, r8
and r8, r10, r12
eor r9, r9, r8
orr r8, r9, r10
eor r11, r11, r8
eor r12, r12, r11
eor r10, r10, r12
and r8, r9, r10
eor r11, r11, r8
mvn r12, r12, ror #24
eor r10, r6, r10, ror #16 //add 1st keyword
eor r11, r7, r11, ror #8 //add 2nd keyword
eor r9, r9, r5 //add rconst
eor r9, r9, r12 //swap r9 with r12
eor r12, r12, r9 //swap r9 with r12
eor r9, r9, r12 //swap r9 with r12
bx lr
/*****************************************************************************
* Balanced ARM assembly implementation of the GIFTb-128 block cipher.
* This function simply encrypts a 128-bit block, without any operation mode.
*****************************************************************************/
.align 2
@ void giftb128_encrypt_block(u8 *out, const u32* rkey, const u8 *block)
.global giftb128_encrypt_block
.type giftb128_encrypt_block,%function
@ Encrypts one 16-byte block with GIFTb-128: 40 rounds executed as eight
@ calls to 'quintuple_round'. The 'rev' pairs byte-swap the words on load
@ and store to match the representation used by the round function.
giftb128_encrypt_block:
push {r0,r2-r12,r14}
sub.w sp, #4 //to store 'lr' when calling 'quintuple_round'
ldm r2, {r9-r12} // load plaintext words
rev r9, r9
rev r10, r10
rev r11, r11
rev r12, r12
movw r2, #0x1111
movt r2, #0x1111 //r2 <- 0x11111111 (for NIBBLE_ROR)
movw r3, #0x000f
movt r3, #0x000f //r3 <- 0x000f000f (for HALF_ROR)
mvn r4, r2, lsl #3 //r4 <- 0x77777777 (for NIBBLE_ROR)
adr r0, rconst //r0 <- 'rconst' address
bl quintuple_round
bl quintuple_round
bl quintuple_round
bl quintuple_round
bl quintuple_round
bl quintuple_round
bl quintuple_round
bl quintuple_round
ldr.w r0, [sp ,#4] //restore 'ctext' address
rev r9, r9
rev r10, r10
rev r11, r11
rev r12, r12
stm r0, {r9-r12}
add.w sp, #4
pop {r0,r2-r12,r14}
bx lr
\ No newline at end of file
#define CRYPTO_KEYBYTES 16
#define CRYPTO_NSECBYTES 0
#define CRYPTO_NPUBBYTES 16
#define CRYPTO_ABYTES 16
#define CRYPTO_NOOVERLAP 1
#ifndef GIFT_COFB_H_
#define GIFT_COFB_H_

/* GIFT-COFB mode parameters. */
#define TAG_SIZE     16 /* authentication tag length in bytes */
#define COFB_ENCRYPT 1  /* 'encrypting' argument of giftcofb_crypt() */
#define COFB_DECRYPT 0

/*
 * XOR two 128-bit blocks (four 32-bit words): x = y ^ z.
 * Written as do/while(0) instead of a GCC statement expression so the
 * macro is portable ISO C and still behaves as a single statement.
 */
#define XOR_BLOCK(x, y, z) do { \
    (x)[0] = (y)[0] ^ (z)[0]; \
    (x)[1] = (y)[1] ^ (z)[1]; \
    (x)[2] = (y)[2] ^ (z)[2]; \
    (x)[3] = (y)[3] ^ (z)[3]; \
} while (0)

/* XOR a 64-bit half-block (two words) into the top half of block x. */
#define XOR_TOP_BAR_BLOCK(x, y) do { \
    (x)[0] ^= (y)[0]; \
    (x)[1] ^= (y)[1]; \
} while (0)

#endif // GIFT_COFB_H_
\ No newline at end of file
#include <string.h>
#include "api.h"
#include "cofb.h"
#include "giftb128.h"
/*
 * Write the 16-byte padded block built from the first 'no_of_bytes' bytes
 * of 's' into 'd'. A full block is copied verbatim; a partial (possibly
 * empty) block is 10*-padded: data, then a 0x80 byte, then zeros. The
 * byte layout matches the original word-masking version on the
 * little-endian targets this code assumes.
 *
 * Unlike the word-wise version, this reads exactly 'no_of_bytes' bytes
 * from 's', so no out-of-bounds read happens on the final partial word.
 */
static inline void padding(u32* d, const u32* s, const u32 no_of_bytes){
    if (no_of_bytes >= GIFT128_BLOCK_SIZE) { /* full block: plain copy */
        d[0] = s[0];
        d[1] = s[1];
        d[2] = s[2];
        d[3] = s[3];
    }
    else { /* partial block: data || 0x80 || 0x00... */
        u8 *db = (u8 *)d; /* byte access through char type: no aliasing issue */
        memset(db, 0x00, GIFT128_BLOCK_SIZE);
        memcpy(db, s, no_of_bytes);
        db[no_of_bytes] = 0x80;
    }
}
static inline void double_half_block(u32* x) {
u32 tmp0;
tmp0 = (x)[0];
(x)[0] = (((x)[0] & 0x7f7f7f7f) << 1) | (((x)[0] & 0x80808080) >> 15);
(x)[0] |= ((x)[1] & 0x80808080) << 17;
(x)[1] = (((x)[1] & 0x7f7f7f7f) << 1) | (((x)[1] & 0x80808080) >> 15);
(x)[1] ^= (((tmp0 >> 7) & 1) * 27) << 24;
}
static inline void triple_half_block(u32* x) {
u32 tmp0, tmp1;
tmp0 = (x)[0];
tmp1 = (x)[1];
(x)[0] = (((x)[0] & 0x7f7f7f7f) << 1) | (((x)[0] & 0x80808080) >> 15);
(x)[0] |= ((x)[1] & 0x80808080) << 17;
(x)[1] = (((x)[1] & 0x7f7f7f7f) << 1) | (((x)[1] & 0x80808080) >> 15);
(x)[1] ^= (((tmp0 >> 7) & 1) * 27) << 24;
(x)[0] ^= tmp0;
(x)[1] ^= tmp1;
}
static inline void g(u32 *x) {
u32 tmp0, tmp1;
tmp0 = (x)[0];
tmp1 = (x)[1];
(x)[0] = (x)[2];
(x)[1] = (x)[3];
(x)[2] = ((tmp0 & 0x7f7f7f7f) << 1) | ((tmp0 & 0x80808080) >> 15);
(x)[2] |= ((tmp1 & 0x80808080) << 17);
(x)[3] = ((tmp1 & 0x7f7f7f7f) << 1) | ((tmp1 & 0x80808080) >> 15);
(x)[3] |= ((tmp0 & 0x80808080) << 17);
}
static inline void rho1(u32* d, u32* y, u32* m, u32 n) {
g(y);
padding(d,m,n);
XOR_BLOCK(d, d, y);
}
static inline void rho(u32* y, u32* m, u32* x, u32* c, u32 n) {
XOR_BLOCK(c, y, m);
rho1(x, y, m, n);
}
static inline void rho_prime(u32* y, u32*c, u32* x, u32* m, u32 n) {
XOR_BLOCK(m, y, c);
rho1(x, y, m, n);
}
/****************************************************************************
* Constant-time implementation of the GIFT-COFB authenticated cipher based on
* fixsliced GIFTb-128. Encryption/decryption is handled by the same function,
* depending on the 'mode' parameter (1/0).
****************************************************************************/
int giftcofb_crypt(u8* out, const u8* key, const u8* nonce, const u8* ad,
u32 ad_len, const u8* in, u32 in_len, const int encrypting) {
u32 tmp0, tmp1, emptyA, emptyM, offset[2];
u32 input[4], rkey[80];
u8 Y[GIFT128_BLOCK_SIZE];
if (!encrypting) {
if (in_len < TAG_SIZE)
return -1;
in_len -= TAG_SIZE;
}
if(ad_len == 0)
emptyA = 1;
else
emptyA = 0;
if(in_len == 0)
emptyM =1;
else
emptyM = 0;
gift128_keyschedule(key, rkey);
giftb128_encrypt_block(Y, rkey, nonce);
offset[0] = ((u32*)Y)[0];
offset[1] = ((u32*)Y)[1];
while(ad_len > GIFT128_BLOCK_SIZE){
rho1(input, (u32*)Y, (u32*)ad, GIFT128_BLOCK_SIZE);
double_half_block(offset);
XOR_TOP_BAR_BLOCK(input, offset);
giftb128_encrypt_block(Y, rkey, (u8*)input);
ad += GIFT128_BLOCK_SIZE;
ad_len -= GIFT128_BLOCK_SIZE;
}
triple_half_block(offset);
if((ad_len % GIFT128_BLOCK_SIZE != 0) || (emptyA))
triple_half_block(offset);
if(emptyM) {
triple_half_block(offset);
triple_half_block(offset);
}
rho1(input, (u32*)Y, (u32*)ad, ad_len);
XOR_TOP_BAR_BLOCK(input, offset);
giftb128_encrypt_block(Y, rkey, (u8*)input);
while (in_len > GIFT128_BLOCK_SIZE){
double_half_block(offset);
if (encrypting)
rho((u32*)Y, (u32*)in, input, (u32*)out, GIFT128_BLOCK_SIZE);
else
rho_prime((u32*)Y, (u32*)in, input, (u32*)out, GIFT128_BLOCK_SIZE);
XOR_TOP_BAR_BLOCK(input, offset);
giftb128_encrypt_block(Y, rkey, (u8*)input);
in += GIFT128_BLOCK_SIZE;
out += GIFT128_BLOCK_SIZE;
in_len -= GIFT128_BLOCK_SIZE;
}
if(!emptyM){
triple_half_block(offset);
if(in_len % GIFT128_BLOCK_SIZE != 0)
triple_half_block(offset);
if (encrypting) {
rho((u32*)Y, (u32*)in, input, (u32*)out, in_len);
out += in_len;
}
else {
rho_prime((u32*)Y, (u32*)in, input, (u32*)out, in_len);
in += in_len;
}
XOR_TOP_BAR_BLOCK(input, offset);
giftb128_encrypt_block(Y, rkey, (u8*)input);
}
if (encrypting) { // encryption mode
memcpy(out, Y, TAG_SIZE);
return 0;
}
// decrypting
tmp0 = 0;
for(tmp1 = 0; tmp1 < TAG_SIZE; tmp1++)
tmp0 |= in[tmp1] ^ Y[tmp1];
return tmp0;
}
int crypto_aead_encrypt(unsigned char* c, unsigned long long* clen,
const unsigned char* m, unsigned long long mlen,
const unsigned char* ad, unsigned long long adlen,
const unsigned char* nsec, const unsigned char* npub,
const unsigned char* k) {
(void)nsec;
*clen = mlen + TAG_SIZE;
return giftcofb_crypt(c, k, npub, ad, adlen, m, mlen, COFB_ENCRYPT);
}
/*
 * NIST LWC AEAD decryption wrapper around giftcofb_crypt().
 * Returns 0 on success, -1 on failure (input shorter than the tag, or
 * tag mismatch). *mlen is written only on success, so it can no longer
 * underflow to a huge value when clen < TAG_SIZE, and it is never set
 * on authentication failure.
 */
int crypto_aead_decrypt(unsigned char* m, unsigned long long *mlen,
    unsigned char* nsec, const unsigned char* c,
    unsigned long long clen, const unsigned char* ad,
    unsigned long long adlen, const unsigned char* npub,
    const unsigned char *k) {
    (void)nsec;
    if (clen < TAG_SIZE)
        return -1; // too short to even contain the tag
    if (giftcofb_crypt(m, k, npub, ad, adlen, c, clen, COFB_DECRYPT) != 0)
        return -1; // authentication failed
    *mlen = clen - TAG_SIZE;
    return 0;
}
#ifndef GIFT128_H_
#define GIFT128_H_
// GIFT-128 key length in bytes (128-bit key)
#define KEY_SIZE 16
// GIFT-128 block length in bytes (128-bit block)
#define GIFT128_BLOCK_SIZE 16
typedef unsigned char u8;
typedef unsigned int u32; // assumes 'int' is 32 bits on the target -- TODO confirm
// Expands the 16-byte key into the round-key array (80 words, see rkey[80]
// in the callers); implemented in assembly.
extern void gift128_keyschedule(const u8* key, u32* rkey);
// Encrypts one 16-byte block with GIFTb-128 using precomputed round keys;
// implemented in assembly.
extern void giftb128_encrypt_block(u8* out_block, const u32* rkey, const u8* in_block);
#endif // GIFT128_H_
\ No newline at end of file
/****************************************************************************
* Compact ARM assembly implementation of the GIFT-128 block cipher. This
* implementation focuses on code size rather than speed.
* See "Fixslicing: A New GIFT Representation" paper available at
* https://eprint.iacr.org/2020/412 for more details.
****************************************************************************/
.syntax unified
.thumb
/*****************************************************************************
* Round constants look-up table according to the fixsliced representation.
*****************************************************************************/
.align 2
.type rconst,%object
// 40 32-bit round constants (8 quintuple rounds x 5 rounds), one word
// consumed per round by 'quintuple_round'
rconst:
.word 0x10000008, 0x80018000, 0x54000002, 0x01010181
.word 0x8000001f, 0x10888880, 0x6001e000, 0x51500002
.word 0x03030180, 0x8000002f, 0x10088880, 0x60016000
.word 0x41500002, 0x03030080, 0x80000027, 0x10008880
.word 0x4001e000, 0x11500002, 0x03020180, 0x8000002b
.word 0x10080880, 0x60014000, 0x01400002, 0x02020080
.word 0x80000021, 0x10000080, 0x0001c000, 0x51000002
.word 0x03010180, 0x8000002e, 0x10088800, 0x60012000
.word 0x40500002, 0x01030080, 0x80000006, 0x10008808
.word 0xc001a000, 0x14500002, 0x01020181, 0x8000001a
.align 2
/* Performs four successive GIFT-128 key-state updates (key state in r4-r7)
 * in the classical representation and stores the two freshly updated words
 * to the round-key array after each update ([r1] advances by 8 per store).
 * Caller must preset the masks r9 <- 0x00000fff, r10 <- 0x0000000f and
 * r12 <- 0x3fff0000 (used to build the 16-bit rotations of the update);
 * clobbers r2-r3. */
key_update:
and r2, r10, r7, lsr #12
and r3, r7, r9
orr r2, r2, r3, lsl #4
and r3, r12, r7, lsr #2
orr r2, r2, r3
and r7, r7, #0x00030000
orr r7, r2, r7, lsl #14
strd r5, r7, [r1], #8 //store rkeys after 1st key update
and r2, r10, r6, lsr #12
and r3, r6, r9
orr r2, r2, r3, lsl #4
and r3, r12, r6, lsr #2
orr r2, r2, r3
and r6, r6, #0x00030000
orr r6, r2, r6, lsl #14
strd r4, r6, [r1], #8 //store rkeys after 2nd key update
and r2, r10, r5, lsr #12
and r3, r5, r9
orr r2, r2, r3, lsl #4
and r3, r12, r5, lsr #2
orr r2, r2, r3
and r5, r5, #0x00030000
orr r5, r2, r5, lsl #14
strd r7, r5, [r1], #8 //store rkeys after 3rd key update
and r2, r10, r4, lsr #12
and r3, r4, r9
orr r2, r2, r3, lsl #4
and r3, r12, r4, lsr #2
orr r2, r2, r3
and r4, r4, #0x00030000
orr r4, r2, r4, lsl #14
strd r6, r4, [r1], #8 //store rkeys after 4th key update
bx lr
.align 2
/* Rearranges the pair of round-key words at [r1] into the fixsliced
 * representation used for the 1st round of each group of 4, as a sequence
 * of SWAPMOVE operations. Caller must preset the masks r3 <- 0x00550055,
 * r10 <- 0x00003333, r11 <- 0x000f000f; clobbers r4, r6, r12. */
rearrange_rkey_0:
ldrd r6, r4, [r1]
eor r12, r6, r6, lsr #9
and r12, r12, r3
eor r6, r12
eor r6, r6, r12, lsl #9 //SWAPMOVE(r6, r6, 0x00550055, 9);
eor r12, r4, r4, lsr #9
and r12, r12, r3
eor r4, r12
eor r4, r4, r12, lsl #9 //SWAPMOVE(r4, r4, 0x00550055, 9);
eor r12, r6, r6, lsr #18
and r12, r12, r10
eor r6, r12
eor r6, r6, r12, lsl #18 //SWAPMOVE(r6, r6, 0x3333, 18);
eor r12, r4, r4, lsr #18
and r12, r12, r10
eor r4, r12
eor r4, r4, r12, lsl #18 //SWAPMOVE(r4, r4, 0x3333, 18);
eor r12, r6, r6, lsr #12
and r12, r12, r11
eor r6, r12
eor r6, r6, r12, lsl #12 //SWAPMOVE(r6, r6, 0x000f000f, 12);
eor r12, r4, r4, lsr #12
and r12, r12, r11
eor r4, r12
eor r4, r4, r12, lsl #12 //SWAPMOVE(r4, r4, 0x000f000f, 12);
eor r12, r6, r6, lsr #24
and r12, r12, #0xff
eor r6, r12
eor r6, r6, r12, lsl #24 //SWAPMOVE(r6, r6, 0x000000ff, 24);
eor r12, r4, r4, lsr #24
and r12, r12, #0xff
eor r4, r12
eor r4, r4, r12, lsl #24 //SWAPMOVE(r4, r4, 0x000000ff, 24);
strd r6, r4, [r1]
bx lr
.align 2
/* Rearranges the pair of round-key words at [r1] into the fixsliced
 * representation used for the 2nd round of each group of 4, as a sequence
 * of SWAPMOVE operations. Caller must preset the masks r3 <- 0x11111111,
 * r10 <- 0x03030303, r11 <- 0x000f000f; clobbers r5, r7, r8. */
rearrange_rkey_1:
ldrd r5, r7, [r1]
eor r8, r7, r7, lsr #3
and r8, r8, r3
eor r7, r8
eor r7, r7, r8, lsl #3 //SWAPMOVE(r7, r7, 0x11111111, 3);
eor r8, r5, r5, lsr #3
and r8, r8, r3
eor r5, r8
eor r5, r5, r8, lsl #3 //SWAPMOVE(r5, r5, 0x11111111, 3);
eor r8, r7, r7, lsr #6
and r8, r8, r10
eor r7, r8
eor r7, r7, r8, lsl #6 //SWAPMOVE(r7, r7, 0x03030303, 6);
eor r8, r5, r5, lsr #6
and r8, r8, r10
eor r5, r8
eor r5, r5, r8, lsl #6 //SWAPMOVE(r5, r5, 0x03030303, 6);
eor r8, r7, r7, lsr #12
and r8, r8, r11
eor r7, r8
eor r7, r7, r8, lsl #12 //SWAPMOVE(r7, r7, 0x000f000f, 12);
eor r8, r5, r5, lsr #12
and r8, r8, r11
eor r5, r8
eor r5, r5, r8, lsl #12 //SWAPMOVE(r5, r5, 0x000f000f, 12);
eor r8, r7, r7, lsr #24
and r8, r8, #0xff
eor r7, r8
eor r7, r7, r8, lsl #24 //SWAPMOVE(r7, r7, 0x000000ff, 24);
eor r8, r5, r5, lsr #24
and r8, r8, #0xff
eor r5, r8
eor r5, r5, r8, lsl #24 //SWAPMOVE(r5, r5, 0x000000ff, 24);
strd r5, r7, [r1]
bx lr
.align 2
/* Rearranges the pair of round-key words at [r1] into the fixsliced
 * representation used for the 3rd round of each group of 4, as a sequence
 * of SWAPMOVE operations. Caller must preset the masks r3 <- 0x0000aaaa,
 * r10 <- 0x00003333, r11 <- 0x0000f0f0; clobbers r5, r7, r8. */
rearrange_rkey_2:
ldrd r5, r7, [r1]
eor r8, r7, r7, lsr #15
and r8, r8, r3
eor r7, r8
eor r7, r7, r8, lsl #15 //SWAPMOVE(r7, r7, 0x0000aaaa, 15);
eor r8, r5, r5, lsr #15
and r8, r8, r3
eor r5, r8
eor r5, r5, r8, lsl #15 //SWAPMOVE(r5, r5, 0x0000aaaa, 15);
eor r8, r7, r7, lsr #18
and r8, r8, r10
eor r7, r8
eor r7, r7, r8, lsl #18 //SWAPMOVE(r7, r7, 0x00003333, 18);
eor r8, r5, r5, lsr #18
and r8, r8, r10
eor r5, r8
eor r5, r5, r8, lsl #18 //SWAPMOVE(r5, r5, 0x00003333, 18);
eor r8, r7, r7, lsr #12
and r8, r8, r11
eor r7, r8
eor r7, r7, r8, lsl #12 //SWAPMOVE(r7, r7, 0x0000f0f0, 12); (r11 = 0x0000f0f0 here)
eor r8, r5, r5, lsr #12
and r8, r8, r11
eor r5, r8
eor r5, r5, r8, lsl #12 //SWAPMOVE(r5, r5, 0x0000f0f0, 12);
eor r8, r7, r7, lsr #24
and r8, r8, #0xff
eor r7, r8
eor r7, r7, r8, lsl #24 //SWAPMOVE(r7, r7, 0x000000ff, 24);
eor r8, r5, r5, lsr #24
and r8, r8, #0xff
eor r5, r8
eor r5, r5, r8, lsl #24 //SWAPMOVE(r5, r5, 0x000000ff, 24);
strd r5, r7, [r1]
bx lr
.align 2
/* Rearranges the pair of round-key words at [r1] into the fixsliced
 * representation used for the 4th round of each group of 4, as a sequence
 * of SWAPMOVE operations. Caller must preset r3 <- 0x0a0a0a0a and
 * r10 <- 0x00cc00cc; r11 is expected to still hold 0x0000f0f0 from the
 * rearrange_rkey_2 setup (the keyschedule does not reload it).
 * Clobbers r5, r7, r8. */
rearrange_rkey_3:
ldrd r5, r7, [r1]
eor r8, r7, r7, lsr #3
and r8, r8, r3
eor r7, r8
eor r7, r7, r8, lsl #3 //SWAPMOVE(r7, r7, 0x0a0a0a0a, 3);
eor r8, r5, r5, lsr #3
and r8, r8, r3
eor r5, r8
eor r5, r5, r8, lsl #3 //SWAPMOVE(r5, r5, 0x0a0a0a0a, 3);
eor r8, r7, r7, lsr #6
and r8, r8, r10
eor r7, r8
eor r7, r7, r8, lsl #6 //SWAPMOVE(r7, r7, 0x00cc00cc, 6);
eor r8, r5, r5, lsr #6
and r8, r8, r10
eor r5, r8
eor r5, r5, r8, lsl #6 //SWAPMOVE(r5, r5, 0x00cc00cc, 6);
eor r8, r7, r7, lsr #12
and r8, r8, r11
eor r7, r8
eor r7, r7, r8, lsl #12 //SWAPMOVE(r7, r7, 0x0000f0f0, 12); (r11 = 0x0000f0f0 here)
eor r8, r5, r5, lsr #12
and r8, r8, r11
eor r5, r8
eor r5, r5, r8, lsl #12 //SWAPMOVE(r5, r5, 0x0000f0f0, 12);
eor r8, r7, r7, lsr #24
and r8, r8, #0xff
eor r7, r8
eor r7, r7, r8, lsl #24 //SWAPMOVE(r7, r7, 0x000000ff, 24);
eor r8, r5, r5, lsr #24
and r8, r8, #0xff
eor r5, r8
eor r5, r5, r8, lsl #24 //SWAPMOVE(r5, r5, 0x000000ff, 24);
strd r5, r7, [r1]
bx lr
/*****************************************************************************
* Code size optimized implementation of the GIFTb-128 key schedule.
* Compute the key schedule in the normal representation and then rearrange all
* the round keys in their respective fixsliced representations.
*****************************************************************************/
.align 2
@ void gift128_keyschedule(const u8* key, u32* rkey)
.global gift128_keyschedule
.type gift128_keyschedule,%function
gift128_keyschedule:
push {r1-r12, r14}
ldm r0, {r4-r7} //load key words
rev r4, r4
rev r5, r5
rev r6, r6
rev r7, r7
strd r7, r5, [r1], #8 //the first rkeys are not updated
strd r6, r4, [r1], #8 //the first rkeys are not updated
// keyschedule using classical representation for the first 20 rounds
movw r12, #0x3fff
lsl r12, r12, #16 //r12<- 0x3fff0000
movw r10, #0x000f //r10<- 0x0000000f
movw r9, #0x0fff //r9 <- 0x00000fff
// 9 calls x 4 updates each -> round keys for rounds 2 to 37
bl key_update
bl key_update
bl key_update
bl key_update
bl key_update
bl key_update
bl key_update
bl key_update
bl key_update
// the last two updates are done inline (rounds 38 and 39)
and r2, r10, r7, lsr #12
and r3, r7, r9
orr r2, r2, r3, lsl #4
and r3, r12, r7, lsr #2
orr r2, r2, r3
and r7, r7, #0x00030000
orr r7, r2, r7, lsl #14
strd r5, r7, [r1], #8 //penultimate key update
and r2, r10, r6, lsr #12
and r3, r6, r9
orr r2, r2, r3, lsl #4
and r3, r12, r6, lsr #2
orr r2, r2, r3
and r6, r6, #0x00030000
orr r6, r2, r6, lsl #14
strd r4, r6, [r1], #8 //ultimate key update
sub.w r1, r1, #320 //rewind r1 to rkey[0] (80 words = 320 bytes)
// rearrange the rkeys to their respective new representations
// (8 calls per representation, stepping 40 bytes = 5 rounds each time)
movw r3, #0x0055
movt r3, #0x0055 //r3 <- 0x00550055
movw r10, #0x3333 //r10<- 0x00003333
movw r11, #0x000f
movt r11, #0x000f //r11<- 0x000f000f
bl rearrange_rkey_0
add.w r1, r1, #40 //next rkey pair in this representation (5 rounds apart)
bl rearrange_rkey_0
add.w r1, r1, #40
bl rearrange_rkey_0
add.w r1, r1, #40
bl rearrange_rkey_0
add.w r1, r1, #40
bl rearrange_rkey_0
add.w r1, r1, #40
bl rearrange_rkey_0
add.w r1, r1, #40
bl rearrange_rkey_0
add.w r1, r1, #40
bl rearrange_rkey_0
sub.w r1, r1, #272 //r1 <- rkey + 8 (1st pair handled by rearrange_rkey_1)
movw r3, #0x1111
movt r3, #0x1111 //r3 <- 0x11111111
movw r10, #0x0303
movt r10, #0x0303 //r10<- 0x03030303
bl rearrange_rkey_1
add.w r1, r1, #40
bl rearrange_rkey_1
add.w r1, r1, #40
bl rearrange_rkey_1
add.w r1, r1, #40
bl rearrange_rkey_1
add.w r1, r1, #40
bl rearrange_rkey_1
add.w r1, r1, #40
bl rearrange_rkey_1
add.w r1, r1, #40
bl rearrange_rkey_1
add.w r1, r1, #40
bl rearrange_rkey_1
sub.w r1, r1, #272 //r1 <- rkey + 16 (1st pair handled by rearrange_rkey_2)
movw r3, #0xaaaa //r3 <- 0x0000aaaa
movw r10, #0x3333 //r10<- 0x00003333
movw r11, #0xf0f0 //r11<- 0x0000f0f0
bl rearrange_rkey_2
add.w r1, r1, #40
bl rearrange_rkey_2
add.w r1, r1, #40
bl rearrange_rkey_2
add.w r1, r1, #40
bl rearrange_rkey_2
add.w r1, r1, #40
bl rearrange_rkey_2
add.w r1, r1, #40
bl rearrange_rkey_2
add.w r1, r1, #40
bl rearrange_rkey_2
add.w r1, r1, #40
bl rearrange_rkey_2
sub.w r1, r1, #272 //r1 <- rkey + 24 (1st pair handled by rearrange_rkey_3)
movw r3, #0x0a0a
movt r3, #0x0a0a //r3 <- 0x0a0a0a0a
movw r10, #0x00cc
movt r10, #0x00cc //r10<- 0x00cc00cc
bl rearrange_rkey_3
add.w r1, r1, #40
bl rearrange_rkey_3
add.w r1, r1, #40
bl rearrange_rkey_3
add.w r1, r1, #40
bl rearrange_rkey_3
add.w r1, r1, #40
bl rearrange_rkey_3
add.w r1, r1, #40
bl rearrange_rkey_3
add.w r1, r1, #40
bl rearrange_rkey_3
add.w r1, r1, #40
bl rearrange_rkey_3
pop {r1-r12, r14}
bx lr
.align 2
/* Applies 5 consecutive GIFT-128 rounds in fixsliced representation.
 * Cipher state in r9-r12; r0 points to the round constants and r1 to the
 * round keys (both advanced by 5 entries / 5 key pairs on return).
 * Caller must preset r2 <- 0x11111111, r3 <- 0x000f000f, r4 <- 0x77777777.
 * r14 is used as a scratch register, so lr is saved to [sp] on entry and
 * reloaded before the final round. Clobbers r5-r8, r14. */
quintuple_round:
str.w r14, [sp]
ldr.w r5, [r0], #4
ldr.w r6, [r1], #4 //load rkey
ldr.w r7, [r1], #4 //load rkey
and r8, r11, r9 //sbox layer
eor r10, r10, r8
and r8, r10, r12
eor r9, r9, r8
orr r8, r9, r10
eor r11, r11, r8
eor r12, r12, r11
eor r10, r10, r12
and r8, r9, r10
eor r11, r11, r8
mvn r12, r12
and r8, r4, r12, lsr #1 //permutation layer
and r12, r12, r2
orr r12, r8, r12, lsl #3 //r12<- NIBBLE_ROR(r12, 1)
and r8, r4, r11
and r11, r2, r11, lsr #3
orr r11, r11, r8, lsl #1 //r11<- NIBBLE_ROR(r11, 3)
orr r14, r2, r2, lsl #1 //r14 <- 0x33333333
and r8, r14, r10, lsr #2
and r10, r10, r14
orr r10, r8, r10, lsl #2 //r10<- NIBBLE_ROR(r10, 2)
eor r10, r10, r6 //add 1st keyword
eor r11, r11, r7 //add 2nd keyword
eor r9, r9, r5 //add rconst
ldr.w r5, [r0], #4
ldr.w r6, [r1], #4 //load rkey
ldr.w r7, [r1], #4 //load rkey
and r8, r12, r11 //sbox layer
eor r10, r10, r8
and r8, r10, r9
eor r12, r12, r8
orr r8, r12, r10
eor r11, r11, r8
eor r9, r9, r11
eor r10, r10, r9
and r8, r12, r10
eor r11, r11, r8
mvn r9, r9
mvn r14, r3, lsl #12 //r14<- 0x0fff0fff
and r8, r14, r9, lsr #4
and r9, r9, r3
orr r9, r8, r9, lsl #12 //r9 <- HALF_ROR(r9, 4)
and r8, r3, r11, lsr #12
and r11, r11, r14
orr r11, r8, r11, lsl #4 //r11<- HALF_ROR(r11, 12)
rev16 r10, r10 //r10<- HALF_ROR(r10, 8)
eor r10, r10, r6 //add 1st keyword
eor r11, r11, r7 //add 2nd keyword
eor r12, r12, r5 //add rconst
ldr.w r5, [r0], #4
ldr.w r6, [r1], #4 //load rkey
ldr.w r7, [r1], #4 //load rkey
and r8, r9, r11 //sbox layer
eor r10, r10, r8
and r8, r10, r12
eor r9, r9, r8
orr r8, r9, r10
eor r11, r11, r8
eor r12, r12, r11
eor r10, r10, r12
and r8, r9, r10
eor r11, r11, r8
mvn r12, r12
orr r14, r2, r2, lsl #2 //r14 <- 0x55555555 for SWAPMOVE
eor r8, r10, r10, lsr #1
and r8, r8, r14
eor r10, r10, r8
eor r10, r10, r8, lsl #1 //SWAPMOVE(r10, r10, 0x55555555, 1)
eor r8, r12, r12, lsr #1
and r8, r8, r14, lsr #16
eor r12, r12, r8
eor r12, r12, r8, lsl #1 //SWAPMOVE(r12, r12, 0x55550000, 1)
eor r8, r11, r11, lsr #1
and r8, r8, r14, lsl #16
eor r11, r11, r8
eor r11, r11, r8, lsl #1 //SWAPMOVE(r11, r11, 0x00005555, 1)
eor r10, r10, r6 //add 1st keyword
eor r11, r7, r11, ror #16 //add 2nd keyword
eor r9, r9, r5 //add rconst
ldr.w r5, [r0], #4
ldr.w r6, [r1], #4 //load rkey
ldr.w r7, [r1], #4 //load rkey
and r8, r11, r12, ror #16 //sbox layer
eor r10, r10, r8
and r8, r10, r9
eor r12, r8, r12, ror #16
orr r8, r12, r10
eor r11, r11, r8
eor r9, r9, r11
eor r10, r10, r9
and r8, r12, r10
eor r11, r11, r8
mvn r9, r9
eor r14, r3, r3, lsl #8 //r14 <- 0x0f0f0f0f for BYTE_ROR
and r8, r14, r10, lsr #4
and r10, r10, r14
orr r10, r8, r10, lsl #4 //r10<- BYTE_ROR(r10, 4)
orr r14, r14, r14, lsl #2 //r14 <- 0x3f3f3f3f for BYTE_ROR
mvn r8, r14 //r8 <- 0xc0c0c0c0 for BYTE_ROR
and r8, r8, r11, lsl #6
and r11, r14, r11, lsr #2
orr r11, r11, r8 //r11<- BYTE_ROR(r11, 2)
mvn r8, r14, lsr #6
and r8, r8, r9, lsr #6
and r9, r14, r9
orr r9, r8, r9, lsl #2 //r9 <- BYTE_ROR(r9, 6)
eor r10, r10, r6 //add 1st keyword
eor r11, r11, r7 //add 2nd keyword
eor r12, r12, r5 //add rconst
ldr.w r5, [r0], #4
ldr.w r6, [r1], #4 //load rkey
ldr.w r7, [r1], #4 //load rkey
ldr.w lr, [sp] //restore link register
and r8, r9, r11 //sbox layer
eor r10, r10, r8
and r8, r10, r12
eor r9, r9, r8
orr r8, r9, r10
eor r11, r11, r8
eor r12, r12, r11
eor r10, r10, r12
and r8, r9, r10
eor r11, r11, r8
mvn r12, r12, ror #24
eor r10, r6, r10, ror #16 //add 1st keyword
eor r11, r7, r11, ror #8 //add 2nd keyword
eor r9, r9, r5 //add rconst
eor r9, r9, r12 //swap r9 with r12
eor r12, r12, r9 //swap r9 with r12
eor r9, r9, r12 //swap r9 with r12
bx lr
/*****************************************************************************
* Code size optimized implementation of the GIFTb-128 block cipher.
* This function simply encrypts a 128-bit block, without any operation mode.
*****************************************************************************/
.align 2
@ void giftb128_encrypt_block(u8 *out, const u32* rkey, const u8 *block)
.global giftb128_encrypt_block
.type giftb128_encrypt_block,%function
giftb128_encrypt_block:
push {r0,r2-r12,r14} //r0 ('out') is saved so it can be reloaded after the rounds
sub.w sp, #4 //to store 'lr' when calling 'quintuple_round'
ldm r2, {r9-r12} // load plaintext words
rev r9, r9 //byte-swap each 32-bit state word
rev r10, r10
rev r11, r11
rev r12, r12
movw r2, #0x1111
movt r2, #0x1111 //r2 <- 0x11111111 (for NIBBLE_ROR)
movw r3, #0x000f
movt r3, #0x000f //r3 <- 0x000f000f (for HALF_ROR)
mvn r4, r2, lsl #3 //r4 <- 0x77777777 (for NIBBLE_ROR)
adr r0, rconst //r0 <- 'rconst' address
// 8 quintuple rounds = 40 rounds in total
bl quintuple_round
bl quintuple_round
bl quintuple_round
bl quintuple_round
bl quintuple_round
bl quintuple_round
bl quintuple_round
bl quintuple_round
ldr.w r0, [sp ,#4] //restore 'ctext' address
rev r9, r9 //byte-swap back before storing
rev r10, r10
rev r11, r11
rev r12, r12
stm r0, {r9-r12}
add.w sp, #4
pop {r0,r2-r12,r14}
bx lr
\ No newline at end of file
// NIST LWC API parameters for GIFT-COFB
#define CRYPTO_KEYBYTES 16 // 128-bit key
#define CRYPTO_NSECBYTES 0 // no secret message number
#define CRYPTO_NPUBBYTES 16 // 128-bit public nonce
#define CRYPTO_ABYTES 16 // 128-bit authentication tag
#define CRYPTO_NOOVERLAP 1
#ifndef GIFT_COFB_H_
#define GIFT_COFB_H_
// Authentication tag length in bytes
#define TAG_SIZE 16
// Values for the 'encrypting' parameter of giftcofb_crypt
#define COFB_ENCRYPT 1
#define COFB_DECRYPT 0
// NOTE: the macros below are GNU statement expressions; they use the
// caller-declared scratch variables tmp0/tmp1 and operate on arrays of
// 32-bit words holding the block in little-endian byte order.
// Doubling (multiplication by 2) of the 64-bit half-block x[0..1]; the
// masks/shifts implement the bit-shift across the little-endian byte
// layout, with reduction constant 27 folded into the last byte.
#define DOUBLE_HALF_BLOCK(x) ({ \
tmp0 = (x)[0]; \
(x)[0] = (((x)[0] & 0x7f7f7f7f) << 1) | (((x)[0] & 0x80808080) >> 15); \
(x)[0] |= ((x)[1] & 0x80808080) << 17; \
(x)[1] = (((x)[1] & 0x7f7f7f7f) << 1) | (((x)[1] & 0x80808080) >> 15); \
(x)[1] ^= (((tmp0 >> 7) & 1) * 27) << 24; \
})
// Tripling of the 64-bit half-block: 3*x = 2*x ^ x
#define TRIPLE_HALF_BLOCK(x) ({ \
tmp0 = (x)[0]; \
tmp1 = (x)[1]; \
(x)[0] = (((x)[0] & 0x7f7f7f7f) << 1) | (((x)[0] & 0x80808080) >> 15); \
(x)[0] |= ((x)[1] & 0x80808080) << 17; \
(x)[1] = (((x)[1] & 0x7f7f7f7f) << 1) | (((x)[1] & 0x80808080) >> 15); \
(x)[1] ^= (((tmp0 >> 7) & 1) * 27) << 24; \
(x)[0] ^= tmp0; \
(x)[1] ^= tmp1; \
})
// COFB feedback function G: swaps the two 64-bit halves of x and replaces
// the (old) first half by its 1-bit left rotation
#define G(x) ({ \
tmp0 = (x)[0]; \
tmp1 = (x)[1]; \
(x)[0] = (x)[2]; \
(x)[1] = (x)[3]; \
(x)[2] = ((tmp0 & 0x7f7f7f7f) << 1) | ((tmp0 & 0x80808080) >> 15); \
(x)[2] |= ((tmp1 & 0x80808080) << 17); \
(x)[3] = ((tmp1 & 0x7f7f7f7f) << 1) | ((tmp1 & 0x80808080) >> 15); \
(x)[3] |= ((tmp0 & 0x80808080) << 17); \
})
// 128-bit XOR: x <- y ^ z
#define XOR_BLOCK(x, y, z) ({ \
(x)[0] = (y)[0] ^ (z)[0]; \
(x)[1] = (y)[1] ^ (z)[1]; \
(x)[2] = (y)[2] ^ (z)[2]; \
(x)[3] = (y)[3] ^ (z)[3]; \
})
// XOR the 64-bit offset into the first half of block x
#define XOR_TOP_BAR_BLOCK(x, y) ({ \
(x)[0] ^= (y)[0]; \
(x)[1] ^= (y)[1]; \
})
// rho1: y <- G(y) (in place), then d <- pad(m, n bytes) ^ y
#define RHO1(d, y, m, n) ({ \
G(y); \
padding(d,m,n); \
XOR_BLOCK(d, d, y); \
})
// rho (encryption): c <- y ^ m, then apply rho1
#define RHO(y, m, x, c, n) ({ \
XOR_BLOCK(c, y, m); \
RHO1(x, y, m, n); \
})
// rho' (decryption): m <- y ^ c, then apply rho1
#define RHO_PRIME(y, c, x, m, n) ({ \
XOR_BLOCK(m, y, c); \
RHO1(x, y, m, n); \
})
#endif // GIFT_COFB_H_
\ No newline at end of file
#include <string.h>
#include "cofb.h"
#include "giftb128.h"
/*
 * Pads a partial input block of 'no_of_bytes' valid bytes into the full
 * 16-byte block d (10* padding in the little-endian word layout):
 * the valid bytes are copied, a single 0x80 byte is appended, and the
 * remainder of the block is zeroed. A full block is copied unchanged.
 */
static inline void padding(u32* d, const u32* s, const u32 no_of_bytes){
    if (no_of_bytes == 0) {
        /* Empty input: the block is just the padding byte. */
        d[0] = 0x00000080; // little-endian
        d[1] = 0x00000000;
        d[2] = 0x00000000;
        d[3] = 0x00000000;
    } else if (no_of_bytes < GIFT128_BLOCK_SIZE) {
        const u32 last  = no_of_bytes / 4;        /* word receiving 0x80  */
        const u32 shift = (no_of_bytes % 4) * 8;  /* bit offset of 0x80   */
        u32 i;
        for (i = 0; i <= last; i++)
            d[i] = s[i];
        /* Clear the bytes past the message and insert the 0x80 marker. */
        d[last] = (d[last] & ~(0xffffffffL << shift)) | (0x00000080L << shift);
        for (i = last + 1; i < 4; i++)
            d[i] = 0x00000000;
    } else {
        /* Full block: plain word-wise copy. */
        d[0] = s[0];
        d[1] = s[1];
        d[2] = s[2];
        d[3] = s[3];
    }
}
/****************************************************************************
* Constant-time implementation of the GIFT-COFB authenticated cipher based on
* fixsliced GIFTb-128. Encryption/decryption is handled by the same function,
* depending on the 'mode' parameter (1/0).
***************************************************************************/
/*
 * Core GIFT-COFB routine shared by encryption and decryption ('encrypting'
 * selects the direction). For encryption, 'in' holds the message and 'out'
 * receives ciphertext||tag; for decryption, 'in' holds ciphertext||tag and
 * 'out' receives the recovered message. Returns 0 on success, -1 if the
 * ciphertext is shorter than the tag, and a nonzero value on tag mismatch.
 */
int giftcofb_crypt(u8* out, const u8* key, const u8* nonce, const u8* ad,
u32 ad_len, const u8* in, u32 in_len, const int encrypting) {
u32 tmp0, tmp1, emptyA, emptyM, offset[2];
u32 input[4], rkey[80];
u8 Y[GIFT128_BLOCK_SIZE];
// When decrypting, 'in' carries the tag after the ciphertext: strip it
if (!encrypting) {
if (in_len < TAG_SIZE)
return -1;
in_len -= TAG_SIZE;
}
// Flags for empty associated data / empty message (drive the offset tweaks)
if(ad_len == 0)
emptyA = 1;
else
emptyA = 0;
if(in_len == 0)
emptyM =1;
else
emptyM = 0;
// Y <- E_K(nonce); the offset is initialised with the first half of Y.
// NOTE(review): the (u32*)Y casts assume u8[16] may be read as u32 words
// (alignment/aliasing) -- inherited from the reference implementation.
gift128_keyschedule(key, rkey);
giftb128_encrypt_block(Y, rkey, nonce);
offset[0] = ((u32*)Y)[0];
offset[1] = ((u32*)Y)[1];
// Absorb all complete associated-data blocks except the last one
while(ad_len > GIFT128_BLOCK_SIZE){
RHO1(input, (u32*)Y, (u32*)ad, GIFT128_BLOCK_SIZE);
DOUBLE_HALF_BLOCK(offset);
XOR_TOP_BAR_BLOCK(input, offset);
giftb128_encrypt_block(Y, rkey, (u8*)input);
ad += GIFT128_BLOCK_SIZE;
ad_len -= GIFT128_BLOCK_SIZE;
}
// Last AD block: triple the offset once, once more if the block is
// partial or the AD is empty, and twice more if the message is empty
TRIPLE_HALF_BLOCK(offset);
if((ad_len % GIFT128_BLOCK_SIZE != 0) || (emptyA))
TRIPLE_HALF_BLOCK(offset);
if(emptyM) {
TRIPLE_HALF_BLOCK(offset);
TRIPLE_HALF_BLOCK(offset);
}
RHO1(input, (u32*)Y, (u32*)ad, ad_len);
XOR_TOP_BAR_BLOCK(input, offset);
giftb128_encrypt_block(Y, rkey, (u8*)input);
// Process all complete message blocks except the last one
while (in_len > GIFT128_BLOCK_SIZE){
DOUBLE_HALF_BLOCK(offset);
if (encrypting)
RHO((u32*)Y, (u32*)in, input, (u32*)out, GIFT128_BLOCK_SIZE);
else
RHO_PRIME((u32*)Y, (u32*)in, input, (u32*)out, GIFT128_BLOCK_SIZE);
XOR_TOP_BAR_BLOCK(input, offset);
giftb128_encrypt_block(Y, rkey, (u8*)input);
in += GIFT128_BLOCK_SIZE;
out += GIFT128_BLOCK_SIZE;
in_len -= GIFT128_BLOCK_SIZE;
}
// Last message block (skipped entirely when the message is empty)
if(!emptyM){
TRIPLE_HALF_BLOCK(offset);
if(in_len % GIFT128_BLOCK_SIZE != 0)
TRIPLE_HALF_BLOCK(offset);
if (encrypting) {
RHO((u32*)Y, (u32*)in, input, (u32*)out, in_len);
out += in_len;
}
else {
RHO_PRIME((u32*)Y, (u32*)in, input, (u32*)out, in_len);
in += in_len;
}
XOR_TOP_BAR_BLOCK(input, offset);
giftb128_encrypt_block(Y, rkey, (u8*)input);
}
if (encrypting) { // encryption mode
memcpy(out, Y, TAG_SIZE);
return 0;
}
// decrypting
// Constant-time tag check: OR-accumulate the XOR of all tag bytes so the
// result is 0 iff the computed tag equals the received one ('in' now
// points at the received tag)
tmp0 = 0;
for(tmp1 = 0; tmp1 < TAG_SIZE; tmp1++)
tmp0 |= in[tmp1] ^ Y[tmp1];
return tmp0;
}
/*
 * NIST LWC AEAD encryption entry point for GIFT-COFB.
 * Encrypts m (mlen bytes) under key k and nonce npub, authenticating the
 * associated data ad (adlen bytes). Writes ciphertext||tag to c and the
 * total output length (mlen + TAG_SIZE) to *clen. Returns 0 on success.
 */
int crypto_aead_encrypt(unsigned char* c, unsigned long long* clen,
                        const unsigned char* m, unsigned long long mlen,
                        const unsigned char* ad, unsigned long long adlen,
                        const unsigned char* nsec, const unsigned char* npub,
                        const unsigned char* k) {
    (void)nsec; /* GIFT-COFB does not use a secret message number */
    /* output = ciphertext || 16-byte tag */
    *clen = mlen + TAG_SIZE;
    return giftcofb_crypt(c, k, npub, ad, adlen, m, mlen, COFB_ENCRYPT);
}
/*
 * NIST LWC AEAD decryption entry point for GIFT-COFB.
 * Verifies and decrypts ciphertext||tag c (clen bytes) under key k and
 * nonce npub, authenticating ad (adlen bytes). On success writes the
 * message to m, its length to *mlen, and returns 0; returns nonzero on
 * tag mismatch or if clen is too short to hold a tag.
 */
int crypto_aead_decrypt(unsigned char* m, unsigned long long *mlen,
                        unsigned char* nsec, const unsigned char* c,
                        unsigned long long clen, const unsigned char* ad,
                        unsigned long long adlen, const unsigned char* npub,
                        const unsigned char *k) {
    (void)nsec; /* GIFT-COFB does not use a secret message number */
    /* Reject before computing *mlen: clen - TAG_SIZE would wrap around on
     * the unsigned type and store a huge bogus plaintext length even
     * though giftcofb_crypt itself rejects the short input. */
    if (clen < TAG_SIZE)
        return -1;
    *mlen = clen - TAG_SIZE;
    return giftcofb_crypt(m, k, npub, ad, adlen, c, clen, COFB_DECRYPT);
}
#ifndef GIFT128_H_
#define GIFT128_H_
// GIFT-128 key length in bytes (128-bit key)
#define KEY_SIZE 16
// GIFT-128 block length in bytes (128-bit block)
#define GIFT128_BLOCK_SIZE 16
typedef unsigned char u8;
typedef unsigned int u32; // assumes 'int' is 32 bits on the target -- TODO confirm
// Expands the 16-byte key into the round-key array (80 words, see rkey[80]
// in the callers); implemented in assembly.
extern void gift128_keyschedule(const u8* key, u32* rkey);
// Encrypts one 16-byte block with GIFTb-128 using precomputed round keys;
// implemented in assembly.
extern void giftb128_encrypt_block(u8* out_block, const u32* rkey, const u8* in_block);
#endif // GIFT128_H_
\ No newline at end of file
/****************************************************************************
* Fully unrolled ARM assembly implementation of the GIFTn-128 block cipher.
* This implementation focuses on speed, at the cost of a large code size.
* See "Fixslicing: A New GIFT Representation" paper available at
* https://eprint.iacr.org/2020/412 for more details.
*
* @author Alexandre Adomnicai, Nanyang Technological University,
* alexandre.adomnicai@ntu.edu.sg
* @date March 2020
****************************************************************************/
.syntax unified
.thumb
/*****************************************************************************
* Fully unrolled implementation of the GIFT-128 key schedule according to the
* fixsliced representation.
*****************************************************************************/
@ void gift128_keyschedule(const u8* key, u32* rkey)
.global gift128_keyschedule
.type gift128_keyschedule,%function
gift128_keyschedule:
push {r2-r12, r14}
ldm r0, {r4-r7} //load key words
rev r4, r4
rev r5, r5
rev r6, r6
rev r7, r7
str.w r6, [r1, #8]
str.w r4, [r1, #12]
str.w r7, [r1]
str.w r5, [r1, #4]
// keyschedule using classical representation for the first 20 rounds
movw r12, #0x3fff
lsl r12, r12, #16 //r12<- 0x3fff0000
movw r10, #0x000f //r10<- 0x0000000f
movw r9, #0x0fff //r9 <- 0x00000fff
// 1st classical key update
and r2, r10, r7, lsr #12
and r3, r7, r9
orr r2, r2, r3, lsl #4
and r3, r12, r7, lsr #2
orr r2, r2, r3
and r7, r7, #0x00030000
orr r7, r2, r7, lsl #14
str.w r5, [r1, #16]
str.w r7, [r1, #20]
// 2nd classical key update
and r2, r10, r6, lsr #12
and r3, r6, r9
orr r2, r2, r3, lsl #4
and r3, r12, r6, lsr #2
orr r2, r2, r3
and r6, r6, #0x00030000
orr r6, r2, r6, lsl #14
str.w r4, [r1, #24]
str.w r6, [r1, #28]
// 3rd classical key update
and r2, r10, r5, lsr #12
and r3, r5, r9
orr r2, r2, r3, lsl #4
and r3, r12, r5, lsr #2
orr r2, r2, r3
and r5, r5, #0x00030000
orr r5, r2, r5, lsl #14
str.w r7, [r1, #32]
str.w r5, [r1, #36]
// 4th classical key update
and r2, r10, r4, lsr #12
and r3, r4, r9
orr r2, r2, r3, lsl #4
and r3, r12, r4, lsr #2
orr r2, r2, r3
and r4, r4, #0x00030000
orr r4, r2, r4, lsl #14
str.w r6, [r1, #40]
str.w r4, [r1, #44]
// 5th classical key update
and r2, r10, r7, lsr #12
and r3, r7, r9
orr r2, r2, r3, lsl #4
and r3, r12, r7, lsr #2
orr r2, r2, r3
and r7, r7, #0x00030000
orr r7, r2, r7, lsl #14
str.w r5, [r1, #48]
str.w r7, [r1, #52]
// 6th classical key update
and r2, r10, r6, lsr #12
and r3, r6, r9
orr r2, r2, r3, lsl #4
and r3, r12, r6, lsr #2
orr r2, r2, r3
and r6, r6, #0x00030000
orr r6, r2, r6, lsl #14
str.w r4, [r1, #56]
str.w r6, [r1, #60]
// 7th classical key update
and r2, r10, r5, lsr #12
and r3, r5, r9
orr r2, r2, r3, lsl #4
and r3, r12, r5, lsr #2
orr r2, r2, r3
and r5, r5, #0x00030000
orr r5, r2, r5, lsl #14
str.w r7, [r1, #64]
str.w r5, [r1, #68]
// 8th classical key update
and r2, r10, r4, lsr #12
and r3, r4, r9
orr r2, r2, r3, lsl #4
and r3, r12, r4, lsr #2
orr r2, r2, r3
and r4, r4, #0x00030000
orr r4, r2, r4, lsl #14
str.w r6, [r1, #72]
str.w r4, [r1, #76]
// rearrange the rkeys to their respective new representations
// REARRANGE_RKEY_0
movw r3, #0x0055
movt r3, #0x0055 //r3 <- 0x00550055
movw r10, #0x3333 //r10<- 0x00003333
movw r11, #0x000f
movt r11, #0x000f //r11<- 0x000f000f
ldrd r6, r4, [r1]
eor r12, r6, r6, lsr #9
and r12, r12, r3
eor r6, r12
eor r6, r6, r12, lsl #9 //SWAPMOVE(r6, r6, 0x00550055, 9);
eor r12, r4, r4, lsr #9
and r12, r12, r3
eor r4, r12
eor r4, r4, r12, lsl #9 //SWAPMOVE(r4, r4, 0x00550055, 9);
eor r12, r6, r6, lsr #18
and r12, r12, r10
eor r6, r12
eor r6, r6, r12, lsl #18 //SWAPMOVE(r6, r6, 0x3333, 18);
eor r12, r4, r4, lsr #18
and r12, r12, r10
eor r4, r12
eor r4, r4, r12, lsl #18 //SWAPMOVE(r4, r4, 0x3333, 18);
eor r12, r6, r6, lsr #12
and r12, r12, r11
eor r6, r12
eor r6, r6, r12, lsl #12 //SWAPMOVE(r6, r6, 0x000f000f, 12);
eor r12, r4, r4, lsr #12
and r12, r12, r11
eor r4, r12
eor r4, r4, r12, lsl #12 //SWAPMOVE(r4, r4, 0x000f000f, 12);
eor r12, r6, r6, lsr #24
and r12, r12, #0xff
eor r6, r12
eor r6, r6, r12, lsl #24 //SWAPMOVE(r6, r6, 0x000000ff, 24);
eor r12, r4, r4, lsr #24
and r12, r12, #0xff
eor r4, r12
eor r4, r4, r12, lsl #24 //SWAPMOVE(r4, r4, 0x000000ff, 24);
strd r6, r4, [r1]
ldrd r6, r4, [r1, #40]
eor r12, r6, r6, lsr #9
and r12, r12, r3
eor r6, r12
eor r6, r6, r12, lsl #9 //SWAPMOVE(r6, r6, 0x00550055, 9);
eor r12, r4, r4, lsr #9
and r12, r12, r3
eor r4, r12
eor r4, r4, r12, lsl #9 //SWAPMOVE(r4, r4, 0x00550055, 9);
eor r12, r6, r6, lsr #18
and r12, r12, r10
eor r6, r12
eor r6, r6, r12, lsl #18 //SWAPMOVE(r6, r6, 0x3333, 18);
eor r12, r4, r4, lsr #18
and r12, r12, r10
eor r4, r12
eor r4, r4, r12, lsl #18 //SWAPMOVE(r4, r4, 0x3333, 18);
eor r12, r6, r6, lsr #12
and r12, r12, r11
eor r6, r12
eor r6, r6, r12, lsl #12 //SWAPMOVE(r6, r6, 0x000f000f, 12);
eor r12, r4, r4, lsr #12
and r12, r12, r11
eor r4, r12
eor r4, r4, r12, lsl #12 //SWAPMOVE(r4, r4, 0x000f000f, 12);
eor r12, r6, r6, lsr #24
and r12, r12, #0xff
eor r6, r12
eor r6, r6, r12, lsl #24 //SWAPMOVE(r6, r6, 0x000000ff, 24);
eor r12, r4, r4, lsr #24
and r12, r12, #0xff
eor r4, r12
eor r4, r4, r12, lsl #24 //SWAPMOVE(r4, r4, 0x000000ff, 24);
str.w r6, [r1, #40]
str.w r4, [r1, #44]
// REARRANGE_RKEY_1
movw r3, #0x1111
movt r3, #0x1111
movw r10, #0x0303
movt r10, #0x0303
ldrd r5, r7, [r1, #8]
eor r8, r7, r7, lsr #3
and r8, r8, r3
eor r7, r8
eor r7, r7, r8, lsl #3 //SWAPMOVE(r7, r7, 0x11111111, 3);
eor r8, r5, r5, lsr #3
and r8, r8, r3
eor r5, r8
eor r5, r5, r8, lsl #3 //SWAPMOVE(r5, r5, 0x11111111, 3);
eor r8, r7, r7, lsr #6
and r8, r8, r10
eor r7, r8
eor r7, r7, r8, lsl #6 //SWAPMOVE(r7, r7, 0x03030303, 6);
eor r8, r5, r5, lsr #6
and r8, r8, r10
eor r5, r8
eor r5, r5, r8, lsl #6 //SWAPMOVE(r5, r5, 0x03030303, 6);
eor r8, r7, r7, lsr #12
and r8, r8, r11
eor r7, r8
eor r7, r7, r8, lsl #12 //SWAPMOVE(r7, r7, 0x000f000f, 12);
eor r8, r5, r5, lsr #12
and r8, r8, r11
eor r5, r8
eor r5, r5, r8, lsl #12 //SWAPMOVE(r5, r5, 0x000f000f, 12);
eor r8, r7, r7, lsr #24
and r8, r8, #0xff
eor r7, r8
eor r7, r7, r8, lsl #24 //SWAPMOVE(r7, r7, 0x000000ff, 24);
eor r8, r5, r5, lsr #24
and r8, r8, #0xff
eor r5, r8
eor r5, r5, r8, lsl #24 //SWAPMOVE(r5, r5, 0x000000ff, 24);
ldr.w r12, [r1, #48]
ldr.w r14, [r1, #52]
str.w r5, [r1, #8]
str.w r7, [r1, #12]
eor r8, r14, r14, lsr #3
and r8, r8, r3
eor r14, r8
eor r14, r14, r8, lsl #3 //SWAPMOVE(r7, r7, 0x11111111, 3);
eor r8, r12, r12, lsr #3
and r8, r8, r3
eor r12, r8
eor r12, r12, r8, lsl #3 //SWAPMOVE(r5, r5, 0x11111111, 3);
eor r8, r14, r14, lsr #6
and r8, r8, r10
eor r14, r8
eor r14, r14, r8, lsl #6 //SWAPMOVE(r7, r7, 0x03030303, 6);
eor r8, r12, r12, lsr #6
and r8, r8, r10
eor r12, r8
eor r12, r12, r8, lsl #6 //SWAPMOVE(r5, r5, 0x03030303, 6);
eor r8, r14, r14, lsr #12
and r8, r8, r11
eor r14, r8
eor r14, r14, r8, lsl #12 //SWAPMOVE(r7, r7, 0x000f000f, 12);
eor r8, r12, r12, lsr #12
and r8, r8, r11
eor r12, r8
eor r12, r12, r8, lsl #12 //SWAPMOVE(r5, r5, 0x000f000f, 12);
eor r8, r14, r14, lsr #24
and r8, r8, #0xff
eor r14, r8
eor r14, r14, r8, lsl #24 //SWAPMOVE(r7, r7, 0x000000ff, 24);
eor r8, r12, r12, lsr #24
and r8, r8, #0xff
eor r12, r8
eor r12, r12, r8, lsl #24 //SWAPMOVE(r5, r5, 0x000000ff, 24);
str.w r12, [r1, #48]
str.w r14, [r1, #52]
// REARRANGE_RKEY_2
movw r3, #0xaaaa
movw r10, #0x3333
movw r11, #0xf0f0
ldrd r5, r7, [r1, #16]
eor r8, r7, r7, lsr #15
and r8, r8, r3
eor r7, r8
eor r7, r7, r8, lsl #15 //SWAPMOVE(r7, r7, 0x0000aaaa, 15);
eor r8, r5, r5, lsr #15
and r8, r8, r3
eor r5, r8
eor r5, r5, r8, lsl #15 //SWAPMOVE(r5, r5, 0x0000aaaa, 15);
eor r8, r7, r7, lsr #18
and r8, r8, r10
eor r7, r8
eor r7, r7, r8, lsl #18 //SWAPMOVE(r7, r7, 0x00003333, 18);
eor r8, r5, r5, lsr #18
and r8, r8, r10
eor r5, r8
eor r5, r5, r8, lsl #18 //SWAPMOVE(r5, r5, 0x00003333, 18);
eor r8, r7, r7, lsr #12
and r8, r8, r11
eor r7, r8
eor r7, r7, r8, lsl #12 //SWAPMOVE(r7, r7, 0x000f000f, 12);
eor r8, r5, r5, lsr #12
and r8, r8, r11
eor r5, r8
eor r5, r5, r8, lsl #12 //SWAPMOVE(r5, r5, 0x000f000f, 12);
eor r8, r7, r7, lsr #24
and r8, r8, #0xff
eor r7, r8
eor r7, r7, r8, lsl #24 //SWAPMOVE(r7, r7, 0x00000ff, 24);
eor r8, r5, r5, lsr #24
and r8, r8, #0xff
eor r5, r8
eor r5, r5, r8, lsl #24 //SWAPMOVE(r5, r5, 0x000000ff, 24);
strd r5, r7, [r1, #16]
ldrd r5, r7, [r1, #56]
eor r8, r7, r7, lsr #15
and r8, r8, r3
eor r7, r8
eor r7, r7, r8, lsl #15 //SWAPMOVE(r7, r7, 0x0000aaaa, 15);
eor r8, r5, r5, lsr #15
and r8, r8, r3
eor r5, r8
eor r5, r5, r8, lsl #15 //SWAPMOVE(r5, r5, 0x0000aaaa, 15);
eor r8, r7, r7, lsr #18
and r8, r8, r10
eor r7, r8
eor r7, r7, r8, lsl #18 //SWAPMOVE(r7, r7, 0x00003333, 18);
eor r8, r5, r5, lsr #18
and r8, r8, r10
eor r5, r8
eor r5, r5, r8, lsl #18 //SWAPMOVE(r5, r5, 0x00003333, 18);
eor r8, r7, r7, lsr #12
and r8, r8, r11
eor r7, r8
eor r7, r7, r8, lsl #12 //SWAPMOVE(r7, r7, 0x000f000f, 12);
eor r8, r5, r5, lsr #12
and r8, r8, r11
eor r5, r8
eor r5, r5, r8, lsl #12 //SWAPMOVE(r5, r5, 0x000f000f, 12);
eor r8, r7, r7, lsr #24
and r8, r8, #0xff
eor r7, r8
eor r7, r7, r8, lsl #24 //SWAPMOVE(r7, r7, 0x000000ff, 24);
eor r8, r5, r5, lsr #24
and r8, r8, #0xff
eor r5, r8
eor r5, r5, r8, lsl #24 //SWAPMOVE(r5, r5, 0x000000ff, 24);
str.w r5, [r1, #56]
str.w r7, [r1, #60]
// REARRANGE_RKEY_3
movw r3, #0x0a0a
movt r3, #0x0a0a //r3 <- 0x0a0a0a0a
movw r10, #0x00cc
movt r10, #0x00cc //r10<- 0x00cc00cc
ldrd r5, r7, [r1, #24]
eor r8, r7, r7, lsr #3
and r8, r8, r3
eor r7, r8
eor r7, r7, r8, lsl #3 //SWAPMOVE(r7, r7, 0x0a0a0a0a, 3);
eor r8, r5, r5, lsr #3
and r8, r8, r3
eor r5, r8
eor r5, r5, r8, lsl #3 //SWAPMOVE(r5, r5, 0x0a0a0a0a, 3);
eor r8, r7, r7, lsr #6
and r8, r8, r10
eor r7, r8
eor r7, r7, r8, lsl #6 //SWAPMOVE(r7, r7, 0x00cc00cc, 6);
eor r8, r5, r5, lsr #6
and r8, r8, r10
eor r5, r8
eor r5, r5, r8, lsl #6 //SWAPMOVE(r5, r5, 0x00cc00cc, 6);
eor r8, r7, r7, lsr #12
and r8, r8, r11
eor r7, r8
eor r7, r7, r8, lsl #12 //SWAPMOVE(r7, r7, 0x000f000f, 12);
eor r8, r5, r5, lsr #12
and r8, r8, r11
eor r5, r8
eor r5, r5, r8, lsl #12 //SWAPMOVE(r5, r5, 0x000f000f, 12);
eor r8, r7, r7, lsr #24
and r8, r8, #0xff
eor r7, r8
eor r7, r7, r8, lsl #24 //SWAPMOVE(r7, r7, 0x000000ff, 24);
eor r8, r5, r5, lsr #24
and r8, r8, #0xff
eor r5, r8
eor r5, r5, r8, lsl #24 //SWAPMOVE(r5, r5, 0x000000ff, 24);
strd r5, r7, [r1, #24]
ldrd r5, r7, [r1, #64]
eor r8, r7, r7, lsr #3
and r8, r8, r3
eor r7, r8
eor r7, r7, r8, lsl #3 //SWAPMOVE(r7, r7, 0x0a0a0a0a, 3);
eor r8, r5, r5, lsr #3
and r8, r8, r3
eor r5, r8
eor r5, r5, r8, lsl #3 //SWAPMOVE(r5, r5, 0x0a0a0a0a, 3);
eor r8, r7, r7, lsr #6
and r8, r8, r10
eor r7, r8
eor r7, r7, r8, lsl #6 //SWAPMOVE(r7, r7, 0x00cc00cc, 6);
eor r8, r5, r5, lsr #6
and r8, r8, r10
eor r5, r8
eor r5, r5, r8, lsl #6 //SWAPMOVE(r5, r5, 0x00cc00cc, 6);
eor r8, r7, r7, lsr #12
and r8, r8, r11
eor r7, r8
eor r7, r7, r8, lsl #12 //SWAPMOVE(r7, r7, 0x000f000f, 12);
eor r8, r5, r5, lsr #12
and r8, r8, r11
eor r5, r8
eor r5, r5, r8, lsl #12 //SWAPMOVE(r5, r5, 0x000f000f, 12);
eor r8, r7, r7, lsr #24
and r8, r8, #0xff
eor r7, r8
eor r7, r7, r8, lsl #24 //SWAPMOVE(r7, r7, 0x0000ff00, 24);
eor r8, r5, r5, lsr #24
and r8, r8, #0xff
eor r5, r8
eor r5, r5, r8, lsl #24 //SWAPMOVE(r5, r5, 0x0000ff00, 24);
str.w r5, [r1, #64]
str.w r7, [r1, #68]
//keyschedule according to the new representations
// KEY_DOUBLE/TRIPLE_UPDATE_0
movw r10, #0x3333
eor r12, r10, r10, lsl #16
mvn r11, r12
movw r9, #0x4444
movt r9, #0x5555
movw r8, #0x1100
movt r8, #0x5555
ldrd r4, r5, [r1]
and r2, r12, r4, ror #24
and r4, r4, r11
orr r4, r2, r4, ror #16 //KEY_TRIPLE_UPDATE_1(r4)
eor r2, r4, r4, lsr #1
and r2, r2, r8
eor r4, r4, r2
eor r4, r4, r2, lsl #1 //SWAPMOVE(r4, r4, 0x55551100, 1)
eor r2, r5, r5, lsr #16
and r2, r2, r10
eor r5, r5, r2
eor r5, r5, r2, lsl #16 //SWAPMOVE(r5, r5, 0x00003333, 16)
eor r2, r5, r5, lsr #1
and r2, r2, r9
eor r5, r5, r2
eor r5, r5, r2, lsl #1 //SWAPMOVE(r5, r5, 0x555544444, 1)
str.w r5, [r1, #80]
str.w r4, [r1, #84]
and r2, r12, r5, ror #24
and r5, r5, r11
orr r5, r2, r5, ror #16 //KEY_TRIPLE_UPDATE_1(r5)
eor r2, r5, r5, lsr #1
and r2, r2, r8
eor r5, r5, r2
eor r5, r5, r2, lsl #1 //SWAPMOVE(r5, r5, 0x55551100, 1)
eor r2, r4, r4, lsr #16
and r2, r2, r10
eor r4, r4, r2
eor r4, r4, r2, lsl #16 //SWAPMOVE(r4, r4, 0x00003333, 16)
eor r2, r4, r4, lsr #1
and r2, r2, r9
eor r4, r4, r2
eor r4, r4, r2, lsl #1 //SWAPMOVE(r4, r4, 0x555544444, 1)
str.w r4, [r1, #160]
str.w r5, [r1, #164]
and r2, r12, r4, ror #24
and r4, r4, r11
orr r4, r2, r4, ror #16 //KEY_TRIPLE_UPDATE_1(r4)
eor r2, r4, r4, lsr #1
and r2, r2, r8
eor r4, r4, r2
eor r4, r4, r2, lsl #1 //SWAPMOVE(r4, r4, 0x55551100, 1)
eor r2, r5, r5, lsr #16
and r2, r2, r10
eor r5, r5, r2
eor r5, r5, r2, lsl #16 //SWAPMOVE(r5, r5, 0x00003333, 16)
eor r2, r5, r5, lsr #1
and r2, r2, r9
eor r5, r5, r2
eor r5, r5, r2, lsl #1 //SWAPMOVE(r5, r5, 0x555544444, 1)
strd r5, r4, [r1, #240]
ldrd r4, r5, [r1, #40]
and r2, r12, r4, ror #24
and r4, r4, r11
orr r4, r2, r4, ror #16 //KEY_TRIPLE_UPDATE_1(r4)
eor r2, r4, r4, lsr #1
and r2, r2, r8
eor r4, r4, r2
eor r4, r4, r2, lsl #1 //SWAPMOVE(r4, r4, 0x55551100, 1)
eor r2, r5, r5, lsr #16
and r2, r2, r10
eor r5, r5, r2
eor r5, r5, r2, lsl #16 //SWAPMOVE(r5, r5, 0x00003333, 16)
eor r2, r5, r5, lsr #1
and r2, r2, r9
eor r5, r5, r2
eor r5, r5, r2, lsl #1 //SWAPMOVE(r5, r5, 0x555544444, 1)
str.w r5, [r1, #120]
str.w r4, [r1, #124]
and r2, r12, r5, ror #24
and r5, r5, r11
orr r5, r2, r5, ror #16 //KEY_TRIPLE_UPDATE_1(r5)
eor r2, r5, r5, lsr #1
and r2, r2, r8
eor r5, r5, r2
eor r5, r5, r2, lsl #1 //SWAPMOVE(r5, r5, 0x55551100, 1)
eor r2, r4, r4, lsr #16
and r2, r2, r10
eor r4, r4, r2
eor r4, r4, r2, lsl #16 //SWAPMOVE(r4, r4, 0x00003333, 16)
eor r2, r4, r4, lsr #1
and r2, r2, r9
eor r4, r4, r2
eor r4, r4, r2, lsl #1 //SWAPMOVE(r4, r4, 0x555544444, 1)
str.w r4, [r1, #200]
str.w r5, [r1, #204]
and r2, r12, r4, ror #24
and r4, r4, r11
orr r4, r2, r4, ror #16 //KEY_TRIPLE_UPDATE_1(r4)
eor r2, r4, r4, lsr #1
and r2, r2, r8
eor r4, r4, r2
eor r4, r4, r2, lsl #1 //SWAPMOVE(r4, r4, 0x55551100, 1)
eor r2, r5, r5, lsr #16
and r2, r2, r10
eor r5, r5, r2
eor r5, r5, r2, lsl #16 //SWAPMOVE(r5, r5, 0x00003333, 16)
eor r2, r5, r5, lsr #1
and r2, r2, r9
eor r5, r5, r2
eor r5, r5, r2, lsl #1 //SWAPMOVE(r5, r5, 0x555544444, 1)
str.w r5, [r1, #280]
str.w r4, [r1, #284]
// KEY_DOUBLE/TRIPLE_UPDATE_2
// masks
movw r12, #0x0f00
movt r12, #0x0f00
movw r11, #0x0003
movt r11, #0x0003
movw r10, #0x003f
movt r10, #0x003f
lsl r9, r11, #8 //r9 <- 0x03000300
and r8, r10, r10, lsr #3 //r8 <- 0x00070007
orr r7, r8, r8, lsl #2 //r7 <- 0x001f001f
ldrd r4, r5, [r1, #8]
and r2, r9, r4, lsr #6
and r3, r4, r10, lsl #8
orr r2, r2, r3, lsl #2
and r3, r8, r4, lsr #5
orr r2, r2, r3
and r4, r4, r7
orr r4, r2, r4, lsl #3 //KEY_TRIPLE_UPDATE_2(r4)
and r2, r12, r5, lsr #4
and r3, r5, r12
orr r2, r2, r3, lsl #4
and r3, r11, r5, lsr #6
orr r2, r2, r3
and r5, r5, r10
orr r5, r2, r5, lsl #2 //KEY_DOUBLE_UPDATE_2(r5)
str.w r5, [r1, #88]
str.w r4, [r1, #92]
and r2, r9, r5, lsr #6
and r3, r5, r10, lsl #8
orr r2, r2, r3, lsl #2
and r3, r8, r5, lsr #5
orr r2, r2, r3
and r5, r5, r7
orr r5, r2, r5, lsl #3 //KEY_TRIPLE_UPDATE_2(r5)
and r2, r12, r4, lsr #4
and r3, r4, r12
orr r2, r2, r3, lsl #4
and r3, r11, r4, lsr #6
orr r2, r2, r3
and r4, r4, r10
orr r4, r2, r4, lsl #2 //KEY_DOUBLE_UPDATE_2(r4)
str.w r4, [r1, #168]
str.w r5, [r1, #172]
and r2, r9, r4, lsr #6
and r3, r4, r10, lsl #8
orr r2, r2, r3, lsl #2
and r3, r8, r4, lsr #5
orr r2, r2, r3
and r4, r4, r7
orr r4, r2, r4, lsl #3 //KEY_TRIPLE_UPDATE_2(r4)
and r2, r12, r5, lsr #4
and r3, r5, r12
orr r2, r2, r3, lsl #4
and r3, r11, r5, lsr #6
orr r2, r2, r3
and r5, r5, r10
orr r5, r2, r5, lsl#2 //KEY_DOUBLE_UPDATE_2(r5)
strd r5, r4, [r1, #248]
ldrd r4, r5, [r1, #48]
and r2, r9, r4, lsr #6
and r3, r4, r10, lsl #8
orr r2, r2, r3, lsl #2
and r3, r8, r4, lsr #5
orr r2, r2, r3
and r4, r4, r7
orr r4, r2, r4, lsl #3 //KEY_TRIPLE_UPDATE_2(r4)
and r2, r12, r5, lsr #4
and r3, r5, r12
orr r2, r2, r3, lsl #4
and r3, r11, r5, lsr #6
orr r2, r2, r3
and r5, r5, r10
orr r5, r2, r5, lsl #2 //KEY_DOUBLE_UPDATE_2(r5)
str.w r5, [r1, #128]
str.w r4, [r1, #132]
and r2, r9, r5, lsr #6
and r3, r5, r10, lsl #8
orr r2, r2, r3, lsl #2
and r3, r8, r5, lsr #5
orr r2, r2, r3
and r5, r5, r7
orr r5, r2, r5, lsl #3 //KEY_TRIPLE_UPDATE_2(r5)
and r2, r12, r4, lsr #4
and r3, r4, r12
orr r2, r2, r3, lsl #4
and r3, r11, r4, lsr #6
orr r2, r2, r3
and r4, r4, r10
orr r4, r2, r4, lsl #2 //KEY_DOUBLE_UPDATE_2(r4)
str.w r4, [r1, #208]
str.w r5, [r1, #212]
and r2, r9, r4, lsr #6
and r3, r4, r10, lsl #8
orr r2, r2, r3, lsl #2
and r3, r8, r4, lsr #5
orr r2, r2, r3
and r4, r4, r7
orr r4, r2, r4, lsl #3 //KEY_TRIPLE_UPDATE_2(r4)
and r2, r12, r5, lsr #4
and r3, r5, r12
orr r2, r2, r3, lsl #4
and r3, r11, r5, lsr #6
orr r2, r2, r3
and r5, r5, r10
orr r5, r2, r5, lsl#2 //KEY_DOUBLE_UPDATE_2(r5)
str.w r5, [r1, #288]
str.w r4, [r1, #292]
// KEY_DOUBLE/TRIPLE_UPDATE_2
// masks
movw r12, #0x5555
movt r12, #0x5555
mvn r11, r12
ldrd r4, r5, [r1, #16]
and r2, r12, r4, ror #24
and r4, r11, r4, ror #20
orr r4, r4, r2 //KEY_TRIPLE_UPDATE_2(r4)
and r2, r11, r5, ror #24
and r5, r12, r5, ror #16
orr r5, r5, r2 //KEY_DOUBLE_UPDATE_2(r5)
str.w r5, [r1, #96]
str.w r4, [r1, #100]
and r2, r12, r5, ror #24
and r5, r11, r5, ror #20
orr r5, r5, r2 //KEY_TRIPLE_UPDATE_2(r5)
and r2, r11, r4, ror #24
and r4, r12, r4, ror #16
orr r4, r4, r2 //KEY_DOUBLE_UPDATE_2(r4)
str.w r4, [r1, #176]
str.w r5, [r1, #180]
and r2, r12, r4, ror #24
and r4, r11, r4, ror #20
orr r4, r4, r2 //KEY_TRIPLE_UPDATE_2(r4)
and r2, r11, r5, ror #24
and r5, r12, r5, ror #16
orr r5, r5, r2 //KEY_DOUBLE_UPDATE_2(r5)
strd r5, r4, [r1, #256]
ldrd r4, r5, [r1, #56]
and r2, r12, r4, ror #24
and r4, r11, r4, ror #20
orr r4, r4, r2 //KEY_TRIPLE_UPDATE_2(r5)
and r2, r11, r5, ror #24
and r5, r12, r5, ror #16
orr r5, r5, r2 //KEY_DOUBLE_UPDATE_2(r4)
str.w r5, [r1, #136]
str.w r4, [r1, #140]
and r2, r12, r5, ror #24
and r5, r11, r5, ror #20
orr r5, r5, r2 //KEY_TRIPLE_UPDATE_2(r4)
and r2, r11, r4, ror #24
and r4, r12, r4, ror #16
orr r4, r4, r2 //KEY_DOUBLE_UPDATE_2(r5)
str.w r4, [r1, #216]
str.w r5, [r1, #220]
and r2, r12, r4, ror #24
and r4, r11, r4, ror #20
orr r4, r4, r2 //KEY_TRIPLE_UPDATE_2(r5)
and r2, r11, r5, ror #24
and r5, r12, r5, ror #16
orr r5, r5, r2 //KEY_DOUBLE_UPDATE_2(r4)
str.w r5, [r1, #296]
str.w r4, [r1, #300]
// KEY_DOUBLE/TRIPLE_UPDATE_3
// masks
orr r12, r8, r8, lsl #8 //r12<- 0x07070707
movw r11, #0xc0c0
movw r10, #0x3030
and r9, r12, r12, lsr #1 //r9 <- 0x03030303
lsl r8, r12, #4
eor r7, r8, r9, lsl #5
movw r6, #0xf0f0
ldrd r4, r5, [r1, #24]
and r2, r10, r4, lsr #18
and r3, r4, r7, lsr #4
orr r2, r2, r3, lsl #3
and r3, r11, r4, lsr #14
orr r2, r2, r3
and r3, r4, r12, lsr #11
orr r2, r2, r3, lsl #15
and r3, r12, r4, lsr #1
orr r2, r2, r3
and r4, r4, r7, lsr #16
orr r4, r2, r4, lsl #19 //KEY_TRIPLE_UPDATE_4(r4)
and r2, r9, r5, lsr #2
and r3, r9, r5
orr r2, r2, r3, lsl #2
and r3, r8, r5, lsr #1
orr r2, r2, r3
and r5, r5, r7
orr r5, r2, r5, lsl #3 //KEY_DOUBLE_UPDATE_4(r5)
str.w r5, [r1, #104]
str.w r4, [r1, #108]
and r2, r10, r5, lsr #18
and r3, r5, r7, lsr #4
orr r2, r2, r3, lsl #3
and r3, r11, r5, lsr #14
orr r2, r2, r3
and r3, r5, r12, lsr #11
orr r2, r2, r3, lsl #15
and r3, r12, r5, lsr #1
orr r2, r2, r3
and r5, r5, r7, lsr #16
orr r5, r2, r5, lsl #19 //KEY_TRIPLE_UPDATE_4(r5)
and r2, r9, r4, lsr #2
and r3, r9, r4
orr r2, r2, r3, lsl #2
and r3, r8, r4, lsr #1
orr r2, r2, r3
and r4, r4, r7
orr r4, r2, r4, lsl #3 //KEY_DOUBLE_UPDATE_4(r4)
str.w r4, [r1, #184]
str.w r5, [r1, #188]
and r2, r10, r4, lsr #18
and r3, r4, r7, lsr #4
orr r2, r2, r3, lsl #3
and r3, r11, r4, lsr #14
orr r2, r2, r3
and r3, r4, r12, lsr #11
orr r2, r2, r3, lsl #15
and r3, r12, r4, lsr #1
orr r2, r2, r3
and r4, r4, r7, lsr #16
orr r4, r2, r4, lsl #19 //KEY_TRIPLE_UPDATE_4(r4)
and r2, r9, r5, lsr #2
and r3, r9, r5
orr r2, r2, r3, lsl #2
and r3, r8, r5, lsr #1
orr r2, r2, r3
and r5, r5, r7
orr r5, r2, r5, lsl #3 //KEY_DOUBLE_UPDATE_4(r5)
strd r5, r4, [r1, #264]
ldrd r4, r5, [r1, #64]
and r2, r10, r4, lsr #18
and r3, r4, r7, lsr #4
orr r2, r2, r3, lsl #3
and r3, r11, r4, lsr #14
orr r2, r2, r3
and r3, r4, r12, lsr #11
orr r2, r2, r3, lsl #15
and r3, r12, r4, lsr #1
orr r2, r2, r3
and r4, r4, r7, lsr #16
orr r4, r2, r4, lsl #19 //KEY_TRIPLE_UPDATE_4(r4)
and r2, r9, r5, lsr #2
and r3, r9, r5
orr r2, r2, r3, lsl #2
and r3, r8, r5, lsr #1
orr r2, r2, r3
and r5, r5, r7
orr r5, r2, r5, lsl #3 //KEY_DOUBLE_UPDATE_4(r5)
str.w r5, [r1, #144]
str.w r4, [r1, #148]
and r2, r10, r5, lsr #18
and r3, r5, r7, lsr #4
orr r2, r2, r3, lsl #3
and r3, r11, r5, lsr #14
orr r2, r2, r3
and r3, r5, r12, lsr #11
orr r2, r2, r3, lsl #15
and r3, r12, r5, lsr #1
orr r2, r2, r3
and r5, r5, r7, lsr #16
orr r5, r2, r5, lsl #19 //KEY_TRIPLE_UPDATE_4(r5)
and r2, r9, r4, lsr #2
and r3, r9, r4
orr r2, r2, r3, lsl #2
and r3, r8, r4, lsr #1
orr r2, r2, r3
and r4, r4, r7
orr r4, r2, r4, lsl #3 //KEY_DOUBLE_UPDATE_4(r4)
str.w r4, [r1, #224]
str.w r5, [r1, #228]
and r2, r10, r4, lsr #18
and r3, r4, r7, lsr #4
orr r2, r2, r3, lsl #3
and r3, r11, r4, lsr #14
orr r2, r2, r3
and r3, r4, r12, lsr #11
orr r2, r2, r3, lsl #15
and r3, r12, r4, lsr #1
orr r2, r2, r3
and r4, r4, r7, lsr #16
orr r4, r2, r4, lsl #19 //KEY_TRIPLE_UPDATE_4(r4)
and r2, r9, r5, lsr #2
and r3, r9, r5
orr r2, r2, r3, lsl #2
and r3, r8, r5, lsr #1
orr r2, r2, r3
and r5, r5, r7
orr r5, r2, r5, lsl #3 //KEY_DOUBLE_UPDATE_4(r5)
str.w r5, [r1, #304]
str.w r4, [r1, #308]
// KEY_DOUBLE/TRIPLE_UPDATE_4
// masks
movw r12, #0x0fff
lsl r10, r12, #16
movw r8, #0x00ff
movw r7, #0x03ff
lsl r7, r7, #16
ldrd r4, r5, [r1, #32]
and r2, r7, r4, lsr #6
and r3, r4, #0x003f0000
orr r2, r2, r3, lsl #10
and r3, r12, r4, lsr #4
orr r2, r2, r3
and r4, r4, #0x000f
orr r4, r2, r4, lsl #12 //KEY_TRIPLE_UPDATE_4(r4)
and r2, r10, r5, lsr #4
and r3, r5, #0x000f0000
orr r2, r2, r3, lsl #12
and r3, r8, r5, lsr #8
orr r2, r2, r3
and r5, r5, r8
orr r5, r2, r5, lsl #8 //KEY_DOUBLE_UPDATE_4(r5)
str.w r5, [r1, #112]
str.w r4, [r1, #116]
and r2, r7, r5, lsr #6
and r3, r5, #0x003f0000
orr r2, r2, r3, lsl #10
and r3, r12, r5, lsr #4
orr r2, r2, r3
and r5, r5, #0x000f
orr r5, r2, r5, lsl #12 //KEY_TRIPLE_UPDATE_4(r5)
and r2, r10, r4, lsr #4
and r3, r4, #0x000f0000
orr r2, r2, r3, lsl #12
and r3, r8, r4, lsr #8
orr r2, r2, r3
and r4, r4, r8
orr r4, r2, r4, lsl #8 //KEY_DOUBLE_UPDATE_4(r4)
str.w r4, [r1, #192]
str.w r5, [r1, #196]
and r2, r7, r4, lsr #6
and r3, r4, #0x003f0000
orr r2, r2, r3, lsl #10
and r3, r12, r4, lsr #4
orr r2, r2, r3
and r4, r4, #0x000f
orr r4, r2, r4, lsl #12 //KEY_TRIPLE_UPDATE_4(r4)
and r2, r10, r5, lsr #4
and r3, r5, #0x000f0000
orr r2, r2, r3, lsl #12
and r3, r8, r5, lsr #8
orr r2, r2, r3
and r5, r5, r8
orr r5, r2, r5, lsl #8 //KEY_DOUBLE_UPDATE_4(r5)
strd r5, r4, [r1, #272]
ldrd r4, r5, [r1, #72]
and r2, r7, r4, lsr #6
and r3, r4, #0x003f0000
orr r2, r2, r3, lsl #10
and r3, r12, r4, lsr #4
orr r2, r2, r3
and r4, r4, #0x000f
orr r4, r2, r4, lsl #12 //KEY_TRIPLE_UPDATE_4(r4)
and r2, r10, r5, lsr #4
and r3, r5, #0x000f0000
orr r2, r2, r3, lsl #12
and r3, r8, r5, lsr #8
orr r2, r2, r3
and r5, r5, r8
orr r5, r2, r5, lsl #8 //KEY_DOUBLE_UPDATE_4(r5)
str.w r5, [r1, #152]
str.w r4, [r1, #156]
and r2, r7, r5, lsr #6
and r3, r5, #0x003f0000
orr r2, r2, r3, lsl #10
and r3, r12, r5, lsr #4
orr r2, r2, r3
and r5, r5, #0x000f
orr r5, r2, r5, lsl #12 //KEY_TRIPLE_UPDATE_4(r5)
and r2, r10, r4, lsr #4
and r3, r4, #0x000f0000
orr r2, r2, r3, lsl #12
and r3, r8, r4, lsr #8
orr r2, r2, r3
and r4, r4, r8
orr r4, r2, r4, lsl #8 //KEY_DOUBLE_UPDATE_4(r4)
str.w r4, [r1, #232]
str.w r5, [r1, #236]
and r2, r7, r4, lsr #6
and r3, r4, #0x003f0000
orr r2, r2, r3, lsl #10
and r3, r12, r4, lsr #4
orr r2, r2, r3
and r4, r4, #0x000f
orr r4, r2, r4, lsl #12 //KEY_TRIPLE_UPDATE_4(r4)
and r2, r10, r5, lsr #4
and r3, r5, #0x000f0000
orr r2, r2, r3, lsl #12
and r3, r8, r5, lsr #8
orr r2, r2, r3
and r5, r5, r8
orr r5, r2, r5, lsl #8 //KEY_DOUBLE_UPDATE_4(r5)
str.w r5, [r1, #312]
str.w r4, [r1, #316]
pop {r2-r12,r14}
bx lr
/*****************************************************************************
* Fully unrolled ARM assembly implementation of the GIFTb-128 block cipher.
* This function simply encrypts a 128-bit block, without any operation mode.
*****************************************************************************/
@ void giftb128_encrypt_block(u8 *out, const u32* rkey, const u8 *block)
.global giftb128_encrypt_block
.type giftb128_encrypt_block,%function
giftb128_encrypt_block:
push {r2-r12,r14}
// load plaintext blocks
ldm r2, {r9-r12}
// endianness
rev r9, r9
rev r10, r10
rev r11, r11
rev r12, r12
// masks for HALF/BYTE/NIBBLE rotations
movw r2, #0x1111
movt r2, #0x1111 //for NIBBLE_ROR
movw r3, #0x000f
movt r3, #0x000f //for HALF_ROR
mvn r4, r2, lsl #3 //0x7777777 for NIBBLE_ROR
// ------------------ 1st QUINTUPLE ROUND ------------------
// 1st round
movw r5, 0x0008
movt r5, 0x1000 //load rconst
ldrd r6, r7, [r1] //load rkey
and r8, r9, r11 //sbox layer
eor r10, r10, r8
and r8, r10, r12
eor r9, r9, r8
orr r8, r9, r10
eor r11, r11, r8
eor r12, r12, r11
eor r10, r10, r12
and r8, r9, r10
eor r11, r11, r8
mvn r12, r12
and r8, r4, r12, lsr #1
and r12, r12, r2
orr r12, r8, r12, lsl #3 //NIBBLE_ROR(r12, 1)
and r8, r4, r11
and r11, r2, r11, lsr #3
orr r11, r11, r8, lsl #1 //NIBBLE_ROR(r11, 3)
orr r14, r2, r2, lsl #1 //0x33333333 for NIBBLE_ROR
and r8, r14, r10, lsr #2
and r10, r10, r14
orr r10, r8, r10, lsl #2 //NIBBLE_ROR(r10, 2)
eor r10, r10, r6 //add 1st keyword
eor r11, r11, r7 //add 2nd keyword
eor r9, r9, r5 //add rconst
// 2nd round
movw r5, 0x8000
movt r5, 0x8001 //load rconst
ldrd r6, r7, [r1, #8] //load rkey
and r8, r12, r11 //sbox layer
eor r10, r10, r8
and r8, r10, r9
eor r12, r12, r8
orr r8, r12, r10
eor r11, r11, r8
eor r9, r9, r11
eor r10, r10, r9
and r8, r12, r10
eor r11, r11, r8
mvn r9, r9
mvn r14, r3, lsl #12 //0x0fff0fff for HALF_ROR
and r8, r14, r9, lsr #4
and r9, r9, r3
orr r9, r8, r9, lsl #12 //HALF_ROR(r9, 4)
and r8, r3, r11, lsr #12
and r11, r11, r14
orr r11, r8, r11, lsl #4 //HALF_ROR(r11, 12)
rev16 r10, r10 //HALF_ROR(r10, 8)
eor r10, r10, r6 //add 1st keyword
eor r11, r11, r7 //add 2nd keyword
eor r12, r12, r5 //add rconst
// 3rd round
movw r5, 0x0002
movt r5, 0x5400 //load rconst
ldrd r6, r7, [r1, #16] //load rkey
and r8, r9, r11 //sbox layer
eor r10, r10, r8
and r8, r10, r12
eor r9, r9, r8
orr r8, r9, r10
eor r11, r11, r8
eor r12, r12, r11
eor r10, r10, r12
and r8, r9, r10
eor r11, r11, r8
mvn r12, r12
orr r14, r2, r2, lsl #2 //0x55555555 for SWAPMOVE
eor r8, r10, r10, lsr #1
and r8, r8, r14
eor r10, r10, r8
eor r10, r10, r8, lsl #1 //SWAPMOVE(r10, r10, 0x55555555, 1)
eor r8, r12, r12, lsr #1
and r8, r8, r14, lsr #16
eor r12, r12, r8
eor r12, r12, r8, lsl #1 //SWAPMOVE(r12, r12, 0x55550000, 1)
eor r8, r11, r11, lsr #1
and r8, r8, r14, lsl #16
eor r11, r11, r8
eor r11, r11, r8, lsl #1 //SWAPMOVE(r11, r11, 0x00005555, 1)
eor r10, r10, r6 //add 1st keyword
eor r11, r7, r11, ror #16 //add 2nd keyword
eor r9, r9, r5 //add rconst
// 4th round
movw r5, 0x0181
movt r5, 0x0101 //load rconst
ldrd r6, r7, [r1, #24] //load rkey
and r8, r11, r12, ror #16 //sbox layer
eor r10, r10, r8
and r8, r10, r9
eor r12, r8, r12, ror #16
orr r8, r12, r10
eor r11, r11, r8
eor r9, r9, r11
eor r10, r10, r9
and r8, r12, r10
eor r11, r11, r8
mvn r9, r9
eor r14, r3, r3, lsl #8 //0x0f0f0f0f for BYTE_ROR
and r8, r14, r10, lsr #4
and r10, r10, r14
orr r10, r8, r10, lsl #4 //BYTE_ROR(r10, 4)
orr r14, r14, r14, lsl #2 //0x3f3f3f3f for BYTE_ROR
mvn r8, r14
and r8, r8, r11, lsl #6
and r11, r14, r11, lsr #2
orr r11, r11, r8 //BYTE_ROR(r11, 2)
mvn r8, r14, lsr #6
and r8, r8, r9, lsr #6
and r9, r14, r9
orr r9, r8, r9, lsl #2 //BYTE_ROR(r9, 6)
eor r10, r10, r6 //add 1st keyword
eor r11, r11, r7 //add 2nd keyword
eor r12, r12, r5 //add rconst
// 5th round
movw r5, 0x001f
movt r5, 0x8000 //load rconst
ldrd r6, r7, [r1, #32] //load rkey
and r8, r9, r11 //sbox layer
eor r10, r10, r8
and r8, r10, r12
eor r9, r9, r8
orr r8, r9, r10
eor r11, r11, r8
eor r12, r12, r11
eor r10, r10, r12
and r8, r9, r10
eor r11, r11, r8
mvn r12, r12
eor r10, r6, r10, ror #16 //add 1st keyword
eor r11, r7, r11, ror #8 //add 2nd keyword
eor r9, r9, r5 //add rconst
// ------------------ 2nd QUINTUPLE ROUND ------------------
// 1st round
movw r5, 0x8880
movt r5, 0x1088 //load rconst
ldrd r6, r7, [r1, #40] //load rkey
and r8, r11, r12, ror #24 //sbox layer
eor r10, r10, r8
and r8, r10, r9
eor r12, r8, r12, ror #24
orr r8, r12, r10
eor r11, r11, r8
eor r9, r9, r11
eor r10, r10, r9
and r8, r12, r10
eor r11, r11, r8
mvn r9, r9
and r8, r4, r9, lsr #1
and r9, r9, r2
orr r9, r8, r9, lsl #3 //NIBBLE_ROR(r9, 1)
and r8, r4, r11
and r11, r2, r11, lsr #3
orr r11, r11, r8, lsl #1 //NIBBLE_ROR(r11, 3)
orr r14, r2, r2, lsl #1 //0x33333333 for NIBBLE_ROR
and r8, r14, r10, lsr #2
and r10, r10, r14
orr r10, r8, r10, lsl #2 //NIBBLE_ROR(r10, 2)
eor r10, r10, r6 //add 1st keyword
eor r11, r11, r7 //add 2nd keyword
eor r12, r12, r5 //add rconst
// 2nd round
movw r5, 0xe000
movt r5, 0x6001 //load rconst
ldrd r6, r7, [r1, #48] //load rkey
and r8, r9, r11 //sbox layer
eor r10, r10, r8
and r8, r10, r12
eor r9, r9, r8
orr r8, r9, r10
eor r11, r11, r8
eor r12, r12, r11
eor r10, r10, r12
and r8, r9, r10
eor r11, r11, r8
mvn r12, r12
mvn r14, r3, lsl #12 //0x0fff0fff for HALF_ROR
and r8, r14, r12, lsr #4
and r12, r12, r3
orr r12, r8, r12, lsl #12 //HALF_ROR(r12, 4)
and r8, r3, r11, lsr #12
and r11, r11, r14
orr r11, r8, r11, lsl #4 //HALF_ROR(r11, 12)
rev16 r10, r10 //HALF_ROR(r10, 8)
eor r10, r10, r6 //add 1st keyword
eor r11, r11, r7 //add 2nd keyword
eor r9, r9, r5 //add rconst
// 3rd round
movw r5, 0x0002
movt r5, 0x5150 //load rconst
ldrd r6, r7, [r1, #56] //load rkey
and r8, r12, r11 //sbox layer
eor r10, r10, r8
and r8, r10, r9
eor r12, r12, r8
orr r8, r12, r10
eor r11, r11, r8
eor r9, r9, r11
eor r10, r10, r9
and r8, r12, r10
eor r11, r11, r8
mvn r9, r9
orr r14, r2, r2, lsl #2 //0x55555555 for SWAPMOVE
eor r8, r10, r10, lsr #1
and r8, r8, r14
eor r10, r10, r8
eor r10, r10, r8, lsl #1 //SWAPMOVE(r10, r10, 0x55555555, 1)
eor r8, r9, r9, lsr #1
and r8, r8, r14, lsr #16
eor r9, r9, r8
eor r9, r9, r8, lsl #1 //SWAPMOVE(r9, r9, 0x00005555, 1)
eor r8, r11, r11, lsr #1
and r8, r8, r14, lsl #16
eor r11, r11, r8
eor r11, r11, r8, lsl #1 //SWAPMOVE(r11, r11, 0x55550000, 1)
eor r10, r10, r6 //add 1st keyword
eor r11, r7, r11, ror #16 //add 2nd keyword
eor r12, r12, r5 //add rconst
// 4th round
movw r5, 0x0180
movt r5, 0x0303 //load rconst
ldrd r6, r7, [r1, #64] //load rkey
and r8, r11, r9, ror #16 //sbox layer
eor r10, r10, r8
and r8, r10, r12
eor r9, r8, r9, ror #16
orr r8, r9, r10
eor r11, r11, r8
eor r12, r12, r11
eor r10, r10, r12
and r8, r9, r10
eor r11, r11, r8
mvn r12, r12
eor r14, r3, r3, lsl #8 //0x0f0f0f0f for BYTE_ROR
and r8, r14, r10, lsr #4
and r10, r10, r14
orr r10, r8, r10, lsl #4 //BYTE_ROR(r10, 4)
orr r14, r14, r14, lsl #2 //0x3f3f3f3f for BYTE_ROR
mvn r8, r14
and r8, r8, r11, lsl #6
and r11, r14, r11, lsr #2
orr r11, r11, r8 //BYTE_ROR(r11, 2)
mvn r8, r14, lsr #6
and r8, r8, r12, lsr #6
and r12, r14, r12
orr r12, r8, r12, lsl #2 //BYTE_ROR(r12, 6)
eor r10, r10, r6 //add 1st keyword
eor r11, r11, r7 //add 2nd keyword
eor r9, r9, r5 //add rconst
// 5th round
movw r5, 0x002f
movt r5, 0x8000 //load rconst
ldrd r6, r7, [r1, #72] //load rkey
and r8, r12, r11 //sbox layer
eor r10, r10, r8
and r8, r10, r9
eor r12, r12, r8
orr r8, r12, r10
eor r11, r11, r8
eor r9, r9, r11
eor r10, r10, r9
and r8, r12, r10
eor r11, r11, r8
mvn r9, r9
eor r10, r6, r10, ror #16 //add 1st keyword
eor r11, r7, r11, ror #8 //add 2nd keyword
eor r12, r12, r5 //add rconst
// ------------------ 3rd QUINTUPLE ROUND ------------------
// 1st round
movw r5, 0x8880
movt r5, 0x1008 //load rconst
ldrd r6, r7, [r1, #80] //load rkey
and r8, r11, r9, ror #24 //sbox layer
eor r10, r10, r8
and r8, r10, r12
eor r9, r8, r9, ror #24
orr r8, r9, r10
eor r11, r11, r8
eor r12, r12, r11
eor r10, r10, r12
and r8, r9, r10
eor r11, r11, r8
mvn r12, r12
and r8, r4, r12, lsr #1
and r12, r12, r2
orr r12, r8, r12, lsl #3 //NIBBLE_ROR(r12, 1)
and r8, r4, r11
and r11, r2, r11, lsr #3
orr r11, r11, r8, lsl #1 //NIBBLE_ROR(r11, 3)
orr r14, r2, r2, lsl #1 //0x33333333 for NIBBLE_ROR
and r8, r14, r10, lsr #2
and r10, r10, r14
orr r10, r8, r10, lsl #2 //NIBBLE_ROR(r10, 2)
eor r10, r10, r6 //add 1st keyword
eor r11, r11, r7 //add 2nd keyword
eor r9, r9, r5 //add rconst
// 2nd round
movw r5, 0x6000
movt r5, 0x6001 //load rconst
ldrd r6, r7, [r1, #88] //load rkey
and r8, r12, r11 //sbox layer
eor r10, r10, r8
and r8, r10, r9
eor r12, r12, r8
orr r8, r12, r10
eor r11, r11, r8
eor r9, r9, r11
eor r10, r10, r9
and r8, r12, r10
eor r11, r11, r8
mvn r9, r9
mvn r14, r3, lsl #12 //0x0fff0fff for HALF_ROR
and r8, r14, r9, lsr #4
and r9, r9, r3
orr r9, r8, r9, lsl #12 //HALF_ROR(r9, 4)
and r8, r3, r11, lsr #12
and r11, r11, r14
orr r11, r8, r11, lsl #4 //HALF_ROR(r11, 12)
rev16 r10, r10 //HALF_ROR(r10, 8)
eor r10, r10, r6 //add 1st keyword
eor r11, r11, r7 //add 2nd keyword
eor r12, r12, r5 //add rconst
// 3rd round
movw r5, 0x0002
movt r5, 0x4150 //load rconst
ldrd r6, r7, [r1, #96] //load rkey
and r8, r9, r11 //sbox layer
eor r10, r10, r8
and r8, r10, r12
eor r9, r9, r8
orr r8, r9, r10
eor r11, r11, r8
eor r12, r12, r11
eor r10, r10, r12
and r8, r9, r10
eor r11, r11, r8
mvn r12, r12
orr r14, r2, r2, lsl #2 //0x55555555 for SWAPMOVE
eor r8, r10, r10, lsr #1
and r8, r8, r14
eor r10, r10, r8
eor r10, r10, r8, lsl #1 //SWAPMOVE(r10, r10, 0x55555555, 1)
eor r8, r12, r12, lsr #1
and r8, r8, r14, lsr #16
eor r12, r12, r8
eor r12, r12, r8, lsl #1 //SWAPMOVE(r12, r12, 0x00005555, 1)
eor r8, r11, r11, lsr #1
and r8, r8, r14, lsl #16
eor r11, r11, r8
eor r11, r11, r8, lsl #1 //SWAPMOVE(r11, r11, 0x55550000, 1)
eor r10, r10, r6 //add 1st keyword
eor r11, r7, r11, ror #16 //add 2nd keyword
eor r9, r9, r5 //add rconst
// 4th round
movw r5, 0x0080
movt r5, 0x0303 //load rconst
ldrd r6, r7, [r1, #104] //load rkey
and r8, r11, r12, ror #16 //sbox layer
eor r10, r10, r8
and r8, r10, r9
eor r12, r8, r12, ror #16
orr r8, r12, r10
eor r11, r11, r8
eor r9, r9, r11
eor r10, r10, r9
and r8, r12, r10
eor r11, r11, r8
mvn r9, r9
eor r14, r3, r3, lsl #8 //0x0f0f0f0f for BYTE_ROR
and r8, r14, r10, lsr #4
and r10, r10, r14
orr r10, r8, r10, lsl #4 //BYTE_ROR(r10, 4)
orr r14, r14, r14, lsl #2 //0x3f3f3f3f for BYTE_ROR
mvn r8, r14
and r8, r8, r11, lsl #6
and r11, r14, r11, lsr #2
orr r11, r11, r8 //BYTE_ROR(r11, 2)
mvn r8, r14, lsr #6
and r8, r8, r9, lsr #6
and r9, r14, r9
orr r9, r8, r9, lsl #2 //BYTE_ROR(r9, 6)
eor r10, r10, r6 //add 1st keyword
eor r11, r11, r7 //add 2nd keyword
eor r12, r12, r5 //add rconst
// 5th round
movw r5, 0x0027
movt r5, 0x8000 //load rconst
ldrd r6, r7, [r1, #112] //load rkey
and r8, r9, r11 //sbox layer
eor r10, r10, r8
and r8, r10, r12
eor r9, r9, r8
orr r8, r9, r10
eor r11, r11, r8
eor r12, r12, r11
eor r10, r10, r12
and r8, r9, r10
eor r11, r11, r8
mvn r12, r12
eor r10, r6, r10, ror #16 //add 1st keyword
eor r11, r7, r11, ror #8 //add 2nd keyword
eor r9, r9, r5 //add rconst
// ------------------ 4th QUINTUPLE ROUND ------------------
// 1st round
movw r5, 0x8880
movt r5, 0x1000 //load rconst
ldrd r6, r7, [r1, #120] //load rkey
and r8, r11, r12, ror #24 //sbox layer
eor r10, r10, r8
and r8, r10, r9
eor r12, r8, r12, ror #24
orr r8, r12, r10
eor r11, r11, r8
eor r9, r9, r11
eor r10, r10, r9
and r8, r12, r10
eor r11, r11, r8
mvn r9, r9
and r8, r4, r9, lsr #1
and r9, r9, r2
orr r9, r8, r9, lsl #3 //NIBBLE_ROR(r9, 1)
and r8, r4, r11
and r11, r2, r11, lsr #3
orr r11, r11, r8, lsl #1 //NIBBLE_ROR(r11, 3)
orr r14, r2, r2, lsl #1 //0x33333333 for NIBBLE_ROR
and r8, r14, r10, lsr #2
and r10, r10, r14
orr r10, r8, r10, lsl #2 //NIBBLE_ROR(r10, 2)
eor r10, r10, r6 //add 1st keyword
eor r11, r11, r7 //add 2nd keyword
eor r12, r12, r5 //add rconst
// 2nd round
movw r5, 0xe000
movt r5, 0x4001 //load rconst
ldrd r6, r7, [r1, #128] //load rkey
and r8, r9, r11 //sbox layer
eor r10, r10, r8
and r8, r10, r12
eor r9, r9, r8
orr r8, r9, r10
eor r11, r11, r8
eor r12, r12, r11
eor r10, r10, r12
and r8, r9, r10
eor r11, r11, r8
mvn r12, r12
mvn r14, r3, lsl #12 //0x0fff0fff for HALF_ROR
and r8, r14, r12, lsr #4
and r12, r12, r3
orr r12, r8, r12, lsl #12 //HALF_ROR(r12, 4)
and r8, r3, r11, lsr #12
and r11, r11, r14
orr r11, r8, r11, lsl #4 //HALF_ROR(r11, 12)
rev16 r10, r10 //HALF_ROR(r10, 8)
eor r10, r10, r6 //add 1st keyword
eor r11, r11, r7 //add 2nd keyword
eor r9, r9, r5 //add rconst
// 3rd round
movw r5, 0x0002
movt r5, 0x1150 //load rconst
ldrd r6, r7, [r1, #136] //load rkey
and r8, r12, r11 //sbox layer
eor r10, r10, r8
and r8, r10, r9
eor r12, r12, r8
orr r8, r12, r10
eor r11, r11, r8
eor r9, r9, r11
eor r10, r10, r9
and r8, r12, r10
eor r11, r11, r8
mvn r9, r9
orr r14, r2, r2, lsl #2 //0x55555555 for SWAPMOVE
eor r8, r10, r10, lsr #1
and r8, r8, r14
eor r10, r10, r8
eor r10, r10, r8, lsl #1 //SWAPMOVE(r10, r10, 0x55555555, 1)
eor r8, r9, r9, lsr #1
and r8, r8, r14, lsr #16
eor r9, r9, r8
eor r9, r9, r8, lsl #1 //SWAPMOVE(r9, r9, 0x00005555, 1)
eor r8, r11, r11, lsr #1
and r8, r8, r14, lsl #16
eor r11, r11, r8
eor r11, r11, r8, lsl #1 //SWAPMOVE(r11, r11, 0x55550000, 1)
eor r10, r10, r6 //add 1st keyword
eor r11, r7, r11, ror #16 //add 2nd keyword
eor r12, r12, r5 //add rconst
// 4th round
movw r5, 0x0180
movt r5, 0x0302 //load rconst
ldrd r6, r7, [r1, #144] //load rkey
and r8, r11, r9, ror #16 //sbox layer
eor r10, r10, r8
and r8, r10, r12
eor r9, r8, r9, ror #16
orr r8, r9, r10
eor r11, r11, r8
eor r12, r12, r11
eor r10, r10, r12
and r8, r9, r10
eor r11, r11, r8
mvn r12, r12
eor r14, r3, r3, lsl #8 //0x0f0f0f0f for BYTE_ROR
and r8, r14, r10, lsr #4
and r10, r10, r14
orr r10, r8, r10, lsl #4 //BYTE_ROR(r10, 4)
orr r14, r14, r14, lsl #2 //0x3f3f3f3f for BYTE_ROR
mvn r8, r14
and r8, r8, r11, lsl #6
and r11, r14, r11, lsr #2
orr r11, r11, r8 //BYTE_ROR(r11, 2)
mvn r8, r14, lsr #6
and r8, r8, r12, lsr #6
and r12, r14, r12
orr r12, r8, r12, lsl #2 //BYTE_ROR(r12, 6)
eor r10, r10, r6 //add 1st keyword
eor r11, r11, r7 //add 2nd keyword
eor r9, r9, r5 //add rconst
// 5th round
movw r5, 0x002b
movt r5, 0x8000 //load rconst
ldrd r6, r7, [r1, #152] //load rkey
and r8, r12, r11 //sbox layer
eor r10, r10, r8
and r8, r10, r9
eor r12, r12, r8
orr r8, r12, r10
eor r11, r11, r8
eor r9, r9, r11
eor r10, r10, r9
and r8, r12, r10
eor r11, r11, r8
mvn r9, r9
eor r10, r6, r10, ror #16 //add 1st keyword
eor r11, r7, r11, ror #8 //add 2nd keyword
eor r12, r12, r5 //add rconst
// ------------------ 5th QUINTUPLE ROUND ------------------
// 1st round
movw r5, 0x0880
movt r5, 0x1008 //load rconst
ldrd r6, r7, [r1, #160] //load rkey
and r8, r11, r9, ror #24 //sbox layer
eor r10, r10, r8
and r8, r10, r12
eor r9, r8, r9, ror #24
orr r8, r9, r10
eor r11, r11, r8
eor r12, r12, r11
eor r10, r10, r12
and r8, r9, r10
eor r11, r11, r8
mvn r12, r12
and r8, r4, r12, lsr #1
and r12, r12, r2
orr r12, r8, r12, lsl #3 //NIBBLE_ROR(r12, 1)
and r8, r4, r11
and r11, r2, r11, lsr #3
orr r11, r11, r8, lsl #1 //NIBBLE_ROR(r11, 3)
orr r14, r2, r2, lsl #1 //0x33333333 for NIBBLE_ROR
and r8, r14, r10, lsr #2
and r10, r10, r14
orr r10, r8, r10, lsl #2 //NIBBLE_ROR(r10, 2)
eor r10, r10, r6 //add 1st keyword
eor r11, r11, r7 //add 2nd keyword
eor r9, r9, r5 //add rconst
// 2nd round
movw r5, 0x4000
movt r5, 0x6001 //load rconst
ldrd r6, r7, [r1, #168] //load rkey
and r8, r12, r11 //sbox layer
eor r10, r10, r8
and r8, r10, r9
eor r12, r12, r8
orr r8, r12, r10
eor r11, r11, r8
eor r9, r9, r11
eor r10, r10, r9
and r8, r12, r10
eor r11, r11, r8
mvn r9, r9
mvn r14, r3, lsl #12 //0x0fff0fff for HALF_ROR
and r8, r14, r9, lsr #4
and r9, r9, r3
orr r9, r8, r9, lsl #12 //HALF_ROR(r9, 4)
and r8, r3, r11, lsr #12
and r11, r11, r14
orr r11, r8, r11, lsl #4 //HALF_ROR(r11, 12)
rev16 r10, r10 //HALF_ROR(r10, 8)
eor r10, r10, r6 //add 1st keyword
eor r11, r11, r7 //add 2nd keyword
eor r12, r12, r5 //add rconst
// 3rd round
movw r5, 0x0002
movt r5, 0x0140 //load rconst
ldrd r6, r7, [r1, #176] //load rkey
and r8, r9, r11 //sbox layer
eor r10, r10, r8
and r8, r10, r12
eor r9, r9, r8
orr r8, r9, r10
eor r11, r11, r8
eor r12, r12, r11
eor r10, r10, r12
and r8, r9, r10
eor r11, r11, r8
mvn r12, r12
orr r14, r2, r2, lsl #2 //0x55555555 for SWAPMOVE
eor r8, r10, r10, lsr #1
and r8, r8, r14
eor r10, r10, r8
eor r10, r10, r8, lsl #1 //SWAPMOVE(r10, r10, 0x55555555, 1)
eor r8, r12, r12, lsr #1
and r8, r8, r14, lsr #16
eor r12, r12, r8
eor r12, r12, r8, lsl #1 //SWAPMOVE(r12, r12, 0x00005555, 1)
eor r8, r11, r11, lsr #1
and r8, r8, r14, lsl #16
eor r11, r11, r8
eor r11, r11, r8, lsl #1 //SWAPMOVE(r11, r11, 0x55550000, 1)
eor r10, r10, r6 //add 1st keyword
eor r11, r7, r11, ror #16 //add 2nd keyword
eor r9, r9, r5 //add rconst
// 4th round
movw r5, 0x0080
movt r5, 0x0202 //load rconst
ldrd r6, r7, [r1, #184] //load rkey
and r8, r11, r12, ror #16 //sbox layer
eor r10, r10, r8
and r8, r10, r9
eor r12, r8, r12, ror #16
orr r8, r12, r10
eor r11, r11, r8
eor r9, r9, r11
eor r10, r10, r9
and r8, r12, r10
eor r11, r11, r8
mvn r9, r9
eor r14, r3, r3, lsl #8 //0x0f0f0f0f for BYTE_ROR
and r8, r14, r10, lsr #4
and r10, r10, r14
orr r10, r8, r10, lsl #4 //BYTE_ROR(r10, 4)
orr r14, r14, r14, lsl #2 //0x3f3f3f3f for BYTE_ROR
mvn r8, r14
and r8, r8, r11, lsl #6
and r11, r14, r11, lsr #2
orr r11, r11, r8 //BYTE_ROR(r11, 2)
mvn r8, r14, lsr #6
and r8, r8, r9, lsr #6
and r9, r14, r9
orr r9, r8, r9, lsl #2 //BYTE_ROR(r9, 6)
eor r10, r10, r6 //add 1st keyword
eor r11, r11, r7 //add 2nd keyword
eor r12, r12, r5 //add rconst
// 5th round
movw r5, 0x0021
movt r5, 0x8000 //load rconst
ldrd r6, r7, [r1, #192] //load rkey
and r8, r9, r11 //sbox layer
eor r10, r10, r8
and r8, r10, r12
eor r9, r9, r8
orr r8, r9, r10
eor r11, r11, r8
eor r12, r12, r11
eor r10, r10, r12
and r8, r9, r10
eor r11, r11, r8
mvn r12, r12
eor r10, r6, r10, ror #16 //add 1st keyword
eor r11, r7, r11, ror #8 //add 2nd keyword
eor r9, r9, r5 //add rconst
// ------------------ 6th QUINTUPLE ROUND ------------------
// 1st round
movw r5, 0x0080
movt r5, 0x1000 //load rconst
ldrd r6, r7, [r1, #200] //load rkey
and r8, r11, r12, ror #24 //sbox layer
eor r10, r10, r8
and r8, r10, r9
eor r12, r8, r12, ror #24
orr r8, r12, r10
eor r11, r11, r8
eor r9, r9, r11
eor r10, r10, r9
and r8, r12, r10
eor r11, r11, r8
mvn r9, r9
and r8, r4, r9, lsr #1
and r9, r9, r2
orr r9, r8, r9, lsl #3 //NIBBLE_ROR(r9, 1)
and r8, r4, r11
and r11, r2, r11, lsr #3
orr r11, r11, r8, lsl #1 //NIBBLE_ROR(r11, 3)
orr r14, r2, r2, lsl #1 //0x33333333 for NIBBLE_ROR
and r8, r14, r10, lsr #2
and r10, r10, r14
orr r10, r8, r10, lsl #2 //NIBBLE_ROR(r10, 2)
eor r10, r10, r6 //add 1st keyword
eor r11, r11, r7 //add 2nd keyword
eor r12, r12, r5 //add rconst
// 2nd round
movw r5, 0xc000
movt r5, 0x0001 //load rconst
ldrd r6, r7, [r1, #208] //load rkey
and r8, r9, r11 //sbox layer
eor r10, r10, r8
and r8, r10, r12
eor r9, r9, r8
orr r8, r9, r10
eor r11, r11, r8
eor r12, r12, r11
eor r10, r10, r12
and r8, r9, r10
eor r11, r11, r8
mvn r12, r12
mvn r14, r3, lsl #12 //0x0fff0fff for HALF_ROR
and r8, r14, r12, lsr #4
and r12, r12, r3
orr r12, r8, r12, lsl #12 //HALF_ROR(r12, 4)
and r8, r3, r11, lsr #12
and r11, r11, r14
orr r11, r8, r11, lsl #4 //HALF_ROR(r11, 12)
rev16 r10, r10 //HALF_ROR(r10, 8)
eor r10, r10, r6 //add 1st keyword
eor r11, r11, r7 //add 2nd keyword
eor r9, r9, r5 //add rconst
// 3rd round
movw r5, 0x0002
movt r5, 0x5100 //load rconst
ldrd r6, r7, [r1, #216] //load rkey
and r8, r12, r11 //sbox layer
eor r10, r10, r8
and r8, r10, r9
eor r12, r12, r8
orr r8, r12, r10
eor r11, r11, r8
eor r9, r9, r11
eor r10, r10, r9
and r8, r12, r10
eor r11, r11, r8
mvn r9, r9
orr r14, r2, r2, lsl #2 //0x55555555 for SWAPMOVE
eor r8, r10, r10, lsr #1
and r8, r8, r14
eor r10, r10, r8
eor r10, r10, r8, lsl #1 //SWAPMOVE(r10, r10, 0x55555555, 1)
eor r8, r9, r9, lsr #1
and r8, r8, r14, lsr #16
eor r9, r9, r8
eor r9, r9, r8, lsl #1 //SWAPMOVE(r9, r9, 0x00005555, 1)
eor r8, r11, r11, lsr #1
and r8, r8, r14, lsl #16
eor r11, r11, r8
eor r11, r11, r8, lsl #1 //SWAPMOVE(r11, r11, 0x55550000, 1)
eor r10, r10, r6 //add 1st keyword
eor r11, r7, r11, ror #16 //add 2nd keyword
eor r12, r12, r5 //add rconst
// 4th round
movw r5, 0x0180
movt r5, 0x0301 //load rconst
ldrd r6, r7, [r1, #224] //load rkey
and r8, r11, r9, ror #16 //sbox layer
eor r10, r10, r8
and r8, r10, r12
eor r9, r8, r9, ror #16
orr r8, r9, r10
eor r11, r11, r8
eor r12, r12, r11
eor r10, r10, r12
and r8, r9, r10
eor r11, r11, r8
mvn r12, r12
eor r14, r3, r3, lsl #8 //0x0f0f0f0f for BYTE_ROR
and r8, r14, r10, lsr #4
and r10, r10, r14
orr r10, r8, r10, lsl #4 //BYTE_ROR(r10, 4)
orr r14, r14, r14, lsl #2 //0x3f3f3f3f for BYTE_ROR
mvn r8, r14
and r8, r8, r11, lsl #6
and r11, r14, r11, lsr #2
orr r11, r11, r8 //BYTE_ROR(r11, 2)
mvn r8, r14, lsr #6
and r8, r8, r12, lsr #6
and r12, r14, r12
orr r12, r8, r12, lsl #2 //BYTE_ROR(r12, 6)
eor r10, r10, r6 //add 1st keyword
eor r11, r11, r7 //add 2nd keyword
eor r9, r9, r5 //add rconst
// 5th round
movw r5, 0x002e
movt r5, 0x8000 //load rconst
ldrd r6, r7, [r1, #232] //load rkey
and r8, r12, r11 //sbox layer
eor r10, r10, r8
and r8, r10, r9
eor r12, r12, r8
orr r8, r12, r10
eor r11, r11, r8
eor r9, r9, r11
eor r10, r10, r9
and r8, r12, r10
eor r11, r11, r8
mvn r9, r9
eor r10, r6, r10, ror #16 //add 1st keyword
eor r11, r7, r11, ror #8 //add 2nd keyword
eor r12, r12, r5 //add rconst
// ------------------ 7th QUINTUPLE ROUND ------------------
// 1st round
movw r5, 0x8800
movt r5, 0x1008 //load rconst
ldrd r6, r7, [r1, #240] //load rkey
and r8, r11, r9, ror #24 //sbox layer
eor r10, r10, r8
and r8, r10, r12
eor r9, r8, r9, ror #24
orr r8, r9, r10
eor r11, r11, r8
eor r12, r12, r11
eor r10, r10, r12
and r8, r9, r10
eor r11, r11, r8
mvn r12, r12
and r8, r4, r12, lsr #1
and r12, r12, r2
orr r12, r8, r12, lsl #3 //NIBBLE_ROR(r12, 1)
and r8, r4, r11
and r11, r2, r11, lsr #3
orr r11, r11, r8, lsl #1 //NIBBLE_ROR(r11, 3)
orr r14, r2, r2, lsl #1 //0x33333333 for NIBBLE_ROR
and r8, r14, r10, lsr #2
and r10, r10, r14
orr r10, r8, r10, lsl #2 //NIBBLE_ROR(r10, 2)
eor r10, r10, r6 //add 1st keyword
eor r11, r11, r7 //add 2nd keyword
eor r9, r9, r5 //add rconst
// 2nd round
movw r5, 0x2000
movt r5, 0x6001 //load rconst
ldrd r6, r7, [r1, #248] //load rkey
and r8, r12, r11 //sbox layer
eor r10, r10, r8
and r8, r10, r9
eor r12, r12, r8
orr r8, r12, r10
eor r11, r11, r8
eor r9, r9, r11
eor r10, r10, r9
and r8, r12, r10
eor r11, r11, r8
mvn r9, r9
mvn r14, r3, lsl #12 //0x0fff0fff for HALF_ROR
and r8, r14, r9, lsr #4
and r9, r9, r3
orr r9, r8, r9, lsl #12 //HALF_ROR(r9, 4)
and r8, r3, r11, lsr #12
and r11, r11, r14
orr r11, r8, r11, lsl #4 //HALF_ROR(r11, 12)
rev16 r10, r10 //HALF_ROR(r10, 8)
eor r10, r10, r6 //add 1st keyword
eor r11, r11, r7 //add 2nd keyword
eor r12, r12, r5 //add rconst
// 3rd round
movw r5, 0x0002
movt r5, 0x4050 //load rconst
ldrd r6, r7, [r1, #256] //load rkey
and r8, r9, r11 //sbox layer
eor r10, r10, r8
and r8, r10, r12
eor r9, r9, r8
orr r8, r9, r10
eor r11, r11, r8
eor r12, r12, r11
eor r10, r10, r12
and r8, r9, r10
eor r11, r11, r8
mvn r12, r12
orr r14, r2, r2, lsl #2 //0x55555555 for SWAPMOVE
eor r8, r10, r10, lsr #1
and r8, r8, r14
eor r10, r10, r8
eor r10, r10, r8, lsl #1 //SWAPMOVE(r10, r10, 0x55555555, 1)
eor r8, r12, r12, lsr #1
and r8, r8, r14, lsr #16
eor r12, r12, r8
eor r12, r12, r8, lsl #1 //SWAPMOVE(r12, r12, 0x00005555, 1)
eor r8, r11, r11, lsr #1
and r8, r8, r14, lsl #16
eor r11, r11, r8
eor r11, r11, r8, lsl #1 //SWAPMOVE(r11, r11, 0x55550000, 1)
eor r10, r10, r6 //add 1st keyword
eor r11, r7, r11, ror #16 //add 2nd keyword
eor r9, r9, r5 //add rconst
// 4th round
movw r5, 0x0080
movt r5, 0x0103 //load rconst
ldrd r6, r7, [r1, #264] //load rkey
and r8, r11, r12, ror #16 //sbox layer
eor r10, r10, r8
and r8, r10, r9
eor r12, r8, r12, ror #16
orr r8, r12, r10
eor r11, r11, r8
eor r9, r9, r11
eor r10, r10, r9
and r8, r12, r10
eor r11, r11, r8
mvn r9, r9
eor r14, r3, r3, lsl #8 //0x0f0f0f0f for BYTE_ROR
and r8, r14, r10, lsr #4
and r10, r10, r14
orr r10, r8, r10, lsl #4 //BYTE_ROR(r10, 4)
orr r14, r14, r14, lsl #2 //0x3f3f3f3f for BYTE_ROR
mvn r8, r14
and r8, r8, r11, lsl #6
and r11, r14, r11, lsr #2
orr r11, r11, r8 //BYTE_ROR(r11, 2)
mvn r8, r14, lsr #6
and r8, r8, r9, lsr #6
and r9, r14, r9
orr r9, r8, r9, lsl #2 //BYTE_ROR(r9, 6)
eor r10, r10, r6 //add 1st keyword
eor r11, r11, r7 //add 2nd keyword
eor r12, r12, r5 //add rconst
// 5th round
movw r5, 0x0006
movt r5, 0x8000 //load rconst
ldrd r6, r7, [r1, #272] //load rkey
and r8, r9, r11 //sbox layer
eor r10, r10, r8
and r8, r10, r12
eor r9, r9, r8
orr r8, r9, r10
eor r11, r11, r8
eor r12, r12, r11
eor r10, r10, r12
and r8, r9, r10
eor r11, r11, r8
mvn r12, r12
eor r10, r6, r10, ror #16 //add 1st keyword
eor r11, r7, r11, ror #8 //add 2nd keyword
eor r9, r9, r5 //add rconst
// ------------------ 8th QUINTUPLE ROUND ------------------
// 1st round
movw r5, 0x8808
movt r5, 0x1000 //load rconst
ldrd r6, r7, [r1, #280] //load rkey
and r8, r11, r12, ror #24 //sbox layer
eor r10, r10, r8
and r8, r10, r9
eor r12, r8, r12, ror #24
orr r8, r12, r10
eor r11, r11, r8
eor r9, r9, r11
eor r10, r10, r9
and r8, r12, r10
eor r11, r11, r8
mvn r9, r9
and r8, r4, r9, lsr #1
and r9, r9, r2
orr r9, r8, r9, lsl #3 //NIBBLE_ROR(r9, 1)
and r8, r4, r11
and r11, r2, r11, lsr #3
orr r11, r11, r8, lsl #1 //NIBBLE_ROR(r11, 3)
orr r14, r2, r2, lsl #1 //0x33333333 for NIBBLE_ROR
and r8, r14, r10, lsr #2
and r10, r10, r14
orr r10, r8, r10, lsl #2 //NIBBLE_ROR(r10, 2)
eor r10, r10, r6 //add 1st keyword
eor r11, r11, r7 //add 2nd keyword
eor r12, r12, r5 //add rconst
// 2nd round
movw r5, 0xa000
movt r5, 0xc001 //load rconst
ldrd r6, r7, [r1, #288] //load rkey
and r8, r9, r11 //sbox layer
eor r10, r10, r8
and r8, r10, r12
eor r9, r9, r8
orr r8, r9, r10
eor r11, r11, r8
eor r12, r12, r11
eor r10, r10, r12
and r8, r9, r10
eor r11, r11, r8
mvn r12, r12
mvn r14, r3, lsl #12 //0x0fff0fff for HALF_ROR
and r8, r14, r12, lsr #4
and r12, r12, r3
orr r12, r8, r12, lsl #12 //HALF_ROR(r12, 4)
and r8, r3, r11, lsr #12
and r11, r11, r14
orr r11, r8, r11, lsl #4 //HALF_ROR(r11, 12)
rev16 r10, r10 //HALF_ROR(r10, 8)
eor r10, r10, r6 //add 1st keyword
eor r11, r11, r7 //add 2nd keyword
eor r9, r9, r5 //add rconst
// 3rd round
movw r5, 0x0002
movt r5, 0x1450 //load rconst
ldrd r6, r7, [r1, #296] //load rkey
and r8, r12, r11 //sbox layer
eor r10, r10, r8
and r8, r10, r9
eor r12, r12, r8
orr r8, r12, r10
eor r11, r11, r8
eor r9, r9, r11
eor r10, r10, r9
and r8, r12, r10
eor r11, r11, r8
mvn r9, r9
orr r14, r2, r2, lsl #2 //0x55555555 for SWAPMOVE
eor r8, r10, r10, lsr #1
and r8, r8, r14
eor r10, r10, r8
eor r10, r10, r8, lsl #1 //SWAPMOVE(r10, r10, 0x55555555, 1)
eor r8, r9, r9, lsr #1
and r8, r8, r14, lsr #16
eor r9, r9, r8
eor r9, r9, r8, lsl #1 //SWAPMOVE(r9, r9, 0x00005555, 1)
eor r8, r11, r11, lsr #1
and r8, r8, r14, lsl #16
eor r11, r11, r8
eor r11, r11, r8, lsl #1 //SWAPMOVE(r11, r11, 0x55550000, 1)
eor r10, r10, r6 //add 1st keyword
eor r11, r7, r11, ror #16 //add 2nd keyword
eor r12, r12, r5 //add rconst
// 4th round
movw r5, 0x0181
movt r5, 0x0102 //load rconst
ldrd r6, r7, [r1, #304] //load rkey
and r8, r11, r9, ror #16 //sbox layer
eor r10, r10, r8
and r8, r10, r12
eor r9, r8, r9, ror #16
orr r8, r9, r10
eor r11, r11, r8
eor r12, r12, r11
eor r10, r10, r12
and r8, r9, r10
eor r11, r11, r8
mvn r12, r12
eor r14, r3, r3, lsl #8 //0x0f0f0f0f for BYTE_ROR
and r8, r14, r10, lsr #4
and r10, r10, r14
orr r10, r8, r10, lsl #4 //BYTE_ROR(r10, 4)
orr r14, r14, r14, lsl #2 //0x3f3f3f3f for BYTE_ROR
mvn r8, r14
and r8, r8, r11, lsl #6
and r11, r14, r11, lsr #2
orr r11, r11, r8 //BYTE_ROR(r11, 2)
mvn r8, r14, lsr #6
and r8, r8, r12, lsr #6
and r12, r14, r12
orr r12, r8, r12, lsl #2 //BYTE_ROR(r12, 6)
eor r10, r10, r6 //add 1st keyword
eor r11, r11, r7 //add 2nd keyword
eor r9, r9, r5 //add rconst
// 5th round
movw r5, 0x001a
movt r5, 0x8000 //load rconst
ldrd r6, r7, [r1, #312] //load rkey
and r8, r12, r11 //sbox layer
eor r10, r10, r8
and r8, r10, r9
eor r12, r12, r8
orr r8, r12, r10
eor r11, r11, r8
eor r9, r9, r11
eor r10, r10, r9
and r8, r12, r10
eor r11, r11, r8
mvn r9, r9, ror #24
eor r10, r6, r10, ror #16 //add 1st keyword
eor r11, r7, r11, ror #8 //add 2nd keyword
eor r12, r12, r5 //add rconst
// endianness
rev r9, r9
rev r10, r10
rev r11, r11
rev r12, r12
stm r0, {r9-r12}
pop {r2-r12,r14}
bx lr
#define CRYPTO_KEYBYTES 16
#define CRYPTO_NSECBYTES 0
#define CRYPTO_NPUBBYTES 16
#define CRYPTO_ABYTES 16
#define CRYPTO_NOOVERLAP 1
#ifndef COFB_H_
#define COFB_H_
/*
 * GIFT-COFB mode-level helper macros.
 *
 * All of these are GCC/Clang statement expressions and operate on arrays of
 * u32 words. They deliberately use the variables `tmp0` and `tmp1` from the
 * ENCLOSING scope (declared in giftcofb_crypt) as scratch — the macros cannot
 * be used in a function that does not declare them.
 */
/* Multiply the 64-bit top half-block (x)[0..1] by 2 (doubling).
 * Per-byte left shift with cross-byte carries reassembled by the masked
 * shifts; the `* 27` term is the field feedback constant (0x1B) applied when
 * the top bit falls out — presumably GF(2^64) doubling per the GIFT-COFB
 * spec; TODO confirm bit ordering against the reference document. */
#define DOUBLE_HALF_BLOCK(x) ({ \
tmp0 = (x)[0]; \
(x)[0] = (((x)[0] & 0x7f7f7f7f) << 1) | (((x)[0] & 0x80808080) >> 15); \
(x)[0] |= ((x)[1] & 0x80808080) << 17; \
(x)[1] = (((x)[1] & 0x7f7f7f7f) << 1) | (((x)[1] & 0x80808080) >> 15); \
(x)[1] ^= (((tmp0 >> 7) & 1) * 27) << 24; \
})
/* Multiply the 64-bit top half-block by 3: doubling (as above) followed by
 * XOR of the saved original value (3·x = 2·x ^ x). */
#define TRIPLE_HALF_BLOCK(x) ({ \
tmp0 = (x)[0]; \
tmp1 = (x)[1]; \
(x)[0] = (((x)[0] & 0x7f7f7f7f) << 1) | (((x)[0] & 0x80808080) >> 15); \
(x)[0] |= ((x)[1] & 0x80808080) << 17; \
(x)[1] = (((x)[1] & 0x7f7f7f7f) << 1) | (((x)[1] & 0x80808080) >> 15); \
(x)[1] ^= (((tmp0 >> 7) & 1) * 27) << 24; \
(x)[0] ^= tmp0; \
(x)[1] ^= tmp1; \
})
/* Feedback function G over a 128-bit block (x)[0..3]: the bottom half moves
 * to the top, and the old top half — doubled the same way as above — becomes
 * the new bottom half. */
#define G(x) ({ \
tmp0 = (x)[0]; \
tmp1 = (x)[1]; \
(x)[0] = (x)[2]; \
(x)[1] = (x)[3]; \
(x)[2] = ((tmp0 & 0x7f7f7f7f) << 1) | ((tmp0 & 0x80808080) >> 15); \
(x)[2] |= ((tmp1 & 0x80808080) << 17); \
(x)[3] = ((tmp1 & 0x7f7f7f7f) << 1) | ((tmp1 & 0x80808080) >> 15); \
(x)[3] |= ((tmp0 & 0x80808080) << 17); \
})
/* 128-bit XOR: x = y ^ z, word by word. x may alias y or z. */
#define XOR_BLOCK(x, y, z) ({ \
(x)[0] = (y)[0] ^ (z)[0]; \
(x)[1] = (y)[1] ^ (z)[1]; \
(x)[2] = (y)[2] ^ (z)[2]; \
(x)[3] = (y)[3] ^ (z)[3]; \
})
/* XOR the 64-bit offset y into the top half of block x only. */
#define XOR_TOP_BAR_BLOCK(x, y) ({ \
(x)[0] ^= (y)[0]; \
(x)[1] ^= (y)[1]; \
})
/* rho1: d = pad(m, n) ^ G(y). Note G mutates y in place. */
#define RHO1(d, y, m, n) ({ \
G(y); \
padding(d,m,n); \
XOR_BLOCK(d, d, y); \
})
/* Encryption feedback: ciphertext c = y ^ m, then next input x via rho1. */
#define RHO(y, m, x, c, n) ({ \
XOR_BLOCK(c, y, m); \
RHO1(x, y, m, n); \
})
/* Decryption feedback: recovered message m = y ^ c, then next input x. */
#define RHO_PRIME(y, c, x, m, n) ({ \
XOR_BLOCK(m, y, c); \
RHO1(x, y, m, n); \
})
#endif // COFB_H_
\ No newline at end of file
/*******************************************************************************
* Constant-time 32-bit implementation of the GIFT-COFB authenticated cipher.
*
* @author Alexandre Adomnicai, Nanyang Technological University,
* alexandre.adomnicai@ntu.edu.sg
* @date January 2020
*******************************************************************************/
#include <string.h> //for memcpy
#include "api.h"
#include "cofb.h"
#include "giftb128.h"
#define TAGBYTES CRYPTO_ABYTES
#define BLOCKBYTES CRYPTO_ABYTES
#define COFB_ENCRYPT 1
#define COFB_DECRYPT 0
/****************************************************************************
* 32-bit padding implementation.
****************************************************************************/
/*
 * Copy a partial block into a full 16-byte block with 10* padding, 32 bits at
 * a time. d receives exactly four words:
 *   - no_of_bytes == 0:      block is 0x80 00 ... 00 (little-endian words);
 *   - 0 < n < BLOCKBYTES:    the n source bytes, then a 0x80 byte, then zeros;
 *   - n >= BLOCKBYTES:       a plain 4-word copy of s.
 * Note: for a partial block the last touched source word is read in full,
 * so s must be readable up to the next word boundary.
 */
static inline void padding(u32* d, const u32* s, const u32 no_of_bytes){
    u32 w;
    if (no_of_bytes == 0) {
        /* empty input: only the 0x80 terminator byte, little-endian */
        d[0] = 0x00000080;
        d[1] = 0x00000000;
        d[2] = 0x00000000;
        d[3] = 0x00000000;
        return;
    }
    if (no_of_bytes >= BLOCKBYTES) {
        /* full block: straight copy */
        d[0] = s[0];
        d[1] = s[1];
        d[2] = s[2];
        d[3] = s[3];
        return;
    }
    /* partial block: copy whole words covering the data ... */
    for (w = 0; w < no_of_bytes/4 + 1; w++)
        d[w] = s[w];
    /* ... clear the bytes past the data in the last word, insert 0x80 ... */
    d[w-1] &= ~(0xffffffffL << (no_of_bytes % 4)*8);
    d[w-1] |= 0x00000080L << (no_of_bytes % 4)*8;
    /* ... and zero the remaining words. */
    while (w < 4)
        d[w++] = 0x00000000;
}
/****************************************************************************
* Constant-time implementation of the GIFT-COFB authenticated cipher based on
* fixsliced GIFTb-128. Encryption/decryption is handled by the same function,
* depending on the 'encrypting' parameter (1/0).
****************************************************************************/
/*
 * GIFT-COFB core routine: encrypts (encrypting == COFB_ENCRYPT) or decrypts
 * in/in_len under key/nonce with associated data ad/ad_len, writing the
 * result to out. On encryption the 16-byte tag is appended to out; on
 * decryption the tag is expected after the ciphertext in `in`.
 * Returns 0 on success, -1 if a decryption input is shorter than the tag,
 * and a nonzero value (OR of tag byte differences, constant-time compare)
 * on tag mismatch.
 *
 * NOTE(review): the (u32*) casts on Y/ad/in/out assume 4-byte alignment of
 * caller buffers and rely on type punning — verify for strict-alignment
 * targets (same pattern the caller-facing NIST API passes through).
 */
int giftcofb_crypt(u8* out, const u8* key, const u8* nonce, const u8* ad,
u32 ad_len, const u8* in, u32 in_len, const int encrypting) {
// tmp0/tmp1 are also the scratch registers used by the cofb.h macros
u32 tmp0, tmp1, emptyA, emptyM;
u32 offset[2], input[4], rkey[80];
u8 Y[16];
// decryption: strip the trailing tag from the input length
if (!encrypting) {
if (in_len < TAGBYTES)
return -1;
in_len -= TAGBYTES;
}
// flags for the empty-AD / empty-message domain separation below
if (ad_len == 0)
emptyA = 1;
else
emptyA = 0;
if (in_len == 0)
emptyM =1;
else
emptyM = 0;
// initialization: Y = E_K(nonce); offset = top half of Y
precompute_rkeys(rkey, key);
giftb128(Y, nonce, rkey);
offset[0] = ((u32*)Y)[0];
offset[1] = ((u32*)Y)[1];
// process all full (non-final) associated-data blocks; offset doubles per block
while (ad_len > BLOCKBYTES) {
RHO1(input, (u32*)Y, (u32*)ad, BLOCKBYTES);
DOUBLE_HALF_BLOCK(offset);
XOR_TOP_BAR_BLOCK(input, offset);
giftb128(Y, (u8*)input, rkey);
ad += BLOCKBYTES;
ad_len -= BLOCKBYTES;
}
// final AD block: offset tripled once, twice more if partial/empty AD,
// and twice more again if the message is empty (domain separation)
TRIPLE_HALF_BLOCK(offset);
if ((ad_len % BLOCKBYTES != 0) || (emptyA))
TRIPLE_HALF_BLOCK(offset);
if (emptyM) {
TRIPLE_HALF_BLOCK(offset);
TRIPLE_HALF_BLOCK(offset);
}
RHO1(input, (u32*)Y, (u32*)ad, ad_len);
XOR_TOP_BAR_BLOCK(input, offset);
giftb128(Y, (u8*)input, rkey);
// process all full (non-final) message blocks
while (in_len > BLOCKBYTES) {
DOUBLE_HALF_BLOCK(offset);
if (encrypting)
RHO((u32*)Y, (u32*)in, input, (u32*)out, BLOCKBYTES);
else
RHO_PRIME((u32*)Y, (u32*)in, input, (u32*)out, BLOCKBYTES);
XOR_TOP_BAR_BLOCK(input, offset);
giftb128(Y, (u8*)input, rkey);
in += BLOCKBYTES;
out += BLOCKBYTES;
in_len -= BLOCKBYTES;
}
// final message block (skipped entirely for an empty message)
if (!emptyM) {
TRIPLE_HALF_BLOCK(offset);
if(in_len % BLOCKBYTES != 0)
TRIPLE_HALF_BLOCK(offset);
if (encrypting) {
RHO((u32*)Y, (u32*)in, input, (u32*)out, in_len);
out += in_len;
}
else {
RHO_PRIME((u32*)Y, (u32*)in, input, (u32*)out, in_len);
in += in_len;
}
XOR_TOP_BAR_BLOCK(input, offset);
giftb128(Y, (u8*)input, rkey);
}
// tag handling: Y now holds the tag
if (encrypting) {
memcpy(out, Y, TAGBYTES);
return 0;
}
// decrypting
// constant-time tag comparison: accumulate byte differences
tmp0 = 0;
for(tmp1 = 0; tmp1 < TAGBYTES; tmp1++)
tmp0 |= in[tmp1] ^ Y[tmp1];
return tmp0;
}
/****************************************************************************
* API required by the NIST for the LWC competition.
****************************************************************************/
/*
 * NIST LWC AEAD encryption entry point for GIFT-COFB.
 * c receives the ciphertext followed by the TAGBYTES tag; *clen is set to
 * mlen + TAGBYTES. nsec is unused (CRYPTO_NSECBYTES == 0).
 * Returns the giftcofb_crypt status (0 on success).
 */
int crypto_aead_encrypt(unsigned char* c, unsigned long long* clen,
const unsigned char* m, unsigned long long mlen,
const unsigned char* ad, unsigned long long adlen,
const unsigned char* nsec, const unsigned char* npub,
const unsigned char* k) {
(void)nsec;
*clen = mlen + TAGBYTES;
return giftcofb_crypt(c, k, npub, ad, adlen, m, mlen, COFB_ENCRYPT);
}
/****************************************************************************
* API required by the NIST for the LWC competition.
****************************************************************************/
/*
 * NIST LWC AEAD decryption entry point for GIFT-COFB.
 * On success writes clen - TAGBYTES plaintext bytes to m and sets *mlen.
 * Returns 0 on success, nonzero on failure (short input or tag mismatch).
 *
 * Fix: validate clen before computing *mlen. Previously
 * `*mlen = clen - TAGBYTES` ran first, so a ciphertext shorter than the tag
 * underflowed the unsigned subtraction and left a huge bogus *mlen even
 * though giftcofb_crypt then rejected the input with -1.
 */
int crypto_aead_decrypt(unsigned char* m, unsigned long long *mlen,
unsigned char* nsec, const unsigned char* c,
unsigned long long clen, const unsigned char* ad,
unsigned long long adlen, const unsigned char* npub,
const unsigned char *k) {
(void)nsec;
if (clen < TAGBYTES) {
*mlen = 0; // no plaintext recovered
return -1;
}
*mlen = clen - TAGBYTES;
return giftcofb_crypt(m, k, npub, ad, adlen, c, clen, COFB_DECRYPT);
}
#ifndef ENDIAN_H_
#define ENDIAN_H_
/* Byte-swap a 32-bit value (expression macro; evaluates x four times, so do
 * not pass expressions with side effects). */
#define U32BIG(x) \
((((x) & 0x000000FF) << 24) | (((x) & 0x0000FF00) << 8) | \
(((x) & 0x00FF0000) >> 8) | (((x) & 0xFF000000) >> 24))
/* Store the 32-bit value y into byte array x in big-endian order.
 * NOTE(review): expands to four statements and is NOT do-while(0)-wrapped —
 * it must not be used as an unbraced if/else or loop body. */
#define U8BIG(x, y) \
(x)[0] = (y) >> 24; \
(x)[1] = ((y) >> 16) & 0xff; \
(x)[2] = ((y) >> 8) & 0xff; \
(x)[3] = (y) & 0xff;
#endif // ENDIAN_H_
\ No newline at end of file
/*******************************************************************************
* Optimized constant-time implementation of the GIFTb-128 block cipher.
*
* @author Alexandre Adomnicai, Nanyang Technological University,
* alexandre.adomnicai@ntu.edu.sg
*
* @date January 2020
*******************************************************************************/
#include "endian.h"
#include "giftb128.h"
#include "key_schedule.h"
/*****************************************************************************
* The round constants according to the fixsliced representation.
*****************************************************************************/
// 40 round constants for GIFTb-128: 5 per quintuple round, 8 quintuple
// rounds, already transformed into the fixsliced bit ordering. Consumed as
// (rconst)[0..4] by QUINTUPLE_ROUND in giftb128().
const u32 rconst[40] = {
0x10000008, 0x80018000, 0x54000002, 0x01010181,
0x8000001f, 0x10888880, 0x6001e000, 0x51500002,
0x03030180, 0x8000002f, 0x10088880, 0x60016000,
0x41500002, 0x03030080, 0x80000027, 0x10008880,
0x4001e000, 0x11500002, 0x03020180, 0x8000002b,
0x10080880, 0x60014000, 0x01400002, 0x02020080,
0x80000021, 0x10000080, 0x0001c000, 0x51000002,
0x03010180, 0x8000002e, 0x10088800, 0x60012000,
0x40500002, 0x01030080, 0x80000006, 0x10008808,
0xc001a000, 0x14500002, 0x01020181, 0x8000001a
};
/*****************************************************************************
* The first 20 rkeys are computed using the classical representation before
* being rearranged into fixsliced representations depending on round numbers.
* The 60 remaining rkeys are directly computed in fixscliced representations.
*****************************************************************************/
/*
 * Expand the 16-byte GIFT-128 key into 80 round-key words (rkey[0..79]).
 * The first 20 words are produced with the classical key schedule and then
 * rearranged into fixsliced form; the remaining 60 are derived directly in
 * fixsliced form from earlier fixsliced words. The update/rearrange macros
 * come from key_schedule.h; `tmp` is the scratch word SWAPMOVE expands into.
 *
 * NOTE(review): ((u32*)key) assumes the key buffer is 4-byte aligned —
 * confirm for strict-alignment targets.
 */
void precompute_rkeys(u32* rkey, const u8* key) {
u32 tmp;
//classical initialization
// load the four big-endian key words in the order the schedule consumes them
rkey[0] = U32BIG(((u32*)key)[3]);
rkey[1] = U32BIG(((u32*)key)[1]);
rkey[2] = U32BIG(((u32*)key)[2]);
rkey[3] = U32BIG(((u32*)key)[0]);
// classical keyschedule
// each step copies one word forward and rotates the other via KEY_UPDATE
for(int i = 0; i < 16; i+=2) {
rkey[i+4] = rkey[i+1];
rkey[i+5] = KEY_UPDATE(rkey[i]);
}
// transposition to fixsliced representations
// (pairs of words use the rearrangement matching their round's slicing)
for(int i = 0; i < 20; i+=10) {
rkey[i] = REARRANGE_RKEY_0(rkey[i]);
rkey[i + 1] = REARRANGE_RKEY_0(rkey[i + 1]);
rkey[i + 2] = REARRANGE_RKEY_1(rkey[i + 2]);
rkey[i + 3] = REARRANGE_RKEY_1(rkey[i + 3]);
rkey[i + 4] = REARRANGE_RKEY_2(rkey[i + 4]);
rkey[i + 5] = REARRANGE_RKEY_2(rkey[i + 5]);
rkey[i + 6] = REARRANGE_RKEY_3(rkey[i + 6]);
rkey[i + 7] = REARRANGE_RKEY_3(rkey[i + 7]);
}
// keyschedule according to fixsliced representations
// each new block of 10 words is an update of words 11..20 positions back
for(int i = 20; i < 80; i+=10) {
rkey[i] = rkey[i-19];
rkey[i+1] = KEY_TRIPLE_UPDATE_0(rkey[i-20]);
rkey[i+2] = KEY_DOUBLE_UPDATE_1(rkey[i-17]);
rkey[i+3] = KEY_TRIPLE_UPDATE_1(rkey[i-18]);
rkey[i+4] = KEY_DOUBLE_UPDATE_2(rkey[i-15]);
rkey[i+5] = KEY_TRIPLE_UPDATE_2(rkey[i-16]);
rkey[i+6] = KEY_DOUBLE_UPDATE_3(rkey[i-13]);
rkey[i+7] = KEY_TRIPLE_UPDATE_3(rkey[i-14]);
rkey[i+8] = KEY_DOUBLE_UPDATE_4(rkey[i-11]);
rkey[i+9] = KEY_TRIPLE_UPDATE_4(rkey[i-12]);
SWAPMOVE(rkey[i], rkey[i], 0x00003333, 16);
SWAPMOVE(rkey[i], rkey[i], 0x55554444, 1);
SWAPMOVE(rkey[i+1], rkey[i+1], 0x55551100, 1);
}
}
/*****************************************************************************
* Encryption of a single 128-bit block with GIFTb-128 (used in GIFT-COFB).
*****************************************************************************/
/*
 * Encrypt one 128-bit block with GIFTb-128 (fixsliced, 40 rounds executed as
 * 8 quintuple rounds). `rkey` holds the 80 precomputed round-key words;
 * `rconst` supplies 5 constants per quintuple round. Input and output are
 * big-endian byte strings.
 */
void giftb128(u8* ctext, const u8* ptext, const u32* rkey) {
    u32 tmp, state[4]; /* tmp is the scratch word used by SWAPMOVE */
    /* load the four state words, byte-swapped to the internal order */
    for (int w = 0; w < 4; w++)
        state[w] = U32BIG(((u32*)ptext)[w]);
    /* 8 quintuple rounds, each consuming 10 rkey words and 5 constants */
    for (int q = 0; q < 8; q++) {
        QUINTUPLE_ROUND(state, rkey + 10*q, rconst + 5*q);
    }
    /* store the state back as big-endian bytes */
    for (int w = 0; w < 4; w++) {
        U8BIG(ctext + 4*w, state[w]);
    }
}
#ifndef GIFT128_H_
#define GIFT128_H_
typedef unsigned char u8;
typedef unsigned int u32;
extern void precompute_rkeys(u32* rkeys, const u8* key);
extern void giftb128(u8* out, const u8* in, const u32* rkeys);
/* 32-bit rotate right. All call sites in this header use constant shifts in
 * {8,16,24}; a shift of 0 or 32 would be undefined behavior. */
#define ROR(x,y) \
(((x) >> (y)) | ((x) << (32 - (y))))
/* Rotate every byte of x right by 2 / 4 / 6 bits independently. */
#define BYTE_ROR_2(x) \
((((x) >> 2) & 0x3f3f3f3f) | (((x) & 0x03030303) << 6))
#define BYTE_ROR_4(x) \
((((x) >> 4) & 0x0f0f0f0f) | (((x) & 0x0f0f0f0f) << 4))
#define BYTE_ROR_6(x) \
((((x) >> 6) & 0x03030303) | (((x) & 0x3f3f3f3f) << 2))
/* Rotate each 16-bit half of x right by 4 / 8 / 12 bits. */
#define HALF_ROR_4(x) \
((((x) >> 4) & 0x0fff0fff) | (((x) & 0x000f000f) << 12))
#define HALF_ROR_8(x) \
((((x) >> 8) & 0x00ff00ff) | (((x) & 0x00ff00ff) << 8))
#define HALF_ROR_12(x) \
((((x) >> 12)& 0x000f000f) | (((x) & 0x0fff0fff) << 4))
/* Rotate every 4-bit nibble of x right by 1 / 2 / 3 bits. */
#define NIBBLE_ROR_1(x) \
((((x) >> 1) & 0x77777777) | (((x) & 0x11111111) << 3))
#define NIBBLE_ROR_2(x) \
((((x) >> 2) & 0x33333333) | (((x) & 0x33333333) << 2))
#define NIBBLE_ROR_3(x) \
((((x) >> 3) & 0x11111111) | (((x) & 0x77777777) << 1))
/* Swap the bits of a selected by (mask << n) with the bits of b selected by
 * mask. NOTE(review): three statements, not do-while(0)-wrapped (must not be
 * an unbraced if/loop body), and it writes a `tmp` variable that the caller
 * must have in scope. */
#define SWAPMOVE(a, b, mask, n) \
tmp = (b ^ (a >> n)) & mask; \
b ^= tmp; \
a ^= (tmp << n);
/* Bitsliced GIFT S-box applied to four slice words (multi-statement macro;
 * same bracing caveat as SWAPMOVE). */
#define SBOX(s0, s1, s2, s3) \
s1 ^= s0 & s2; \
s0 ^= s1 & s3; \
s2 ^= s0 | s1; \
s3 ^= s2; \
s1 ^= s3; \
s3 ^= 0xffffffff; \
s2 ^= s0 & s1;
/* Five consecutive fixsliced GIFT rounds: each round is SBOX + that round's
 * linear layer (nibble/half/byte/word rotations), then round-key and
 * round-constant addition. The final three XORs swap state[0] and state[3]
 * to restore the slice order for the next quintuple round. */
#define QUINTUPLE_ROUND(state, rkey, rconst) ({ \
SBOX(state[0], state[1], state[2], state[3]); \
state[3] = NIBBLE_ROR_1(state[3]); \
state[1] = NIBBLE_ROR_2(state[1]); \
state[2] = NIBBLE_ROR_3(state[2]); \
state[1] ^= (rkey)[0]; \
state[2] ^= (rkey)[1]; \
state[0] ^= (rconst)[0]; \
SBOX(state[3], state[1], state[2], state[0]); \
state[0] = HALF_ROR_4(state[0]); \
state[1] = HALF_ROR_8(state[1]); \
state[2] = HALF_ROR_12(state[2]); \
state[1] ^= (rkey)[2]; \
state[2] ^= (rkey)[3]; \
state[3] ^= (rconst)[1]; \
SBOX(state[0], state[1], state[2], state[3]); \
state[3] = ROR(state[3], 16); \
state[2] = ROR(state[2], 16); \
SWAPMOVE(state[1], state[1], 0x55555555, 1); \
SWAPMOVE(state[2], state[2], 0x00005555, 1); \
SWAPMOVE(state[3], state[3], 0x55550000, 1); \
state[1] ^= (rkey)[4]; \
state[2] ^= (rkey)[5]; \
state[0] ^= (rconst)[2]; \
SBOX(state[3], state[1], state[2], state[0]); \
state[0] = BYTE_ROR_6(state[0]); \
state[1] = BYTE_ROR_4(state[1]); \
state[2] = BYTE_ROR_2(state[2]); \
state[1] ^= (rkey)[6]; \
state[2] ^= (rkey)[7]; \
state[3] ^= (rconst)[3]; \
SBOX(state[0], state[1], state[2], state[3]); \
state[3] = ROR(state[3], 24); \
state[1] = ROR(state[1], 16); \
state[2] = ROR(state[2], 8); \
state[1] ^= (rkey)[8]; \
state[2] ^= (rkey)[9]; \
state[0] ^= (rconst)[4]; \
state[0] ^= state[3]; \
state[3] ^= state[0]; \
state[0] ^= state[3]; \
})
#endif // GIFT128_H_
\ No newline at end of file
#ifndef KEYSCHEDULE_H_
#define KEYSCHEDULE_H_
/*
 * Fixsliced GIFT-128 key-schedule helpers (used by precompute_rkeys).
 * The REARRANGE_* macros are statement expressions built on SWAPMOVE and
 * therefore need a `tmp` variable in the caller's scope; each permutes a
 * classical round-key word into one of the four fixsliced bit orderings.
 */
#define REARRANGE_RKEY_0(x) ({ \
SWAPMOVE(x, x, 0x00550055, 9); \
SWAPMOVE(x, x, 0x000f000f, 12); \
SWAPMOVE(x, x, 0x00003333, 18); \
SWAPMOVE(x, x, 0x000000ff, 24); \
})
#define REARRANGE_RKEY_1(x) ({ \
SWAPMOVE(x, x, 0x11111111, 3); \
SWAPMOVE(x, x, 0x03030303, 6); \
SWAPMOVE(x, x, 0x000f000f, 12); \
SWAPMOVE(x, x, 0x000000ff, 24); \
})
#define REARRANGE_RKEY_2(x) ({ \
SWAPMOVE(x, x, 0x0000aaaa, 15); \
SWAPMOVE(x, x, 0x00003333, 18); \
SWAPMOVE(x, x, 0x0000f0f0, 12); \
SWAPMOVE(x, x, 0x000000ff, 24); \
})
#define REARRANGE_RKEY_3(x) ({ \
SWAPMOVE(x, x, 0x0a0a0a0a, 3); \
SWAPMOVE(x, x, 0x00cc00cc, 6); \
SWAPMOVE(x, x, 0x0000f0f0, 12); \
SWAPMOVE(x, x, 0x000000ff, 24); \
})
/* Classical GIFT key update: rotate the low half right by 12 and the high
 * half right by 2 (expression macro).
 * NOTE(review): the expansion has no outer parentheses — safe at the current
 * call site (plain assignment in precompute_rkeys) but fragile if embedded
 * in a larger expression. */
#define KEY_UPDATE(x) \
(((x) >> 12) & 0x0000000f) | (((x) & 0x00000fff) << 4) | \
(((x) >> 2) & 0x3fff0000) | (((x) & 0x00030000) << 14)
/* KEY_DOUBLE_UPDATE_n / KEY_TRIPLE_UPDATE_n: the classical update applied
 * twice / three times, expressed directly in the fixsliced ordering of
 * round class n (masked shifts and rotations precomputed per class). */
#define KEY_TRIPLE_UPDATE_0(x) \
(ROR((x) & 0x33333333, 24) | ROR((x) & 0xcccccccc, 16))
#define KEY_DOUBLE_UPDATE_1(x) \
((((x) >> 4) & 0x0f000f00) | (((x) & 0x0f000f00) << 4) | \
(((x) >> 6) & 0x00030003) | (((x) & 0x003f003f) << 2))
#define KEY_TRIPLE_UPDATE_1(x) \
((((x) >> 6) & 0x03000300) | (((x) & 0x3f003f00) << 2) | \
(((x) >> 5) & 0x00070007) | (((x) & 0x001f001f) << 3))
#define KEY_DOUBLE_UPDATE_2(x) \
(ROR((x) & 0xaaaaaaaa, 24) | ROR((x) & 0x55555555, 16))
#define KEY_TRIPLE_UPDATE_2(x) \
(ROR((x) & 0x55555555, 24) | ROR((x) & 0xaaaaaaaa, 20))
#define KEY_DOUBLE_UPDATE_3(x) \
((((x) >> 2) & 0x03030303) | (((x) & 0x03030303) << 2) | \
(((x) >> 1) & 0x70707070) | (((x) & 0x10101010) << 3))
#define KEY_TRIPLE_UPDATE_3(x) \
((((x) >> 18) & 0x00003030) | (((x) & 0x01010101) << 3) | \
(((x) >> 14) & 0x0000c0c0) | (((x) & 0x0000e0e0) << 15)| \
(((x) >> 1) & 0x07070707) | (((x) & 0x00001010) << 19))
#define KEY_DOUBLE_UPDATE_4(x) \
((((x) >> 4) & 0x0fff0000) | (((x) & 0x000f0000) << 12) | \
(((x) >> 8) & 0x000000ff) | (((x) & 0x000000ff) << 8))
#define KEY_TRIPLE_UPDATE_4(x) \
((((x) >> 6) & 0x03ff0000) | (((x) & 0x003f0000) << 10) | \
(((x) >> 4) & 0x00000fff) | (((x) & 0x0000000f) << 12))
#endif // KEYSCHEDULE_H_
\ No newline at end of file
#define CRYPTO_KEYBYTES 16
#define CRYPTO_NSECBYTES 0
#define CRYPTO_NPUBBYTES 16
#define CRYPTO_ABYTES 16
#define CRYPTO_NOOVERLAP 1
/* NIST LWC AEAD interface (buffer sizes per api.h).
 * Encrypt: writes mlen + CRYPTO_ABYTES bytes to c and sets *clen.
 * Returns 0 on success. */
int crypto_aead_encrypt(unsigned char *c, unsigned long long *clen,
const unsigned char *m, unsigned long long mlen,
const unsigned char *ad, unsigned long long adlen,
const unsigned char *nsec, const unsigned char *npub,
const unsigned char *k);
/* Decrypt: on success writes clen - CRYPTO_ABYTES bytes to m and sets
 * *outputmlen. Returns 0 on success, nonzero on authentication failure. */
int crypto_aead_decrypt(unsigned char *m, unsigned long long *outputmlen,
unsigned char *nsec,
const unsigned char *c, unsigned long long clen,
const unsigned char *ad, unsigned long long adlen,
const unsigned char *npub, const unsigned char *k);
/*
* Date: 29 November 2018
* Contact: Thomas Peyrin - thomas.peyrin@gmail.com
* Mustafa Khairallah - mustafam001@e.ntu.edu.sg
*/
#include "crypto_aead.h"
#include "api.h"
#include "skinny.h"
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
/*
 * 10*-style padding for Romulus: fill mp[0..l-1] with the first len8 bytes of
 * m, zero the rest, and place (len8 & 0x0f) in the final byte of the block.
 * When len8 >= l the block is a plain copy and no length byte is written.
 */
void pad (const unsigned char* m, unsigned char* mp, int l, int len8) {
    for (int j = 0; j < l; j++) {
        unsigned char v;
        if (j < len8) {
            v = m[j];                           /* message byte */
        }
        else if (j == l - 1) {
            v = (unsigned char)(len8 & 0x0f);   /* length marker in last byte */
        }
        else {
            v = 0x00;                           /* zero padding */
        }
        mp[j] = v;
    }
}
/*
 * Romulus state function G: applies the per-byte map
 *   b -> ((b >> 1) & 0x7f) ^ (b & 0x80) ^ ((b << 7) & 0x80)
 * to all 16 bytes of s, writing the result to c. Implemented on 32-bit words
 * for speed (each byte is transformed independently, so word order does not
 * affect the result).
 *
 * Fix: the previous `*(unsigned int *)&s[i]` loads/stores were undefined
 * behavior (strict aliasing) and crash on strict-alignment targets when the
 * caller passes an unaligned buffer; memcpy-based word access compiles to
 * the same loads/stores on mainstream compilers without the UB.
 */
void g8A (unsigned char* s, unsigned char* c) {
    unsigned int tmps[4];
    unsigned int tmpc[4];
    memcpy(tmps, s, 16);            /* alignment-safe word load */
    for (int i = 0; i < 4; i++) {
        /* per-byte: (b>>1)&0x7f ^ b&0x80 ^ (b<<7)&0x80, four bytes at once */
        tmpc[i] = ((tmps[i] >> 1) & 0x7f7f7f7f)
                ^ (tmps[i] & 0x80808080)
                ^ ((tmps[i] << 7) & 0x80808080);
    }
    memcpy(c, tmpc, 16);            /* alignment-safe word store */
}
/*
 * Same per-byte G map as g8A, but the output is stored byte-by-byte because
 * the tag buffer c supplied by the API caller is not guaranteed to be
 * word-aligned (see original comment).
 *
 * Fix: the input loads used `*(unsigned int *)&s[i]`, which is a strict-
 * aliasing violation and an unaligned access hazard; replaced with memcpy.
 * The byte-wise output stores are kept unchanged.
 */
void g8A_for_Tag_Generation (unsigned char* s, unsigned char* c) {
    unsigned int tmps[4];
    unsigned int tmpc[4];
    memcpy(tmps, s, 16);            /* alignment-safe word load */
    for (int i = 0; i < 4; i++) {
        /* per-byte: (b>>1)&0x7f ^ b&0x80 ^ (b<<7)&0x80 */
        tmpc[i] = ((tmps[i] >> 1) & 0x7f7f7f7f)
                ^ (tmps[i] & 0x80808080)
                ^ ((tmps[i] << 7) & 0x80808080);
    }
    /* byte stores: c is not always word-aligned */
    for (int i = 0; i < 4; i++) {
        c[4*i]     =  tmpc[i]        & 0xFF;
        c[4*i + 1] = (tmpc[i] >> 8)  & 0xFF;
        c[4*i + 2] = (tmpc[i] >> 16) & 0xFF;
        c[4*i + 3] = (tmpc[i] >> 24) & 0xFF;
    }
}
/*
 * Absorb one full 16-byte associated-data block: s ^= m.
 *
 * Fix: m points into the caller's AD buffer, which has arbitrary alignment,
 * so the previous `*(unsigned int *)&m[i]` word XORs were unaligned accesses
 * and strict-aliasing UB. A byte-wise XOR is exactly equivalent.
 */
void rho_ad_eqov16 (const unsigned char* m,
                    unsigned char* s) {
    for (int i = 0; i < 16; i++)
        s[i] ^= m[i];
}
/*
 * Absorb the final (partial, len8 < 16) associated-data block: pad it to a
 * full block, then s ^= padded block.
 *
 * Fix: the word XORs on s and the local mp were strict-aliasing UB and an
 * alignment hazard; the byte-wise XOR below is exactly equivalent.
 */
void rho_ad_ud16 (const unsigned char* m,
                  unsigned char* s,
                  int len8) {
    unsigned char mp [16];
    pad(m,mp,16,len8);
    for (int i = 0; i < 16; i++)
        s[i] ^= mp[i];
}
/*
 * Encrypt one full 16-byte message block: c = G(s) ^ m and s ^= m.
 * g8A writes G(s) into c; the loop then folds the message into both the
 * running state and the ciphertext.
 *
 * Fix: m and c are caller buffers with arbitrary alignment, so the previous
 * `*(unsigned int *)` word XORs were unaligned accesses and strict-aliasing
 * UB; the byte-wise form is exactly equivalent.
 */
void rho_eqov16 (const unsigned char* m,
                 unsigned char* c,
                 unsigned char* s) {
    g8A(s,c);
    for (int i = 0; i < 16; i++) {
        s[i] ^= m[i];
        c[i] ^= m[i];
    }
}
/*
 * Encrypt the final (partial) message block. The padded message is folded
 * into the whole 16-byte state; ciphertext bytes are produced only for the
 * len8 real message bytes, the remaining output bytes up to ver are zeroed.
 *
 * Fix: the word XORs on s/mp (`*(unsigned int *)`) were strict-aliasing UB
 * and an alignment hazard; the byte-wise loop is exactly equivalent
 * (the original also XORed all four words of mp into s regardless of ver).
 */
void rho_ud16 (const unsigned char* m,
               unsigned char* c,
               unsigned char* s,
               int len8,
               int ver) {
    int i;
    unsigned char mp [16];
    pad(m,mp,ver,len8);
    g8A(s,c);
    /* state absorbs the full padded block */
    for (i = 0; i < 16; i++)
        s[i] ^= mp[i];
    /* ciphertext only for real message bytes; zero the padded tail */
    for (i = 0; i < ver; i++) {
        if (i < len8) {
            c[i] = c[i] ^ mp[i];
        }
        else {
            c[i] = 0;
        }
    }
}
/*
 * Inverse rho for decryption of a (possibly partial) ciphertext block.
 * g8A writes G(s) into m; then for real ciphertext bytes (i < len8) the
 * state absorbs cp[i] ^ m[i] and the plaintext byte becomes m[i] ^ cp[i];
 * padded positions only fold cp into the state and zero the output.
 * Within each branch the state update uses m[i] BEFORE it is overwritten.
 */
void irho (unsigned char* m,
           const unsigned char* c,
           unsigned char* s,
           int len8,
           int ver) {
    unsigned char cp [16];
    pad(c,cp,ver,len8);
    g8A(s,m);
    for (int i = 0; i < ver; i++) {
        if (i < len8) {
            s[i] = s[i] ^ cp[i] ^ m[i]; /* reads m[i] prior to the update below */
            m[i] = m[i] ^ cp[i];
        }
        else {
            s[i] = s[i] ^ cp[i];
            m[i] = 0;
        }
    }
}
/*
 * Initialize the 56-bit block counter to 1 (CNT[0] is the least-significant
 * byte; CNT[7] is the domain-separation byte slot, cleared here).
 *
 * Fix: the previous `*(unsigned int *)&CNT[i]` word stores assumed 4-byte
 * alignment and little-endian layout; byte stores are portable and produce
 * the identical bytes on little-endian builds.
 */
void reset_lfsr_gf56 (unsigned char* CNT) {
    CNT[0] = 0x01;
    for (int i = 1; i < 8; i++)
        CNT[i] = 0x00;
}
/*
 * Step the 56-bit LFSR counter held in CNT[0..6] (CNT[0] = LSB): shift left
 * by one bit and, when the outgoing bit 55 (CNT[6] bit 7) is set, XOR the
 * feedback constant 0x95 into the low byte.
 *
 * Fix: the previous implementation did the shift via two unaligned
 * little-endian `*(unsigned int *)` loads/stores (strict-aliasing UB,
 * endian-dependent). The byte-wise shift below is bit-identical on
 * little-endian builds, including the harmless carry into CNT[7], which is
 * overwritten with the domain byte before every block-cipher call.
 */
void lfsr_gf56 (unsigned char* CNT) {
    unsigned int fb0 = (CNT[6] & 0x80) ? 0x95 : 0x00;
    /* shift the whole 8-byte array left by one bit, MSB-first */
    for (int i = 7; i > 0; i--)
        CNT[i] = (unsigned char)((CNT[i] << 1) | (CNT[i - 1] >> 7));
    CNT[0] = (unsigned char)((CNT[0] << 1) ^ fb0);
}
// Run one SKINNY-128-384 encryption of the 16-byte state s, keyed with k and
// tweaked with (T, CNT), through the function pointer installed in
// p_skinny_ctrl (lets callers switch between enc-routine variants).
void block_cipher(unsigned char* s,
const unsigned char* k, unsigned char* T,
unsigned char* CNT,
skinny_ctrl* p_skinny_ctrl) {
p_skinny_ctrl->func_skinny_128_384_enc (s,p_skinny_ctrl,CNT,T,k);
}
/*
 * Encrypt the state with the nonce N as (part of the) tweak: copy N into the
 * 16-byte tweak buffer T, set the domain-separation byte D into CNT[7], and
 * invoke the block cipher.
 *
 * Fix: N is the caller-supplied nonce pointer with arbitrary alignment, so
 * the previous `*(unsigned int *)&N[i]` word copies were unaligned accesses
 * and strict-aliasing UB; memcpy is equivalent and safe.
 */
void nonce_encryption (const unsigned char* N,
                       unsigned char* CNT,
                       unsigned char*s, const unsigned char* k,
                       unsigned char D,
                       skinny_ctrl* p_skinny_ctrl) {
    unsigned char T [16];
    memcpy(T, N, 16);   /* alignment-safe copy of the nonce */
    CNT[7] = D;         /* domain-separation byte */
    block_cipher(s,k,T,CNT,p_skinny_ctrl);
}
// Finalization: derive the tag from the state s via the byte-store variant of
// G and write its 16 bytes at *c, then move *c forward by n and back by *clen
// — presumably repositioning *c to the start of the output buffer so the
// caller sees the whole ciphertext; confirm against the call site.
void generate_tag (unsigned char** c, unsigned char* s,
int n, unsigned long long* clen) {
g8A_for_Tag_Generation(s, *c);
*c = *c + n;
*c = *c - *clen;
}
/*
 * Encrypt one message block (full 16 bytes, or the final partial block):
 * rho folds the message into the state and emits ciphertext, both data
 * pointers advance, the block counter steps, and — unless this was the last
 * block — the state is re-encrypted under the nonce/counter tweak with
 * domain byte D. Returns the number of message bytes still to process.
 */
unsigned long long msg_encryption (const unsigned char** M, unsigned char** c,
const unsigned char* N,
unsigned char* CNT,
unsigned char*s, const unsigned char* k,
unsigned char D,
unsigned long long mlen,
skinny_ctrl* p_skinny_ctrl) {
int len8;
if (mlen >= 16) {
// full block
len8 = 16;
mlen = mlen - 16;
rho_eqov16(*M, *c, s);
}
else {
// final partial block (len8 < 16 fits in int)
len8 = mlen;
mlen = 0;
rho_ud16(*M, *c, s, len8, 16);
}
*c = *c + len8;
*M = *M + len8;
lfsr_gf56(CNT);
// no block-cipher call after the last message block
if (mlen != 0) {
nonce_encryption(N,CNT,s,k,D,p_skinny_ctrl);
}
return mlen;
}
/*
 * Decrypt one ciphertext block (full 16 bytes, or the final partial block):
 * irho recovers the plaintext and updates the state, both pointers advance,
 * the counter steps, and the state is re-encrypted under the nonce/counter
 * tweak. Returns the number of ciphertext bytes still to process.
 * NOTE(review): unlike msg_encryption, nonce_encryption is called even after
 * the final block — presumably needed before tag verification; confirm
 * against the decrypt driver.
 */
unsigned long long msg_decryption (unsigned char** M, const unsigned char** c,
const unsigned char* N,
unsigned char* CNT,
unsigned char*s, const unsigned char* k,
unsigned char D,
unsigned long long clen,
skinny_ctrl* p_skinny_ctrl) {
int len8;
if (clen >= 16) {
// full block
len8 = 16;
clen = clen - 16;
}
else {
// final partial block
len8 = clen;
clen = 0;
}
irho(*M, *c, s, len8, 16);
*c = *c + len8;
*M = *M + len8;
lfsr_gf56(CNT);
nonce_encryption(N,CNT,s,k,D,p_skinny_ctrl);
return clen;
}
/*
 * Absorb one message block through the tweak input (used when message bytes
 * are processed in the AD phase): a full 16-byte block is copied into T,
 * a final partial block is padded into T; the block cipher is run with
 * domain byte D and the counter steps. Returns the remaining byte count.
 *
 * Fix: the full-block path copied *M with `*(unsigned int *)` word accesses;
 * *M is caller data of arbitrary alignment, so that was unaligned access and
 * strict-aliasing UB. memcpy is equivalent and safe.
 */
unsigned long long ad2msg_encryption (const unsigned char** M,
                                      unsigned char* CNT,
                                      unsigned char*s, const unsigned char* k,
                                      unsigned char D,
                                      unsigned long long mlen,
                                      skinny_ctrl* p_skinny_ctrl) {
    unsigned char T [16];
    int len8;
    if (mlen <= 16) {
        /* final (possibly full) block: pad into the tweak buffer */
        len8 = mlen;
        mlen = 0;
        pad (*M,T,16,len8);
    }
    else {
        /* full block with more to come: alignment-safe copy */
        len8 = 16;
        mlen = mlen - 16;
        memcpy(T, *M, 16);
    }
    CNT[7] = D;
    block_cipher(s,k,T,CNT,p_skinny_ctrl);
    lfsr_gf56(CNT);
    *M = *M + len8;
    return mlen;
}
// Absorb up to 32 bytes of associated data: the first 16 bytes go into
// the state via rho_ad, the second 16 bytes (padded if short) go in
// through the tweak T of a SKINNY call.  The counter is clocked once
// per 16-byte half.  Returns the number of AD bytes remaining.
unsigned long long ad_encryption (const unsigned char** A, unsigned char* s,
const unsigned char* k, unsigned long long adlen,
unsigned char* CNT,
unsigned char D,
skinny_ctrl* p_skinny_ctrl) {
unsigned char T [16];
int len8;
if (adlen >= 16) {
len8 = 16;
adlen = adlen - 16;
rho_ad_eqov16(*A, s);
}
else {
// partial first half: absorb with padding
len8 = adlen;
adlen = 0;
rho_ad_ud16(*A, s, len8);
}
*A = *A + len8;
lfsr_gf56(CNT);
if (adlen != 0) {
// second half of the pair feeds the tweak input
if (adlen >= 16) {
len8 = 16;
adlen = adlen - 16;
unsigned char *pA = (unsigned char *)(*A);
*((unsigned int *)&T[0]) = *((unsigned int *)&pA[0]);
*((unsigned int *)&T[4]) = *((unsigned int *)&pA[4]);
*((unsigned int *)&T[8]) = *((unsigned int *)&pA[8]);
*((unsigned int *)&T[12]) = *((unsigned int *)&pA[12]);
}
else {
len8 = adlen;
adlen = 0;
pad(*A, T, 16, len8);
}
*A = *A + len8;
CNT[7] = D;
block_cipher(s,k,T,CNT,p_skinny_ctrl);
lfsr_gf56(CNT);
}
return adlen;
}
// AEAD encryption (ROMULUS-M style: MAC-then-encrypt).
// Phase 1 absorbs AD and the message to derive the tag; phase 2
// re-encrypts the message keyed by that tag.  Interface follows the
// NIST LWC/SUPERCOP crypto_aead API; nsec is unused.
// Returns 0 on success; *clen is set to mlen + 16 (tag appended).
int crypto_aead_encrypt (
unsigned char* c, unsigned long long* clen,
const unsigned char* m, unsigned long long mlen,
const unsigned char* ad, unsigned long long adlen,
const unsigned char* nsec,
const unsigned char* npub,
const unsigned char* k
)
{
  unsigned char s[16];
  unsigned char CNT[8]; // size 7 -> 8 for word access
  unsigned char T[16];
  const unsigned char* N;
  unsigned char w;
  unsigned long long xlen;
  skinny_ctrl l_skinny_ctrl;
  l_skinny_ctrl.func_skinny_128_384_enc = skinny_128_384_enc123_12;
  (void)nsec;
  N = npub;
  xlen = mlen;
  *((unsigned int *)&s[0])  = 0x00000000;
  *((unsigned int *)&s[4])  = 0x00000000;
  *((unsigned int *)&s[8])  = 0x00000000;
  *((unsigned int *)&s[12]) = 0x00000000;
  reset_lfsr_gf56(CNT);
  // Domain-separation byte w.  The AD-length contribution and the
  // message-length contribution are independent and combine by XOR,
  // so they are computed once each.  (This replaces five copies of an
  // identical xlen branch nested inside each adlen branch.)
  w = 48;
  if (adlen == 0)            w = w ^ 2;  // empty AD
  else if (adlen%(32) == 0)  w = w ^ 8;  // last AD pair even & complete
  else if (adlen%(32) < 16)  w = w ^ 2;  // last AD block odd & incomplete
  else if (adlen%(32) == 16) w = w ^ 0;  // last AD block odd & complete
  else                       w = w ^ 10; // last AD pair even & incomplete
  if (xlen == 0)             w = w ^ 1;  // empty message
  else if (xlen%(32) == 0)   w = w ^ 4;
  else if (xlen%(32) < 16)   w = w ^ 1;
  else if (xlen%(32) == 16)  w = w ^ 0;
  else                       w = w ^ 5;
  // Phase 1: absorb AD, then the message, to compute the tag.
  if (adlen == 0) { // AD is an empty string
    lfsr_gf56(CNT);
  }
  else while (adlen > 0) {
    adlen = ad_encryption(&ad,s,k,adlen,CNT,40,&l_skinny_ctrl);
  }
  // NOTE(review): bit 3 of w appears to record whether the AD filled an
  // even number of 16-byte blocks; when clear, the first message block
  // goes in through the free tweak slot — confirm against the spec.
  if ((w & 8) == 0) {
    xlen = ad2msg_encryption (&m,CNT,s,k,44,xlen,&l_skinny_ctrl);
  }
  else if (mlen == 0) {
    lfsr_gf56(CNT);
  }
  while (xlen > 0) {
    xlen = ad_encryption(&m,s,k,xlen,CNT,44,&l_skinny_ctrl);
  }
  nonce_encryption(N,CNT,s,k,w,&l_skinny_ctrl);
  // because, nonce_encryption is called at the last block of AD encryption
  l_skinny_ctrl.func_skinny_128_384_enc = skinny_128_384_enc1_1;
  // Tag generation
  g8A(s, T);
  m = m - mlen; // rewind: the absorb phase advanced m to its end
  reset_lfsr_gf56(CNT);
  // Re-key the state with the tag for the encryption pass.
  *((unsigned int *)&s[0])  = *((unsigned int *)&T[0]);
  *((unsigned int *)&s[4])  = *((unsigned int *)&T[4]);
  *((unsigned int *)&s[8])  = *((unsigned int *)&T[8]);
  *((unsigned int *)&s[12]) = *((unsigned int *)&T[12]);
  *clen = mlen + 16;
  // Phase 2: encrypt the message.
  if (mlen > 0) {
    nonce_encryption(N,CNT,s,k,36,&l_skinny_ctrl);
    while (mlen > 16) {
      mlen = msg_encryption(&m,&c,N,CNT,s,k,36,mlen,&l_skinny_ctrl);
    }
    rho_ud16(m, c, s, mlen, 16);
    c = c + mlen;
    m = m + mlen;
  }
  // Tag Concatenation
  // use byte access because of memory alignment.
  // c is not always in word(4 byte) alignment.
  for (int i = 0; i < 16; i = i + 1) {
    *(c + i) = T[i];
  }
  c = c - *clen;
  return 0;
}
// AEAD decryption matching crypto_aead_encrypt above: the received tag
// keys the decryption pass, then the tag is recomputed over AD and the
// recovered plaintext and compared.  Returns 0 on success, -1 on tag
// mismatch.  nsec is unused.
int crypto_aead_decrypt(
unsigned char *m,unsigned long long *mlen,
unsigned char *nsec,
const unsigned char *c,unsigned long long clen,
const unsigned char *ad,unsigned long long adlen,
const unsigned char *npub,
const unsigned char *k
)
{
  unsigned char s[16];
  unsigned char CNT[8]; // size 7 -> 8 for word access
  unsigned char T[16];
  const unsigned char* N;
  unsigned char w;
  unsigned long long xlen;
  const unsigned char* mauth;
  skinny_ctrl l_skinny_ctrl;
  l_skinny_ctrl.func_skinny_128_384_enc = skinny_128_384_enc123_12;
  (void)nsec;
  mauth = m;
  N = npub;
  xlen = clen-16;
  reset_lfsr_gf56(CNT);
  // Load the received tag (last 16 bytes) and key the state with it.
  for (int i = 0; i < 16; i++) {
    T[i] = *(c + clen - 16 + i);
  }
  *((unsigned int *)&s[0])  = *((unsigned int *)&T[0]);
  *((unsigned int *)&s[4])  = *((unsigned int *)&T[4]);
  *((unsigned int *)&s[8])  = *((unsigned int *)&T[8]);
  *((unsigned int *)&s[12]) = *((unsigned int *)&T[12]);
  clen = clen - 16;
  *mlen = clen;
  // Phase 1: decrypt the ciphertext.
  if (clen > 0) {
    nonce_encryption(N,CNT,s,k,36,&l_skinny_ctrl);
    while (clen > 16) {
      clen = msg_decryption(&m,&c,N,CNT,s,k,36,clen,&l_skinny_ctrl);
    }
    irho(m, c, s, clen, 16);
    c = c + clen;
    m = m + clen;
  }
  // Phase 2: recompute the tag over AD and the recovered plaintext.
  *((unsigned int *)&s[0])  = 0x00000000;
  *((unsigned int *)&s[4])  = 0x00000000;
  *((unsigned int *)&s[8])  = 0x00000000;
  *((unsigned int *)&s[12]) = 0x00000000;
  reset_lfsr_gf56(CNT);
  // Domain-separation byte w: AD-length and message-length contributions
  // combine by XOR, computed once each (same folding as in
  // crypto_aead_encrypt, which had the identical xlen branch duplicated
  // five times).
  w = 48;
  if (adlen == 0)            w = w ^ 2;
  else if (adlen%(32) == 0)  w = w ^ 8;
  else if (adlen%(32) < 16)  w = w ^ 2;
  else if (adlen%(32) == 16) w = w ^ 0;
  else                       w = w ^ 10;
  if (xlen == 0)             w = w ^ 1;
  else if (xlen%(32) == 0)   w = w ^ 4;
  else if (xlen%(32) < 16)   w = w ^ 1;
  else if (xlen%(32) == 16)  w = w ^ 0;
  else                       w = w ^ 5;
  if (adlen == 0) { // AD is an empty string
    lfsr_gf56(CNT);
  }
  else while (adlen > 0) {
    adlen = ad_encryption(&ad,s,k,adlen,CNT,40,&l_skinny_ctrl);
  }
  if ((w & 8) == 0) {
    xlen = ad2msg_encryption (&mauth,CNT,s,k,44,xlen,&l_skinny_ctrl);
  }
  else if (clen == 0) {
    lfsr_gf56(CNT);
  }
  while (xlen > 0) {
    xlen = ad_encryption(&mauth,s,k,xlen,CNT,44,&l_skinny_ctrl);
  }
  nonce_encryption(N,CNT,s,k,w,&l_skinny_ctrl);
  // Tag generation
  g8A_for_Tag_Generation(s, T);
  // c now points at the received tag.
  // NOTE(review): early-exit comparison leaks timing information; a
  // constant-time compare would be preferable for a real deployment.
  for (int i = 0; i < 16; i++) {
    if (T[i] != (*(c+i))) {
      return -1;
    }
  }
  return 0;
}
//
// NIST-developed software is provided by NIST as a public service.
// You may use, copy and distribute copies of the software in any medium,
// provided that you keep intact this entire notice. You may improve,
// modify and create derivative works of the software or any portion of
// the software, and you may copy and distribute such modifications or
// works. Modified works should carry a notice stating that you changed
// the software and should note the date and nature of any such change.
// Please explicitly acknowledge the National Institute of Standards and
// Technology as the source of the software.
//
// NIST-developed software is expressly provided "AS IS." NIST MAKES NO
// WARRANTY OF ANY KIND, EXPRESS, IMPLIED, IN FACT OR ARISING BY OPERATION
// OF LAW, INCLUDING, WITHOUT LIMITATION, THE IMPLIED WARRANTY OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE, NON-INFRINGEMENT AND DATA ACCURACY. NIST
// NEITHER REPRESENTS NOR WARRANTS THAT THE OPERATION OF THE SOFTWARE WILL BE
// UNINTERRUPTED OR ERROR-FREE, OR THAT ANY DEFECTS WILL BE CORRECTED. NIST
// DOES NOT WARRANT OR MAKE ANY REPRESENTATIONS REGARDING THE USE OF THE SOFTWARE
// OR THE RESULTS THEREOF, INCLUDING BUT NOT LIMITED TO THE CORRECTNESS, ACCURACY,
// RELIABILITY, OR USEFULNESS OF THE SOFTWARE.
//
// You are solely responsible for determining the appropriateness of using and
// distributing the software and you assume all risks associated with its use,
// including but not limited to the risks and costs of program errors, compliance
// with applicable laws, damage to or loss of data, programs or equipment, and
// the unavailability or interruption of operation. This software is not intended
// to be used in any situation where a failure could cause risk of injury or
// damage to property. The software developed by NIST employees is not subject to
// copyright protection within the United States.
//
// disable deprecation for sprintf and fopen
#ifdef _MSC_VER
#define _CRT_SECURE_NO_WARNINGS
#endif
#include <stdio.h>
#include <string.h>
#include "crypto_aead.h"
#include "api.h"
#define KAT_SUCCESS 0
#define KAT_FILE_OPEN_ERROR -1
#define KAT_DATA_ERROR -3
#define KAT_CRYPTO_FAILURE -4
#define MAX_FILE_NAME 256
#define MAX_MESSAGE_LENGTH 32
#define MAX_ASSOCIATED_DATA_LENGTH 32
void init_buffer(unsigned char *buffer, unsigned long long numbytes);
void fprint_bstr(FILE *fp, const char *label, const unsigned char *data, unsigned long long length);
int generate_test_vectors();
// Entry point: run the KAT generator and report any failure on stderr.
int main()
{
  const int status = generate_test_vectors();
  if (status != KAT_SUCCESS) {
    fprintf(stderr, "test vector generation failed with code %d\n", status);
  }
  return status;
}
// Generate the NIST LWC AEAD known-answer-test file for every
// (mlen, adlen) combination up to the configured maxima, verifying that
// each ciphertext round-trips through decryption.
// Returns KAT_SUCCESS, KAT_FILE_OPEN_ERROR, or KAT_CRYPTO_FAILURE.
int generate_test_vectors()
{
  FILE *fp;
  char fileName[MAX_FILE_NAME];
  unsigned char key[CRYPTO_KEYBYTES];
  unsigned char nonce[CRYPTO_NPUBBYTES];
  unsigned char msg[MAX_MESSAGE_LENGTH];
  unsigned char msg2[MAX_MESSAGE_LENGTH];
  unsigned char ad[MAX_ASSOCIATED_DATA_LENGTH];
  unsigned char ct[MAX_MESSAGE_LENGTH + CRYPTO_ABYTES];
  // clen must be initialized: the debug printf below reads it before the
  // first call to crypto_aead_encrypt sets it (previously UB).
  unsigned long long clen = 0, mlen2;
  int count = 1;
  int func_ret, ret_val = KAT_SUCCESS;
  init_buffer(key, sizeof(key));
  init_buffer(nonce, sizeof(nonce));
  init_buffer(msg, sizeof(msg));
  init_buffer(ad, sizeof(ad));
  sprintf(fileName, "LWC_AEAD_KAT_%d_%d.txt", (CRYPTO_KEYBYTES * 8), (CRYPTO_NPUBBYTES * 8));
  if ((fp = fopen(fileName, "w")) == NULL) {
    fprintf(stderr, "Couldn't open <%s> for write\n", fileName);
    return KAT_FILE_OPEN_ERROR;
  }
  for (unsigned long long mlen = 0; (mlen <= MAX_MESSAGE_LENGTH) && (ret_val == KAT_SUCCESS); mlen++) {
    for (unsigned long long adlen = 0; adlen <= MAX_ASSOCIATED_DATA_LENGTH; adlen++) {
      // debug progress output: previous iteration's ciphertext length
      printf("%0d\n", (int)clen);
      fprintf(fp, "Count = %d\n", count++);
      printf("Count = %d\n", count - 1);
      fprint_bstr(fp, "Key = ", key, CRYPTO_KEYBYTES);
      fprint_bstr(fp, "Nonce = ", nonce, CRYPTO_NPUBBYTES);
      fprint_bstr(fp, "PT = ", msg, mlen);
      fprint_bstr(fp, "AD = ", ad, adlen);
      if ((func_ret = crypto_aead_encrypt(ct, &clen, msg, mlen, ad, adlen, NULL, nonce, key)) != 0) {
        fprintf(fp, "crypto_aead_encrypt returned <%d>\n", func_ret);
        ret_val = KAT_CRYPTO_FAILURE;
        break;
      }
      fprint_bstr(fp, "CT = ", ct, clen);
      fprintf(fp, "\n");
      // round-trip check: decrypt must succeed and recover the plaintext
      if ((func_ret = crypto_aead_decrypt(msg2, &mlen2, NULL, ct, clen, ad, adlen, nonce, key)) != 0) {
        fprintf(fp, "crypto_aead_decrypt returned <%d>\n", func_ret);
        ret_val = KAT_CRYPTO_FAILURE;
        break;
      }
      if (mlen != mlen2) {
        fprintf(fp, "crypto_aead_decrypt returned bad 'mlen': Got <%llu>, expected <%llu>\n", mlen2, mlen);
        ret_val = KAT_CRYPTO_FAILURE;
        break;
      }
      if (memcmp(msg, msg2, mlen)) {
        fprintf(fp, "crypto_aead_decrypt did not recover the plaintext\n");
        ret_val = KAT_CRYPTO_FAILURE;
        break;
      }
    }
  }
  fclose(fp);
  return ret_val;
}
// Emit one KAT line to fp: the label followed by the data bytes as
// uppercase hex (two digits per byte) and a trailing newline.
void fprint_bstr(FILE *fp, const char *label, const unsigned char *data, unsigned long long length)
{
  fputs(label, fp);
  for (unsigned long long idx = 0; idx < length; idx++) {
    fprintf(fp, "%02X", data[idx]);
  }
  fputc('\n', fp);
}
// Fill buffer with the deterministic byte pattern 0x00, 0x01, 0x02, ...
// (i.e. index modulo 256), as required by the NIST KAT harness.
void init_buffer(unsigned char *buffer, unsigned long long numbytes)
{
  unsigned long long pos = 0;
  while (pos < numbytes) {
    buffer[pos] = (unsigned char)pos;
    pos++;
  }
}
// Context for the SKINNY-128/384 primitive: the precomputed round keys
// plus a function pointer naming the variant to call next.  The variant
// names suggest they differ in how much of the tweakey schedule is
// recomputed per call (enc123 -> enc12 -> enc1) — TODO confirm against
// the skinny implementation.
// NOTE(review): "___skinny_ctrl" (leading double underscore) is a
// reserved identifier in C; renaming would be safer.
typedef struct ___skinny_ctrl {
unsigned char roundKeys[960]; // number of round : 56
void (*func_skinny_128_384_enc)(unsigned char*, struct ___skinny_ctrl*, unsigned char* CNT, unsigned char* T, const unsigned char* K);
} skinny_ctrl;
extern void skinny_128_384_enc123_12 (unsigned char* input, skinny_ctrl* pskinny_ctrl, unsigned char* CNT, unsigned char* T, const unsigned char* K);
extern void skinny_128_384_enc12_12 (unsigned char* input, skinny_ctrl* pskinny_ctrl, unsigned char* CNT, unsigned char* T, const unsigned char* K);
extern void skinny_128_384_enc1_1 (unsigned char* input, skinny_ctrl* pskinny_ctrl, unsigned char* CNT, unsigned char* T, const unsigned char* K);
This source diff could not be displayed because it is too large. You can view the blob instead.
This source diff could not be displayed because it is too large. You can view the blob instead.
This source diff could not be displayed because it is too large. You can view the blob instead.
#define CRYPTO_KEYBYTES 16
#define CRYPTO_NSECBYTES 0
#define CRYPTO_NPUBBYTES 16
#define CRYPTO_ABYTES 16
#define CRYPTO_NOOVERLAP 1
int crypto_aead_encrypt(unsigned char *c, unsigned long long *clen,
const unsigned char *m, unsigned long long mlen,
const unsigned char *ad, unsigned long long adlen,
const unsigned char *nsec, const unsigned char *npub,
const unsigned char *k);
int crypto_aead_decrypt(unsigned char *m, unsigned long long *outputmlen,
unsigned char *nsec,
const unsigned char *c, unsigned long long clen,
const unsigned char *ad, unsigned long long adlen,
const unsigned char *npub, const unsigned char *k);
/*
* Date: 29 November 2018
* Contact: Thomas Peyrin - thomas.peyrin@gmail.com
* Mustafa Khairallah - mustafam001@e.ntu.edu.sg
*/
#include "crypto_aead.h"
#include "api.h"
#include "skinny.h"
#include <stdio.h>
#include <stdlib.h>
// 10*-style length padding: copy the len8 valid bytes of m into the
// l-byte block mp, zero-fill the remainder, and record len8 (mod 16) in
// the final byte — unless the block is already full, in which case the
// last byte is message data.
void pad (const unsigned char* m, unsigned char* mp, int l, int len8) {
  int j;
  for (j = 0; j < l; j++) {
    if (j < len8) {
      mp[j] = m[j];
    }
    else {
      mp[j] = (j == l - 1) ? (unsigned char)(len8 & 0x0f) : 0x00;
    }
  }
}
// Feedback function G of rho, applied to each byte of the state:
//   c[i] = (s[i] >> 1) ^ (s[i] & 0x80) ^ ((s[i] & 0x01) << 7)
// Processed four bytes at a time on 32-bit words for speed; the output
// is stored with word-wide writes (c assumed word-accessible here).
void g8A (unsigned char* s, unsigned char* c) {
  unsigned int *wc = (unsigned int *)c;
  int i;
  for (i = 0; i < 4; i++) {
    unsigned int w = *((unsigned int *)&s[4 * i]);
    wc[i] = ((w >> 1) & 0x7f7f7f7f) ^ (w & 0x80808080) ^ ((w << 7) & 0x80808080);
  }
}
// Same per-byte G function as g8A, but the result is written back one
// byte at a time because the tag destination c is caller-supplied and
// not guaranteed to be 4-byte aligned.
void g8A_for_Tag_Generation (unsigned char* s, unsigned char* c) {
  int i, j;
  for (i = 0; i < 4; i++) {
    unsigned int w = *((unsigned int *)&s[4 * i]);
    unsigned int g = ((w >> 1) & 0x7f7f7f7f) ^ (w & 0x80808080) ^ ((w << 7) & 0x80808080);
    for (j = 0; j < 4; j++) {
      c[4 * i + j] = (unsigned char)((g >> (8 * j)) & 0xFF);
    }
  }
}
void rho_ad_eqov16 (const unsigned char* m,
unsigned char* s) {
*((unsigned int *)&s[0]) ^= *((unsigned int *)&m[0]);
*((unsigned int *)&s[4]) ^= *((unsigned int *)&m[4]);
*((unsigned int *)&s[8]) ^= *((unsigned int *)&m[8]);
*((unsigned int *)&s[12]) ^= *((unsigned int *)&m[12]);
}
// Absorb a partial (< 16 byte) associated-data block: pad it to a full
// block first, then XOR it into the state word by word.
void rho_ad_ud16 (const unsigned char* m,
unsigned char* s,
int len8) {
unsigned char mp [16];
pad(m,mp,16,len8);
*((unsigned int *)&s[0]) ^= *((unsigned int *)&mp[0]);
*((unsigned int *)&s[4]) ^= *((unsigned int *)&mp[4]);
*((unsigned int *)&s[8]) ^= *((unsigned int *)&mp[8]);
*((unsigned int *)&s[12]) ^= *((unsigned int *)&mp[12]);
}
// rho for a full 16-byte message block: c = G(s) ^ m, s ^= m.
// g8A writes G(s) into c, then both state and ciphertext are XORed
// with the message word by word.
void rho_eqov16 (const unsigned char* m,
unsigned char* c,
unsigned char* s) {
g8A(s,c);
*((unsigned int *)&s[0]) ^= *((unsigned int *)&m[0]);
*((unsigned int *)&s[4]) ^= *((unsigned int *)&m[4]);
*((unsigned int *)&s[8]) ^= *((unsigned int *)&m[8]);
*((unsigned int *)&s[12]) ^= *((unsigned int *)&m[12]);
*((unsigned int *)&c[0]) ^= *((unsigned int *)&m[0]);
*((unsigned int *)&c[4]) ^= *((unsigned int *)&m[4]);
*((unsigned int *)&c[8]) ^= *((unsigned int *)&m[8]);
*((unsigned int *)&c[12]) ^= *((unsigned int *)&m[12]);
}
// rho for a partial message block: pad m to a full block, absorb the
// padded block into the state, and emit only len8 ciphertext bytes
// (c = G(s) ^ m); the remaining bytes of c are cleared.
void rho_ud16 (const unsigned char* m,
unsigned char* c,
unsigned char* s,
int len8,
int ver) {
int i;
unsigned char mp [16];
pad(m,mp,ver,len8);
g8A(s,c);
// state absorbs the padded block (word-wide)
*((unsigned int *)&s[0]) ^= *((unsigned int *)&mp[0]);
*((unsigned int *)&s[4]) ^= *((unsigned int *)&mp[4]);
*((unsigned int *)&s[8]) ^= *((unsigned int *)&mp[8]);
*((unsigned int *)&s[12]) ^= *((unsigned int *)&mp[12]);
// ciphertext is produced byte-wise: only the valid bytes survive
for (i = 0; i < ver; i++) {
if (i < len8) {
c[i] = c[i] ^ mp[i];
}
else {
c[i] = 0;
}
}
}
// Inverse rho for decryption: recover m = G(s) ^ c for the len8 valid
// bytes and update the state so it matches what rho produced during
// encryption (s absorbs the padded plaintext, reconstructed in place).
void irho (unsigned char* m,
const unsigned char* c,
unsigned char* s,
int len8,
int ver) {
int i;
unsigned char cp [16];
pad(c,cp,ver,len8);
g8A(s,m); // m temporarily holds G(s)
for (i = 0; i < ver; i++) {
// valid bytes: s ^= recovered plaintext byte (cp ^ m); padding
// bytes: s ^= cp only (the pad byte itself)
if (i < len8) {
s[i] = s[i] ^ cp[i] ^ m[i];
}
else {
s[i] = s[i] ^ cp[i];
}
// turn G(s) into plaintext for the valid bytes; clear the rest
if (i < len8) {
m[i] = m[i] ^ cp[i];
}
else {
m[i] = 0;
}
}
}
// Reset the 56-bit block counter to 1.  CNT is 8 bytes so it can be
// written as two 32-bit words; the top byte (CNT[7]) doubles as the
// domain byte and is cleared here.
void reset_lfsr_gf56 (unsigned char* CNT) {
*((unsigned int *)&CNT[0]) = 0x00000001;
*((unsigned int *)&CNT[4]) = 0x00000000;
}
// Advance the 56-bit block counter by one LFSR step.  The counter lives
// in CNT[0..6], handled as two 32-bit words; when the top counter bit
// (bit 23 of the high word, bit 55 overall) is set, the feedback
// constant 0x95 is folded into the low byte on the shift.
void lfsr_gf56 (unsigned char* CNT) {
  unsigned int lo = *((unsigned int *)&CNT[0]); // CNT3 CNT2 CNT1 CNT0
  unsigned int hi = *((unsigned int *)&CNT[4]); // CNT7 CNT6 CNT5 CNT4
  unsigned int fb = ((hi >> 23) & 0x01) ? 0x95 : 0x00;
  unsigned int new_hi = (hi << 1) | (lo >> 31);
  unsigned int new_lo = (lo << 1) ^ fb;
  *((unsigned int *)&CNT[0]) = new_lo;
  *((unsigned int *)&CNT[4]) = new_hi;
}
// Encrypt the 16-byte state s in place with SKINNY-128/384, using the
// tweakey (CNT, T, k) and whichever implementation variant is currently
// installed in p_skinny_ctrl->func_skinny_128_384_enc.
void block_cipher(unsigned char* s,
const unsigned char* k, unsigned char* T,
unsigned char* CNT,
skinny_ctrl* p_skinny_ctrl) {
p_skinny_ctrl->func_skinny_128_384_enc (s,p_skinny_ctrl,CNT,T,k);
}
// Encrypt the state with the nonce as tweak: copy the 16-byte nonce N
// into T (word-wide), place the domain byte D in CNT[7], and run one
// SKINNY call on s.
void nonce_encryption (const unsigned char* N,
unsigned char* CNT,
unsigned char*s, const unsigned char* k,
unsigned char D,
skinny_ctrl* p_skinny_ctrl) {
unsigned char T [16];
*((unsigned int *)&T[0]) = *((unsigned int *)&N[0]);
*((unsigned int *)&T[4]) = *((unsigned int *)&N[4]);
*((unsigned int *)&T[8]) = *((unsigned int *)&N[8]);
*((unsigned int *)&T[12]) = *((unsigned int *)&N[12]);
CNT[7] = D; // domain-separation byte rides in the top counter byte
block_cipher(s,k,T,CNT,p_skinny_ctrl);
}
// Derive the tag from the final state into *c, then rewind *c back to
// the start of the ciphertext (the encryption loop left *c n bytes
// before the end of the *clen-byte output).
void generate_tag (unsigned char** c, unsigned char* s,
int n, unsigned long long* clen) {
g8A_for_Tag_Generation(s, *c);
*c = *c + n;
*c = *c - *clen;
}
// Encrypt one full 16-byte message block: rho produces the ciphertext
// and updates the state, pointers advance, the counter is clocked and
// the nonce is re-encrypted with domain byte D for the next step.
// Returns the remaining message length.
unsigned long long msg_encryption_eqov16 (const unsigned char** M, unsigned char** c,
const unsigned char* N,
unsigned char* CNT,
unsigned char*s, const unsigned char* k,
unsigned char D,
unsigned long long mlen,
skinny_ctrl* p_skinny_ctrl) {
rho_eqov16(*M, *c, s);
*c = *c + 16;
*M = *M + 16;
lfsr_gf56(CNT);
nonce_encryption(N,CNT,s,k,D,p_skinny_ctrl);
return mlen - 16;
}
// Encrypt the final, incomplete (< 16 byte) message block via the
// padding variant of rho, then clock the counter and encrypt the nonce
// for tag generation.  Always returns 0 (no message bytes remain).
unsigned long long msg_encryption_ud16 (const unsigned char** M, unsigned char** c,
const unsigned char* N,
unsigned char* CNT,
unsigned char*s, const unsigned char* k,
unsigned char D,
unsigned long long mlen,
skinny_ctrl* p_skinny_ctrl) {
rho_ud16(*M, *c, s, mlen, 16);
*c = *c + mlen;
*M = *M + mlen;
lfsr_gf56(CNT);
nonce_encryption(N,CNT,s,k,D,p_skinny_ctrl);
return 0;
}
// Decrypt one ciphertext block (up to 16 bytes) via the inverse rho,
// advance both pointers, clock the counter, and encrypt the nonce with
// domain byte D for the next step.  Returns remaining ciphertext length.
unsigned long long msg_decryption (unsigned char** M, const unsigned char** c,
const unsigned char* N,
unsigned char* CNT,
unsigned char*s, const unsigned char* k,
unsigned char D,
unsigned long long clen,
skinny_ctrl* p_skinny_ctrl) {
int len8;
if (clen >= 16) {
len8 = 16;
clen = clen - 16;
}
else {
// final, possibly partial block
len8 = clen;
clen = 0;
}
irho(*M, *c, s, len8, 16);
*c = *c + len8;
*M = *M + len8;
lfsr_gf56(CNT);
nonce_encryption(N,CNT,s,k,D,p_skinny_ctrl);
return clen;
}
// Absorb a full 32-byte pair of AD blocks: the first 16 bytes XOR into
// the state, the second 16 bytes enter through the tweak T of a SKINNY
// call.  The counter is clocked once per half.  Returns adlen - 32.
unsigned long long ad_encryption_eqov32 (const unsigned char** A, unsigned char* s,
const unsigned char* k, unsigned long long adlen,
unsigned char* CNT,
unsigned char D,
skinny_ctrl* p_skinny_ctrl) {
unsigned char T [16];
rho_ad_eqov16(*A, s);
*A = *A + 16;
lfsr_gf56(CNT);
//pad(*A, T, 16, 16);
// full second half: straight word-wide copy instead of pad()
*((unsigned int *)&T[0]) = *((unsigned int *)&(*A)[0]);
*((unsigned int *)&T[4]) = *((unsigned int *)&(*A)[4]);
*((unsigned int *)&T[8]) = *((unsigned int *)&(*A)[8]);
*((unsigned int *)&T[12]) = *((unsigned int *)&(*A)[12]);
*A = *A + 16;
CNT[7] = D;
block_cipher(s,k,T,CNT,p_skinny_ctrl);
lfsr_gf56(CNT);
return adlen - 32;
}
// Absorb the final AD pair when 16 < adlen < 32: the first 16 bytes XOR
// into the state, the remaining (partial) bytes are padded into the
// tweak T for a SKINNY call.  Always returns 0 (AD fully consumed).
unsigned long long ad_encryption_ov16 (const unsigned char** A, unsigned char* s,
const unsigned char* k, unsigned long long adlen,
unsigned char* CNT,
unsigned char D,
skinny_ctrl* p_skinny_ctrl) {
unsigned char T [16];
adlen = adlen - 16; // bytes left for the partial second half
rho_ad_eqov16(*A, s);
*A = *A + 16;
lfsr_gf56(CNT);
pad(*A, T, 16, adlen);
*A = *A + adlen;
CNT[7] = D;
block_cipher(s,k,T,CNT,p_skinny_ctrl);
lfsr_gf56(CNT);
return 0;
}
// Absorb the final AD block when adlen == 16 exactly: XOR it into the
// state and clock the counter; no SKINNY call is needed here (the
// caller performs the closing nonce encryption).  Returns 0.
unsigned long long ad_encryption_eq16 (const unsigned char** A, unsigned char* s,
unsigned char* CNT) {
rho_ad_eqov16(*A, s);
*A = *A + 16;
lfsr_gf56(CNT);
return 0;
}
// Absorb the final AD block when adlen < 16: pad and XOR it into the
// state, clock the counter.  Returns 0 (AD fully consumed).
unsigned long long ad_encryption_ud16 (const unsigned char** A, unsigned char* s,
unsigned long long adlen,
unsigned char* CNT) {
rho_ad_ud16(*A, s, adlen);
*A = *A + adlen;
lfsr_gf56(CNT);
return 0;
}
// AEAD encryption (ROMULUS-N style: online, single pass).
// AD is absorbed in 32-byte pairs, then the message is encrypted block
// by block; the domain byte passed to nonce_encryption encodes which
// padding case terminated each phase.  NIST LWC crypto_aead interface;
// nsec is unused.  Returns 0; *clen = mlen + 16 (tag appended).
int crypto_aead_encrypt (
unsigned char* c, unsigned long long* clen,
const unsigned char* m, unsigned long long mlen,
const unsigned char* ad, unsigned long long adlen,
const unsigned char* nsec,
const unsigned char* npub,
const unsigned char* k
)
{
unsigned char s[16];
// size 7 -> 8 for word access
unsigned char CNT[8];
const unsigned char* A;
const unsigned char* M;
const unsigned char* N;
skinny_ctrl l_skinny_ctrl;
(void) nsec;
A = ad;
M = m;
N = npub;
l_skinny_ctrl.func_skinny_128_384_enc = skinny_128_384_enc123_12;
// zero initial state
*((unsigned int *)&s[0]) = 0x00000000;
*((unsigned int *)&s[4]) = 0x00000000;
*((unsigned int *)&s[8]) = 0x00000000;
*((unsigned int *)&s[12]) = 0x00000000;
reset_lfsr_gf56(CNT);
// Phase 1: absorb AD; the domain byte distinguishes the padding case.
if (adlen == 0) { // AD is an empty string
lfsr_gf56(CNT);
nonce_encryption(N,CNT,s,k,0x1a,&l_skinny_ctrl);
}
else while (adlen > 0) {
if (adlen < 16) { // The last block of AD is odd and incomplete
adlen = ad_encryption_ud16(&A,s,adlen,CNT);
nonce_encryption(N,CNT,s,k,0x1a,&l_skinny_ctrl);
}
else if (adlen == 16) { // The last block of AD is odd and complete
adlen = ad_encryption_eq16(&A,s,CNT);
nonce_encryption(N,CNT,s,k,0x18,&l_skinny_ctrl);
}
else if (adlen < (32)) { // The last block of AD is even and incomplete
adlen = ad_encryption_ov16(&A,s,k,adlen,CNT,0x08,&l_skinny_ctrl);
nonce_encryption(N,CNT,s,k,0x1a,&l_skinny_ctrl);
}
else if (adlen == (32)) { // The last block of AD is even and complete
adlen = ad_encryption_eqov32(&A,s,k,adlen,CNT,0x08,&l_skinny_ctrl);
nonce_encryption(N,CNT,s,k,0x18,&l_skinny_ctrl);
}
else { // A normal full pair of blocks of AD
adlen = ad_encryption_eqov32(&A,s,k,adlen,CNT,0x08,&l_skinny_ctrl);
}
}
// because, nonce_encryption is called at the last block of AD encryption
l_skinny_ctrl.func_skinny_128_384_enc = skinny_128_384_enc1_1;
reset_lfsr_gf56(CNT);
*clen = mlen + 16;
// Phase 2: encrypt the message; again the domain byte encodes the case.
if (mlen == 0) { // M is an empty string
lfsr_gf56(CNT);
nonce_encryption(N,CNT,s,k,0x15,&l_skinny_ctrl);
}
else while (mlen > 0) {
if (mlen < 16) { // The last block of M is incomplete
mlen = msg_encryption_ud16(&M,&c,N,CNT,s,k,0x15,mlen,&l_skinny_ctrl);
}
else if (mlen == 16) { // The last block of M is complete
mlen = msg_encryption_eqov16(&M,&c,N,CNT,s,k,0x14,mlen,&l_skinny_ctrl);
}
else { // A normal full message block
mlen = msg_encryption_eqov16(&M,&c,N,CNT,s,k,0x04,mlen,&l_skinny_ctrl);
}
}
// Tag generation
generate_tag(&c,s,16,clen);
return 0;
}
// AEAD decryption matching crypto_aead_encrypt above: absorb AD, decrypt
// the ciphertext, recompute the tag and compare it with the 16 bytes
// following the ciphertext.  Returns 0 on success, -1 on tag mismatch.
// nsec is unused.
int crypto_aead_decrypt(
unsigned char *m,unsigned long long *mlen,
unsigned char *nsec,
const unsigned char *c,unsigned long long clen,
const unsigned char *ad,unsigned long long adlen,
const unsigned char *npub,
const unsigned char *k
)
{
unsigned char s[16];
unsigned char T[16];
// size 7 -> 8 for word access
unsigned char CNT[8];
const unsigned char* A;
unsigned char* M;
const unsigned char* N;
unsigned int i;
skinny_ctrl l_skinny_ctrl;
(void) nsec;
A = ad;
M = m;
N = npub;
l_skinny_ctrl.func_skinny_128_384_enc = skinny_128_384_enc123_12;
// zero initial state
for (i = 0; i < 16; i++) {
s[i] = 0;
}
reset_lfsr_gf56(CNT);
// Phase 1: absorb AD exactly as in encryption.
if (adlen == 0) { // AD is an empty string
lfsr_gf56(CNT);
nonce_encryption(N,CNT,s,k,0x1a,&l_skinny_ctrl);
}
else while (adlen > 0) {
if (adlen < 16) { // The last block of AD is odd and incomplete
adlen = ad_encryption_ud16(&A,s,adlen,CNT);
nonce_encryption(N,CNT,s,k,0x1a,&l_skinny_ctrl);
}
else if (adlen == 16) { // The last block of AD is odd and complete
adlen = ad_encryption_eq16(&A,s,CNT);
nonce_encryption(N,CNT,s,k,0x18,&l_skinny_ctrl);
}
else if (adlen < (32)) { // The last block of AD is even and incomplete
adlen = ad_encryption_ov16(&A,s,k,adlen,CNT,0x08,&l_skinny_ctrl);
nonce_encryption(N,CNT,s,k,0x1a,&l_skinny_ctrl);
}
else if (adlen == (32)) { // The last block of AD is even and complete
adlen = ad_encryption_eqov32(&A,s,k,adlen,CNT,0x08,&l_skinny_ctrl);
nonce_encryption(N,CNT,s,k,0x18,&l_skinny_ctrl);
}
else { // A normal full pair of blocks of AD
adlen = ad_encryption_eqov32(&A,s,k,adlen,CNT,0x08,&l_skinny_ctrl);
}
}
reset_lfsr_gf56(CNT);
clen = clen - 16; // strip the tag; the remainder is ciphertext
*mlen = clen;
// Phase 2: decrypt the ciphertext blocks.
if (clen == 0) { // C is an empty string
lfsr_gf56(CNT);
nonce_encryption(N,CNT,s,k,0x15,&l_skinny_ctrl);
}
else while (clen > 0) {
if (clen < 16) { // The last block of C is incomplete
clen = msg_decryption(&M,&c,N,CNT,s,k,0x15,clen,&l_skinny_ctrl);
}
else if (clen == 16) { // The last block of C is complete
clen = msg_decryption(&M,&c,N,CNT,s,k,0x14,clen,&l_skinny_ctrl);
}
else { // A normal full message block
clen = msg_decryption(&M,&c,N,CNT,s,k,0x04,clen,&l_skinny_ctrl);
}
}
// Tag generation
g8A_for_Tag_Generation(s, T);
// c now points at the received tag.
// NOTE(review): early-exit compare leaks timing; a constant-time
// comparison would be preferable in a real deployment.
for (i = 0; i < 16; i++) {
if (T[i] != (*(c+i))) {
return -1;
}
}
return 0;
}
//
// NIST-developed software is provided by NIST as a public service.
// You may use, copy and distribute copies of the software in any medium,
// provided that you keep intact this entire notice. You may improve,
// modify and create derivative works of the software or any portion of
// the software, and you may copy and distribute such modifications or
// works. Modified works should carry a notice stating that you changed
// the software and should note the date and nature of any such change.
// Please explicitly acknowledge the National Institute of Standards and
// Technology as the source of the software.
//
// NIST-developed software is expressly provided "AS IS." NIST MAKES NO
// WARRANTY OF ANY KIND, EXPRESS, IMPLIED, IN FACT OR ARISING BY OPERATION
// OF LAW, INCLUDING, WITHOUT LIMITATION, THE IMPLIED WARRANTY OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE, NON-INFRINGEMENT AND DATA ACCURACY. NIST
// NEITHER REPRESENTS NOR WARRANTS THAT THE OPERATION OF THE SOFTWARE WILL BE
// UNINTERRUPTED OR ERROR-FREE, OR THAT ANY DEFECTS WILL BE CORRECTED. NIST
// DOES NOT WARRANT OR MAKE ANY REPRESENTATIONS REGARDING THE USE OF THE SOFTWARE
// OR THE RESULTS THEREOF, INCLUDING BUT NOT LIMITED TO THE CORRECTNESS, ACCURACY,
// RELIABILITY, OR USEFULNESS OF THE SOFTWARE.
//
// You are solely responsible for determining the appropriateness of using and
// distributing the software and you assume all risks associated with its use,
// including but not limited to the risks and costs of program errors, compliance
// with applicable laws, damage to or loss of data, programs or equipment, and
// the unavailability or interruption of operation. This software is not intended
// to be used in any situation where a failure could cause risk of injury or
// damage to property. The software developed by NIST employees is not subject to
// copyright protection within the United States.
//
// disable deprecation for sprintf and fopen
#ifdef _MSC_VER
#define _CRT_SECURE_NO_WARNINGS
#endif
#include <stdio.h>
#include <string.h>
#include "crypto_aead.h"
#include "api.h"
#define KAT_SUCCESS 0
#define KAT_FILE_OPEN_ERROR -1
#define KAT_DATA_ERROR -3
#define KAT_CRYPTO_FAILURE -4
#define MAX_FILE_NAME 256
#define MAX_MESSAGE_LENGTH 32
#define MAX_ASSOCIATED_DATA_LENGTH 32
void init_buffer(unsigned char *buffer, unsigned long long numbytes);
void fprint_bstr(FILE *fp, const char *label, const unsigned char *data, unsigned long long length);
int generate_test_vectors();
// Entry point: run the KAT generator and report any failure on stderr.
int main()
{
  const int status = generate_test_vectors();
  if (status != KAT_SUCCESS) {
    fprintf(stderr, "test vector generation failed with code %d\n", status);
  }
  return status;
}
// Generate the NIST LWC AEAD known-answer-test file for every
// (mlen, adlen) combination up to the configured maxima, verifying that
// each ciphertext round-trips through decryption.
// Returns KAT_SUCCESS, KAT_FILE_OPEN_ERROR, or KAT_CRYPTO_FAILURE.
int generate_test_vectors()
{
  FILE *fp;
  char fileName[MAX_FILE_NAME];
  unsigned char key[CRYPTO_KEYBYTES];
  unsigned char nonce[CRYPTO_NPUBBYTES];
  unsigned char msg[MAX_MESSAGE_LENGTH];
  unsigned char msg2[MAX_MESSAGE_LENGTH];
  unsigned char ad[MAX_ASSOCIATED_DATA_LENGTH];
  unsigned char ct[MAX_MESSAGE_LENGTH + CRYPTO_ABYTES];
  // clen must be initialized: the debug printf below reads it before the
  // first call to crypto_aead_encrypt sets it (previously UB).
  unsigned long long clen = 0, mlen2;
  int count = 1;
  int func_ret, ret_val = KAT_SUCCESS;
  init_buffer(key, sizeof(key));
  init_buffer(nonce, sizeof(nonce));
  init_buffer(msg, sizeof(msg));
  init_buffer(ad, sizeof(ad));
  sprintf(fileName, "LWC_AEAD_KAT_%d_%d.txt", (CRYPTO_KEYBYTES * 8), (CRYPTO_NPUBBYTES * 8));
  if ((fp = fopen(fileName, "w")) == NULL) {
    fprintf(stderr, "Couldn't open <%s> for write\n", fileName);
    return KAT_FILE_OPEN_ERROR;
  }
  for (unsigned long long mlen = 0; (mlen <= MAX_MESSAGE_LENGTH) && (ret_val == KAT_SUCCESS); mlen++) {
    for (unsigned long long adlen = 0; adlen <= MAX_ASSOCIATED_DATA_LENGTH; adlen++) {
      // debug progress output: previous iteration's ciphertext length
      printf("%0d\n", (int)clen);
      fprintf(fp, "Count = %d\n", count++);
      printf("Count = %d\n", count - 1);
      fprint_bstr(fp, "Key = ", key, CRYPTO_KEYBYTES);
      fprint_bstr(fp, "Nonce = ", nonce, CRYPTO_NPUBBYTES);
      fprint_bstr(fp, "PT = ", msg, mlen);
      fprint_bstr(fp, "AD = ", ad, adlen);
      if ((func_ret = crypto_aead_encrypt(ct, &clen, msg, mlen, ad, adlen, NULL, nonce, key)) != 0) {
        fprintf(fp, "crypto_aead_encrypt returned <%d>\n", func_ret);
        ret_val = KAT_CRYPTO_FAILURE;
        break;
      }
      fprint_bstr(fp, "CT = ", ct, clen);
      fprintf(fp, "\n");
      // round-trip check: decrypt must succeed and recover the plaintext
      if ((func_ret = crypto_aead_decrypt(msg2, &mlen2, NULL, ct, clen, ad, adlen, nonce, key)) != 0) {
        fprintf(fp, "crypto_aead_decrypt returned <%d>\n", func_ret);
        ret_val = KAT_CRYPTO_FAILURE;
        break;
      }
      if (mlen != mlen2) {
        fprintf(fp, "crypto_aead_decrypt returned bad 'mlen': Got <%llu>, expected <%llu>\n", mlen2, mlen);
        ret_val = KAT_CRYPTO_FAILURE;
        break;
      }
      if (memcmp(msg, msg2, mlen)) {
        fprintf(fp, "crypto_aead_decrypt did not recover the plaintext\n");
        ret_val = KAT_CRYPTO_FAILURE;
        break;
      }
    }
  }
  fclose(fp);
  return ret_val;
}
// Emit one KAT line to fp: the label followed by the data bytes as
// uppercase hex (two digits per byte) and a trailing newline.
void fprint_bstr(FILE *fp, const char *label, const unsigned char *data, unsigned long long length)
{
  fputs(label, fp);
  for (unsigned long long idx = 0; idx < length; idx++) {
    fprintf(fp, "%02X", data[idx]);
  }
  fputc('\n', fp);
}
// Fill buffer with the deterministic byte pattern 0x00, 0x01, 0x02, ...
// (i.e. index modulo 256), as required by the NIST KAT harness.
void init_buffer(unsigned char *buffer, unsigned long long numbytes)
{
  unsigned long long pos = 0;
  while (pos < numbytes) {
    buffer[pos] = (unsigned char)pos;
    pos++;
  }
}
// Context for the SKINNY-128/384 primitive: the precomputed round keys
// plus a function pointer naming the variant to call next.  The variant
// names suggest they differ in how much of the tweakey schedule is
// recomputed per call (enc123 -> enc12 -> enc1) — TODO confirm against
// the skinny implementation.
// NOTE(review): "___skinny_ctrl" (leading double underscore) is a
// reserved identifier in C; renaming would be safer.
typedef struct ___skinny_ctrl {
unsigned char roundKeys[960]; // number of round : 56
void (*func_skinny_128_384_enc)(unsigned char*, struct ___skinny_ctrl*, unsigned char* CNT, unsigned char* T, const unsigned char* K);
} skinny_ctrl;
extern void skinny_128_384_enc123_12 (unsigned char* input, skinny_ctrl* pskinny_ctrl, unsigned char* CNT, unsigned char* T, const unsigned char* K);
extern void skinny_128_384_enc12_12 (unsigned char* input, skinny_ctrl* pskinny_ctrl, unsigned char* CNT, unsigned char* T, const unsigned char* K);
extern void skinny_128_384_enc1_1 (unsigned char* input, skinny_ctrl* pskinny_ctrl, unsigned char* CNT, unsigned char* T, const unsigned char* K);
This source diff could not be displayed because it is too large. You can view the blob instead.
This source diff could not be displayed because it is too large. You can view the blob instead.
This source diff could not be displayed because it is too large. You can view the blob instead.
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment