Commit da92cb38 by Enrico Pozzobon

Merge branch 'email-submissions'

parents 90acf8b3 f9e2581f
This source diff could not be displayed because it is too large. You can view the blob instead.
//API required by the NIST for the LWC competition
int crypto_aead_encrypt(unsigned char *c, unsigned long long *clen,
const unsigned char *m, unsigned long long mlen,
const unsigned char *ad, unsigned long long adlen,
const unsigned char *nsec, const unsigned char *npub,
const unsigned char *k);
//API required by the NIST for the LWC competition
int crypto_aead_decrypt(unsigned char *m, unsigned long long *outputmlen,
unsigned char *nsec,
const unsigned char *c, unsigned long long clen,
const unsigned char *ad, unsigned long long adlen,
const unsigned char *npub, const unsigned char *k);
#include "skinny128.h"
#include "romulus.h"
#include <string.h>
u8 final_ad_domain (unsigned long long adlen, unsigned long long mlen) {
u8 domain = 0;
u32 leftover;
//Determine which domain bits we need based on the length of the ad
if (adlen == 0) {
domain ^= 0x02; // No message, so only 1 block with padding
} else {
leftover = (u32)(adlen % (2 * BLOCKBYTES));
if (leftover == 0) { // Even or odd ad length?
domain ^= 0x08; // Even with a full double block at the end
} else if (leftover < BLOCKBYTES) {
domain ^= 0x02; // Odd with a partial single block at the end
} else if (leftover > BLOCKBYTES) {
domain ^= 0x0A; // Even with a partial double block at the end
}
}
//Determine which domain bits we need based on the length of the message
if (mlen == 0) {
domain ^= 0x01; // No message, so only 1 block with padding
} else {
leftover = (u32)(mlen % (2 * BLOCKBYTES));
if (leftover == 0) { // Even or odd message length?
domain ^= 0x04; // Even with a full double block at the end
} else if (leftover < BLOCKBYTES) {
domain ^= 0x01; // Odd with a partial single block at the end
} else if (leftover > BLOCKBYTES) {
domain ^= 0x05; // Even with a partial double block at the end
}
}
return domain;
}
//Encryption and authentication using Romulus-N1
int crypto_aead_encrypt
(unsigned char *c, unsigned long long *clen,
const unsigned char *m, unsigned long long mlen,
const unsigned char *ad, unsigned long long adlen,
const unsigned char *nsec,
const unsigned char *npub,
const unsigned char *k) {
u64 tmp_mlen = mlen;
u32 tmp;
const u8* m_auth = m;
u8 final_domain = 0x30;
skinny_128_384_tks tks;
u8 state[BLOCKBYTES], pad[BLOCKBYTES];
(void)nsec;
// ----------------- Initialization -----------------
*clen = mlen + TAGBYTES;
memset(tks.tk1, 0x00, KEYBYTES);
memset(state, 0x00, BLOCKBYTES);
tks.tk1[0] = 0x01; // Init the 56-bit LFSR counter
// ----------------- Initialization -----------------
// ----------------- Process the associated data -----------------
final_domain ^= final_ad_domain(adlen, mlen);
SET_DOMAIN(tks, 0x28);
while (adlen > 2*BLOCKBYTES) { // Process double blocks but the last
UPDATE_CTR(tks.tk1);
XOR_BLOCK(state, state, ad);
tkschedule_lfsr(tks.rtk, ad + BLOCKBYTES, k, SKINNY128_384_ROUNDS);
tkschedule_perm(tks.rtk);
tkschedule_perm_tk1(tks.rtk1, tks.tk1);
skinny128_384(state, tks.rtk, state, tks.rtk1);
UPDATE_CTR(tks.tk1);
ad += 2*BLOCKBYTES;
adlen -= 2*BLOCKBYTES;
}
// Pad and process the left-over blocks
if (adlen == 2*BLOCKBYTES) { // Left-over complete double block
UPDATE_CTR(tks.tk1);
XOR_BLOCK(state, state, ad);
tkschedule_lfsr(tks.rtk, ad + BLOCKBYTES, k, SKINNY128_384_ROUNDS);
tkschedule_perm(tks.rtk);
tkschedule_perm_tk1(tks.rtk1, tks.tk1);
skinny128_384(state, tks.rtk, state, tks.rtk1);
UPDATE_CTR(tks.tk1);
} else if (adlen > BLOCKBYTES) { // Left-over partial double block
adlen -= BLOCKBYTES;
UPDATE_CTR(tks.tk1);
XOR_BLOCK(state, state, ad);
memcpy(pad, ad + BLOCKBYTES, adlen);
memset(pad + adlen, 0x00, 15 - adlen);
pad[15] = adlen; // Padding
tkschedule_lfsr(tks.rtk, pad, k, SKINNY128_384_ROUNDS);
tkschedule_perm(tks.rtk);
tkschedule_perm_tk1(tks.rtk1, tks.tk1);
skinny128_384(state, tks.rtk, state, tks.rtk1);
UPDATE_CTR(tks.tk1);
} else {
SET_DOMAIN(tks, 0x2C);
UPDATE_CTR(tks.tk1);
if (adlen == BLOCKBYTES) { // Left-over complete single block
XOR_BLOCK(state, state, ad);
} else { // Left-over partial single block
for(int i =0; i < adlen; i++)
state[i] ^= ad[i];
state[15] ^= adlen; // Padding
}
if (tmp_mlen >= BLOCKBYTES) {
tkschedule_lfsr(tks.rtk, m_auth, k, SKINNY128_384_ROUNDS);
tkschedule_perm(tks.rtk);
tkschedule_perm_tk1(tks.rtk1, tks.tk1);
skinny128_384(state, tks.rtk, state, tks.rtk1);
if (tmp_mlen > BLOCKBYTES)
UPDATE_CTR(tks.tk1);
m_auth += BLOCKBYTES;
tmp_mlen -= BLOCKBYTES;
} else {
memcpy(pad, m_auth, tmp_mlen);
memset(pad + tmp_mlen, 0x00, BLOCKBYTES - tmp_mlen - 1);
pad[15] = (u8)tmp_mlen; // Padding
tkschedule_lfsr(tks.rtk, pad, k, SKINNY128_384_ROUNDS);
tkschedule_perm(tks.rtk);
tkschedule_perm_tk1(tks.rtk1, tks.tk1);
skinny128_384(state, tks.rtk, state, tks.rtk1);
tmp_mlen = 0;
}
}
// Process all message double blocks except the last
SET_DOMAIN(tks, 0x2C);
while (tmp_mlen > 32) {
UPDATE_CTR(tks.tk1);
XOR_BLOCK(state, state, m_auth);
tkschedule_lfsr(tks.rtk, m_auth + BLOCKBYTES, k, SKINNY128_384_ROUNDS);
tkschedule_perm(tks.rtk);
tkschedule_perm_tk1(tks.rtk1, tks.tk1);
skinny128_384(state, tks.rtk, state, tks.rtk1);
UPDATE_CTR(tks.tk1);
m_auth += 2 * BLOCKBYTES;
tmp_mlen -= 2 * BLOCKBYTES;
}
// Process the last message double block
if (tmp_mlen == 2 * BLOCKBYTES) { // Last message double block is full
UPDATE_CTR(tks.tk1);
XOR_BLOCK(state, state, m_auth);
tkschedule_lfsr(tks.rtk, m_auth + BLOCKBYTES, k, SKINNY128_384_ROUNDS);
tkschedule_perm(tks.rtk);
tkschedule_perm_tk1(tks.rtk1, tks.tk1);
skinny128_384(state, tks.rtk, state, tks.rtk1);
} else if (tmp_mlen > BLOCKBYTES) { // Last message double block is partial
tmp_mlen -= BLOCKBYTES;
UPDATE_CTR(tks.tk1);
XOR_BLOCK(state, state, m_auth);
memcpy(pad, m_auth + BLOCKBYTES, tmp_mlen);
memset(pad + tmp_mlen, 0x00, BLOCKBYTES - tmp_mlen - 1);
pad[15] = (u8)tmp_mlen; // Padding
tkschedule_lfsr(tks.rtk, pad, k, SKINNY128_384_ROUNDS);
tkschedule_perm(tks.rtk);
tkschedule_perm_tk1(tks.rtk1, tks.tk1);
skinny128_384(state, tks.rtk, state, tks.rtk1);
} else if (tmp_mlen == BLOCKBYTES) { // Last message single block is full
XOR_BLOCK(state, state, m_auth);
} else if (tmp_mlen > 0) { // Last message single block is partial
for(int i =0; i < (int)tmp_mlen; i++)
state[i] ^= m_auth[i];
state[15] ^= (u8)tmp_mlen; // Padding
}
// Process the last partial block
SET_DOMAIN(tks, final_domain);
UPDATE_CTR(tks.tk1);
tkschedule_lfsr(tks.rtk, npub, k, SKINNY128_384_ROUNDS);
tkschedule_perm(tks.rtk);
tkschedule_perm_tk1(tks.rtk1, tks.tk1);
skinny128_384(state, tks.rtk, state, tks.rtk1);
// ----------------- Process the associated data -----------------
// ----------------- Generate the tag -----------------
G(state,state);
memcpy(c + mlen, state, TAGBYTES);
// ----------------- Generate the tag -----------------
memset(tks.tk1, 0x00, KEYBYTES);
tks.tk1[0] = 0x01; // Init the 56-bit LFSR counter
if (mlen > 0) {
SET_DOMAIN(tks, 0x24);
while (mlen > BLOCKBYTES) {
tkschedule_perm_tk1(tks.rtk1, tks.tk1);
skinny128_384(state, tks.rtk, state, tks.rtk1);
RHO(state,c,m);
UPDATE_CTR(tks.tk1);
c += BLOCKBYTES;
m += BLOCKBYTES;
mlen -= BLOCKBYTES;
}
tkschedule_perm_tk1(tks.rtk1, tks.tk1);
skinny128_384(state, tks.rtk, state, tks.rtk1);
for(int i = 0; i < (int)mlen; i++) {
tmp = m[i]; // Use of tmp variable in case c = m
c[i] = m[i] ^ (state[i] >> 1) ^ (state[i] & 0x80) ^ (state[i] << 7);
state[i] ^= (u8)tmp;
}
state[15] ^= (u8)mlen; // Padding
}
return 0;
}
//Decryption and tag verification using Romulus-N1
int crypto_aead_decrypt
(unsigned char *m, unsigned long long *mlen,
unsigned char *nsec,
const unsigned char *c, unsigned long long clen,
const unsigned char *ad, unsigned long long adlen,
const unsigned char *npub,
const unsigned char *k) {
u32 tmp, tmp_mlen;
u8 final_domain = 0x30;
u8* m_auth = m;
const u8* c_tmp = c;
skinny_128_384_tks tks;
u8 state[BLOCKBYTES], pad[BLOCKBYTES];
(void)nsec;
if (clen < TAGBYTES)
return -1;
// ----------------- Initialization -----------------
*mlen = clen - TAGBYTES;
memset(tks.tk1, 0x00, KEYBYTES);
tks.tk1[0] = 0x01; // Init the 56-bit LFSR counter
// ----------------- Initialization -----------------
// ----------------- Process the ciphertext -----------------
clen -= TAGBYTES;
memcpy(state, c + clen, TAGBYTES);
tmp_mlen = clen;
if (tmp_mlen > 0) {
tkschedule_lfsr(tks.rtk, npub, k, SKINNY128_384_ROUNDS);
tkschedule_perm(tks.rtk);
SET_DOMAIN(tks, 0x24);
while (tmp_mlen > BLOCKBYTES) {
tkschedule_perm_tk1(tks.rtk1, tks.tk1);
skinny128_384(state, tks.rtk, state, tks.rtk1);
RHO_INV(state, c, m);
UPDATE_CTR(tks.tk1);
c += BLOCKBYTES;
m += BLOCKBYTES;
tmp_mlen -= BLOCKBYTES;
}
tkschedule_perm_tk1(tks.rtk1, tks.tk1);
skinny128_384(state, tks.rtk, state, tks.rtk1);
for(int i = 0; i < (int)tmp_mlen; i++) {
m[i] = c[i] ^ (state[i] >> 1) ^ (state[i] & 0x80) ^ (state[i] << 7);
state[i] ^= m[i];
}
state[15] ^= (u8)tmp_mlen; // Padding
}
// ----------------- Process the ciphertext -----------------
// ----------------- Process the associated data -----------------
memset(tks.tk1, 0x00, KEYBYTES);
tks.tk1[0] = 0x01; // Init the 56-bit LFSR counter
memset(state, 0x00, BLOCKBYTES);
final_domain ^= final_ad_domain(adlen, clen);
SET_DOMAIN(tks, 0x28);
while (adlen > 2*BLOCKBYTES) { // Process double blocks but the last
UPDATE_CTR(tks.tk1);
XOR_BLOCK(state, state, ad);
tkschedule_lfsr(tks.rtk, ad + BLOCKBYTES, k, SKINNY128_384_ROUNDS);
tkschedule_perm(tks.rtk);
tkschedule_perm_tk1(tks.rtk1, tks.tk1);
skinny128_384(state, tks.rtk, state, tks.rtk1);
UPDATE_CTR(tks.tk1);
ad += 2*BLOCKBYTES;
adlen -= 2*BLOCKBYTES;
}
// Pad and process the left-over blocks
if (adlen == 2*BLOCKBYTES) { // Left-over complete double block
UPDATE_CTR(tks.tk1);
XOR_BLOCK(state, state, ad);
tkschedule_lfsr(tks.rtk, ad + BLOCKBYTES, k, SKINNY128_384_ROUNDS);
tkschedule_perm(tks.rtk);
tkschedule_perm_tk1(tks.rtk1, tks.tk1);
skinny128_384(state, tks.rtk, state, tks.rtk1);
UPDATE_CTR(tks.tk1);
} else if (adlen > BLOCKBYTES) { // Left-over partial double block
adlen -= BLOCKBYTES;
UPDATE_CTR(tks.tk1);
XOR_BLOCK(state, state, ad);
memcpy(pad, ad + BLOCKBYTES, adlen);
memset(pad + adlen, 0x00, 15 - adlen);
pad[15] = adlen; // Padding
tkschedule_lfsr(tks.rtk, pad, k, SKINNY128_384_ROUNDS);
tkschedule_perm(tks.rtk);
tkschedule_perm_tk1(tks.rtk1, tks.tk1);
skinny128_384(state, tks.rtk, state, tks.rtk1);
UPDATE_CTR(tks.tk1);
} else {
SET_DOMAIN(tks, 0x2C);
UPDATE_CTR(tks.tk1);
if (adlen == BLOCKBYTES) { // Left-over complete single block
XOR_BLOCK(state, state, ad);
} else { // Left-over partial single block
for(int i =0; i < (int)adlen; i++)
state[i] ^= ad[i];
state[15] ^= adlen; // Padding
}
if (clen >= BLOCKBYTES) {
tkschedule_lfsr(tks.rtk, m_auth, k, SKINNY128_384_ROUNDS);
tkschedule_perm(tks.rtk);
tkschedule_perm_tk1(tks.rtk1, tks.tk1);
skinny128_384(state, tks.rtk, state, tks.rtk1);
if (clen > BLOCKBYTES)
UPDATE_CTR(tks.tk1);
m_auth += BLOCKBYTES;
clen -= BLOCKBYTES;
} else {
memcpy(pad, m_auth, clen);
memset(pad + clen, 0x00, BLOCKBYTES - clen - 1);
pad[15] = (u8)clen; // Padding
tkschedule_lfsr(tks.rtk, pad, k, SKINNY128_384_ROUNDS);
tkschedule_perm(tks.rtk);
tkschedule_perm_tk1(tks.rtk1, tks.tk1);
skinny128_384(state, tks.rtk, state, tks.rtk1);
clen = 0;
}
}
// Process all message double blocks except the last
SET_DOMAIN(tks, 0x2C);
while (clen > 32) {
UPDATE_CTR(tks.tk1);
XOR_BLOCK(state, state, m_auth);
tkschedule_lfsr(tks.rtk, m_auth + BLOCKBYTES, k, SKINNY128_384_ROUNDS);
tkschedule_perm(tks.rtk);
tkschedule_perm_tk1(tks.rtk1, tks.tk1);
skinny128_384(state, tks.rtk, state, tks.rtk1);
UPDATE_CTR(tks.tk1);
m_auth += 2 * BLOCKBYTES;
clen -= 2 * BLOCKBYTES;
}
// Process the last message double block
if (clen == 2 * BLOCKBYTES) { // Last message double block is full
UPDATE_CTR(tks.tk1);
XOR_BLOCK(state, state, m_auth);
tkschedule_lfsr(tks.rtk, m_auth + BLOCKBYTES, k, SKINNY128_384_ROUNDS);
tkschedule_perm(tks.rtk);
tkschedule_perm_tk1(tks.rtk1, tks.tk1);
skinny128_384(state, tks.rtk, state, tks.rtk1);
} else if (clen > BLOCKBYTES) { // Last message double block is partial
clen -= BLOCKBYTES;
UPDATE_CTR(tks.tk1);
XOR_BLOCK(state, state, m_auth);
memcpy(pad, m_auth + BLOCKBYTES, clen);
memset(pad + clen, 0x00, BLOCKBYTES - clen - 1);
pad[15] = (u8)clen; // Padding
tkschedule_lfsr(tks.rtk, pad, k, SKINNY128_384_ROUNDS);
tkschedule_perm(tks.rtk);
tkschedule_perm_tk1(tks.rtk1, tks.tk1);
skinny128_384(state, tks.rtk, state, tks.rtk1);
} else if (clen == BLOCKBYTES) { // Last message single block is full
XOR_BLOCK(state, state, m_auth);
} else if (clen > 0) { // Last message single block is partial
for(int i =0; i < (int)clen; i++)
state[i] ^= m[i];
state[15] ^= (u8)clen; // Padding
}
// Process the last partial block
SET_DOMAIN(tks, final_domain);
UPDATE_CTR(tks.tk1);
tkschedule_lfsr(tks.rtk, npub, k, SKINNY128_384_ROUNDS);
tkschedule_perm(tks.rtk);
tkschedule_perm_tk1(tks.rtk1, tks.tk1);
skinny128_384(state, tks.rtk, state, tks.rtk1);
// ----------------- Process the associated data -----------------
// ----------------- Generate and check the tag -----------------
G(state,state);
tmp = 0;
for(int i = 0; i < TAGBYTES; i++)
tmp |= state[i] ^ c_tmp[*mlen+i]; //constant-time tag comparison
// ----------------- Generate and check the tag -----------------
return tmp;
}
\ No newline at end of file
#ifndef ROMULUSN1_H_
#define ROMULUSN1_H_
#include "skinny128.h"
//Short fixed-width aliases. NOTE: u8/u32 are also typedef'd (identically) in
//skinny128.h; identical typedef redefinition is valid since C11.
typedef unsigned char u8;
typedef unsigned int u32;
//FIX: u64 was previously 'typedef unsigned int u64;' -- only 32 bits wide --
//which silently truncated 64-bit lengths (e.g. 'u64 tmp_mlen = mlen;' in
//crypto_aead_encrypt). It must be a genuine 64-bit type.
typedef unsigned long long u64;
//Tweakey material for SKINNY-128-384: raw TK1 plus precomputed round tweakeys.
typedef struct {
u8 tk1[16]; //to manipulate tk1 byte-wise
u32 rtk1[4*16]; //to avoid tk schedule recomputations
u32 rtk[4*SKINNY128_384_ROUNDS]; //all round tweakeys
} skinny_128_384_tks;
#define TAGBYTES 16
#define KEYBYTES 16
#define BLOCKBYTES 16
//Writes the 8-bit domain separator into byte 7 of TK1.
#define SET_DOMAIN(tks, domain) ((tks).tk1[7] = (domain))
//G as defined in the Romulus specification in a 32-bit word-wise manner.
//NOTE: these macros rely on a u32 variable named 'tmp' in scope at the call
//site and on GNU C statement expressions; the u8* -> u32* casts assume the
//16-byte buffers are word-aligned and little-endian.
#define G(x,y) ({ \
tmp = ((u32*)(y))[0]; \
((u32*)(x))[0] = (tmp >> 1 & 0x7f7f7f7f) ^ ((tmp ^ (tmp << 7)) & 0x80808080); \
tmp = ((u32*)(y))[1]; \
((u32*)(x))[1] = (tmp >> 1 & 0x7f7f7f7f) ^ ((tmp ^ (tmp << 7)) & 0x80808080); \
tmp = ((u32*)(y))[2]; \
((u32*)(x))[2] = (tmp >> 1 & 0x7f7f7f7f) ^ ((tmp ^ (tmp << 7)) & 0x80808080); \
tmp = ((u32*)(y))[3]; \
((u32*)(x))[3] = (tmp >> 1 & 0x7f7f7f7f) ^ ((tmp ^ (tmp << 7)) & 0x80808080); \
})
//Updates the 56-bit LFSR block counter held in tk1 in a 32-bit word-wise
//manner (also requires 'tmp' in scope).
#define UPDATE_CTR(tk1) ({ \
tmp = ((u32*)(tk1))[1]; \
((u32*)(tk1))[1] = (tmp << 1) & 0x00ffffff; \
((u32*)(tk1))[1] |= (((u32*)(tk1))[0] >> 31); \
((u32*)(tk1))[1] |= tmp & 0xff000000; \
((u32*)(tk1))[0] <<= 1; \
if ((tmp >> 23) & 0x01) \
((u32*)(tk1))[0] ^= 0x95; \
})
//x <- y ^ z for 128-bit blocks
#define XOR_BLOCK(x,y,z) ({ \
((u32*)(x))[0] = ((u32*)(y))[0] ^ ((u32*)(z))[0]; \
((u32*)(x))[1] = ((u32*)(y))[1] ^ ((u32*)(z))[1]; \
((u32*)(x))[2] = ((u32*)(y))[2] ^ ((u32*)(z))[2]; \
((u32*)(x))[3] = ((u32*)(y))[3] ^ ((u32*)(z))[3]; \
})
//Rho as defined in the Romulus specification.
//Uses a u8[16] named 'pad' at the call site as a tmp variable in case y = z.
#define RHO(x,y,z) ({ \
G(pad,x); \
XOR_BLOCK(y, pad, z); \
XOR_BLOCK(x, x, z); \
})
//Rho inverse as defined in the Romulus specification.
//Uses 'pad' at the call site as a tmp variable in case y = z.
#define RHO_INV(x, y, z) ({ \
G(pad, x); \
XOR_BLOCK(z, pad, y); \
XOR_BLOCK(x, x, z); \
})
#endif // ROMULUSN1_H_
\ No newline at end of file
#ifndef SKINNY128_H_
#define SKINNY128_H_
//Fixed-width shorthands (also typedef'd identically in romulus.h).
typedef unsigned char u8;
typedef unsigned int u32;
//Number of rounds of SKINNY-128-384.
#define SKINNY128_384_ROUNDS 40
//The following routines are implemented in ARM assembly (see the .s file).
//Encrypts one 16-byte block: ctext <- SKINNY-128-384(ptext) using the
//precomputed round tweakeys 'tk' (TK2 ^ TK3 part) and 'rtk1' (TK1 part).
extern void skinny128_384(u8* ctext, const u32* tk, const u8* ptext, const u32* rtk1);
//Precomputes LFSR2(TK2) ^ LFSR3(TK3) for all rounds into 'rtk'.
extern void tkschedule_lfsr(u32* rtk, const u8* tk2, const u8* tk3, const int rounds);
//Applies the tweakey permutation and round constants to 'rtk' in place.
extern void tkschedule_perm(u32* rtk);
//Expands TK1 into the per-round words 'rtk1'.
extern void tkschedule_perm_tk1(u32* rtk1, const u8* tk1);
#endif // SKINNY128_H_
\ No newline at end of file
/*******************************************************************************
* ARM assembly implementation of fixsliced SKINNY-128-384.
*
* For more details, see the paper at: https://
*
* @author Alexandre Adomnicai, Nanyang Technological University,
* alexandre.adomnicai@ntu.edu.sg
*
* @date May 2020
*******************************************************************************/
.syntax unified
.thumb
/*******************************************************************************
* applies P^2 on the tweakey state in a bitsliced manner
*******************************************************************************/
.align 2
@ p2: applies the tweakey permutation P^2 to the bitsliced tweakey state held
@ in r6-r9 (one 32-bit slice per register). Clobbers r1, r10, r11, r12.
p2:
movw r1, #0xcc00
movt r1, #0xcc00 //r1 <- 0xcc00cc00
movw r10, #0xcc00
movt r10, #0x0033 //r10<- 0xcc000033
and r11, r1, r6, ror #14 // --- permute slice r6
bfi r11, r6, #16, #8
and r12, r6, #0xcc000000
orr r11, r11, r12, lsr #2
and r12, r10, r6
orr r11, r11, r12, lsr #8
and r12, r6, #0x00cc0000
orr r6, r11, r12, lsr #18 // permute slice r6 ---
and r11, r1, r7, ror #14 // --- permute slice r7
bfi r11, r7, #16, #8
and r12, r7, #0xcc000000
orr r11, r11, r12, lsr #2
and r12, r10, r7
orr r11, r11, r12, lsr #8
and r12, r7, #0x00cc0000
orr r7, r11, r12, lsr #18 // permute slice r7 ---
and r11, r1, r8, ror #14 // --- permute slice r8
bfi r11, r8, #16, #8
and r12, r8, #0xcc000000
orr r11, r11, r12, lsr #2
and r12, r10, r8
orr r11, r11, r12, lsr #8
and r12, r8, #0x00cc0000
orr r8, r11, r12, lsr #18 // permute slice r8 ---
and r11, r1, r9, ror #14 // --- permute slice r9
bfi r11, r9, #16, #8
and r12, r9, #0xcc000000
orr r11, r11, r12, lsr #2
and r12, r10, r9
orr r11, r11, r12, lsr #8
and r12, r9, #0x00cc0000
orr r9, r11, r12, lsr #18 // permute slice r9 ---
bx lr
/*******************************************************************************
* applies P^4 on the tweakey state in a bitsliced manner
*******************************************************************************/
.align 2
@ p4: applies the tweakey permutation P^4 to the bitsliced state in r6-r9.
@ Spills r14 to [sp]: the caller must have a free word at sp (tkschedule_perm
@ does 'sub.w sp, #4' for this). Clobbers r1, r10, r11, r12.
p4:
str.w r14, [sp] //store r14 on the stack
movw r14, #0x00cc
movt r14, #0xcc00 //r14<- 0xcc0000cc
movw r12, #0xcc00
movt r12, #0x3300 //r12<- 0x3300cc00
movw r11, #0x00cc
movt r11, #0x00cc //r11<- 0x00cc00cc
and r10, r14, r6, ror #22 // --- permute slice r6
and r1, r12, r6, ror #16
orr r10, r10, r1
and r1, r6, r11
orr r10, r10, r1, lsr #2
movw r1, #0xcc33 //r1 <- 0x0000cc33
and r6, r6, r1
orr r6, r10, r6, ror #24 // permute slice r6 ---
and r10, r14, r7, ror #22 // --- permute slice r7
and r1, r12, r7, ror #16
orr r10, r10, r1
and r1, r7, r11
orr r10, r10, r1, lsr #2
movw r1, #0xcc33 //r1 <- 0x0000cc33
and r7, r7, r1
orr r7, r10, r7, ror #24 // permute slice r7 ---
and r10, r14, r8, ror #22 // --- permute slice r8
and r1, r12, r8, ror #16
orr r10, r10, r1
and r1, r8, r11
orr r10, r10, r1, lsr #2
movw r1, #0xcc33 //r1 <- 0x0000cc33
and r8, r8, r1
orr r8, r10, r8, ror #24 // permute slice r8 ---
and r10, r14, r9, ror #22 // --- permute slice r9 (r12 reused as scratch)
ldr.w r14, [sp] //restore r14
and r12, r12, r9, ror #16
orr r10, r10, r12
and r12, r9, r11
orr r10, r10, r12, lsr #2
movw r12, #0xcc33 //r12<- 0x0000cc33 (comment fixed; previously said r1)
and r9, r9, r12
orr r9, r10, r9, ror #24 // permute slice r9 ---
bx lr
/*******************************************************************************
* applies P^6 on the tweakey state in a bitsliced manner
*******************************************************************************/
.align 2
@ p6: applies the tweakey permutation P^6 to the bitsliced state in r6-r9.
@ Clobbers r1, r10, r11, r12.
p6:
movw r1, #0x3333 //r1 <- 0x00003333
movw r12, #0x00cc
movt r12, #0x3300 //r12<- 0x330000cc
and r10, r6, r1, ror #8 // --- permute r6 6 times
and r11, r12, r6, ror #24
orr r11, r11, r10, ror #6
and r10, r1, r6, ror #10
orr r11, r11, r10
and r10, r6, #0x000000cc
orr r11, r11, r10, lsl #14
and r10, r6, #0x00003300
orr r6, r11, r10, lsl #2 // permute r6 6 times ---
and r10, r7, r1, ror #8 // --- permute r7 6 times
and r11, r12, r7, ror #24
orr r11, r11, r10, ror #6
and r10, r1, r7, ror #10
orr r11, r11, r10
and r10, r7, #0x000000cc
orr r11, r11, r10, lsl #14
and r10, r7, #0x00003300
orr r7, r11, r10, lsl #2 // permute r7 6 times ---
and r10, r8, r1, ror #8 // --- permute r8 6 times
and r11, r12, r8, ror #24
orr r11, r11, r10, ror #6
and r10, r1, r8, ror #10
orr r11, r11, r10
and r10, r8, #0x000000cc
orr r11, r11, r10, lsl #14
and r10, r8, #0x00003300
orr r8, r11, r10, lsl #2 // permute r8 6 times ---
and r10, r9, r1, ror #8 // --- permute r9 6 times
and r11, r12, r9, ror #24
orr r11, r11, r10, ror #6
and r10, r1, r9, ror #10
orr r11, r11, r10
and r10, r9, #0x000000cc
orr r11, r11, r10, lsl #14
and r10, r9, #0x00003300
orr r9, r11, r10, lsl #2 // permute r9 6 times ---
bx lr
/*******************************************************************************
* applies P^8 on the tweakey state in a bitsliced manner
*******************************************************************************/
.align 2
@ p8: applies the tweakey permutation P^8 to the bitsliced state in r6-r9.
@ Clobbers r1, r10, r11, r12.
p8:
movw r12, #0x3333 //r12<- 0x00003333
movw r1, #0x0000
movt r1, #0x33cc //r1 <- 0x33cc0000
and r10, r6, r1 // --- permute r6 8 times
and r11, r1, r6, ror #8
orr r11, r11, r10, ror #24
and r10, r6, r12, lsl #2
orr r11, r11, r10, ror #26
and r10, r6, r12, lsl #8
orr r6, r11, r10, lsr #6 // permute r6 8 times ---
and r10, r7, r1 // --- permute r7 8 times
and r11, r1, r7, ror #8
orr r11, r11, r10, ror #24
and r10, r7, r12, lsl #2
orr r11, r11, r10, ror #26
and r10, r7, r12, lsl #8
orr r7, r11, r10, lsr #6 // permute r7 8 times ---
and r10, r8, r1 // --- permute r8 8 times
and r11, r1, r8, ror #8
orr r11, r11, r10, ror #24
and r10, r8, r12, lsl #2
orr r11, r11, r10, ror #26
and r10, r8, r12, lsl #8
orr r8, r11, r10, lsr #6 // permute r8 8 times ---
and r10, r9, r1 // --- permute r9 8 times
and r11, r1, r9, ror #8
orr r11, r11, r10, ror #24
and r10, r9, r12, lsl #2
orr r11, r11, r10, ror #26
and r10, r9, r12, lsl #8
orr r9, r11, r10, lsr #6 // permute r9 8 times ---
bx lr
/*******************************************************************************
* applies P^10 on the tweakey state in a bitsliced manner
*******************************************************************************/
.align 2
@ p10: applies the tweakey permutation P^10 to the bitsliced state in r6-r9.
@ Clobbers r1, r10, r11, r12. (Copy-paste comments that said 'r6' for the
@ r7/r8/r9 sections have been corrected.)
p10:
movw r12, #0x0033
movt r12, #0x3300 //r12<- 0x33000033
movw r1, #0xcc33 //r1 <- 0x0000cc33
and r10, r6, r1, ror #8 // --- permute r6 10 times
and r11, r12, r6, ror #26
orr r11, r11, r10, ror #8
and r10, r6, r12, ror #24
orr r11, r11, r10, ror #22
and r10, r6, #0x00330000
orr r11, r11, r10, lsr #14
and r10, r6, #0x0000cc00
orr r6, r11, r10, lsr #2 // permute r6 10 times ---
and r10, r7, r1, ror #8 // --- permute r7 10 times
and r11, r12, r7, ror #26
orr r11, r11, r10, ror #8
and r10, r7, r12, ror #24
orr r11, r11, r10, ror #22
and r10, r7, #0x00330000
orr r11, r11, r10, lsr #14
and r10, r7, #0x0000cc00
orr r7, r11, r10, lsr #2 // permute r7 10 times ---
and r10, r8, r1, ror #8 // --- permute r8 10 times
and r11, r12, r8, ror #26
orr r11, r11, r10, ror #8
and r10, r8, r12, ror #24
orr r11, r11, r10, ror #22
and r10, r8, #0x00330000
orr r11, r11, r10, lsr #14
and r10, r8, #0x0000cc00
orr r8, r11, r10, lsr #2 // permute r8 10 times ---
and r10, r9, r1, ror #8 // --- permute r9 10 times
and r11, r12, r9, ror #26
orr r11, r11, r10, ror #8
and r10, r9, r12, ror #24
orr r11, r11, r10, ror #22
and r10, r9, #0x00330000
orr r11, r11, r10, lsr #14
and r10, r9, #0x0000cc00
orr r9, r11, r10, lsr #2 // permute r9 10 times ---
bx lr
/*******************************************************************************
* applies P^12 on the tweakey state in a bitsliced manner
*******************************************************************************/
.align 2
@ p12: applies the tweakey permutation P^12 to the bitsliced state in r6-r9.
@ Spills r14 to [sp]: the caller must have a free word at sp (tkschedule_perm
@ does 'sub.w sp, #4' for this). Clobbers r1, r10, r11, r12.
p12:
str.w r14, [sp] //store r14 on the stack
movw r14, #0xcc33 //r14<- 0x0000cc33
movw r12, #0x00cc
movt r12, #0x00cc //r12<- 0x00cc00cc
movw r1, #0x3300
movt r1, #0xcc00 //r1 <- 0xcc003300
and r10, r14, r6, ror #8 // --- permute r6 12 times
and r11, r12, r6, ror #30
orr r11, r11, r10
and r10, r1, r6, ror #16
orr r11, r11, r10
movw r10, #0xcccc //r10<- 0x0000cccc
and r10, r6, r10, ror #8
orr r6, r11, r10, ror #10 // permute r6 12 times ---
and r10, r14, r7, ror #8 // --- permute r7 12 times
and r11, r12, r7, ror #30
orr r11, r11, r10
and r10, r1, r7, ror #16
orr r11, r11, r10
movw r10, #0xcccc //r10<- 0x0000cccc
and r10, r7, r10, ror #8
orr r7, r11, r10, ror #10 // permute r7 12 times ---
and r10, r14, r8, ror #8 // --- permute r8 12 times
and r11, r12, r8, ror #30
orr r11, r11, r10
and r10, r1, r8, ror #16
orr r11, r11, r10
movw r10, #0xcccc //r10<- 0x0000cccc
and r10, r8, r10, ror #8
orr r8, r11, r10, ror #10 // permute r8 12 times ---
and r10, r14, r9, ror #8 // --- permute r9 12 times
and r11, r12, r9, ror #30
orr r11, r11, r10
and r10, r1, r9, ror #16
ldr.w r14, [sp] //restore r14 (done early; r14 is no longer needed)
orr r11, r11, r10
movw r10, #0xcccc //r10<- 0x0000cccc
and r10, r9, r10, ror #8
orr r9, r11, r10, ror #10 // permute r9 12 times ---
bx lr
/*******************************************************************************
* applies P^14 on the tweakey state in a bitsliced manner
*******************************************************************************/
.align 2
@ p14: applies the tweakey permutation P^14 to the bitsliced state in r6-r9.
@ Clobbers r1, r10, r11, r12.
p14:
movw r1, #0xcc00
movt r1, #0x0033 //r1 <- 0x0033cc00
movw r12, #0xcc00
movt r12, #0xcc00 //r12<- 0xcc00cc00 (comment fixed; previously annotated 0x33003300, but movw/movt encode 0xcc00cc00)
and r10, r1, r6, ror #24 // --- permute r6 14 times
and r11, r6, #0x00000033
orr r11, r10, r11, ror #14
and r10, r6, #0x33000000
orr r11, r11, r10, ror #30
and r10, r6, #0x00ff0000
orr r11, r11, r10, ror #16
and r10, r6, r12
orr r6, r11, r10, ror #18 // permute r6 14 times ---
and r10, r1, r7, ror #24 // --- permute r7 14 times
and r11, r7, #0x00000033
orr r11, r10, r11, ror #14
and r10, r7, #0x33000000
orr r11, r11, r10, ror #30
and r10, r7, #0x00ff0000
orr r11, r11, r10, ror #16
and r10, r7, r12
orr r7, r11, r10, ror #18 // permute r7 14 times ---
and r10, r1, r8, ror #24 // --- permute r8 14 times
and r11, r8, #0x00000033
orr r11, r10, r11, ror #14
and r10, r8, #0x33000000
orr r11, r11, r10, ror #30
and r10, r8, #0x00ff0000
orr r11, r11, r10, ror #16
and r10, r8, r12
orr r8, r11, r10, ror #18 // permute r8 14 times ---
and r10, r1, r9, ror #24 // --- permute r9 14 times
and r11, r9, #0x00000033
orr r11, r10, r11, ror #14
and r10, r9, #0x33000000
orr r11, r11, r10, ror #30
and r10, r9, #0x00ff0000
orr r11, r11, r10, ror #16
and r10, r9, r12
orr r9, r11, r10, ror #18 // permute r9 14 times ---
bx lr
.align 2
@ packing: converts the 128-bit value held in r2-r5 into the bitsliced
@ representation, as a sequence of SWAPMOVE steps (see the inline comments).
@ Expects the caller to preload the masks r10 = 0x0a0a0a0a and
@ r11 = 0x30303030. Clobbers r12.
packing:
eor r12, r2, r2, lsr #3
and r12, r12, r10
eor r2, r2, r12
eor r2, r2, r12, lsl #3 //SWAPMOVE(r2, r2, 0x0a0a0a0a, 3)
eor r12, r3, r3, lsr #3
and r12, r12, r10
eor r3, r3, r12
eor r3, r3, r12, lsl #3 //SWAPMOVE(r3, r3, 0x0a0a0a0a, 3)
eor r12, r4, r4, lsr #3
and r12, r12, r10
eor r4, r4, r12
eor r4, r4, r12, lsl #3 //SWAPMOVE(r4, r4, 0x0a0a0a0a, 3)
eor r12, r5, r5, lsr #3
and r12, r12, r10
eor r5, r5, r12
eor r5, r5, r12, lsl #3 //SWAPMOVE(r5, r5, 0x0a0a0a0a, 3)
eor r12, r2, r4, lsr #2
and r12, r12, r11
eor r2, r2, r12
eor r4, r4, r12, lsl #2 //SWAPMOVE(r4, r2, 0x30303030, 2)
eor r12, r2, r3, lsr #4
and r12, r12, r11, lsr #2
eor r2, r2, r12
eor r3, r3, r12, lsl #4 //SWAPMOVE(r3, r2, 0x0c0c0c0c, 4)
eor r12, r2, r5, lsr #6
and r12, r12, r11, lsr #4
eor r2, r2, r12
eor r5, r5, r12, lsl #6 //SWAPMOVE(r5, r2, 0x03030303, 6)
eor r12, r4, r3, lsr #2
and r12, r12, r11, lsr #2
eor r4, r4, r12
eor r3, r3, r12, lsl #2 //SWAPMOVE(r3, r4, 0x0c0c0c0c, 2)
eor r12, r4, r5, lsr #4
and r12, r12, r11, lsr #4
eor r4, r4, r12
eor r5, r5, r12, lsl #4 //SWAPMOVE(r5, r4, 0x03030303, 4)
eor r12, r3, r5, lsr #2
and r12, r12, r11, lsr #4
eor r3, r3, r12
eor r5, r5, r12, lsl #2 //SWAPMOVE(r5, r3, 0x03030303, 2)
bx lr
/******************************************************************************
* Compute LFSR2(TK2) ^ LFSR3(TK3) for all rounds.
* Performing both at the same time allows to save some memory accesses.
******************************************************************************/
@ void tkschedule_lfsr(u32* tk, const u8* tk2, const u8* tk3, const int rounds)
@ Packs TK2 and TK3 into the bitsliced form, then stores LFSR2(TK2)^LFSR3(TK3)
@ for all 'rounds' rounds into tk. The loop handles 8 rounds per iteration
@ (LFSR period-4 register rotation, 2 rounds per quarter).
@ NOTE(review): each strd pair is followed by a 24-byte post-increment, i.e.
@ 16 bytes are skipped between consecutive stores -- presumably slots filled
@ elsewhere in the schedule; confirm against tkschedule_perm's layout.
.global tkschedule_lfsr
.type tkschedule_lfsr,%function
.align 2
tkschedule_lfsr:
push {r0-r12, r14}
ldr.w r3, [r1, #8] //load tk2 (3rd word)
ldr.w r4, [r1, #4] //load tk2 (2nd word)
ldr.w r5, [r1, #12] //load tk2 (4th word)
ldr.w r12, [r1] //load tk2 (1st word)
mov r1, r2 //move tk3 address in r1
mov r2, r12 //move 1st tk2 word in r2
movw r10, #0x0a0a
movt r10, #0x0a0a //r10 <- 0x0a0a0a0a (mask for packing)
movw r11, #0x3030
movt r11, #0x3030 //r11<- 0x30303030 (mask for packing; comment fixed, said r7)
bl packing //pack tk2
mov r6, r2 //move tk2 from r2-r5 to r6-r9
mov r7, r3 //move tk2 from r2-r5 to r6-r9
mov r8, r4 //move tk2 from r2-r5 to r6-r9
mov r9, r5 //move tk2 from r2-r5 to r6-r9
ldr.w r3, [r1, #8] //load tk3 (3rd word)
ldr.w r4, [r1, #4] //load tk3 (2nd word)
ldr.w r5, [r1, #12] //load tk3 (4th word)
ldr.w r2, [r1] //load tk3 (1st word)
bl packing //pack tk3
eor r10, r10, r10, lsl #4 //r10<- 0xaaaaaaaa (mask for the bitsliced LFSRs)
ldr.w r1, [sp, #12] //load loop counter in r1 (the 'rounds' argument saved by push)
eor r11, r2, r6 //tk2 ^ tk3 (1st word)
eor r12, r3, r7 //tk2 ^ tk3 (2nd word)
strd r11, r12, [r0], #8 //store in tk
eor r11, r4, r8 //tk2 ^ tk3 (3rd word)
eor r12, r5, r9 //tk2 ^ tk3 (4th word)
strd r11, r12, [r0], #8 //store in tk
loop: @ each iteration covers 8 rounds; the slice registers rotate roles
and r12, r8, r10 // --- apply LFSR2 to tk2
eor r12, r12, r6
and r14, r10, r12, lsl #1
and r12, r12, r10
orr r6, r14, r12, lsr #1 // apply LFSR2 to tk2 ---
and r12, r3, r10 // --- apply LFSR3 to tk3
eor r12, r5, r12, lsr #1
and r14, r10, r12, lsl #1
and r12, r12, r10
orr r5, r14, r12, lsr #1 // apply LFSR3 to tk3 ---
eor r11, r5, r7 //tk2 ^ tk3 (1st word)
eor r12, r2, r8 //tk2 ^ tk3 (2nd word)
strd r11, r12, [r0], #8 //store in tk
eor r11, r3, r9 //tk2 ^ tk3 (3rd word)
eor r12, r4, r6 //tk2 ^ tk3 (4th word)
strd r11, r12, [r0], #24 //store in tk (skip 16 bytes)
and r12, r9, r10 // --- apply LFSR2 to tk2
eor r12, r12, r7
and r14, r10, r12, lsl #1
and r12, r12, r10
orr r7, r14, r12, lsr #1 // apply LFSR2 to tk2 ---
and r12, r2, r10 // --- apply LFSR3 to tk3
eor r12, r4, r12, lsr #1
and r14, r10, r12, lsl #1
and r12, r12, r10
orr r4, r14, r12, lsr #1 // apply LFSR3 to tk3 ---
eor r11, r4, r8 //tk2 ^ tk3 (1st word)
eor r12, r5, r9 //tk2 ^ tk3 (2nd word)
strd r11, r12, [r0], #8 //store in tk
eor r11, r2, r6 //tk2 ^ tk3 (3rd word)
eor r12, r3, r7 //tk2 ^ tk3 (4th word)
strd r11, r12, [r0], #24 //store in tk (skip 16 bytes)
and r12, r6, r10 // --- apply LFSR2 to tk2
eor r12, r12, r8
and r14, r10, r12, lsl #1
and r12, r12, r10
orr r8, r14, r12, lsr #1 // apply LFSR2 to tk2 ---
and r12, r5, r10 // --- apply LFSR3 to tk3
eor r12, r3, r12, lsr #1
and r14, r10, r12, lsl #1
and r12, r12, r10
orr r3, r14, r12, lsr #1 // apply LFSR3 to tk3 ---
eor r11, r3, r9 //tk2 ^ tk3 (1st word)
eor r12, r4, r6 //tk2 ^ tk3 (2nd word)
strd r11, r12, [r0], #8 //store in tk
eor r11, r5, r7 //tk2 ^ tk3 (3rd word)
eor r12, r2, r8 //tk2 ^ tk3 (4th word)
strd r11, r12, [r0], #24 //store in tk (skip 16 bytes)
and r12, r7, r10 // --- apply LFSR2 to tk2
eor r12, r12, r9
and r14, r10, r12, lsl #1
and r12, r12, r10
orr r9, r14, r12, lsr #1 // apply LFSR2 to tk2 ---
and r12, r4, r10 // --- apply LFSR3 to tk3
eor r12, r2, r12, lsr #1
and r14, r10, r12, lsl #1
and r12, r12, r10
orr r2, r14, r12, lsr #1 // apply LFSR3 to tk3 ---
eor r11, r2, r6 //tk2 ^ tk3 (1st word)
eor r12, r3, r7 //tk2 ^ tk3 (2nd word)
strd r11, r12, [r0], #8 //store in tk
eor r11, r4, r8 //tk2 ^ tk3 (3rd word)
eor r12, r5, r9 //tk2 ^ tk3 (4th word)
strd r11, r12, [r0], #24 //store in tk (skip 16 bytes)
subs.w r1, r1, #8 //decrease loop counter by 8
bne loop
pop {r0-r12, r14}
bx lr
/******************************************************************************
* Precomputes the per-round tweakey words for all 40 rounds, in place, from
* the packed TK material at [r0]. For each group of rounds the TK words are
* masked (0xf0f0f0f0 / 0xc3c3c3c3 / derived masks), rotated to match the
* fixsliced representation, XORed with the round constants (a trailing NOT is
* folded into one word via MVN to save an instruction in the S-box layer),
* and stored back through r0. The tweakey permutations P^2..P^14 are applied
* by the p2..p14 subroutines; since P^16 = Id, the TK is simply reloaded
* (no permutation call) before the round-16/17 and round-32/33 groups.
* NOTE(review): the round-constant values below are taken on trust from the
* reference fixsliced SKINNY implementation; they are not re-derived here.
******************************************************************************/
@ void tkschedule_perm(u32* tk)
.global tkschedule_perm
.type tkschedule_perm,%function
.align 2
tkschedule_perm:
push {r0-r12, lr}
sub.w sp, #4 //to store r14 in subroutines
ldm r0, {r6-r9} //load tk
movw r10, #0xf0f0
movt r10, #0xf0f0 //r10<- 0xf0f0f0f0
and r6, r6, r10 //tk &= 0xf0f0f0f0 (1st word)
and r7, r7, r10 //tk &= 0xf0f0f0f0 (2nd word)
and r8, r8, r10 //tk &= 0xf0f0f0f0 (3rd word)
and r9, r9, r10 //tk &= 0xf0f0f0f0 (4th word)
eor r8, r8, #0x00000004 //add rconst
eor r9, r9, #0x00000040 //add rconst
mvn r9, r9 //to save a NOT in sbox calculations
strd r8, r9, [r0], #8 //store 1st half tk for 1st round
strd r6, r7, [r0], #8 //store 2nd half tk for 1st round
ldm r0, {r6-r9} //load tk
bl p2 //apply the permutation twice
movw r10, #0xc3c3
movt r10, #0xc3c3 //r10<- 0xc3c3c3c3
and r11, r10, r6, ror #26 //ror and mask to match fixslicing
and r12, r10, r7, ror #26 //ror and mask to match fixslicing
strd r11, r12, [r0], #8 //store 1st half tk for 2nd round
and r11, r10, r8, ror #26 //ror and mask to match fixslicing
and r12, r10, r9, ror #26 //ror and mask to match fixslicing
eor r11, r11, #0x10000000 //add rconst
eor r11, r11, #0x00000100 //add rconst
eor r12, r12, #0x00000100 //add rconst
mvn r12, r12 //to save a NOT in sbox calculations
strd r11, r12, [r0], #8 //store 2nd half tk for 2nd round
and r10, r10, r10, lsr #6 //r10<- 0x03030303
and r11, r10, r6, ror #28 //--- ror and masks to match fixslicing
and r6, r6, r10, lsl #6
orr r6, r11, r6, ror #12
and r11, r10, r7, ror #28
and r7, r7, r10, lsl #6
orr r7, r11, r7, ror #12
and r11, r10, r8, ror #28
and r8, r8, r10, lsl #6
orr r8, r11, r8, ror #12
and r11, r10, r9, ror #28
and r9, r9, r10, lsl #6
orr r9, r11, r9, ror #12 //ror and masks to match fixslicing ---
eor r7, r7, #0x04000000 //add rconst
eor r8, r8, #0x44000000 //add rconst
eor r9, r9, #0x04000000 //add rconst
mvn r9, r9 //to save a NOT in sbox calculations
strd r8, r9, [r0], #8 //store 1st half tk for 3rd round
strd r6, r7, [r0], #8 //store 2nd half tk for 3rd round
ldm r0, {r6-r9} //load tk
bl p4 //apply the permutation 4 times
movw r10, #0xf0f0
movt r10, #0xf0f0 //r10<- 0xf0f0f0f0
and r11, r10, r6, ror #16 //ror and mask to match fixslicing
and r12, r10, r7, ror #16 //ror and mask to match fixslicing
eor r11, r11, #0x00400000 //add rconst
eor r12, r12, #0x00400000 //add rconst
strd r11, r12, [r0, #24] //store 2nd half tk for 5th round
and r11, r10, r8, ror #16 //ror and mask to match fixslicing
and r12, r10, r9, ror #16 //ror and mask to match fixslicing
eor r11, r11, #0x00440000 //add rconst
eor r12, r12, #0x00500000 //add rconst
mvn r12, r12 //to save a NOT in sbox calculations
strd r11, r12, [r0, #16] //store 1st half tk for 5th round
and r10, r10, r10, lsr #2 //r10<- 0x30303030
and r11, r10, r6, ror #14 //--- ror and masks to match fixslicing
and r6, r6, r10, ror #4
orr r6, r11, r6, ror #6
and r11, r10, r7, ror #14
and r7, r7, r10, ror #4
orr r7, r11, r7, ror #6
and r11, r10, r8, ror #14
and r8, r8, r10, ror #4
orr r8, r11, r8, ror #6
and r11, r10, r9, ror #14
and r9, r9, r10, ror #4
orr r9, r11, r9, ror #6 //ror and masks to match fixslicing ---
eor r6, r6, #0x00100000 //add rconst
eor r7, r7, #0x00100000 //add rconst
eor r8, r8, #0x00100000 //add rconst
eor r8, r8, #0x00000001 //add rconst
eor r9, r9, #0x00100000 //add rconst
mvn r9, r9 //to save a NOT in sbox calculations
strd r6, r7, [r0], #8 //store 1st half tk for 4th round
strd r8, r9, [r0], #24 //store 2nd half tk for 4th round
ldm r0, {r6-r9} //load tk
bl p6 //apply the permutation 6 times
movw r10, #0xc3c3
movt r10, #0xc3c3 //r10<- 0xc3c3c3c3
and r11, r10, r6, ror #10 //ror and mask to match fixslicing
and r12, r10, r7, ror #10 //ror and mask to match fixslicing
eor r11, r11, #0x01000000 //add rconst
eor r12, r12, #0x01000000 //add rconst
strd r11, r12, [r0], #8 //store 1st half tk for 6th round
and r11, r10, r8, ror #10 //ror and mask to match fixslicing
and r12, r10, r9, ror #10 //ror and mask to match fixslicing
eor r11, r11, #0x01400000 //add rconst
eor r11, r11, #0x00001000 //add rconst
eor r12, r12, #0x00400000 //add rconst
mvn r12, r12 //to save a NOT in sbox calculations
strd r11, r12, [r0], #8 //store 2nd half tk for 6th round
and r10, r10, r10, lsr #6 //r10<- 0x03030303
and r11, r10, r6, ror #12 //--- ror and masks to match fixslicing
and r6, r6, r10, lsl #6
orr r6, r11, r6, ror #28
and r11, r10, r7, ror #12
and r7, r7, r10, lsl #6
orr r7, r11, r7, ror #28
and r11, r10, r8, ror #12
and r8, r8, r10, lsl #6
orr r8, r11, r8, ror #28
and r11, r10, r9, ror #12
and r9, r9, r10, lsl #6
orr r9, r11, r9, ror #28 //ror and masks to match fixslicing ---
eor r6, r6, #0x00000400 //add rconst
eor r7, r7, #0x00000400 //add rconst
eor r8, r8, #0x01000000 //add rconst
eor r8, r8, #0x00004000 //add rconst
eor r9, r9, #0x01000000 //add rconst
eor r9, r9, #0x00000400 //add rconst
mvn r9, r9 //to save a NOT in sbox calculations
strd r8, r9, [r0], #8 //store 1st half tk for 7th round
strd r6, r7, [r0], #8 //store 2nd half tk for 7th round
ldm r0, {r6-r9} //load tk
bl p8 //apply the permutation 8 times
movw r10, #0xf0f0
movt r10, #0xf0f0 //r10<- 0xf0f0f0f0
and r11, r10, r6 //mask to match fixslicing (no ror needed)
and r12, r10, r7 //mask to match fixslicing (no ror needed)
eor r12, r12, #0x00000040 //add rconst
strd r11, r12, [r0, #24] //store 2nd half tk for 9th round
and r11, r10, r8 //mask to match fixslicing (no ror needed)
and r12, r10, r9 //mask to match fixslicing (no ror needed)
eor r11, r11, #0x00000054 //add rconst
eor r12, r12, #0x00000050 //add rconst
mvn r12, r12 //to save a NOT in sbox calculations
strd r11, r12, [r0, #16] //store 1st half tk for 9th round
and r10, r10, r10, lsr #2 //r10<- 0x30303030
and r11, r10, r6, ror #30 //--- ror and masks to match fixslicing
and r6, r6, r10, ror #4
orr r6, r11, r6, ror #22
and r11, r10, r7, ror #30
and r7, r7, r10, ror #4
orr r7, r11, r7, ror #22
and r11, r10, r8, ror #30
and r8, r8, r10, ror #4
orr r8, r11, r8, ror #22
and r11, r10, r9, ror #30
and r9, r9, r10, ror #4
orr r9, r11, r9, ror #22 //ror and masks to match fixslicing ---
eor r6 ,r6, #0x00000010 //add rconst
eor r8, r8, #0x00010000 //add rconst
eor r8, r8, #0x00000410 //add rconst
eor r9, r9, #0x00000410 //add rconst
mvn r9, r9 //to save a NOT in sbox calculations
strd r6, r7, [r0], #8 //store 1st half tk for 8th round
strd r8, r9, [r0], #24 //store 2nd half tk for 8th round
ldm r0, {r6-r9} //load tk
bl p10 //apply the permutation 10 times
movw r10, #0xc3c3
movt r10, #0xc3c3 //r10<- 0xc3c3c3c3
and r11, r10, r6, ror #26 //ror and mask to match fixslicing
and r12, r10, r7, ror #26 //ror and mask to match fixslicing
eor r11, r11, #0x00000100 //add rconst
eor r12, r12, #0x00000100 //add rconst
strd r11, r12, [r0], #8 //store 1st half tk for 10th round
and r11, r10, r8, ror #26 //ror and mask to match fixslicing
and r12, r10, r9, ror #26 //ror and mask to match fixslicing
eor r11, r11, #0x10000000 //add rconst
eor r11, r11, #0x00000140 //add rconst
eor r12, r12, #0x00000100 //add rconst
mvn r12, r12 //to save a NOT in sbox calculations
strd r11, r12, [r0], #8 //store 2nd half tk for 10th round
and r10, r10, r10, lsr #6 //r10<- 0x03030303
and r11, r10, r6, ror #28 //--- ror and masks to match fixslicing
and r6, r6, r10, lsl #6
orr r6, r11, r6, ror #12
and r11, r10, r7, ror #28
and r7, r7, r10, lsl #6
orr r7, r11, r7, ror #12
and r11, r10, r8, ror #28
and r8, r8, r10, lsl #6
orr r8, r11, r8, ror #12
and r11, r10, r9, ror #28
and r9, r9, r10, lsl #6
orr r9, r11, r9, ror #12 //ror and masks to match fixslicing ---
eor r6, r6, #0x04000000 //add rconst
eor r7, r7, #0x04000000 //add rconst
eor r8, r8, #0x44000000 //add rconst
eor r9, r9, #0x00000100 //add rconst
mvn r9, r9 //to save a NOT in sbox calculations
strd r8, r9, [r0], #8 //store 1st half tk for 11th round
strd r6, r7, [r0], #8 //store 2nd half tk for 11th round
ldm r0, {r6-r9} //load tk
bl p12 //apply the permutation 12 times
movw r10, #0xf0f0
movt r10, #0xf0f0 //r10<- 0xf0f0f0f0
and r11, r10, r6, ror #16 //ror and mask to match fixslicing
and r12, r10, r7, ror #16 //ror and mask to match fixslicing
eor r11, r11, #0x00400000 //add rconst
strd r11, r12, [r0, #24] //store 2nd half tk for 13th round
and r11, r10, r8, ror #16 //ror and mask to match fixslicing
and r12, r10, r9, ror #16 //ror and mask to match fixslicing
eor r11, r11, #0x00140000 //add rconst
eor r12, r12, #0x00500000 //add rconst
mvn r12, r12 //to save a NOT in sbox calculations
strd r11, r12, [r0, #16] //store 1st half tk for 13th round
and r10, r10, r10, lsr #2 //r10<- 0x30303030
and r11, r10, r6, ror #14 //--- ror and masks to match fixslicing
and r6, r6, r10, ror #4
orr r6, r11, r6, ror #6
and r11, r10, r7, ror #14
and r7, r7, r10, ror #4
orr r7, r11, r7, ror #6
and r11, r10, r8, ror #14
and r8, r8, r10, ror #4
orr r8, r11, r8, ror #6
and r11, r10, r9, ror #14
and r9, r9, r10, ror #4
orr r9, r11, r9, ror #6 //ror and masks to match fixslicing ---
eor r6, r6, #0x00100000 //add rconst
eor r7, r7, #0x00100000 //add rconst
eor r8, r8, #0x04000000 //add rconst
eor r8, r8, #0x00000001 //add rconst
eor r9, r9, #0x04000000 //add rconst
mvn r9, r9 //to save a NOT in sbox calculations
strd r6, r7, [r0], #8 //store 1st half tk for 12th round
strd r8, r9, [r0], #24 //store 2nd half tk for 12th round
ldm r0, {r6-r9} //load tk
bl p14 //apply the permutation 14 times
movw r10, #0xc3c3
movt r10, #0xc3c3 //r10<- 0xc3c3c3c3
and r11, r10, r6, ror #10 //ror and mask to match fixslicing
and r12, r10, r7, ror #10 //ror and mask to match fixslicing
strd r11, r12, [r0], #8 //store 1st half tk for 14th round
and r11, r10, r8, ror #10 //ror and mask to match fixslicing
and r12, r10, r9, ror #10 //ror and mask to match fixslicing
eor r11, r11, #0x01400000 //add rconst
eor r11, r11, #0x00001000 //add rconst
eor r12, r12, #0x01400000 //add rconst
mvn r12, r12 //to save a NOT in sbox calculations
strd r11, r12, [r0], #8 //store 2nd half tk for 14th round
and r10, r10, r10, lsr #6 //r10<- 0x03030303
and r11, r10, r6, ror #12 //--- ror and masks to match fixslicing
and r6, r6, r10, lsl #6
orr r6, r11, r6, ror #28
and r11, r10, r7, ror #12
and r7, r7, r10, lsl #6
orr r7, r11, r7, ror #28
and r11, r10, r8, ror #12
and r8, r8, r10, lsl #6
orr r8, r11, r8, ror #28
and r11, r10, r9, ror #12
and r9, r9, r10, lsl #6
orr r9, r11, r9, ror #28 //ror and masks to match fixslicing ---
eor r7, r7, #0x00000400 //add rconst
eor r8, r8, #0x01000000 //add rconst
eor r8, r8, #0x00004400 //add rconst
eor r9, r9, #0x00000400 //add rconst
mvn r9, r9 //to save a NOT in sbox calculations
strd r8, r9, [r0], #8 //store 1st half tk for 15th round
strd r6, r7, [r0], #8 //store 2nd half tk for 15th round
ldm r0, {r6-r9} //load tk (P^16 = Id, so no permutation call is needed here)
movw r10, #0xf0f0
movt r10, #0xf0f0 //r10<- 0xf0f0f0f0
and r11, r10, r6 //mask to match fixslicing (no ror needed)
and r12, r10, r7 //mask to match fixslicing (no ror needed)
eor r11, r11, #0x00000040 //add rconst
eor r12, r12, #0x00000040 //add rconst
strd r11, r12, [r0, #24] //store 2nd half tk for 17th round
and r11, r10, r8 //mask to match fixslicing (no ror needed)
and r12, r10, r9 //mask to match fixslicing (no ror needed)
eor r11, r11, #0x00000004 //add rconst
eor r12, r12, #0x00000050 //add rconst
mvn r12, r12 //to save a NOT in sbox calculations
strd r11, r12, [r0, #16] //store 1st half tk for 17th round
and r10, r10, r10, lsr #2 //r10<- 0x30303030
and r11, r10, r6, ror #30 //--- ror and masks to match fixslicing
and r6, r6, r10, ror #4
orr r6, r11, r6, ror #22
and r11, r10, r7, ror #30
and r7, r7, r10, ror #4
orr r7, r11, r7, ror #22
and r11, r10, r8, ror #30
and r8, r8, r10, ror #4
orr r8, r11, r8, ror #22
and r11, r10, r9, ror #30
and r9, r9, r10, ror #4
orr r9, r11, r9, ror #22 //ror and masks to match fixslicing ---
eor r6 ,r6, #0x00000010 //add rconst
eor r7 ,r7, #0x00000010 //add rconst
eor r8, r8, #0x00000010 //add rconst
eor r8, r8, #0x00010000 //add rconst
mvn r9, r9 //to save a NOT in sbox calculations
strd r6, r7, [r0], #8 //store 1st half tk for 16th round
strd r8, r9, [r0], #24 //store 2nd half tk for 16th round
ldm r0, {r6-r9} //load tk
bl p2 //apply the permutation twice
movw r10, #0xc3c3
movt r10, #0xc3c3 //r10<- 0xc3c3c3c3
and r11, r10, r6, ror #26 //ror and mask to match fixslicing
and r12, r10, r7, ror #26 //ror and mask to match fixslicing
eor r11, r11, #0x00000100 //add rconst
strd r11, r12, [r0], #8 //store 1st half tk for 18th round
and r11, r10, r8, ror #26 //ror and mask to match fixslicing
and r12, r10, r9, ror #26 //ror and mask to match fixslicing
eor r11, r11, #0x10000000 //add rconst
eor r11, r11, #0x00000140 //add rconst
eor r12, r12, #0x00000040 //add rconst
mvn r12, r12 //to save a NOT in sbox calculations
strd r11, r12, [r0], #8 //store 2nd half tk for 18th round
and r10, r10, r10, lsr #6 //r10<- 0x03030303
and r11, r10, r6, ror #28 //--- ror and masks to match fixslicing
and r6, r6, r10, lsl #6
orr r6, r11, r6, ror #12
and r11, r10, r7, ror #28
and r7, r7, r10, lsl #6
orr r7, r11, r7, ror #12
and r11, r10, r8, ror #28
and r8, r8, r10, lsl #6
orr r8, r11, r8, ror #12
and r11, r10, r9, ror #28
and r9, r9, r10, lsl #6
orr r9, r11, r9, ror #12 //ror and masks to match fixslicing ---
eor r7, r7, #0x04000000 //add rconst
eor r8, r8, #0x40000000 //add rconst
eor r8, r8, #0x00000100 //add rconst
eor r9, r9, #0x04000000 //add rconst
eor r9, r9, #0x00000100 //add rconst
mvn r9, r9 //to save a NOT in sbox calculations
strd r8, r9, [r0], #8 //store 1st half tk for 19th round
strd r6, r7, [r0], #8 //store 2nd half tk for 19th round
ldm r0, {r6-r9} //load tk
bl p4 //apply the permutation 4 times
movw r10, #0xf0f0
movt r10, #0xf0f0 //r10<- 0xf0f0f0f0
and r11, r10, r6, ror #16 //ror and mask to match fixslicing
and r12, r10, r7, ror #16 //ror and mask to match fixslicing
eor r12, r12, #0x00400000 //add rconst
strd r11, r12, [r0, #24] //store 2nd half tk for 21st round
and r11, r10, r8, ror #16 //ror and mask to match fixslicing
and r12, r10, r9, ror #16 //ror and mask to match fixslicing
eor r11, r11, #0x00440000 //add rconst
eor r12, r12, #0x00100000 //add rconst
mvn r12, r12 //to save a NOT in sbox calculations
strd r11, r12, [r0, #16] //store 1st half tk for 21st round
and r10, r10, r10, lsr #2 //r10<- 0x30303030
and r11, r10, r6, ror #14 //--- ror and masks to match fixslicing
and r6, r6, r10, ror #4
orr r6, r11, r6, ror #6
and r11, r10, r7, ror #14
and r7, r7, r10, ror #4
orr r7, r11, r7, ror #6
and r11, r10, r8, ror #14
and r8, r8, r10, ror #4
orr r8, r11, r8, ror #6
and r11, r10, r9, ror #14
and r9, r9, r10, ror #4
orr r9, r11, r9, ror #6 //ror and masks to match fixslicing ---
eor r6, r6, #0x00100000 //add rconst
eor r8, r8, #0x04100000 //add rconst
eor r8, r8, #0x00000001 //add rconst
eor r9, r9, #0x00100000 //add rconst
mvn r9, r9 //to save a NOT in sbox calculations
strd r6, r7, [r0], #8 //store 1st half tk for 20th round
strd r8, r9, [r0], #24 //store 2nd half tk for 20th round
ldm r0, {r6-r9} //load tk
bl p6 //apply the permutation 6 times
movw r10, #0xc3c3
movt r10, #0xc3c3 //r10<- 0xc3c3c3c3
and r11, r10, r6, ror #10 //ror and mask to match fixslicing
and r12, r10, r7, ror #10 //ror and mask to match fixslicing
eor r11, r11, #0x01000000 //add rconst
eor r12, r12, #0x01000000 //add rconst
strd r11, r12, [r0], #8 //store 1st half tk for 22nd round
and r11, r10, r8, ror #10 //ror and mask to match fixslicing
and r12, r10, r9, ror #10 //ror and mask to match fixslicing
eor r11, r11, #0x00400000 //add rconst
eor r11, r11, #0x00001000 //add rconst
mvn r12, r12 //to save a NOT in sbox calculations
strd r11, r12, [r0], #8 //store 2nd half tk for 22nd round
and r10, r10, r10, lsr #6 //r10<- 0x03030303
and r11, r10, r6, ror #12 //--- ror and masks to match fixslicing
and r6, r6, r10, lsl #6
orr r6, r11, r6, ror #28
and r11, r10, r7, ror #12
and r7, r7, r10, lsl #6
orr r7, r11, r7, ror #28
and r11, r10, r8, ror #12
and r8, r8, r10, lsl #6
orr r8, r11, r8, ror #28
and r11, r10, r9, ror #12
and r9, r9, r10, lsl #6
orr r9, r11, r9, ror #28 //ror and masks to match fixslicing ---
eor r6, r6, #0x00000400 //add rconst
eor r8, r8, #0x00004000 //add rconst
eor r9, r9, #0x01000000 //add rconst
mvn r9, r9 //to save a NOT in sbox calculations
strd r8, r9, [r0], #8 //store 1st half tk for 23rd round
strd r6, r7, [r0], #8 //store 2nd half tk for 23rd round
ldm r0, {r6-r9} //load tk
bl p8 //apply the permutation 8 times
movw r10, #0xf0f0
movt r10, #0xf0f0 //r10<- 0xf0f0f0f0
and r11, r10, r6 //mask to match fixslicing (no ror needed)
and r12, r10, r7 //mask to match fixslicing (no ror needed)
strd r11, r12, [r0, #24] //store 2nd half tk for 25th round
and r11, r10, r8 //mask to match fixslicing (no ror needed)
and r12, r10, r9 //mask to match fixslicing (no ror needed)
eor r11, r11, #0x00000014 //add rconst
eor r12, r12, #0x00000040 //add rconst
mvn r12, r12 //to save a NOT in sbox calculations
strd r11, r12, [r0, #16] //store 1st half tk for 25th round
and r10, r10, r10, lsr #2 //r10<- 0x30303030
and r11, r10, r6, ror #30 //--- ror and masks to match fixslicing
and r6, r6, r10, ror #4
orr r6, r11, r6, ror #22
and r11, r10, r7, ror #30
and r7, r7, r10, ror #4
orr r7, r11, r7, ror #22
and r11, r10, r8, ror #30
and r8, r8, r10, ror #4
orr r8, r11, r8, ror #22
and r11, r10, r9, ror #30
and r9, r9, r10, ror #4
orr r9, r11, r9, ror #22 //ror and masks to match fixslicing ---
eor r8, r8, #0x00010400 //add rconst
eor r9, r9, #0x00000400 //add rconst
mvn r9, r9 //to save a NOT in sbox calculations
strd r6, r7, [r0], #8 //store 1st half tk for 24th round
strd r8, r9, [r0], #24 //store 2nd half tk for 24th round
ldm r0, {r6-r9} //load tk
bl p10 //apply the permutation 10 times
movw r10, #0xc3c3
movt r10, #0xc3c3 //r10<- 0xc3c3c3c3
and r11, r10, r6, ror #26 //ror and mask to match fixslicing
and r12, r10, r7, ror #26 //ror and mask to match fixslicing
strd r11, r12, [r0], #8 //store 1st half tk for 26th round
and r11, r10, r8, ror #26 //ror and mask to match fixslicing
and r12, r10, r9, ror #26 //ror and mask to match fixslicing
eor r11, r11, #0x10000000 //add rconst
eor r11, r11, #0x00000100 //add rconst
mvn r12, r12 //to save a NOT in sbox calculations
strd r11, r12, [r0], #8 //store 2nd half tk for 26th round
and r10, r10, r10, lsr #6 //r10<- 0x03030303
and r11, r10, r6, ror #28 //--- ror and masks to match fixslicing
and r6, r6, r10, lsl #6
orr r6, r11, r6, ror #12
and r11, r10, r7, ror #28
and r7, r7, r10, lsl #6
orr r7, r11, r7, ror #12
and r11, r10, r8, ror #28
and r8, r8, r10, lsl #6
orr r8, r11, r8, ror #12
and r11, r10, r9, ror #28
and r9, r9, r10, lsl #6
orr r9, r11, r9, ror #12 //ror and masks to match fixslicing ---
eor r7, r7, #0x04000000 //add rconst
eor r8, r8, #0x40000000 //add rconst
eor r9, r9, #0x04000000 //add rconst
mvn r9, r9 //to save a NOT in sbox calculations
strd r8, r9, [r0], #8 //store 1st half tk for 27th round
strd r6, r7, [r0], #8 //store 2nd half tk for 27th round
ldm r0, {r6-r9} //load tk
bl p12 //apply the permutation 12 times
movw r10, #0xf0f0
movt r10, #0xf0f0 //r10<- 0xf0f0f0f0
and r11, r10, r6, ror #16 //ror and mask to match fixslicing
and r12, r10, r7, ror #16 //ror and mask to match fixslicing
eor r12, r12, #0x00400000 //add rconst
strd r11, r12, [r0, #24] //store 2nd half tk for 29th round
and r11, r10, r8, ror #16 //ror and mask to match fixslicing
and r12, r10, r9, ror #16 //ror and mask to match fixslicing
eor r11, r11, #0x00440000 //add rconst
eor r12, r12, #0x00500000 //add rconst
mvn r12, r12 //to save a NOT in sbox calculations
strd r11, r12, [r0, #16] //store 1st half tk for 29th round
and r10, r10, r10, lsr #2 //r10<- 0x30303030
and r11, r10, r6, ror #14 //--- ror and masks to match fixslicing
and r6, r6, r10, ror #4
orr r6, r11, r6, ror #6
and r11, r10, r7, ror #14
and r7, r7, r10, ror #4
orr r7, r11, r7, ror #6
and r11, r10, r8, ror #14
and r8, r8, r10, ror #4
orr r8, r11, r8, ror #6
and r11, r10, r9, ror #14
and r9, r9, r10, ror #4
orr r9, r11, r9, ror #6 //ror and masks to match fixslicing ---
eor r6, r6, #0x00100000 //add rconst
eor r8, r8, #0x00100000 //add rconst
eor r8, r8, #0x00000001 //add rconst
eor r9, r9, #0x00100000 //add rconst
mvn r9, r9 //to save a NOT in sbox calculations
strd r6, r7, [r0], #8 //store 1st half tk for 28th round
strd r8, r9, [r0], #24 //store 2nd half tk for 28th round
ldm r0, {r6-r9} //load tk
bl p14 //apply the permutation 14 times
movw r10, #0xc3c3
movt r10, #0xc3c3 //r10<- 0xc3c3c3c3
and r11, r10, r6, ror #10 //ror and mask to match fixslicing
and r12, r10, r7, ror #10 //ror and mask to match fixslicing
eor r11, r11, #0x01000000 //add rconst
eor r12, r12, #0x01000000 //add rconst
strd r11, r12, [r0], #8 //store 1st half tk for 30th round
and r11, r10, r8, ror #10 //ror and mask to match fixslicing
and r12, r10, r9, ror #10 //ror and mask to match fixslicing
eor r11, r11, #0x01400000 //add rconst
eor r11, r11, #0x00001000 //add rconst
mvn r12, r12 //to save a NOT in sbox calculations
strd r11, r12, [r0], #8 //store 2nd half tk for 30th round
and r10, r10, r10, lsr #6 //r10<- 0x03030303
and r11, r10, r6, ror #12 //--- ror and masks to match fixslicing
and r6, r6, r10, lsl #6
orr r6, r11, r6, ror #28
and r11, r10, r7, ror #12
and r7, r7, r10, lsl #6
orr r7, r11, r7, ror #28
and r11, r10, r8, ror #12
and r8, r8, r10, lsl #6
orr r8, r11, r8, ror #28
and r11, r10, r9, ror #12
and r9, r9, r10, lsl #6
orr r9, r11, r9, ror #28 //ror and masks to match fixslicing ---
eor r6, r6, #0x00000400 //add rconst
eor r7, r7, #0x00000400 //add rconst
eor r8, r8, #0x00004000 //add rconst
eor r9, r9, #0x01000000 //add rconst
mvn r9, r9 //to save a NOT in sbox calculations
strd r8, r9, [r0], #8 //store 1st half tk for 31st round
strd r6, r7, [r0], #8 //store 2nd half tk for 31st round
ldm r0, {r6-r9} //load tk (P^16 = Id, so no permutation call is needed here)
movw r10, #0xf0f0
movt r10, #0xf0f0 //r10<- 0xf0f0f0f0
and r11, r10, r6 //mask to match fixslicing (no ror needed)
and r12, r10, r7 //mask to match fixslicing (no ror needed)
strd r11, r12, [r0, #24] //store 2nd half tk for 33rd round
and r11, r10, r8 //mask to match fixslicing (no ror needed)
and r12, r10, r9 //mask to match fixslicing (no ror needed)
eor r11, r11, #0x00000014 //add rconst
eor r12, r12, #0x00000050 //add rconst
mvn r12, r12 //to save a NOT in sbox calculations
strd r11, r12, [r0, #16] //store 1st half tk for 33rd round
and r10, r10, r10, lsr #2 //r10<- 0x30303030
and r11, r10, r6, ror #30 //--- ror and masks to match fixslicing
and r6, r6, r10, ror #4
orr r6, r11, r6, ror #22
and r11, r10, r7, ror #30
and r7, r7, r10, ror #4
orr r7, r11, r7, ror #22
and r11, r10, r8, ror #30
and r8, r8, r10, ror #4
orr r8, r11, r8, ror #22
and r11, r10, r9, ror #30
and r9, r9, r10, ror #4
orr r9, r11, r9, ror #22 //ror and masks to match fixslicing ---
eor r6 ,r6, #0x00000010 //add rconst
eor r8, r8, #0x00010400 //add rconst
eor r9, r9, #0x00000400 //add rconst
mvn r9, r9 //to save a NOT in sbox calculations
strd r6, r7, [r0], #8 //store 1st half tk for 32nd round
strd r8, r9, [r0], #24 //store 2nd half tk for 32nd round
ldm r0, {r6-r9} //load tk
bl p2 //apply the permutation twice
movw r10, #0xc3c3
movt r10, #0xc3c3 //r10<- 0xc3c3c3c3
and r11, r10, r6, ror #26 //ror and mask to match fixslicing
and r12, r10, r7, ror #26 //ror and mask to match fixslicing
strd r11, r12, [r0], #8 //store 1st half tk for 34th round
and r11, r10, r8, ror #26 //ror and mask to match fixslicing
and r12, r10, r9, ror #26 //ror and mask to match fixslicing
eor r11, r11, #0x10000000 //add rconst
eor r11, r11, #0x00000140 //add rconst
eor r12, r12, #0x00000100 //add rconst
mvn r12, r12 //to save a NOT in sbox calculations
strd r11, r12, [r0], #8 //store 2nd half tk for 34th round
and r10, r10, r10, lsr #6 //r10<- 0x03030303
and r11, r10, r6, ror #28 //--- ror and masks to match fixslicing
and r6, r6, r10, lsl #6
orr r6, r11, r6, ror #12
and r11, r10, r7, ror #28
and r7, r7, r10, lsl #6
orr r7, r11, r7, ror #12
and r11, r10, r8, ror #28
and r8, r8, r10, lsl #6
orr r8, r11, r8, ror #12
and r11, r10, r9, ror #28
and r9, r9, r10, lsl #6
orr r9, r11, r9, ror #12 //ror and masks to match fixslicing ---
eor r7, r7, #0x04000000 //add rconst
eor r8, r8, #0x44000000 //add rconst
mvn r9, r9 //to save a NOT in sbox calculations
strd r8, r9, [r0], #8 //store 1st half tk for 35th round
strd r6, r7, [r0], #8 //store 2nd half tk for 35th round
ldm r0, {r6-r9} //load tk
bl p4 //apply the permutation 4 times
movw r10, #0xf0f0
movt r10, #0xf0f0 //r10<- 0xf0f0f0f0
and r11, r10, r6, ror #16 //ror and mask to match fixslicing
and r12, r10, r7, ror #16 //ror and mask to match fixslicing
eor r11, r11, #0x00400000 //add rconst
strd r11, r12, [r0, #24] //store 2nd half tk for 37th round
and r11, r10, r8, ror #16 //ror and mask to match fixslicing
and r12, r10, r9, ror #16 //ror and mask to match fixslicing
eor r11, r11, #0x00440000 //add rconst
eor r12, r12, #0x00500000 //add rconst
mvn r12, r12 //to save a NOT in sbox calculations
strd r11, r12, [r0, #16] //store 1st half tk for 37th round
and r10, r10, r10, lsr #2 //r10<- 0x30303030
and r11, r10, r6, ror #14 //--- ror and masks to match fixslicing
and r6, r6, r10, ror #4
orr r6, r11, r6, ror #6
and r11, r10, r7, ror #14
and r7, r7, r10, ror #4
orr r7, r11, r7, ror #6
and r11, r10, r8, ror #14
and r8, r8, r10, ror #4
orr r8, r11, r8, ror #6
and r11, r10, r9, ror #14
and r9, r9, r10, ror #4
orr r9, r11, r9, ror #6 //ror and masks to match fixslicing ---
eor r6, r6, #0x00100000 //add rconst
eor r7, r7, #0x00100000 //add rconst
eor r8, r8, #0x00000001 //add rconst
eor r9, r9, #0x00100000 //add rconst
mvn r9, r9 //to save a NOT in sbox calculations
strd r6, r7, [r0], #8 //store 1st half tk for 36th round
strd r8, r9, [r0], #24 //store 2nd half tk for 36th round
ldm r0, {r6-r9} //load tk
bl p6 //apply the permutation 6 times
movw r10, #0xc3c3
movt r10, #0xc3c3 //r10<- 0xc3c3c3c3
and r11, r10, r6, ror #10 //ror and mask to match fixslicing
and r12, r10, r7, ror #10 //ror and mask to match fixslicing
eor r12, r12, #0x01000000 //add rconst
strd r11, r12, [r0], #8 //store 1st half tk for 38th round
and r11, r10, r8, ror #10 //ror and mask to match fixslicing
and r12, r10, r9, ror #10 //ror and mask to match fixslicing
eor r11, r11, #0x01400000 //add rconst
eor r11, r11, #0x00001000 //add rconst
eor r12, r12, #0x00400000 //add rconst
mvn r12, r12 //to save a NOT in sbox calculations
strd r11, r12, [r0], #8 //store 2nd half tk for 38th round
and r10, r10, r10, lsr #6 //r10<- 0x03030303
and r11, r10, r6, ror #12 //--- ror and masks to match fixslicing
and r6, r6, r10, lsl #6
orr r6, r11, r6, ror #28
and r11, r10, r7, ror #12
and r7, r7, r10, lsl #6
orr r7, r11, r7, ror #28
and r11, r10, r8, ror #12
and r8, r8, r10, lsl #6
orr r8, r11, r8, ror #28
and r11, r10, r9, ror #12
and r9, r9, r10, lsl #6
orr r9, r11, r9, ror #28 //ror and masks to match fixslicing ---
eor r6, r6, #0x00000400 //add rconst
eor r7, r7, #0x00000400 //add rconst
eor r8, r8, #0x01000000 //add rconst
eor r8, r8, #0x00004000 //add rconst
eor r9, r9, #0x00000400 //add rconst
mvn r9, r9 //to save a NOT in sbox calculations
strd r8, r9, [r0], #8 //store 1st half tk for 39th round
strd r6, r7, [r0], #8 //store 2nd half tk for 39th round
ldm r0, {r6-r9} //load tk
bl p8 //apply the permutation 8 times
movw r10, #0x3030
movt r10, #0x3030 //r10<- 0x30303030
and r11, r10, r6, ror #30 //--- ror and masks to match fixslicing
and r6, r6, r10, ror #4
orr r6, r11, r6, ror #22
and r11, r10, r7, ror #30
and r7, r7, r10, ror #4
orr r7, r11, r7, ror #22
and r11, r10, r8, ror #30
and r8, r8, r10, ror #4
orr r8, r11, r8, ror #22
and r11, r10, r9, ror #30
and r9, r9, r10, ror #4
orr r9, r11, r9, ror #22 //ror and masks to match fixslicing ---
eor r6, r6, #0x00000010 //add rconst
eor r8, r8, #0x00010000 //add rconst
eor r8, r8, #0x00000010 //add rconst
eor r9, r9, #0x00000400 //add rconst
mvn r9, r9 //to save a NOT in sbox calculations
strd r6, r7, [r0], #8 //store 1st half tk for 40th round
strd r8, r9, [r0] //store 2nd half tk for 40th round (no writeback: last store)
add.w sp, #4 //release the scratch slot reserved for r14
pop {r0-r12, lr}
bx lr
/******************************************************************************
* Applies the permutations P^2, ..., P^14 for rounds 0 to 16. Since P^16=Id, we
* don't need more calculations as no LFSR is applied to TK1.
* Expects: r0 = output round-tweakey array, r1 = 16-byte TK1.
* The 0xf0f0f0f0 / 0x03030303 masks extract only the tweakey bits that are
* actually XORed into the state by the fixsliced round functions.
******************************************************************************/
@ void tkschedule_perm_tk1(u32* tk, const u8* key)
.global tkschedule_perm_tk1
.type tkschedule_perm_tk1,%function
.align 2
tkschedule_perm_tk1:
push {r0-r12, lr}
ldr.w r3, [r1, #8] //load tk1 (3rd word)
ldr.w r4, [r1, #4] //load tk1 (2nd word)
ldr.w r5, [r1, #12] //load tk1 (4th word)
ldr.w r2, [r1] //load tk1 (1st word)
movw r10, #0x0a0a
movt r10, #0x0a0a //r10<- 0x0a0a0a0a (packing mask)
movw r11, #0x3030
movt r11, #0x3030 //r11<- 0x30303030 (packing mask)
bl packing //pack tk1
mov r6, r2 //move tk1 from r2-r5 to r6-r9
mov r7, r3 //move tk1 from r2-r5 to r6-r9
mov r8, r4 //move tk1 from r2-r5 to r6-r9
mov r9, r5 //move tk1 from r2-r5 to r6-r9
movw r2, #0xf0f0
movt r2, #0xf0f0 //r2<- 0xf0f0f0f0
and r11, r8, r2 //tk &= 0xf0f0f0f0 (3rd word)
and r12, r9, r2 //tk &= 0xf0f0f0f0 (4th word)
strd r11, r12, [r0], #8 //store 1st half tk for 1st round
and r11, r6, r2 //tk &= 0xf0f0f0f0 (1st word)
and r12, r7, r2 //tk &= 0xf0f0f0f0 (2nd word)
strd r11, r12, [r0], #8 //store 2nd half tk for 1st round
bl p2 //apply the permutation twice
movw r3, #0x0303
movt r3, #0x0303 //r3<- 0x03030303
and r11, r3, r6, ror #28 //--- ror and masks to match fixslicing
and r12, r6, r3, lsl #6
orr r12, r11, r12, ror #12
str.w r12, [r0, #8]
and r11, r3, r7, ror #28
and r12, r7, r3, lsl #6
orr r12, r11, r12, ror #12
str.w r12, [r0, #12]
and r11, r3, r9, ror #28
and r12, r9, r3, lsl #6
orr r12, r11, r12, ror #12
str.w r12, [r0, #4]
and r11, r3, r8, ror #28
and r12, r8, r3, lsl #6
orr r12, r11, r12, ror #12
str.w r12, [r0], #16 //ror and masks to match fixslicing ---
bl p2 //apply the permutation 4 times (cumulative)
and r11, r2, r6, ror #16 //ror and mask to match fixslicing
and r12, r2, r7, ror #16 //ror and mask to match fixslicing
strd r11, r12, [r0, #8] //store 2nd half tk for 5th round
and r11, r2, r8, ror #16 //ror and mask to match fixslicing
and r12, r2, r9, ror #16 //ror and mask to match fixslicing
strd r11, r12, [r0], #16 //store 1st half tk for 5th round
bl p2 //apply the permutation 6 times (cumulative)
and r11, r3, r6, ror #12 //--- ror and masks to match fixslicing
and r12, r6, r3, lsl #6
orr r12, r11, r12, ror #28
str.w r12, [r0, #8]
and r11, r3, r7, ror #12
and r12, r7, r3, lsl #6
orr r12, r11, r12, ror #28
str.w r12, [r0, #12]
and r11, r3, r9, ror #12
and r12, r9, r3, lsl #6
orr r12, r11, r12, ror #28
str.w r12, [r0, #4]
and r11, r3, r8, ror #12
and r12, r8, r3, lsl #6
orr r12, r11, r12, ror #28
str.w r12, [r0], #16 //ror and masks to match fixslicing ---
bl p2 //apply the permutation 8 times (cumulative)
and r11, r2, r6 //ror and mask to match fixslicing
and r12, r2, r7 //ror and mask to match fixslicing
strd r11, r12, [r0, #8] //store 2nd half tk for 9th round
and r11, r2, r8 //ror and mask to match fixslicing
and r12, r2, r9 //ror and mask to match fixslicing
strd r11, r12, [r0], #16 //store 1st half tk for 9th round
bl p2 //apply the permutation 10 times (cumulative)
and r11, r3, r6, ror #28 //--- ror and masks to match fixslicing
and r12, r6, r3, lsl #6
orr r12, r11, r12, ror #12
str.w r12, [r0, #8]
and r11, r3, r7, ror #28
and r12, r7, r3, lsl #6
orr r12, r11, r12, ror #12
str.w r12, [r0, #12]
and r11, r3, r9, ror #28
and r12, r9, r3, lsl #6
orr r12, r11, r12, ror #12
str.w r12, [r0, #4]
and r11, r3, r8, ror #28
and r12, r8, r3, lsl #6
orr r12, r11, r12, ror #12
str.w r12, [r0], #16 //ror and masks to match fixslicing ---
bl p2 //apply the permutation 12 times
and r11, r2, r6, ror #16 //ror and mask to match fixslicing
and r12, r2, r7, ror #16 //ror and mask to match fixslicing
strd r11, r12, [r0, #8] //store 2nd half tk for 13th round
and r11, r2, r8, ror #16 //ror and mask to match fixslicing
and r12, r2, r9, ror #16 //ror and mask to match fixslicing
strd r11, r12, [r0], #16 //store 1st half tk for 13th round
bl p2 //apply the permutation 14 times
and r11, r3, r6, ror #12 //--- ror and masks to match fixslicing
and r12, r6, r3, lsl #6
orr r12, r11, r12, ror #28
str.w r12, [r0, #8]
and r11, r3, r7, ror #12
and r12, r7, r3, lsl #6
orr r12, r11, r12, ror #28
str.w r12, [r0, #12]
and r11, r3, r9, ror #12
and r12, r9, r3, lsl #6
orr r12, r11, r12, ror #28
str.w r12, [r0, #4]
and r11, r3, r8, ror #12
and r12, r8, r3, lsl #6
orr r12, r11, r12, ror #28
str.w r12, [r0] //ror and masks to match fixslicing ---
pop {r0-r12, lr}
bx lr
/******************************************************************************
* Four consecutive fixsliced SKINNY-128-384 rounds on the bitsliced state.
* Register contract (from the call sites and inline comments below):
*   r2-r5 : 128-bit bitsliced state
*   r6    : 0x55555555 (SWAPMOVE mask used by the bitsliced S-box)
*   r7    : 0x30303030 (MixColumns mask)
*   r0    : rtk1 pointer (advanced by ldmia; XORed in on even half-rounds)
*   r1    : rtk2_3 pointer, round constants pre-merged (advanced by ldmia)
*   r8-r11: clobbered as scratch
* Each round = bitsliced S-box (SWAPMOVE ladder) + round-tweakey addition +
* one of the four fixsliced MixColumns variants.
******************************************************************************/
.align 2
quadruple_round:
// --- 1st round: bitsliced S-box layer ---
orr r8, r2, r3
eor r5, r5, r8
mvn r5, r5
eor r8, r3, r4, lsr #1
and r8, r8, r6
eor r3, r3, r8
eor r4, r4, r8, lsl #1 //SWAPMOVE(r4, r3, 0x55555555, 1);
eor r8, r4, r5, lsr #1
and r8, r8, r6
eor r4, r4, r8
eor r5, r5, r8, lsl #1 //SWAPMOVE(r5, r4, 0x55555555, 1);
orr r8, r4, r5
eor r3, r3, r8
mvn r3, r3
eor r8, r2, r3, lsr #1
and r8, r8, r6
eor r2, r2, r8
eor r3, r3, r8, lsl #1 //SWAPMOVE(r3, r2, 0x55555555, 1);
eor r8, r5, r2, lsr #1
and r8, r8, r6
eor r5, r5, r8
eor r2, r2, r8, lsl #1 //SWAPMOVE(r2, r5, 0x55555555, 1);
orr r8, r2, r3
eor r5, r5, r8
mvn r5, r5
eor r8, r3, r4, lsr #1
and r8, r8, r6
eor r3, r3, r8
eor r4, r4, r8, lsl #1 //SWAPMOVE(r4, r3, 0x55555555, 1);
eor r8, r4, r5, lsr #1
and r8, r8, r6
eor r4, r4, r8
eor r5, r5, r8, lsl #1 //SWAPMOVE(r5, r4, 0x55555555, 1);
orr r8, r4, r5
eor r3, r3, r8
eor r8, r2, r5
and r8, r8, r6
eor r2, r2, r8
eor r5, r5, r8 //SWAPMOVE(r5, r2, 0x55555555, 0);
// --- 1st round: add round tweakeys (rtk2_3 then rtk1) ---
ldmia.w r1!, {r8-r11} //load rkeys in r8,...,r11
eor r2, r2, r8 //add rtk_2_3 + rconst
eor r3, r3, r9 //add rtk_2_3 + rconst
eor r4, r4, r10 //add rtk_2_3 + rconst
eor r5, r5, r11 //add rtk_2_3 + rconst
ldmia.w r0!,{r8-r11}
eor r2, r2, r8 //add rtk_1
eor r3, r3, r9 //add rtk_1
eor r4, r4, r10 //add rtk_1
eor r5, r5, r11 //add rtk_1
and r8, r7, r2, ror #30 // --- mixcolumns 0 ---
eor r2, r2, r8, ror #24
and r8, r7, r2, ror #18
eor r2, r2, r8, ror #2
and r8, r7, r2, ror #6
eor r2, r2, r8, ror #4
and r8, r7, r3, ror #30
eor r3, r3, r8, ror #24
and r8, r7, r3, ror #18
eor r3, r3, r8, ror #2
and r8, r7, r3, ror #6
eor r3, r3, r8, ror #4
and r8, r7, r4, ror #30
eor r4, r4, r8, ror #24
and r8, r7, r4, ror #18
eor r4, r4, r8, ror #2
and r8, r7, r4, ror #6
eor r4, r4, r8, ror #4
and r8, r7, r5, ror #30
eor r5, r5, r8, ror #24
and r8, r7, r5, ror #18
eor r5, r5, r8, ror #2
and r8, r7, r5, ror #6
eor r5, r5, r8, ror #4
// --- 2nd round: bitsliced S-box layer ---
orr r8, r4, r5
eor r3, r3, r8
mvn r3, r3
eor r8, r2, r3, lsr #1
and r8, r8, r6
eor r2, r2, r8
eor r3, r3, r8, lsl #1 //SWAPMOVE(r3, r2, 0x55555555, 1);
eor r8, r5, r2, lsr #1
and r8, r8, r6
eor r5, r5, r8
eor r2, r2, r8, lsl #1 //SWAPMOVE(r2, r5, 0x55555555, 1);
orr r8, r2, r3
eor r5, r5, r8
mvn r5, r5
eor r8, r3, r4, lsr #1
and r8, r8, r6
eor r3, r3, r8
eor r4, r4, r8, lsl #1 //SWAPMOVE(r4, r3, 0x55555555, 1);
eor r8, r4, r5, lsr #1
and r8, r8, r6
eor r4, r4, r8
eor r5, r5, r8, lsl #1 //SWAPMOVE(r5, r4, 0x55555555, 1);
orr r8, r4, r5
eor r3, r3, r8
mvn r3, r3
eor r8, r2, r3, lsr #1
and r8, r8, r6
eor r2, r2, r8
eor r3, r3, r8, lsl #1 //SWAPMOVE(r3, r2, 0x55555555, 1);
eor r8, r5, r2, lsr #1
and r8, r8, r6
eor r5, r5, r8
eor r2, r2, r8, lsl #1 //SWAPMOVE(r2, r5, 0x55555555, 1);
orr r8, r2, r3
eor r5, r5, r8
eor r8, r3, r4
and r8, r8, r6
eor r3, r3, r8
eor r4, r4, r8 //SWAPMOVE(r4, r3, 0x55555555, 0);
// --- 2nd round: add round tweakey (rtk2_3 only on odd half-rounds) ---
ldmia.w r1!, {r8-r11} //load rkeys in r8,...,r11
eor r2, r2, r8 //add rkey + rconst
eor r3, r3, r9 //add rkey + rconst
eor r4, r4, r10 //add rkey + rconst
eor r5, r5, r11 //add rkey + rconst
and r8, r7, r2, ror #16 // --- mixcolumns 1 ---
eor r2, r2, r8, ror #30
and r8, r7, r2, ror #28
eor r2, r2, r8
and r8, r7, r2, ror #16
eor r2, r2, r8, ror #2
and r8, r7, r3, ror #16
eor r3, r3, r8, ror #30
and r8, r7, r3, ror #28
eor r3, r3, r8
and r8, r7, r3, ror #16
eor r3, r3, r8, ror #2
and r8, r7, r4, ror #16
eor r4, r4, r8, ror #30
and r8, r7, r4, ror #28
eor r4, r4, r8
and r8, r7, r4, ror #16
eor r4, r4, r8, ror #2
and r8, r7, r5, ror #16
eor r5, r5, r8, ror #30
and r8, r7, r5, ror #28
eor r5, r5, r8
and r8, r7, r5, ror #16
eor r5, r5, r8, ror #2
// --- 3rd round: bitsliced S-box layer ---
orr r8, r2, r3
eor r5, r5, r8
mvn r5, r5
eor r8, r3, r4, lsr #1
and r8, r8, r6
eor r3, r3, r8
eor r4, r4, r8, lsl #1 //SWAPMOVE(r4, r3, 0x55555555, 1);
eor r8, r4, r5, lsr #1
and r8, r8, r6
eor r4, r4, r8
eor r5, r5, r8, lsl #1 //SWAPMOVE(r5, r4, 0x55555555, 1);
orr r8, r4, r5
eor r3, r3, r8
mvn r3, r3
eor r8, r2, r3, lsr #1
and r8, r8, r6
eor r2, r2, r8
eor r3, r3, r8, lsl #1 //SWAPMOVE(r3, r2, 0x55555555, 1);
eor r8, r5, r2, lsr #1
and r8, r8, r6
eor r5, r5, r8
eor r2, r2, r8, lsl #1 //SWAPMOVE(r2, r5, 0x55555555, 1);
orr r8, r2, r3
eor r5, r5, r8
mvn r5, r5
eor r8, r3, r4, lsr #1
and r8, r8, r6
eor r3, r3, r8
eor r4, r4, r8, lsl #1 //SWAPMOVE(r4, r3, 0x55555555, 1);
eor r8, r4, r5, lsr #1
and r8, r8, r6
eor r4, r4, r8
eor r5, r5, r8, lsl #1 //SWAPMOVE(r5, r4, 0x55555555, 1);
orr r8, r4, r5
eor r3, r3, r8
eor r8, r2, r5
and r8, r8, r6
eor r2, r2, r8
eor r5, r5, r8 //SWAPMOVE(r5, r2, 0x55555555, 0);
// --- 3rd round: add round tweakeys (rtk2_3 then rtk1) ---
ldmia.w r1!, {r8-r11} //load rkeys in r8,...,r11
eor r2, r2, r8 //add rtk_2_3 + rconst
eor r3, r3, r9 //add rtk_2_3 + rconst
eor r4, r4, r10 //add rtk_2_3 + rconst
eor r5, r5, r11 //add rtk_2_3 + rconst
ldmia.w r0!,{r8-r11}
eor r2, r2, r8 //add rtk_1
eor r3, r3, r9 //add rtk_1
eor r4, r4, r10 //add rtk_1
eor r5, r5, r11 //add rtk_1
and r8, r7, r2, ror #10 // --- mixcolumns 2 ---
eor r2, r2, r8, ror #4
and r8, r7, r2, ror #6
eor r2, r2, r8, ror #6
and r8, r7, r2, ror #26
eor r2, r2, r8
and r8, r7, r3, ror #10
eor r3, r3, r8, ror #4
and r8, r7, r3, ror #6
eor r3, r3, r8, ror #6
and r8, r7, r3, ror #26
eor r3, r3, r8
and r8, r7, r4, ror #10
eor r4, r4, r8, ror #4
and r8, r7, r4, ror #6
eor r4, r4, r8, ror #6
and r8, r7, r4, ror #26
eor r4, r4, r8
and r8, r7, r5, ror #10
eor r5, r5, r8, ror #4
and r8, r7, r5, ror #6
eor r5, r5, r8, ror #6
and r8, r7, r5, ror #26
eor r5, r5, r8
// --- 4th round: bitsliced S-box layer ---
orr r8, r4, r5
eor r3, r3, r8
mvn r3, r3
eor r8, r2, r3, lsr #1
and r8, r8, r6
eor r2, r2, r8
eor r3, r3, r8, lsl #1 //SWAPMOVE(r3, r2, 0x55555555, 1);
eor r8, r5, r2, lsr #1
and r8, r8, r6
eor r5, r5, r8
eor r2, r2, r8, lsl #1 //SWAPMOVE(r2, r5, 0x55555555, 1);
orr r8, r2, r3
eor r5, r5, r8
mvn r5, r5
eor r8, r3, r4, lsr #1
and r8, r8, r6
eor r3, r3, r8
eor r4, r4, r8, lsl #1 //SWAPMOVE(r4, r3, 0x55555555, 1);
eor r8, r4, r5, lsr #1
and r8, r8, r6
eor r4, r4, r8
eor r5, r5, r8, lsl #1 //SWAPMOVE(r5, r4, 0x55555555, 1);
orr r8, r4, r5
eor r3, r3, r8
mvn r3, r3
eor r8, r2, r3, lsr #1
and r8, r8, r6
eor r2, r2, r8
eor r3, r3, r8, lsl #1 //SWAPMOVE(r3, r2, 0x55555555, 1);
eor r8, r5, r2, lsr #1
and r8, r8, r6
eor r5, r5, r8
eor r2, r2, r8, lsl #1 //SWAPMOVE(r2, r5, 0x55555555, 1);
orr r8, r2, r3
eor r5, r5, r8
eor r8, r3, r4
and r8, r8, r6
eor r3, r3, r8
eor r4, r4, r8 //SWAPMOVE(r4, r3, 0x55555555, 0);
// --- 4th round: add round tweakey ---
ldmia r1!, {r8-r11} //load rkeys in r8,...,r11
eor r2, r2, r8 //add rkey + rconst
eor r3, r3, r9 //add rkey + rconst
eor r4, r4, r10 //add rkey + rconst
eor r5, r5, r11 //add rkey + rconst
and r8, r7, r2, ror #4 // --- mixcolumns 3 ---
eor r2, r2, r8, ror #26
and r8, r7, r2
eor r2, r2, r8, ror #4
and r8, r7, r2, ror #4
eor r2, r2, r8, ror #22
and r8, r7, r3, ror #4
eor r3, r3, r8, ror #26
and r8, r7, r3
eor r3, r3, r8, ror #4
and r8, r7, r3, ror #4
eor r3, r3, r8, ror #22
and r8, r7, r4, ror #4
eor r4, r4, r8, ror #26
and r8, r7, r4
eor r4, r4, r8, ror #4
and r8, r7, r4, ror #4
eor r4, r4, r8, ror #22
and r8, r7, r5, ror #4
eor r5, r5, r8, ror #26
and r8, r7, r5
eor r5, r5, r8, ror #4
and r8, r7, r5, ror #4
eor r5, r5, r8, ror #22
bx lr
/******************************************************************************
* Encrypt a single block using fixsliced SKINNY-128-384.
* NOTE(review): the prototype comment below lists 3 arguments, but the code
* reads r2 as the plaintext pointer, r1 as the rtk2_3 pointer (consumed by
* quadruple_round via ldmia r1!), r3 as the rtk1 pointer (moved to r0), and
* the saved r0 as the ciphertext pointer -- confirm against the C caller.
******************************************************************************/
@ void skinny128_384(u8* ctext, const u32* tk, const u8* ptext)
.global skinny128_384
.type skinny128_384,%function
.align 2
skinny128_384:
push {r0-r12, r14} //r0 (ctext) saved at [sp], reloaded at the end
mov.w r0, r3 //r0<- rtk1 pointer (consumed by quadruple_round)
ldr.w r3, [r2, #8] //load ptext words
ldr.w r4, [r2, #4]
ldr.w r5, [r2, #12]
ldr.w r2, [r2]
movw r6, #0x0a0a
movt r6, #0x0a0a //r6 <- 0x0a0a0a0a
movw r7, #0x3030
movt r7, #0x3030 //r7 <- 0x30303030
// --- packing: byte representation -> bitsliced representation ---
eor r12, r2, r2, lsr #3
and r12, r12, r6
eor r2, r2, r12
eor r2, r2, r12, lsl #3 //SWAPMOVE(r2, r2, 0x0a0a0a0a, 3)
eor r12, r3, r3, lsr #3
and r12, r12, r6
eor r3, r3, r12
eor r3, r3, r12, lsl #3 //SWAPMOVE(r3, r3, 0x0a0a0a0a, 3)
eor r12, r4, r4, lsr #3
and r12, r12, r6
eor r4, r4, r12
eor r4, r4, r12, lsl #3 //SWAPMOVE(r4, r4, 0x0a0a0a0a, 3)
eor r12, r5, r5, lsr #3
and r12, r12, r6
eor r5, r5, r12
eor r5, r5, r12, lsl #3 //SWAPMOVE(r5, r5, 0x0a0a0a0a, 3)
eor r12, r2, r4, lsr #2
and r12, r12, r7
eor r2, r2, r12
eor r4, r4, r12, lsl #2 //SWAPMOVE(r4, r2, 0x30303030, 2)
eor r12, r2, r3, lsr #4
and r12, r12, r7, lsr #2
eor r2, r2, r12
eor r3, r3, r12, lsl #4 //SWAPMOVE(r3, r2, 0x0c0c0c0c, 4)
eor r12, r2, r5, lsr #6
and r12, r12, r7, lsr #4
eor r2, r2, r12
eor r5, r5, r12, lsl #6 //SWAPMOVE(r5, r2, 0x03030303, 6)
eor r12, r4, r3, lsr #2
and r12, r12, r7, lsr #2
eor r4, r4, r12
eor r3, r3, r12, lsl #2 //SWAPMOVE(r3, r4, 0x0c0c0c0c, 2)
eor r12, r4, r5, lsr #4
and r12, r12, r7, lsr #4
eor r4, r4, r12
eor r5, r5, r12, lsl #4 //SWAPMOVE(r5, r4, 0x03030303, 4)
eor r12, r3, r5, lsr #2
and r12, r12, r7, lsr #4
eor r3, r3, r12
eor r5, r5, r12, lsl #2 //SWAPMOVE(r5, r3, 0x03030303, 2)
movw r6, #0x5555
movt r6, #0x5555 //r6 <- 0x55555555 (S-box mask for quadruple_round)
// --- 40 rounds = 10 quadruple rounds; rtk1 (r0) repeats every 16 rounds ---
bl quadruple_round
bl quadruple_round
bl quadruple_round
bl quadruple_round
sub.w r0, #128 // rtk1 repeats every 16 rounds
bl quadruple_round
bl quadruple_round
bl quadruple_round
bl quadruple_round
sub.w r0, #128 // rtk1 repeats every 16 rounds
bl quadruple_round
bl quadruple_round
// --- unpacking: bitsliced representation -> byte representation ---
movw r6, #0x0a0a
movt r6, #0x0a0a //r6 <- 0x0a0a0a0a
eor r10, r3, r5, lsr #2
and r10, r10, r7, lsr #4
eor r3, r3, r10
eor r5, r5, r10, lsl #2 //SWAPMOVE(r5, r3, 0x03030303, 2)
eor r10, r4, r5, lsr #4
and r10, r10, r7, lsr #4
eor r4, r4, r10
eor r5, r5, r10, lsl #4 //SWAPMOVE(r5, r4, 0x03030303, 4)
eor r10, r4, r3, lsr #2
and r10, r10, r7, lsr #2
eor r4, r4, r10
eor r3, r3, r10, lsl #2 //SWAPMOVE(r3, r4, 0x0c0c0c0c, 2)
eor r10, r2, r5, lsr #6
and r10, r10, r7, lsr #4
eor r2, r2, r10
eor r5, r5, r10, lsl #6 //SWAPMOVE(r5, r2, 0x03030303, 6)
eor r10, r2, r3, lsr #4
and r10, r10, r7, lsr #2
eor r2, r2, r10
eor r3, r3, r10, lsl #4 //SWAPMOVE(r3, r2, 0x0c0c0c0c, 4)
eor r10, r2, r4, lsr #2
and r10, r10, r7
eor r2, r2, r10
eor r4, r4, r10, lsl #2 //SWAPMOVE(r4, r2, 0x30303030, 2)
eor r10, r5, r5, lsr #3
and r10, r10, r6
eor r5, r5, r10
eor r5, r5, r10, lsl #3 //SWAPMOVE(r5, r5, 0x0a0a0a0a, 3)
eor r10, r4, r4, lsr #3
and r10, r10, r6
eor r4, r4, r10
eor r4, r4, r10, lsl #3 //SWAPMOVE(r4, r4, 0x0a0a0a0a, 3)
eor r10, r3, r3, lsr #3
and r10, r10, r6
eor r3, r3, r10
eor r3, r3, r10, lsl #3 //SWAPMOVE(r3, r3, 0x0a0a0a0a, 3)
eor r10, r2, r2, lsr #3
and r10, r10, r6
eor r2, r2, r10
eor r2, r2, r10, lsl #3 //SWAPMOVE(r2, r2, 0x0a0a0a0a, 3)
ldr.w r0, [sp], #4 //restore ctext pointer saved by the push
strd r2, r4, [r0] //store ciphertext
strd r3, r5, [r0, #8]
pop {r1-r12,r14} //r0 already popped above
bx lr
\ No newline at end of file
int crypto_aead_encrypt(
unsigned char *c, unsigned long long *clen,
const unsigned char *m, unsigned long long mlen,
const unsigned char *ad, unsigned long long adlen,
const unsigned char *nsec,
const unsigned char *npub,
const unsigned char *k
);
int crypto_aead_decrypt(
unsigned char *m, unsigned long long *mlen,
unsigned char *nsec,
const unsigned char *c, unsigned long long clen,
const unsigned char *ad, unsigned long long adlen,
const unsigned char *npub,
const unsigned char *k
);
\ No newline at end of file
#include "skinny128.h"
#include "tk_schedule.h"
#include "romulus.h"
#include <string.h>
#include <stdio.h>
/**
 * Compute the domain-separation bits for the final tweakey, from the parity
 * and completeness of the last AD / message blocks.
 *
 * adlen - associated data length in bytes
 * mlen  - message length in bytes
 * Returns the domain byte to XOR into the final SET_DOMAIN value.
 *
 * Note: when the leftover is exactly BLOCKBYTES (odd number of blocks, last
 * one full), no bit is set on purpose -- that is the default encoding.
 */
static u8 final_ad_domain (unsigned long long adlen, unsigned long long mlen) {
    u8 domain = 0;
    u32 leftover;
    // Domain bits derived from the associated data length
    if (adlen == 0) {
        domain ^= 0x02; // no AD, so only 1 block with padding
    } else {
        leftover = (u32)(adlen % (2 * BLOCKBYTES));
        if (leftover == 0) {            // even or odd AD length?
            domain ^= 0x08;             // even with a full double block at the end
        } else if (leftover < BLOCKBYTES) {
            domain ^= 0x02;             // odd with a partial single block at the end
        } else if (leftover > BLOCKBYTES) {
            domain ^= 0x0A;             // even with a partial double block at the end
        }
    }
    // Domain bits derived from the message length
    if (mlen == 0) {
        domain ^= 0x01; // no message, so only 1 block with padding
    } else {
        // use the same u32 cast as above for consistency (was '(unsigned)')
        leftover = (u32)(mlen % (2 * BLOCKBYTES));
        if (leftover == 0) {            // even or odd message length?
            domain ^= 0x04;             // even with a full double block at the end
        } else if (leftover < BLOCKBYTES) {
            domain ^= 0x01;             // odd with a partial single block at the end
        } else if (leftover > BLOCKBYTES) {
            domain ^= 0x05;             // even with a partial double block at the end
        }
    }
    return domain;
}
//Encryption and authentication using Romulus-N1
int crypto_aead_encrypt
(unsigned char *c, unsigned long long *clen,
const unsigned char *m, unsigned long long mlen,
const unsigned char *ad, unsigned long long adlen,
const unsigned char *nsec,
const unsigned char *npub,
const unsigned char *k) {
u32 tmp;
u64 tmp_mlen = mlen;
const u8* m_auth = m;
u8 final_domain = 0x30;
skinny_128_384_tks tks;
u8 state[BLOCKBYTES], pad[BLOCKBYTES];
(void)nsec;
// ----------------- Initialization -----------------
*clen = mlen + TAGBYTES;
memset(tks.tk1, 0x00, KEYBYTES);
memset(state, 0x00, BLOCKBYTES);
tks.tk1[0] = 0x01; // Init the 56-bit LFSR counter
// ----------------- Initialization -----------------
// ----------------- Process the associated data -----------------
final_domain ^= final_ad_domain(adlen, mlen);
SET_DOMAIN(tks, 0x28);
while (adlen > 2*BLOCKBYTES) { // Process double blocks but the last
UPDATE_CTR(tks.tk1);
XOR_BLOCK(state, state, ad);
precompute_rtk2_3(tks.rtk2_3, npub, k);
precompute_rtk1(tks.rtk1, tks.tk1);
skinny128_384_plus(state, state, tks.rtk1, tks.rtk2_3);
UPDATE_CTR(tks.tk1);
ad += 2*BLOCKBYTES;
adlen -= 2*BLOCKBYTES;
}
// Pad and process the left-over blocks
if (adlen == 2*BLOCKBYTES) { // Left-over complete double block
UPDATE_CTR(tks.tk1);
XOR_BLOCK(state, state, ad);
precompute_rtk2_3(tks.rtk2_3, ad + BLOCKBYTES, k);
precompute_rtk1(tks.rtk1, tks.tk1);
skinny128_384_plus(state, state, tks.rtk1, tks.rtk2_3);
UPDATE_CTR(tks.tk1);
} else if (adlen > BLOCKBYTES) { // Left-over partial double block
adlen -= BLOCKBYTES;
UPDATE_CTR(tks.tk1);
XOR_BLOCK(state, state, ad);
memcpy(pad, ad + BLOCKBYTES, adlen);
memset(pad + adlen, 0x00, 15 - adlen);
pad[15] = adlen; // Padding
precompute_rtk2_3(tks.rtk2_3, pad, k);
precompute_rtk1(tks.rtk1, tks.tk1);
skinny128_384_plus(state, state, tks.rtk1, tks.rtk2_3);
UPDATE_CTR(tks.tk1);
} else {
SET_DOMAIN(tks, 0x2C);
UPDATE_CTR(tks.tk1);
if (adlen == BLOCKBYTES) { // Left-over complete single block
XOR_BLOCK(state, state, ad);
} else { // Left-over partial single block
for(int i =0; i < (int)adlen; i++)
state[i] ^= ad[i];
state[15] ^= adlen; // Padding
}
if (tmp_mlen >= BLOCKBYTES) {
precompute_rtk2_3(tks.rtk2_3, m_auth, k);
precompute_rtk1(tks.rtk1, tks.tk1);
skinny128_384_plus(state, state, tks.rtk1, tks.rtk2_3);
if (tmp_mlen > BLOCKBYTES)
UPDATE_CTR(tks.tk1);
tmp_mlen -= BLOCKBYTES;
m_auth += BLOCKBYTES;
} else {
memcpy(pad, m_auth, tmp_mlen);
memset(pad + tmp_mlen, 0x00, BLOCKBYTES - tmp_mlen - 1);
pad[15] = (u8)tmp_mlen; // Padding
precompute_rtk2_3(tks.rtk2_3, pad, k);
precompute_rtk1(tks.rtk1, tks.tk1);
skinny128_384_plus(state, state, tks.rtk1, tks.rtk2_3);
tmp_mlen = 0;
}
}
// Process all message double blocks except the last
SET_DOMAIN(tks, 0x2C);
while (tmp_mlen > 32) {
UPDATE_CTR(tks.tk1);
XOR_BLOCK(state, state, m_auth);
precompute_rtk2_3(tks.rtk2_3, m_auth + BLOCKBYTES, k);
precompute_rtk1(tks.rtk1, tks.tk1);
skinny128_384_plus(state, state, tks.rtk1, tks.rtk2_3);
UPDATE_CTR(tks.tk1);
m_auth += 2 * BLOCKBYTES;
tmp_mlen -= 2 * BLOCKBYTES;
}
// Process the last message double block
if (tmp_mlen == 2 * BLOCKBYTES) { // Last message double block is full
UPDATE_CTR(tks.tk1);
XOR_BLOCK(state, state, m_auth);
precompute_rtk2_3(tks.rtk2_3, m_auth + BLOCKBYTES, k);
precompute_rtk1(tks.rtk1, tks.tk1);
skinny128_384_plus(state, state, tks.rtk1, tks.rtk2_3);
} else if (tmp_mlen > BLOCKBYTES) { // Last message double block is partial
tmp_mlen -= BLOCKBYTES;
UPDATE_CTR(tks.tk1);
XOR_BLOCK(state, state, m_auth);
memcpy(pad, m_auth + BLOCKBYTES, tmp_mlen);
memset(pad + tmp_mlen, 0x00, BLOCKBYTES - tmp_mlen - 1);
pad[15] = (u8)tmp_mlen; // Padding
precompute_rtk2_3(tks.rtk2_3, pad, k);
precompute_rtk1(tks.rtk1, tks.tk1);
skinny128_384_plus(state, state, tks.rtk1, tks.rtk2_3);
} else if (tmp_mlen == BLOCKBYTES) { // Last message single block is full
XOR_BLOCK(state, state, m_auth);
} else if (tmp_mlen > 0) { // Last message single block is partial
for(int i =0; i < (int)tmp_mlen; i++)
state[i] ^= m_auth[i];
state[15] ^= (u8)tmp_mlen; // Padding
}
// Process the last partial block
SET_DOMAIN(tks, final_domain);
UPDATE_CTR(tks.tk1);
precompute_rtk2_3(tks.rtk2_3, npub, k);
precompute_rtk1(tks.rtk1, tks.tk1);
skinny128_384_plus(state, state, tks.rtk1, tks.rtk2_3);
// ----------------- Process the associated data -----------------
// ----------------- Generate the tag -----------------
G(state,state);
memcpy(c + mlen, state, TAGBYTES);
// ----------------- Generate the tag -----------------
memset(tks.tk1, 0x00, KEYBYTES);
tks.tk1[0] = 0x01; // Init the 56-bit LFSR counter
if (mlen > 0) {
SET_DOMAIN(tks, 0x24);
while (mlen > BLOCKBYTES) {
precompute_rtk1(tks.rtk1, tks.tk1);
skinny128_384_plus(state, state, tks.rtk1, tks.rtk2_3);
RHO(state,c,m);
UPDATE_CTR(tks.tk1);
c += BLOCKBYTES;
m += BLOCKBYTES;
mlen -= BLOCKBYTES;
}
precompute_rtk1(tks.rtk1, tks.tk1);
skinny128_384_plus(state, state, tks.rtk1, tks.rtk2_3);
for(int i = 0; i < (int)mlen; i++) {
tmp = m[i]; // Use of tmp variable in case c = m
c[i] = m[i] ^ (state[i] >> 1) ^ (state[i] & 0x80) ^ (state[i] << 7);
state[i] ^= (u8)tmp;
}
state[15] ^= (u8)mlen; // Padding
}
return 0;
}
//Decryption and tag verification using Romulus-N1
//Phase 1: decrypt the ciphertext with a keystream seeded by the received tag.
//Phase 2: recompute the tag over AD and the recovered plaintext and compare it
//in constant time against the received tag (0 on success, nonzero otherwise).
int crypto_aead_decrypt
(unsigned char *m, unsigned long long *mlen,
unsigned char *nsec,
const unsigned char *c, unsigned long long clen,
const unsigned char *ad, unsigned long long adlen,
const unsigned char *npub,
const unsigned char *k) {
u32 tmp;
u64 tmp_mlen;
u8 final_domain = 0x30;
u8* m_auth = m;
const u8* c_tmp = c;
skinny_128_384_tks tks;
u8 state[BLOCKBYTES], pad[BLOCKBYTES];
(void)nsec;
// reject ciphertexts too short to even carry a tag
if (clen < TAGBYTES)
return -1;
// ----------------- Initialization -----------------
*mlen = clen - TAGBYTES;
memset(tks.tk1, 0x00, KEYBYTES);
tks.tk1[0] = 0x01; // Init the 56-bit LFSR counter
// ----------------- Initialization -----------------
// ----------------- Process the ciphertext -----------------
clen -= TAGBYTES;
// seed the keystream state with the received tag (inverse of encryption)
memcpy(state, c + clen, TAGBYTES);
tmp_mlen = clen;
if (tmp_mlen > 0) {
SET_DOMAIN(tks, 0x24);
precompute_rtk2_3(tks.rtk2_3, npub, k);
while (tmp_mlen > BLOCKBYTES) {
precompute_rtk1(tks.rtk1, tks.tk1);
skinny128_384_plus(state, state, tks.rtk1, tks.rtk2_3);
RHO_INV(state, c, m);
UPDATE_CTR(tks.tk1);
c += BLOCKBYTES;
m += BLOCKBYTES;
tmp_mlen -= BLOCKBYTES;
}
precompute_rtk1(tks.rtk1, tks.tk1);
skinny128_384_plus(state, state, tks.rtk1, tks.rtk2_3);
// partial (or exactly full) last block: inverse of G applied byte-wise
for(int i = 0; i < (int)tmp_mlen; i++) {
m[i] = c[i] ^ (state[i] >> 1) ^ (state[i] & 0x80) ^ (state[i] << 7);
state[i] ^= m[i];
}
state[15] ^= (u8)tmp_mlen; // Padding
}
// ----------------- Process the ciphertext -----------------
// ----------------- Process the associated data -----------------
// restart the counter/state to authenticate AD and the recovered plaintext
memset(tks.tk1, 0x00, KEYBYTES);
tks.tk1[0] = 0x01; // Init the 56-bit LFSR counter
memset(state, 0x00, BLOCKBYTES);
final_domain ^= final_ad_domain(adlen, clen);
SET_DOMAIN(tks, 0x28);
while (adlen > 2*BLOCKBYTES) { // Process double blocks but the last
UPDATE_CTR(tks.tk1);
XOR_BLOCK(state, state, ad); // absorb A[2i]; A[2i+1] goes into TK2
precompute_rtk2_3(tks.rtk2_3, ad + BLOCKBYTES, k);
precompute_rtk1(tks.rtk1, tks.tk1);
skinny128_384_plus(state, state, tks.rtk1, tks.rtk2_3);
UPDATE_CTR(tks.tk1);
ad += 2*BLOCKBYTES;
adlen -= 2*BLOCKBYTES;
}
// Pad and process the left-over blocks
if (adlen == 2*BLOCKBYTES) { // Left-over complete double block
UPDATE_CTR(tks.tk1);
XOR_BLOCK(state, state, ad);
precompute_rtk2_3(tks.rtk2_3, ad + BLOCKBYTES, k);
precompute_rtk1(tks.rtk1, tks.tk1);
skinny128_384_plus(state, state, tks.rtk1, tks.rtk2_3);
UPDATE_CTR(tks.tk1);
} else if (adlen > BLOCKBYTES) { // Left-over partial double block
adlen -= BLOCKBYTES;
UPDATE_CTR(tks.tk1);
XOR_BLOCK(state, state, ad);
memcpy(pad, ad + BLOCKBYTES, adlen);
memset(pad + adlen, 0x00, 15 - adlen);
pad[15] = adlen; // Padding
precompute_rtk2_3(tks.rtk2_3, pad, k);
precompute_rtk1(tks.rtk1, tks.tk1);
skinny128_384_plus(state, state, tks.rtk1, tks.rtk2_3);
UPDATE_CTR(tks.tk1);
} else { // at most one AD block left: pair it with the 1st plaintext block
SET_DOMAIN(tks, 0x2C);
UPDATE_CTR(tks.tk1);
if (adlen == BLOCKBYTES) { // Left-over complete single block
XOR_BLOCK(state, state, ad);
} else { // Left-over partial single block
for(int i =0; i < (int)adlen; i++)
state[i] ^= ad[i];
state[15] ^= adlen; // Padding
}
if (clen >= BLOCKBYTES) { // 1st plaintext block goes into TK2
precompute_rtk2_3(tks.rtk2_3, m_auth, k);
precompute_rtk1(tks.rtk1, tks.tk1);
skinny128_384_plus(state, state, tks.rtk1, tks.rtk2_3);
if (clen > BLOCKBYTES)
UPDATE_CTR(tks.tk1);
m_auth += BLOCKBYTES;
clen -= BLOCKBYTES;
} else { // plaintext shorter than one block: pad it into TK2
memcpy(pad, m_auth, clen);
memset(pad + clen, 0x00, BLOCKBYTES - clen - 1);
pad[15] = (u8)clen; // Padding
precompute_rtk2_3(tks.rtk2_3, pad, k);
precompute_rtk1(tks.rtk1, tks.tk1);
skinny128_384_plus(state, state, tks.rtk1, tks.rtk2_3);
clen = 0;
}
}
// Process all message double blocks except the last (32 == 2*BLOCKBYTES)
SET_DOMAIN(tks, 0x2C);
while (clen > 32) {
UPDATE_CTR(tks.tk1);
XOR_BLOCK(state, state, m_auth);
precompute_rtk2_3(tks.rtk2_3, m_auth + BLOCKBYTES, k);
precompute_rtk1(tks.rtk1, tks.tk1);
skinny128_384_plus(state, state, tks.rtk1, tks.rtk2_3);
UPDATE_CTR(tks.tk1);
m_auth += 2 * BLOCKBYTES;
clen -= 2 * BLOCKBYTES;
}
// Process the last message double block
if (clen == 2 * BLOCKBYTES) { // Last message double block is full
UPDATE_CTR(tks.tk1);
XOR_BLOCK(state, state, m_auth);
precompute_rtk2_3(tks.rtk2_3, m_auth + BLOCKBYTES, k);
precompute_rtk1(tks.rtk1, tks.tk1);
skinny128_384_plus(state, state, tks.rtk1, tks.rtk2_3);
} else if (clen > BLOCKBYTES) { // Last message double block is partial
clen -= BLOCKBYTES;
UPDATE_CTR(tks.tk1);
XOR_BLOCK(state, state, m_auth);
memcpy(pad, m_auth + BLOCKBYTES, clen);
memset(pad + clen, 0x00, BLOCKBYTES - clen - 1);
pad[15] = (u8)clen; // Padding
precompute_rtk2_3(tks.rtk2_3, pad, k);
precompute_rtk1(tks.rtk1, tks.tk1);
skinny128_384_plus(state, state, tks.rtk1, tks.rtk2_3);
} else if (clen == BLOCKBYTES) { // Last message single block is full
XOR_BLOCK(state, state, m_auth);
} else if (clen > 0) { // Last message single block is partial
// NOTE(review): uses m where the other branches use m_auth; both appear
// to point at the final partial plaintext block here -- verify.
for(int i =0; i < (int)clen; i++)
state[i] ^= m[i];
state[15] ^= (u8)clen; // Padding
}
// Final authentication call: the nonce enters the tweakey only here
SET_DOMAIN(tks, final_domain);
UPDATE_CTR(tks.tk1);
precompute_rtk2_3(tks.rtk2_3, npub, k);
precompute_rtk1(tks.rtk1, tks.tk1);
skinny128_384_plus(state, state, tks.rtk1, tks.rtk2_3);
// ----------------- Process the associated data -----------------
// ----------------- Generate and check the tag -----------------
G(state,state);
tmp = 0;
for(int i = 0; i < TAGBYTES; i++)
tmp |= state[i] ^ c_tmp[*mlen+i]; //constant-time tag comparison
// ----------------- Generate and check the tag -----------------
return tmp;
}
\ No newline at end of file
#ifndef ROMULUSN1_H_
#define ROMULUSN1_H_

#include "skinny128.h"

typedef unsigned char u8;
typedef unsigned int u32;
// BUGFIX: u64 was 'unsigned int' (32-bit on this target), silently truncating
// 64-bit lengths such as 'tmp_mlen = mlen' in crypto_aead_encrypt.
typedef unsigned long long u64;

typedef struct {
    u8 tk1[16];                         //to manipulate tk1 byte-wise
    u32 rtk1[4*16];                     //to avoid tk schedule recomputations
    u32 rtk2_3[4*SKINNY128_384_ROUNDS]; //all round tweakeys
} skinny_128_384_tks;

#define TAGBYTES 16
#define KEYBYTES 16
#define BLOCKBYTES 16

//write the domain-separation byte into the 8th byte of TK1
#define SET_DOMAIN(tks, domain) ((tks).tk1[7] = (domain))

//G as defined in the Romulus specification in a 32-bit word-wise manner
//(relies on 'tmp' (u32) being in scope at the call site)
#define G(x,y) ({ \
tmp = ((u32*)(y))[0]; \
((u32*)(x))[0] = (tmp >> 1 & 0x7f7f7f7f) ^ ((tmp ^ (tmp << 7)) & 0x80808080); \
tmp = ((u32*)(y))[1]; \
((u32*)(x))[1] = (tmp >> 1 & 0x7f7f7f7f) ^ ((tmp ^ (tmp << 7)) & 0x80808080); \
tmp = ((u32*)(y))[2]; \
((u32*)(x))[2] = (tmp >> 1 & 0x7f7f7f7f) ^ ((tmp ^ (tmp << 7)) & 0x80808080); \
tmp = ((u32*)(y))[3]; \
((u32*)(x))[3] = (tmp >> 1 & 0x7f7f7f7f) ^ ((tmp ^ (tmp << 7)) & 0x80808080); \
})

//update the 56-bit LFSR counter in tk1 in a 32-bit word-wise manner
//(relies on 'tmp' (u32) being in scope at the call site)
#define UPDATE_CTR(tk1) ({ \
tmp = ((u32*)(tk1))[1]; \
((u32*)(tk1))[1] = (tmp << 1) & 0x00ffffff; \
((u32*)(tk1))[1] |= (((u32*)(tk1))[0] >> 31); \
((u32*)(tk1))[1] |= tmp & 0xff000000; \
((u32*)(tk1))[0] <<= 1; \
if ((tmp >> 23) & 0x01) \
((u32*)(tk1))[0] ^= 0x95; \
})

//x <- y ^ z for 128-bit blocks
#define XOR_BLOCK(x,y,z) ({ \
((u32*)(x))[0] = ((u32*)(y))[0] ^ ((u32*)(z))[0]; \
((u32*)(x))[1] = ((u32*)(y))[1] ^ ((u32*)(z))[1]; \
((u32*)(x))[2] = ((u32*)(y))[2] ^ ((u32*)(z))[2]; \
((u32*)(x))[3] = ((u32*)(y))[3] ^ ((u32*)(z))[3]; \
})

//Rho as defined in the Romulus specification
//use pad as a tmp variable in case y = z
#define RHO(x,y,z) ({ \
G(pad,x); \
XOR_BLOCK(y, pad, z); \
XOR_BLOCK(x, x, z); \
})

//Rho inverse as defined in the Romulus specification
//use pad as a tmp variable in case y = z
#define RHO_INV(x, y, z) ({ \
G(pad, x); \
XOR_BLOCK(z, pad, y); \
XOR_BLOCK(x, x, z); \
})

#endif // ROMULUSN1_H_
\ No newline at end of file
/******************************************************************************
* Constant-time implementation of the SKINNY tweakable block ciphers.
*
* This implementation doesn't compute the ShiftRows operation. Some masks and
* shifts are applied during the MixColumns operation so that the proper bits
* are XORed together. Moreover, the row permutation within the MixColumns
* is omitted, as well as the bit permutation at the end of the Sbox. The rows
* are synchronized with the classical after only 4 rounds. Therefore, this
* implementation relies on a "QUADRUPLE_ROUND" routine.
*
* The Sbox computation takes advantage of some symmetry in the 8-bit Sbox to
* turn it into a 4-bit S-box computation. Although the last bit permutation
* within the Sbox is not computed, the bit ordering is synchronized with the
* classical representation after 2 calls.
*
* @author Alexandre Adomnicai, Nanyang Technological University,
* alexandre.adomnicai@ntu.edu.sg
*
* @date May 2020
******************************************************************************/
#include <stdio.h>
#include <string.h>
#include "skinny128.h"
#include "tk_schedule.h"
/******************************************************************************
* The MixColumns computation for rounds i such that (i % 4) == 0
******************************************************************************/
/* MixColumns for fixsliced rounds i with (i % 4) == 0, applied to each of the
 * four 32-bit bitsliced state words in place. */
void mixcolumns_0(u32* state) {
    for(int i = 0; i < 4; i++) {
        u32 w = state[i];   // current state word
        u32 t;              // masked rotated copy
        t = ROR(w,24);
        t &= 0x0c0c0c0c;
        w ^= ROR(t,30);
        t = ROR(w,16);
        t &= 0xc0c0c0c0;
        w ^= ROR(t,4);
        t = ROR(w,8);
        t &= 0x0c0c0c0c;
        w ^= ROR(t,2);
        state[i] = w;
    }
}
/******************************************************************************
* The MixColumns computation for rounds i such that (i % 4) == 1
******************************************************************************/
/* MixColumns for fixsliced rounds i with (i % 4) == 1, applied to each of the
 * four 32-bit bitsliced state words in place. */
void mixcolumns_1(u32* state) {
    for(int i = 0; i < 4; i++) {
        u32 w = state[i];   // current state word
        u32 t;              // masked rotated copy
        t = ROR(w,16);
        t &= 0x30303030;
        w ^= ROR(t,30);
        t = w & 0x03030303;
        w ^= ROR(t,28);
        t = ROR(w,16);
        t &= 0x30303030;
        w ^= ROR(t,2);
        state[i] = w;
    }
}
/******************************************************************************
* The MixColumns computation for rounds i such that (i % 4) == 2
******************************************************************************/
/* MixColumns for fixsliced rounds i with (i % 4) == 2, applied to each of the
 * four 32-bit bitsliced state words in place. */
void mixcolumns_2(u32* state) {
    for(int i = 0; i < 4; i++) {
        u32 w = state[i];   // current state word
        u32 t;              // masked rotated copy
        t = ROR(w,8);
        t &= 0xc0c0c0c0;
        w ^= ROR(t,6);
        t = ROR(w,16);
        t &= 0x0c0c0c0c;
        w ^= ROR(t,28);
        t = ROR(w,24);
        t &= 0xc0c0c0c0;
        w ^= ROR(t,2);
        state[i] = w;
    }
}
/******************************************************************************
* The MixColumns computation for rounds i such that (i % 4) == 3
******************************************************************************/
/******************************************************************************
* MixColumns for rounds i with (i % 4) == 3 in the fixsliced representation.
* Each of the 4 bitsliced words is updated independently and in place.
******************************************************************************/
void mixcolumns_3(u32* state) {
	for(int col = 0; col < 4; col++) {
		u32 w = state[col];
		u32 t = w & 0x03030303;
		w ^= ROR(t,30);
		t = w & 0x30303030;
		w ^= ROR(t,4);
		t = w & 0x03030303;
		w ^= ROR(t,26);
		state[col] = w;
	}
}
/******************************************************************************
* Encryption of a single block without any operation mode using SKINNY-128-384.
* RTK1 and RTK2_3 are given separately to take advantage of the fact that
* TK2 and TK3 remains the same through the entire data encryption/decryption.
******************************************************************************/
/******************************************************************************
* Single-block SKINNY-128-384 encryption (no operation mode).
* RTK1 and RTK2_3 are supplied separately: TK2/TK3 are fixed for an entire
* encryption/decryption, whereas RTK1 only has a 16-round period, so the
* RTK1 table is reused every 4 quadruple rounds.
******************************************************************************/
void skinny128_384_plus(u8* ctext, const u8* ptext, const u32* rtk1,
					const u32* rtk2_3) {
	u32 tmp; 				// scratch word required by the SWAPMOVE macro
	u32 state[4]; 			// 128-bit bitsliced state
	packing(state, ptext); 	// from byte to bitsliced representation
	// 40 rounds = 10 quadruple rounds; rtk1 offsets cycle 0,16,32,48
	for(int q = 0; q < 10; q++)
		QUADRUPLE_ROUND(state, rtk1 + 16*(q & 3), rtk2_3 + 16*q);
	unpacking(ctext, state); // from bitsliced to byte representation
}
\ No newline at end of file
#ifndef SKINNY128_H_
#define SKINNY128_H_
typedef unsigned char u8;
typedef unsigned int u32;
// Single-block SKINNY-128-384 encryption on a 128-bit block; rtk1 holds the
// 16-round TK1 tweakeys (reused periodically), rtk2_3 the per-round
// TK2^TK3^rconst tweakeys.
void skinny128_384_plus(u8* ctext, const u8* ptext, const u32* rtk1, const u32* rtk2_3);
#define SKINNY128_384_ROUNDS 40
// Four consecutive SKINNY rounds in the fixsliced representation.
// Each round is: bitsliced S-box (NOR/XOR + SWAPMOVE network), round-tweakey
// addition (4 words of rtk1 and rtk2_3 per round), then the MixColumns
// variant matching the round index mod 4 (mixcolumns_0..3).
// NOTE: expands to a GNU statement expression and expects a u32 `tmp`
// variable to be in scope at the call site (used by SWAPMOVE).
#define QUADRUPLE_ROUND(state, rtk1, rtk2_3) ({ \
	state[3] ^= ~(state[0] | state[1]); \
	SWAPMOVE(state[2], state[1], 0x55555555, 1); \
	SWAPMOVE(state[3], state[2], 0x55555555, 1); \
	state[1] ^= ~(state[2] | state[3]); \
	SWAPMOVE(state[1], state[0], 0x55555555, 1); \
	SWAPMOVE(state[0], state[3], 0x55555555, 1); \
	state[3] ^= ~(state[0] | state[1]); \
	SWAPMOVE(state[2], state[1], 0x55555555, 1); \
	SWAPMOVE(state[3], state[2], 0x55555555, 1); \
	state[1] ^= (state[2] | state[3]); \
	SWAPMOVE(state[3], state[0], 0x55555555, 0); \
	state[0] ^= (rtk1)[0]; \
	state[1] ^= (rtk1)[1]; \
	state[2] ^= (rtk1)[2]; \
	state[3] ^= (rtk1)[3]; \
	state[0] ^= (rtk2_3)[0]; \
	state[1] ^= (rtk2_3)[1]; \
	state[2] ^= (rtk2_3)[2]; \
	state[3] ^= (rtk2_3)[3]; \
	mixcolumns_0(state); \
	state[1] ^= ~(state[2] | state[3]); \
	SWAPMOVE(state[1], state[0], 0x55555555, 1); \
	SWAPMOVE(state[0], state[3], 0x55555555, 1); \
	state[3] ^= ~(state[0] | state[1]); \
	SWAPMOVE(state[2], state[1], 0x55555555, 1); \
	SWAPMOVE(state[3], state[2], 0x55555555, 1); \
	state[1] ^= ~(state[2] | state[3]); \
	SWAPMOVE(state[1], state[0], 0x55555555, 1); \
	SWAPMOVE(state[0], state[3], 0x55555555, 1); \
	state[3] ^= (state[0] | state[1]); \
	SWAPMOVE(state[1], state[2], 0x55555555, 0); \
	state[0] ^= (rtk1)[4]; \
	state[1] ^= (rtk1)[5]; \
	state[2] ^= (rtk1)[6]; \
	state[3] ^= (rtk1)[7]; \
	state[0] ^= (rtk2_3)[4]; \
	state[1] ^= (rtk2_3)[5]; \
	state[2] ^= (rtk2_3)[6]; \
	state[3] ^= (rtk2_3)[7]; \
	mixcolumns_1(state); \
	state[3] ^= ~(state[0] | state[1]); \
	SWAPMOVE(state[2], state[1], 0x55555555, 1); \
	SWAPMOVE(state[3], state[2], 0x55555555, 1); \
	state[1] ^= ~(state[2] | state[3]); \
	SWAPMOVE(state[1], state[0], 0x55555555, 1); \
	SWAPMOVE(state[0], state[3], 0x55555555, 1); \
	state[3] ^= ~(state[0] | state[1]); \
	SWAPMOVE(state[2], state[1], 0x55555555, 1); \
	SWAPMOVE(state[3], state[2], 0x55555555, 1); \
	state[1] ^= (state[2] | state[3]); \
	SWAPMOVE(state[3], state[0], 0x55555555, 0); \
	state[0] ^= (rtk1)[8]; \
	state[1] ^= (rtk1)[9]; \
	state[2] ^= (rtk1)[10]; \
	state[3] ^= (rtk1)[11]; \
	state[0] ^= (rtk2_3)[8]; \
	state[1] ^= (rtk2_3)[9]; \
	state[2] ^= (rtk2_3)[10]; \
	state[3] ^= (rtk2_3)[11]; \
	mixcolumns_2(state); \
	state[1] ^= ~(state[2] | state[3]); \
	SWAPMOVE(state[1], state[0], 0x55555555, 1); \
	SWAPMOVE(state[0], state[3], 0x55555555, 1); \
	state[3] ^= ~(state[0] | state[1]); \
	SWAPMOVE(state[2], state[1], 0x55555555, 1); \
	SWAPMOVE(state[3], state[2], 0x55555555, 1); \
	state[1] ^= ~(state[2] | state[3]); \
	SWAPMOVE(state[1], state[0], 0x55555555, 1); \
	SWAPMOVE(state[0], state[3], 0x55555555, 1); \
	state[3] ^= (state[0] | state[1]); \
	SWAPMOVE(state[1], state[2], 0x55555555, 0); \
	state[0] ^= (rtk1)[12]; \
	state[1] ^= (rtk1)[13]; \
	state[2] ^= (rtk1)[14]; \
	state[3] ^= (rtk1)[15]; \
	state[0] ^= (rtk2_3)[12]; \
	state[1] ^= (rtk2_3)[13]; \
	state[2] ^= (rtk2_3)[14]; \
	state[3] ^= (rtk2_3)[15]; \
	mixcolumns_3(state); \
})
#endif  // SKINNY128_H_
\ No newline at end of file
/******************************************************************************
* Implementation of the SKINNY tweakey schedule to match fixslicing.
*
* @author Alexandre Adomnicai, Nanyang Technological University,
* alexandre.adomnicai@ntu.edu.sg
*
* @date May 2020
******************************************************************************/
#include <stdio.h>
#include <string.h> //for memcmp
#include "tk_schedule.h"
#include "skinny128.h"
typedef unsigned char u8;
typedef unsigned int u32;
/******************************************************************************
* The round constants according to the new representation.
******************************************************************************/
// Round constants in the bitsliced/fixsliced representation:
// 40 rounds x 4 state words = 160 entries, XORed into the round tweakeys
// by precompute_rtk2_3().
u32 rconst_32_bs[160] = {
	0x00000004, 0xffffffbf, 0x00000000, 0x00000000, 0x00000000, 0x00000000,
	0x10000100, 0xfffffeff, 0x44000000, 0xfbffffff, 0x00000000, 0x04000000,
	0x00100000, 0x00100000, 0x00100001, 0xffefffff, 0x00440000, 0xffafffff,
	0x00400000, 0x00400000, 0x01000000, 0x01000000, 0x01401000, 0xffbfffff,
	0x01004000, 0xfefffbff, 0x00000400, 0x00000400, 0x00000010, 0x00000000,
	0x00010410, 0xfffffbef, 0x00000054, 0xffffffaf, 0x00000000, 0x00000040,
	0x00000100, 0x00000100, 0x10000140, 0xfffffeff, 0x44000000, 0xfffffeff,
	0x04000000, 0x04000000, 0x00100000, 0x00100000, 0x04000001, 0xfbffffff,
	0x00140000, 0xffafffff, 0x00400000, 0x00000000, 0x00000000, 0x00000000,
	0x01401000, 0xfebfffff, 0x01004400, 0xfffffbff, 0x00000000, 0x00000400,
	0x00000010, 0x00000010, 0x00010010, 0xffffffff, 0x00000004, 0xffffffaf,
	0x00000040, 0x00000040, 0x00000100, 0x00000000, 0x10000140, 0xffffffbf,
	0x40000100, 0xfbfffeff, 0x00000000, 0x04000000, 0x00100000, 0x00000000,
	0x04100001, 0xffefffff, 0x00440000, 0xffefffff, 0x00000000, 0x00400000,
	0x01000000, 0x01000000, 0x00401000, 0xffffffff, 0x00004000, 0xfeffffff,
	0x00000400, 0x00000000, 0x00000000, 0x00000000, 0x00010400, 0xfffffbff,
	0x00000014, 0xffffffbf, 0x00000000, 0x00000000, 0x00000000, 0x00000000,
	0x10000100, 0xffffffff, 0x40000000, 0xfbffffff, 0x00000000, 0x04000000,
	0x00100000, 0x00000000, 0x00100001, 0xffefffff, 0x00440000, 0xffafffff,
	0x00000000, 0x00400000, 0x01000000, 0x01000000, 0x01401000, 0xffffffff,
	0x00004000, 0xfeffffff, 0x00000400, 0x00000400, 0x00000010, 0x00000000,
	0x00010400, 0xfffffbff, 0x00000014, 0xffffffaf, 0x00000000, 0x00000000,
	0x00000000, 0x00000000, 0x10000140, 0xfffffeff, 0x44000000, 0xffffffff,
	0x00000000, 0x04000000, 0x00100000, 0x00100000, 0x00000001, 0xffefffff,
	0x00440000, 0xffafffff, 0x00400000, 0x00000000, 0x00000000, 0x01000000,
	0x01401000, 0xffbfffff, 0x01004000, 0xfffffbff, 0x00000400, 0x00000400,
	0x00000010, 0x00000000, 0x00010010, 0xfffffbff
};
/******************************************************************************
* Pack the input into the bitsliced representation
* 24 28 56 60 88 92 120 124 | ... | 0 4 32 36 64 68 96 100
* 25 29 57 61 89 93 121 125 | ... | 1 5 33 37 65 69 97 101
* 26 30 58 62 90 94 122 126 | ... | 2 6 34 38 66 70 98 102
* 27 31 59 63 91 95 123 127 | ... | 3 7 35 39 67 71 99 103
******************************************************************************/
/******************************************************************************
* Pack a 16-byte block into the 4-word bitsliced representation used by the
* fixsliced SKINNY round function. Note the interleaved load order: bytes
* 8..11 go to word 1 and bytes 4..7 to word 2. The SWAPMOVE sequence below
* is order-critical; unpacking() applies the exact inverse.
******************************************************************************/
void packing(u32* out, const u8* in) {
	u32 tmp; // scratch word required by the SWAPMOVE macro
	LE_LOAD(out, in);
	LE_LOAD(out + 1, in + 8);
	LE_LOAD(out + 2, in + 4);
	LE_LOAD(out + 3, in + 12);
	// In-word bit swaps first, then cross-word swaps
	SWAPMOVE(out[0], out[0], 0x0a0a0a0a, 3);
	SWAPMOVE(out[1], out[1], 0x0a0a0a0a, 3);
	SWAPMOVE(out[2], out[2], 0x0a0a0a0a, 3);
	SWAPMOVE(out[3], out[3], 0x0a0a0a0a, 3);
	SWAPMOVE(out[2], out[0], 0x30303030, 2);
	SWAPMOVE(out[1], out[0], 0x0c0c0c0c, 4);
	SWAPMOVE(out[3], out[0], 0x03030303, 6);
	SWAPMOVE(out[1], out[2], 0x0c0c0c0c, 2);
	SWAPMOVE(out[3], out[2], 0x03030303, 4);
	SWAPMOVE(out[3], out[1], 0x03030303, 2);
}
/******************************************************************************
* Unpack the input to a byte-wise representation
******************************************************************************/
/******************************************************************************
* Unpack the 4-word bitsliced representation back into a 16-byte block.
* Exact inverse of packing(): the SWAPMOVE sequence is applied in reverse
* order (SWAPMOVE is an involution), then the words are stored with the same
* interleaved layout (word 1 -> bytes 8..11, word 2 -> bytes 4..7).
* NOTE: mutates `in` in place.
******************************************************************************/
void unpacking(u8* out, u32 *in) {
	u32 tmp; // scratch word required by the SWAPMOVE macro
	SWAPMOVE(in[3], in[1], 0x03030303, 2);
	SWAPMOVE(in[3], in[2], 0x03030303, 4);
	SWAPMOVE(in[1], in[2], 0x0c0c0c0c, 2);
	SWAPMOVE(in[3], in[0], 0x03030303, 6);
	SWAPMOVE(in[1], in[0], 0x0c0c0c0c, 4);
	SWAPMOVE(in[2], in[0], 0x30303030, 2);
	SWAPMOVE(in[0], in[0], 0x0a0a0a0a, 3);
	SWAPMOVE(in[1], in[1], 0x0a0a0a0a, 3);
	SWAPMOVE(in[2], in[2], 0x0a0a0a0a, 3);
	SWAPMOVE(in[3], in[3], 0x0a0a0a0a, 3);
	LE_STORE(out, in[0]);
	LE_STORE(out + 8, in[1]);
	LE_STORE(out + 4, in[2]);
	LE_STORE(out + 12, in[3]);
}
/******************************************************************************
* 0 4 1 5
* 1 5 ---> 2 6
* 2 6 3 7
* 3 7 4 0
******************************************************************************/
/******************************************************************************
* TK2 LFSR in the bitsliced representation: slices rotate down one position
*      0 4            1 5
*      1 5    --->    2 6
*      2 6            3 7
*      3 7            4 0
* and the feedback slice is derived from slices 0 and 2.
******************************************************************************/
void lfsr2_bs(u32* tk) {
	// Compute the feedback before the slices are shifted down
	u32 fb = tk[0] ^ (tk[2] & 0xaaaaaaaa);
	tk[0] = tk[1];
	tk[1] = tk[2];
	tk[2] = tk[3];
	// Swap the two interleaved bit lanes of the feedback word
	tk[3] = ((fb & 0xaaaaaaaa) >> 1) | ((fb << 1) & 0xaaaaaaaa);
}
/******************************************************************************
* 0 4 7 3
* 1 5 ---> 0 4
* 2 6 1 5
* 3 7 2 6
******************************************************************************/
/******************************************************************************
* TK3 LFSR in the bitsliced representation: slices rotate up one position
*      0 4            7 3
*      1 5    --->    0 4
*      2 6            1 5
*      3 7            2 6
* and the feedback slice is derived from slices 3 and 1.
******************************************************************************/
void lfsr3_bs(u32* tk) {
	// Compute the feedback before the slices are shifted up
	u32 fb = tk[3] ^ ((tk[1] & 0xaaaaaaaa) >> 1);
	tk[3] = tk[2];
	tk[2] = tk[1];
	tk[1] = tk[0];
	// Swap the two interleaved bit lanes of the feedback word
	tk[0] = ((fb & 0xaaaaaaaa) >> 1) | ((fb << 1) & 0xaaaaaaaa);
}
/******************************************************************************
* Apply the permutation in a bitsliced manner, twice
******************************************************************************/
/******************************************************************************
* Apply P^2 (the SKINNY tweakey permutation squared) to each bitsliced word.
******************************************************************************/
void permute_tk_2(u32* tk) {
	for(int j = 0; j < 4; j++) {
		u32 t = tk[j];
		tk[j] = (ROR(t,14) & 0xcc00cc00)
		      | ((t & 0x000000ff) << 16)
		      | ((t & 0xcc000000) >> 2)
		      | ((t & 0x0033cc00) >> 8)
		      | ((t & 0x00cc0000) >> 18);
	}
}
/******************************************************************************
* Apply the permutation in a bitsliced manner, 4 times
******************************************************************************/
/******************************************************************************
* Apply P^4 (the SKINNY tweakey permutation, 4 times) to each bitsliced word.
******************************************************************************/
void permute_tk_4(u32* tk) {
	for(int j = 0; j < 4; j++) {
		u32 t = tk[j];
		tk[j] = (ROR(t,22) & 0xcc0000cc)
		      | (ROR(t,16) & 0x3300cc00)
		      | (ROR(t,24) & 0x00cc3300)
		      | ((t & 0x00cc00cc) >> 2);
	}
}
/******************************************************************************
* Apply the permutation in a bitsliced manner, 6 times
******************************************************************************/
/******************************************************************************
* Apply P^6 (the SKINNY tweakey permutation, 6 times) to each bitsliced word.
******************************************************************************/
void permute_tk_6(u32* tk) {
	for(int j = 0; j < 4; j++) {
		u32 t = tk[j];
		tk[j] = (ROR(t,6) & 0xcccc0000)
		      | (ROR(t,24) & 0x330000cc)
		      | (ROR(t,10) & 0x3333)
		      | ((t & 0xcc) << 14)
		      | ((t & 0x3300) << 2);
	}
}
/******************************************************************************
* Apply the permutation in a bitsliced manner, 8 times
******************************************************************************/
/******************************************************************************
* Apply P^8 (the SKINNY tweakey permutation, 8 times) to each bitsliced word.
******************************************************************************/
void permute_tk_8(u32* tk) {
	for(int j = 0; j < 4; j++) {
		u32 t = tk[j];
		tk[j] = (ROR(t,24) & 0xcc000033)
		      | (ROR(t,8) & 0x33cc0000)
		      | (ROR(t,26) & 0x00333300)
		      | ((t & 0x00333300) >> 6);
	}
}
/******************************************************************************
* Apply the permutation in a bitsliced manner, 10 times
******************************************************************************/
/******************************************************************************
* Apply P^10 (the SKINNY tweakey permutation, 10 times) to each bitsliced word.
******************************************************************************/
void permute_tk_10(u32* tk) {
	for(int j = 0; j < 4; j++) {
		u32 t = tk[j];
		tk[j] = (ROR(t,8) & 0xcc330000)
		      | (ROR(t,26) & 0x33000033)
		      | (ROR(t,22) & 0x00cccc00)
		      | ((t & 0x00330000) >> 14)
		      | ((t & 0xcc00) >> 2);
	}
}
/******************************************************************************
* Apply the permutation in a bitsliced manner, 12 times
******************************************************************************/
/******************************************************************************
* Apply P^12 (the SKINNY tweakey permutation, 12 times) to each bitsliced word.
******************************************************************************/
void permute_tk_12(u32* tk) {
	for(int j = 0; j < 4; j++) {
		u32 t = tk[j];
		tk[j] = (ROR(t,8) & 0xcc33)
		      | (ROR(t,30) & 0x00cc00cc)
		      | (ROR(t,10) & 0x33330000)
		      | (ROR(t,16) & 0xcc003300);
	}
}
/******************************************************************************
* Apply the permutation in a bitsliced manner, 14 times
******************************************************************************/
/******************************************************************************
* Apply P^14 (the SKINNY tweakey permutation, 14 times) to each bitsliced word.
******************************************************************************/
void permute_tk_14(u32* tk) {
	for(int j = 0; j < 4; j++) {
		u32 t = tk[j];
		tk[j] = (ROR(t,24) & 0x0033cc00)
		      | (ROR(t,14) & 0x00cc0000)
		      | (ROR(t,30) & 0xcc000000)
		      | (ROR(t,16) & 0x000000ff)
		      | (ROR(t,18) & 0x33003300);
	}
}
/******************************************************************************
* Precompute all LFSRs on TK2
******************************************************************************/
/******************************************************************************
* Precompute every LFSR application on TK2 in bitsliced form.
* The LFSR advances once per 2 rounds; each state is stored at word offset
* 4*(r-1) for r = 2, 4, ..., rounds (i.e. offsets 4, 12, 20, ...), leaving
* the intermediate 4-word slots untouched for permute_tk() to fill.
******************************************************************************/
void precompute_lfsr_tk2(u32* tk, const u8* key, const int rounds) {
	u32 slices[4];
	packing(slices, key);
	memcpy(tk, slices, 16); // initial TK2 state at offset 0
	for(int r = 2; r <= rounds; r += 2) {
		lfsr2_bs(slices);
		memcpy(tk + 4*(r - 1), slices, 16);
	}
}
/******************************************************************************
* Precompute all LFSRs on TK3
******************************************************************************/
/******************************************************************************
* Precompute every LFSR application on TK3 in bitsliced form and XOR the
* results on top of the precomputed TK2 states (same offsets as
* precompute_lfsr_tk2: 0, then 4, 12, 20, ...).
******************************************************************************/
void precompute_lfsr_tk3(u32* tk, const u8* key, const int rounds) {
	u32 slices[4];
	packing(slices, key);
	XOR_BLOCKS(tk, slices); // initial TK3 state folded into offset 0
	for(int i = 0; i < rounds; i += 2) {
		lfsr3_bs(slices);
		XOR_BLOCKS(tk + i*4 + 4, slices);
	}
}
/******************************************************************************
* XOR TK with TK1 before applying the permutations.
* The key is then rearranged to match the barrel shiftrows representation.
******************************************************************************/
/******************************************************************************
* XOR TK with TK1 before applying the permutations.
* The key is then rearranged to match the barrel shiftrows representation.
* Processes 8 rounds per loop iteration, reading the precomputed LFSR states
* at word offsets i*4+4, +12, +20, +28 and expanding them into all 8 round
* slots. Since the tweakey permutation P has order 16, iterations alternate
* between powers P^2..P^8 (first half of a 16-round period) and
* P^10..P^16 = identity (second half).
******************************************************************************/
void permute_tk(u32* tk, const u8* key, const int rounds) {
	u32 test;         // flag: 1 in the first half of each 16-round period
	u32 tk1[4], tmp[4];
	packing(tk1, key);
	memcpy(tmp, tk, 16);
	tmp[0] ^= tk1[0];
	tmp[1] ^= tk1[1];
	tmp[2] ^= tk1[2];
	tmp[3] ^= tk1[3];
	for(int i = 0 ; i < rounds; i += 8) {
		test = (i % 16 < 8) ? 1 : 0; //to apply the right power of P
		// rounds i and i+1: no permutation, only a word rotation + masking
		tk[i*4] = tmp[2] & 0xf0f0f0f0;
		tk[i*4+1] = tmp[3] & 0xf0f0f0f0;
		tk[i*4+2] = tmp[0] & 0xf0f0f0f0;
		tk[i*4+3] = tmp[1] & 0xf0f0f0f0;
		memcpy(tmp, tk+i*4+4, 16);
		XOR_BLOCKS(tmp, tk1);
		if (test)
			permute_tk_2(tmp); // applies P^2
		else
			permute_tk_10(tmp); // applies P^10
		// rounds i+2 and i+3
		tk[i*4+4] = ROR(tmp[0],26) & 0xc3c3c3c3;
		tk[i*4+5] = ROR(tmp[1],26) & 0xc3c3c3c3;
		tk[i*4+6] = ROR(tmp[2],26) & 0xc3c3c3c3;
		tk[i*4+7] = ROR(tmp[3],26) & 0xc3c3c3c3;
		tk[i*4+8] = ROR(tmp[2],28) & 0x03030303;
		tk[i*4+8] |= ROR(tmp[2],12) & 0x0c0c0c0c;
		tk[i*4+9] = ROR(tmp[3],28) & 0x03030303;
		tk[i*4+9] |= ROR(tmp[3],12) & 0x0c0c0c0c;
		tk[i*4+10] = ROR(tmp[0],28) & 0x03030303;
		tk[i*4+10] |= ROR(tmp[0],12) & 0x0c0c0c0c;
		tk[i*4+11] = ROR(tmp[1],28) & 0x03030303;
		tk[i*4+11] |= ROR(tmp[1],12) & 0x0c0c0c0c;
		memcpy(tmp, tk+i*4+12, 16);
		XOR_BLOCKS(tmp, tk1);
		if (test)
			permute_tk_4(tmp); // applies P^4
		else
			permute_tk_12(tmp); // applies P^12
		// rounds i+4 and i+5
		for(int j = 0; j < 4; j++) {
			tk[i*4+12+j] = ROR(tmp[j],14) & 0x30303030;
			tk[i*4+12+j] |= ROR(tmp[j],6) & 0x0c0c0c0c;
		}
		tk[i*4+16] = ROR(tmp[2], 16) & 0xf0f0f0f0;
		tk[i*4+17] = ROR(tmp[3], 16) & 0xf0f0f0f0;
		tk[i*4+18] = ROR(tmp[0], 16) & 0xf0f0f0f0;
		tk[i*4+19] = ROR(tmp[1], 16) & 0xf0f0f0f0;
		memcpy(tmp, tk+i*4+20, 16);
		XOR_BLOCKS(tmp, tk1);
		if (test)
			permute_tk_6(tmp); // applies P^6
		else
			permute_tk_14(tmp); // applies P^14
		// rounds i+6 and i+7
		tk[i*4+20] = ROR(tmp[0], 10) & 0xc3c3c3c3;
		tk[i*4+21] = ROR(tmp[1], 10) & 0xc3c3c3c3;
		tk[i*4+22] = ROR(tmp[2], 10) & 0xc3c3c3c3;
		tk[i*4+23] = ROR(tmp[3], 10) & 0xc3c3c3c3;
		tk[i*4+24] = ROR(tmp[2],12) & 0x03030303;
		tk[i*4+24] |= ROR(tmp[2],28) & 0x0c0c0c0c;
		tk[i*4+25] = ROR(tmp[3],12) & 0x03030303;
		tk[i*4+25] |= ROR(tmp[3],28) & 0x0c0c0c0c;
		tk[i*4+26] = ROR(tmp[0],12) & 0x03030303;
		tk[i*4+26] |= ROR(tmp[0],28) & 0x0c0c0c0c;
		tk[i*4+27] = ROR(tmp[1],12) & 0x03030303;
		tk[i*4+27] |= ROR(tmp[1],28) & 0x0c0c0c0c;
		memcpy(tmp, tk+i*4+28, 16);
		XOR_BLOCKS(tmp, tk1);
		if (test)
			permute_tk_8(tmp); // applies P^8
		// no else branch: P^16 is the identity, so nothing to apply
		for(int j = 0; j < 4; j++) {
			tk[i*4+28+j] = ROR(tmp[j],30) & 0x30303030;
			tk[i*4+28+j] |= ROR(tmp[j],22) & 0x0c0c0c0c;
		}
		if (test && (i+8 < rounds)) { //only if next loop iteration
			tk[i*4+32] = tmp[2] & 0xf0f0f0f0;
			tk[i*4+33] = tmp[3] & 0xf0f0f0f0;
			tk[i*4+34] = tmp[0] & 0xf0f0f0f0;
			tk[i*4+35] = tmp[1] & 0xf0f0f0f0;
		}
	}
}
/******************************************************************************
* Precompute LFSR2(TK2) ^ LFSR3(TK3) ^ rconst.
******************************************************************************/
/******************************************************************************
* Precompute LFSR2(TK2) ^ LFSR3(TK3) ^ rconst for all rounds.
* The LFSR precomputations only write every other 4-word slot (offsets 0,
* then 4, 12, 20, ...), so the 16 bytes at rtk+8 still hold the zeros from
* the memset when permute_tk() reads them as its "TK1" argument — i.e. the
* TK1 contribution is nil here and permute_tk() performs a pure rearrangement.
******************************************************************************/
void precompute_rtk2_3(u32* rtk, const u8* tk2, const u8 * tk3) {
	memset(rtk, 0x00, 16*SKINNY128_384_ROUNDS);
	precompute_lfsr_tk2(rtk, tk2, SKINNY128_384_ROUNDS);
	precompute_lfsr_tk3(rtk, tk3, SKINNY128_384_ROUNDS);
	permute_tk(rtk, (u8*)(rtk+8), SKINNY128_384_ROUNDS); // rtk+8 is all-zero here
	for(int i = 0; i < SKINNY128_384_ROUNDS; i++) { // add rconsts
		for(int j = 0; j < 4; j++)
			rtk[i*4+j] ^= rconst_32_bs[i*4+j];
	}
}
/******************************************************************************
* Precompute RTK1.
******************************************************************************/
/******************************************************************************
* Precompute the round tweakeys derived from TK1. Only 16 rounds are stored:
* TK1 has no LFSR, so its schedule repeats with period 16 and callers cycle
* through this table (see skinny128_384_plus, which reuses rtk1+0/16/32/48).
******************************************************************************/
void precompute_rtk1(u32* rtk1, const u8* tk1) {
	memset(rtk1, 0x00, 16*16); // 16 rounds x 4 words x 4 bytes
	permute_tk(rtk1, tk1, 16);
}
\ No newline at end of file
#ifndef TK_SCHEDULE_H_
#define TK_SCHEDULE_H_
typedef unsigned char u8;
typedef unsigned int u32;
// Byte block <-> bitsliced representation conversions
void packing(u32* out, const u8* in);
void unpacking(u8* out, u32 *in);
// Round-tweakey precomputation: TK2/TK3 (with LFSRs + rconsts) and TK1
void precompute_rtk2_3(u32* rtk, const u8* tk2, const u8* tk3);
void precompute_rtk1(u32* rtk1, const u8* tk1);
// 32-bit right rotation; y must satisfy 0 < y < 32 (y = 0 or 32 would be UB)
#define ROR(x,y) (((x) >> (y)) | ((x) << (32 - (y))))
// 128-bit XOR accumulate: x ^= y, word by word (GNU statement expression)
#define XOR_BLOCKS(x,y) ({ \
	(x)[0] ^= (y)[0]; \
	(x)[1] ^= (y)[1]; \
	(x)[2] ^= (y)[2]; \
	(x)[3] ^= (y)[3]; \
})
// Swap the bits selected by `mask` between b and (a >> n); the classical
// bitslicing primitive. NOTE: requires a u32 `tmp` variable in scope at the
// call site.
#define SWAPMOVE(a, b, mask, n) ({ \
	tmp = (b ^ (a >> n)) & mask; \
	b ^= tmp; \
	a ^= (tmp << n); \
})
// Little-endian 32-bit load from byte pointer y into *x
#define LE_LOAD(x, y) \
	*(x) = (((u32)(y)[3] << 24) | \
		((u32)(y)[2] << 16) | \
		((u32)(y)[1] << 8) | \
		(y)[0]);
// Little-endian 32-bit store of y into byte pointer x
#define LE_STORE(x, y) \
	(x)[0] = (y) & 0xff; \
	(x)[1] = ((y) >> 8) & 0xff; \
	(x)[2] = ((y) >> 16) & 0xff; \
	(x)[3] = (y) >> 24;
#endif  // TK_SCHEDULE_H_
\ No newline at end of file
// Parameter sizes advertised through the NIST LWC API
#define CRYPTO_KEYBYTES 16   // 128-bit key
#define CRYPTO_NSECBYTES 0   // no secret message number
#define CRYPTO_NPUBBYTES 16  // 128-bit public nonce
#define CRYPTO_ABYTES 16     // ciphertext expansion (tag length)
#define CRYPTO_NOOVERLAP 1
//API required by the NIST for the LWC competition
int crypto_aead_encrypt(unsigned char *c, unsigned long long *clen,
			const unsigned char *m, unsigned long long mlen,
			const unsigned char *ad, unsigned long long adlen,
			const unsigned char *nsec, const unsigned char *npub,
			const unsigned char *k);
//API required by the NIST for the LWC competition
int crypto_aead_decrypt(unsigned char *m, unsigned long long *outputmlen,
			unsigned char *nsec,
			const unsigned char *c, unsigned long long clen,
			const unsigned char *ad, unsigned long long adlen,
			const unsigned char *npub, const unsigned char *k);
#include "skinny128.h"
#include "romulus.h"
#include <string.h>
u8 final_ad_domain (unsigned long long adlen, unsigned long long mlen) {
u8 domain = 0;
u32 leftover;
//Determine which domain bits we need based on the length of the ad
if (adlen == 0) {
domain ^= 0x02; // No message, so only 1 block with padding
} else {
leftover = (u32)(adlen % (2 * BLOCKBYTES));
if (leftover == 0) { // Even or odd ad length?
domain ^= 0x08; // Even with a full double block at the end
} else if (leftover < BLOCKBYTES) {
domain ^= 0x02; // Odd with a partial single block at the end
} else if (leftover > BLOCKBYTES) {
domain ^= 0x0A; // Even with a partial double block at the end
}
}
//Determine which domain bits we need based on the length of the message
if (mlen == 0) {
domain ^= 0x01; // No message, so only 1 block with padding
} else {
leftover = (u32)(mlen % (2 * BLOCKBYTES));
if (leftover == 0) { // Even or odd message length?
domain ^= 0x04; // Even with a full double block at the end
} else if (leftover < BLOCKBYTES) {
domain ^= 0x01; // Odd with a partial single block at the end
} else if (leftover > BLOCKBYTES) {
domain ^= 0x05; // Even with a partial double block at the end
}
}
return domain;
}
//Encryption and authentication using Romulus-N1
/******************************************************************************
* Romulus-N1 authenticated encryption (NIST LWC API).
* Phase 1 authenticates AD then message in double blocks (tag computed first),
* phase 2 encrypts the message with the nonce-derived round tweakeys.
* Returns 0; writes mlen + TAGBYTES bytes to c and sets *clen accordingly.
* NOTE: tmp_mlen is u64 — this must be a 64-bit type (see romulus.h).
******************************************************************************/
int crypto_aead_encrypt
	(unsigned char *c, unsigned long long *clen,
	 const unsigned char *m, unsigned long long mlen,
	 const unsigned char *ad, unsigned long long adlen,
	 const unsigned char *nsec,
	 const unsigned char *npub,
	 const unsigned char *k) {
	u64 tmp_mlen = mlen;      // message bytes still to authenticate
	u32 tmp;                  // scratch word required by G/UPDATE_CTR macros
	const u8* m_auth = m;     // cursor over the message for the auth phase
	u8 final_domain = 0x30;
	skinny_128_384_tks tks;
	u8 state[BLOCKBYTES], pad[BLOCKBYTES];
	(void)nsec;               // unused: CRYPTO_NSECBYTES is 0
	// ----------------- Initialization -----------------
	*clen = mlen + TAGBYTES;
	memset(tks.tk1, 0x00, KEYBYTES);
	memset(state, 0x00, BLOCKBYTES);
	tks.tk1[0] = 0x01; // Init the 56-bit LFSR counter
	// ----------------- Initialization -----------------
	// ----------------- Process the associated data -----------------
	final_domain ^= final_ad_domain(adlen, mlen);
	SET_DOMAIN(tks, 0x28);
	while (adlen > 2*BLOCKBYTES) { // Process double blocks but the last
		UPDATE_CTR(tks.tk1);
		XOR_BLOCK(state, state, ad);
		// 2nd half of the double block is absorbed as TK2 of the cipher call
		tkschedule_lfsr(tks.rtk, ad + BLOCKBYTES, k, SKINNY128_384_ROUNDS);
		tkschedule_perm(tks.rtk);
		tkschedule_perm_tk1(tks.rtk1, tks.tk1);
		skinny128_384(state, tks.rtk, state, tks.rtk1);
		UPDATE_CTR(tks.tk1);
		ad += 2*BLOCKBYTES;
		adlen -= 2*BLOCKBYTES;
	}
	// Pad and process the left-over blocks
	if (adlen == 2*BLOCKBYTES) { // Left-over complete double block
		UPDATE_CTR(tks.tk1);
		XOR_BLOCK(state, state, ad);
		tkschedule_lfsr(tks.rtk, ad + BLOCKBYTES, k, SKINNY128_384_ROUNDS);
		tkschedule_perm(tks.rtk);
		tkschedule_perm_tk1(tks.rtk1, tks.tk1);
		skinny128_384(state, tks.rtk, state, tks.rtk1);
		UPDATE_CTR(tks.tk1);
	} else if (adlen > BLOCKBYTES) { // Left-over partial double block
		adlen -= BLOCKBYTES;         // now the length of the partial 2nd half
		UPDATE_CTR(tks.tk1);
		XOR_BLOCK(state, state, ad);
		memcpy(pad, ad + BLOCKBYTES, adlen);
		memset(pad + adlen, 0x00, 15 - adlen);
		pad[15] = adlen; // Padding
		tkschedule_lfsr(tks.rtk, pad, k, SKINNY128_384_ROUNDS);
		tkschedule_perm(tks.rtk);
		tkschedule_perm_tk1(tks.rtk1, tks.tk1);
		skinny128_384(state, tks.rtk, state, tks.rtk1);
		UPDATE_CTR(tks.tk1);
	} else {                         // adlen <= BLOCKBYTES: pair the last AD
		SET_DOMAIN(tks, 0x2C);       // block with the first message block
		UPDATE_CTR(tks.tk1);
		if (adlen == BLOCKBYTES) { // Left-over complete single block
			XOR_BLOCK(state, state, ad);
		} else { // Left-over partial single block
			for(int i =0; i < adlen; i++)
				state[i] ^= ad[i];
			state[15] ^= adlen; // Padding
		}
		if (tmp_mlen >= BLOCKBYTES) { // absorb 1st message block as TK2
			tkschedule_lfsr(tks.rtk, m_auth, k, SKINNY128_384_ROUNDS);
			tkschedule_perm(tks.rtk);
			tkschedule_perm_tk1(tks.rtk1, tks.tk1);
			skinny128_384(state, tks.rtk, state, tks.rtk1);
			if (tmp_mlen > BLOCKBYTES)
				UPDATE_CTR(tks.tk1);
			m_auth += BLOCKBYTES;
			tmp_mlen -= BLOCKBYTES;
		} else {                      // message fits in one padded block
			memcpy(pad, m_auth, tmp_mlen);
			memset(pad + tmp_mlen, 0x00, BLOCKBYTES - tmp_mlen - 1);
			pad[15] = (u8)tmp_mlen; // Padding
			tkschedule_lfsr(tks.rtk, pad, k, SKINNY128_384_ROUNDS);
			tkschedule_perm(tks.rtk);
			tkschedule_perm_tk1(tks.rtk1, tks.tk1);
			skinny128_384(state, tks.rtk, state, tks.rtk1);
			tmp_mlen = 0;
		}
	}
	// Process all message double blocks except the last
	SET_DOMAIN(tks, 0x2C);
	while (tmp_mlen > 32) { // 32 = 2*BLOCKBYTES
		UPDATE_CTR(tks.tk1);
		XOR_BLOCK(state, state, m_auth);
		tkschedule_lfsr(tks.rtk, m_auth + BLOCKBYTES, k, SKINNY128_384_ROUNDS);
		tkschedule_perm(tks.rtk);
		tkschedule_perm_tk1(tks.rtk1, tks.tk1);
		skinny128_384(state, tks.rtk, state, tks.rtk1);
		UPDATE_CTR(tks.tk1);
		m_auth += 2 * BLOCKBYTES;
		tmp_mlen -= 2 * BLOCKBYTES;
	}
	// Process the last message double block
	if (tmp_mlen == 2 * BLOCKBYTES) { // Last message double block is full
		UPDATE_CTR(tks.tk1);
		XOR_BLOCK(state, state, m_auth);
		tkschedule_lfsr(tks.rtk, m_auth + BLOCKBYTES, k, SKINNY128_384_ROUNDS);
		tkschedule_perm(tks.rtk);
		tkschedule_perm_tk1(tks.rtk1, tks.tk1);
		skinny128_384(state, tks.rtk, state, tks.rtk1);
	} else if (tmp_mlen > BLOCKBYTES) { // Last message double block is partial
		tmp_mlen -= BLOCKBYTES;
		UPDATE_CTR(tks.tk1);
		XOR_BLOCK(state, state, m_auth);
		memcpy(pad, m_auth + BLOCKBYTES, tmp_mlen);
		memset(pad + tmp_mlen, 0x00, BLOCKBYTES - tmp_mlen - 1);
		pad[15] = (u8)tmp_mlen; // Padding
		tkschedule_lfsr(tks.rtk, pad, k, SKINNY128_384_ROUNDS);
		tkschedule_perm(tks.rtk);
		tkschedule_perm_tk1(tks.rtk1, tks.tk1);
		skinny128_384(state, tks.rtk, state, tks.rtk1);
	} else if (tmp_mlen == BLOCKBYTES) { // Last message single block is full
		XOR_BLOCK(state, state, m_auth);
	} else if (tmp_mlen > 0) { // Last message single block is partial
		for(int i =0; i < (int)tmp_mlen; i++)
			state[i] ^= m_auth[i];
		state[15] ^= (u8)tmp_mlen; // Padding
	}
	// Final call with the nonce as TK2 and the accumulated domain bits
	SET_DOMAIN(tks, final_domain);
	UPDATE_CTR(tks.tk1);
	tkschedule_lfsr(tks.rtk, npub, k, SKINNY128_384_ROUNDS);
	tkschedule_perm(tks.rtk);
	tkschedule_perm_tk1(tks.rtk1, tks.tk1);
	skinny128_384(state, tks.rtk, state, tks.rtk1);
	// ----------------- Process the associated data -----------------
	// ----------------- Generate the tag -----------------
	G(state,state);
	memcpy(c + mlen, state, TAGBYTES);
	// ----------------- Generate the tag -----------------
	// ----------------- Encryption phase -----------------
	// tks.rtk still holds the nonce-derived round tweakeys from the tag call
	memset(tks.tk1, 0x00, KEYBYTES);
	tks.tk1[0] = 0x01; // Init the 56-bit LFSR counter
	if (mlen > 0) {
		SET_DOMAIN(tks, 0x24);
		while (mlen > BLOCKBYTES) {
			tkschedule_perm_tk1(tks.rtk1, tks.tk1);
			skinny128_384(state, tks.rtk, state, tks.rtk1);
			RHO(state,c,m);
			UPDATE_CTR(tks.tk1);
			c += BLOCKBYTES;
			m += BLOCKBYTES;
			mlen -= BLOCKBYTES;
		}
		tkschedule_perm_tk1(tks.rtk1, tks.tk1);
		skinny128_384(state, tks.rtk, state, tks.rtk1);
		// Rho applied byte-wise for the final (possibly partial) block
		for(int i = 0; i < (int)mlen; i++) {
			tmp = m[i]; // Use of tmp variable in case c = m
			c[i] = m[i] ^ (state[i] >> 1) ^ (state[i] & 0x80) ^ (state[i] << 7);
			state[i] ^= (u8)tmp;
		}
		state[15] ^= (u8)mlen; // Padding
	}
	return 0;
}
//Decryption and tag verification using Romulus-N1
/******************************************************************************
* Romulus-N1 decryption and tag verification (NIST LWC API).
* Phase 1 decrypts the ciphertext (seeding the state with the received tag),
* phase 2 re-authenticates AD and recovered plaintext; the recomputed tag is
* compared in constant time. Returns 0 iff the tag is valid; the plaintext
* is written to m even on failure (callers must check the return value).
* NOTE(review): tmp_mlen is u32 here, which truncates lengths >= 2^32 —
* acceptable for LWC test sizes but worth confirming against requirements.
******************************************************************************/
int crypto_aead_decrypt
	(unsigned char *m, unsigned long long *mlen,
	 unsigned char *nsec,
	 const unsigned char *c, unsigned long long clen,
	 const unsigned char *ad, unsigned long long adlen,
	 const unsigned char *npub,
	 const unsigned char *k) {
	u32 tmp, tmp_mlen;
	u8 final_domain = 0x30;
	u8* m_auth = m;        // cursor over the recovered plaintext (auth phase)
	const u8* c_tmp = c;   // original ciphertext pointer, kept for the tag
	skinny_128_384_tks tks;
	u8 state[BLOCKBYTES], pad[BLOCKBYTES];
	(void)nsec;            // unused: CRYPTO_NSECBYTES is 0
	if (clen < TAGBYTES)   // ciphertext must at least contain the tag
		return -1;
	// ----------------- Initialization -----------------
	*mlen = clen - TAGBYTES;
	memset(tks.tk1, 0x00, KEYBYTES);
	tks.tk1[0] = 0x01; // Init the 56-bit LFSR counter
	// ----------------- Initialization -----------------
	// ----------------- Process the ciphertext -----------------
	clen -= TAGBYTES;
	memcpy(state, c + clen, TAGBYTES); // state starts from the received tag
	tmp_mlen = clen;
	if (tmp_mlen > 0) {
		tkschedule_lfsr(tks.rtk, npub, k, SKINNY128_384_ROUNDS);
		tkschedule_perm(tks.rtk);
		SET_DOMAIN(tks, 0x24);
		while (tmp_mlen > BLOCKBYTES) {
			tkschedule_perm_tk1(tks.rtk1, tks.tk1);
			skinny128_384(state, tks.rtk, state, tks.rtk1);
			RHO_INV(state, c, m);
			UPDATE_CTR(tks.tk1);
			c += BLOCKBYTES;
			m += BLOCKBYTES;
			tmp_mlen -= BLOCKBYTES;
		}
		tkschedule_perm_tk1(tks.rtk1, tks.tk1);
		skinny128_384(state, tks.rtk, state, tks.rtk1);
		// Inverse Rho applied byte-wise for the final (possibly partial) block
		for(int i = 0; i < (int)tmp_mlen; i++) {
			m[i] = c[i] ^ (state[i] >> 1) ^ (state[i] & 0x80) ^ (state[i] << 7);
			state[i] ^= m[i];
		}
		state[15] ^= (u8)tmp_mlen; // Padding
	}
	// ----------------- Process the ciphertext -----------------
	// ----------------- Process the associated data -----------------
	// Re-run the authentication phase over AD and the recovered plaintext
	memset(tks.tk1, 0x00, KEYBYTES);
	tks.tk1[0] = 0x01; // Init the 56-bit LFSR counter
	memset(state, 0x00, BLOCKBYTES);
	final_domain ^= final_ad_domain(adlen, clen);
	SET_DOMAIN(tks, 0x28);
	while (adlen > 2*BLOCKBYTES) { // Process double blocks but the last
		UPDATE_CTR(tks.tk1);
		XOR_BLOCK(state, state, ad);
		tkschedule_lfsr(tks.rtk, ad + BLOCKBYTES, k, SKINNY128_384_ROUNDS);
		tkschedule_perm(tks.rtk);
		tkschedule_perm_tk1(tks.rtk1, tks.tk1);
		skinny128_384(state, tks.rtk, state, tks.rtk1);
		UPDATE_CTR(tks.tk1);
		ad += 2*BLOCKBYTES;
		adlen -= 2*BLOCKBYTES;
	}
	// Pad and process the left-over blocks
	if (adlen == 2*BLOCKBYTES) { // Left-over complete double block
		UPDATE_CTR(tks.tk1);
		XOR_BLOCK(state, state, ad);
		tkschedule_lfsr(tks.rtk, ad + BLOCKBYTES, k, SKINNY128_384_ROUNDS);
		tkschedule_perm(tks.rtk);
		tkschedule_perm_tk1(tks.rtk1, tks.tk1);
		skinny128_384(state, tks.rtk, state, tks.rtk1);
		UPDATE_CTR(tks.tk1);
	} else if (adlen > BLOCKBYTES) { // Left-over partial double block
		adlen -= BLOCKBYTES;         // now the length of the partial 2nd half
		UPDATE_CTR(tks.tk1);
		XOR_BLOCK(state, state, ad);
		memcpy(pad, ad + BLOCKBYTES, adlen);
		memset(pad + adlen, 0x00, 15 - adlen);
		pad[15] = adlen; // Padding
		tkschedule_lfsr(tks.rtk, pad, k, SKINNY128_384_ROUNDS);
		tkschedule_perm(tks.rtk);
		tkschedule_perm_tk1(tks.rtk1, tks.tk1);
		skinny128_384(state, tks.rtk, state, tks.rtk1);
		UPDATE_CTR(tks.tk1);
	} else {                         // adlen <= BLOCKBYTES: pair the last AD
		SET_DOMAIN(tks, 0x2C);       // block with the first plaintext block
		UPDATE_CTR(tks.tk1);
		if (adlen == BLOCKBYTES) { // Left-over complete single block
			XOR_BLOCK(state, state, ad);
		} else { // Left-over partial single block
			for(int i =0; i < (int)adlen; i++)
				state[i] ^= ad[i];
			state[15] ^= adlen; // Padding
		}
		if (clen >= BLOCKBYTES) { // absorb 1st plaintext block as TK2
			tkschedule_lfsr(tks.rtk, m_auth, k, SKINNY128_384_ROUNDS);
			tkschedule_perm(tks.rtk);
			tkschedule_perm_tk1(tks.rtk1, tks.tk1);
			skinny128_384(state, tks.rtk, state, tks.rtk1);
			if (clen > BLOCKBYTES)
				UPDATE_CTR(tks.tk1);
			m_auth += BLOCKBYTES;
			clen -= BLOCKBYTES;
		} else {                  // plaintext fits in one padded block
			memcpy(pad, m_auth, clen);
			memset(pad + clen, 0x00, BLOCKBYTES - clen - 1);
			pad[15] = (u8)clen; // Padding
			tkschedule_lfsr(tks.rtk, pad, k, SKINNY128_384_ROUNDS);
			tkschedule_perm(tks.rtk);
			tkschedule_perm_tk1(tks.rtk1, tks.tk1);
			skinny128_384(state, tks.rtk, state, tks.rtk1);
			clen = 0;
		}
	}
	// Process all message double blocks except the last
	SET_DOMAIN(tks, 0x2C);
	while (clen > 32) { // 32 = 2*BLOCKBYTES
		UPDATE_CTR(tks.tk1);
		XOR_BLOCK(state, state, m_auth);
		tkschedule_lfsr(tks.rtk, m_auth + BLOCKBYTES, k, SKINNY128_384_ROUNDS);
		tkschedule_perm(tks.rtk);
		tkschedule_perm_tk1(tks.rtk1, tks.tk1);
		skinny128_384(state, tks.rtk, state, tks.rtk1);
		UPDATE_CTR(tks.tk1);
		m_auth += 2 * BLOCKBYTES;
		clen -= 2 * BLOCKBYTES;
	}
	// Process the last message double block
	if (clen == 2 * BLOCKBYTES) { // Last message double block is full
		UPDATE_CTR(tks.tk1);
		XOR_BLOCK(state, state, m_auth);
		tkschedule_lfsr(tks.rtk, m_auth + BLOCKBYTES, k, SKINNY128_384_ROUNDS);
		tkschedule_perm(tks.rtk);
		tkschedule_perm_tk1(tks.rtk1, tks.tk1);
		skinny128_384(state, tks.rtk, state, tks.rtk1);
	} else if (clen > BLOCKBYTES) { // Last message double block is partial
		clen -= BLOCKBYTES;
		UPDATE_CTR(tks.tk1);
		XOR_BLOCK(state, state, m_auth);
		memcpy(pad, m_auth + BLOCKBYTES, clen);
		memset(pad + clen, 0x00, BLOCKBYTES - clen - 1);
		pad[15] = (u8)clen; // Padding
		tkschedule_lfsr(tks.rtk, pad, k, SKINNY128_384_ROUNDS);
		tkschedule_perm(tks.rtk);
		tkschedule_perm_tk1(tks.rtk1, tks.tk1);
		skinny128_384(state, tks.rtk, state, tks.rtk1);
	} else if (clen == BLOCKBYTES) { // Last message single block is full
		XOR_BLOCK(state, state, m_auth);
	} else if (clen > 0) { // Last message single block is partial
		// NOTE: m points at the final partial plaintext block here, which
		// coincides with m_auth (both advanced by whole blocks only)
		for(int i =0; i < (int)clen; i++)
			state[i] ^= m[i];
		state[15] ^= (u8)clen; // Padding
	}
	// Final call with the nonce as TK2 and the accumulated domain bits
	SET_DOMAIN(tks, final_domain);
	UPDATE_CTR(tks.tk1);
	tkschedule_lfsr(tks.rtk, npub, k, SKINNY128_384_ROUNDS);
	tkschedule_perm(tks.rtk);
	tkschedule_perm_tk1(tks.rtk1, tks.tk1);
	skinny128_384(state, tks.rtk, state, tks.rtk1);
	// ----------------- Process the associated data -----------------
	// ----------------- Generate and check the tag -----------------
	G(state,state);
	tmp = 0;
	for(int i = 0; i < TAGBYTES; i++)
		tmp |= state[i] ^ c_tmp[*mlen+i]; //constant-time tag comparison
	// ----------------- Generate and check the tag -----------------
	return tmp; // 0 iff the tags match
}
\ No newline at end of file
#ifndef ROMULUSN1_H_
#define ROMULUSN1_H_
#include "skinny128.h"
typedef unsigned char u8;
typedef unsigned int u32;
// FIX: u64 was previously 'unsigned int', which is 32-bit on common ABIs.
// crypto_aead_encrypt copies the 64-bit NIST-API length into a u64
// (u64 tmp_mlen = mlen;), so a 32-bit u64 silently truncated messages
// longer than 2^32 - 1 bytes. A 64-bit type restores full-range lengths.
typedef unsigned long long u64;
// Tweakey material for SKINNY-128-384 as used by Romulus-N1
typedef struct {
	u8 tk1[16]; 						//to manipulate tk1 byte-wise
	u32 rtk1[4*16]; 					//to avoid tk schedule recomputations
	u32 rtk[4*SKINNY128_384_ROUNDS]; 	//all round tweakeys
} skinny_128_384_tks;
#define TAGBYTES 16
#define KEYBYTES 16
#define BLOCKBYTES 16
// Write the domain-separation byte into TK1 (byte 7)
#define SET_DOMAIN(tks, domain) ((tks).tk1[7] = (domain))
//G as defined in the Romulus specification in a 32-bit word-wise manner
// NOTE: the G/UPDATE_CTR macros expect a u32 `tmp` variable in scope
#define G(x,y) ({ \
	tmp = ((u32*)(y))[0]; \
	((u32*)(x))[0] = (tmp >> 1 & 0x7f7f7f7f) ^ ((tmp ^ (tmp << 7)) & 0x80808080); \
	tmp = ((u32*)(y))[1]; \
	((u32*)(x))[1] = (tmp >> 1 & 0x7f7f7f7f) ^ ((tmp ^ (tmp << 7)) & 0x80808080); \
	tmp = ((u32*)(y))[2]; \
	((u32*)(x))[2] = (tmp >> 1 & 0x7f7f7f7f) ^ ((tmp ^ (tmp << 7)) & 0x80808080); \
	tmp = ((u32*)(y))[3]; \
	((u32*)(x))[3] = (tmp >> 1 & 0x7f7f7f7f) ^ ((tmp ^ (tmp << 7)) & 0x80808080); \
})
//update the counter in tk1 in a 32-bit word-wise manner (56-bit LFSR)
#define UPDATE_CTR(tk1) ({ \
	tmp = ((u32*)(tk1))[1]; \
	((u32*)(tk1))[1] = (tmp << 1) & 0x00ffffff; \
	((u32*)(tk1))[1] |= (((u32*)(tk1))[0] >> 31); \
	((u32*)(tk1))[1] |= tmp & 0xff000000; \
	((u32*)(tk1))[0] <<= 1; \
	if ((tmp >> 23) & 0x01) \
		((u32*)(tk1))[0] ^= 0x95; \
})
//x <- y ^ z for 128-bit blocks
#define XOR_BLOCK(x,y,z) ({ \
	((u32*)(x))[0] = ((u32*)(y))[0] ^ ((u32*)(z))[0]; \
	((u32*)(x))[1] = ((u32*)(y))[1] ^ ((u32*)(z))[1]; \
	((u32*)(x))[2] = ((u32*)(y))[2] ^ ((u32*)(z))[2]; \
	((u32*)(x))[3] = ((u32*)(y))[3] ^ ((u32*)(z))[3]; \
})
//Rho as defined in the Romulus specification
//use pad as a tmp variable in case y = z
#define RHO(x,y,z) ({ \
	G(pad,x); \
	XOR_BLOCK(y, pad, z); \
	XOR_BLOCK(x, x, z); \
})
//Rho inverse as defined in the Romulus specification
//use pad as a tmp variable in case y = z
#define RHO_INV(x, y, z) ({ \
	G(pad, x); \
	XOR_BLOCK(z, pad, y); \
	XOR_BLOCK(x, x, z); \
})
#endif  // ROMULUSN1_H_
\ No newline at end of file
#ifndef SKINNY128_H_
#define SKINNY128_H_
//basic integer types (u8 = byte, u32 = 32-bit word)
typedef unsigned char u8;
typedef unsigned int u32;
//number of rounds for each SKINNY-128 variant (128/256/384-bit tweakey)
#define SKINNY128_128_ROUNDS 40
#define SKINNY128_256_ROUNDS 48
#define SKINNY128_384_ROUNDS 56
//encrypt one 128-bit block: ctext <- SKINNY-128-384(ptext) using the
//precomputed round tweakeys tk (TK2^TK3 part) and rtk1 (TK1 part);
//implemented in ARM assembly
extern void skinny128_384(u8* ctext, const u32* tk, const u8* ptext, const u32* rtk1);
//precompute LFSR2(TK2) ^ LFSR3(TK3) for all rounds into rtk (ARM assembly)
extern void tkschedule_lfsr(u32* rtk, const u8* tk2, const u8* tk3, const int rounds);
//apply the permutation P and add round constants to all round tweakeys in rtk (ARM assembly)
extern void tkschedule_perm(u32* rtk);
//precompute the TK1 round tweakeys into rtk1 (ARM assembly)
extern void tkschedule_perm_tk1(u32* rtk1, const u8* tk1);
#endif // SKINNY128_H_
\ No newline at end of file
/*******************************************************************************
* ARM assembly implementation of fixsliced SKINNY-128-384.
*
 * For more details, see the fixslicing paper: https://eprint.iacr.org/2020/1123
*
* @author Alexandre Adomnicai, Nanyang Technological University,
* alexandre.adomnicai@ntu.edu.sg
*
* @date May 2020
*******************************************************************************/
.syntax unified
.thumb
/*******************************************************************************
* applies P^2 on the tweakey state in a bitsliced manner
*******************************************************************************/
.align 2
@ p2: applies P^2 (the SKINNY tweakey permutation twice) to the bitsliced
@ tweakey state held in r6-r9. Clobbers r1, r10, r11, r12.
p2:
movw r1, #0xcc00
movt r1, #0xcc00 //r1 <- 0xcc00cc00
movw r10, #0xcc00
movt r10, #0x0033 //r10<- 0xcc000033
and r11, r1, r6, ror #14 // --- permute r6 twice
bfi r11, r6, #16, #8
and r12, r6, #0xcc000000
orr r11, r11, r12, lsr #2
and r12, r10, r6
orr r11, r11, r12, lsr #8
and r12, r6, #0x00cc0000
orr r6, r11, r12, lsr #18 // permute r6 twice ---
and r11, r1, r7, ror #14 // --- permute r7 twice
bfi r11, r7, #16, #8
and r12, r7, #0xcc000000
orr r11, r11, r12, lsr #2
and r12, r10, r7
orr r11, r11, r12, lsr #8
and r12, r7, #0x00cc0000
orr r7, r11, r12, lsr #18 // permute r7 twice ---
and r11, r1, r8, ror #14 // --- permute r8 twice
bfi r11, r8, #16, #8
and r12, r8, #0xcc000000
orr r11, r11, r12, lsr #2
and r12, r10, r8
orr r11, r11, r12, lsr #8
and r12, r8, #0x00cc0000
orr r8, r11, r12, lsr #18 // permute r8 twice ---
and r11, r1, r9, ror #14 // --- permute r9 twice
bfi r11, r9, #16, #8
and r12, r9, #0xcc000000
orr r11, r11, r12, lsr #2
and r12, r10, r9
orr r11, r11, r12, lsr #8
and r12, r9, #0x00cc0000
orr r9, r11, r12, lsr #18 // permute r9 twice ---
bx lr
/*******************************************************************************
* applies P^4 on the tweakey state in a bitsliced manner
*******************************************************************************/
.align 2
@ p4: applies P^4 to the bitsliced tweakey state in r6-r9.
@ Clobbers r1, r10, r11, r12; r14 is saved/restored via [sp].
p4:
str.w r14, [sp] //store r14 on the stack
movw r14, #0x00cc
movt r14, #0xcc00 //r14<- 0xcc0000cc
movw r12, #0xcc00
movt r12, #0x3300 //r12<- 0x3300cc00
movw r11, #0x00cc
movt r11, #0x00cc //r11<- 0x00cc00cc
and r10, r14, r6, ror #22 // --- permute r6 4 times
and r1, r12, r6, ror #16
orr r10, r10, r1
and r1, r6, r11
orr r10, r10, r1, lsr #2
movw r1, #0xcc33 //r1 <- 0x0000cc33
and r6, r6, r1
orr r6, r10, r6, ror #24 // permute r6 4 times ---
and r10, r14, r7, ror #22 // --- permute r7 4 times
and r1, r12, r7, ror #16
orr r10, r10, r1
and r1, r7, r11
orr r10, r10, r1, lsr #2
movw r1, #0xcc33 //r1 <- 0x0000cc33
and r7, r7, r1
orr r7, r10, r7, ror #24 // permute r7 4 times ---
and r10, r14, r8, ror #22 // --- permute r8 4 times
and r1, r12, r8, ror #16
orr r10, r10, r1
and r1, r8, r11
orr r10, r10, r1, lsr #2
movw r1, #0xcc33 //r1 <- 0x0000cc33
and r8, r8, r1
orr r8, r10, r8, ror #24 // permute r8 4 times ---
and r10, r14, r9, ror #22 // --- permute r9 4 times
ldr.w r14, [sp] //restore r14
and r12, r12, r9, ror #16
orr r10, r10, r12
and r12, r9, r11
orr r10, r10, r12, lsr #2
movw r12, #0xcc33 //r12<- 0x0000cc33
and r9, r9, r12
orr r9, r10, r9, ror #24 // permute r9 4 times ---
bx lr
/*******************************************************************************
* applies P^6 on the tweakey state in a bitsliced manner
*******************************************************************************/
.align 2
@ p6: applies P^6 to the bitsliced tweakey state in r6-r9.
@ Clobbers r1, r10, r11, r12.
p6:
movw r1, #0x3333 //r1 <- 0x00003333
movw r12, #0x00cc
movt r12, #0x3300 //r12<- 0x330000cc
and r10, r6, r1, ror #8 // --- permute r6 6 times
and r11, r12, r6, ror #24
orr r11, r11, r10, ror #6
and r10, r1, r6, ror #10
orr r11, r11, r10
and r10, r6, #0x000000cc
orr r11, r11, r10, lsl #14
and r10, r6, #0x00003300
orr r6, r11, r10, lsl #2 // permute r6 6 times ---
and r10, r7, r1, ror #8 // --- permute r7 6 times
and r11, r12, r7, ror #24
orr r11, r11, r10, ror #6
and r10, r1, r7, ror #10
orr r11, r11, r10
and r10, r7, #0x000000cc
orr r11, r11, r10, lsl #14
and r10, r7, #0x00003300
orr r7, r11, r10, lsl #2 // permute r7 6 times ---
and r10, r8, r1, ror #8 // --- permute r8 6 times
and r11, r12, r8, ror #24
orr r11, r11, r10, ror #6
and r10, r1, r8, ror #10
orr r11, r11, r10
and r10, r8, #0x000000cc
orr r11, r11, r10, lsl #14
and r10, r8, #0x00003300
orr r8, r11, r10, lsl #2 // permute r8 6 times ---
and r10, r9, r1, ror #8 // --- permute r9 6 times
and r11, r12, r9, ror #24
orr r11, r11, r10, ror #6
and r10, r1, r9, ror #10
orr r11, r11, r10
and r10, r9, #0x000000cc
orr r11, r11, r10, lsl #14
and r10, r9, #0x00003300
orr r9, r11, r10, lsl #2 // permute r9 6 times ---
bx lr
/*******************************************************************************
* applies P^8 on the tweakey state in a bitsliced manner
*******************************************************************************/
.align 2
@ p8: applies P^8 to the bitsliced tweakey state in r6-r9.
@ Clobbers r1, r10, r11, r12.
p8:
movw r12, #0x3333 //r12<- 0x00003333
movw r1, #0x0000
movt r1, #0x33cc //r1 <- 0x33cc0000
and r10, r6, r1 // --- permute r6 8 times
and r11, r1, r6, ror #8
orr r11, r11, r10, ror #24
and r10, r6, r12, lsl #2
orr r11, r11, r10, ror #26
and r10, r6, r12, lsl #8
orr r6, r11, r10, lsr #6 // permute r6 8 times ---
and r10, r7, r1 // --- permute r7 8 times
and r11, r1, r7, ror #8
orr r11, r11, r10, ror #24
and r10, r7, r12, lsl #2
orr r11, r11, r10, ror #26
and r10, r7, r12, lsl #8
orr r7, r11, r10, lsr #6 // permute r7 8 times ---
and r10, r8, r1 // --- permute r8 8 times
and r11, r1, r8, ror #8
orr r11, r11, r10, ror #24
and r10, r8, r12, lsl #2
orr r11, r11, r10, ror #26
and r10, r8, r12, lsl #8
orr r8, r11, r10, lsr #6 // permute r8 8 times ---
and r10, r9, r1 // --- permute r9 8 times
and r11, r1, r9, ror #8
orr r11, r11, r10, ror #24
and r10, r9, r12, lsl #2
orr r11, r11, r10, ror #26
and r10, r9, r12, lsl #8
orr r9, r11, r10, lsr #6 // permute r9 8 times ---
bx lr
/*******************************************************************************
* applies P^10 on the tweakey state in a bitsliced manner
*******************************************************************************/
.align 2
@ p10: applies P^10 to the bitsliced tweakey state in r6-r9.
@ Clobbers r1, r10, r11, r12.
p10:
movw r12, #0x0033
movt r12, #0x3300 //r12<- 0x33000033
movw r1, #0xcc33 //r1 <- 0x0000cc33
and r10, r6, r1, ror #8 // --- permute r6 10 times
and r11, r12, r6, ror #26
orr r11, r11, r10, ror #8
and r10, r6, r12, ror #24
orr r11, r11, r10, ror #22
and r10, r6, #0x00330000
orr r11, r11, r10, lsr #14
and r10, r6, #0x0000cc00
orr r6, r11, r10, lsr #2 // permute r6 10 times ---
and r10, r7, r1, ror #8 // --- permute r7 10 times
and r11, r12, r7, ror #26
orr r11, r11, r10, ror #8
and r10, r7, r12, ror #24
orr r11, r11, r10, ror #22
and r10, r7, #0x00330000
orr r11, r11, r10, lsr #14
and r10, r7, #0x0000cc00
orr r7, r11, r10, lsr #2 // permute r7 10 times ---
and r10, r8, r1, ror #8 // --- permute r8 10 times
and r11, r12, r8, ror #26
orr r11, r11, r10, ror #8
and r10, r8, r12, ror #24
orr r11, r11, r10, ror #22
and r10, r8, #0x00330000
orr r11, r11, r10, lsr #14
and r10, r8, #0x0000cc00
orr r8, r11, r10, lsr #2 // permute r8 10 times ---
and r10, r9, r1, ror #8 // --- permute r9 10 times
and r11, r12, r9, ror #26
orr r11, r11, r10, ror #8
and r10, r9, r12, ror #24
orr r11, r11, r10, ror #22
and r10, r9, #0x00330000
orr r11, r11, r10, lsr #14
and r10, r9, #0x0000cc00
orr r9, r11, r10, lsr #2 // permute r9 10 times ---
bx lr
/*******************************************************************************
* applies P^12 on the tweakey state in a bitsliced manner
*******************************************************************************/
.align 2
@ p12: applies P^12 to the bitsliced tweakey state in r6-r9.
@ Clobbers r1, r10, r11, r12; r14 is saved/restored via [sp].
p12:
str.w r14, [sp] //store r14 on the stack
movw r14, #0xcc33 //r14<- 0x0000cc33
movw r12, #0x00cc
movt r12, #0x00cc //r12<- 0x00cc00cc
movw r1, #0x3300
movt r1, #0xcc00 //r1 <- 0xcc003300
and r10, r14, r6, ror #8 // --- permute r6 12 times
and r11, r12, r6, ror #30
orr r11, r11, r10
and r10, r1, r6, ror #16
orr r11, r11, r10
movw r10, #0xcccc //r10<- 0x0000cccc
and r10, r6, r10, ror #8
orr r6, r11, r10, ror #10 // permute r6 12 times ---
and r10, r14, r7, ror #8 // --- permute r7 12 times
and r11, r12, r7, ror #30
orr r11, r11, r10
and r10, r1, r7, ror #16
orr r11, r11, r10
movw r10, #0xcccc //r10<- 0x0000cccc
and r10, r7, r10, ror #8
orr r7, r11, r10, ror #10 // permute r7 12 times ---
and r10, r14, r8, ror #8 // --- permute r8 12 times
and r11, r12, r8, ror #30
orr r11, r11, r10
and r10, r1, r8, ror #16
orr r11, r11, r10
movw r10, #0xcccc //r10<- 0x0000cccc
and r10, r8, r10, ror #8
orr r8, r11, r10, ror #10 // permute r8 12 times ---
and r10, r14, r9, ror #8 // --- permute r9 12 times
and r11, r12, r9, ror #30
orr r11, r11, r10
and r10, r1, r9, ror #16
ldr.w r14, [sp] //restore r14
orr r11, r11, r10
movw r10, #0xcccc //r10<- 0x0000cccc
and r10, r9, r10, ror #8
orr r9, r11, r10, ror #10 // permute r9 12 times ---
bx lr
/*******************************************************************************
* applies P^14 on the tweakey state in a bitsliced manner
*******************************************************************************/
.align 2
@ p14: applies P^14 to the bitsliced tweakey state in r6-r9.
@ Clobbers r1, r10, r11, r12.
p14:
movw r1, #0xcc00
movt r1, #0x0033 //r1 <- 0x0033cc00
movw r12, #0xcc00
movt r12, #0xcc00 //r12<- 0xcc00cc00 (original comment said 0x33003300)
and r10, r1, r6, ror #24 // --- permute r6 14 times
and r11, r6, #0x00000033
orr r11, r10, r11, ror #14
and r10, r6, #0x33000000
orr r11, r11, r10, ror #30
and r10, r6, #0x00ff0000
orr r11, r11, r10, ror #16
and r10, r6, r12
orr r6, r11, r10, ror #18 // permute r6 14 times ---
and r10, r1, r7, ror #24 // --- permute r7 14 times
and r11, r7, #0x00000033
orr r11, r10, r11, ror #14
and r10, r7, #0x33000000
orr r11, r11, r10, ror #30
and r10, r7, #0x00ff0000
orr r11, r11, r10, ror #16
and r10, r7, r12
orr r7, r11, r10, ror #18 // permute r7 14 times ---
and r10, r1, r8, ror #24 // --- permute r8 14 times
and r11, r8, #0x00000033
orr r11, r10, r11, ror #14
and r10, r8, #0x33000000
orr r11, r11, r10, ror #30
and r10, r8, #0x00ff0000
orr r11, r11, r10, ror #16
and r10, r8, r12
orr r8, r11, r10, ror #18 // permute r8 14 times ---
and r10, r1, r9, ror #24 // --- permute r9 14 times
and r11, r9, #0x00000033
orr r11, r10, r11, ror #14
and r10, r9, #0x33000000
orr r11, r11, r10, ror #30
and r10, r9, #0x00ff0000
orr r11, r11, r10, ror #16
and r10, r9, r12
orr r9, r11, r10, ror #18 // permute r9 14 times ---
bx lr
.align 2
@ packing: repacks the 128-bit state in r2-r5 into the bitsliced (fixsliced)
@ representation using a sequence of SWAPMOVE steps.
@ Caller must preload the masks r10 = 0x0a0a0a0a and r11 = 0x30303030;
@ clobbers r12. SWAPMOVE(a, b, m, n) swaps the bits of a selected by m
@ with the bits of b selected by (m << n).
packing:
eor r12, r2, r2, lsr #3
and r12, r12, r10
eor r2, r2, r12
eor r2, r2, r12, lsl #3 //SWAPMOVE(r2, r2, 0x0a0a0a0a, 3)
eor r12, r3, r3, lsr #3
and r12, r12, r10
eor r3, r3, r12
eor r3, r3, r12, lsl #3 //SWAPMOVE(r3, r3, 0x0a0a0a0a, 3)
eor r12, r4, r4, lsr #3
and r12, r12, r10
eor r4, r4, r12
eor r4, r4, r12, lsl #3 //SWAPMOVE(r4, r4, 0x0a0a0a0a, 3)
eor r12, r5, r5, lsr #3
and r12, r12, r10
eor r5, r5, r12
eor r5, r5, r12, lsl #3 //SWAPMOVE(r5, r5, 0x0a0a0a0a, 3)
eor r12, r2, r4, lsr #2
and r12, r12, r11
eor r2, r2, r12
eor r4, r4, r12, lsl #2 //SWAPMOVE(r4, r2, 0x30303030, 2)
eor r12, r2, r3, lsr #4
and r12, r12, r11, lsr #2
eor r2, r2, r12
eor r3, r3, r12, lsl #4 //SWAPMOVE(r3, r2, 0x0c0c0c0c, 4)
eor r12, r2, r5, lsr #6
and r12, r12, r11, lsr #4
eor r2, r2, r12
eor r5, r5, r12, lsl #6 //SWAPMOVE(r5, r2, 0x03030303, 6)
eor r12, r4, r3, lsr #2
and r12, r12, r11, lsr #2
eor r4, r4, r12
eor r3, r3, r12, lsl #2 //SWAPMOVE(r3, r4, 0x0c0c0c0c, 2)
eor r12, r4, r5, lsr #4
and r12, r12, r11, lsr #4
eor r4, r4, r12
eor r5, r5, r12, lsl #4 //SWAPMOVE(r5, r4, 0x03030303, 4)
eor r12, r3, r5, lsr #2
and r12, r12, r11, lsr #4
eor r3, r3, r12
eor r5, r5, r12, lsl #2 //SWAPMOVE(r5, r3, 0x03030303, 2)
bx lr
/******************************************************************************
* Compute LFSR2(TK2) ^ LFSR3(TK3) for all rounds.
* Performing both at the same time allows to save some memory accesses.
******************************************************************************/
@ void tkschedule_lfsr(u32* tk, const u8* tk2, const u8* tk3, const int rounds)
.global tkschedule_lfsr
.type tkschedule_lfsr,%function
.align 2
@ tkschedule_lfsr(u32* tk, const u8* tk2, const u8* tk3, const int rounds):
@ packs TK2 and TK3 into the bitsliced representation, then writes
@ LFSR2(TK2) ^ LFSR3(TK3) for all rounds into tk. The main loop computes
@ 8 rounds per iteration; the bitsliced LFSRs advance one slice at a time,
@ which is why the register roles rotate between the 4 unrolled sections
@ and why every other store skips 24 bytes (those slots are filled later
@ by tkschedule_perm with the permuted/rconst-added halves).
tkschedule_lfsr:
push {r0-r12, r14}
ldr.w r3, [r1, #8] //load tk2 (3rd word)
ldr.w r4, [r1, #4] //load tk2 (2nd word)
ldr.w r5, [r1, #12] //load tk2 (4th word)
ldr.w r12, [r1] //load tk2 (1st word)
mov r1, r2 //move tk3 address in r1
mov r2, r12 //move 1st tk2 word in r2
movw r10, #0x0a0a
movt r10, #0x0a0a //r10 <- 0x0a0a0a0a
movw r11, #0x3030
movt r11, #0x3030 //r11<- 0x30303030 (original comment said r7)
bl packing //pack tk2
mov r6, r2 //move tk2 from r2-r5 to r6-r9
mov r7, r3 //move tk2 from r2-r5 to r6-r9
mov r8, r4 //move tk2 from r2-r5 to r6-r9
mov r9, r5 //move tk2 from r2-r5 to r6-r9
ldr.w r3, [r1, #8] //load tk3 (3rd word)
ldr.w r4, [r1, #4] //load tk3 (2nd word)
ldr.w r5, [r1, #12] //load tk3 (4th word)
ldr.w r2, [r1] //load tk3 (1st word)
bl packing //pack tk3
eor r10, r10, r10, lsl #4 //r10<- 0xaaaaaaaa
ldr.w r1, [sp, #12] //load loop counter in r1
eor r11, r2, r6 //tk2 ^ tk3 (1st word)
eor r12, r3, r7 //tk2 ^ tk3 (2nd word)
strd r11, r12, [r0], #8 //store in tk
eor r11, r4, r8 //tk2 ^ tk3 (3rd word)
eor r12, r5, r9 //tk2 ^ tk3 (4th word)
strd r11, r12, [r0], #8 //store in tk
loop:
and r12, r8, r10 // --- apply LFSR2 to tk2
eor r12, r12, r6
and r14, r10, r12, lsl #1
and r12, r12, r10
orr r6, r14, r12, lsr #1 // apply LFSR2 to tk2 ---
and r12, r3, r10 // --- apply LFSR3 to tk3
eor r12, r5, r12, lsr #1
and r14, r10, r12, lsl #1
and r12, r12, r10
orr r5, r14, r12, lsr #1 // apply LFSR3 to tk3 ---
eor r11, r5, r7 //tk2 ^ tk3 (1st word)
eor r12, r2, r8 //tk2 ^ tk3 (2nd word)
strd r11, r12, [r0], #8 //store in tk
eor r11, r3, r9 //tk2 ^ tk3 (3rd word)
eor r12, r4, r6 //tk2 ^ tk3 (4th word)
strd r11, r12, [r0], #24 //store in tk
and r12, r9, r10 // --- apply LFSR2 to tk2
eor r12, r12, r7
and r14, r10, r12, lsl #1
and r12, r12, r10
orr r7, r14, r12, lsr #1 // apply LFSR2 to tk2 ---
and r12, r2, r10 // --- apply LFSR3 to tk3
eor r12, r4, r12, lsr #1
and r14, r10, r12, lsl #1
and r12, r12, r10
orr r4, r14, r12, lsr #1 // apply LFSR3 to tk3 ---
eor r11, r4, r8 //tk2 ^ tk3 (1st word)
eor r12, r5, r9 //tk2 ^ tk3 (2nd word)
strd r11, r12, [r0], #8 //store in tk
eor r11, r2, r6 //tk2 ^ tk3 (3rd word)
eor r12, r3, r7 //tk2 ^ tk3 (4th word)
strd r11, r12, [r0], #24 //store in tk
and r12, r6, r10 // --- apply LFSR2 to tk2
eor r12, r12, r8
and r14, r10, r12, lsl #1
and r12, r12, r10
orr r8, r14, r12, lsr #1 // apply LFSR2 to tk2 ---
and r12, r5, r10 // --- apply LFSR3 to tk3
eor r12, r3, r12, lsr #1
and r14, r10, r12, lsl #1
and r12, r12, r10
orr r3, r14, r12, lsr #1 // apply LFSR3 to tk3 ---
eor r11, r3, r9 //tk2 ^ tk3 (1st word)
eor r12, r4, r6 //tk2 ^ tk3 (2nd word)
strd r11, r12, [r0], #8 //store in tk
eor r11, r5, r7 //tk2 ^ tk3 (3rd word)
eor r12, r2, r8 //tk2 ^ tk3 (4th word)
strd r11, r12, [r0], #24 //store in tk
and r12, r7, r10 // --- apply LFSR2 to tk2
eor r12, r12, r9
and r14, r10, r12, lsl #1
and r12, r12, r10
orr r9, r14, r12, lsr #1 // apply LFSR2 to tk2 ---
and r12, r4, r10 // --- apply LFSR3 to tk3
eor r12, r2, r12, lsr #1
and r14, r10, r12, lsl #1
and r12, r12, r10
orr r2, r14, r12, lsr #1 // apply LFSR3 to tk3 ---
eor r11, r2, r6 //tk2 ^ tk3 (1st word)
eor r12, r3, r7 //tk2 ^ tk3 (2nd word)
strd r11, r12, [r0], #8 //store in tk
eor r11, r4, r8 //tk2 ^ tk3 (3rd word)
eor r12, r5, r9 //tk2 ^ tk3 (4th word)
strd r11, r12, [r0], #24 //store in tk
subs.w r1, r1, #8 //decrease loop counter by 8
bne loop
pop {r0-r12, r14}
bx lr
/******************************************************************************
* Applies the permutation P and add the round constants to all round tweakeys.
******************************************************************************/
@ void tkschedule_perm(u32* tk)
.global tkschedule_perm
.type tkschedule_perm,%function
.align 2
tkschedule_perm:
push {r0-r12, lr}
sub.w sp, #4 //to store r14 in subroutines
ldm r0, {r6-r9} //load tk
movw r10, #0xf0f0
movt r10, #0xf0f0 //r10<- 0xf0f0f0f0
and r6, r6, r10 //tk &= 0xf0f0f0f0 (1st word)
and r7, r7, r10 //tk &= 0xf0f0f0f0 (2nd word)
and r8, r8, r10 //tk &= 0xf0f0f0f0 (3rd word)
and r9, r9, r10 //tk &= 0xf0f0f0f0 (4th word)
eor r8, r8, #0x00000004 //add rconst
eor r9, r9, #0x00000040 //add rconst
mvn r9, r9 //to remove a NOT in sbox calculations
strd r8, r9, [r0], #8 //store 1st half tk for 1st round
strd r6, r7, [r0], #8 //store 2nd half tk for 1st round
ldm r0, {r6-r9} //load tk
bl p2 //apply the permutation twice
movw r10, #0xc3c3
movt r10, #0xc3c3 //r10<- 0xc3c3c3c3
and r11, r10, r6, ror #26 //ror and mask to match fixslicing
and r12, r10, r7, ror #26 //ror and mask to match fixslicing
strd r11, r12, [r0], #8 //store 1st half tk for 2nd round
and r11, r10, r8, ror #26 //ror and mask to match fixslicing
and r12, r10, r9, ror #26 //ror and mask to match fixslicing
eor r11, r11, #0x10000000 //add rconst
eor r11, r11, #0x00000100 //add rconst
eor r12, r12, #0x00000100 //add rconst
mvn r12, r12 //to save a NOT in sbox calculations
strd r11, r12, [r0], #8 //store 2nd half tk for 2nd round
and r10, r10, r10, lsr #6 //r10<- 0x03030303
and r11, r10, r6, ror #28 //--- ror and masks to match fixslicing
and r6, r6, r10, lsl #6
orr r6, r11, r6, ror #12
and r11, r10, r7, ror #28
and r7, r7, r10, lsl #6
orr r7, r11, r7, ror #12
and r11, r10, r8, ror #28
and r8, r8, r10, lsl #6
orr r8, r11, r8, ror #12
and r11, r10, r9, ror #28
and r9, r9, r10, lsl #6
orr r9, r11, r9, ror #12 //ror and masks to match fixslicing ---
eor r7, r7, #0x04000000 //add rconst
eor r8, r8, #0x44000000 //add rconst
eor r9, r9, #0x04000000 //add rconst
mvn r9, r9 //to save a NOT in sbox calculations
strd r8, r9, [r0], #8 //store 1st half tk for 3rd round
strd r6, r7, [r0], #8 //store 2nd half tk for 3rd round
ldm r0, {r6-r9} //load tk
bl p4 //apply the permutation 4 times
movw r10, #0xf0f0
movt r10, #0xf0f0 //r10<- 0xf0f0f0f0
and r11, r10, r6, ror #16 //ror and mask to match fixslicing
and r12, r10, r7, ror #16 //ror and mask to match fixslicing
eor r11, r11, #0x00400000 //add rconst
eor r12, r12, #0x00400000 //add rconst
strd r11, r12, [r0, #24] //store 2nd half tk for 5th round
and r11, r10, r8, ror #16 //ror and mask to match fixslicing
and r12, r10, r9, ror #16 //ror and mask to match fixslicing
eor r11, r11, #0x00440000 //add rconst
eor r12, r12, #0x00500000 //add rconst
mvn r12, r12 //to save a NOT in sbox calculations
strd r11, r12, [r0, #16] //store 1st half tk for 5th round
and r10, r10, r10, lsr #2 //r10<- 0x30303030
and r11, r10, r6, ror #14 //--- ror and masks to match fixslicing
and r6, r6, r10, ror #4
orr r6, r11, r6, ror #6
and r11, r10, r7, ror #14
and r7, r7, r10, ror #4
orr r7, r11, r7, ror #6
and r11, r10, r8, ror #14
and r8, r8, r10, ror #4
orr r8, r11, r8, ror #6
and r11, r10, r9, ror #14
and r9, r9, r10, ror #4
orr r9, r11, r9, ror #6 //ror and masks to match fixslicing ---
eor r6, r6, #0x00100000 //add rconst
eor r7, r7, #0x00100000 //add rconst
eor r8, r8, #0x00100000 //add rconst
eor r8, r8, #0x00000001 //add rconst
eor r9, r9, #0x00100000 //add rconst
mvn r9, r9 //to save a NOT in sbox calculations
strd r6, r7, [r0], #8 //store 1st half tk for 4th round
strd r8, r9, [r0], #24 //store 2nd half tk for 4th round
ldm r0, {r6-r9} //load tk
bl p6 //apply the permutation 6 times
movw r10, #0xc3c3
movt r10, #0xc3c3 //r10<- 0xc3c3c3c3
and r11, r10, r6, ror #10 //ror and mask to match fixslicing
and r12, r10, r7, ror #10 //ror and mask to match fixslicing
eor r11, r11, #0x01000000 //add rconst
eor r12, r12, #0x01000000 //add rconst
strd r11, r12, [r0], #8 //store 1st half tk for 6th round
and r11, r10, r8, ror #10 //ror and mask to match fixslicing
and r12, r10, r9, ror #10 //ror and mask to match fixslicing
eor r11, r11, #0x01400000 //add rconst
eor r11, r11, #0x00001000 //add rconst
eor r12, r12, #0x00400000 //add rconst
mvn r12, r12 //to save a NOT in sbox calculations
strd r11, r12, [r0], #8 //store 2nd half tk for 6th round
and r10, r10, r10, lsr #6 //r10<- 0x03030303
and r11, r10, r6, ror #12 //--- ror and masks to match fixslicing
and r6, r6, r10, lsl #6
orr r6, r11, r6, ror #28
and r11, r10, r7, ror #12
and r7, r7, r10, lsl #6
orr r7, r11, r7, ror #28
and r11, r10, r8, ror #12
and r8, r8, r10, lsl #6
orr r8, r11, r8, ror #28
and r11, r10, r9, ror #12
and r9, r9, r10, lsl #6
orr r9, r11, r9, ror #28 //ror and masks to match fixslicing ---
eor r6, r6, #0x00000400 //add rconst
eor r7, r7, #0x00000400 //add rconst
eor r8, r8, #0x01000000 //add rconst
eor r8, r8, #0x00004000 //add rconst
eor r9, r9, #0x01000000 //add rconst
eor r9, r9, #0x00000400 //add rconst
mvn r9, r9 //to save a NOT in sbox calculations
strd r8, r9, [r0], #8 //store 1st half tk for 7th round
strd r6, r7, [r0], #8 //store 2nd half tk for 7th round
ldm r0, {r6-r9} //load tk
bl p8 //apply the permutation 8 times
movw r10, #0xf0f0
movt r10, #0xf0f0 //r10<- 0xf0f0f0f0
and r11, r10, r6 //ror and mask to match fixslicing
and r12, r10, r7 //ror and mask to match fixslicing
eor r12, r12, #0x00000040 //add rconst
strd r11, r12, [r0, #24] //store 2nd half tk for 9th round
and r11, r10, r8 //ror and mask to match fixslicing
and r12, r10, r9 //ror and mask to match fixslicing
eor r11, r11, #0x00000054 //add rconst
eor r12, r12, #0x00000050 //add rconst
mvn r12, r12 //to save a NOT in sbox calculations
strd r11, r12, [r0, #16] //store 1st half tk for 9th round
and r10, r10, r10, lsr #2 //r10<- 0x30303030
and r11, r10, r6, ror #30 //--- ror and masks to match fixslicing
and r6, r6, r10, ror #4
orr r6, r11, r6, ror #22
and r11, r10, r7, ror #30
and r7, r7, r10, ror #4
orr r7, r11, r7, ror #22
and r11, r10, r8, ror #30
and r8, r8, r10, ror #4
orr r8, r11, r8, ror #22
and r11, r10, r9, ror #30
and r9, r9, r10, ror #4
orr r9, r11, r9, ror #22 //ror and masks to match fixslicing ---
eor r6 ,r6, #0x00000010
eor r8, r8, #0x00010000
eor r8, r8, #0x00000410
eor r9, r9, #0x00000410
mvn r9, r9 //to save a NOT in sbox calculations
strd r6, r7, [r0], #8 //store 1st half tk for 8th round
strd r8, r9, [r0], #24 //store 2nd half tk for 8th round
ldm r0, {r6-r9} //load tk
bl p10 //apply the permutation 10 times
movw r10, #0xc3c3
movt r10, #0xc3c3 //r10<- 0xc3c3c3c3
and r11, r10, r6, ror #26 //ror and mask to match fixslicing
and r12, r10, r7, ror #26 //ror and mask to match fixslicing
eor r11, r11, #0x00000100 //add rconst
eor r12, r12, #0x00000100 //add rconst
strd r11, r12, [r0], #8 //store 1st half tk for 10th round
and r11, r10, r8, ror #26 //ror and mask to match fixslicing
and r12, r10, r9, ror #26 //ror and mask to match fixslicing
eor r11, r11, #0x10000000 //add rconst
eor r11, r11, #0x00000140 //add rconst
eor r12, r12, #0x00000100 //add rconst
mvn r12, r12 //to save a NOT in sbox calculations
strd r11, r12, [r0], #8 //store 2nd half tk for 10th round
and r10, r10, r10, lsr #6 //r10<- 0x03030303
and r11, r10, r6, ror #28 //--- ror and masks to match fixslicing
and r6, r6, r10, lsl #6
orr r6, r11, r6, ror #12
and r11, r10, r7, ror #28
and r7, r7, r10, lsl #6
orr r7, r11, r7, ror #12
and r11, r10, r8, ror #28
and r8, r8, r10, lsl #6
orr r8, r11, r8, ror #12
and r11, r10, r9, ror #28
and r9, r9, r10, lsl #6
orr r9, r11, r9, ror #12 //ror and masks to match fixslicing ---
eor r6, r6, #0x04000000 //add rconst
eor r7, r7, #0x04000000 //add rconst
eor r8, r8, #0x44000000 //add rconst
eor r9, r9, #0x00000100 //add rconst
mvn r9, r9 //to save a NOT in sbox calculations
strd r8, r9, [r0], #8 //store 1st half tk for 11th round
strd r6, r7, [r0], #8 //store 2nd half tk for 11th round
ldm r0, {r6-r9} //load tk
bl p12 //apply the permutation 4 times
movw r10, #0xf0f0
movt r10, #0xf0f0 //r10<- 0xf0f0f0f0
and r11, r10, r6, ror #16 //ror and mask to match fixslicing
and r12, r10, r7, ror #16 //ror and mask to match fixslicing
eor r11, r11, #0x00400000 //add rconst
strd r11, r12, [r0, #24] //store 2nd half tk for 13th round
and r11, r10, r8, ror #16 //ror and mask to match fixslicing
and r12, r10, r9, ror #16 //ror and mask to match fixslicing
eor r11, r11, #0x00140000 //add rconst
eor r12, r12, #0x00500000 //add rconst
mvn r12, r12 //to save a NOT in sbox calculations
strd r11, r12, [r0, #16] //store 1st half tk for 13th round
and r10, r10, r10, lsr #2 //r10<- 0x30303030
and r11, r10, r6, ror #14 //--- ror and masks to match fixslicing
and r6, r6, r10, ror #4
orr r6, r11, r6, ror #6
and r11, r10, r7, ror #14
and r7, r7, r10, ror #4
orr r7, r11, r7, ror #6
and r11, r10, r8, ror #14
and r8, r8, r10, ror #4
orr r8, r11, r8, ror #6
and r11, r10, r9, ror #14
and r9, r9, r10, ror #4
orr r9, r11, r9, ror #6 //ror and masks to match fixslicing ---
eor r6, r6, #0x00100000 //add rconst
eor r7, r7, #0x00100000 //add rconst
eor r8, r8, #0x04000000 //add rconst
eor r8, r8, #0x00000001 //add rconst
eor r9, r9, #0x04000000 //add rconst
mvn r9, r9 //to save a NOT in sbox calculations
strd r6, r7, [r0], #8 //store 1st half tk for 12th round
strd r8, r9, [r0], #24 //store 2nd half tk for 12th round
ldm r0, {r6-r9} //load tk
bl p14 //apply the permutation 6 times
movw r10, #0xc3c3
movt r10, #0xc3c3 //r10<- 0xc3c3c3c3
and r11, r10, r6, ror #10 //ror and mask to match fixslicing
and r12, r10, r7, ror #10 //ror and mask to match fixslicing
strd r11, r12, [r0], #8 //store 1st half tk for 14th round
and r11, r10, r8, ror #10 //ror and mask to match fixslicing
and r12, r10, r9, ror #10 //ror and mask to match fixslicing
eor r11, r11, #0x01400000 //add rconst
eor r11, r11, #0x00001000 //add rconst
eor r12, r12, #0x01400000 //add rconst
mvn r12, r12 //to save a NOT in sbox calculations
strd r11, r12, [r0], #8 //store 2nd half tk for 14th round
and r10, r10, r10, lsr #6 //r10<- 0x03030303
and r11, r10, r6, ror #12 //--- ror and masks to match fixslicing
and r6, r6, r10, lsl #6
orr r6, r11, r6, ror #28
and r11, r10, r7, ror #12
and r7, r7, r10, lsl #6
orr r7, r11, r7, ror #28
and r11, r10, r8, ror #12
and r8, r8, r10, lsl #6
orr r8, r11, r8, ror #28
and r11, r10, r9, ror #12
and r9, r9, r10, lsl #6
orr r9, r11, r9, ror #28 //ror and masks to match fixslicing ---
eor r7, r7, #0x00000400 //add rconst
eor r8, r8, #0x01000000 //add rconst
eor r8, r8, #0x00004400 //add rconst
eor r9, r9, #0x00000400 //add const
mvn r9, r9 //to save a NOT in sbox calculations
strd r8, r9, [r0], #8 //store 1st half tk for 15th round
strd r6, r7, [r0], #8 //store 2nd half tk for 15th round
ldm r0, {r6-r9} //load tk
movw r10, #0xf0f0
movt r10, #0xf0f0 //r10<- 0xf0f0f0f0
and r11, r10, r6 //ror and mask to match fixslicing
and r12, r10, r7 //ror and mask to match fixslicing
eor r11, r11, #0x00000040 //add rconst
eor r12, r12, #0x00000040 //add rconst
strd r11, r12, [r0, #24] //store 2nd half tk for 17th round
and r11, r10, r8 //ror and mask to match fixslicing
and r12, r10, r9 //ror and mask to match fixslicing
eor r11, r11, #0x00000004 //add rconst
eor r12, r12, #0x00000050 //add rconst
mvn r12, r12 //to save a NOT in sbox calculations
strd r11, r12, [r0, #16] //store 1st half tk for 17th round
and r10, r10, r10, lsr #2 //r10<- 0x30303030
and r11, r10, r6, ror #30 //--- ror and masks to match fixslicing
and r6, r6, r10, ror #4
orr r6, r11, r6, ror #22
and r11, r10, r7, ror #30
and r7, r7, r10, ror #4
orr r7, r11, r7, ror #22
and r11, r10, r8, ror #30
and r8, r8, r10, ror #4
orr r8, r11, r8, ror #22
and r11, r10, r9, ror #30
and r9, r9, r10, ror #4
orr r9, r11, r9, ror #22 //ror and masks to match fixslicing ---
eor r6 ,r6, #0x00000010
eor r7 ,r7, #0x00000010
eor r8, r8, #0x00000010
eor r8, r8, #0x00010000
mvn r9, r9 //to save a NOT in sbox calculations
strd r6, r7, [r0], #8 //store 1st half tk for 16th round
strd r8, r9, [r0], #24 //store 2nd half tk for 16th round
ldm r0, {r6-r9} //load tk
bl p2 //apply the permutation twice
movw r10, #0xc3c3
movt r10, #0xc3c3 //r10<- 0xc3c3c3c3
and r11, r10, r6, ror #26 //ror and mask to match fixslicing
and r12, r10, r7, ror #26 //ror and mask to match fixslicing
eor r11, r11, #0x00000100 //add rconst
strd r11, r12, [r0], #8 //store 1st half tk for 18th round
and r11, r10, r8, ror #26 //ror and mask to match fixslicing
and r12, r10, r9, ror #26 //ror and mask to match fixslicing
eor r11, r11, #0x10000000 //add rconst
eor r11, r11, #0x00000140 //add rconst
eor r12, r12, #0x00000040 //add rconst
mvn r12, r12 //to save a NOT in sbox calculations
strd r11, r12, [r0], #8 //store 2nd half tk for 18th round
and r10, r10, r10, lsr #6 //r10<- 0x03030303
and r11, r10, r6, ror #28 //--- ror and masks to match fixslicing
and r6, r6, r10, lsl #6
orr r6, r11, r6, ror #12
and r11, r10, r7, ror #28
and r7, r7, r10, lsl #6
orr r7, r11, r7, ror #12
and r11, r10, r8, ror #28
and r8, r8, r10, lsl #6
orr r8, r11, r8, ror #12
and r11, r10, r9, ror #28
and r9, r9, r10, lsl #6
orr r9, r11, r9, ror #12 //ror and masks to match fixslicing ---
eor r7, r7, #0x04000000 //add rconst
eor r8, r8, #0x40000000 //add rconst
eor r8, r8, #0x00000100 //add rconst
eor r9, r9, #0x04000000 //add rconst
eor r9, r9, #0x00000100 //add rconst
mvn r9, r9 //to save a NOT in sbox calculations
strd r8, r9, [r0], #8 //store 1st half tk for 19th round
strd r6, r7, [r0], #8 //store 2nd half tk for 19th round
ldm r0, {r6-r9} //load tk
bl p4 //apply the permutation 4 times
movw r10, #0xf0f0
movt r10, #0xf0f0 //r10<- 0xf0f0f0f0
and r11, r10, r6, ror #16 //ror and mask to match fixslicing
and r12, r10, r7, ror #16 //ror and mask to match fixslicing
eor r12, r12, #0x00400000 //add rconst
strd r11, r12, [r0, #24] //store 2nd half tk for 21th round
and r11, r10, r8, ror #16 //ror and mask to match fixslicing
and r12, r10, r9, ror #16 //ror and mask to match fixslicing
eor r11, r11, #0x00440000 //add rconst
eor r12, r12, #0x00100000 //add rconst
mvn r12, r12 //to save a NOT in sbox calculations
strd r11, r12, [r0, #16] //store 1st half tk for 21th round
and r10, r10, r10, lsr #2 //r10<- 0x30303030
and r11, r10, r6, ror #14 //--- ror and masks to match fixslicing
and r6, r6, r10, ror #4
orr r6, r11, r6, ror #6
and r11, r10, r7, ror #14
and r7, r7, r10, ror #4
orr r7, r11, r7, ror #6
and r11, r10, r8, ror #14
and r8, r8, r10, ror #4
orr r8, r11, r8, ror #6
and r11, r10, r9, ror #14
and r9, r9, r10, ror #4
orr r9, r11, r9, ror #6 //ror and masks to match fixslicing ---
eor r6, r6, #0x00100000 //add rconst
eor r8, r8, #0x04100000 //add rconst
eor r8, r8, #0x00000001 //add rconst
eor r9, r9, #0x00100000 //add rconst
mvn r9, r9 //to save a NOT in sbox calculations
strd r6, r7, [r0], #8 //store 1st half tk for 20th round
strd r8, r9, [r0], #24 //store 2nd half tk for 20th round
ldm r0, {r6-r9} //load tk
bl p6 //apply the permutation 6 times
movw r10, #0xc3c3
movt r10, #0xc3c3 //r10<- 0xc3c3c3c3
and r11, r10, r6, ror #10 //ror and mask to match fixslicing
and r12, r10, r7, ror #10 //ror and mask to match fixslicing
eor r11, r11, #0x01000000 //add rconst
eor r12, r12, #0x01000000 //add rconst
strd r11, r12, [r0], #8 //store 1st half tk for 22th round
and r11, r10, r8, ror #10 //ror and mask to match fixslicing
and r12, r10, r9, ror #10 //ror and mask to match fixslicing
eor r11, r11, #0x00400000 //add rconst
eor r11, r11, #0x00001000 //add rconst
mvn r12, r12 //to save a NOT in sbox calculations
strd r11, r12, [r0], #8 //store 2nd half tk for 22th round
and r10, r10, r10, lsr #6 //r10<- 0x03030303
and r11, r10, r6, ror #12 //--- ror and masks to match fixslicing
and r6, r6, r10, lsl #6
orr r6, r11, r6, ror #28
and r11, r10, r7, ror #12
and r7, r7, r10, lsl #6
orr r7, r11, r7, ror #28
and r11, r10, r8, ror #12
and r8, r8, r10, lsl #6
orr r8, r11, r8, ror #28
and r11, r10, r9, ror #12
and r9, r9, r10, lsl #6
orr r9, r11, r9, ror #28 //ror and masks to match fixslicing ---
eor r6, r6, #0x00000400 //add rconst
eor r8, r8, #0x00004000 //add rconst
eor r9, r9, #0x01000000 //add rconst
mvn r9, r9 //to save a NOT in sbox calculations
strd r8, r9, [r0], #8 //store 1st half tk for 23th round
strd r6, r7, [r0], #8 //store 2nd half tk for 23th round
ldm r0, {r6-r9} //load tk
bl p8 //apply the permutation 8 times
movw r10, #0xf0f0
movt r10, #0xf0f0 //r10<- 0xf0f0f0f0
and r11, r10, r6 //ror and mask to match fixslicing
and r12, r10, r7 //ror and mask to match fixslicing
strd r11, r12, [r0, #24] //store 2nd half tk for 25th round
and r11, r10, r8 //ror and mask to match fixslicing
and r12, r10, r9 //ror and mask to match fixslicing
eor r11, r11, #0x00000014 //add rconst
eor r12, r12, #0x00000040 //add rconst
mvn r12, r12 //to save a NOT in sbox calculations
strd r11, r12, [r0, #16] //store 1st half tk for 25th round
and r10, r10, r10, lsr #2 //r10<- 0x30303030
and r11, r10, r6, ror #30 //--- ror and masks to match fixslicing
and r6, r6, r10, ror #4
orr r6, r11, r6, ror #22
and r11, r10, r7, ror #30
and r7, r7, r10, ror #4
orr r7, r11, r7, ror #22
and r11, r10, r8, ror #30
and r8, r8, r10, ror #4
orr r8, r11, r8, ror #22
and r11, r10, r9, ror #30
and r9, r9, r10, ror #4
orr r9, r11, r9, ror #22 //ror and masks to match fixslicing ---
eor r8, r8, #0x00010400
eor r9, r9, #0x00000400
mvn r9, r9 //to save a NOT in sbox calculations
strd r6, r7, [r0], #8 //store 1st half tk for 24th round
strd r8, r9, [r0], #24 //store 2nd half tk for 24th round
ldm r0, {r6-r9} //load tk
bl p10 //apply the permutation 10 times
movw r10, #0xc3c3
movt r10, #0xc3c3 //r10<- 0xc3c3c3c3
and r11, r10, r6, ror #26 //ror and mask to match fixslicing
and r12, r10, r7, ror #26 //ror and mask to match fixslicing
strd r11, r12, [r0], #8 //store 1st half tk for 26th round
and r11, r10, r8, ror #26 //ror and mask to match fixslicing
and r12, r10, r9, ror #26 //ror and mask to match fixslicing
eor r11, r11, #0x10000000 //add rconst
eor r11, r11, #0x00000100 //add rconst
mvn r12, r12 //to save a NOT in sbox calculations
strd r11, r12, [r0], #8 //store 2nd half tk for 26th round
and r10, r10, r10, lsr #6 //r10<- 0x03030303
and r11, r10, r6, ror #28 //--- ror and masks to match fixslicing
and r6, r6, r10, lsl #6
orr r6, r11, r6, ror #12
and r11, r10, r7, ror #28
and r7, r7, r10, lsl #6
orr r7, r11, r7, ror #12
and r11, r10, r8, ror #28
and r8, r8, r10, lsl #6
orr r8, r11, r8, ror #12
and r11, r10, r9, ror #28
and r9, r9, r10, lsl #6
orr r9, r11, r9, ror #12 //ror and masks to match fixslicing ---
eor r7, r7, #0x04000000 //add rconst
eor r8, r8, #0x40000000 //add rconst
eor r9, r9, #0x04000000 //add rconst
mvn r9, r9 //to save a NOT in sbox calculations
strd r8, r9, [r0], #8 //store 1st half tk for 27th round
strd r6, r7, [r0], #8 //store 2nd half tk for 27th round
ldm r0, {r6-r9} //load tk
bl p12 //apply the permutation 4 times
movw r10, #0xf0f0
movt r10, #0xf0f0 //r10<- 0xf0f0f0f0
and r11, r10, r6, ror #16 //ror and mask to match fixslicing
and r12, r10, r7, ror #16 //ror and mask to match fixslicing
eor r12, r12, #0x00400000 //add rconst
strd r11, r12, [r0, #24] //store 2nd half tk for 29th round
and r11, r10, r8, ror #16 //ror and mask to match fixslicing
and r12, r10, r9, ror #16 //ror and mask to match fixslicing
eor r11, r11, #0x00440000 //add rconst
eor r12, r12, #0x00500000 //add rconst
mvn r12, r12 //to save a NOT in sbox calculations
strd r11, r12, [r0, #16] //store 1st half tk for 29th round
and r10, r10, r10, lsr #2 //r10<- 0x30303030
and r11, r10, r6, ror #14 //--- ror and masks to match fixslicing
and r6, r6, r10, ror #4
orr r6, r11, r6, ror #6
and r11, r10, r7, ror #14
and r7, r7, r10, ror #4
orr r7, r11, r7, ror #6
and r11, r10, r8, ror #14
and r8, r8, r10, ror #4
orr r8, r11, r8, ror #6
and r11, r10, r9, ror #14
and r9, r9, r10, ror #4
orr r9, r11, r9, ror #6 //ror and masks to match fixslicing ---
eor r6, r6, #0x00100000 //add rconst
eor r8, r8, #0x00100000 //add rconst
eor r8, r8, #0x00000001 //add rconst
eor r9, r9, #0x00100000 //add rconst
mvn r9, r9 //to save a NOT in sbox calculations
strd r6, r7, [r0], #8 //store 1st half tk for 28th round
strd r8, r9, [r0], #24 //store 2nd half tk for 28th round
ldm r0, {r6-r9} //load tk
bl p14 //apply the permutation 6 times
movw r10, #0xc3c3
movt r10, #0xc3c3 //r10<- 0xc3c3c3c3
and r11, r10, r6, ror #10 //ror and mask to match fixslicing
and r12, r10, r7, ror #10 //ror and mask to match fixslicing
eor r11, r11, #0x01000000 //add rconst
eor r12, r12, #0x01000000 //add rconst
strd r11, r12, [r0], #8 //store 1st half tk for 30th round
and r11, r10, r8, ror #10 //ror and mask to match fixslicing
and r12, r10, r9, ror #10 //ror and mask to match fixslicing
eor r11, r11, #0x01400000 //add rconst
eor r11, r11, #0x00001000 //add rconst
mvn r12, r12 //to save a NOT in sbox calculations
strd r11, r12, [r0], #8 //store 2nd half tk for 30th round
and r10, r10, r10, lsr #6 //r10<- 0x03030303
and r11, r10, r6, ror #12 //--- ror and masks to match fixslicing
and r6, r6, r10, lsl #6
orr r6, r11, r6, ror #28
and r11, r10, r7, ror #12
and r7, r7, r10, lsl #6
orr r7, r11, r7, ror #28
and r11, r10, r8, ror #12
and r8, r8, r10, lsl #6
orr r8, r11, r8, ror #28
and r11, r10, r9, ror #12
and r9, r9, r10, lsl #6
orr r9, r11, r9, ror #28 //ror and masks to match fixslicing ---
eor r6, r6, #0x00000400 //add rconst
eor r7, r7, #0x00000400 //add rconst
eor r8, r8, #0x00004000 //add rconst
eor r9, r9, #0x01000000 //add rconst
mvn r9, r9 //to save a NOT in sbox calculations
strd r8, r9, [r0], #8 //store 1st half tk for 31th round
strd r6, r7, [r0], #8 //store 2nd half tk for 31th round
ldm r0, {r6-r9} //load tk
movw r10, #0xf0f0
movt r10, #0xf0f0 //r10<- 0xf0f0f0f0
and r11, r10, r6 //ror and mask to match fixslicing
and r12, r10, r7 //ror and mask to match fixslicing
strd r11, r12, [r0, #24] //store 2nd half tk for 33th round
and r11, r10, r8 //ror and mask to match fixslicing
and r12, r10, r9 //ror and mask to match fixslicing
eor r11, r11, #0x00000014 //add rconst
eor r12, r12, #0x00000050 //add rconst
mvn r12, r12 //to save a NOT in sbox calculations
strd r11, r12, [r0, #16] //store 1st half tk for 33th round
and r10, r10, r10, lsr #2 //r10<- 0x30303030
and r11, r10, r6, ror #30 //--- ror and masks to match fixslicing
and r6, r6, r10, ror #4
orr r6, r11, r6, ror #22
and r11, r10, r7, ror #30
and r7, r7, r10, ror #4
orr r7, r11, r7, ror #22
and r11, r10, r8, ror #30
and r8, r8, r10, ror #4
orr r8, r11, r8, ror #22
and r11, r10, r9, ror #30
and r9, r9, r10, ror #4
orr r9, r11, r9, ror #22 //ror and masks to match fixslicing ---
eor r6 ,r6, #0x00000010
eor r8, r8, #0x00010400
eor r9, r9, #0x00000400
mvn r9, r9 //to save a NOT in sbox calculations
strd r6, r7, [r0], #8 //store 1st half tk for 32th round
strd r8, r9, [r0], #24 //store 2nd half tk for 32th round
ldm r0, {r6-r9} //load tk
bl p2 //apply the permutation twice
movw r10, #0xc3c3
movt r10, #0xc3c3 //r10<- 0xc3c3c3c3
and r11, r10, r6, ror #26 //ror and mask to match fixslicing
and r12, r10, r7, ror #26 //ror and mask to match fixslicing
strd r11, r12, [r0], #8 //store 1st half tk for 34th round
and r11, r10, r8, ror #26 //ror and mask to match fixslicing
and r12, r10, r9, ror #26 //ror and mask to match fixslicing
eor r11, r11, #0x10000000 //add rconst
eor r11, r11, #0x00000140 //add rconst
eor r12, r12, #0x00000100 //add rconst
mvn r12, r12 //to save a NOT in sbox calculations
strd r11, r12, [r0], #8 //store 2nd half tk for 34th round
and r10, r10, r10, lsr #6 //r10<- 0x03030303
and r11, r10, r6, ror #28 //--- ror and masks to match fixslicing
and r6, r6, r10, lsl #6
orr r6, r11, r6, ror #12
and r11, r10, r7, ror #28
and r7, r7, r10, lsl #6
orr r7, r11, r7, ror #12
and r11, r10, r8, ror #28
and r8, r8, r10, lsl #6
orr r8, r11, r8, ror #12
and r11, r10, r9, ror #28
and r9, r9, r10, lsl #6
orr r9, r11, r9, ror #12 //ror and masks to match fixslicing ---
eor r7, r7, #0x04000000 //add rconst
eor r8, r8, #0x44000000 //add rconst
mvn r9, r9 //to save a NOT in sbox calculations
strd r8, r9, [r0], #8 //store 1st half tk for 35th round
strd r6, r7, [r0], #8 //store 2nd half tk for 35th round
ldm r0, {r6-r9} //load tk
bl p4 //apply the permutation 4 times
movw r10, #0xf0f0
movt r10, #0xf0f0 //r10<- 0xf0f0f0f0
and r11, r10, r6, ror #16 //ror and mask to match fixslicing
and r12, r10, r7, ror #16 //ror and mask to match fixslicing
eor r11, r11, #0x00400000 //add rconst
strd r11, r12, [r0, #24] //store 2nd half tk for 37th round
and r11, r10, r8, ror #16 //ror and mask to match fixslicing
and r12, r10, r9, ror #16 //ror and mask to match fixslicing
eor r11, r11, #0x00440000 //add rconst
eor r12, r12, #0x00500000 //add rconst
mvn r12, r12 //to save a NOT in sbox calculations
strd r11, r12, [r0, #16] //store 1st half tk for 37th round
and r10, r10, r10, lsr #2 //r10<- 0x30303030
and r11, r10, r6, ror #14 //--- ror and masks to match fixslicing
and r6, r6, r10, ror #4
orr r6, r11, r6, ror #6
and r11, r10, r7, ror #14
and r7, r7, r10, ror #4
orr r7, r11, r7, ror #6
and r11, r10, r8, ror #14
and r8, r8, r10, ror #4
orr r8, r11, r8, ror #6
and r11, r10, r9, ror #14
and r9, r9, r10, ror #4
orr r9, r11, r9, ror #6 //ror and masks to match fixslicing ---
eor r6, r6, #0x00100000 //add rconst
eor r7, r7, #0x00100000 //add rconst
eor r8, r8, #0x00000001 //add rconst
eor r9, r9, #0x00100000 //add rconst
mvn r9, r9 //to save a NOT in sbox calculations
strd r6, r7, [r0], #8 //store 1st half tk for 36th round
strd r8, r9, [r0], #24 //store 2nd half tk for 36th round
ldm r0, {r6-r9} //load tk
bl p6 //apply the permutation 6 times
movw r10, #0xc3c3
movt r10, #0xc3c3 //r10<- 0xc3c3c3c3
and r11, r10, r6, ror #10 //ror and mask to match fixslicing
and r12, r10, r7, ror #10 //ror and mask to match fixslicing
eor r12, r12, #0x01000000 //add rconst
strd r11, r12, [r0], #8 //store 1st half tk for 38th round
and r11, r10, r8, ror #10 //ror and mask to match fixslicing
and r12, r10, r9, ror #10 //ror and mask to match fixslicing
eor r11, r11, #0x01400000 //add rconst
eor r11, r11, #0x00001000 //add rconst
eor r12, r12, #0x00400000 //add rconst
mvn r12, r12 //to save a NOT in sbox calculations
strd r11, r12, [r0], #8 //store 2nd half tk for 38th round
and r10, r10, r10, lsr #6 //r10<- 0x03030303
and r11, r10, r6, ror #12 //--- ror and masks to match fixslicing
and r6, r6, r10, lsl #6
orr r6, r11, r6, ror #28
and r11, r10, r7, ror #12
and r7, r7, r10, lsl #6
orr r7, r11, r7, ror #28
and r11, r10, r8, ror #12
and r8, r8, r10, lsl #6
orr r8, r11, r8, ror #28
and r11, r10, r9, ror #12
and r9, r9, r10, lsl #6
orr r9, r11, r9, ror #28 //ror and masks to match fixslicing ---
eor r6, r6, #0x00000400 //add rconst
eor r7, r7, #0x00000400 //add rconst
eor r8, r8, #0x01000000
eor r8, r8, #0x00004000 //add rconst
eor r9, r9, #0x00000400 //add rconst
mvn r9, r9 //to save a NOT in sbox calculations
strd r8, r9, [r0], #8 //store 1st half tk for 39th round
strd r6, r7, [r0], #8 //store 2nd half tk for 39th round
ldm r0, {r6-r9} //load tk
bl p8 //apply the permutation 8 times
movw r10, #0xf0f0
movt r10, #0xf0f0 //r10<- 0xf0f0f0f0
and r11, r10, r6 //ror and mask to match fixslicing
and r12, r10, r7 //ror and mask to match fixslicing
eor r12, r12, #0x00000040 //add rconst
strd r11, r12, [r0, #24] //store 2nd half tk for 41th round
and r11, r10, r8 //ror and mask to match fixslicing
and r12, r10, r9 //ror and mask to match fixslicing
eor r11, r11, #0x00000014 //add rconst
eor r12, r12, #0x00000010 //add rconst
mvn r12, r12 //to save a NOT in sbox calculations
strd r11, r12, [r0, #16] //store 1st half tk for 41th round
and r10, r10, r10, lsr #2 //r10<- 0x30303030
and r11, r10, r6, ror #30 //--- ror and masks to match fixslicing
and r6, r6, r10, ror #4
orr r6, r11, r6, ror #22
and r11, r10, r7, ror #30
and r7, r7, r10, ror #4
orr r7, r11, r7, ror #22
and r11, r10, r8, ror #30
and r8, r8, r10, ror #4
orr r8, r11, r8, ror #22
and r11, r10, r9, ror #30
and r9, r9, r10, ror #4
orr r9, r11, r9, ror #22 //ror and masks to match fixslicing ---
eor r6, r6, #0x00000010
eor r8, r8, #0x00010000
eor r8, r8, #0x00000010
eor r9, r9, #0x00000400
mvn r9, r9 //to save a NOT in sbox calculations
strd r6, r7, [r0], #8 //store 1st half tk for 40th round
strd r8, r9, [r0], #24 //store 2nd half tk for 40th round
ldm r0, {r6-r9} //load tk
bl p10 //apply the permutation 10 times
movw r10, #0xc3c3
movt r10, #0xc3c3 //r10<- 0xc3c3c3c3
and r11, r10, r6, ror #26 //ror and mask to match fixslicing
and r12, r10, r7, ror #26 //ror and mask to match fixslicing
eor r11, r11, #0x00000100 //add rconst
strd r11, r12, [r0], #8 //store 1st half tk for 42th round
and r11, r10, r8, ror #26 //ror and mask to match fixslicing
and r12, r10, r9, ror #26 //ror and mask to match fixslicing
eor r11, r11, #0x10000000 //add rconst
eor r11, r11, #0x00000040 //add rconst
eor r12, r12, #0x00000100 //add rconst
mvn r12, r12 //to save a NOT in sbox calculations
strd r11, r12, [r0], #8 //store 2nd half tk for 42th round
and r10, r10, r10, lsr #6 //r10<- 0x03030303
and r11, r10, r6, ror #28 //--- ror and masks to match fixslicing
and r6, r6, r10, lsl #6
orr r6, r11, r6, ror #12
and r11, r10, r7, ror #28
and r7, r7, r10, lsl #6
orr r7, r11, r7, ror #12
and r11, r10, r8, ror #28
and r8, r8, r10, lsl #6
orr r8, r11, r8, ror #12
and r11, r10, r9, ror #28
and r9, r9, r10, lsl #6
orr r9, r11, r9, ror #12 //ror and masks to match fixslicing ---
eor r8, r8, #0x44000000 //add rconst
eor r9, r9, #0x00000100 //add rconst
mvn r9, r9 //to save a NOT in sbox calculations
strd r8, r9, [r0], #8 //store 1st half tk for 43th round
strd r6, r7, [r0], #8 //store 2nd half tk for 43th round
ldm r0, {r6-r9} //load tk
bl p12 //apply the permutation 4 times
movw r10, #0xf0f0
movt r10, #0xf0f0 //r10<- 0xf0f0f0f0
and r11, r10, r6, ror #16 //ror and mask to match fixslicing
and r12, r10, r7, ror #16 //ror and mask to match fixslicing
eor r11, r11, #0x00400000 //add rconst
strd r11, r12, [r0, #24] //store 2nd half tk for 45th round
and r11, r10, r8, ror #16 //ror and mask to match fixslicing
and r12, r10, r9, ror #16 //ror and mask to match fixslicing
eor r11, r11, #0x00040000 //add rconst
mvn r12, r12 //to save a NOT in sbox calculations
strd r11, r12, [r0, #16] //store 1st half tk for 45th round
and r10, r10, r10, lsr #2 //r10<- 0x30303030
and r11, r10, r6, ror #14 //--- ror and masks to match fixslicing
and r6, r6, r10, ror #4
orr r6, r11, r6, ror #6
and r11, r10, r7, ror #14
and r7, r7, r10, ror #4
orr r7, r11, r7, ror #6
and r11, r10, r8, ror #14
and r8, r8, r10, ror #4
orr r8, r11, r8, ror #6
and r11, r10, r9, ror #14
and r9, r9, r10, ror #4
orr r9, r11, r9, ror #6 //ror and masks to match fixslicing ---
eor r7, r7, #0x00100000 //add rconst
eor r8, r8, #0x04000000 //add rconst
eor r8, r8, #0x00000001 //add rconst
mvn r9, r9 //to save a NOT in sbox calculations
strd r6, r7, [r0], #8 //store 1st half tk for 44th round
strd r8, r9, [r0], #24 //store 2nd half tk for 44th round
ldm r0, {r6-r9} //load tk
bl p14 //apply the permutation 6 times
movw r10, #0xc3c3
movt r10, #0xc3c3 //r10<- 0xc3c3c3c3
and r11, r10, r6, ror #10 //ror and mask to match fixslicing
and r12, r10, r7, ror #10 //ror and mask to match fixslicing
strd r11, r12, [r0], #8 //store 1st half tk for 46th round
and r11, r10, r8, ror #10 //ror and mask to match fixslicing
and r12, r10, r9, ror #10 //ror and mask to match fixslicing
eor r11, r11, #0x00001000 //add rconst
eor r12, r12, #0x01400000 //add rconst
mvn r12, r12 //to save a NOT in sbox calculations
strd r11, r12, [r0], #8 //store 2nd half tk for 46th round
and r10, r10, r10, lsr #6 //r10<- 0x03030303
and r11, r10, r6, ror #12 //--- ror and masks to match fixslicing
and r6, r6, r10, lsl #6
orr r6, r11, r6, ror #28
and r11, r10, r7, ror #12
and r7, r7, r10, lsl #6
orr r7, r11, r7, ror #28
and r11, r10, r8, ror #12
and r8, r8, r10, lsl #6
orr r8, r11, r8, ror #28
and r11, r10, r9, ror #12
and r9, r9, r10, lsl #6
orr r9, r11, r9, ror #28 //ror and masks to match fixslicing ---
eor r8, r8, #0x01000000 //add rconst
eor r8, r8, #0x00004400 //add rconst
mvn r9, r9 //to save a NOT in sbox calculations
strd r8, r9, [r0], #8 //store 1st half tk for 47th round
strd r6, r7, [r0], #8 //store 2nd half tk for 47th round
ldm r0, {r6-r9} //load tk
movw r10, #0xf0f0
movt r10, #0xf0f0 //r10<- 0xf0f0f0f0
and r11, r10, r6 //ror and mask to match fixslicing
and r12, r10, r7 //ror and mask to match fixslicing
eor r11, r11, #0x00000040 //add rconst
strd r11, r12, [r0, #24] //store 2nd half tk for 49th round
and r11, r10, r8 //ror and mask to match fixslicing
and r12, r10, r9 //ror and mask to match fixslicing
eor r11, r11, #0x00000004 //add rconst
eor r12, r12, #0x00000040 //add rconst
mvn r12, r12 //to save a NOT in sbox calculations
strd r11, r12, [r0, #16] //store 1st half tk for 49th round
and r10, r10, r10, lsr #2 //r10<- 0x30303030
and r11, r10, r6, ror #30 //--- ror and masks to match fixslicing
and r6, r6, r10, ror #4
orr r6, r11, r6, ror #22
and r11, r10, r7, ror #30
and r7, r7, r10, ror #4
orr r7, r11, r7, ror #22
and r11, r10, r8, ror #30
and r8, r8, r10, ror #4
orr r8, r11, r8, ror #22
and r11, r10, r9, ror #30
and r9, r9, r10, ror #4
orr r9, r11, r9, ror #22 //ror and masks to match fixslicing ---
eor r7 ,r7, #0x00000010
eor r8, r8, #0x00010000
mvn r9, r9 //to save a NOT in sbox calculations
strd r6, r7, [r0], #8 //store 1st half tk for 48th round
strd r8, r9, [r0], #24 //store 2nd half tk for 48th round
ldm r0, {r6-r9} //load tk
bl p2 //apply the permutation twice
movw r10, #0xc3c3
movt r10, #0xc3c3 //r10<- 0xc3c3c3c3
and r11, r10, r6, ror #26 //ror and mask to match fixslicing
and r12, r10, r7, ror #26 //ror and mask to match fixslicing
strd r11, r12, [r0], #8 //store 1st half tk for 50th round
and r11, r10, r8, ror #26 //ror and mask to match fixslicing
and r12, r10, r9, ror #26 //ror and mask to match fixslicing
eor r11, r11, #0x10000000 //add rconst
eor r11, r11, #0x00000100 //add rconst
eor r12, r12, #0x00000140 //add rconst
mvn r12, r12 //to save a NOT in sbox calculations
strd r11, r12, [r0], #8 //store 2nd half tk for 50th round
and r10, r10, r10, lsr #6 //r10<- 0x03030303
and r11, r10, r6, ror #28 //--- ror and masks to match fixslicing
and r6, r6, r10, lsl #6
orr r6, r11, r6, ror #12
and r11, r10, r7, ror #28
and r7, r7, r10, lsl #6
orr r7, r11, r7, ror #12
and r11, r10, r8, ror #28
and r8, r8, r10, lsl #6
orr r8, r11, r8, ror #12
and r11, r10, r9, ror #28
and r9, r9, r10, lsl #6
orr r9, r11, r9, ror #12 //ror and masks to match fixslicing ---
eor r7, r7, #0x04000000 //add rconst
eor r8, r8, #0x44000000 //add rconst
eor r8, r8, #0x00000100 //add rconst
mvn r9, r9 //to save a NOT in sbox calculations
strd r8, r9, [r0], #8 //store 1st half tk for 51th round
strd r6, r7, [r0], #8 //store 2nd half tk for 51th round
ldm r0, {r6-r9} //load tk
bl p4 //apply the permutation 4 times
movw r10, #0xf0f0
movt r10, #0xf0f0 //r10<- 0xf0f0f0f0
and r11, r10, r6, ror #16 //ror and mask to match fixslicing
and r12, r10, r7, ror #16 //ror and mask to match fixslicing
eor r11, r11, #0x00400000 //add rconst
strd r11, r12, [r0, #24] //store 2nd half tk for 53th round
and r11, r10, r8, ror #16 //ror and mask to match fixslicing
and r12, r10, r9, ror #16 //ror and mask to match fixslicing
eor r11, r11, #0x00040000 //add rconst
eor r12, r12, #0x00500000 //add rconst
mvn r12, r12 //to save a NOT in sbox calculations
strd r11, r12, [r0, #16] //store 1st half tk for 53th round
and r10, r10, r10, lsr #2 //r10<- 0x30303030
and r11, r10, r6, ror #14 //--- ror and masks to match fixslicing
and r6, r6, r10, ror #4
orr r6, r11, r6, ror #6
and r11, r10, r7, ror #14
and r7, r7, r10, ror #4
orr r7, r11, r7, ror #6
and r11, r10, r8, ror #14
and r8, r8, r10, ror #4
orr r8, r11, r8, ror #6
and r11, r10, r9, ror #14
and r9, r9, r10, ror #4
orr r9, r11, r9, ror #6 //ror and masks to match fixslicing ---
eor r6, r6, #0x00100000 //add rconst
eor r7, r7, #0x00100000 //add rconst
eor r8, r8, #0x00000001 //add rconst
mvn r9, r9 //to save a NOT in sbox calculations
strd r6, r7, [r0], #8 //store 1st half tk for 52th round
strd r8, r9, [r0], #24 //store 2nd half tk for 52th round
ldm r0, {r6-r9} //load tk
bl p6 //apply the permutation 6 times
movw r10, #0xc3c3
movt r10, #0xc3c3 //r10<- 0xc3c3c3c3
and r11, r10, r6, ror #10 //ror and mask to match fixslicing
and r12, r10, r7, ror #10 //ror and mask to match fixslicing
strd r11, r12, [r0], #8 //store 1st half tk for 54th round
and r11, r10, r8, ror #10 //ror and mask to match fixslicing
and r12, r10, r9, ror #10 //ror and mask to match fixslicing
eor r11, r11, #0x01400000 //add rconst
eor r11, r11, #0x00001000 //add rconst
eor r12, r12, #0x00400000 //add rconst
mvn r12, r12 //to save a NOT in sbox calculations
strd r11, r12, [r0], #8 //store 2nd half tk for 54th round
and r10, r10, r10, lsr #6 //r10<- 0x03030303
and r11, r10, r6, ror #12 //--- ror and masks to match fixslicing
and r6, r6, r10, lsl #6
orr r6, r11, r6, ror #28
and r11, r10, r7, ror #12
and r7, r7, r10, lsl #6
orr r7, r11, r7, ror #28
and r11, r10, r8, ror #12
and r8, r8, r10, lsl #6
orr r8, r11, r8, ror #28
and r11, r10, r9, ror #12
and r9, r9, r10, lsl #6
orr r9, r11, r9, ror #28 //ror and masks to match fixslicing ---
eor r7, r7, #0x00000400 //add rconst
eor r8, r8, #0x01000000
eor r8, r8, #0x00004000 //add rconst
eor r9, r9, #0x00000400 //add rconst
mvn r9, r9 //to save a NOT in sbox calculations
strd r8, r9, [r0], #8 //store 1st half tk for 55th round
strd r6, r7, [r0], #8 //store 2nd half tk for 55th round
ldm r0, {r6-r9} //load tk
bl p8 //apply the permutation 8 times
movw r10, #0x3030
movt r10, #0x3030 //r10<- 0x30303030
and r11, r10, r6, ror #30 //--- ror and masks to match fixslicing
and r6, r6, r10, ror #4
orr r6, r11, r6, ror #22
and r11, r10, r7, ror #30
and r7, r7, r10, ror #4
orr r7, r11, r7, ror #22
and r11, r10, r8, ror #30
and r8, r8, r10, ror #4
orr r8, r11, r8, ror #22
and r11, r10, r9, ror #30
and r9, r9, r10, ror #4
orr r9, r11, r9, ror #22 //ror and masks to match fixslicing ---
eor r6, r6, #0x00000010
eor r8, r8, #0x00010000
eor r8, r8, #0x00000010
mvn r9, r9 //to save a NOT in sbox calculations
strd r6, r7, [r0], #8 //store 1st half tk for 56th round
strd r8, r9, [r0], #24 //store 2nd half tk for 56th round
add.w sp, #4
pop {r0-r12, lr}
bx lr
/******************************************************************************
* Computes the round tweakey material derived from TK1, in fixsliced
* representation, for the 8 rounds that consume TK1 (rounds 1,3,...,15).
* Applies the permutations P^2, ..., P^14 for rounds 0 to 16. Since P^16=Id, we
* don't need more calculations as no LFSR is applied to TK1, so the schedule
* repeats with period 16.
******************************************************************************/
@ void tkschedule_perm_tk1(u32* tk, const u8* key)
.global tkschedule_perm_tk1
.type tkschedule_perm_tk1,%function
.align 2
tkschedule_perm_tk1:
push {r0-r12, lr}
ldr.w r3, [r1, #8] //load tk1 (3rd word)
ldr.w r4, [r1, #4] //load tk1 (2nd word)
ldr.w r5, [r1, #12] //load tk1 (4th word)
ldr.w r2, [r1] //load tk1 (1st word)
movw r10, #0x0a0a
movt r10, #0x0a0a //r10<- 0x0a0a0a0a (mask consumed by 'packing')
movw r11, #0x3030
movt r11, #0x3030 //r11<- 0x30303030 (mask consumed by 'packing')
bl packing //pack tk1 into bitsliced representation
mov r6, r2 //move tk1 from r2-r5 to r6-r9
mov r7, r3 //move tk1 from r2-r5 to r6-r9
mov r8, r4 //move tk1 from r2-r5 to r6-r9
mov r9, r5 //move tk1 from r2-r5 to r6-r9
movw r2, #0xf0f0
movt r2, #0xf0f0 //r2<- 0xf0f0f0f0
and r11, r8, r2 //tk &= 0xf0f0f0f0 (3rd word)
and r12, r9, r2 //tk &= 0xf0f0f0f0 (4th word)
strd r11, r12, [r0], #8 //store 1st half tk for 1st round
and r11, r6, r2 //tk &= 0xf0f0f0f0 (1st word)
and r12, r7, r2 //tk &= 0xf0f0f0f0 (2nd word)
strd r11, r12, [r0], #8 //store 2nd half tk for 1st round
bl p2 //apply the permutation twice (P^2 in total)
movw r3, #0x0303
movt r3, #0x0303 //r3<- 0x03030303
and r11, r3, r6, ror #28 //--- ror and masks to match fixslicing
and r12, r6, r3, lsl #6
orr r12, r11, r12, ror #12
str.w r12, [r0, #8] //store tk for 3rd round
and r11, r3, r7, ror #28
and r12, r7, r3, lsl #6
orr r12, r11, r12, ror #12
str.w r12, [r0, #12] //store tk for 3rd round
and r11, r3, r9, ror #28
and r12, r9, r3, lsl #6
orr r12, r11, r12, ror #12
str.w r12, [r0, #4] //store tk for 3rd round
and r11, r3, r8, ror #28
and r12, r8, r3, lsl #6
orr r12, r11, r12, ror #12
str.w r12, [r0], #16 //ror and masks to match fixslicing ---
bl p2 //apply the permutation twice more (P^4 in total)
and r11, r2, r6, ror #16 //ror and mask to match fixslicing
and r12, r2, r7, ror #16 //ror and mask to match fixslicing
strd r11, r12, [r0, #8] //store 2nd half tk for 5th round
and r11, r2, r8, ror #16 //ror and mask to match fixslicing
and r12, r2, r9, ror #16 //ror and mask to match fixslicing
strd r11, r12, [r0], #16 //store 1st half tk for 5th round
bl p2 //apply the permutation twice more (P^6 in total)
and r11, r3, r6, ror #12 //--- ror and masks to match fixslicing
and r12, r6, r3, lsl #6
orr r12, r11, r12, ror #28
str.w r12, [r0, #8] //store tk for 7th round
and r11, r3, r7, ror #12
and r12, r7, r3, lsl #6
orr r12, r11, r12, ror #28
str.w r12, [r0, #12] //store tk for 7th round
and r11, r3, r9, ror #12
and r12, r9, r3, lsl #6
orr r12, r11, r12, ror #28
str.w r12, [r0, #4] //store tk for 7th round
and r11, r3, r8, ror #12
and r12, r8, r3, lsl #6
orr r12, r11, r12, ror #28
str.w r12, [r0], #16 //ror and masks to match fixslicing ---
bl p2 //apply the permutation twice more (P^8 in total)
and r11, r2, r6 //ror and mask to match fixslicing
and r12, r2, r7 //ror and mask to match fixslicing
strd r11, r12, [r0, #8] //store 2nd half tk for 9th round
and r11, r2, r8 //ror and mask to match fixslicing
and r12, r2, r9 //ror and mask to match fixslicing
strd r11, r12, [r0], #16 //store 1st half tk for 9th round
bl p2 //apply the permutation twice more (P^10 in total)
and r11, r3, r6, ror #28 //--- ror and masks to match fixslicing
and r12, r6, r3, lsl #6
orr r12, r11, r12, ror #12
str.w r12, [r0, #8] //store tk for 11th round
and r11, r3, r7, ror #28
and r12, r7, r3, lsl #6
orr r12, r11, r12, ror #12
str.w r12, [r0, #12] //store tk for 11th round
and r11, r3, r9, ror #28
and r12, r9, r3, lsl #6
orr r12, r11, r12, ror #12
str.w r12, [r0, #4] //store tk for 11th round
and r11, r3, r8, ror #28
and r12, r8, r3, lsl #6
orr r12, r11, r12, ror #12
str.w r12, [r0], #16 //ror and masks to match fixslicing ---
bl p2 //apply the permutation twice more (P^12 in total)
and r11, r2, r6, ror #16 //ror and mask to match fixslicing
and r12, r2, r7, ror #16 //ror and mask to match fixslicing
strd r11, r12, [r0, #8] //store 2nd half tk for 13th round
and r11, r2, r8, ror #16 //ror and mask to match fixslicing
and r12, r2, r9, ror #16 //ror and mask to match fixslicing
strd r11, r12, [r0], #16 //store 1st half tk for 13th round
bl p2 //apply the permutation twice more (P^14 in total)
and r11, r3, r6, ror #12 //--- ror and masks to match fixslicing
and r12, r6, r3, lsl #6
orr r12, r11, r12, ror #28
str.w r12, [r0, #8] //store tk for 15th round
and r11, r3, r7, ror #12
and r12, r7, r3, lsl #6
orr r12, r11, r12, ror #28
str.w r12, [r0, #12] //store tk for 15th round
and r11, r3, r9, ror #12
and r12, r9, r3, lsl #6
orr r12, r11, r12, ror #28
str.w r12, [r0, #4] //store tk for 15th round
and r11, r3, r8, ror #12
and r12, r8, r3, lsl #6
orr r12, r11, r12, ror #28
str.w r12, [r0] //ror and masks to match fixslicing ---
pop {r0-r12, lr}
bx lr
/******************************************************************************
* Subroutine applying 4 consecutive rounds of fixsliced SKINNY-128.
* Each round: bitsliced S-box (SWAPMOVE-based), round tweakey addition, then
* one of the 4 fixsliced MixColumns variants (0..3).
* Register contract (set up by the caller, see skinny128_384):
*   r2-r5  state (bitsliced)
*   r6     0x55555555 (SWAPMOVE mask)
*   r7     0x30303030 (MixColumns mask)
*   r0     rtk_1 pointer  (advanced by 32 bytes: consumed on rounds 1 and 3)
*   r1     rtk_2_3+rconst pointer (advanced by 64 bytes: consumed each round)
* Clobbers r8-r11; returns with bx lr.
******************************************************************************/
.align 2
quadruple_round:
// --- 1st round: bitsliced sbox ---
orr r8, r2, r3
eor r5, r5, r8
mvn r5, r5
eor r8, r3, r4, lsr #1
and r8, r8, r6
eor r3, r3, r8
eor r4, r4, r8, lsl #1 //SWAPMOVE(r4, r3, 0x55555555, 1);
eor r8, r4, r5, lsr #1
and r8, r8, r6
eor r4, r4, r8
eor r5, r5, r8, lsl #1 //SWAPMOVE(r5, r4, 0x55555555, 1);
orr r8, r4, r5
eor r3, r3, r8
mvn r3, r3
eor r8, r2, r3, lsr #1
and r8, r8, r6
eor r2, r2, r8
eor r3, r3, r8, lsl #1 //SWAPMOVE(r3, r2, 0x55555555, 1);
eor r8, r5, r2, lsr #1
and r8, r8, r6
eor r5, r5, r8
eor r2, r2, r8, lsl #1 //SWAPMOVE(r2, r5, 0x55555555, 1);
orr r8, r2, r3
eor r5, r5, r8
mvn r5, r5
eor r8, r3, r4, lsr #1
and r8, r8, r6
eor r3, r3, r8
eor r4, r4, r8, lsl #1 //SWAPMOVE(r4, r3, 0x55555555, 1);
eor r8, r4, r5, lsr #1
and r8, r8, r6
eor r4, r4, r8
eor r5, r5, r8, lsl #1 //SWAPMOVE(r5, r4, 0x55555555, 1);
orr r8, r4, r5
eor r3, r3, r8
eor r8, r2, r5
and r8, r8, r6
eor r2, r2, r8
eor r5, r5, r8 //SWAPMOVE(r5, r2, 0x55555555, 0);
ldmia.w r1!, {r8-r11} //load rkeys in r8,...,r11
eor r2, r2, r8 //add rtk_2_3 + rconst
eor r3, r3, r9 //add rtk_2_3 + rconst
eor r4, r4, r10 //add rtk_2_3 + rconst
eor r5, r5, r11 //add rtk_2_3 + rconst
ldmia.w r0!,{r8-r11}
eor r2, r2, r8 //add rtk_1
eor r3, r3, r9 //add rtk_1
eor r4, r4, r10 //add rtk_1
eor r5, r5, r11 //add rtk_1
and r8, r7, r2, ror #30 // --- mixcolumns 0 ---
eor r2, r2, r8, ror #24
and r8, r7, r2, ror #18
eor r2, r2, r8, ror #2
and r8, r7, r2, ror #6
eor r2, r2, r8, ror #4
and r8, r7, r3, ror #30
eor r3, r3, r8, ror #24
and r8, r7, r3, ror #18
eor r3, r3, r8, ror #2
and r8, r7, r3, ror #6
eor r3, r3, r8, ror #4
and r8, r7, r4, ror #30
eor r4, r4, r8, ror #24
and r8, r7, r4, ror #18
eor r4, r4, r8, ror #2
and r8, r7, r4, ror #6
eor r4, r4, r8, ror #4
and r8, r7, r5, ror #30
eor r5, r5, r8, ror #24
and r8, r7, r5, ror #18
eor r5, r5, r8, ror #2
and r8, r7, r5, ror #6
eor r5, r5, r8, ror #4
// --- 2nd round: bitsliced sbox ---
orr r8, r4, r5
eor r3, r3, r8
mvn r3, r3
eor r8, r2, r3, lsr #1
and r8, r8, r6
eor r2, r2, r8
eor r3, r3, r8, lsl #1 //SWAPMOVE(r3, r2, 0x55555555, 1);
eor r8, r5, r2, lsr #1
and r8, r8, r6
eor r5, r5, r8
eor r2, r2, r8, lsl #1 //SWAPMOVE(r2, r5, 0x55555555, 1);
orr r8, r2, r3
eor r5, r5, r8
mvn r5, r5
eor r8, r3, r4, lsr #1
and r8, r8, r6
eor r3, r3, r8
eor r4, r4, r8, lsl #1 //SWAPMOVE(r4, r3, 0x55555555, 1);
eor r8, r4, r5, lsr #1
and r8, r8, r6
eor r4, r4, r8
eor r5, r5, r8, lsl #1 //SWAPMOVE(r5, r4, 0x55555555, 1);
orr r8, r4, r5
eor r3, r3, r8
mvn r3, r3
eor r8, r2, r3, lsr #1
and r8, r8, r6
eor r2, r2, r8
eor r3, r3, r8, lsl #1 //SWAPMOVE(r3, r2, 0x55555555, 1);
eor r8, r5, r2, lsr #1
and r8, r8, r6
eor r5, r5, r8
eor r2, r2, r8, lsl #1 //SWAPMOVE(r2, r5, 0x55555555, 1);
orr r8, r2, r3
eor r5, r5, r8
eor r8, r3, r4
and r8, r8, r6
eor r3, r3, r8
eor r4, r4, r8 //SWAPMOVE(r4, r3, 0x55555555, 0);
ldmia.w r1!, {r8-r11} //load rkeys in r8,...,r11
eor r2, r2, r8 //add rkey + rconst
eor r3, r3, r9 //add rkey + rconst
eor r4, r4, r10 //add rkey + rconst
eor r5, r5, r11 //add rkey + rconst
and r8, r7, r2, ror #16 // --- mixcolumns 1 ---
eor r2, r2, r8, ror #30
and r8, r7, r2, ror #28
eor r2, r2, r8
and r8, r7, r2, ror #16
eor r2, r2, r8, ror #2
and r8, r7, r3, ror #16
eor r3, r3, r8, ror #30
and r8, r7, r3, ror #28
eor r3, r3, r8
and r8, r7, r3, ror #16
eor r3, r3, r8, ror #2
and r8, r7, r4, ror #16
eor r4, r4, r8, ror #30
and r8, r7, r4, ror #28
eor r4, r4, r8
and r8, r7, r4, ror #16
eor r4, r4, r8, ror #2
and r8, r7, r5, ror #16
eor r5, r5, r8, ror #30
and r8, r7, r5, ror #28
eor r5, r5, r8
and r8, r7, r5, ror #16
eor r5, r5, r8, ror #2
// --- 3rd round: bitsliced sbox ---
orr r8, r2, r3
eor r5, r5, r8
mvn r5, r5
eor r8, r3, r4, lsr #1
and r8, r8, r6
eor r3, r3, r8
eor r4, r4, r8, lsl #1 //SWAPMOVE(r4, r3, 0x55555555, 1);
eor r8, r4, r5, lsr #1
and r8, r8, r6
eor r4, r4, r8
eor r5, r5, r8, lsl #1 //SWAPMOVE(r5, r4, 0x55555555, 1);
orr r8, r4, r5
eor r3, r3, r8
mvn r3, r3
eor r8, r2, r3, lsr #1
and r8, r8, r6
eor r2, r2, r8
eor r3, r3, r8, lsl #1 //SWAPMOVE(r3, r2, 0x55555555, 1);
eor r8, r5, r2, lsr #1
and r8, r8, r6
eor r5, r5, r8
eor r2, r2, r8, lsl #1 //SWAPMOVE(r2, r5, 0x55555555, 1);
orr r8, r2, r3
eor r5, r5, r8
mvn r5, r5
eor r8, r3, r4, lsr #1
and r8, r8, r6
eor r3, r3, r8
eor r4, r4, r8, lsl #1 //SWAPMOVE(r4, r3, 0x55555555, 1);
eor r8, r4, r5, lsr #1
and r8, r8, r6
eor r4, r4, r8
eor r5, r5, r8, lsl #1 //SWAPMOVE(r5, r4, 0x55555555, 1);
orr r8, r4, r5
eor r3, r3, r8
eor r8, r2, r5
and r8, r8, r6
eor r2, r2, r8
eor r5, r5, r8 //SWAPMOVE(r5, r2, 0x55555555, 0);
ldmia.w r1!, {r8-r11} //load rkeys in r8,...,r11
eor r2, r2, r8 //add rtk_2_3 + rconst
eor r3, r3, r9 //add rtk_2_3 + rconst
eor r4, r4, r10 //add rtk_2_3 + rconst
eor r5, r5, r11 //add rtk_2_3 + rconst
ldmia.w r0!,{r8-r11}
eor r2, r2, r8 //add rtk_1
eor r3, r3, r9 //add rtk_1
eor r4, r4, r10 //add rtk_1
eor r5, r5, r11 //add rtk_1
and r8, r7, r2, ror #10 // --- mixcolumns 2 ---
eor r2, r2, r8, ror #4
and r8, r7, r2, ror #6
eor r2, r2, r8, ror #6
and r8, r7, r2, ror #26
eor r2, r2, r8
and r8, r7, r3, ror #10
eor r3, r3, r8, ror #4
and r8, r7, r3, ror #6
eor r3, r3, r8, ror #6
and r8, r7, r3, ror #26
eor r3, r3, r8
and r8, r7, r4, ror #10
eor r4, r4, r8, ror #4
and r8, r7, r4, ror #6
eor r4, r4, r8, ror #6
and r8, r7, r4, ror #26
eor r4, r4, r8
and r8, r7, r5, ror #10
eor r5, r5, r8, ror #4
and r8, r7, r5, ror #6
eor r5, r5, r8, ror #6
and r8, r7, r5, ror #26
eor r5, r5, r8
// --- 4th round: bitsliced sbox ---
orr r8, r4, r5
eor r3, r3, r8
mvn r3, r3
eor r8, r2, r3, lsr #1
and r8, r8, r6
eor r2, r2, r8
eor r3, r3, r8, lsl #1 //SWAPMOVE(r3, r2, 0x55555555, 1);
eor r8, r5, r2, lsr #1
and r8, r8, r6
eor r5, r5, r8
eor r2, r2, r8, lsl #1 //SWAPMOVE(r2, r5, 0x55555555, 1);
orr r8, r2, r3
eor r5, r5, r8
mvn r5, r5
eor r8, r3, r4, lsr #1
and r8, r8, r6
eor r3, r3, r8
eor r4, r4, r8, lsl #1 //SWAPMOVE(r4, r3, 0x55555555, 1);
eor r8, r4, r5, lsr #1
and r8, r8, r6
eor r4, r4, r8
eor r5, r5, r8, lsl #1 //SWAPMOVE(r5, r4, 0x55555555, 1);
orr r8, r4, r5
eor r3, r3, r8
mvn r3, r3
eor r8, r2, r3, lsr #1
and r8, r8, r6
eor r2, r2, r8
eor r3, r3, r8, lsl #1 //SWAPMOVE(r3, r2, 0x55555555, 1);
eor r8, r5, r2, lsr #1
and r8, r8, r6
eor r5, r5, r8
eor r2, r2, r8, lsl #1 //SWAPMOVE(r2, r5, 0x55555555, 1);
orr r8, r2, r3
eor r5, r5, r8
eor r8, r3, r4
and r8, r8, r6
eor r3, r3, r8
eor r4, r4, r8 //SWAPMOVE(r4, r3, 0x55555555, 0);
ldmia r1!, {r8-r11} //load rkeys in r8,...,r11
eor r2, r2, r8 //add rkey + rconst
eor r3, r3, r9 //add rkey + rconst
eor r4, r4, r10 //add rkey + rconst
eor r5, r5, r11 //add rkey + rconst
and r8, r7, r2, ror #4 // --- mixcolumns 3 ---
eor r2, r2, r8, ror #26
and r8, r7, r2
eor r2, r2, r8, ror #4
and r8, r7, r2, ror #4
eor r2, r2, r8, ror #22
and r8, r7, r3, ror #4
eor r3, r3, r8, ror #26
and r8, r7, r3
eor r3, r3, r8, ror #4
and r8, r7, r3, ror #4
eor r3, r3, r8, ror #22
and r8, r7, r4, ror #4
eor r4, r4, r8, ror #26
and r8, r7, r4
eor r4, r4, r8, ror #4
and r8, r7, r4, ror #4
eor r4, r4, r8, ror #22
and r8, r7, r5, ror #4
eor r5, r5, r8, ror #26
and r8, r7, r5
eor r5, r5, r8, ror #4
and r8, r7, r5, ror #4
eor r5, r5, r8, ror #22
bx lr
/******************************************************************************
* Encrypt a single block using fixsliced SKINNY-128-384 (note: the function
* name and the separate tk/rtk1 pointers indicate the 384-bit tweakey variant,
* not SKINNY-128-128 as previously stated).
******************************************************************************/
@ void skinny128_384(u8* ctext, const u32* tk, const u8* ptext, const u32* rtk1)
.global skinny128_384
.type skinny128_384,%function
.align 2
skinny128_384:
push {r0-r12, r14}
mov.w r0, r3
ldr.w r3, [r2, #8]
ldr.w r4, [r2, #4]
ldr.w r5, [r2, #12]
ldr.w r2, [r2]
movw r6, #0x0a0a
movt r6, #0x0a0a //r6 <- 0x0a0a0a0a
movw r7, #0x3030
movt r7, #0x3030 //r7 <- 0x30303030
eor r12, r2, r2, lsr #3
and r12, r12, r6
eor r2, r2, r12
eor r2, r2, r12, lsl #3 //SWAPMOVE(r2, r2, 0x0a0a0a0a, 3)
eor r12, r3, r3, lsr #3
and r12, r12, r6
eor r3, r3, r12
eor r3, r3, r12, lsl #3 //SWAPMOVE(r3, r3, 0x0a0a0a0a, 3)
eor r12, r4, r4, lsr #3
and r12, r12, r6
eor r4, r4, r12
eor r4, r4, r12, lsl #3 //SWAPMOVE(r4, r4, 0x0a0a0a0a, 3)
eor r12, r5, r5, lsr #3
and r12, r12, r6
eor r5, r5, r12
eor r5, r5, r12, lsl #3 //SWAPMOVE(r5, r5, 0x0a0a0a0a, 3)
eor r12, r2, r4, lsr #2
and r12, r12, r7
eor r2, r2, r12
eor r4, r4, r12, lsl #2 //SWAPMOVE(r4, r2, 0x30303030, 2)
eor r12, r2, r3, lsr #4
and r12, r12, r7, lsr #2
eor r2, r2, r12
eor r3, r3, r12, lsl #4 //SWAPMOVE(r3, r2, 0x0c0c0c0c, 4)
eor r12, r2, r5, lsr #6
and r12, r12, r7, lsr #4
eor r2, r2, r12
eor r5, r5, r12, lsl #6 //SWAPMOVE(r5, r2, 0x03030303, 6)
eor r12, r4, r3, lsr #2
and r12, r12, r7, lsr #2
eor r4, r4, r12
eor r3, r3, r12, lsl #2 //SWAPMOVE(r3, r4, 0x0c0c0c0c, 2)
eor r12, r4, r5, lsr #4
and r12, r12, r7, lsr #4
eor r4, r4, r12
eor r5, r5, r12, lsl #4 //SWAPMOVE(r5, r4, 0x03030303, 4)
eor r12, r3, r5, lsr #2
and r12, r12, r7, lsr #4
eor r3, r3, r12
eor r5, r5, r12, lsl #2 //SWAPMOVE(r5, r3, 0x03030303, 2)
movw r6, #0x5555
movt r6, #0x5555 //r6 <- 0x55555555
bl quadruple_round
bl quadruple_round
bl quadruple_round
bl quadruple_round
sub.w r0, #128 // rtk1 repeats every 16 rounds
bl quadruple_round
bl quadruple_round
bl quadruple_round
bl quadruple_round
sub.w r0, #128 // rtk1 repeats every 16 rounds
bl quadruple_round
bl quadruple_round
bl quadruple_round
bl quadruple_round
sub.w r0, #128 // rtk1 repeats every 16 rounds
bl quadruple_round
bl quadruple_round
movw r6, #0x0a0a
movt r6, #0x0a0a //r6 <- 0x0a0a0a0a
eor r10, r3, r5, lsr #2
and r10, r10, r7, lsr #4
eor r3, r3, r10
eor r5, r5, r10, lsl #2 //SWAPMOVE(r5, r3, 0x03030303, 2)
eor r10, r4, r5, lsr #4
and r10, r10, r7, lsr #4
eor r4, r4, r10
eor r5, r5, r10, lsl #4 //SWAPMOVE(r5, r4, 0x03030303, 4)
eor r10, r4, r3, lsr #2
and r10, r10, r7, lsr #2
eor r4, r4, r10
eor r3, r3, r10, lsl #2 //SWAPMOVE(r3, r4, 0x0c0c0c0c, 2)
eor r10, r2, r5, lsr #6
and r10, r10, r7, lsr #4
eor r2, r2, r10
eor r5, r5, r10, lsl #6 //SWAPMOVE(r5, r2, 0x03030303, 6)
eor r10, r2, r3, lsr #4
and r10, r10, r7, lsr #2
eor r2, r2, r10
eor r3, r3, r10, lsl #4 //SWAPMOVE(r3, r2, 0x0c0c0c0c, 4)
eor r10, r2, r4, lsr #2
and r10, r10, r7
eor r2, r2, r10
eor r4, r4, r10, lsl #2 //SWAPMOVE(r4, r2, 0x30303030, 2)
eor r10, r5, r5, lsr #3
and r10, r10, r6
eor r5, r5, r10
eor r5, r5, r10, lsl #3 //SWAPMOVE(r5, r5, 0x0a0a0a0a, 3)
eor r10, r4, r4, lsr #3
and r10, r10, r6
eor r4, r4, r10
eor r4, r4, r10, lsl #3 //SWAPMOVE(r4, r4, 0x0a0a0a0a, 3)
eor r10, r3, r3, lsr #3
and r10, r10, r6
eor r3, r3, r10
eor r3, r3, r10, lsl #3 //SWAPMOVE(r3, r3, 0x0a0a0a0a, 3)
eor r10, r2, r2, lsr #3
and r10, r10, r6
eor r2, r2, r10
eor r2, r2, r10, lsl #3 //SWAPMOVE(r2, r2, 0x0a0a0a0a, 3)
ldr.w r0, [sp], #4
strd r2, r4, [r0]
strd r3, r5, [r0, #8]
pop {r1-r12,r14}
bx lr
\ No newline at end of file
#define CRYPTO_KEYBYTES 16
#define CRYPTO_NSECBYTES 0
#define CRYPTO_NPUBBYTES 16
#define CRYPTO_ABYTES 16
#define CRYPTO_NOOVERLAP 1
#define CRYPTO_KEYBYTES 16
#define CRYPTO_NSECBYTES 0
#define CRYPTO_NPUBBYTES 16
#define CRYPTO_ABYTES 16
#define CRYPTO_NOOVERLAP 1
int crypto_aead_encrypt(
unsigned char *c, unsigned long long *clen,
const unsigned char *m, unsigned long long mlen,
const unsigned char *ad, unsigned long long adlen,
const unsigned char *nsec,
const unsigned char *npub,
const unsigned char *k
);
int crypto_aead_decrypt(
unsigned char *m, unsigned long long *mlen,
unsigned char *nsec,
const unsigned char *c, unsigned long long clen,
const unsigned char *ad, unsigned long long adlen,
const unsigned char *npub,
const unsigned char *k
);
\ No newline at end of file
#include "skinny128.h"
#include "tk_schedule.h"
#include "romulus.h"
#include <string.h>
#include <stdio.h>
//Compute the domain-separation byte for the final SKINNY call, following the
//Romulus-N1 specification: one group of bits encodes how the associated data
//terminates, the other how the message terminates (empty, partial single
//block, full/partial double block). A remainder of exactly BLOCKBYTES (full
//single block) deliberately sets no bit.
static u8 final_ad_domain (unsigned long long adlen, unsigned long long mlen) {
    u8 bits = 0;
    u32 rem;
    // Bits derived from the associated data length
    if (adlen == 0) {
        bits ^= 0x02;           // empty AD: a single padded block
    } else {
        rem = (u32)(adlen % (2 * BLOCKBYTES));
        if (rem == 0)
            bits ^= 0x08;       // ends with a full double block
        else if (rem < BLOCKBYTES)
            bits ^= 0x02;       // ends with a partial single block
        else if (rem > BLOCKBYTES)
            bits ^= 0x0A;       // ends with a partial double block
    }
    // Bits derived from the message length
    if (mlen == 0) {
        bits ^= 0x01;           // empty message: a single padded block
    } else {
        rem = (u32)(mlen % (2 * BLOCKBYTES));
        if (rem == 0)
            bits ^= 0x04;       // ends with a full double block
        else if (rem < BLOCKBYTES)
            bits ^= 0x01;       // ends with a partial single block
        else if (rem > BLOCKBYTES)
            bits ^= 0x05;       // ends with a partial double block
    }
    return bits;
}
//Encryption and authentication using Romulus-N1
int crypto_aead_encrypt
(unsigned char *c, unsigned long long *clen,
const unsigned char *m, unsigned long long mlen,
const unsigned char *ad, unsigned long long adlen,
const unsigned char *nsec,
const unsigned char *npub,
const unsigned char *k) {
u32 tmp;
u64 tmp_mlen = mlen;
const u8* m_auth = m;
u8 final_domain = 0x30;
skinny_128_384_tks tks;
u8 state[BLOCKBYTES], pad[BLOCKBYTES];
(void)nsec;
// ----------------- Initialization -----------------
*clen = mlen + TAGBYTES;
memset(tks.tk1, 0x00, KEYBYTES);
memset(state, 0x00, BLOCKBYTES);
tks.tk1[0] = 0x01; // Init the 56-bit LFSR counter
// ----------------- Initialization -----------------
// ----------------- Process the associated data -----------------
final_domain ^= final_ad_domain(adlen, mlen);
SET_DOMAIN(tks, 0x28);
while (adlen > 2*BLOCKBYTES) { // Process double blocks but the last
UPDATE_CTR(tks.tk1);
XOR_BLOCK(state, state, ad);
precompute_rtk2_3(tks.rtk2_3, npub, k);
precompute_rtk1(tks.rtk1, tks.tk1);
skinny128_384(state, state, tks.rtk1, tks.rtk2_3);
UPDATE_CTR(tks.tk1);
ad += 2*BLOCKBYTES;
adlen -= 2*BLOCKBYTES;
}
// Pad and process the left-over blocks
if (adlen == 2*BLOCKBYTES) { // Left-over complete double block
UPDATE_CTR(tks.tk1);
XOR_BLOCK(state, state, ad);
precompute_rtk2_3(tks.rtk2_3, ad + BLOCKBYTES, k);
precompute_rtk1(tks.rtk1, tks.tk1);
skinny128_384(state, state, tks.rtk1, tks.rtk2_3);
UPDATE_CTR(tks.tk1);
} else if (adlen > BLOCKBYTES) { // Left-over partial double block
adlen -= BLOCKBYTES;
UPDATE_CTR(tks.tk1);
XOR_BLOCK(state, state, ad);
memcpy(pad, ad + BLOCKBYTES, adlen);
memset(pad + adlen, 0x00, 15 - adlen);
pad[15] = adlen; // Padding
precompute_rtk2_3(tks.rtk2_3, pad, k);
precompute_rtk1(tks.rtk1, tks.tk1);
skinny128_384(state, state, tks.rtk1, tks.rtk2_3);
UPDATE_CTR(tks.tk1);
} else {
SET_DOMAIN(tks, 0x2C);
UPDATE_CTR(tks.tk1);
if (adlen == BLOCKBYTES) { // Left-over complete single block
XOR_BLOCK(state, state, ad);
} else { // Left-over partial single block
for(int i =0; i < (int)adlen; i++)
state[i] ^= ad[i];
state[15] ^= adlen; // Padding
}
if (tmp_mlen >= BLOCKBYTES) {
precompute_rtk2_3(tks.rtk2_3, m_auth, k);
precompute_rtk1(tks.rtk1, tks.tk1);
skinny128_384(state, state, tks.rtk1, tks.rtk2_3);
if (tmp_mlen > BLOCKBYTES)
UPDATE_CTR(tks.tk1);
tmp_mlen -= BLOCKBYTES;
m_auth += BLOCKBYTES;
} else {
memcpy(pad, m_auth, tmp_mlen);
memset(pad + tmp_mlen, 0x00, BLOCKBYTES - tmp_mlen - 1);
pad[15] = (u8)tmp_mlen; // Padding
precompute_rtk2_3(tks.rtk2_3, pad, k);
precompute_rtk1(tks.rtk1, tks.tk1);
skinny128_384(state, state, tks.rtk1, tks.rtk2_3);
tmp_mlen = 0;
}
}
// Process all message double blocks except the last
SET_DOMAIN(tks, 0x2C);
while (tmp_mlen > 32) {
UPDATE_CTR(tks.tk1);
XOR_BLOCK(state, state, m_auth);
precompute_rtk2_3(tks.rtk2_3, m_auth + BLOCKBYTES, k);
precompute_rtk1(tks.rtk1, tks.tk1);
skinny128_384(state, state, tks.rtk1, tks.rtk2_3);
UPDATE_CTR(tks.tk1);
m_auth += 2 * BLOCKBYTES;
tmp_mlen -= 2 * BLOCKBYTES;
}
// Process the last message double block
if (tmp_mlen == 2 * BLOCKBYTES) { // Last message double block is full
UPDATE_CTR(tks.tk1);
XOR_BLOCK(state, state, m_auth);
precompute_rtk2_3(tks.rtk2_3, m_auth + BLOCKBYTES, k);
precompute_rtk1(tks.rtk1, tks.tk1);
skinny128_384(state, state, tks.rtk1, tks.rtk2_3);
} else if (tmp_mlen > BLOCKBYTES) { // Last message double block is partial
tmp_mlen -= BLOCKBYTES;
UPDATE_CTR(tks.tk1);
XOR_BLOCK(state, state, m_auth);
memcpy(pad, m_auth + BLOCKBYTES, tmp_mlen);
memset(pad + tmp_mlen, 0x00, BLOCKBYTES - tmp_mlen - 1);
pad[15] = (u8)tmp_mlen; // Padding
precompute_rtk2_3(tks.rtk2_3, pad, k);
precompute_rtk1(tks.rtk1, tks.tk1);
skinny128_384(state, state, tks.rtk1, tks.rtk2_3);
} else if (tmp_mlen == BLOCKBYTES) { // Last message single block is full
XOR_BLOCK(state, state, m_auth);
} else if (tmp_mlen > 0) { // Last message single block is partial
for(int i =0; i < (int)tmp_mlen; i++)
state[i] ^= m_auth[i];
state[15] ^= (u8)tmp_mlen; // Padding
}
// Process the last partial block
SET_DOMAIN(tks, final_domain);
UPDATE_CTR(tks.tk1);
precompute_rtk2_3(tks.rtk2_3, npub, k);
precompute_rtk1(tks.rtk1, tks.tk1);
skinny128_384(state, state, tks.rtk1, tks.rtk2_3);
// ----------------- Process the associated data -----------------
// ----------------- Generate the tag -----------------
G(state,state);
memcpy(c + mlen, state, TAGBYTES);
// ----------------- Generate the tag -----------------
memset(tks.tk1, 0x00, KEYBYTES);
tks.tk1[0] = 0x01; // Init the 56-bit LFSR counter
if (mlen > 0) {
SET_DOMAIN(tks, 0x24);
while (mlen > BLOCKBYTES) {
precompute_rtk1(tks.rtk1, tks.tk1);
skinny128_384(state, state, tks.rtk1, tks.rtk2_3);
RHO(state,c,m);
UPDATE_CTR(tks.tk1);
c += BLOCKBYTES;
m += BLOCKBYTES;
mlen -= BLOCKBYTES;
}
precompute_rtk1(tks.rtk1, tks.tk1);
skinny128_384(state, state, tks.rtk1, tks.rtk2_3);
for(int i = 0; i < (int)mlen; i++) {
tmp = m[i]; // Use of tmp variable in case c = m
c[i] = m[i] ^ (state[i] >> 1) ^ (state[i] & 0x80) ^ (state[i] << 7);
state[i] ^= (u8)tmp;
}
state[15] ^= (u8)mlen; // Padding
}
return 0;
}
//Decryption and tag verification using Romulus-N1
//NIST LWC AEAD interface: decrypts clen-TAGBYTES bytes from c into m, then
//recomputes the tag over the AD and the recovered plaintext and compares it
//in constant time with the received tag. Returns 0 on success, non-zero on
//failure (and -1 when c is shorter than a tag).
//NOTE(review): the plaintext is written to m BEFORE the tag is checked, so
//callers must discard m whenever the return value is non-zero.
int crypto_aead_decrypt
(unsigned char *m, unsigned long long *mlen,
unsigned char *nsec,
const unsigned char *c, unsigned long long clen,
const unsigned char *ad, unsigned long long adlen,
const unsigned char *npub,
const unsigned char *k) {
u32 tmp;
u64 tmp_mlen;
u8 final_domain = 0x30;
u8* m_auth = m; // read pointer for the authentication pass (m advances during decryption)
const u8* c_tmp = c; // kept to locate the received tag at the end
skinny_128_384_tks tks;
u8 state[BLOCKBYTES], pad[BLOCKBYTES];
(void)nsec;
if (clen < TAGBYTES)
return -1;
// ----------------- Initialization -----------------
*mlen = clen - TAGBYTES;
memset(tks.tk1, 0x00, KEYBYTES);
tks.tk1[0] = 0x01; // Init the 56-bit LFSR counter
// ----------------- Initialization -----------------
// ----------------- Process the ciphertext -----------------
clen -= TAGBYTES;
memcpy(state, c + clen, TAGBYTES); // state <- received tag
tmp_mlen = clen;
if (tmp_mlen > 0) {
SET_DOMAIN(tks, 0x24);
precompute_rtk2_3(tks.rtk2_3, npub, k); // round tweakeys from nonce and key
while (tmp_mlen > BLOCKBYTES) {
precompute_rtk1(tks.rtk1, tks.tk1);
skinny128_384(state, state, tks.rtk1, tks.rtk2_3);
RHO_INV(state, c, m); // recover one full plaintext block
UPDATE_CTR(tks.tk1);
c += BLOCKBYTES;
m += BLOCKBYTES;
tmp_mlen -= BLOCKBYTES;
}
precompute_rtk1(tks.rtk1, tks.tk1);
skinny128_384(state, state, tks.rtk1, tks.rtk2_3);
// Recover the last (possibly partial) plaintext block byte-wise
for(int i = 0; i < (int)tmp_mlen; i++) {
m[i] = c[i] ^ (state[i] >> 1) ^ (state[i] & 0x80) ^ (state[i] << 7);
state[i] ^= m[i];
}
state[15] ^= (u8)tmp_mlen; // Padding
}
// ----------------- Process the ciphertext -----------------
// ----------------- Process the associated data -----------------
memset(tks.tk1, 0x00, KEYBYTES);
tks.tk1[0] = 0x01; // Init the 56-bit LFSR counter
memset(state, 0x00, BLOCKBYTES);
final_domain ^= final_ad_domain(adlen, clen);
SET_DOMAIN(tks, 0x28);
while (adlen > 2*BLOCKBYTES) { // Process double blocks but the last
UPDATE_CTR(tks.tk1);
XOR_BLOCK(state, state, ad);
precompute_rtk2_3(tks.rtk2_3, ad + BLOCKBYTES, k); // 2nd AD block is the tweak material
precompute_rtk1(tks.rtk1, tks.tk1);
skinny128_384(state, state, tks.rtk1, tks.rtk2_3);
UPDATE_CTR(tks.tk1);
ad += 2*BLOCKBYTES;
adlen -= 2*BLOCKBYTES;
}
// Pad and process the left-over blocks
if (adlen == 2*BLOCKBYTES) { // Left-over complete double block
UPDATE_CTR(tks.tk1);
XOR_BLOCK(state, state, ad);
precompute_rtk2_3(tks.rtk2_3, ad + BLOCKBYTES, k);
precompute_rtk1(tks.rtk1, tks.tk1);
skinny128_384(state, state, tks.rtk1, tks.rtk2_3);
UPDATE_CTR(tks.tk1);
} else if (adlen > BLOCKBYTES) { // Left-over partial double block
adlen -= BLOCKBYTES;
UPDATE_CTR(tks.tk1);
XOR_BLOCK(state, state, ad);
memcpy(pad, ad + BLOCKBYTES, adlen);
memset(pad + adlen, 0x00, 15 - adlen);
pad[15] = adlen; // Padding
precompute_rtk2_3(tks.rtk2_3, pad, k);
precompute_rtk1(tks.rtk1, tks.tk1);
skinny128_384(state, state, tks.rtk1, tks.rtk2_3);
UPDATE_CTR(tks.tk1);
} else {
SET_DOMAIN(tks, 0x2C);
UPDATE_CTR(tks.tk1);
if (adlen == BLOCKBYTES) { // Left-over complete single block
XOR_BLOCK(state, state, ad);
} else { // Left-over partial single block
for(int i =0; i < (int)adlen; i++)
state[i] ^= ad[i];
state[15] ^= adlen; // Padding
}
// The 1st recovered message block provides the tweak material for this call
if (clen >= BLOCKBYTES) {
precompute_rtk2_3(tks.rtk2_3, m_auth, k);
precompute_rtk1(tks.rtk1, tks.tk1);
skinny128_384(state, state, tks.rtk1, tks.rtk2_3);
if (clen > BLOCKBYTES)
UPDATE_CTR(tks.tk1);
m_auth += BLOCKBYTES;
clen -= BLOCKBYTES;
} else {
memcpy(pad, m_auth, clen);
memset(pad + clen, 0x00, BLOCKBYTES - clen - 1);
pad[15] = (u8)clen; // Padding
precompute_rtk2_3(tks.rtk2_3, pad, k);
precompute_rtk1(tks.rtk1, tks.tk1);
skinny128_384(state, state, tks.rtk1, tks.rtk2_3);
clen = 0;
}
}
// Process all message double blocks except the last
SET_DOMAIN(tks, 0x2C);
while (clen > 32) {
UPDATE_CTR(tks.tk1);
XOR_BLOCK(state, state, m_auth);
precompute_rtk2_3(tks.rtk2_3, m_auth + BLOCKBYTES, k);
precompute_rtk1(tks.rtk1, tks.tk1);
skinny128_384(state, state, tks.rtk1, tks.rtk2_3);
UPDATE_CTR(tks.tk1);
m_auth += 2 * BLOCKBYTES;
clen -= 2 * BLOCKBYTES;
}
// Process the last message double block
if (clen == 2 * BLOCKBYTES) { // Last message double block is full
UPDATE_CTR(tks.tk1);
XOR_BLOCK(state, state, m_auth);
precompute_rtk2_3(tks.rtk2_3, m_auth + BLOCKBYTES, k);
precompute_rtk1(tks.rtk1, tks.tk1);
skinny128_384(state, state, tks.rtk1, tks.rtk2_3);
} else if (clen > BLOCKBYTES) { // Last message double block is partial
clen -= BLOCKBYTES;
UPDATE_CTR(tks.tk1);
XOR_BLOCK(state, state, m_auth);
memcpy(pad, m_auth + BLOCKBYTES, clen);
memset(pad + clen, 0x00, BLOCKBYTES - clen - 1);
pad[15] = (u8)clen; // Padding
precompute_rtk2_3(tks.rtk2_3, pad, k);
precompute_rtk1(tks.rtk1, tks.tk1);
skinny128_384(state, state, tks.rtk1, tks.rtk2_3);
} else if (clen == BLOCKBYTES) { // Last message single block is full
XOR_BLOCK(state, state, m_auth);
} else if (clen > 0) { // Last message single block is partial
// NOTE(review): m and m_auth appear to point at the same byte here (both
// were advanced by the same multiple of BLOCKBYTES); the encrypt side uses
// m_auth in the matching branch -- confirm before relying on either name.
for(int i =0; i < (int)clen; i++)
state[i] ^= m[i];
state[15] ^= (u8)clen; // Padding
}
// Final call of the authentication pass, tweaked with the nonce
SET_DOMAIN(tks, final_domain);
UPDATE_CTR(tks.tk1);
precompute_rtk2_3(tks.rtk2_3, npub, k);
precompute_rtk1(tks.rtk1, tks.tk1);
skinny128_384(state, state, tks.rtk1, tks.rtk2_3);
// ----------------- Process the associated data -----------------
// ----------------- Generate and check the tag -----------------
G(state,state);
tmp = 0;
for(int i = 0; i < TAGBYTES; i++)
tmp |= state[i] ^ c_tmp[*mlen+i]; //constant-time tag comparison
// ----------------- Generate and check the tag -----------------
return tmp;
}
\ No newline at end of file
#ifndef ROMULUSN1_H_
#define ROMULUSN1_H_
#include "skinny128.h"
typedef unsigned char u8;
typedef unsigned int u32;
//Bugfix: u64 was previously 'unsigned int' (32-bit on the targeted platforms),
//which silently truncated the 64-bit lengths ('unsigned long long') assigned
//to u64 variables in crypto_aead_encrypt/decrypt for inputs >= 4GiB.
typedef unsigned long long u64;
//All tweakey material for SKINNY-128-384: tk1 is kept byte-wise since it
//holds the LFSR block counter and the domain byte, while rtk1/rtk2_3 cache
//precomputed round tweakeys to avoid tweakey-schedule recomputations.
typedef struct {
u8 tk1[16]; //to manipulate tk1 byte-wise
u32 rtk1[4*16]; //to avoid tk schedule recomputations
u32 rtk2_3[4*SKINNY128_384_ROUNDS]; //all round tweakeys
} skinny_128_384_tks;
#define TAGBYTES 16
#define KEYBYTES 16
#define BLOCKBYTES 16
//Write the domain-separation byte into the 8th byte of TK1
#define SET_DOMAIN(tks, domain) ((tks).tk1[7] = (domain))
//G as defined in the Romulus specification in a 32-bit word-wise manner
//(GNU statement expression; relies on a 'u32 tmp' being in scope)
#define G(x,y) ({ \
tmp = ((u32*)(y))[0]; \
((u32*)(x))[0] = (tmp >> 1 & 0x7f7f7f7f) ^ ((tmp ^ (tmp << 7)) & 0x80808080); \
tmp = ((u32*)(y))[1]; \
((u32*)(x))[1] = (tmp >> 1 & 0x7f7f7f7f) ^ ((tmp ^ (tmp << 7)) & 0x80808080); \
tmp = ((u32*)(y))[2]; \
((u32*)(x))[2] = (tmp >> 1 & 0x7f7f7f7f) ^ ((tmp ^ (tmp << 7)) & 0x80808080); \
tmp = ((u32*)(y))[3]; \
((u32*)(x))[3] = (tmp >> 1 & 0x7f7f7f7f) ^ ((tmp ^ (tmp << 7)) & 0x80808080); \
})
//update the counter in tk1 in a 32-bit word-wise manner
#define UPDATE_CTR(tk1) ({ \
tmp = ((u32*)(tk1))[1]; \
((u32*)(tk1))[1] = (tmp << 1) & 0x00ffffff; \
((u32*)(tk1))[1] |= (((u32*)(tk1))[0] >> 31); \
((u32*)(tk1))[1] |= tmp & 0xff000000; \
((u32*)(tk1))[0] <<= 1; \
if ((tmp >> 23) & 0x01) \
((u32*)(tk1))[0] ^= 0x95; \
})
//x <- y ^ z for 128-bit blocks
#define XOR_BLOCK(x,y,z) ({ \
((u32*)(x))[0] = ((u32*)(y))[0] ^ ((u32*)(z))[0]; \
((u32*)(x))[1] = ((u32*)(y))[1] ^ ((u32*)(z))[1]; \
((u32*)(x))[2] = ((u32*)(y))[2] ^ ((u32*)(z))[2]; \
((u32*)(x))[3] = ((u32*)(y))[3] ^ ((u32*)(z))[3]; \
})
//Rho as defined in the Romulus specification
//use pad as a tmp variable in case y = z
#define RHO(x,y,z) ({ \
G(pad,x); \
XOR_BLOCK(y, pad, z); \
XOR_BLOCK(x, x, z); \
})
//Rho inverse as defined in the Romulus specification
//use pad as a tmp variable in case y = z
#define RHO_INV(x, y, z) ({ \
G(pad, x); \
XOR_BLOCK(z, pad, y); \
XOR_BLOCK(x, x, z); \
})
#endif // ROMULUSN1_H_
\ No newline at end of file
/******************************************************************************
* Constant-time implementation of the SKINNY tweakable block ciphers.
*
* This implementation doesn't compute the ShiftRows operation. Some masks and
* shifts are applied during the MixColumns operation so that the proper bits
* are XORed together. Moreover, the row permutation within the MixColumns
* is omitted, as well as the bit permutation at the end of the Sbox. The rows
* are synchronized with the classical after only 4 rounds. Therefore, this
* implementation relies on a "QUADRUPLE_ROUND" routine.
*
* The Sbox computation takes advantage of some symmetry in the 8-bit Sbox to
* turn it into a 4-bit S-box computation. Although the last bit permutation
* within the Sbox is not computed, the bit ordering is synchronized with the
* classical representation after 2 calls.
*
* @author Alexandre Adomnicai, Nanyang Technological University,
* alexandre.adomnicai@ntu.edu.sg
*
* @date May 2020
******************************************************************************/
#include <stdio.h>
#include <string.h>
#include "skinny128.h"
#include "tk_schedule.h"
/******************************************************************************
* MixColumns for rounds i such that (i % 4) == 0, applied in place on the
* four words of the bitsliced state.
******************************************************************************/
void mixcolumns_0(u32* state) {
    for (int j = 0; j < 4; j++) {
        u32 w = state[j];
        w ^= ROR(ROR(w, 24) & 0x0c0c0c0c, 30);
        w ^= ROR(ROR(w, 16) & 0xc0c0c0c0, 4);
        w ^= ROR(ROR(w, 8) & 0x0c0c0c0c, 2);
        state[j] = w;
    }
}
/******************************************************************************
* MixColumns for rounds i such that (i % 4) == 1, applied in place on the
* four words of the bitsliced state.
******************************************************************************/
void mixcolumns_1(u32* state) {
    for (int j = 0; j < 4; j++) {
        u32 w = state[j];
        w ^= ROR(ROR(w, 16) & 0x30303030, 30);
        w ^= ROR(w & 0x03030303, 28);
        w ^= ROR(ROR(w, 16) & 0x30303030, 2);
        state[j] = w;
    }
}
/******************************************************************************
* MixColumns for rounds i such that (i % 4) == 2, applied in place on the
* four words of the bitsliced state.
******************************************************************************/
void mixcolumns_2(u32* state) {
    for (int j = 0; j < 4; j++) {
        u32 w = state[j];
        w ^= ROR(ROR(w, 8) & 0xc0c0c0c0, 6);
        w ^= ROR(ROR(w, 16) & 0x0c0c0c0c, 28);
        w ^= ROR(ROR(w, 24) & 0xc0c0c0c0, 2);
        state[j] = w;
    }
}
/******************************************************************************
* MixColumns for rounds i such that (i % 4) == 3, applied in place on the
* four words of the bitsliced state.
******************************************************************************/
void mixcolumns_3(u32* state) {
    for (int j = 0; j < 4; j++) {
        u32 w = state[j];
        w ^= ROR(w & 0x03030303, 30);
        w ^= ROR(w & 0x30303030, 4);
        w ^= ROR(w & 0x03030303, 26);
        state[j] = w;
    }
}
/******************************************************************************
* Encryption of a single block without any operation mode using SKINNY-128-384.
* RTK1 and RTK2_3 are given separately to take advantage of the fact that
* TK2 and TK3 remain the same through the entire data encryption/decryption.
******************************************************************************/
void skinny128_384(u8* ctext, const u8* ptext, const u32* rtk1, const u32* rtk2_3) {
    u32 tmp;            // scratch word required by the SWAPMOVE macro
    u32 state[4];       // 128-bit bitsliced state
    packing(state, ptext);          // from byte to bitsliced representation
    // 56 rounds = 14 quadruple rounds; rtk2_3 advances 16 words per quadruple
    // round while rtk1 only stores 16 rounds worth of material and therefore
    // repeats with period 4 (offsets 0, 16, 32, 48).
    for (int q = 0; q < 14; q++)
        QUADRUPLE_ROUND(state, rtk1 + 16 * (q % 4), rtk2_3 + 16 * q);
    unpacking(ctext, state);        // from bitsliced to byte representation
}
\ No newline at end of file
#ifndef SKINNY128_H_
#define SKINNY128_H_
//Fixed-width aliases (kept in sync with romulus.h / tk_schedule.c)
typedef unsigned char u8;
typedef unsigned int u32;
//Number of rounds of SKINNY-128-384
#define SKINNY128_384_ROUNDS 56
/******************************************************************************
* Four consecutive SKINNY-128-384 rounds on the bitsliced 4-word state.
* Each round applies the bitsliced Sbox (the OR/NOR + SWAPMOVE sequences),
* XORs in 4 words of rtk1 and 4 words of rtk2_3, then runs the
* mixcolumns_{0..3} variant matching the round index mod 4.
* Implemented as a GNU statement expression; relies on a 'u32 tmp' being in
* scope for SWAPMOVE, and evaluates its pointer arguments multiple times (so
* pass side-effect-free expressions only).
******************************************************************************/
#define QUADRUPLE_ROUND(state, rtk1, rtk2_3) ({ \
state[3] ^= ~(state[0] | state[1]); \
SWAPMOVE(state[2], state[1], 0x55555555, 1); \
SWAPMOVE(state[3], state[2], 0x55555555, 1); \
state[1] ^= ~(state[2] | state[3]); \
SWAPMOVE(state[1], state[0], 0x55555555, 1); \
SWAPMOVE(state[0], state[3], 0x55555555, 1); \
state[3] ^= ~(state[0] | state[1]); \
SWAPMOVE(state[2], state[1], 0x55555555, 1); \
SWAPMOVE(state[3], state[2], 0x55555555, 1); \
state[1] ^= (state[2] | state[3]); \
SWAPMOVE(state[3], state[0], 0x55555555, 0); \
state[0] ^= (rtk1)[0]; \
state[1] ^= (rtk1)[1]; \
state[2] ^= (rtk1)[2]; \
state[3] ^= (rtk1)[3]; \
state[0] ^= (rtk2_3)[0]; \
state[1] ^= (rtk2_3)[1]; \
state[2] ^= (rtk2_3)[2]; \
state[3] ^= (rtk2_3)[3]; \
mixcolumns_0(state); \
state[1] ^= ~(state[2] | state[3]); \
SWAPMOVE(state[1], state[0], 0x55555555, 1); \
SWAPMOVE(state[0], state[3], 0x55555555, 1); \
state[3] ^= ~(state[0] | state[1]); \
SWAPMOVE(state[2], state[1], 0x55555555, 1); \
SWAPMOVE(state[3], state[2], 0x55555555, 1); \
state[1] ^= ~(state[2] | state[3]); \
SWAPMOVE(state[1], state[0], 0x55555555, 1); \
SWAPMOVE(state[0], state[3], 0x55555555, 1); \
state[3] ^= (state[0] | state[1]); \
SWAPMOVE(state[1], state[2], 0x55555555, 0); \
state[0] ^= (rtk1)[4]; \
state[1] ^= (rtk1)[5]; \
state[2] ^= (rtk1)[6]; \
state[3] ^= (rtk1)[7]; \
state[0] ^= (rtk2_3)[4]; \
state[1] ^= (rtk2_3)[5]; \
state[2] ^= (rtk2_3)[6]; \
state[3] ^= (rtk2_3)[7]; \
mixcolumns_1(state); \
state[3] ^= ~(state[0] | state[1]); \
SWAPMOVE(state[2], state[1], 0x55555555, 1); \
SWAPMOVE(state[3], state[2], 0x55555555, 1); \
state[1] ^= ~(state[2] | state[3]); \
SWAPMOVE(state[1], state[0], 0x55555555, 1); \
SWAPMOVE(state[0], state[3], 0x55555555, 1); \
state[3] ^= ~(state[0] | state[1]); \
SWAPMOVE(state[2], state[1], 0x55555555, 1); \
SWAPMOVE(state[3], state[2], 0x55555555, 1); \
state[1] ^= (state[2] | state[3]); \
SWAPMOVE(state[3], state[0], 0x55555555, 0); \
state[0] ^= (rtk1)[8]; \
state[1] ^= (rtk1)[9]; \
state[2] ^= (rtk1)[10]; \
state[3] ^= (rtk1)[11]; \
state[0] ^= (rtk2_3)[8]; \
state[1] ^= (rtk2_3)[9]; \
state[2] ^= (rtk2_3)[10]; \
state[3] ^= (rtk2_3)[11]; \
mixcolumns_2(state); \
state[1] ^= ~(state[2] | state[3]); \
SWAPMOVE(state[1], state[0], 0x55555555, 1); \
SWAPMOVE(state[0], state[3], 0x55555555, 1); \
state[3] ^= ~(state[0] | state[1]); \
SWAPMOVE(state[2], state[1], 0x55555555, 1); \
SWAPMOVE(state[3], state[2], 0x55555555, 1); \
state[1] ^= ~(state[2] | state[3]); \
SWAPMOVE(state[1], state[0], 0x55555555, 1); \
SWAPMOVE(state[0], state[3], 0x55555555, 1); \
state[3] ^= (state[0] | state[1]); \
SWAPMOVE(state[1], state[2], 0x55555555, 0); \
state[0] ^= (rtk1)[12]; \
state[1] ^= (rtk1)[13]; \
state[2] ^= (rtk1)[14]; \
state[3] ^= (rtk1)[15]; \
state[0] ^= (rtk2_3)[12]; \
state[1] ^= (rtk2_3)[13]; \
state[2] ^= (rtk2_3)[14]; \
state[3] ^= (rtk2_3)[15]; \
mixcolumns_3(state); \
})
//Single-block SKINNY-128-384 encryption (defined in skinny128.c)
void skinny128_384(u8* ctext, const u8* ptext, const u32* rtk1, const u32* rtk2_3);
#endif // SKINNY128_H_
\ No newline at end of file
/******************************************************************************
* Implementation of the SKINNY tweakey schedule to match fixslicing.
*
* @author Alexandre Adomnicai, Nanyang Technological University,
* alexandre.adomnicai@ntu.edu.sg
*
* @date May 2020
******************************************************************************/
#include <stdio.h>
#include <string.h> //for memcmp
#include "tk_schedule.h"
#include "skinny128.h"
typedef unsigned char u8;
typedef unsigned int u32;
/******************************************************************************
* The round constants according to the new (fixsliced) representation:
* 224 = 4 * SKINNY128_384_ROUNDS 32-bit words, i.e. four words per round for
* the 56 rounds of SKINNY-128-384.
******************************************************************************/
u32 rconst_32_bs[224] = {
0x00000004, 0xffffffbf, 0x00000000, 0x00000000, 0x00000000, 0x00000000,
0x10000100, 0xfffffeff, 0x44000000, 0xfbffffff, 0x00000000, 0x04000000,
0x00100000, 0x00100000, 0x00100001, 0xffefffff, 0x00440000, 0xffafffff,
0x00400000, 0x00400000, 0x01000000, 0x01000000, 0x01401000, 0xffbfffff,
0x01004000, 0xfefffbff, 0x00000400, 0x00000400, 0x00000010, 0x00000000,
0x00010410, 0xfffffbef, 0x00000054, 0xffffffaf, 0x00000000, 0x00000040,
0x00000100, 0x00000100, 0x10000140, 0xfffffeff, 0x44000000, 0xfffffeff,
0x04000000, 0x04000000, 0x00100000, 0x00100000, 0x04000001, 0xfbffffff,
0x00140000, 0xffafffff, 0x00400000, 0x00000000, 0x00000000, 0x00000000,
0x01401000, 0xfebfffff, 0x01004400, 0xfffffbff, 0x00000000, 0x00000400,
0x00000010, 0x00000010, 0x00010010, 0xffffffff, 0x00000004, 0xffffffaf,
0x00000040, 0x00000040, 0x00000100, 0x00000000, 0x10000140, 0xffffffbf,
0x40000100, 0xfbfffeff, 0x00000000, 0x04000000, 0x00100000, 0x00000000,
0x04100001, 0xffefffff, 0x00440000, 0xffefffff, 0x00000000, 0x00400000,
0x01000000, 0x01000000, 0x00401000, 0xffffffff, 0x00004000, 0xfeffffff,
0x00000400, 0x00000000, 0x00000000, 0x00000000, 0x00010400, 0xfffffbff,
0x00000014, 0xffffffbf, 0x00000000, 0x00000000, 0x00000000, 0x00000000,
0x10000100, 0xffffffff, 0x40000000, 0xfbffffff, 0x00000000, 0x04000000,
0x00100000, 0x00000000, 0x00100001, 0xffefffff, 0x00440000, 0xffafffff,
0x00000000, 0x00400000, 0x01000000, 0x01000000, 0x01401000, 0xffffffff,
0x00004000, 0xfeffffff, 0x00000400, 0x00000400, 0x00000010, 0x00000000,
0x00010400, 0xfffffbff, 0x00000014, 0xffffffaf, 0x00000000, 0x00000000,
0x00000000, 0x00000000, 0x10000140, 0xfffffeff, 0x44000000, 0xffffffff,
0x00000000, 0x04000000, 0x00100000, 0x00100000, 0x00000001, 0xffefffff,
0x00440000, 0xffafffff, 0x00400000, 0x00000000, 0x00000000, 0x01000000,
0x01401000, 0xffbfffff, 0x01004000, 0xfffffbff, 0x00000400, 0x00000400,
0x00000010, 0x00000000, 0x00010010, 0xfffffbff, 0x00000014, 0xffffffef,
0x00000000, 0x00000040, 0x00000100, 0x00000000, 0x10000040, 0xfffffeff,
0x44000000, 0xfffffeff, 0x00000000, 0x00000000, 0x00000000, 0x00100000,
0x04000001, 0xffffffff, 0x00040000, 0xffffffff, 0x00400000, 0x00000000,
0x00000000, 0x00000000, 0x00001000, 0xfebfffff, 0x01004400, 0xffffffff,
0x00000000, 0x00000000, 0x00000000, 0x00000010, 0x00010000, 0xffffffff,
0x00000004, 0xffffffbf, 0x00000040, 0x00000000, 0x00000000, 0x00000000,
0x10000100, 0xfffffebf, 0x44000100, 0xffffffff, 0x00000000, 0x04000000,
0x00100000, 0x00100000, 0x00000001, 0xffffffff, 0x00040000, 0xffafffff,
0x00400000, 0x00000000, 0x00000000, 0x00000000, 0x01401000, 0xffbfffff,
0x01004000, 0xfffffbff, 0x00000000, 0x00000400, 0x00000010, 0x00000000,
0x00010010, 0xffffffff
};
/******************************************************************************
* Pack the input into the bitsliced representation
* 24 28 56 60 88 92 120 124 | ... | 0 4 32 36 64 68 96 100
* 25 29 57 61 89 93 121 125 | ... | 1 5 33 37 65 69 97 101
* 26 30 58 62 90 94 122 126 | ... | 2 6 34 38 66 70 98 102
* 27 31 59 63 91 95 123 127 | ... | 3 7 35 39 67 71 99 103
* Loads the 16 input bytes little-endian (in byte order 0, 8, 4, 12) and
* reorders the bits with a fixed SWAPMOVE network.
******************************************************************************/
void packing(u32* out, const u8* in) {
u32 tmp; // scratch word required by the SWAPMOVE macro
LE_LOAD(out, in);
LE_LOAD(out + 1, in + 8);
LE_LOAD(out + 2, in + 4);
LE_LOAD(out + 3, in + 12);
SWAPMOVE(out[0], out[0], 0x0a0a0a0a, 3);
SWAPMOVE(out[1], out[1], 0x0a0a0a0a, 3);
SWAPMOVE(out[2], out[2], 0x0a0a0a0a, 3);
SWAPMOVE(out[3], out[3], 0x0a0a0a0a, 3);
SWAPMOVE(out[2], out[0], 0x30303030, 2);
SWAPMOVE(out[1], out[0], 0x0c0c0c0c, 4);
SWAPMOVE(out[3], out[0], 0x03030303, 6);
SWAPMOVE(out[1], out[2], 0x0c0c0c0c, 2);
SWAPMOVE(out[3], out[2], 0x03030303, 4);
SWAPMOVE(out[3], out[1], 0x03030303, 2);
}
/******************************************************************************
* Unpack the input to a byte-wise representation: exact inverse of packing
* (the same SWAPMOVE steps in reverse order, then little-endian stores in
* byte order 0, 8, 4, 12).
******************************************************************************/
void unpacking(u8* out, u32 *in) {
u32 tmp; // scratch word required by the SWAPMOVE macro
SWAPMOVE(in[3], in[1], 0x03030303, 2);
SWAPMOVE(in[3], in[2], 0x03030303, 4);
SWAPMOVE(in[1], in[2], 0x0c0c0c0c, 2);
SWAPMOVE(in[3], in[0], 0x03030303, 6);
SWAPMOVE(in[1], in[0], 0x0c0c0c0c, 4);
SWAPMOVE(in[2], in[0], 0x30303030, 2);
SWAPMOVE(in[0], in[0], 0x0a0a0a0a, 3);
SWAPMOVE(in[1], in[1], 0x0a0a0a0a, 3);
SWAPMOVE(in[2], in[2], 0x0a0a0a0a, 3);
SWAPMOVE(in[3], in[3], 0x0a0a0a0a, 3);
LE_STORE(out, in[0]);
LE_STORE(out + 8, in[1]);
LE_STORE(out + 4, in[2]);
LE_STORE(out + 12, in[3]);
}
/******************************************************************************
* Bitsliced LFSR2 (TK2 update). The slices rotate upwards and the new top
* slice is derived from slices 0 and 2:
* 0 4        1 5
* 1 5  --->  2 6
* 2 6        3 7
* 3 7        4 0
******************************************************************************/
void lfsr2_bs(u32* tk) {
    u32 fb = tk[0] ^ (tk[2] & 0xaaaaaaaa);  // feedback from slices 0 and 2
    // swap the two bits within every bit pair of the feedback word
    fb = ((fb & 0xaaaaaaaa) >> 1) | ((fb << 1) & 0xaaaaaaaa);
    tk[0] = tk[1];
    tk[1] = tk[2];
    tk[2] = tk[3];
    tk[3] = fb;
}
/******************************************************************************
* Bitsliced LFSR3 (TK3 update). The slices rotate downwards and the new
* bottom slice is derived from slices 3 and 1:
* 0 4        7 3
* 1 5  --->  0 4
* 2 6        1 5
* 3 7        2 6
******************************************************************************/
void lfsr3_bs(u32* tk) {
    u32 fb = tk[3] ^ ((tk[1] & 0xaaaaaaaa) >> 1);   // feedback from slices 3 and 1
    // swap the two bits within every bit pair of the feedback word
    fb = ((fb & 0xaaaaaaaa) >> 1) | ((fb << 1) & 0xaaaaaaaa);
    tk[3] = tk[2];
    tk[2] = tk[1];
    tk[1] = tk[0];
    tk[0] = fb;
}
/******************************************************************************
* Apply the tweakey permutation in a bitsliced manner, twice (P^2), in place
* on the four words of the bitsliced tweakey.
******************************************************************************/
void permute_tk_2(u32* tk) {
    for (int j = 0; j < 4; j++) {
        u32 w = tk[j];
        u32 r = ROR(w, 14) & 0xcc00cc00;
        r |= (w & 0x000000ff) << 16;
        r |= (w & 0xcc000000) >> 2;
        r |= (w & 0x0033cc00) >> 8;
        r |= (w & 0x00cc0000) >> 18;
        tk[j] = r;
    }
}
/******************************************************************************
* Apply the permutation in a bitsliced manner, 4 times
******************************************************************************/
void permute_tk_4(u32* tk) {
	u32 tmp;
	//Same structure as permute_tk_2 but with the masks/rotations of P^4.
	for(int i =0; i < 4; i++) {
		tmp = tk[i];
		tk[i] = ROR(tmp,22) & 0xcc0000cc;
		tk[i] |= ROR(tmp,16) & 0x3300cc00;
		tk[i] |= ROR(tmp, 24) & 0x00cc3300;
		tk[i] |= (tmp & 0x00cc00cc) >> 2;
	}
}
/******************************************************************************
* Apply the permutation in a bitsliced manner, 6 times
******************************************************************************/
void permute_tk_6(u32* tk) {
	u32 tmp;
	//Same structure as permute_tk_2 but with the masks/rotations of P^6.
	for(int i =0; i < 4; i++) {
		tmp = tk[i];
		tk[i] = ROR(tmp,6) & 0xcccc0000;
		tk[i] |= ROR(tmp,24) & 0x330000cc;
		tk[i] |= ROR(tmp,10) & 0x3333;
		tk[i] |= (tmp & 0xcc) << 14;
		tk[i] |= (tmp & 0x3300) << 2;
	}
}
/******************************************************************************
* Apply the permutation in a bitsliced manner, 8 times
******************************************************************************/
void permute_tk_8(u32* tk) {
	u32 tmp;
	//Same structure as permute_tk_2 but with the masks/rotations of P^8.
	for(int i =0; i < 4; i++) {
		tmp = tk[i];
		tk[i] = ROR(tmp,24) & 0xcc000033;
		tk[i] |= ROR(tmp,8) & 0x33cc0000;
		tk[i] |= ROR(tmp,26) & 0x00333300;
		tk[i] |= (tmp & 0x00333300) >> 6;
	}
}
/******************************************************************************
* Apply the permutation in a bitsliced manner, 10 times
******************************************************************************/
void permute_tk_10(u32* tk) {
	u32 tmp;
	//Same structure as permute_tk_2 but with the masks/rotations of P^10.
	for(int i =0; i < 4; i++) {
		tmp = tk[i];
		tk[i] = ROR(tmp,8) & 0xcc330000;
		tk[i] |= ROR(tmp,26) & 0x33000033;
		tk[i] |= ROR(tmp,22) & 0x00cccc00;
		tk[i] |= (tmp & 0x00330000) >> 14;
		tk[i] |= (tmp & 0xcc00) >> 2;
	}
}
/******************************************************************************
* Apply the permutation in a bitsliced manner, 12 times
******************************************************************************/
void permute_tk_12(u32* tk) {
	u32 tmp;
	//Same structure as permute_tk_2 but with the masks/rotations of P^12.
	for(int i =0; i < 4; i++) {
		tmp = tk[i];
		tk[i] = ROR(tmp,8) & 0xcc33;
		tk[i] |= ROR(tmp,30) & 0x00cc00cc;
		tk[i] |= ROR(tmp,10) & 0x33330000;
		tk[i] |= ROR(tmp,16) & 0xcc003300;
	}
}
/******************************************************************************
* Apply the permutation in a bitsliced manner, 14 times
******************************************************************************/
void permute_tk_14(u32* tk) {
	u32 tmp;
	//Same structure as permute_tk_2 but with the masks/rotations of P^14.
	for(int i =0; i < 4; i++) {
		tmp = tk[i];
		tk[i] = ROR(tmp,24) & 0x0033cc00;
		tk[i] |= ROR(tmp,14) & 0x00cc0000;
		tk[i] |= ROR(tmp,30) & 0xcc000000;
		tk[i] |= ROR(tmp,16) & 0x000000ff;
		tk[i] |= ROR(tmp,18) & 0x33003300;
	}
}
/******************************************************************************
* Precompute all LFSRs on TK2
******************************************************************************/
//Precompute all LFSR2 states of TK2: pack the key, then store the bitsliced
//state before each pair of rounds. Only word offsets 0-3 and 8k+4..8k+7 are
//written (offsets 8k+8..8k+11 are left untouched, as the callers rely on).
void precompute_lfsr_tk2(u32* tk, const u8* key, const int rounds) {
	u32 state[4];
	packing(state, key);
	memcpy(tk, state, 16);
	//one LFSR step covers two rounds, hence the stride of 8 words
	for (int k = 0; 2 * k < rounds; k++) {
		lfsr2_bs(state);
		memcpy(tk + 8 * k + 4, state, 16);
	}
}
/******************************************************************************
* Precompute all LFSRs on TK3
******************************************************************************/
//Precompute all LFSR3 states of TK3 and XOR them on top of the TK2 material
//already stored by precompute_lfsr_tk2 (same offsets: 0-3 and 8k+4..8k+7).
void precompute_lfsr_tk3(u32* tk, const u8* key, const int rounds) {
	u32 state[4];
	packing(state, key);
	for (int j = 0; j < 4; j++)
		tk[j] ^= state[j];
	//one LFSR step covers two rounds, hence the stride of 8 words
	for (int k = 0; 2 * k < rounds; k++) {
		lfsr3_bs(state);
		for (int j = 0; j < 4; j++)
			tk[8 * k + 4 + j] ^= state[j];
	}
}
/******************************************************************************
* XOR TK with TK1 before applying the permutations.
* The key is then rearranged to match the barrel shiftrows representation.
******************************************************************************/
void permute_tk(u32* tk, const u8* key, const int rounds) {
	u32 test; //boolean: selects which power of P applies to this 8-round group
	u32 tk1[4], tmp[4];
	packing(tk1, key);
	memcpy(tmp, tk, 16);
	//XOR TK1 into the first tweakey block
	tmp[0] ^= tk1[0];
	tmp[1] ^= tk1[1];
	tmp[2] ^= tk1[2];
	tmp[3] ^= tk1[3];
	//Process 8 rounds per iteration. The tweakey permutation P has order 16,
	//so even-numbered groups use P^2/P^4/P^6/P^8 and odd-numbered groups use
	//P^10/P^12/P^14 (P^16 = identity).
	for(int i = 0 ; i < rounds; i += 8) {
		test = (i % 16 < 8) ? 1 : 0; //to apply the right power of P
		//rounds i, i+1: rearrange into the barrel-shiftrows representation
		tk[i*4] = tmp[2] & 0xf0f0f0f0;
		tk[i*4+1] = tmp[3] & 0xf0f0f0f0;
		tk[i*4+2] = tmp[0] & 0xf0f0f0f0;
		tk[i*4+3] = tmp[1] & 0xf0f0f0f0;
		memcpy(tmp, tk+i*4+4, 16);
		XOR_BLOCKS(tmp, tk1);
		if (test)
			permute_tk_2(tmp); // applies P^2
		else
			permute_tk_10(tmp); // applies P^10
		//rounds i+2, i+3
		tk[i*4+4] = ROR(tmp[0],26) & 0xc3c3c3c3;
		tk[i*4+5] = ROR(tmp[1],26) & 0xc3c3c3c3;
		tk[i*4+6] = ROR(tmp[2],26) & 0xc3c3c3c3;
		tk[i*4+7] = ROR(tmp[3],26) & 0xc3c3c3c3;
		tk[i*4+8] = ROR(tmp[2],28) & 0x03030303;
		tk[i*4+8] |= ROR(tmp[2],12) & 0x0c0c0c0c;
		tk[i*4+9] = ROR(tmp[3],28) & 0x03030303;
		tk[i*4+9] |= ROR(tmp[3],12) & 0x0c0c0c0c;
		tk[i*4+10] = ROR(tmp[0],28) & 0x03030303;
		tk[i*4+10] |= ROR(tmp[0],12) & 0x0c0c0c0c;
		tk[i*4+11] = ROR(tmp[1],28) & 0x03030303;
		tk[i*4+11] |= ROR(tmp[1],12) & 0x0c0c0c0c;
		memcpy(tmp, tk+i*4+12, 16);
		XOR_BLOCKS(tmp, tk1);
		if (test)
			permute_tk_4(tmp); // applies P^4
		else
			permute_tk_12(tmp); // applies P^12
		//rounds i+4, i+5
		for(int j = 0; j < 4; j++) {
			tk[i*4+12+j] = ROR(tmp[j],14) & 0x30303030;
			tk[i*4+12+j] |= ROR(tmp[j],6) & 0x0c0c0c0c;
		}
		tk[i*4+16] = ROR(tmp[2], 16) & 0xf0f0f0f0;
		tk[i*4+17] = ROR(tmp[3], 16) & 0xf0f0f0f0;
		tk[i*4+18] = ROR(tmp[0], 16) & 0xf0f0f0f0;
		tk[i*4+19] = ROR(tmp[1], 16) & 0xf0f0f0f0;
		memcpy(tmp, tk+i*4+20, 16);
		XOR_BLOCKS(tmp, tk1);
		if (test)
			permute_tk_6(tmp); // applies P^6
		else
			permute_tk_14(tmp); // applies P^14
		//rounds i+6, i+7
		tk[i*4+20] = ROR(tmp[0], 10) & 0xc3c3c3c3;
		tk[i*4+21] = ROR(tmp[1], 10) & 0xc3c3c3c3;
		tk[i*4+22] = ROR(tmp[2], 10) & 0xc3c3c3c3;
		tk[i*4+23] = ROR(tmp[3], 10) & 0xc3c3c3c3;
		tk[i*4+24] = ROR(tmp[2],12) & 0x03030303;
		tk[i*4+24] |= ROR(tmp[2],28) & 0x0c0c0c0c;
		tk[i*4+25] = ROR(tmp[3],12) & 0x03030303;
		tk[i*4+25] |= ROR(tmp[3],28) & 0x0c0c0c0c;
		tk[i*4+26] = ROR(tmp[0],12) & 0x03030303;
		tk[i*4+26] |= ROR(tmp[0],28) & 0x0c0c0c0c;
		tk[i*4+27] = ROR(tmp[1],12) & 0x03030303;
		tk[i*4+27] |= ROR(tmp[1],28) & 0x0c0c0c0c;
		memcpy(tmp, tk+i*4+28, 16);
		XOR_BLOCKS(tmp, tk1);
		if (test)
			permute_tk_8(tmp); // applies P^8 (else: P^16 = identity, nothing to do)
		for(int j = 0; j < 4; j++) {
			tk[i*4+28+j] = ROR(tmp[j],30) & 0x30303030;
			tk[i*4+28+j] |= ROR(tmp[j],22) & 0x0c0c0c0c;
		}
		if (test && (i+8 < rounds)) { //only if next loop iteration
			//pre-store the first words of the next group (consumed above)
			tk[i*4+32] = tmp[2] & 0xf0f0f0f0;
			tk[i*4+33] = tmp[3] & 0xf0f0f0f0;
			tk[i*4+34] = tmp[0] & 0xf0f0f0f0;
			tk[i*4+35] = tmp[1] & 0xf0f0f0f0;
		}
	}
}
/******************************************************************************
* Precompute LFSR2(TK2) ^ LFSR3(TK3) ^ rconst.
******************************************************************************/
void precompute_rtk2_3(u32* rtk, const u8* tk2, const u8 * tk3) {
	memset(rtk, 0x00, 16*SKINNY128_384_ROUNDS); //16 bytes of tweakey per round
	precompute_lfsr_tk2(rtk, tk2, SKINNY128_384_ROUNDS);
	precompute_lfsr_tk3(rtk, tk3, SKINNY128_384_ROUNDS);
	//The LFSR precomputations only write words at offsets 0-3 and 8k+4..8k+7,
	//so the 16 bytes at rtk+8 are still zero from the memset above: they are
	//passed to permute_tk as an all-zero TK1.
	permute_tk(rtk, (u8*)(rtk+8), SKINNY128_384_ROUNDS); // rtk+8 is NULL
	for(int i = 0; i < SKINNY128_384_ROUNDS; i++) { // add rconsts
		for(int j = 0; j < 4; j++)
			rtk[i*4+j] ^= rconst_32_bs[i*4+j];
	}
}
/******************************************************************************
* Precompute RTK1.
******************************************************************************/
//Precompute the TK1 round tweakeys. Since P has order 16, 16 rounds of
//material suffice; the schedule repeats from there.
void precompute_rtk1(u32* rtk1, const u8* tk1) {
	memset(rtk1, 0, 16 * 4 * sizeof(u32)); //16 rounds * 4 words
	permute_tk(rtk1, tk1, 16);
}
\ No newline at end of file
#ifndef TK_SCHEDULE_H_
#define TK_SCHEDULE_H_
typedef unsigned char u8;
typedef unsigned int u32;
//Bitslice a 16-byte block into 4 32-bit words / undo that representation.
void packing(u32* out, const u8* in);
void unpacking(u8* out, u32 *in);
//Precompute LFSR2(TK2) ^ LFSR3(TK3) ^ rconst for all rounds.
void precompute_rtk2_3(u32* rtk, const u8* tk2, const u8* tk3);
//Precompute the round tweakeys derived from TK1 (16 rounds; pattern repeats).
void precompute_rtk1(u32* rtk1, const u8* tk1);
//Rotate the 32-bit word x right by y bits (y must be in 1..31).
#define ROR(x,y) (((x) >> (y)) | ((x) << (32 - (y))))
//x <- x ^ y for 128-bit blocks given as u32[4] (GNU statement expression).
#define XOR_BLOCKS(x,y) ({ \
	(x)[0] ^= (y)[0]; \
	(x)[1] ^= (y)[1]; \
	(x)[2] ^= (y)[2]; \
	(x)[3] ^= (y)[3]; \
})
//Swap the bits of a and b selected by mask after shifting a right by n.
//Requires a u32 variable named 'tmp' in the calling scope.
//(Arguments are fully parenthesized to avoid precedence surprises.)
#define SWAPMOVE(a, b, mask, n) ({ \
	tmp = ((b) ^ ((a) >> (n))) & (mask); \
	(b) ^= tmp; \
	(a) ^= tmp << (n); \
})
//Load 4 little-endian bytes from y into the u32 pointed to by x.
#define LE_LOAD(x, y) \
	*(x) = (((u32)(y)[3] << 24) | \
		((u32)(y)[2] << 16) | \
		((u32)(y)[1] << 8) | \
		(y)[0]);
//Store the u32 y as 4 little-endian bytes at x.
//Wrapped in do/while(0) so the multi-statement macro behaves as a single
//statement (safe inside an unbraced if/else or loop body).
#define LE_STORE(x, y) do { \
	(x)[0] = (y) & 0xff; \
	(x)[1] = ((y) >> 8) & 0xff; \
	(x)[2] = ((y) >> 16) & 0xff; \
	(x)[3] = (y) >> 24; \
} while (0)
#endif // TK_SCHEDULE_H_
\ No newline at end of file
This source diff could not be displayed because it is too large. You can view the blob instead.
#define CRYPTO_KEYBYTES 16 //128-bit secret key
#define CRYPTO_NSECBYTES 0 //no secret message number
#define CRYPTO_NPUBBYTES 16 //128-bit public nonce
#define CRYPTO_ABYTES 16 //128-bit authentication tag
#define CRYPTO_NOOVERLAP 1
//API required by the NIST for the LWC competition
//Writes mlen+CRYPTO_ABYTES bytes to c and sets *clen accordingly; returns 0.
int crypto_aead_encrypt(unsigned char *c, unsigned long long *clen,
	const unsigned char *m, unsigned long long mlen,
	const unsigned char *ad, unsigned long long adlen,
	const unsigned char *nsec, const unsigned char *npub,
	const unsigned char *k);
//API required by the NIST for the LWC competition
//Returns 0 and sets *outputmlen on success; nonzero on tag mismatch.
int crypto_aead_decrypt(unsigned char *m, unsigned long long *outputmlen,
	unsigned char *nsec,
	const unsigned char *c, unsigned long long clen,
	const unsigned char *ad, unsigned long long adlen,
	const unsigned char *npub, const unsigned char *k);
#include "skinny128.h"
#include "romulus.h"
#include <string.h>
//Encryption and authentication using Romulus-N1
int crypto_aead_encrypt
(unsigned char *c, unsigned long long *clen,
	const unsigned char *m, unsigned long long mlen,
	const unsigned char *ad, unsigned long long adlen,
	const unsigned char *nsec,
	const unsigned char *npub,
	const unsigned char *k) {
	u32 tmp; //scratch word required by the G/UPDATE_CTR/RHO macros
	skinny_128_384_tks tks;
	u8 state[BLOCKBYTES], pad[BLOCKBYTES]; //pad doubles as RHO scratch
	(void)nsec; //unused: CRYPTO_NSECBYTES == 0
	// ----------------- Initialization -----------------
	*clen = mlen + TAGBYTES;
	memset(tks.tk1, 0x00, KEYBYTES);
	memset(state, 0x00, BLOCKBYTES);
	tks.tk1[0] = 0x01; // Init 56-bit LFSR counter
	// ----------------- Initialization -----------------
	// ----------------- Process the associated data -----------------
	if (adlen == 0) { // Handle the special case of no AD
		UPDATE_CTR(tks.tk1);
		SET_DOMAIN(tks, 0x1A);
		tkschedule_lfsr(tks.rtk, npub, k, SKINNY128_384_ROUNDS); //TK2 <- nonce, TK3 <- key
		tkschedule_perm(tks.rtk);
		tkschedule_perm_tk1(tks.rtk1, tks.tk1);
		skinny128_384(state, tks.rtk, state, tks.rtk1);
	} else { // Process double blocks but the last
		SET_DOMAIN(tks, 0x08);
		while (adlen > 2*BLOCKBYTES) {
			UPDATE_CTR(tks.tk1);
			XOR_BLOCK(state, state, ad); //1st AD block absorbed into the state
			tkschedule_lfsr(tks.rtk, ad + BLOCKBYTES, k, SKINNY128_384_ROUNDS); //2nd AD block absorbed as TK2
			tkschedule_perm(tks.rtk);
			tkschedule_perm_tk1(tks.rtk1, tks.tk1);
			skinny128_384(state, tks.rtk, state, tks.rtk1);
			UPDATE_CTR(tks.tk1); //counter advances once per single block
			ad += 2*BLOCKBYTES;
			adlen -= 2*BLOCKBYTES;
		}
		// Pad and process the left-over blocks
		UPDATE_CTR(tks.tk1);
		if (adlen == 2*BLOCKBYTES) { // Left-over complete double block
			XOR_BLOCK(state, state, ad);
			tkschedule_lfsr(tks.rtk, ad + BLOCKBYTES, k, SKINNY128_384_ROUNDS);
			tkschedule_perm(tks.rtk);
			tkschedule_perm_tk1(tks.rtk1, tks.tk1);
			skinny128_384(state, tks.rtk, state, tks.rtk1);
			UPDATE_CTR(tks.tk1);
			SET_DOMAIN(tks, 0x18);
		} else if (adlen > BLOCKBYTES) { // Left-over partial double block
			adlen -= BLOCKBYTES;
			XOR_BLOCK(state, state, ad);
			//10*-style padding: zero-fill, last byte records the length
			memcpy(pad, ad + BLOCKBYTES, adlen);
			memset(pad + adlen, 0x00, 15 - adlen);
			pad[15] = adlen;
			tkschedule_lfsr(tks.rtk, pad, k, SKINNY128_384_ROUNDS);
			tkschedule_perm(tks.rtk);
			tkschedule_perm_tk1(tks.rtk1, tks.tk1);
			skinny128_384(state, tks.rtk, state, tks.rtk1);
			UPDATE_CTR(tks.tk1);
			SET_DOMAIN(tks, 0x1A);
		} else if (adlen == BLOCKBYTES) { // Left-over complete single block
			XOR_BLOCK(state, state, ad);
			SET_DOMAIN(tks, 0x18);
		} else { // Left-over partial single block
			for(int i = 0; i < (int)adlen; i++)
				state[i] ^= ad[i];
			state[15] ^= adlen; // Padding
			SET_DOMAIN(tks, 0x1A);
		}
		tkschedule_lfsr(tks.rtk, npub, k, SKINNY128_384_ROUNDS); //final AD call uses the nonce as TK2
		tkschedule_perm(tks.rtk);
		tkschedule_perm_tk1(tks.rtk1, tks.tk1);
		skinny128_384(state, tks.rtk, state, tks.rtk1);
	}
	// ----------------- Process the associated data -----------------
	// ----------------- Process the plaintext -----------------
	memset(tks.tk1, 0x00, KEYBYTES/2); //reset counter+domain; tks.rtk is reused as-is
	tks.tk1[0] = 0x01; // Init the 56-bit LFSR counter
	if (mlen == 0) {
		UPDATE_CTR(tks.tk1);
		SET_DOMAIN(tks, 0x15);
		tkschedule_perm_tk1(tks.rtk1, tks.tk1);
		skinny128_384(state, tks.rtk, state, tks.rtk1);
	} else { // Process all blocks except the last
		SET_DOMAIN(tks, 0x04);
		while (mlen > BLOCKBYTES) {
			RHO(state,c,m); //updates state and emits one ciphertext block
			UPDATE_CTR(tks.tk1);
			tkschedule_perm_tk1(tks.rtk1, tks.tk1);
			skinny128_384(state, tks.rtk, state, tks.rtk1);
			c += BLOCKBYTES;
			m += BLOCKBYTES;
			mlen -= BLOCKBYTES;
		}
		// Pad and process the last block
		UPDATE_CTR(tks.tk1);
		if (mlen < BLOCKBYTES) { // Last message single block is partial
			//byte-wise RHO restricted to the mlen valid bytes
			for(int i = 0; i < (int)mlen; i++) {
				tmp = m[i]; // Use of tmp variable in case c = m
				c[i] = m[i] ^ (state[i] >> 1) ^ (state[i] & 0x80) ^ (state[i] << 7); //G applied byte-wise
				state[i] ^= (u8)tmp;
			}
			state[15] ^= (u8)mlen; // Padding
			SET_DOMAIN(tks, 0x15);
		} else { // Last message single block is full
			RHO(state,c,m);
			SET_DOMAIN(tks, 0x14);
		}
		tkschedule_perm_tk1(tks.rtk1, tks.tk1);
		skinny128_384(state, tks.rtk, state, tks.rtk1);
		c += mlen; //c now points at the tag location
	}
	// ----------------- Process the plaintext -----------------
	// ----------------- Generate the tag -----------------
	G(c,state); //tag = G(state), appended after the ciphertext
	// ----------------- Generate the tag -----------------
	return 0;
}
//Decryption and tag verification using Romulus-N1
int crypto_aead_decrypt
(unsigned char *m, unsigned long long *mlen,
	unsigned char *nsec,
	const unsigned char *c, unsigned long long clen,
	const unsigned char *ad, unsigned long long adlen,
	const unsigned char *npub,
	const unsigned char *k) {
	u32 tmp; //scratch word required by the G/UPDATE_CTR/RHO_INV macros
	skinny_128_384_tks tks;
	u8 state[BLOCKBYTES], pad[BLOCKBYTES]; //pad doubles as RHO_INV scratch
	(void)nsec; //unused: CRYPTO_NSECBYTES == 0
	if (clen < TAGBYTES)
		return -1; //ciphertext cannot even hold the tag
	// ----------------- Initialization -----------------
	*mlen = clen - TAGBYTES;
	memset(tks.tk1, 0x00, KEYBYTES);
	memset(state, 0x00, BLOCKBYTES);
	tks.tk1[0] = 0x01; // Init 56-bit LFSR counter
	// ----------------- Initialization -----------------
	// ----------------- Process the associated data -----------------
	//AD processing is identical to crypto_aead_encrypt
	if (adlen == 0) { // Handle the special case of no AD
		UPDATE_CTR(tks.tk1);
		SET_DOMAIN(tks, 0x1A);
		tkschedule_lfsr(tks.rtk, npub, k, SKINNY128_384_ROUNDS); //TK2 <- nonce, TK3 <- key
		tkschedule_perm(tks.rtk);
		tkschedule_perm_tk1(tks.rtk1, tks.tk1);
		skinny128_384(state, tks.rtk, state, tks.rtk1);
	} else { // Process double blocks except the last
		SET_DOMAIN(tks, 0x08);
		while (adlen > 2*BLOCKBYTES) {
			UPDATE_CTR(tks.tk1);
			XOR_BLOCK(state, state, ad); //1st AD block absorbed into the state
			tkschedule_lfsr(tks.rtk, ad + BLOCKBYTES, k, SKINNY128_384_ROUNDS); //2nd AD block absorbed as TK2
			tkschedule_perm(tks.rtk);
			tkschedule_perm_tk1(tks.rtk1, tks.tk1);
			skinny128_384(state, tks.rtk, state, tks.rtk1);
			UPDATE_CTR(tks.tk1); //counter advances once per single block
			ad += 2*BLOCKBYTES;
			adlen -= 2*BLOCKBYTES;
		}
		// Pad and process the left-over blocks
		UPDATE_CTR(tks.tk1);
		if (adlen == 2*BLOCKBYTES) { // Left-over complete double block
			XOR_BLOCK(state, state, ad);
			tkschedule_lfsr(tks.rtk, ad + BLOCKBYTES, k, SKINNY128_384_ROUNDS);
			tkschedule_perm(tks.rtk);
			tkschedule_perm_tk1(tks.rtk1, tks.tk1);
			skinny128_384(state, tks.rtk, state, tks.rtk1);
			UPDATE_CTR(tks.tk1);
			SET_DOMAIN(tks, 0x18);
		} else if (adlen > BLOCKBYTES) { // Left-over partial double block
			adlen -= BLOCKBYTES;
			XOR_BLOCK(state, state, ad);
			//10*-style padding: zero-fill, last byte records the length
			memcpy(pad, ad + BLOCKBYTES, adlen);
			memset(pad + adlen, 0x00, 15 - adlen);
			pad[15] = adlen;
			tkschedule_lfsr(tks.rtk, pad, k, SKINNY128_384_ROUNDS);
			tkschedule_perm(tks.rtk);
			tkschedule_perm_tk1(tks.rtk1, tks.tk1);
			skinny128_384(state, tks.rtk, state, tks.rtk1);
			UPDATE_CTR(tks.tk1);
			SET_DOMAIN(tks, 0x1A);
		} else if (adlen == BLOCKBYTES) { // Left-over complete single block
			XOR_BLOCK(state, state, ad);
			SET_DOMAIN(tks, 0x18);
		} else { // Left-over partial single block
			for(int i = 0; i < (int)adlen; i++)
				state[i] ^= ad[i];
			state[15] ^= adlen; // Padding
			SET_DOMAIN(tks, 0x1A);
		}
		tkschedule_lfsr(tks.rtk, npub, k, SKINNY128_384_ROUNDS); //final AD call uses the nonce as TK2
		tkschedule_perm(tks.rtk);
		tkschedule_perm_tk1(tks.rtk1, tks.tk1);
		skinny128_384(state, tks.rtk, state, tks.rtk1);
	}
	// ----------------- Process the associated data -----------------
	// ----------------- Process the ciphertext -----------------
	clen -= TAGBYTES; //clen now counts ciphertext bytes only
	memset(tks.tk1, 0x00, KEYBYTES/2); //reset counter+domain; tks.rtk is reused as-is
	tks.tk1[0] = 0x01; // Init the 56-bit LFSR counter
	if (clen == 0) {
		UPDATE_CTR(tks.tk1);
		SET_DOMAIN(tks, 0x15);
		tkschedule_perm_tk1(tks.rtk1, tks.tk1);
		skinny128_384(state, tks.rtk, state, tks.rtk1);
	} else { // Process all blocks except the last
		SET_DOMAIN(tks, 0x04);
		while (clen > BLOCKBYTES) {
			RHO_INV(state,c,m); //updates state and recovers one plaintext block
			UPDATE_CTR(tks.tk1);
			tkschedule_perm_tk1(tks.rtk1, tks.tk1);
			skinny128_384(state, tks.rtk, state, tks.rtk1);
			c += BLOCKBYTES;
			m += BLOCKBYTES;
			clen -= BLOCKBYTES;
		}
		// Pad and process the last block
		UPDATE_CTR(tks.tk1);
		if (clen < BLOCKBYTES) { // Last ciphertext single block is partial
			//byte-wise RHO_INV restricted to the clen valid bytes
			for(int i = 0; i < (int)clen; i++) {
				m[i] = c[i] ^ (state[i] >> 1) ^ (state[i] & 0x80) ^ (state[i] << 7); //G applied byte-wise
				state[i] ^= m[i];
			}
			state[15] ^= (u8)clen; // Padding
			SET_DOMAIN(tks, 0x15);
		} else { // Last ciphertext single block is full
			RHO_INV(state,c,m);
			SET_DOMAIN(tks, 0x14);
		}
		tkschedule_perm_tk1(tks.rtk1, tks.tk1);
		skinny128_384(state, tks.rtk, state, tks.rtk1);
	}
	// ----------------- Process the ciphertext -----------------
	// ----------------- Generate and check the tag -----------------
	//NOTE(review): the plaintext has already been written to m at this point;
	//callers must discard it when the return value is nonzero.
	G(state,state);
	tmp = 0;
	for(int i = 0; i < TAGBYTES; i++)
		tmp |= state[i] ^ c[clen+i];	// Constant-time tag comparison
	// ----------------- Generate and check the tag -----------------
	return tmp; //0 iff the recomputed tag matches
}
\ No newline at end of file
#ifndef ROMULUSN1_H_
#define ROMULUSN1_H_
#include "skinny128.h"
typedef unsigned char u8;
typedef unsigned int u32;
//Tweakey material for SKINNY-128-384 as used by Romulus-N1
typedef struct {
	u8 tk1[16]; //to manipulate tk1 in a byte-wise manner
	u32 rtk1[32]; //to avoid recomputation of the tk schedule
	u32 rtk[4*SKINNY128_384_ROUNDS];//all round tweakeys
} skinny_128_384_tks;
#define TAGBYTES 16
#define KEYBYTES 16
#define BLOCKBYTES 16
//write the domain-separation byte into the 8th byte of TK1
#define SET_DOMAIN(tks, domain) ((tks).tk1[7] = (domain))
//G as defined in the Romulus specification in a 32-bit word-wise manner
//Requires a u32 variable named 'tmp' in the calling scope.
//NOTE(review): the u32 casts assume x and y are 4-byte aligned and rely on
//type punning — confirm for all call sites / target ABI.
#define G(x,y) ({ \
	tmp = ((u32*)(y))[0]; \
	((u32*)(x))[0] = (tmp >> 1 & 0x7f7f7f7f) ^ ((tmp ^ (tmp << 7)) & 0x80808080); \
	tmp = ((u32*)(y))[1]; \
	((u32*)(x))[1] = (tmp >> 1 & 0x7f7f7f7f) ^ ((tmp ^ (tmp << 7)) & 0x80808080); \
	tmp = ((u32*)(y))[2]; \
	((u32*)(x))[2] = (tmp >> 1 & 0x7f7f7f7f) ^ ((tmp ^ (tmp << 7)) & 0x80808080); \
	tmp = ((u32*)(y))[3]; \
	((u32*)(x))[3] = (tmp >> 1 & 0x7f7f7f7f) ^ ((tmp ^ (tmp << 7)) & 0x80808080); \
})
//update the counter in tk1 in a 32-bit word-wise manner
//Advances the 56-bit LFSR over tk1[0..6]; byte 7 (the domain) is preserved
//via the 0xff000000 mask. When bit 55 shifts out, the feedback constant 0x95
//is XORed into the low byte. Requires a u32 'tmp' in scope.
#define UPDATE_CTR(tk1) ({ \
	tmp = ((u32*)(tk1))[1]; \
	((u32*)(tk1))[1] = (tmp << 1) & 0x00ffffff; \
	((u32*)(tk1))[1] |= (((u32*)(tk1))[0] >> 31); \
	((u32*)(tk1))[1] |= tmp & 0xff000000; \
	((u32*)(tk1))[0] <<= 1; \
	if ((tmp >> 23) & 0x01) \
		((u32*)(tk1))[0] ^= 0x95; \
})
//x <- y ^ z for 128-bit blocks
#define XOR_BLOCK(x,y,z) ({ \
	((u32*)(x))[0] = ((u32*)(y))[0] ^ ((u32*)(z))[0]; \
	((u32*)(x))[1] = ((u32*)(y))[1] ^ ((u32*)(z))[1]; \
	((u32*)(x))[2] = ((u32*)(y))[2] ^ ((u32*)(z))[2]; \
	((u32*)(x))[3] = ((u32*)(y))[3] ^ ((u32*)(z))[3]; \
})
//Rho as defined in the Romulus specification
//use pad as a tmp variable in case y = z
#define RHO(x,y,z) ({ \
	G(pad,x); \
	XOR_BLOCK(y, pad, z); \
	XOR_BLOCK(x, x, z); \
})
//Rho inverse as defined in the Romulus specification
//use pad as a tmp variable in case y = z
#define RHO_INV(x, y, z) ({ \
	G(pad, x); \
	XOR_BLOCK(z, pad, y); \
	XOR_BLOCK(x, x, z); \
})
#endif // ROMULUSN1_H_
\ No newline at end of file
#ifndef SKINNY128_H_
#define SKINNY128_H_
typedef unsigned char u8;
typedef unsigned int u32;
//Number of rounds of SKINNY-128-384
#define SKINNY128_384_ROUNDS 40
//Routines implemented in ARM assembly (skinny128.s)
extern void skinny128_384(u8* ctext, const u32* tk, const u8* ptext, const u32* rtk1); //encrypt ptext into ctext under the precomputed round tweakeys
extern void tkschedule_lfsr(u32* rtk, const u8* tk2, const u8* tk3, const int rounds); //compute LFSR2(TK2) ^ LFSR3(TK3) for all rounds
extern void tkschedule_perm(u32* rtk); //apply the tweakey permutations to the precomputed rtk material
extern void tkschedule_perm_tk1(u32* rtk1, const u8* tk1); //round tweakeys derived from TK1
#endif  // SKINNY128_H_
\ No newline at end of file
/*******************************************************************************
* ARM assembly implementation of fixsliced SKINNY-128-384.
*
* For more details, see the paper at: https://
*
* @author Alexandre Adomnicai, Nanyang Technological University,
* alexandre.adomnicai@ntu.edu.sg
*
* @date May 2020
*******************************************************************************/
.syntax unified
.thumb
/*******************************************************************************
* applies P^2 on the tweakey state in a bitsliced manner
*******************************************************************************/
.align 2
p2:
	//applies P^2 to the bitsliced tweakey words r6-r9; clobbers r1, r10-r12
	movw r1, #0xcc00
	movt r1, #0xcc00 //r1 <- 0xcc00cc00
	movw r10, #0xcc00
	movt r10, #0x0033 //r10<- 0x0033cc00
	and r11, r1, r6, ror #14 // --- permute r6 twice
	bfi r11, r6, #16, #8
	and r12, r6, #0xcc000000
	orr r11, r11, r12, lsr #2
	and r12, r10, r6
	orr r11, r11, r12, lsr #8
	and r12, r6, #0x00cc0000
	orr r6, r11, r12, lsr #18 // permute r6 twice ---
	and r11, r1, r7, ror #14 // --- permute r7 twice
	bfi r11, r7, #16, #8
	and r12, r7, #0xcc000000
	orr r11, r11, r12, lsr #2
	and r12, r10, r7
	orr r11, r11, r12, lsr #8
	and r12, r7, #0x00cc0000
	orr r7, r11, r12, lsr #18 // permute r7 twice ---
	and r11, r1, r8, ror #14 // --- permute r8 twice
	bfi r11, r8, #16, #8
	and r12, r8, #0xcc000000
	orr r11, r11, r12, lsr #2
	and r12, r10, r8
	orr r11, r11, r12, lsr #8
	and r12, r8, #0x00cc0000
	orr r8, r11, r12, lsr #18 // permute r8 twice ---
	and r11, r1, r9, ror #14 // --- permute r9 twice
	bfi r11, r9, #16, #8
	and r12, r9, #0xcc000000
	orr r11, r11, r12, lsr #2
	and r12, r10, r9
	orr r11, r11, r12, lsr #8
	and r12, r9, #0x00cc0000
	orr r9, r11, r12, lsr #18 // permute r9 twice ---
	bx lr
/*******************************************************************************
* applies P^4 on the tweakey state in a bitsliced manner
*******************************************************************************/
.align 2
p4:
	//applies P^4 to the bitsliced tweakey words r6-r9; clobbers r1, r10-r12
	//NOTE(review): spills r14 to [sp] without adjusting sp — assumes the
	//caller reserved that slot; confirm at call sites.
	str.w r14, [sp] //store r14 on the stack
	movw r14, #0x00cc
	movt r14, #0xcc00 //r14<- 0xcc0000cc
	movw r12, #0xcc00
	movt r12, #0x3300 //r12<- 0x3300cc00
	movw r11, #0x00cc
	movt r11, #0x00cc //r11<- 0x00cc00cc
	and r10, r14, r6, ror #22 // --- permute r6 4 times
	and r1, r12, r6, ror #16
	orr r10, r10, r1
	and r1, r6, r11
	orr r10, r10, r1, lsr #2
	movw r1, #0xcc33 //r1 <- 0x0000cc33
	and r6, r6, r1
	orr r6, r10, r6, ror #24 // permute r6 4 times ---
	and r10, r14, r7, ror #22 // --- permute r7 4 times
	and r1, r12, r7, ror #16
	orr r10, r10, r1
	and r1, r7, r11
	orr r10, r10, r1, lsr #2
	movw r1, #0xcc33 //r1 <- 0x0000cc33
	and r7, r7, r1
	orr r7, r10, r7, ror #24 // permute r7 4 times ---
	and r10, r14, r8, ror #22 // --- permute r8 4 times
	and r1, r12, r8, ror #16
	orr r10, r10, r1
	and r1, r8, r11
	orr r10, r10, r1, lsr #2
	movw r1, #0xcc33 //r1 <- 0x0000cc33
	and r8, r8, r1
	orr r8, r10, r8, ror #24 // permute r8 4 times ---
	and r10, r14, r9, ror #22 // --- permute r9 4 times
	ldr.w r14, [sp] //restore r14
	and r12, r12, r9, ror #16
	orr r10, r10, r12
	and r12, r9, r11
	orr r10, r10, r12, lsr #2
	movw r12, #0xcc33 //r12<- 0x0000cc33
	and r9, r9, r12
	orr r9, r10, r9, ror #24 // permute r9 4 times ---
	bx lr
/*******************************************************************************
* applies P^6 on the tweakey state in a bitsliced manner
*******************************************************************************/
.align 2
p6:
	//applies P^6 to the bitsliced tweakey words r6-r9; clobbers r1, r10-r12
	movw r1, #0x3333 //r1 <- 0x00003333
	movw r12, #0x00cc
	movt r12, #0x3300 //r12<- 0x330000cc
	and r10, r6, r1, ror #8 // --- permute r6 6 times
	and r11, r12, r6, ror #24
	orr r11, r11, r10, ror #6
	and r10, r1, r6, ror #10
	orr r11, r11, r10
	and r10, r6, #0x000000cc
	orr r11, r11, r10, lsl #14
	and r10, r6, #0x00003300
	orr r6, r11, r10, lsl #2 // permute r6 6 times ---
	and r10, r7, r1, ror #8 // --- permute r7 6 times
	and r11, r12, r7, ror #24
	orr r11, r11, r10, ror #6
	and r10, r1, r7, ror #10
	orr r11, r11, r10
	and r10, r7, #0x000000cc
	orr r11, r11, r10, lsl #14
	and r10, r7, #0x00003300
	orr r7, r11, r10, lsl #2 // permute r7 6 times ---
	and r10, r8, r1, ror #8 // --- permute r8 6 times
	and r11, r12, r8, ror #24
	orr r11, r11, r10, ror #6
	and r10, r1, r8, ror #10
	orr r11, r11, r10
	and r10, r8, #0x000000cc
	orr r11, r11, r10, lsl #14
	and r10, r8, #0x00003300
	orr r8, r11, r10, lsl #2 // permute r8 6 times ---
	and r10, r9, r1, ror #8 // --- permute r9 6 times
	and r11, r12, r9, ror #24
	orr r11, r11, r10, ror #6
	and r10, r1, r9, ror #10
	orr r11, r11, r10
	and r10, r9, #0x000000cc
	orr r11, r11, r10, lsl #14
	and r10, r9, #0x00003300
	orr r9, r11, r10, lsl #2 // permute r9 6 times ---
	bx lr
/*******************************************************************************
* applies P^8 on the tweakey state in a bitsliced manner
*******************************************************************************/
.align 2
p8:
	//applies P^8 to the bitsliced tweakey words r6-r9; clobbers r1, r10-r12
	movw r12, #0x3333 //r12<- 0x00003333
	movw r1, #0x0000
	movt r1, #0x33cc //r1 <- 0x33cc0000
	and r10, r6, r1 // --- permute r6 8 times
	and r11, r1, r6, ror #8
	orr r11, r11, r10, ror #24
	and r10, r6, r12, lsl #2 //mask 0x0000cccc
	orr r11, r11, r10, ror #26
	and r10, r6, r12, lsl #8 //mask 0x00333300
	orr r6, r11, r10, lsr #6 // permute r6 8 times ---
	and r10, r7, r1 // --- permute r7 8 times
	and r11, r1, r7, ror #8
	orr r11, r11, r10, ror #24
	and r10, r7, r12, lsl #2
	orr r11, r11, r10, ror #26
	and r10, r7, r12, lsl #8
	orr r7, r11, r10, lsr #6 // permute r7 8 times ---
	and r10, r8, r1 // --- permute r8 8 times
	and r11, r1, r8, ror #8
	orr r11, r11, r10, ror #24
	and r10, r8, r12, lsl #2
	orr r11, r11, r10, ror #26
	and r10, r8, r12, lsl #8
	orr r8, r11, r10, lsr #6 // permute r8 8 times ---
	and r10, r9, r1 // --- permute r9 8 times
	and r11, r1, r9, ror #8
	orr r11, r11, r10, ror #24
	and r10, r9, r12, lsl #2
	orr r11, r11, r10, ror #26
	and r10, r9, r12, lsl #8
	orr r9, r11, r10, lsr #6 // permute r9 8 times ---
	bx lr
/*******************************************************************************
* applies P^10 on the tweakey state in a bitsliced manner
*******************************************************************************/
.align 2
p10:
	//applies P^10 to the bitsliced tweakey words r6-r9; clobbers r1, r10-r12
	movw r12, #0x0033
	movt r12, #0x3300 //r12<- 0x33000033
	movw r1, #0xcc33 //r1 <- 0x0000cc33
	and r10, r6, r1, ror #8 // --- permute r6 10 times
	and r11, r12, r6, ror #26
	orr r11, r11, r10, ror #8
	and r10, r6, r12, ror #24
	orr r11, r11, r10, ror #22
	and r10, r6, #0x00330000
	orr r11, r11, r10, lsr #14
	and r10, r6, #0x0000cc00
	orr r6, r11, r10, lsr #2 // permute r6 10 times ---
	and r10, r7, r1, ror #8 // --- permute r7 10 times
	and r11, r12, r7, ror #26
	orr r11, r11, r10, ror #8
	and r10, r7, r12, ror #24
	orr r11, r11, r10, ror #22
	and r10, r7, #0x00330000
	orr r11, r11, r10, lsr #14
	and r10, r7, #0x0000cc00
	orr r7, r11, r10, lsr #2 // permute r7 10 times ---
	and r10, r8, r1, ror #8 // --- permute r8 10 times
	and r11, r12, r8, ror #26
	orr r11, r11, r10, ror #8
	and r10, r8, r12, ror #24
	orr r11, r11, r10, ror #22
	and r10, r8, #0x00330000
	orr r11, r11, r10, lsr #14
	and r10, r8, #0x0000cc00
	orr r8, r11, r10, lsr #2 // permute r8 10 times ---
	and r10, r9, r1, ror #8 // --- permute r9 10 times
	and r11, r12, r9, ror #26
	orr r11, r11, r10, ror #8
	and r10, r9, r12, ror #24
	orr r11, r11, r10, ror #22
	and r10, r9, #0x00330000
	orr r11, r11, r10, lsr #14
	and r10, r9, #0x0000cc00
	orr r9, r11, r10, lsr #2 // permute r9 10 times ---
	bx lr
/*******************************************************************************
* applies P^12 on the tweakey state in a bitsliced manner
*******************************************************************************/
.align 2
p12:
	//applies P^12 to the bitsliced tweakey words r6-r9; clobbers r1, r10-r12
	//NOTE(review): spills r14 to [sp] without adjusting sp — assumes the
	//caller reserved that slot; confirm at call sites.
	str.w r14, [sp] //store r14 on the stack
	movw r14, #0xcc33 //r14<- 0x0000cc33
	movw r12, #0x00cc
	movt r12, #0x00cc //r12<- 0x00cc00cc
	movw r1, #0x3300
	movt r1, #0xcc00 //r1 <- 0xcc003300
	and r10, r14, r6, ror #8 // --- permute r6 12 times
	and r11, r12, r6, ror #30
	orr r11, r11, r10
	and r10, r1, r6, ror #16
	orr r11, r11, r10
	movw r10, #0xcccc //r10<- 0x0000cccc
	and r10, r6, r10, ror #8
	orr r6, r11, r10, ror #10 // permute r6 12 times ---
	and r10, r14, r7, ror #8 // --- permute r7 12 times
	and r11, r12, r7, ror #30
	orr r11, r11, r10
	and r10, r1, r7, ror #16
	orr r11, r11, r10
	movw r10, #0xcccc //r10<- 0x0000cccc
	and r10, r7, r10, ror #8
	orr r7, r11, r10, ror #10 // permute r7 12 times ---
	and r10, r14, r8, ror #8 // --- permute r8 12 times
	and r11, r12, r8, ror #30
	orr r11, r11, r10
	and r10, r1, r8, ror #16
	orr r11, r11, r10
	movw r10, #0xcccc //r10<- 0x0000cccc
	and r10, r8, r10, ror #8
	orr r8, r11, r10, ror #10 // permute r8 12 times ---
	and r10, r14, r9, ror #8 // --- permute r9 12 times
	and r11, r12, r9, ror #30
	orr r11, r11, r10
	and r10, r1, r9, ror #16
	ldr.w r14, [sp] //restore r14 (r14 no longer needed as a mask)
	orr r11, r11, r10
	movw r10, #0xcccc //r10<- 0x0000cccc
	and r10, r9, r10, ror #8
	orr r9, r11, r10, ror #10 // permute r9 12 times ---
	bx lr
/*******************************************************************************
* applies P^14 on the tweakey state in a bitsliced manner
*******************************************************************************/
.align 2
p14:
	//applies P^14 to the bitsliced tweakey words r6-r9; clobbers r1, r10-r12
	movw r1, #0xcc00
	movt r1, #0x0033 //r1 <- 0x0033cc00
	movw r12, #0xcc00
	movt r12, #0xcc00 //r12<- 0xcc00cc00 (= 0x33003300 rotated for the ror #18 below)
	and r10, r1, r6, ror #24 // --- permute r6 14 times
	and r11, r6, #0x00000033
	orr r11, r10, r11, ror #14
	and r10, r6, #0x33000000
	orr r11, r11, r10, ror #30
	and r10, r6, #0x00ff0000
	orr r11, r11, r10, ror #16
	and r10, r6, r12
	orr r6, r11, r10, ror #18 // permute r6 14 times ---
	and r10, r1, r7, ror #24 // --- permute r7 14 times
	and r11, r7, #0x00000033
	orr r11, r10, r11, ror #14
	and r10, r7, #0x33000000
	orr r11, r11, r10, ror #30
	and r10, r7, #0x00ff0000
	orr r11, r11, r10, ror #16
	and r10, r7, r12
	orr r7, r11, r10, ror #18 // permute r7 14 times ---
	and r10, r1, r8, ror #24 // --- permute r8 14 times
	and r11, r8, #0x00000033
	orr r11, r10, r11, ror #14
	and r10, r8, #0x33000000
	orr r11, r11, r10, ror #30
	and r10, r8, #0x00ff0000
	orr r11, r11, r10, ror #16
	and r10, r8, r12
	orr r8, r11, r10, ror #18 // permute r8 14 times ---
	and r10, r1, r9, ror #24 // --- permute r9 14 times
	and r11, r9, #0x00000033
	orr r11, r10, r11, ror #14
	and r10, r9, #0x33000000
	orr r11, r11, r10, ror #30
	and r10, r9, #0x00ff0000
	orr r11, r11, r10, ror #16
	and r10, r9, r12
	orr r9, r11, r10, ror #18 // permute r9 14 times ---
	bx lr
.align 2
packing:
	//Bitslices the 128-bit block held in r2-r5 (in place).
	//Expects masks r10 = 0x0a0a0a0a and r11 = 0x30303030 pre-loaded by the
	//caller (see tkschedule_lfsr); clobbers r12. Derived masks are obtained
	//by shifting r11.
	eor r12, r2, r2, lsr #3
	and r12, r12, r10
	eor r2, r2, r12
	eor r2, r2, r12, lsl #3 //SWAPMOVE(r2, r2, 0x0a0a0a0a, 3)
	eor r12, r3, r3, lsr #3
	and r12, r12, r10
	eor r3, r3, r12
	eor r3, r3, r12, lsl #3 //SWAPMOVE(r3, r3, 0x0a0a0a0a, 3)
	eor r12, r4, r4, lsr #3
	and r12, r12, r10
	eor r4, r4, r12
	eor r4, r4, r12, lsl #3 //SWAPMOVE(r4, r4, 0x0a0a0a0a, 3)
	eor r12, r5, r5, lsr #3
	and r12, r12, r10
	eor r5, r5, r12
	eor r5, r5, r12, lsl #3 //SWAPMOVE(r5, r5, 0x0a0a0a0a, 3)
	eor r12, r2, r4, lsr #2
	and r12, r12, r11
	eor r2, r2, r12
	eor r4, r4, r12, lsl #2 //SWAPMOVE(r4, r2, 0x30303030, 2)
	eor r12, r2, r3, lsr #4
	and r12, r12, r11, lsr #2 //mask 0x0c0c0c0c
	eor r2, r2, r12
	eor r3, r3, r12, lsl #4 //SWAPMOVE(r3, r2, 0x0c0c0c0c, 4)
	eor r12, r2, r5, lsr #6
	and r12, r12, r11, lsr #4 //mask 0x03030303
	eor r2, r2, r12
	eor r5, r5, r12, lsl #6 //SWAPMOVE(r5, r2, 0x03030303, 6)
	eor r12, r4, r3, lsr #2
	and r12, r12, r11, lsr #2
	eor r4, r4, r12
	eor r3, r3, r12, lsl #2 //SWAPMOVE(r3, r4, 0x0c0c0c0c, 2)
	eor r12, r4, r5, lsr #4
	and r12, r12, r11, lsr #4
	eor r4, r4, r12
	eor r5, r5, r12, lsl #4 //SWAPMOVE(r5, r4, 0x03030303, 4)
	eor r12, r3, r5, lsr #2
	and r12, r12, r11, lsr #4
	eor r3, r3, r12
	eor r5, r5, r12, lsl #2 //SWAPMOVE(r5, r3, 0x03030303, 2)
	bx lr
/******************************************************************************
* Compute LFSR2(TK2) ^ LFSR3(TK3) for all rounds.
* Performing both at the same time allows to save some memory accesses.
* The two tweakeys are first packed into the bitsliced representation; the
* loop then advances LFSR2 (on the packed TK2 slices) and LFSR3 (on the
* packed TK3 slices) and stores the XOR of the two into the tk array.
******************************************************************************/
@ void tkschedule_lfsr(u32* tk, const u8* tk2, const u8* tk3, const int rounds)
.global tkschedule_lfsr
.type tkschedule_lfsr,%function
.align 2
tkschedule_lfsr:
    push {r0-r12, r14}
    ldr.w r3, [r1, #8] //load tk2 (3rd word)
    ldr.w r4, [r1, #4] //load tk2 (2nd word)
    ldr.w r5, [r1, #12] //load tk2 (4th word)
    ldr.w r12, [r1] //load tk2 (1st word)
    mov r1, r2 //move tk3 address in r1
    mov r2, r12 //move 1st tk2 word in r2
    movw r10, #0x0a0a
    movt r10, #0x0a0a //r10<- 0x0a0a0a0a
    movw r11, #0x3030
    movt r11, #0x3030 //r11<- 0x30303030
    bl packing //pack tk2
    mov r6, r2 //move tk2 from r2-r5 to r6-r9
    mov r7, r3 //move tk2 from r2-r5 to r6-r9
    mov r8, r4 //move tk2 from r2-r5 to r6-r9
    mov r9, r5 //move tk2 from r2-r5 to r6-r9
    ldr.w r3, [r1, #8] //load tk3 (3rd word)
    ldr.w r4, [r1, #4] //load tk3 (2nd word)
    ldr.w r5, [r1, #12] //load tk3 (4th word)
    ldr.w r2, [r1] //load tk3 (1st word)
    bl packing //pack tk3
    eor r10, r10, r10, lsl #4 //r10<- 0xaaaaaaaa
    ldr.w r1, [sp, #12] //load loop counter in r1 (the 'rounds' arg r3 saved by push)
    eor r11, r2, r6 //tk2 ^ tk3 (1st word)
    eor r12, r3, r7 //tk2 ^ tk3 (2nd word)
    strd r11, r12, [r0], #8 //store in tk
    eor r11, r4, r8 //tk2 ^ tk3 (3rd word)
    eor r12, r5, r9 //tk2 ^ tk3 (4th word)
    strd r11, r12, [r0], #8 //store in tk
loop: //each iteration produces 8 rounds' worth of LFSR2(TK2)^LFSR3(TK3)
    and r12, r8, r10 // --- apply LFSR2 to tk2
    eor r12, r12, r6
    and r14, r10, r12, lsl #1
    and r12, r12, r10
    orr r6, r14, r12, lsr #1 // apply LFSR2 to tk2 ---
    and r12, r3, r10 // --- apply LFSR3 to tk3
    eor r12, r5, r12, lsr #1
    and r14, r10, r12, lsl #1
    and r12, r12, r10
    orr r5, r14, r12, lsr #1 // apply LFSR3 to tk3 ---
    eor r11, r5, r7 //tk2 ^ tk3 (1st word)
    eor r12, r2, r8 //tk2 ^ tk3 (2nd word)
    strd r11, r12, [r0], #8 //store in tk
    eor r11, r3, r9 //tk2 ^ tk3 (3rd word)
    eor r12, r4, r6 //tk2 ^ tk3 (4th word)
    strd r11, r12, [r0], #24 //store in tk; #24 skips slots interleaved with other round data
    and r12, r9, r10 // --- apply LFSR2 to tk2
    eor r12, r12, r7
    and r14, r10, r12, lsl #1
    and r12, r12, r10
    orr r7, r14, r12, lsr #1 // apply LFSR2 to tk2 ---
    and r12, r2, r10 // --- apply LFSR3 to tk3
    eor r12, r4, r12, lsr #1
    and r14, r10, r12, lsl #1
    and r12, r12, r10
    orr r4, r14, r12, lsr #1 // apply LFSR3 to tk3 ---
    eor r11, r4, r8 //tk2 ^ tk3 (1st word)
    eor r12, r5, r9 //tk2 ^ tk3 (2nd word)
    strd r11, r12, [r0], #8 //store in tk
    eor r11, r2, r6 //tk2 ^ tk3 (3rd word)
    eor r12, r3, r7 //tk2 ^ tk3 (4th word)
    strd r11, r12, [r0], #24 //store in tk
    and r12, r6, r10 // --- apply LFSR2 to tk2
    eor r12, r12, r8
    and r14, r10, r12, lsl #1
    and r12, r12, r10
    orr r8, r14, r12, lsr #1 // apply LFSR2 to tk2 ---
    and r12, r5, r10 // --- apply LFSR3 to tk3
    eor r12, r3, r12, lsr #1
    and r14, r10, r12, lsl #1
    and r12, r12, r10
    orr r3, r14, r12, lsr #1 // apply LFSR3 to tk3 ---
    eor r11, r3, r9 //tk2 ^ tk3 (1st word)
    eor r12, r4, r6 //tk2 ^ tk3 (2nd word)
    strd r11, r12, [r0], #8 //store in tk
    eor r11, r5, r7 //tk2 ^ tk3 (3rd word)
    eor r12, r2, r8 //tk2 ^ tk3 (4th word)
    strd r11, r12, [r0], #24 //store in tk
    and r12, r7, r10 // --- apply LFSR2 to tk2
    eor r12, r12, r9
    and r14, r10, r12, lsl #1
    and r12, r12, r10
    orr r9, r14, r12, lsr #1 // apply LFSR2 to tk2 ---
    and r12, r4, r10 // --- apply LFSR3 to tk3
    eor r12, r2, r12, lsr #1
    and r14, r10, r12, lsl #1
    and r12, r12, r10
    orr r2, r14, r12, lsr #1 // apply LFSR3 to tk3 ---
    eor r11, r2, r6 //tk2 ^ tk3 (1st word)
    eor r12, r3, r7 //tk2 ^ tk3 (2nd word)
    strd r11, r12, [r0], #8 //store in tk
    eor r11, r4, r8 //tk2 ^ tk3 (3rd word)
    eor r12, r5, r9 //tk2 ^ tk3 (4th word)
    strd r11, r12, [r0], #24 //store in tk
    subs.w r1, r1, #8 //decrease loop counter by 8
    bne loop
    pop {r0-r12, r14}
    bx lr
@ void tkschedule_perm(u32* tk)
@ Applies the tweakey permutation P^i (via the p2..p14 subroutines), the
@ fixslicing rotations/masks and the per-round constants (rconsts) to the
@ precomputed tk array, in place, for 40 rounds. The final mvn on the last
@ tweakey word of each round absorbs a NOT from the S-box computation.
@ NOTE(review): rounds are written out of sequential order (e.g. round 5
@ before round 4) because pairs of rounds share one permuted state — the
@ strd offsets (#16/#24) place each half at its final slot.
.global tkschedule_perm
.type tkschedule_perm,%function
.align 2
tkschedule_perm:
    push {r0-r12, lr}
    sub.w sp, #4 //to store r14 in subroutines
    ldm r0, {r6-r9} //load tk
    movw r10, #0xf0f0
    movt r10, #0xf0f0 //r10<- 0xf0f0f0f0
    and r6, r6, r10 //tk &= 0xf0f0f0f0 (1st word)
    and r7, r7, r10 //tk &= 0xf0f0f0f0 (2nd word)
    and r8, r8, r10 //tk &= 0xf0f0f0f0 (3rd word)
    and r9, r9, r10 //tk &= 0xf0f0f0f0 (4th word)
    eor r8, r8, #0x00000004 //add rconst
    eor r9, r9, #0x00000040 //add rconst
    mvn r9, r9 //to remove a NOT in sbox calculations
    strd r8, r9, [r0], #8 //store 1st half tk for 1st round
    strd r6, r7, [r0], #8 //store 2nd half tk for 1st round
    ldm r0, {r6-r9} //load tk
    bl p2 //apply the permutation twice
    movw r10, #0xc3c3
    movt r10, #0xc3c3 //r10<- 0xc3c3c3c3
    and r11, r10, r6, ror #26 //ror and mask to match fixslicing
    and r12, r10, r7, ror #26 //ror and mask to match fixslicing
    strd r11, r12, [r0], #8 //store 1st half tk for 2nd round
    and r11, r10, r8, ror #26 //ror and mask to match fixslicing
    and r12, r10, r9, ror #26 //ror and mask to match fixslicing
    eor r11, r11, #0x10000000 //add rconst
    eor r11, r11, #0x00000100 //add rconst
    eor r12, r12, #0x00000100 //add rconst
    mvn r12, r12 //to save a NOT in sbox calculations
    strd r11, r12, [r0], #8 //store 2nd half tk for 2nd round
    and r10, r10, r10, lsr #6 //r10<- 0x03030303
    and r11, r10, r6, ror #28 //--- ror and masks to match fixslicing
    and r6, r6, r10, lsl #6
    orr r6, r11, r6, ror #12
    and r11, r10, r7, ror #28
    and r7, r7, r10, lsl #6
    orr r7, r11, r7, ror #12
    and r11, r10, r8, ror #28
    and r8, r8, r10, lsl #6
    orr r8, r11, r8, ror #12
    and r11, r10, r9, ror #28
    and r9, r9, r10, lsl #6
    orr r9, r11, r9, ror #12 //ror and masks to match fixslicing ---
    eor r7, r7, #0x04000000 //add rconst
    eor r8, r8, #0x44000000 //add rconst
    eor r9, r9, #0x04000000 //add rconst
    mvn r9, r9 //to save a NOT in sbox calculations
    strd r8, r9, [r0], #8 //store 1st half tk for 3rd round
    strd r6, r7, [r0], #8 //store 2nd half tk for 3rd round
    ldm r0, {r6-r9} //load tk
    bl p4 //apply the permutation 4 times
    movw r10, #0xf0f0
    movt r10, #0xf0f0 //r10<- 0xf0f0f0f0
    and r11, r10, r6, ror #16 //ror and mask to match fixslicing
    and r12, r10, r7, ror #16 //ror and mask to match fixslicing
    eor r11, r11, #0x00400000 //add rconst
    eor r12, r12, #0x00400000 //add rconst
    strd r11, r12, [r0, #24] //store 2nd half tk for 5th round
    and r11, r10, r8, ror #16 //ror and mask to match fixslicing
    and r12, r10, r9, ror #16 //ror and mask to match fixslicing
    eor r11, r11, #0x00440000 //add rconst
    eor r12, r12, #0x00500000 //add rconst
    mvn r12, r12 //to save a NOT in sbox calculations
    strd r11, r12, [r0, #16] //store 1st half tk for 5th round
    and r10, r10, r10, lsr #2 //r10<- 0x30303030
    and r11, r10, r6, ror #14 //--- ror and masks to match fixslicing
    and r6, r6, r10, ror #4
    orr r6, r11, r6, ror #6
    and r11, r10, r7, ror #14
    and r7, r7, r10, ror #4
    orr r7, r11, r7, ror #6
    and r11, r10, r8, ror #14
    and r8, r8, r10, ror #4
    orr r8, r11, r8, ror #6
    and r11, r10, r9, ror #14
    and r9, r9, r10, ror #4
    orr r9, r11, r9, ror #6 //ror and masks to match fixslicing ---
    eor r6, r6, #0x00100000 //add rconst
    eor r7, r7, #0x00100000 //add rconst
    eor r8, r8, #0x00100000 //add rconst
    eor r8, r8, #0x00000001 //add rconst
    eor r9, r9, #0x00100000 //add rconst
    mvn r9, r9 //to save a NOT in sbox calculations
    strd r6, r7, [r0], #8 //store 1st half tk for 4th round
    strd r8, r9, [r0], #24 //store 2nd half tk for 4th round
    ldm r0, {r6-r9} //load tk
    bl p6 //apply the permutation 6 times
    movw r10, #0xc3c3
    movt r10, #0xc3c3 //r10<- 0xc3c3c3c3
    and r11, r10, r6, ror #10 //ror and mask to match fixslicing
    and r12, r10, r7, ror #10 //ror and mask to match fixslicing
    eor r11, r11, #0x01000000 //add rconst
    eor r12, r12, #0x01000000 //add rconst
    strd r11, r12, [r0], #8 //store 1st half tk for 6th round
    and r11, r10, r8, ror #10 //ror and mask to match fixslicing
    and r12, r10, r9, ror #10 //ror and mask to match fixslicing
    eor r11, r11, #0x01400000 //add rconst
    eor r11, r11, #0x00001000 //add rconst
    eor r12, r12, #0x00400000 //add rconst
    mvn r12, r12 //to save a NOT in sbox calculations
    strd r11, r12, [r0], #8 //store 2nd half tk for 6th round
    and r10, r10, r10, lsr #6 //r10<- 0x03030303
    and r11, r10, r6, ror #12 //--- ror and masks to match fixslicing
    and r6, r6, r10, lsl #6
    orr r6, r11, r6, ror #28
    and r11, r10, r7, ror #12
    and r7, r7, r10, lsl #6
    orr r7, r11, r7, ror #28
    and r11, r10, r8, ror #12
    and r8, r8, r10, lsl #6
    orr r8, r11, r8, ror #28
    and r11, r10, r9, ror #12
    and r9, r9, r10, lsl #6
    orr r9, r11, r9, ror #28 //ror and masks to match fixslicing ---
    eor r6, r6, #0x00000400 //add rconst
    eor r7, r7, #0x00000400 //add rconst
    eor r8, r8, #0x01000000 //add rconst
    eor r8, r8, #0x00004000 //add rconst
    eor r9, r9, #0x01000000 //add rconst
    eor r9, r9, #0x00000400 //add rconst
    mvn r9, r9 //to save a NOT in sbox calculations
    strd r8, r9, [r0], #8 //store 1st half tk for 7th round
    strd r6, r7, [r0], #8 //store 2nd half tk for 7th round
    ldm r0, {r6-r9} //load tk
    bl p8 //apply the permutation 8 times
    movw r10, #0xf0f0
    movt r10, #0xf0f0 //r10<- 0xf0f0f0f0
    and r11, r10, r6 //mask to match fixslicing (P^8 needs no ror)
    and r12, r10, r7 //mask to match fixslicing (P^8 needs no ror)
    eor r12, r12, #0x00000040 //add rconst
    strd r11, r12, [r0, #24] //store 2nd half tk for 9th round
    and r11, r10, r8 //mask to match fixslicing
    and r12, r10, r9 //mask to match fixslicing
    eor r11, r11, #0x00000054 //add rconst
    eor r12, r12, #0x00000050 //add rconst
    mvn r12, r12 //to save a NOT in sbox calculations
    strd r11, r12, [r0, #16] //store 1st half tk for 9th round
    and r10, r10, r10, lsr #2 //r10<- 0x30303030
    and r11, r10, r6, ror #30 //--- ror and masks to match fixslicing
    and r6, r6, r10, ror #4
    orr r6, r11, r6, ror #22
    and r11, r10, r7, ror #30
    and r7, r7, r10, ror #4
    orr r7, r11, r7, ror #22
    and r11, r10, r8, ror #30
    and r8, r8, r10, ror #4
    orr r8, r11, r8, ror #22
    and r11, r10, r9, ror #30
    and r9, r9, r10, ror #4
    orr r9, r11, r9, ror #22 //ror and masks to match fixslicing ---
    eor r6 ,r6, #0x00000010 //add rconst
    eor r8, r8, #0x00010000 //add rconst
    eor r8, r8, #0x00000410 //add rconst
    eor r9, r9, #0x00000410 //add rconst
    mvn r9, r9 //to save a NOT in sbox calculations
    strd r6, r7, [r0], #8 //store 1st half tk for 8th round
    strd r8, r9, [r0], #24 //store 2nd half tk for 8th round
    ldm r0, {r6-r9} //load tk
    bl p10 //apply the permutation 10 times
    movw r10, #0xc3c3
    movt r10, #0xc3c3 //r10<- 0xc3c3c3c3
    and r11, r10, r6, ror #26 //ror and mask to match fixslicing
    and r12, r10, r7, ror #26 //ror and mask to match fixslicing
    eor r11, r11, #0x00000100 //add rconst
    eor r12, r12, #0x00000100 //add rconst
    strd r11, r12, [r0], #8 //store 1st half tk for 10th round
    and r11, r10, r8, ror #26 //ror and mask to match fixslicing
    and r12, r10, r9, ror #26 //ror and mask to match fixslicing
    eor r11, r11, #0x10000000 //add rconst
    eor r11, r11, #0x00000140 //add rconst
    eor r12, r12, #0x00000100 //add rconst
    mvn r12, r12 //to save a NOT in sbox calculations
    strd r11, r12, [r0], #8 //store 2nd half tk for 10th round
    and r10, r10, r10, lsr #6 //r10<- 0x03030303
    and r11, r10, r6, ror #28 //--- ror and masks to match fixslicing
    and r6, r6, r10, lsl #6
    orr r6, r11, r6, ror #12
    and r11, r10, r7, ror #28
    and r7, r7, r10, lsl #6
    orr r7, r11, r7, ror #12
    and r11, r10, r8, ror #28
    and r8, r8, r10, lsl #6
    orr r8, r11, r8, ror #12
    and r11, r10, r9, ror #28
    and r9, r9, r10, lsl #6
    orr r9, r11, r9, ror #12 //ror and masks to match fixslicing ---
    eor r6, r6, #0x04000000 //add rconst
    eor r7, r7, #0x04000000 //add rconst
    eor r8, r8, #0x44000000 //add rconst
    eor r9, r9, #0x00000100 //add rconst
    mvn r9, r9 //to save a NOT in sbox calculations
    strd r8, r9, [r0], #8 //store 1st half tk for 11th round
    strd r6, r7, [r0], #8 //store 2nd half tk for 11th round
    ldm r0, {r6-r9} //load tk
    bl p12 //apply the permutation 12 times
    movw r10, #0xf0f0
    movt r10, #0xf0f0 //r10<- 0xf0f0f0f0
    and r11, r10, r6, ror #16 //ror and mask to match fixslicing
    and r12, r10, r7, ror #16 //ror and mask to match fixslicing
    eor r11, r11, #0x00400000 //add rconst
    strd r11, r12, [r0, #24] //store 2nd half tk for 13th round
    and r11, r10, r8, ror #16 //ror and mask to match fixslicing
    and r12, r10, r9, ror #16 //ror and mask to match fixslicing
    eor r11, r11, #0x00140000 //add rconst
    eor r12, r12, #0x00500000 //add rconst
    mvn r12, r12 //to save a NOT in sbox calculations
    strd r11, r12, [r0, #16] //store 1st half tk for 13th round
    and r10, r10, r10, lsr #2 //r10<- 0x30303030
    and r11, r10, r6, ror #14 //--- ror and masks to match fixslicing
    and r6, r6, r10, ror #4
    orr r6, r11, r6, ror #6
    and r11, r10, r7, ror #14
    and r7, r7, r10, ror #4
    orr r7, r11, r7, ror #6
    and r11, r10, r8, ror #14
    and r8, r8, r10, ror #4
    orr r8, r11, r8, ror #6
    and r11, r10, r9, ror #14
    and r9, r9, r10, ror #4
    orr r9, r11, r9, ror #6 //ror and masks to match fixslicing ---
    eor r6, r6, #0x00100000 //add rconst
    eor r7, r7, #0x00100000 //add rconst
    eor r8, r8, #0x04000000 //add rconst
    eor r8, r8, #0x00000001 //add rconst
    eor r9, r9, #0x04000000 //add rconst
    mvn r9, r9 //to save a NOT in sbox calculations
    strd r6, r7, [r0], #8 //store 1st half tk for 12th round
    strd r8, r9, [r0], #24 //store 2nd half tk for 12th round
    ldm r0, {r6-r9} //load tk
    bl p14 //apply the permutation 14 times
    movw r10, #0xc3c3
    movt r10, #0xc3c3 //r10<- 0xc3c3c3c3
    and r11, r10, r6, ror #10 //ror and mask to match fixslicing
    and r12, r10, r7, ror #10 //ror and mask to match fixslicing
    strd r11, r12, [r0], #8 //store 1st half tk for 14th round
    and r11, r10, r8, ror #10 //ror and mask to match fixslicing
    and r12, r10, r9, ror #10 //ror and mask to match fixslicing
    eor r11, r11, #0x01400000 //add rconst
    eor r11, r11, #0x00001000 //add rconst
    eor r12, r12, #0x01400000 //add rconst
    mvn r12, r12 //to save a NOT in sbox calculations
    strd r11, r12, [r0], #8 //store 2nd half tk for 14th round
    and r10, r10, r10, lsr #6 //r10<- 0x03030303
    and r11, r10, r6, ror #12 //--- ror and masks to match fixslicing
    and r6, r6, r10, lsl #6
    orr r6, r11, r6, ror #28
    and r11, r10, r7, ror #12
    and r7, r7, r10, lsl #6
    orr r7, r11, r7, ror #28
    and r11, r10, r8, ror #12
    and r8, r8, r10, lsl #6
    orr r8, r11, r8, ror #28
    and r11, r10, r9, ror #12
    and r9, r9, r10, lsl #6
    orr r9, r11, r9, ror #28 //ror and masks to match fixslicing ---
    eor r7, r7, #0x00000400 //add rconst
    eor r8, r8, #0x01000000 //add rconst
    eor r8, r8, #0x00004400 //add rconst
    eor r9, r9, #0x00000400 //add rconst
    mvn r9, r9 //to save a NOT in sbox calculations
    strd r8, r9, [r0], #8 //store 1st half tk for 15th round
    strd r6, r7, [r0], #8 //store 2nd half tk for 15th round
    ldm r0, {r6-r9} //load tk (P^16 = Id, so no permutation call here)
    movw r10, #0xf0f0
    movt r10, #0xf0f0 //r10<- 0xf0f0f0f0
    and r11, r10, r6 //mask to match fixslicing
    and r12, r10, r7 //mask to match fixslicing
    eor r11, r11, #0x00000040 //add rconst
    eor r12, r12, #0x00000040 //add rconst
    strd r11, r12, [r0, #24] //store 2nd half tk for 17th round
    and r11, r10, r8 //mask to match fixslicing
    and r12, r10, r9 //mask to match fixslicing
    eor r11, r11, #0x00000004 //add rconst
    eor r12, r12, #0x00000050 //add rconst
    mvn r12, r12 //to save a NOT in sbox calculations
    strd r11, r12, [r0, #16] //store 1st half tk for 17th round
    and r10, r10, r10, lsr #2 //r10<- 0x30303030
    and r11, r10, r6, ror #30 //--- ror and masks to match fixslicing
    and r6, r6, r10, ror #4
    orr r6, r11, r6, ror #22
    and r11, r10, r7, ror #30
    and r7, r7, r10, ror #4
    orr r7, r11, r7, ror #22
    and r11, r10, r8, ror #30
    and r8, r8, r10, ror #4
    orr r8, r11, r8, ror #22
    and r11, r10, r9, ror #30
    and r9, r9, r10, ror #4
    orr r9, r11, r9, ror #22 //ror and masks to match fixslicing ---
    eor r6 ,r6, #0x00000010 //add rconst
    eor r7 ,r7, #0x00000010 //add rconst
    eor r8, r8, #0x00000010 //add rconst
    eor r8, r8, #0x00010000 //add rconst
    mvn r9, r9 //to save a NOT in sbox calculations
    strd r6, r7, [r0], #8 //store 1st half tk for 16th round
    strd r8, r9, [r0], #24 //store 2nd half tk for 16th round
    ldm r0, {r6-r9} //load tk
    bl p2 //apply the permutation twice
    movw r10, #0xc3c3
    movt r10, #0xc3c3 //r10<- 0xc3c3c3c3
    and r11, r10, r6, ror #26 //ror and mask to match fixslicing
    and r12, r10, r7, ror #26 //ror and mask to match fixslicing
    eor r11, r11, #0x00000100 //add rconst
    strd r11, r12, [r0], #8 //store 1st half tk for 18th round
    and r11, r10, r8, ror #26 //ror and mask to match fixslicing
    and r12, r10, r9, ror #26 //ror and mask to match fixslicing
    eor r11, r11, #0x10000000 //add rconst
    eor r11, r11, #0x00000140 //add rconst
    eor r12, r12, #0x00000040 //add rconst
    mvn r12, r12 //to save a NOT in sbox calculations
    strd r11, r12, [r0], #8 //store 2nd half tk for 18th round
    and r10, r10, r10, lsr #6 //r10<- 0x03030303
    and r11, r10, r6, ror #28 //--- ror and masks to match fixslicing
    and r6, r6, r10, lsl #6
    orr r6, r11, r6, ror #12
    and r11, r10, r7, ror #28
    and r7, r7, r10, lsl #6
    orr r7, r11, r7, ror #12
    and r11, r10, r8, ror #28
    and r8, r8, r10, lsl #6
    orr r8, r11, r8, ror #12
    and r11, r10, r9, ror #28
    and r9, r9, r10, lsl #6
    orr r9, r11, r9, ror #12 //ror and masks to match fixslicing ---
    eor r7, r7, #0x04000000 //add rconst
    eor r8, r8, #0x40000000 //add rconst
    eor r8, r8, #0x00000100 //add rconst
    eor r9, r9, #0x04000000 //add rconst
    eor r9, r9, #0x00000100 //add rconst
    mvn r9, r9 //to save a NOT in sbox calculations
    strd r8, r9, [r0], #8 //store 1st half tk for 19th round
    strd r6, r7, [r0], #8 //store 2nd half tk for 19th round
    ldm r0, {r6-r9} //load tk
    bl p4 //apply the permutation 4 times
    movw r10, #0xf0f0
    movt r10, #0xf0f0 //r10<- 0xf0f0f0f0
    and r11, r10, r6, ror #16 //ror and mask to match fixslicing
    and r12, r10, r7, ror #16 //ror and mask to match fixslicing
    eor r12, r12, #0x00400000 //add rconst
    strd r11, r12, [r0, #24] //store 2nd half tk for 21st round
    and r11, r10, r8, ror #16 //ror and mask to match fixslicing
    and r12, r10, r9, ror #16 //ror and mask to match fixslicing
    eor r11, r11, #0x00440000 //add rconst
    eor r12, r12, #0x00100000 //add rconst
    mvn r12, r12 //to save a NOT in sbox calculations
    strd r11, r12, [r0, #16] //store 1st half tk for 21st round
    and r10, r10, r10, lsr #2 //r10<- 0x30303030
    and r11, r10, r6, ror #14 //--- ror and masks to match fixslicing
    and r6, r6, r10, ror #4
    orr r6, r11, r6, ror #6
    and r11, r10, r7, ror #14
    and r7, r7, r10, ror #4
    orr r7, r11, r7, ror #6
    and r11, r10, r8, ror #14
    and r8, r8, r10, ror #4
    orr r8, r11, r8, ror #6
    and r11, r10, r9, ror #14
    and r9, r9, r10, ror #4
    orr r9, r11, r9, ror #6 //ror and masks to match fixslicing ---
    eor r6, r6, #0x00100000 //add rconst
    eor r8, r8, #0x04100000 //add rconst
    eor r8, r8, #0x00000001 //add rconst
    eor r9, r9, #0x00100000 //add rconst
    mvn r9, r9 //to save a NOT in sbox calculations
    strd r6, r7, [r0], #8 //store 1st half tk for 20th round
    strd r8, r9, [r0], #24 //store 2nd half tk for 20th round
    ldm r0, {r6-r9} //load tk
    bl p6 //apply the permutation 6 times
    movw r10, #0xc3c3
    movt r10, #0xc3c3 //r10<- 0xc3c3c3c3
    and r11, r10, r6, ror #10 //ror and mask to match fixslicing
    and r12, r10, r7, ror #10 //ror and mask to match fixslicing
    eor r11, r11, #0x01000000 //add rconst
    eor r12, r12, #0x01000000 //add rconst
    strd r11, r12, [r0], #8 //store 1st half tk for 22nd round
    and r11, r10, r8, ror #10 //ror and mask to match fixslicing
    and r12, r10, r9, ror #10 //ror and mask to match fixslicing
    eor r11, r11, #0x00400000 //add rconst
    eor r11, r11, #0x00001000 //add rconst
    mvn r12, r12 //to save a NOT in sbox calculations
    strd r11, r12, [r0], #8 //store 2nd half tk for 22nd round
    and r10, r10, r10, lsr #6 //r10<- 0x03030303
    and r11, r10, r6, ror #12 //--- ror and masks to match fixslicing
    and r6, r6, r10, lsl #6
    orr r6, r11, r6, ror #28
    and r11, r10, r7, ror #12
    and r7, r7, r10, lsl #6
    orr r7, r11, r7, ror #28
    and r11, r10, r8, ror #12
    and r8, r8, r10, lsl #6
    orr r8, r11, r8, ror #28
    and r11, r10, r9, ror #12
    and r9, r9, r10, lsl #6
    orr r9, r11, r9, ror #28 //ror and masks to match fixslicing ---
    eor r6, r6, #0x00000400 //add rconst
    eor r8, r8, #0x00004000 //add rconst
    eor r9, r9, #0x01000000 //add rconst
    mvn r9, r9 //to save a NOT in sbox calculations
    strd r8, r9, [r0], #8 //store 1st half tk for 23rd round
    strd r6, r7, [r0], #8 //store 2nd half tk for 23rd round
    ldm r0, {r6-r9} //load tk
    bl p8 //apply the permutation 8 times
    movw r10, #0xf0f0
    movt r10, #0xf0f0 //r10<- 0xf0f0f0f0
    and r11, r10, r6 //mask to match fixslicing
    and r12, r10, r7 //mask to match fixslicing
    strd r11, r12, [r0, #24] //store 2nd half tk for 25th round
    and r11, r10, r8 //mask to match fixslicing
    and r12, r10, r9 //mask to match fixslicing
    eor r11, r11, #0x00000014 //add rconst
    eor r12, r12, #0x00000040 //add rconst
    mvn r12, r12 //to save a NOT in sbox calculations
    strd r11, r12, [r0, #16] //store 1st half tk for 25th round
    and r10, r10, r10, lsr #2 //r10<- 0x30303030
    and r11, r10, r6, ror #30 //--- ror and masks to match fixslicing
    and r6, r6, r10, ror #4
    orr r6, r11, r6, ror #22
    and r11, r10, r7, ror #30
    and r7, r7, r10, ror #4
    orr r7, r11, r7, ror #22
    and r11, r10, r8, ror #30
    and r8, r8, r10, ror #4
    orr r8, r11, r8, ror #22
    and r11, r10, r9, ror #30
    and r9, r9, r10, ror #4
    orr r9, r11, r9, ror #22 //ror and masks to match fixslicing ---
    eor r8, r8, #0x00010400 //add rconst
    eor r9, r9, #0x00000400 //add rconst
    mvn r9, r9 //to save a NOT in sbox calculations
    strd r6, r7, [r0], #8 //store 1st half tk for 24th round
    strd r8, r9, [r0], #24 //store 2nd half tk for 24th round
    ldm r0, {r6-r9} //load tk
    bl p10 //apply the permutation 10 times
    movw r10, #0xc3c3
    movt r10, #0xc3c3 //r10<- 0xc3c3c3c3
    and r11, r10, r6, ror #26 //ror and mask to match fixslicing
    and r12, r10, r7, ror #26 //ror and mask to match fixslicing
    strd r11, r12, [r0], #8 //store 1st half tk for 26th round
    and r11, r10, r8, ror #26 //ror and mask to match fixslicing
    and r12, r10, r9, ror #26 //ror and mask to match fixslicing
    eor r11, r11, #0x10000000 //add rconst
    eor r11, r11, #0x00000100 //add rconst
    mvn r12, r12 //to save a NOT in sbox calculations
    strd r11, r12, [r0], #8 //store 2nd half tk for 26th round
    and r10, r10, r10, lsr #6 //r10<- 0x03030303
    and r11, r10, r6, ror #28 //--- ror and masks to match fixslicing
    and r6, r6, r10, lsl #6
    orr r6, r11, r6, ror #12
    and r11, r10, r7, ror #28
    and r7, r7, r10, lsl #6
    orr r7, r11, r7, ror #12
    and r11, r10, r8, ror #28
    and r8, r8, r10, lsl #6
    orr r8, r11, r8, ror #12
    and r11, r10, r9, ror #28
    and r9, r9, r10, lsl #6
    orr r9, r11, r9, ror #12 //ror and masks to match fixslicing ---
    eor r7, r7, #0x04000000 //add rconst
    eor r8, r8, #0x40000000 //add rconst
    eor r9, r9, #0x04000000 //add rconst
    mvn r9, r9 //to save a NOT in sbox calculations
    strd r8, r9, [r0], #8 //store 1st half tk for 27th round
    strd r6, r7, [r0], #8 //store 2nd half tk for 27th round
    ldm r0, {r6-r9} //load tk
    bl p12 //apply the permutation 12 times
    movw r10, #0xf0f0
    movt r10, #0xf0f0 //r10<- 0xf0f0f0f0
    and r11, r10, r6, ror #16 //ror and mask to match fixslicing
    and r12, r10, r7, ror #16 //ror and mask to match fixslicing
    eor r12, r12, #0x00400000 //add rconst
    strd r11, r12, [r0, #24] //store 2nd half tk for 29th round
    and r11, r10, r8, ror #16 //ror and mask to match fixslicing
    and r12, r10, r9, ror #16 //ror and mask to match fixslicing
    eor r11, r11, #0x00440000 //add rconst
    eor r12, r12, #0x00500000 //add rconst
    mvn r12, r12 //to save a NOT in sbox calculations
    strd r11, r12, [r0, #16] //store 1st half tk for 29th round
    and r10, r10, r10, lsr #2 //r10<- 0x30303030
    and r11, r10, r6, ror #14 //--- ror and masks to match fixslicing
    and r6, r6, r10, ror #4
    orr r6, r11, r6, ror #6
    and r11, r10, r7, ror #14
    and r7, r7, r10, ror #4
    orr r7, r11, r7, ror #6
    and r11, r10, r8, ror #14
    and r8, r8, r10, ror #4
    orr r8, r11, r8, ror #6
    and r11, r10, r9, ror #14
    and r9, r9, r10, ror #4
    orr r9, r11, r9, ror #6 //ror and masks to match fixslicing ---
    eor r6, r6, #0x00100000 //add rconst
    eor r8, r8, #0x00100000 //add rconst
    eor r8, r8, #0x00000001 //add rconst
    eor r9, r9, #0x00100000 //add rconst
    mvn r9, r9 //to save a NOT in sbox calculations
    strd r6, r7, [r0], #8 //store 1st half tk for 28th round
    strd r8, r9, [r0], #24 //store 2nd half tk for 28th round
    ldm r0, {r6-r9} //load tk
    bl p14 //apply the permutation 14 times
    movw r10, #0xc3c3
    movt r10, #0xc3c3 //r10<- 0xc3c3c3c3
    and r11, r10, r6, ror #10 //ror and mask to match fixslicing
    and r12, r10, r7, ror #10 //ror and mask to match fixslicing
    eor r11, r11, #0x01000000 //add rconst
    eor r12, r12, #0x01000000 //add rconst
    strd r11, r12, [r0], #8 //store 1st half tk for 30th round
    and r11, r10, r8, ror #10 //ror and mask to match fixslicing
    and r12, r10, r9, ror #10 //ror and mask to match fixslicing
    eor r11, r11, #0x01400000 //add rconst
    eor r11, r11, #0x00001000 //add rconst
    mvn r12, r12 //to save a NOT in sbox calculations
    strd r11, r12, [r0], #8 //store 2nd half tk for 30th round
    and r10, r10, r10, lsr #6 //r10<- 0x03030303
    and r11, r10, r6, ror #12 //--- ror and masks to match fixslicing
    and r6, r6, r10, lsl #6
    orr r6, r11, r6, ror #28
    and r11, r10, r7, ror #12
    and r7, r7, r10, lsl #6
    orr r7, r11, r7, ror #28
    and r11, r10, r8, ror #12
    and r8, r8, r10, lsl #6
    orr r8, r11, r8, ror #28
    and r11, r10, r9, ror #12
    and r9, r9, r10, lsl #6
    orr r9, r11, r9, ror #28 //ror and masks to match fixslicing ---
    eor r6, r6, #0x00000400 //add rconst
    eor r7, r7, #0x00000400 //add rconst
    eor r8, r8, #0x00004000 //add rconst
    eor r9, r9, #0x01000000 //add rconst
    mvn r9, r9 //to save a NOT in sbox calculations
    strd r8, r9, [r0], #8 //store 1st half tk for 31st round
    strd r6, r7, [r0], #8 //store 2nd half tk for 31st round
    ldm r0, {r6-r9} //load tk (P^32 = Id, so no permutation call here)
    movw r10, #0xf0f0
    movt r10, #0xf0f0 //r10<- 0xf0f0f0f0
    and r11, r10, r6 //mask to match fixslicing
    and r12, r10, r7 //mask to match fixslicing
    strd r11, r12, [r0, #24] //store 2nd half tk for 33rd round
    and r11, r10, r8 //mask to match fixslicing
    and r12, r10, r9 //mask to match fixslicing
    eor r11, r11, #0x00000014 //add rconst
    eor r12, r12, #0x00000050 //add rconst
    mvn r12, r12 //to save a NOT in sbox calculations
    strd r11, r12, [r0, #16] //store 1st half tk for 33rd round
    and r10, r10, r10, lsr #2 //r10<- 0x30303030
    and r11, r10, r6, ror #30 //--- ror and masks to match fixslicing
    and r6, r6, r10, ror #4
    orr r6, r11, r6, ror #22
    and r11, r10, r7, ror #30
    and r7, r7, r10, ror #4
    orr r7, r11, r7, ror #22
    and r11, r10, r8, ror #30
    and r8, r8, r10, ror #4
    orr r8, r11, r8, ror #22
    and r11, r10, r9, ror #30
    and r9, r9, r10, ror #4
    orr r9, r11, r9, ror #22 //ror and masks to match fixslicing ---
    eor r6 ,r6, #0x00000010 //add rconst
    eor r8, r8, #0x00010400 //add rconst
    eor r9, r9, #0x00000400 //add rconst
    mvn r9, r9 //to save a NOT in sbox calculations
    strd r6, r7, [r0], #8 //store 1st half tk for 32nd round
    strd r8, r9, [r0], #24 //store 2nd half tk for 32nd round
    ldm r0, {r6-r9} //load tk
    bl p2 //apply the permutation twice
    movw r10, #0xc3c3
    movt r10, #0xc3c3 //r10<- 0xc3c3c3c3
    and r11, r10, r6, ror #26 //ror and mask to match fixslicing
    and r12, r10, r7, ror #26 //ror and mask to match fixslicing
    strd r11, r12, [r0], #8 //store 1st half tk for 34th round
    and r11, r10, r8, ror #26 //ror and mask to match fixslicing
    and r12, r10, r9, ror #26 //ror and mask to match fixslicing
    eor r11, r11, #0x10000000 //add rconst
    eor r11, r11, #0x00000140 //add rconst
    eor r12, r12, #0x00000100 //add rconst
    mvn r12, r12 //to save a NOT in sbox calculations
    strd r11, r12, [r0], #8 //store 2nd half tk for 34th round
    and r10, r10, r10, lsr #6 //r10<- 0x03030303
    and r11, r10, r6, ror #28 //--- ror and masks to match fixslicing
    and r6, r6, r10, lsl #6
    orr r6, r11, r6, ror #12
    and r11, r10, r7, ror #28
    and r7, r7, r10, lsl #6
    orr r7, r11, r7, ror #12
    and r11, r10, r8, ror #28
    and r8, r8, r10, lsl #6
    orr r8, r11, r8, ror #12
    and r11, r10, r9, ror #28
    and r9, r9, r10, lsl #6
    orr r9, r11, r9, ror #12 //ror and masks to match fixslicing ---
    eor r7, r7, #0x04000000 //add rconst
    eor r8, r8, #0x44000000 //add rconst
    mvn r9, r9 //to save a NOT in sbox calculations
    strd r8, r9, [r0], #8 //store 1st half tk for 35th round
    strd r6, r7, [r0], #8 //store 2nd half tk for 35th round
    ldm r0, {r6-r9} //load tk
    bl p4 //apply the permutation 4 times
    movw r10, #0xf0f0
    movt r10, #0xf0f0 //r10<- 0xf0f0f0f0
    and r11, r10, r6, ror #16 //ror and mask to match fixslicing
    and r12, r10, r7, ror #16 //ror and mask to match fixslicing
    eor r11, r11, #0x00400000 //add rconst
    strd r11, r12, [r0, #24] //store 2nd half tk for 37th round
    and r11, r10, r8, ror #16 //ror and mask to match fixslicing
    and r12, r10, r9, ror #16 //ror and mask to match fixslicing
    eor r11, r11, #0x00440000 //add rconst
    eor r12, r12, #0x00500000 //add rconst
    mvn r12, r12 //to save a NOT in sbox calculations
    strd r11, r12, [r0, #16] //store 1st half tk for 37th round
    and r10, r10, r10, lsr #2 //r10<- 0x30303030
    and r11, r10, r6, ror #14 //--- ror and masks to match fixslicing
    and r6, r6, r10, ror #4
    orr r6, r11, r6, ror #6
    and r11, r10, r7, ror #14
    and r7, r7, r10, ror #4
    orr r7, r11, r7, ror #6
    and r11, r10, r8, ror #14
    and r8, r8, r10, ror #4
    orr r8, r11, r8, ror #6
    and r11, r10, r9, ror #14
    and r9, r9, r10, ror #4
    orr r9, r11, r9, ror #6 //ror and masks to match fixslicing ---
    eor r6, r6, #0x00100000 //add rconst
    eor r7, r7, #0x00100000 //add rconst
    eor r8, r8, #0x00000001 //add rconst
    eor r9, r9, #0x00100000 //add rconst
    mvn r9, r9 //to save a NOT in sbox calculations
    strd r6, r7, [r0], #8 //store 1st half tk for 36th round
    strd r8, r9, [r0], #24 //store 2nd half tk for 36th round
    ldm r0, {r6-r9} //load tk
    bl p6 //apply the permutation 6 times
    movw r10, #0xc3c3
    movt r10, #0xc3c3 //r10<- 0xc3c3c3c3
    and r11, r10, r6, ror #10 //ror and mask to match fixslicing
    and r12, r10, r7, ror #10 //ror and mask to match fixslicing
    eor r12, r12, #0x01000000 //add rconst
    strd r11, r12, [r0], #8 //store 1st half tk for 38th round
    and r11, r10, r8, ror #10 //ror and mask to match fixslicing
    and r12, r10, r9, ror #10 //ror and mask to match fixslicing
    eor r11, r11, #0x01400000 //add rconst
    eor r11, r11, #0x00001000 //add rconst
    eor r12, r12, #0x00400000 //add rconst
    mvn r12, r12 //to save a NOT in sbox calculations
    strd r11, r12, [r0], #8 //store 2nd half tk for 38th round
    and r10, r10, r10, lsr #6 //r10<- 0x03030303
    and r11, r10, r6, ror #12 //--- ror and masks to match fixslicing
    and r6, r6, r10, lsl #6
    orr r6, r11, r6, ror #28
    and r11, r10, r7, ror #12
    and r7, r7, r10, lsl #6
    orr r7, r11, r7, ror #28
    and r11, r10, r8, ror #12
    and r8, r8, r10, lsl #6
    orr r8, r11, r8, ror #28
    and r11, r10, r9, ror #12
    and r9, r9, r10, lsl #6
    orr r9, r11, r9, ror #28 //ror and masks to match fixslicing ---
    eor r6, r6, #0x00000400 //add rconst
    eor r7, r7, #0x00000400 //add rconst
    eor r8, r8, #0x01000000 //add rconst
    eor r8, r8, #0x00004000 //add rconst
    eor r9, r9, #0x00000400 //add rconst
    mvn r9, r9 //to save a NOT in sbox calculations
    strd r8, r9, [r0], #8 //store 1st half tk for 39th round
    strd r6, r7, [r0], #8 //store 2nd half tk for 39th round
    ldm r0, {r6-r9} //load tk
    bl p8 //apply the permutation 8 times
    movw r10, #0x3030
    movt r10, #0x3030 //r10<- 0x30303030
    and r11, r10, r6, ror #30 //--- ror and masks to match fixslicing
    and r6, r6, r10, ror #4
    orr r6, r11, r6, ror #22
    and r11, r10, r7, ror #30
    and r7, r7, r10, ror #4
    orr r7, r11, r7, ror #22
    and r11, r10, r8, ror #30
    and r8, r8, r10, ror #4
    orr r8, r11, r8, ror #22
    and r11, r10, r9, ror #30
    and r9, r9, r10, ror #4
    orr r9, r11, r9, ror #22 //ror and masks to match fixslicing ---
    eor r6, r6, #0x00000010 //add rconst
    eor r8, r8, #0x00010000 //add rconst
    eor r8, r8, #0x00000010 //add rconst
    eor r9, r9, #0x00000400 //add rconst
    mvn r9, r9 //to save a NOT in sbox calculations
    strd r6, r7, [r0], #8 //store 1st half tk for 40th round
    strd r8, r9, [r0] //store 2nd half tk for 40th round (no writeback: last store)
    add.w sp, #4
    pop {r0-r12, lr}
    bx lr
/******************************************************************************
* Applies the permutations P^2, ..., P^14 for rounds 0 to 16. Since P^16=Id, we
* don't need more calculations as no LFSR is applied to TK1.
*
* Computes the fixsliced round tweakeys derived from TK1 only: packs the
* 16-byte key, then for each pair of rounds applies the SKINNY tweakey
* permutation twice (bl p2), masks the relevant nibbles (0xf0f0f0f0 for even
* stages, 0x03030303-based rotations for odd stages) and stores 16 bytes of
* round tweakey material per stage into the output buffer (r0).
* Registers: r0 = u32* tk (output), r1 = const u8* key (input).
******************************************************************************/
@ void tkschedule_perm_tk1(u32* tk, const u8* key)
.global tkschedule_perm_tk1
.type tkschedule_perm_tk1,%function
.align 2
tkschedule_perm_tk1:
push {r0-r12, lr}
ldr.w r3, [r1, #8] //load tk1 (3rd word)
ldr.w r4, [r1, #4] //load tk1 (2nd word)
ldr.w r5, [r1, #12] //load tk1 (4th word)
ldr.w r2, [r1] //load tk1 (1st word)
movw r10, #0x0a0a
movt r10, #0x0a0a //r10<- 0x0a0a0a0a (packing mask; original comment said r6)
movw r11, #0x3030
movt r11, #0x3030 //r11<- 0x30303030 (packing mask; original comment said r7)
bl packing //pack tk1
mov r6, r2 //move tk1 from r2-r5 to r6-r9
mov r7, r3 //move tk1 from r2-r5 to r6-r9
mov r8, r4 //move tk1 from r2-r5 to r6-r9
mov r9, r5 //move tk1 from r2-r5 to r6-r9
movw r2, #0xf0f0
movt r2, #0xf0f0 //r2<- 0xf0f0f0f0
and r11, r8, r2 //tk &= 0xf0f0f0f0 (3rd word)
and r12, r9, r2 //tk &= 0xf0f0f0f0 (4th word)
strd r11, r12, [r0], #8 //store 1st half tk for 1st round
and r11, r6, r2 //tk &= 0xf0f0f0f0 (1st word)
and r12, r7, r2 //tk &= 0xf0f0f0f0 (2nd word)
strd r11, r12, [r0], #8 //store 2nd half tk for 1st round
bl p2 //apply the permutation twice
movw r3, #0x0303
movt r3, #0x0303 //r3<- 0x03030303
and r11, r3, r6, ror #28 //--- ror and masks to match fixslicing
and r12, r6, r3, lsl #6
orr r12, r11, r12, ror #12
str.w r12, [r0, #8]
and r11, r3, r7, ror #28
and r12, r7, r3, lsl #6
orr r12, r11, r12, ror #12
str.w r12, [r0, #12]
and r11, r3, r9, ror #28
and r12, r9, r3, lsl #6
orr r12, r11, r12, ror #12
str.w r12, [r0, #4]
and r11, r3, r8, ror #28
and r12, r8, r3, lsl #6
orr r12, r11, r12, ror #12
str.w r12, [r0], #16 //ror and masks to match fixslicing --- (tk for 3rd round)
bl p2 //apply the permutation 4 times
and r11, r2, r6, ror #16 //ror and mask to match fixslicing
and r12, r2, r7, ror #16 //ror and mask to match fixslicing
strd r11, r12, [r0, #8] //store 2nd half tk for 5th round
and r11, r2, r8, ror #16 //ror and mask to match fixslicing
and r12, r2, r9, ror #16 //ror and mask to match fixslicing
strd r11, r12, [r0], #16 //store 1st half tk for 5th round
bl p2 //apply the permutation 6 times
and r11, r3, r6, ror #12 //--- ror and masks to match fixslicing
and r12, r6, r3, lsl #6
orr r12, r11, r12, ror #28
str.w r12, [r0, #8]
and r11, r3, r7, ror #12
and r12, r7, r3, lsl #6
orr r12, r11, r12, ror #28
str.w r12, [r0, #12]
and r11, r3, r9, ror #12
and r12, r9, r3, lsl #6
orr r12, r11, r12, ror #28
str.w r12, [r0, #4]
and r11, r3, r8, ror #12
and r12, r8, r3, lsl #6
orr r12, r11, r12, ror #28
str.w r12, [r0], #16 //ror and masks to match fixslicing --- (tk for 7th round)
bl p2 //apply the permutation 8 times
and r11, r2, r6 //ror and mask to match fixslicing
and r12, r2, r7 //ror and mask to match fixslicing
strd r11, r12, [r0, #8] //store 2nd half tk for 9th round
and r11, r2, r8 //ror and mask to match fixslicing
and r12, r2, r9 //ror and mask to match fixslicing
strd r11, r12, [r0], #16 //store 1st half tk for 9th round
bl p2 //apply the permutation 10 times
and r11, r3, r6, ror #28 //--- ror and masks to match fixslicing
and r12, r6, r3, lsl #6
orr r12, r11, r12, ror #12
str.w r12, [r0, #8]
and r11, r3, r7, ror #28
and r12, r7, r3, lsl #6
orr r12, r11, r12, ror #12
str.w r12, [r0, #12]
and r11, r3, r9, ror #28
and r12, r9, r3, lsl #6
orr r12, r11, r12, ror #12
str.w r12, [r0, #4]
and r11, r3, r8, ror #28
and r12, r8, r3, lsl #6
orr r12, r11, r12, ror #12
str.w r12, [r0], #16 //ror and masks to match fixslicing --- (tk for 11th round)
bl p2 //apply the permutation 12 times
and r11, r2, r6, ror #16 //ror and mask to match fixslicing
and r12, r2, r7, ror #16 //ror and mask to match fixslicing
strd r11, r12, [r0, #8] //store 2nd half tk for 13th round (comment said 5th)
and r11, r2, r8, ror #16 //ror and mask to match fixslicing
and r12, r2, r9, ror #16 //ror and mask to match fixslicing
strd r11, r12, [r0], #16 //store 1st half tk for 13th round (comment said 5th)
bl p2 //apply the permutation 14 times
and r11, r3, r6, ror #12 //--- ror and masks to match fixslicing
and r12, r6, r3, lsl #6
orr r12, r11, r12, ror #28
str.w r12, [r0, #8]
and r11, r3, r7, ror #12
and r12, r7, r3, lsl #6
orr r12, r11, r12, ror #28
str.w r12, [r0, #12]
and r11, r3, r9, ror #12
and r12, r9, r3, lsl #6
orr r12, r11, r12, ror #28
str.w r12, [r0, #4]
and r11, r3, r8, ror #12
and r12, r8, r3, lsl #6
orr r12, r11, r12, ror #28
str.w r12, [r0] //ror and masks to match fixslicing --- (tk for 15th round)
pop {r0-r12, lr}
bx lr
/******************************************************************************
* quadruple_round: applies 4 consecutive fixsliced SKINNY-128 rounds to the
* 128-bit bitsliced state held in r2-r5.
*   r0 = pointer to rtk1 (TK1 round tweakeys), advanced by ldmia
*   r1 = pointer to rtk2_3 (TK2/TK3 round tweakeys + rconsts), advanced
*   r6 = 0x55555555 (SWAPMOVE mask), r7 = 0x30303030 (MixColumns mask)
*   r8-r11 are clobbered as scratch.
* The Sbox is computed as interleaved NOR/XOR + SWAPMOVE steps; rtk1 is XORed
* only on rounds 0 and 2 of the four (TK1 contributes every round in the spec,
* but the stored rtk1 halves cover two rounds each).
******************************************************************************/
.align 2
quadruple_round:
orr r8, r2, r3 // --- sbox layer, round 0 ---
eor r5, r5, r8
mvn r5, r5
eor r8, r3, r4, lsr #1
and r8, r8, r6
eor r3, r3, r8
eor r4, r4, r8, lsl #1 //SWAPMOVE(r4, r3, 0x55555555, 1);
eor r8, r4, r5, lsr #1
and r8, r8, r6
eor r4, r4, r8
eor r5, r5, r8, lsl #1 //SWAPMOVE(r5, r4, 0x55555555, 1);
orr r8, r4, r5
eor r3, r3, r8
mvn r3, r3
eor r8, r2, r3, lsr #1
and r8, r8, r6
eor r2, r2, r8
eor r3, r3, r8, lsl #1 //SWAPMOVE(r3, r2, 0x55555555, 1);
eor r8, r5, r2, lsr #1
and r8, r8, r6
eor r5, r5, r8
eor r2, r2, r8, lsl #1 //SWAPMOVE(r2, r5, 0x55555555, 1);
orr r8, r2, r3
eor r5, r5, r8
mvn r5, r5
eor r8, r3, r4, lsr #1
and r8, r8, r6
eor r3, r3, r8
eor r4, r4, r8, lsl #1 //SWAPMOVE(r4, r3, 0x55555555, 1);
eor r8, r4, r5, lsr #1
and r8, r8, r6
eor r4, r4, r8
eor r5, r5, r8, lsl #1 //SWAPMOVE(r5, r4, 0x55555555, 1);
orr r8, r4, r5
eor r3, r3, r8
eor r8, r2, r5
and r8, r8, r6
eor r2, r2, r8
eor r5, r5, r8 //SWAPMOVE(r5, r2, 0x55555555, 0);
ldmia.w r1!, {r8-r11} //load rkeys in r8,...,r11
eor r2, r2, r8 //add rtk_2_3 + rconst
eor r3, r3, r9 //add rtk_2_3 + rconst
eor r4, r4, r10 //add rtk_2_3 + rconst
eor r5, r5, r11 //add rtk_2_3 + rconst
ldmia.w r0!,{r8-r11}
eor r2, r2, r8 //add rtk_1
eor r3, r3, r9 //add rtk_1
eor r4, r4, r10 //add rtk_1
eor r5, r5, r11 //add rtk_1
and r8, r7, r2, ror #30 // --- mixcolumns 0 ---
eor r2, r2, r8, ror #24
and r8, r7, r2, ror #18
eor r2, r2, r8, ror #2
and r8, r7, r2, ror #6
eor r2, r2, r8, ror #4
and r8, r7, r3, ror #30
eor r3, r3, r8, ror #24
and r8, r7, r3, ror #18
eor r3, r3, r8, ror #2
and r8, r7, r3, ror #6
eor r3, r3, r8, ror #4
and r8, r7, r4, ror #30
eor r4, r4, r8, ror #24
and r8, r7, r4, ror #18
eor r4, r4, r8, ror #2
and r8, r7, r4, ror #6
eor r4, r4, r8, ror #4
and r8, r7, r5, ror #30
eor r5, r5, r8, ror #24
and r8, r7, r5, ror #18
eor r5, r5, r8, ror #2
and r8, r7, r5, ror #6
eor r5, r5, r8, ror #4
orr r8, r4, r5 // --- sbox layer, round 1 ---
eor r3, r3, r8
mvn r3, r3
eor r8, r2, r3, lsr #1
and r8, r8, r6
eor r2, r2, r8
eor r3, r3, r8, lsl #1 //SWAPMOVE(r3, r2, 0x55555555, 1);
eor r8, r5, r2, lsr #1
and r8, r8, r6
eor r5, r5, r8
eor r2, r2, r8, lsl #1 //SWAPMOVE(r2, r5, 0x55555555, 1);
orr r8, r2, r3
eor r5, r5, r8
mvn r5, r5
eor r8, r3, r4, lsr #1
and r8, r8, r6
eor r3, r3, r8
eor r4, r4, r8, lsl #1 //SWAPMOVE(r4, r3, 0x55555555, 1);
eor r8, r4, r5, lsr #1
and r8, r8, r6
eor r4, r4, r8
eor r5, r5, r8, lsl #1 //SWAPMOVE(r5, r4, 0x55555555, 1);
orr r8, r4, r5
eor r3, r3, r8
mvn r3, r3
eor r8, r2, r3, lsr #1
and r8, r8, r6
eor r2, r2, r8
eor r3, r3, r8, lsl #1 //SWAPMOVE(r3, r2, 0x55555555, 1);
eor r8, r5, r2, lsr #1
and r8, r8, r6
eor r5, r5, r8
eor r2, r2, r8, lsl #1 //SWAPMOVE(r2, r5, 0x55555555, 1);
orr r8, r2, r3
eor r5, r5, r8
eor r8, r3, r4
and r8, r8, r6
eor r3, r3, r8
eor r4, r4, r8 //SWAPMOVE(r4, r3, 0x55555555, 0);
ldmia.w r1!, {r8-r11} //load rkeys in r8,...,r11
eor r2, r2, r8 //add rkey + rconst
eor r3, r3, r9 //add rkey + rconst
eor r4, r4, r10 //add rkey + rconst
eor r5, r5, r11 //add rkey + rconst
and r8, r7, r2, ror #16 // --- mixcolumns 1 ---
eor r2, r2, r8, ror #30
and r8, r7, r2, ror #28
eor r2, r2, r8
and r8, r7, r2, ror #16
eor r2, r2, r8, ror #2
and r8, r7, r3, ror #16
eor r3, r3, r8, ror #30
and r8, r7, r3, ror #28
eor r3, r3, r8
and r8, r7, r3, ror #16
eor r3, r3, r8, ror #2
and r8, r7, r4, ror #16
eor r4, r4, r8, ror #30
and r8, r7, r4, ror #28
eor r4, r4, r8
and r8, r7, r4, ror #16
eor r4, r4, r8, ror #2
and r8, r7, r5, ror #16
eor r5, r5, r8, ror #30
and r8, r7, r5, ror #28
eor r5, r5, r8
and r8, r7, r5, ror #16
eor r5, r5, r8, ror #2
orr r8, r2, r3 // --- sbox layer, round 2 ---
eor r5, r5, r8
mvn r5, r5
eor r8, r3, r4, lsr #1
and r8, r8, r6
eor r3, r3, r8
eor r4, r4, r8, lsl #1 //SWAPMOVE(r4, r3, 0x55555555, 1);
eor r8, r4, r5, lsr #1
and r8, r8, r6
eor r4, r4, r8
eor r5, r5, r8, lsl #1 //SWAPMOVE(r5, r4, 0x55555555, 1);
orr r8, r4, r5
eor r3, r3, r8
mvn r3, r3
eor r8, r2, r3, lsr #1
and r8, r8, r6
eor r2, r2, r8
eor r3, r3, r8, lsl #1 //SWAPMOVE(r3, r2, 0x55555555, 1);
eor r8, r5, r2, lsr #1
and r8, r8, r6
eor r5, r5, r8
eor r2, r2, r8, lsl #1 //SWAPMOVE(r2, r5, 0x55555555, 1);
orr r8, r2, r3
eor r5, r5, r8
mvn r5, r5
eor r8, r3, r4, lsr #1
and r8, r8, r6
eor r3, r3, r8
eor r4, r4, r8, lsl #1 //SWAPMOVE(r4, r3, 0x55555555, 1);
eor r8, r4, r5, lsr #1
and r8, r8, r6
eor r4, r4, r8
eor r5, r5, r8, lsl #1 //SWAPMOVE(r5, r4, 0x55555555, 1);
orr r8, r4, r5
eor r3, r3, r8
eor r8, r2, r5
and r8, r8, r6
eor r2, r2, r8
eor r5, r5, r8 //SWAPMOVE(r5, r2, 0x55555555, 0);
ldmia.w r1!, {r8-r11} //load rkeys in r8,...,r11
eor r2, r2, r8 //add rtk_2_3 + rconst
eor r3, r3, r9 //add rtk_2_3 + rconst
eor r4, r4, r10 //add rtk_2_3 + rconst
eor r5, r5, r11 //add rtk_2_3 + rconst
ldmia.w r0!,{r8-r11}
eor r2, r2, r8 //add rtk_1
eor r3, r3, r9 //add rtk_1
eor r4, r4, r10 //add rtk_1
eor r5, r5, r11 //add rtk_1
and r8, r7, r2, ror #10 // --- mixcolumns 2 ---
eor r2, r2, r8, ror #4
and r8, r7, r2, ror #6
eor r2, r2, r8, ror #6
and r8, r7, r2, ror #26
eor r2, r2, r8
and r8, r7, r3, ror #10
eor r3, r3, r8, ror #4
and r8, r7, r3, ror #6
eor r3, r3, r8, ror #6
and r8, r7, r3, ror #26
eor r3, r3, r8
and r8, r7, r4, ror #10
eor r4, r4, r8, ror #4
and r8, r7, r4, ror #6
eor r4, r4, r8, ror #6
and r8, r7, r4, ror #26
eor r4, r4, r8
and r8, r7, r5, ror #10
eor r5, r5, r8, ror #4
and r8, r7, r5, ror #6
eor r5, r5, r8, ror #6
and r8, r7, r5, ror #26
eor r5, r5, r8
orr r8, r4, r5 // --- sbox layer, round 3 ---
eor r3, r3, r8
mvn r3, r3
eor r8, r2, r3, lsr #1
and r8, r8, r6
eor r2, r2, r8
eor r3, r3, r8, lsl #1 //SWAPMOVE(r3, r2, 0x55555555, 1);
eor r8, r5, r2, lsr #1
and r8, r8, r6
eor r5, r5, r8
eor r2, r2, r8, lsl #1 //SWAPMOVE(r2, r5, 0x55555555, 1);
orr r8, r2, r3
eor r5, r5, r8
mvn r5, r5
eor r8, r3, r4, lsr #1
and r8, r8, r6
eor r3, r3, r8
eor r4, r4, r8, lsl #1 //SWAPMOVE(r4, r3, 0x55555555, 1);
eor r8, r4, r5, lsr #1
and r8, r8, r6
eor r4, r4, r8
eor r5, r5, r8, lsl #1 //SWAPMOVE(r5, r4, 0x55555555, 1);
orr r8, r4, r5
eor r3, r3, r8
mvn r3, r3
eor r8, r2, r3, lsr #1
and r8, r8, r6
eor r2, r2, r8
eor r3, r3, r8, lsl #1 //SWAPMOVE(r3, r2, 0x55555555, 1);
eor r8, r5, r2, lsr #1
and r8, r8, r6
eor r5, r5, r8
eor r2, r2, r8, lsl #1 //SWAPMOVE(r2, r5, 0x55555555, 1);
orr r8, r2, r3
eor r5, r5, r8
eor r8, r3, r4
and r8, r8, r6
eor r3, r3, r8
eor r4, r4, r8 //SWAPMOVE(r4, r3, 0x55555555, 0);
ldmia r1!, {r8-r11} //load rkeys in r8,...,r11
eor r2, r2, r8 //add rkey + rconst
eor r3, r3, r9 //add rkey + rconst
eor r4, r4, r10 //add rkey + rconst
eor r5, r5, r11 //add rkey + rconst
and r8, r7, r2, ror #4 // --- mixcolumns 3 ---
eor r2, r2, r8, ror #26
and r8, r7, r2
eor r2, r2, r8, ror #4
and r8, r7, r2, ror #4
eor r2, r2, r8, ror #22
and r8, r7, r3, ror #4
eor r3, r3, r8, ror #26
and r8, r7, r3
eor r3, r3, r8, ror #4
and r8, r7, r3, ror #4
eor r3, r3, r8, ror #22
and r8, r7, r4, ror #4
eor r4, r4, r8, ror #26
and r8, r7, r4
eor r4, r4, r8, ror #4
and r8, r7, r4, ror #4
eor r4, r4, r8, ror #22
and r8, r7, r5, ror #4
eor r5, r5, r8, ror #26
and r8, r7, r5
eor r5, r5, r8, ror #4
and r8, r7, r5, ror #4
eor r5, r5, r8, ror #22
bx lr
/******************************************************************************
* Encrypt a single block using fixsliced SKINNY-128-128.
*
* NOTE(review): the prototype comment below appears stale — the code uses all
* four argument registers: r0 (saved on the stack, reloaded at the end as the
* output pointer), r1 (round tweakeys consumed by quadruple_round as rtk2_3),
* r2 (plaintext block, loaded word-wise), r3 (moved into r0 and consumed by
* quadruple_round as rtk1). Confirm actual signature against the caller.
* Stack discipline: push {r0-r12,r14} (14 words); the final
* "ldr.w r0, [sp], #4" pops the saved r0, and pop {r1-r12,r14} restores the
* remaining 13 words, so the stack is balanced.
******************************************************************************/
@ void skinny128_384(u8* ctext, const u32* tk, const u8* ptext)
.global skinny128_384
.type skinny128_384,%function
.align 2
skinny128_384:
push {r0-r12, r14}
mov.w r0, r3 //r0 now points to rtk1 (4th argument)
ldr.w r3, [r2, #8] //load plaintext words
ldr.w r4, [r2, #4]
ldr.w r5, [r2, #12]
ldr.w r2, [r2]
movw r6, #0x0a0a
movt r6, #0x0a0a //r6 <- 0x0a0a0a0a
movw r7, #0x3030
movt r7, #0x3030 //r7 <- 0x30303030
//--- packing: byte representation -> bitsliced representation ---
eor r12, r2, r2, lsr #3
and r12, r12, r6
eor r2, r2, r12
eor r2, r2, r12, lsl #3 //SWAPMOVE(r2, r2, 0x0a0a0a0a, 3)
eor r12, r3, r3, lsr #3
and r12, r12, r6
eor r3, r3, r12
eor r3, r3, r12, lsl #3 //SWAPMOVE(r3, r3, 0x0a0a0a0a, 3)
eor r12, r4, r4, lsr #3
and r12, r12, r6
eor r4, r4, r12
eor r4, r4, r12, lsl #3 //SWAPMOVE(r4, r4, 0x0a0a0a0a, 3)
eor r12, r5, r5, lsr #3
and r12, r12, r6
eor r5, r5, r12
eor r5, r5, r12, lsl #3 //SWAPMOVE(r5, r5, 0x0a0a0a0a, 3)
eor r12, r2, r4, lsr #2
and r12, r12, r7
eor r2, r2, r12
eor r4, r4, r12, lsl #2 //SWAPMOVE(r4, r2, 0x30303030, 2)
eor r12, r2, r3, lsr #4
and r12, r12, r7, lsr #2
eor r2, r2, r12
eor r3, r3, r12, lsl #4 //SWAPMOVE(r3, r2, 0x0c0c0c0c, 4)
eor r12, r2, r5, lsr #6
and r12, r12, r7, lsr #4
eor r2, r2, r12
eor r5, r5, r12, lsl #6 //SWAPMOVE(r5, r2, 0x03030303, 6)
eor r12, r4, r3, lsr #2
and r12, r12, r7, lsr #2
eor r4, r4, r12
eor r3, r3, r12, lsl #2 //SWAPMOVE(r3, r4, 0x0c0c0c0c, 2)
eor r12, r4, r5, lsr #4
and r12, r12, r7, lsr #4
eor r4, r4, r12
eor r5, r5, r12, lsl #4 //SWAPMOVE(r5, r4, 0x03030303, 4)
eor r12, r3, r5, lsr #2
and r12, r12, r7, lsr #4
eor r3, r3, r12
eor r5, r5, r12, lsl #2 //SWAPMOVE(r5, r3, 0x03030303, 2)
movw r6, #0x5555
movt r6, #0x5555 //r6 <- 0x55555555 (SWAPMOVE mask for the rounds)
//--- 40 rounds = 10 quadruple rounds ---
bl quadruple_round
bl quadruple_round
bl quadruple_round
bl quadruple_round
sub.w r0, #128 // rtk1 repeats every 16 rounds
bl quadruple_round
bl quadruple_round
bl quadruple_round
bl quadruple_round
sub.w r0, #128 // rtk1 repeats every 16 rounds
bl quadruple_round
bl quadruple_round
//--- unpacking: bitsliced representation -> byte representation ---
movw r6, #0x0a0a
movt r6, #0x0a0a //r6 <- 0x0a0a0a0a
eor r10, r3, r5, lsr #2
and r10, r10, r7, lsr #4
eor r3, r3, r10
eor r5, r5, r10, lsl #2 //SWAPMOVE(r5, r3, 0x03030303, 2)
eor r10, r4, r5, lsr #4
and r10, r10, r7, lsr #4
eor r4, r4, r10
eor r5, r5, r10, lsl #4 //SWAPMOVE(r5, r4, 0x03030303, 4)
eor r10, r4, r3, lsr #2
and r10, r10, r7, lsr #2
eor r4, r4, r10
eor r3, r3, r10, lsl #2 //SWAPMOVE(r3, r4, 0x0c0c0c0c, 2)
eor r10, r2, r5, lsr #6
and r10, r10, r7, lsr #4
eor r2, r2, r10
eor r5, r5, r10, lsl #6 //SWAPMOVE(r5, r2, 0x03030303, 6)
eor r10, r2, r3, lsr #4
and r10, r10, r7, lsr #2
eor r2, r2, r10
eor r3, r3, r10, lsl #4 //SWAPMOVE(r3, r2, 0x0c0c0c0c, 4)
eor r10, r2, r4, lsr #2
and r10, r10, r7
eor r2, r2, r10
eor r4, r4, r10, lsl #2 //SWAPMOVE(r4, r2, 0x30303030, 2)
eor r10, r5, r5, lsr #3
and r10, r10, r6
eor r5, r5, r10
eor r5, r5, r10, lsl #3 //SWAPMOVE(r5, r5, 0x0a0a0a0a, 3)
eor r10, r4, r4, lsr #3
and r10, r10, r6
eor r4, r4, r10
eor r4, r4, r10, lsl #3 //SWAPMOVE(r4, r4, 0x0a0a0a0a, 3)
eor r10, r3, r3, lsr #3
and r10, r10, r6
eor r3, r3, r10
eor r3, r3, r10, lsl #3 //SWAPMOVE(r3, r3, 0x0a0a0a0a, 3)
eor r10, r2, r2, lsr #3
and r10, r10, r6
eor r2, r2, r10
eor r2, r2, r10, lsl #3 //SWAPMOVE(r2, r2, 0x0a0a0a0a, 3)
ldr.w r0, [sp], #4 //reload the saved 1st argument (output pointer)
strd r2, r4, [r0] //store the ciphertext block
strd r3, r5, [r0, #8]
pop {r1-r12,r14}
bx lr
\ No newline at end of file
/* NIST LWC API parameters for Romulus-N1: 128-bit key, 128-bit nonce,
 * 128-bit tag, no secret message number. */
#define CRYPTO_KEYBYTES 16
#define CRYPTO_NSECBYTES 0
#define CRYPTO_NPUBBYTES 16
#define CRYPTO_ABYTES 16
#define CRYPTO_NOOVERLAP 1
/* Authenticated encryption: writes mlen+CRYPTO_ABYTES bytes to c and sets
 * *clen accordingly; returns 0 on success. */
int crypto_aead_encrypt(
	unsigned char *c, unsigned long long *clen,
	const unsigned char *m, unsigned long long mlen,
	const unsigned char *ad, unsigned long long adlen,
	const unsigned char *nsec,
	const unsigned char *npub,
	const unsigned char *k
	);
/* Authenticated decryption: verifies the trailing tag, writes clen-CRYPTO_ABYTES
 * bytes of plaintext to m and sets *mlen; returns nonzero on failure. */
int crypto_aead_decrypt(
	unsigned char *m, unsigned long long *mlen,
	unsigned char *nsec,
	const unsigned char *c, unsigned long long clen,
	const unsigned char *ad, unsigned long long adlen,
	const unsigned char *npub,
	const unsigned char *k
	);
\ No newline at end of file
#include "skinny128.h"
#include "tk_schedule.h"
#include "romulus.h"
#include <string.h>
#include <stdio.h>
//Encryption and authentication using Romulus-N1
//
// NIST LWC AEAD encryption. The associated data is absorbed two blocks at a
// time (one block XORed into the state, the next fed as TK2 together with the
// key as TK3); the message is then encrypted one block at a time with the
// rho function, with the nonce/key tweakeys (rtk2_3) precomputed once. The
// 56-bit LFSR block counter lives in tk1[0..6] and the domain separation byte
// in tk1[7]. Returns 0; *clen is set to mlen + TAGBYTES.
int crypto_aead_encrypt
	(unsigned char *c, unsigned long long *clen,
	const unsigned char *m, unsigned long long mlen,
	const unsigned char *ad, unsigned long long adlen,
	const unsigned char *nsec,
	const unsigned char *npub,
	const unsigned char *k) {
	int i;
	u32 tmp;                              // scratch word used by the G/UPDATE_CTR macros
	skinny_128_384_tks tks;               // TK1 bytes + precomputed round tweakeys
	u8 state[BLOCKBYTES], pad[BLOCKBYTES]; // pad doubles as scratch for RHO
	(void)nsec;                           // no secret message number in this scheme
	// ----------------- Initialization -----------------
	*clen = mlen + TAGBYTES;
	memset(tks.tk1, 0x00, KEYBYTES);
	memset(state, 0x00, BLOCKBYTES);
	tks.tk1[0] = 0x01; //56-bit LFSR counter
	// ----------------- Initialization -----------------
	// ----------------- Process the associated data -----------------
	//Handle the special case of no associated data
	if (adlen == 0) {
		UPDATE_CTR(tks.tk1);
		SET_DOMAIN(tks, 0x1A);
		precompute_rtk2_3(tks.rtk2_3, npub, k); // nonce as TK2, key as TK3
		precompute_rtk1(tks.rtk1, tks.tk1);
		skinny128_384_plus(state, state, tks.rtk1, tks.rtk2_3);
	} else {
		// Process all double blocks except the last
		SET_DOMAIN(tks, 0x08);
		while (adlen > 2*BLOCKBYTES) {
			UPDATE_CTR(tks.tk1);
			XOR_BLOCK(state, state, ad);     // 1st block absorbed into the state
			precompute_rtk2_3(tks.rtk2_3, ad + BLOCKBYTES, k); // 2nd block as TK2
			precompute_rtk1(tks.rtk1, tks.tk1);
			skinny128_384_plus(state, state, tks.rtk1, tks.rtk2_3);
			UPDATE_CTR(tks.tk1);             // counter advances once per block
			ad += 2*BLOCKBYTES;
			adlen -= 2*BLOCKBYTES;
		}
		//Pad and process the left-over blocks
		UPDATE_CTR(tks.tk1);
		if (adlen == 2*BLOCKBYTES) {
			// Left-over complete double block
			XOR_BLOCK(state, state, ad);
			precompute_rtk2_3(tks.rtk2_3, ad + BLOCKBYTES, k);
			precompute_rtk1(tks.rtk1, tks.tk1);
			skinny128_384_plus(state, state, tks.rtk1, tks.rtk2_3);
			UPDATE_CTR(tks.tk1);
			SET_DOMAIN(tks, 0x18);
		} else if (adlen > BLOCKBYTES) {
			// Left-over partial double block: 10*-style length padding in pad[15]
			adlen -= BLOCKBYTES;
			XOR_BLOCK(state, state, ad);
			memcpy(pad, ad + BLOCKBYTES, adlen);
			memset(pad + adlen, 0x00, 15 - adlen);
			pad[15] = adlen;                 // last byte encodes the partial length
			precompute_rtk2_3(tks.rtk2_3, pad, k);
			precompute_rtk1(tks.rtk1, tks.tk1);
			skinny128_384_plus(state, state, tks.rtk1, tks.rtk2_3);
			UPDATE_CTR(tks.tk1);
			SET_DOMAIN(tks, 0x1A);
		} else if (adlen == BLOCKBYTES) {
			// Left-over complete single block
			XOR_BLOCK(state, state, ad);
			SET_DOMAIN(tks, 0x18);
		} else {
			// Left-over partial single block, padded in the state directly
			for(i =0; i < (int)adlen; i++)
				state[i] ^= ad[i];
			state[15] ^= adlen;              // implicit truncation to u8
			SET_DOMAIN(tks, 0x1A);
		}
		precompute_rtk2_3(tks.rtk2_3, npub, k); // final AD call uses the nonce
		precompute_rtk1(tks.rtk1, tks.tk1);
		skinny128_384_plus(state, state, tks.rtk1, tks.rtk2_3);
	}
	// ----------------- Process the associated data -----------------
	// ----------------- Process the plaintext -----------------
	memset(tks.tk1, 0, KEYBYTES);            // restart the counter for the message phase
	tks.tk1[0] = 0x01; //init the 56-bit LFSR counter
	if (mlen == 0) {
		UPDATE_CTR(tks.tk1);
		SET_DOMAIN(tks, 0x15);
		precompute_rtk1(tks.rtk1, tks.tk1);
		skinny128_384_plus(state, state, tks.rtk1, tks.rtk2_3);
	} else {
		//process all blocks except the last
		SET_DOMAIN(tks, 0x04);
		while (mlen > BLOCKBYTES) {
			RHO(state,c,m);                  // c = G(state) ^ m; state ^= m
			UPDATE_CTR(tks.tk1);
			precompute_rtk1(tks.rtk1, tks.tk1); // rtk2_3 stays fixed (nonce/key)
			skinny128_384_plus(state, state, tks.rtk1, tks.rtk2_3);
			c += BLOCKBYTES;
			m += BLOCKBYTES;
			mlen -= BLOCKBYTES;
		}
		//pad and process the last block
		UPDATE_CTR(tks.tk1);
		if (mlen < BLOCKBYTES) {
			// partial last block: byte-wise rho, G applied per byte
			for(i = 0; i < (int)mlen; i++) {
				tmp = m[i]; //use of tmp variable just in case 'c = m'
				c[i] = m[i] ^ (state[i] >> 1) ^ (state[i] & 0x80) ^ (state[i] << 7);
				state[i] ^= (u8)tmp;
			}
			state[15] ^= (u8)mlen; //padding
			SET_DOMAIN(tks, 0x15);
		} else {
			RHO(state,c,m);
			SET_DOMAIN(tks, 0x14);
		}
		precompute_rtk1(tks.rtk1, tks.tk1);
		skinny128_384_plus(state, state, tks.rtk1, tks.rtk2_3);
		c += mlen;                           // c now points where the tag goes
	}
	// ----------------- Process the plaintext -----------------
	// ----------------- Generate the tag -----------------
	G(state, state);
	memcpy(c, state, TAGBYTES);
	// ----------------- Generate the tag -----------------
	return 0;
}
//Decryption and tag verification using Romulus-N1
//
// NIST LWC AEAD decryption: mirrors crypto_aead_encrypt (same AD absorption,
// RHO_INV instead of RHO for the ciphertext) and then recomputes the tag.
// Returns 0 on success and -1 on failure (clen too short or tag mismatch).
// The tag comparison accumulates byte XORs so its timing does not depend on
// where the first mismatch occurs.
// NOTE(review): plaintext is written to m before the tag is verified, so the
// caller must discard m when the return value is nonzero.
int crypto_aead_decrypt
	(unsigned char *m, unsigned long long *mlen,
	unsigned char *nsec,
	const unsigned char *c, unsigned long long clen,
	const unsigned char *ad, unsigned long long adlen,
	const unsigned char *npub,
	const unsigned char *k) {
	int i;
	u32 tmp;                              // scratch word used by the G/UPDATE_CTR macros
	skinny_128_384_tks tks;               // TK1 bytes + precomputed round tweakeys
	u8 state[BLOCKBYTES], pad[BLOCKBYTES]; // pad doubles as scratch for RHO_INV
	(void)nsec;                           // no secret message number in this scheme
	if (clen < TAGBYTES)
		return -1;
	// ----------------- Initialization -----------------
	*mlen = clen - TAGBYTES;
	memset(tks.tk1, 0x00, KEYBYTES);
	memset(state, 0x00, BLOCKBYTES);
	tks.tk1[0] = 0x01; //56-bit LFSR counter
	// ----------------- Initialization -----------------
	// ----------------- Process the associated data -----------------
	//Handle the special case of no associated data
	if (adlen == 0) {
		UPDATE_CTR(tks.tk1);
		SET_DOMAIN(tks, 0x1A);
		precompute_rtk2_3(tks.rtk2_3, npub, k); // nonce as TK2, key as TK3
		precompute_rtk1(tks.rtk1, tks.tk1);
		skinny128_384_plus(state, state, tks.rtk1, tks.rtk2_3);
	} else {
		// Process all double blocks except the last
		SET_DOMAIN(tks, 0x08);
		while (adlen > 2*BLOCKBYTES) {
			UPDATE_CTR(tks.tk1);
			XOR_BLOCK(state, state, ad);     // 1st block absorbed into the state
			precompute_rtk2_3(tks.rtk2_3, ad + BLOCKBYTES, k); // 2nd block as TK2
			precompute_rtk1(tks.rtk1, tks.tk1);
			skinny128_384_plus(state, state, tks.rtk1, tks.rtk2_3);
			UPDATE_CTR(tks.tk1);             // counter advances once per block
			ad += 2*BLOCKBYTES;
			adlen -= 2*BLOCKBYTES;
		}
		//Pad and process the left-over blocks
		UPDATE_CTR(tks.tk1);
		if (adlen == 2*BLOCKBYTES) {
			// Left-over complete double block
			XOR_BLOCK(state, state, ad);
			precompute_rtk2_3(tks.rtk2_3, ad + BLOCKBYTES, k);
			precompute_rtk1(tks.rtk1, tks.tk1);
			skinny128_384_plus(state, state, tks.rtk1, tks.rtk2_3);
			UPDATE_CTR(tks.tk1);
			SET_DOMAIN(tks, 0x18);
		} else if (adlen > BLOCKBYTES) {
			// Left-over partial double block: 10*-style length padding in pad[15]
			adlen -= BLOCKBYTES;
			XOR_BLOCK(state, state, ad);
			memcpy(pad, ad + BLOCKBYTES, adlen);
			memset(pad + adlen, 0x00, 15 - adlen);
			pad[15] = adlen;                 // last byte encodes the partial length
			precompute_rtk2_3(tks.rtk2_3, pad, k);
			precompute_rtk1(tks.rtk1, tks.tk1);
			skinny128_384_plus(state, state, tks.rtk1, tks.rtk2_3);
			UPDATE_CTR(tks.tk1);
			SET_DOMAIN(tks, 0x1A);
		} else if (adlen == BLOCKBYTES) {
			// Left-over complete single block
			XOR_BLOCK(state, state, ad);
			SET_DOMAIN(tks, 0x18);
		} else {
			// Left-over partial single block, padded in the state directly
			for(i =0; i < (int)adlen; i++)
				state[i] ^= ad[i];
			state[15] ^= adlen;              // implicit truncation to u8
			SET_DOMAIN(tks, 0x1A);
		}
		precompute_rtk2_3(tks.rtk2_3, npub, k); // final AD call uses the nonce
		precompute_rtk1(tks.rtk1, tks.tk1);
		skinny128_384_plus(state, state, tks.rtk1, tks.rtk2_3);
	}
	// ----------------- Process the associated data -----------------
	// ----------------- Process the ciphertext -----------------
	clen -= TAGBYTES;                        // clen is now the plaintext length
	memset(tks.tk1, 0, KEYBYTES);            // restart the counter for the message phase
	tks.tk1[0] = 0x01; //init the 56-bit LFSR counter
	if (clen == 0) {
		UPDATE_CTR(tks.tk1);
		SET_DOMAIN(tks, 0x15);
		precompute_rtk1(tks.rtk1, tks.tk1);
		skinny128_384_plus(state, state, tks.rtk1, tks.rtk2_3);
	} else {
		//process all blocks except the last
		SET_DOMAIN(tks, 0x04);
		while (clen > BLOCKBYTES) {
			RHO_INV(state,c,m);              // m = G(state) ^ c; state ^= m
			UPDATE_CTR(tks.tk1);
			precompute_rtk1(tks.rtk1, tks.tk1); // rtk2_3 stays fixed (nonce/key)
			skinny128_384_plus(state, state, tks.rtk1, tks.rtk2_3);
			c += BLOCKBYTES;
			m += BLOCKBYTES;
			clen -= BLOCKBYTES;
		}
		//pad and process the last block
		UPDATE_CTR(tks.tk1);
		if (clen < BLOCKBYTES) {
			// partial last block: byte-wise inverse rho, G applied per byte
			for(i = 0; i < (int)clen; i++) {
				m[i] = c[i] ^ (state[i] >> 1) ^ (state[i] & 0x80) ^ (state[i] << 7);
				state[i] ^= m[i];
			}
			state[15] ^= (u8)clen; //padding
			SET_DOMAIN(tks, 0x15);
		} else {
			RHO_INV(state,c,m);
			SET_DOMAIN(tks, 0x14);
		}
		precompute_rtk1(tks.rtk1, tks.tk1);
		skinny128_384_plus(state, state, tks.rtk1, tks.rtk2_3);
	}
	// ----------------- Process the ciphertext -----------------
	// ----------------- Generate and check the tag -----------------
	G(state,state);
	tmp = 0;
	for(i = 0; i < TAGBYTES; i++)
		tmp |= state[i] ^ c[clen+i]; //constant-time tag comparison
	// ----------------- Generate and check the tag -----------------
	// Fix: the original returned the raw accumulator (a value in 1..255 on
	// mismatch); the NIST LWC API expects a negative value on failure.
	return tmp ? -1 : 0;
}
\ No newline at end of file
#ifndef ROMULUSN1_H_
#define ROMULUSN1_H_
#include "skinny128.h"
typedef unsigned char u8;
typedef unsigned int u32;
/* All tweakey material for one SKINNY-128-384 instance. */
typedef struct {
	u8 tk1[16]; //to manipulate tk1 byte-wise
	u32 rtk1[4*16]; //to avoid tk schedule recomputations
	u32 rtk2_3[4*SKINNY128_384_ROUNDS]; //all round tweakeys
} skinny_128_384_tks;
#define TAGBYTES 16
#define KEYBYTES 16
#define BLOCKBYTES 16
//Domain separation byte lives in tk1[7]
#define SET_DOMAIN(tks, domain) ((tks).tk1[7] = (domain))
//G as defined in the Romulus specification in a 32-bit word-wise manner
//NOTE(review): these macros cast u8* buffers to u32* — assumes the buffers
//are 4-byte aligned and relies on implementation-defined aliasing; all
//callers in this file pass 16-byte local arrays. GNU statement expressions
//and an in-scope 'u32 tmp' are required.
#define G(x,y) ({ \
	tmp = ((u32*)(y))[0]; \
	((u32*)(x))[0] = (tmp >> 1 & 0x7f7f7f7f) ^ ((tmp ^ (tmp << 7)) & 0x80808080); \
	tmp = ((u32*)(y))[1]; \
	((u32*)(x))[1] = (tmp >> 1 & 0x7f7f7f7f) ^ ((tmp ^ (tmp << 7)) & 0x80808080); \
	tmp = ((u32*)(y))[2]; \
	((u32*)(x))[2] = (tmp >> 1 & 0x7f7f7f7f) ^ ((tmp ^ (tmp << 7)) & 0x80808080); \
	tmp = ((u32*)(y))[3]; \
	((u32*)(x))[3] = (tmp >> 1 & 0x7f7f7f7f) ^ ((tmp ^ (tmp << 7)) & 0x80808080); \
	})
//update the counter in tk1 in a 32-bit word-wise manner
//(56-bit LFSR over tk1[0..6]; tk1[7] holds the domain byte and is preserved)
#define UPDATE_CTR(tk1) ({ \
	tmp = ((u32*)(tk1))[1]; \
	((u32*)(tk1))[1] = (tmp << 1) & 0x00ffffff; \
	((u32*)(tk1))[1] |= (((u32*)(tk1))[0] >> 31); \
	((u32*)(tk1))[1] |= tmp & 0xff000000; \
	((u32*)(tk1))[0] <<= 1; \
	if ((tmp >> 23) & 0x01) \
		((u32*)(tk1))[0] ^= 0x95; \
	})
//x <- y ^ z for 128-bit blocks
#define XOR_BLOCK(x,y,z) ({ \
	((u32*)(x))[0] = ((u32*)(y))[0] ^ ((u32*)(z))[0]; \
	((u32*)(x))[1] = ((u32*)(y))[1] ^ ((u32*)(z))[1]; \
	((u32*)(x))[2] = ((u32*)(y))[2] ^ ((u32*)(z))[2]; \
	((u32*)(x))[3] = ((u32*)(y))[3] ^ ((u32*)(z))[3]; \
	})
//Rho as defined in the Romulus specification
//use pad as a tmp variable in case y = z
#define RHO(x,y,z) ({ \
	G(pad,x); \
	XOR_BLOCK(y, pad, z); \
	XOR_BLOCK(x, x, z); \
	})
//Rho inverse as defined in the Romulus specification
//use pad as a tmp variable in case y = z
#define RHO_INV(x, y, z) ({ \
	G(pad, x); \
	XOR_BLOCK(z, pad, y); \
	XOR_BLOCK(x, x, z); \
	})
#endif // ROMULUSN1_H_
\ No newline at end of file
/******************************************************************************
* Constant-time implementation of the SKINNY tweakable block ciphers.
*
* This implementation doesn't compute the ShiftRows operation. Some masks and
* shifts are applied during the MixColumns operation so that the proper bits
* are XORed together. Moreover, the row permutation within the MixColumns
* is omitted, as well as the bit permutation at the end of the Sbox. The rows
* are synchronized with the classical after only 4 rounds. Therefore, this
* implementation relies on a "QUADRUPLE_ROUND" routine.
*
* The Sbox computation takes advantage of some symmetry in the 8-bit Sbox to
* turn it into a 4-bit S-box computation. Although the last bit permutation
* within the Sbox is not computed, the bit ordering is synchronized with the
* classical representation after 2 calls.
*
* @author Alexandre Adomnicai, Nanyang Technological University,
* alexandre.adomnicai@ntu.edu.sg
*
* @date May 2020
******************************************************************************/
#include <stdio.h>
#include <string.h>
#include "skinny128.h"
#include "tk_schedule.h"
/******************************************************************************
* The MixColumns computation for rounds i such that (i % 4) == 0.
* Each 32-bit slice is updated in place by three mask-rotate-XOR steps.
******************************************************************************/
void mixcolumns_0(u32* state) {
	for(int j = 3; j >= 0; j--) {
		u32 w = state[j];
		w ^= ROR(ROR(w,24) & 0x0c0c0c0c, 30);
		w ^= ROR(ROR(w,16) & 0xc0c0c0c0, 4);
		w ^= ROR(ROR(w,8) & 0x0c0c0c0c, 2);
		state[j] = w;
	}
}
/******************************************************************************
* The MixColumns computation for rounds i such that (i % 4) == 1.
* Each 32-bit slice is updated in place by three mask-rotate-XOR steps.
******************************************************************************/
void mixcolumns_1(u32* state) {
	for(int j = 3; j >= 0; j--) {
		u32 w = state[j];
		w ^= ROR(ROR(w,16) & 0x30303030, 30);
		w ^= ROR(w & 0x03030303, 28);
		w ^= ROR(ROR(w,16) & 0x30303030, 2);
		state[j] = w;
	}
}
/******************************************************************************
* The MixColumns computation for rounds i such that (i % 4) == 2.
* Each 32-bit slice is updated in place by three mask-rotate-XOR steps.
******************************************************************************/
void mixcolumns_2(u32* state) {
	for(int j = 3; j >= 0; j--) {
		u32 w = state[j];
		w ^= ROR(ROR(w,8) & 0xc0c0c0c0, 6);
		w ^= ROR(ROR(w,16) & 0x0c0c0c0c, 28);
		w ^= ROR(ROR(w,24) & 0xc0c0c0c0, 2);
		state[j] = w;
	}
}
/******************************************************************************
* The MixColumns computation for rounds i such that (i % 4) == 3.
* Each 32-bit slice is updated in place by three mask-rotate-XOR steps.
******************************************************************************/
void mixcolumns_3(u32* state) {
	for(int j = 3; j >= 0; j--) {
		u32 w = state[j];
		w ^= ROR(w & 0x03030303, 30);
		w ^= ROR(w & 0x30303030, 4);
		w ^= ROR(w & 0x03030303, 26);
		state[j] = w;
	}
}
/******************************************************************************
* Encryption of a single block without any operation mode using SKINNY-128-384.
* RTK1 and RTK2_3 are given separately to take advantage of the fact that
* TK2 and TK3 remains the same through the entire data encryption/decryption.
*
* ctext and ptext may alias (the state is unpacked only at the end). rtk1
* holds 16 rounds' worth of TK1 tweakeys (4*16 words) and is reused cyclically
* since P^16 = Id on TK1; rtk2_3 holds all 40 rounds (4*40 words).
******************************************************************************/
void skinny128_384_plus(u8* ctext, const u8* ptext, const u32* rtk1,
				const u32* rtk2_3) {
	u32 tmp; // used in SWAPMOVE macro
	u32 state[4]; // 128-bit state
	packing(state, ptext); // from byte to bitsliced representation
	QUADRUPLE_ROUND(state, rtk1, rtk2_3); // rounds 0-3
	QUADRUPLE_ROUND(state, rtk1+16, rtk2_3+16);
	QUADRUPLE_ROUND(state, rtk1+32, rtk2_3+32);
	QUADRUPLE_ROUND(state, rtk1+48, rtk2_3+48);
	QUADRUPLE_ROUND(state, rtk1, rtk2_3+64); // rtk1 wraps: repeats every 16 rounds
	QUADRUPLE_ROUND(state, rtk1+16, rtk2_3+80);
	QUADRUPLE_ROUND(state, rtk1+32, rtk2_3+96);
	QUADRUPLE_ROUND(state, rtk1+48, rtk2_3+112);
	QUADRUPLE_ROUND(state, rtk1, rtk2_3+128); // rtk1 wraps again
	QUADRUPLE_ROUND(state, rtk1+16, rtk2_3+144); // rounds 36-39
	unpacking(ctext, state); // from bitsliced to byte representation
}
\ No newline at end of file
#ifndef SKINNY128_H_
#define SKINNY128_H_
typedef unsigned char u8;
typedef unsigned int u32;
/* Fixsliced SKINNY-128-384 block encryption; see skinny128.c. */
void skinny128_384_plus(u8* ctext, const u8* ptext, const u32* rtk1, const u32* rtk2_3);
#define SKINNY128_384_ROUNDS 40
/* Four fixsliced SKINNY rounds: each round is an Sbox layer expressed as
 * NOR/XOR + SWAPMOVE steps, the round-tweakey/rconst addition (rtk1 and
 * rtk2_3 each contribute 4 words per round), and one of the four MixColumns
 * variants (the bit ordering rotates with the round index mod 4).
 * Requires SWAPMOVE and the mixcolumns_* functions, plus a GNU statement
 * expression context. */
#define QUADRUPLE_ROUND(state, rtk1, rtk2_3) ({ \
	state[3] ^= ~(state[0] | state[1]); \
	SWAPMOVE(state[2], state[1], 0x55555555, 1); \
	SWAPMOVE(state[3], state[2], 0x55555555, 1); \
	state[1] ^= ~(state[2] | state[3]); \
	SWAPMOVE(state[1], state[0], 0x55555555, 1); \
	SWAPMOVE(state[0], state[3], 0x55555555, 1); \
	state[3] ^= ~(state[0] | state[1]); \
	SWAPMOVE(state[2], state[1], 0x55555555, 1); \
	SWAPMOVE(state[3], state[2], 0x55555555, 1); \
	state[1] ^= (state[2] | state[3]); \
	SWAPMOVE(state[3], state[0], 0x55555555, 0); \
	state[0] ^= (rtk1)[0]; \
	state[1] ^= (rtk1)[1]; \
	state[2] ^= (rtk1)[2]; \
	state[3] ^= (rtk1)[3]; \
	state[0] ^= (rtk2_3)[0]; \
	state[1] ^= (rtk2_3)[1]; \
	state[2] ^= (rtk2_3)[2]; \
	state[3] ^= (rtk2_3)[3]; \
	mixcolumns_0(state); \
	state[1] ^= ~(state[2] | state[3]); \
	SWAPMOVE(state[1], state[0], 0x55555555, 1); \
	SWAPMOVE(state[0], state[3], 0x55555555, 1); \
	state[3] ^= ~(state[0] | state[1]); \
	SWAPMOVE(state[2], state[1], 0x55555555, 1); \
	SWAPMOVE(state[3], state[2], 0x55555555, 1); \
	state[1] ^= ~(state[2] | state[3]); \
	SWAPMOVE(state[1], state[0], 0x55555555, 1); \
	SWAPMOVE(state[0], state[3], 0x55555555, 1); \
	state[3] ^= (state[0] | state[1]); \
	SWAPMOVE(state[1], state[2], 0x55555555, 0); \
	state[0] ^= (rtk1)[4]; \
	state[1] ^= (rtk1)[5]; \
	state[2] ^= (rtk1)[6]; \
	state[3] ^= (rtk1)[7]; \
	state[0] ^= (rtk2_3)[4]; \
	state[1] ^= (rtk2_3)[5]; \
	state[2] ^= (rtk2_3)[6]; \
	state[3] ^= (rtk2_3)[7]; \
	mixcolumns_1(state); \
	state[3] ^= ~(state[0] | state[1]); \
	SWAPMOVE(state[2], state[1], 0x55555555, 1); \
	SWAPMOVE(state[3], state[2], 0x55555555, 1); \
	state[1] ^= ~(state[2] | state[3]); \
	SWAPMOVE(state[1], state[0], 0x55555555, 1); \
	SWAPMOVE(state[0], state[3], 0x55555555, 1); \
	state[3] ^= ~(state[0] | state[1]); \
	SWAPMOVE(state[2], state[1], 0x55555555, 1); \
	SWAPMOVE(state[3], state[2], 0x55555555, 1); \
	state[1] ^= (state[2] | state[3]); \
	SWAPMOVE(state[3], state[0], 0x55555555, 0); \
	state[0] ^= (rtk1)[8]; \
	state[1] ^= (rtk1)[9]; \
	state[2] ^= (rtk1)[10]; \
	state[3] ^= (rtk1)[11]; \
	state[0] ^= (rtk2_3)[8]; \
	state[1] ^= (rtk2_3)[9]; \
	state[2] ^= (rtk2_3)[10]; \
	state[3] ^= (rtk2_3)[11]; \
	mixcolumns_2(state); \
	state[1] ^= ~(state[2] | state[3]); \
	SWAPMOVE(state[1], state[0], 0x55555555, 1); \
	SWAPMOVE(state[0], state[3], 0x55555555, 1); \
	state[3] ^= ~(state[0] | state[1]); \
	SWAPMOVE(state[2], state[1], 0x55555555, 1); \
	SWAPMOVE(state[3], state[2], 0x55555555, 1); \
	state[1] ^= ~(state[2] | state[3]); \
	SWAPMOVE(state[1], state[0], 0x55555555, 1); \
	SWAPMOVE(state[0], state[3], 0x55555555, 1); \
	state[3] ^= (state[0] | state[1]); \
	SWAPMOVE(state[1], state[2], 0x55555555, 0); \
	state[0] ^= (rtk1)[12]; \
	state[1] ^= (rtk1)[13]; \
	state[2] ^= (rtk1)[14]; \
	state[3] ^= (rtk1)[15]; \
	state[0] ^= (rtk2_3)[12]; \
	state[1] ^= (rtk2_3)[13]; \
	state[2] ^= (rtk2_3)[14]; \
	state[3] ^= (rtk2_3)[15]; \
	mixcolumns_3(state); \
	})
#endif // SKINNY128_H_
\ No newline at end of file
/******************************************************************************
* Implementation of the SKINNY tweakey schedule to match fixslicing.
*
* @author Alexandre Adomnicai, Nanyang Technological University,
* alexandre.adomnicai@ntu.edu.sg
*
* @date May 2020
******************************************************************************/
#include <stdio.h>
#include <string.h> //for memcmp
#include "tk_schedule.h"
#include "skinny128.h"
typedef unsigned char u8;
typedef unsigned int u32;
/******************************************************************************
* The round constants according to the new representation.
******************************************************************************/
// Round constants pre-transformed into the fixsliced (bitsliced) domain:
// 4 x 32-bit words per round. 160 words = 40 rounds' worth of constants.
// NOTE(review): precompute_rtk2_3 below indexes this table up to
// 4*SKINNY128_384_ROUNDS-1 (= 223 when SKINNY128_384_ROUNDS is 56), which
// exceeds these 160 entries — confirm the table length matches the round
// count used by this build.
u32 rconst_32_bs[160] = {
0x00000004, 0xffffffbf, 0x00000000, 0x00000000, 0x00000000, 0x00000000,
0x10000100, 0xfffffeff, 0x44000000, 0xfbffffff, 0x00000000, 0x04000000,
0x00100000, 0x00100000, 0x00100001, 0xffefffff, 0x00440000, 0xffafffff,
0x00400000, 0x00400000, 0x01000000, 0x01000000, 0x01401000, 0xffbfffff,
0x01004000, 0xfefffbff, 0x00000400, 0x00000400, 0x00000010, 0x00000000,
0x00010410, 0xfffffbef, 0x00000054, 0xffffffaf, 0x00000000, 0x00000040,
0x00000100, 0x00000100, 0x10000140, 0xfffffeff, 0x44000000, 0xfffffeff,
0x04000000, 0x04000000, 0x00100000, 0x00100000, 0x04000001, 0xfbffffff,
0x00140000, 0xffafffff, 0x00400000, 0x00000000, 0x00000000, 0x00000000,
0x01401000, 0xfebfffff, 0x01004400, 0xfffffbff, 0x00000000, 0x00000400,
0x00000010, 0x00000010, 0x00010010, 0xffffffff, 0x00000004, 0xffffffaf,
0x00000040, 0x00000040, 0x00000100, 0x00000000, 0x10000140, 0xffffffbf,
0x40000100, 0xfbfffeff, 0x00000000, 0x04000000, 0x00100000, 0x00000000,
0x04100001, 0xffefffff, 0x00440000, 0xffefffff, 0x00000000, 0x00400000,
0x01000000, 0x01000000, 0x00401000, 0xffffffff, 0x00004000, 0xfeffffff,
0x00000400, 0x00000000, 0x00000000, 0x00000000, 0x00010400, 0xfffffbff,
0x00000014, 0xffffffbf, 0x00000000, 0x00000000, 0x00000000, 0x00000000,
0x10000100, 0xffffffff, 0x40000000, 0xfbffffff, 0x00000000, 0x04000000,
0x00100000, 0x00000000, 0x00100001, 0xffefffff, 0x00440000, 0xffafffff,
0x00000000, 0x00400000, 0x01000000, 0x01000000, 0x01401000, 0xffffffff,
0x00004000, 0xfeffffff, 0x00000400, 0x00000400, 0x00000010, 0x00000000,
0x00010400, 0xfffffbff, 0x00000014, 0xffffffaf, 0x00000000, 0x00000000,
0x00000000, 0x00000000, 0x10000140, 0xfffffeff, 0x44000000, 0xffffffff,
0x00000000, 0x04000000, 0x00100000, 0x00100000, 0x00000001, 0xffefffff,
0x00440000, 0xffafffff, 0x00400000, 0x00000000, 0x00000000, 0x01000000,
0x01401000, 0xffbfffff, 0x01004000, 0xfffffbff, 0x00000400, 0x00000400,
0x00000010, 0x00000000, 0x00010010, 0xfffffbff
};
/******************************************************************************
* Pack the input into the bitsliced representation
* 24 28 56 60 88 92 120 124 | ... | 0 4 32 36 64 68 96 100
* 25 29 57 61 89 93 121 125 | ... | 1 5 33 37 65 69 97 101
* 26 30 58 62 90 94 122 126 | ... | 2 6 34 38 66 70 98 102
* 27 31 59 63 91 95 123 127 | ... | 3 7 35 39 67 71 99 103
******************************************************************************/
// Pack a 16-byte block into the 4-word bitsliced representation described
// above. The byte order of the loads (0, 8, 4, 12) and the SWAPMOVE network
// below are order-sensitive: unpacking() applies exactly the inverse steps.
// Requires a local `u32 tmp` for the SWAPMOVE macro.
void packing(u32* out, const u8* in) {
u32 tmp;
// Little-endian loads; words are interleaved (quartets 0/2 and 1/3 swapped).
LE_LOAD(out, in);
LE_LOAD(out + 1, in + 8);
LE_LOAD(out + 2, in + 4);
LE_LOAD(out + 3, in + 12);
// In-word bit swaps first, then cross-word swaps to finish the transpose.
SWAPMOVE(out[0], out[0], 0x0a0a0a0a, 3);
SWAPMOVE(out[1], out[1], 0x0a0a0a0a, 3);
SWAPMOVE(out[2], out[2], 0x0a0a0a0a, 3);
SWAPMOVE(out[3], out[3], 0x0a0a0a0a, 3);
SWAPMOVE(out[2], out[0], 0x30303030, 2);
SWAPMOVE(out[1], out[0], 0x0c0c0c0c, 4);
SWAPMOVE(out[3], out[0], 0x03030303, 6);
SWAPMOVE(out[1], out[2], 0x0c0c0c0c, 2);
SWAPMOVE(out[3], out[2], 0x03030303, 4);
SWAPMOVE(out[3], out[1], 0x03030303, 2);
}
/******************************************************************************
* Unpack the input to a byte-wise representation
******************************************************************************/
// Inverse of packing(): the SWAPMOVE sequence below is the exact reverse of
// the one in packing() (SWAPMOVE is an involution for a fixed mask/shift),
// followed by the same little-endian stores in 0/8/4/12 byte order.
// Note: `in` is modified in place before being stored out.
void unpacking(u8* out, u32 *in) {
u32 tmp;
SWAPMOVE(in[3], in[1], 0x03030303, 2);
SWAPMOVE(in[3], in[2], 0x03030303, 4);
SWAPMOVE(in[1], in[2], 0x0c0c0c0c, 2);
SWAPMOVE(in[3], in[0], 0x03030303, 6);
SWAPMOVE(in[1], in[0], 0x0c0c0c0c, 4);
SWAPMOVE(in[2], in[0], 0x30303030, 2);
SWAPMOVE(in[0], in[0], 0x0a0a0a0a, 3);
SWAPMOVE(in[1], in[1], 0x0a0a0a0a, 3);
SWAPMOVE(in[2], in[2], 0x0a0a0a0a, 3);
SWAPMOVE(in[3], in[3], 0x0a0a0a0a, 3);
LE_STORE(out, in[0]);
LE_STORE(out + 8, in[1]);
LE_STORE(out + 4, in[2]);
LE_STORE(out + 12, in[3]);
}
/******************************************************************************
* 0 4 1 5
* 1 5 ---> 2 6
* 2 6 3 7
* 3 7 4 0
******************************************************************************/
/*
 * One LFSR2 step on the bitsliced TK2 slices.
 * Feedback word: x0 XOR (x2 restricted to the odd bit positions), with the
 * two bits of each pair swapped afterwards; the four slice words then rotate
 * up by one and the feedback fills the vacated last slot.
 */
void lfsr2_bs(u32* tk) {
    u32 fb = tk[0] ^ (tk[2] & 0xaaaaaaaa);
    fb = ((fb & 0xaaaaaaaa) >> 1) | ((fb << 1) & 0xaaaaaaaa);
    memmove(tk, tk + 1, 3 * sizeof(u32)); /* tk[0..2] <- tk[1..3] */
    tk[3] = fb;
}
/******************************************************************************
* 0 4 7 3
* 1 5 ---> 0 4
* 2 6 1 5
* 3 7 2 6
******************************************************************************/
/*
 * One LFSR3 step on the bitsliced TK3 slices.
 * Feedback word: x3 XOR (x1's odd bits shifted down), with the two bits of
 * each pair swapped afterwards; the four slice words then rotate down by one
 * and the feedback fills the vacated first slot.
 */
void lfsr3_bs(u32* tk) {
    u32 fb = tk[3] ^ ((tk[1] & 0xaaaaaaaa) >> 1);
    fb = ((fb & 0xaaaaaaaa) >> 1) | ((fb << 1) & 0xaaaaaaaa);
    memmove(tk + 1, tk, 3 * sizeof(u32)); /* tk[1..3] <- tk[0..2] */
    tk[0] = fb;
}
/******************************************************************************
* Apply the permutation in a bitsliced manner, twice
******************************************************************************/
/* Apply P^2 to each of the four bitsliced tweakey words. */
void permute_tk_2(u32* tk) {
    for (int i = 0; i < 4; ++i) {
        u32 w = tk[i];
        tk[i] = (ROR(w, 14) & 0xcc00cc00)
              | ((w & 0x000000ff) << 16)
              | ((w & 0xcc000000) >> 2)
              | ((w & 0x0033cc00) >> 8)
              | ((w & 0x00cc0000) >> 18);
    }
}
/******************************************************************************
* Apply the permutation in a bitsliced manner, 4 times
******************************************************************************/
/* Apply P^4 to each of the four bitsliced tweakey words. */
void permute_tk_4(u32* tk) {
    for (int i = 0; i < 4; ++i) {
        u32 w = tk[i];
        tk[i] = (ROR(w, 22) & 0xcc0000cc)
              | (ROR(w, 16) & 0x3300cc00)
              | (ROR(w, 24) & 0x00cc3300)
              | ((w & 0x00cc00cc) >> 2);
    }
}
/******************************************************************************
* Apply the permutation in a bitsliced manner, 6 times
******************************************************************************/
/* Apply P^6 to each of the four bitsliced tweakey words. */
void permute_tk_6(u32* tk) {
    for (int i = 0; i < 4; ++i) {
        u32 w = tk[i];
        tk[i] = (ROR(w, 6) & 0xcccc0000)
              | (ROR(w, 24) & 0x330000cc)
              | (ROR(w, 10) & 0x3333)
              | ((w & 0xcc) << 14)
              | ((w & 0x3300) << 2);
    }
}
/******************************************************************************
* Apply the permutation in a bitsliced manner, 8 times
******************************************************************************/
/* Apply P^8 to each of the four bitsliced tweakey words. */
void permute_tk_8(u32* tk) {
    for (int i = 0; i < 4; ++i) {
        u32 w = tk[i];
        tk[i] = (ROR(w, 24) & 0xcc000033)
              | (ROR(w, 8) & 0x33cc0000)
              | (ROR(w, 26) & 0x00333300)
              | ((w & 0x00333300) >> 6);
    }
}
/******************************************************************************
* Apply the permutation in a bitsliced manner, 10 times
******************************************************************************/
/* Apply P^10 to each of the four bitsliced tweakey words. */
void permute_tk_10(u32* tk) {
    for (int i = 0; i < 4; ++i) {
        u32 w = tk[i];
        tk[i] = (ROR(w, 8) & 0xcc330000)
              | (ROR(w, 26) & 0x33000033)
              | (ROR(w, 22) & 0x00cccc00)
              | ((w & 0x00330000) >> 14)
              | ((w & 0xcc00) >> 2);
    }
}
/******************************************************************************
* Apply the permutation in a bitsliced manner, 12 times
******************************************************************************/
/* Apply P^12 to each of the four bitsliced tweakey words. */
void permute_tk_12(u32* tk) {
    for (int i = 0; i < 4; ++i) {
        u32 w = tk[i];
        tk[i] = (ROR(w, 8) & 0xcc33)
              | (ROR(w, 30) & 0x00cc00cc)
              | (ROR(w, 10) & 0x33330000)
              | (ROR(w, 16) & 0xcc003300);
    }
}
/******************************************************************************
* Apply the permutation in a bitsliced manner, 14 times
******************************************************************************/
/* Apply P^14 to each of the four bitsliced tweakey words. */
void permute_tk_14(u32* tk) {
    for (int i = 0; i < 4; ++i) {
        u32 w = tk[i];
        tk[i] = (ROR(w, 24) & 0x0033cc00)
              | (ROR(w, 14) & 0x00cc0000)
              | (ROR(w, 30) & 0xcc000000)
              | (ROR(w, 16) & 0x000000ff)
              | (ROR(w, 18) & 0x33003300);
    }
}
/******************************************************************************
* Precompute all LFSRs on TK2
******************************************************************************/
/*
 * Pack TK2 once, then write one LFSR2-advanced copy per pair of rounds
 * (the schedule stores a fresh TK2 only every two rounds, hence r += 2).
 * Output layout: tk[0..3] = round 0/1 value, tk[4r+4 .. 4r+7] = next pairs.
 */
void precompute_lfsr_tk2(u32* tk, const u8* key, const int rounds) {
    u32 packed[4];
    packing(packed, key);
    memcpy(tk, packed, sizeof(packed));
    for (int r = 0; r < rounds; r += 2) {
        lfsr2_bs(packed);
        memcpy(tk + 4 * r + 4, packed, sizeof(packed));
    }
}
/******************************************************************************
* Precompute all LFSRs on TK3
******************************************************************************/
/*
 * Pack TK3 and XOR it on top of the TK2 words already stored by
 * precompute_lfsr_tk2, advancing LFSR3 once per pair of rounds so the
 * same slots receive the matching TK3 contribution.
 */
void precompute_lfsr_tk3(u32* tk, const u8* key, const int rounds) {
    u32 packed[4];
    packing(packed, key);
    for (int j = 0; j < 4; ++j)
        tk[j] ^= packed[j];
    for (int r = 0; r < rounds; r += 2) {
        lfsr3_bs(packed);
        for (int j = 0; j < 4; ++j)
            tk[4 * r + 4 + j] ^= packed[j];
    }
}
/******************************************************************************
* XOR TK with TK1 before applying the permutations.
* The key is then rearranged to match the barrel shiftrows representation.
******************************************************************************/
// Fold TK1 into the precomputed TK2^TK3 words and rearrange everything into
// the barrel-shiftrows layout expected by the fixsliced round function.
// Each loop iteration emits 8 rounds' worth of round tweakeys; `test`
// selects which power of the tweakey permutation P applies (P^2..P^8 on the
// first 8 rounds of every 16, P^10..P^14 on the second 8, since P^16 = id).
// The ROR/mask pairs below re-slice the words; their exact values are
// layout-critical and must not be reordered.
void permute_tk(u32* tk, const u8* key, const int rounds) {
u32 test;
u32 tk1[4], tmp[4];
packing(tk1, key);
// Rounds 0/1 use the unpermuted tweakey: XOR TK1 directly.
memcpy(tmp, tk, 16);
tmp[0] ^= tk1[0];
tmp[1] ^= tk1[1];
tmp[2] ^= tk1[2];
tmp[3] ^= tk1[3];
for(int i = 0 ; i < rounds; i += 8) {
test = (i % 16 < 8) ? 1 : 0; //to apply the right power of P
tk[i*4] = tmp[2] & 0xf0f0f0f0;
tk[i*4+1] = tmp[3] & 0xf0f0f0f0;
tk[i*4+2] = tmp[0] & 0xf0f0f0f0;
tk[i*4+3] = tmp[1] & 0xf0f0f0f0;
// Next pair of rounds: fetch the stored TK2^TK3 words, add TK1, permute.
memcpy(tmp, tk+i*4+4, 16);
XOR_BLOCKS(tmp, tk1);
if (test)
permute_tk_2(tmp); // applies P^2
else
permute_tk_10(tmp); // applies P^10
tk[i*4+4] = ROR(tmp[0],26) & 0xc3c3c3c3;
tk[i*4+5] = ROR(tmp[1],26) & 0xc3c3c3c3;
tk[i*4+6] = ROR(tmp[2],26) & 0xc3c3c3c3;
tk[i*4+7] = ROR(tmp[3],26) & 0xc3c3c3c3;
tk[i*4+8] = ROR(tmp[2],28) & 0x03030303;
tk[i*4+8] |= ROR(tmp[2],12) & 0x0c0c0c0c;
tk[i*4+9] = ROR(tmp[3],28) & 0x03030303;
tk[i*4+9] |= ROR(tmp[3],12) & 0x0c0c0c0c;
tk[i*4+10] = ROR(tmp[0],28) & 0x03030303;
tk[i*4+10] |= ROR(tmp[0],12) & 0x0c0c0c0c;
tk[i*4+11] = ROR(tmp[1],28) & 0x03030303;
tk[i*4+11] |= ROR(tmp[1],12) & 0x0c0c0c0c;
memcpy(tmp, tk+i*4+12, 16);
XOR_BLOCKS(tmp, tk1);
if (test)
permute_tk_4(tmp); // applies P^4
else
permute_tk_12(tmp); // applies P^12
for(int j = 0; j < 4; j++) {
tk[i*4+12+j] = ROR(tmp[j],14) & 0x30303030;
tk[i*4+12+j] |= ROR(tmp[j],6) & 0x0c0c0c0c;
}
tk[i*4+16] = ROR(tmp[2], 16) & 0xf0f0f0f0;
tk[i*4+17] = ROR(tmp[3], 16) & 0xf0f0f0f0;
tk[i*4+18] = ROR(tmp[0], 16) & 0xf0f0f0f0;
tk[i*4+19] = ROR(tmp[1], 16) & 0xf0f0f0f0;
memcpy(tmp, tk+i*4+20, 16);
XOR_BLOCKS(tmp, tk1);
if (test)
permute_tk_6(tmp); // applies P^6
else
permute_tk_14(tmp); // applies P^14
tk[i*4+20] = ROR(tmp[0], 10) & 0xc3c3c3c3;
tk[i*4+21] = ROR(tmp[1], 10) & 0xc3c3c3c3;
tk[i*4+22] = ROR(tmp[2], 10) & 0xc3c3c3c3;
tk[i*4+23] = ROR(tmp[3], 10) & 0xc3c3c3c3;
tk[i*4+24] = ROR(tmp[2],12) & 0x03030303;
tk[i*4+24] |= ROR(tmp[2],28) & 0x0c0c0c0c;
tk[i*4+25] = ROR(tmp[3],12) & 0x03030303;
tk[i*4+25] |= ROR(tmp[3],28) & 0x0c0c0c0c;
tk[i*4+26] = ROR(tmp[0],12) & 0x03030303;
tk[i*4+26] |= ROR(tmp[0],28) & 0x0c0c0c0c;
tk[i*4+27] = ROR(tmp[1],12) & 0x03030303;
tk[i*4+27] |= ROR(tmp[1],28) & 0x0c0c0c0c;
memcpy(tmp, tk+i*4+28, 16);
XOR_BLOCKS(tmp, tk1);
// In the P^10..P^14 half-cycle the last pair needs no permutation (P^16=id).
if (test)
permute_tk_8(tmp); // applies P^8
for(int j = 0; j < 4; j++) {
tk[i*4+28+j] = ROR(tmp[j],30) & 0x30303030;
tk[i*4+28+j] |= ROR(tmp[j],22) & 0x0c0c0c0c;
}
if (test && (i+8 < rounds)) { //only if next loop iteration
tk[i*4+32] = tmp[2] & 0xf0f0f0f0;
tk[i*4+33] = tmp[3] & 0xf0f0f0f0;
tk[i*4+34] = tmp[0] & 0xf0f0f0f0;
tk[i*4+35] = tmp[1] & 0xf0f0f0f0;
}
}
}
/******************************************************************************
* Precompute LFSR2(TK2) ^ LFSR3(TK3) ^ rconst.
******************************************************************************/
// Build the full per-round tweakey material for TK2/TK3: LFSR-advanced
// TK2 ^ TK3 values, rearranged into the fixsliced layout, with the round
// constants XORed in.
void precompute_rtk2_3(u32* rtk, const u8* tk2, const u8 * tk3) {
memset(rtk, 0x00, 16*SKINNY128_384_ROUNDS);
precompute_lfsr_tk2(rtk, tk2, SKINNY128_384_ROUNDS);
precompute_lfsr_tk3(rtk, tk3, SKINNY128_384_ROUNDS);
// The LFSR precomputations only fill every other 4-word group, so
// rtk[8..11] still hold the zeros from the memset above; passing
// (u8*)(rtk+8) as the TK1 argument therefore makes the TK1 XOR inside
// permute_tk a no-op ("NULL" here means an all-zero block).
permute_tk(rtk, (u8*)(rtk+8), SKINNY128_384_ROUNDS); // rtk+8 is NULL
// NOTE(review): rconst_32_bs is declared with 160 entries but this loop
// reads index i*4+j up to 4*SKINNY128_384_ROUNDS-1 (= 223 for 56 rounds);
// confirm table length vs round count.
for(int i = 0; i < SKINNY128_384_ROUNDS; i++) { // add rconsts
for(int j = 0; j < 4; j++)
rtk[i*4+j] ^= rconst_32_bs[i*4+j];
}
}
/******************************************************************************
* Precompute RTK1.
******************************************************************************/
// Rearrange TK1 alone into the fixsliced layout for the first 16 rounds;
// the round function re-cycles these words (TK1 has no LFSR, and P^16 = id,
// so 16 rounds of material suffice).
void precompute_rtk1(u32* rtk1, const u8* tk1) {
memset(rtk1, 0x00, 16*16);
permute_tk(rtk1, tk1, 16);
}
\ No newline at end of file
#ifndef TK_SCHEDULE_H_
#define TK_SCHEDULE_H_
typedef unsigned char u8;
typedef unsigned int u32;
// Pack/unpack between byte blocks and the 4-word bitsliced representation.
void packing(u32* out, const u8* in);
void unpacking(u8* out, u32 *in);
// Precompute round tweakey material (see tk_schedule.c).
void precompute_rtk2_3(u32* rtk, const u8* tk2, const u8* tk3);
void precompute_rtk1(u32* rtk1, const u8* tk1);
// 32-bit rotate right; y must be in 1..31 (y == 0 or 32 would shift by 32, UB).
#define ROR(x,y) (((x) >> (y)) | ((x) << (32 - (y))))
// XOR two 4-word blocks in place: x ^= y. GNU statement expression.
#define XOR_BLOCKS(x,y) ({ \
(x)[0] ^= (y)[0]; \
(x)[1] ^= (y)[1]; \
(x)[2] ^= (y)[2]; \
(x)[3] ^= (y)[3]; \
})
// Swap the bits selected by `mask` between b and (a >> n), the classic
// bitslicing primitive. Requires a variable `u32 tmp` in the caller's scope.
// NOTE(review): the macro parameters are expanded unparenthesized, so only
// pass simple expressions (all current call sites do).
#define SWAPMOVE(a, b, mask, n) ({ \
tmp = (b ^ (a >> n)) & mask; \
b ^= tmp; \
a ^= (tmp << n); \
})
// Little-endian load of 4 bytes at y into the u32 pointed to by x.
#define LE_LOAD(x, y) \
*(x) = (((u32)(y)[3] << 24) | \
((u32)(y)[2] << 16) | \
((u32)(y)[1] << 8) | \
(y)[0]);
// Little-endian store of u32 y to the 4 bytes at x.
// NOTE(review): multi-statement macro without do{...}while(0); safe at the
// current statement-level call sites, but hazardous after a bare `if`.
#define LE_STORE(x, y) \
(x)[0] = (y) & 0xff; \
(x)[1] = ((y) >> 8) & 0xff; \
(x)[2] = ((y) >> 16) & 0xff; \
(x)[3] = (y) >> 24;
#endif // TK_SCHEDULE_H_
\ No newline at end of file
// NIST LWC API parameters for Romulus-N1.
#define CRYPTO_KEYBYTES 16 // 128-bit key
#define CRYPTO_NSECBYTES 0 // no secret message number (nsec unused)
#define CRYPTO_NPUBBYTES 16 // 128-bit public nonce
#define CRYPTO_ABYTES 16 // 128-bit authentication tag
#define CRYPTO_NOOVERLAP 1
//API required by the NIST for the LWC competition
int crypto_aead_encrypt(unsigned char *c, unsigned long long *clen,
const unsigned char *m, unsigned long long mlen,
const unsigned char *ad, unsigned long long adlen,
const unsigned char *nsec, const unsigned char *npub,
const unsigned char *k);
//API required by the NIST for the LWC competition
int crypto_aead_decrypt(unsigned char *m, unsigned long long *outputmlen,
unsigned char *nsec,
const unsigned char *c, unsigned long long clen,
const unsigned char *ad, unsigned long long adlen,
const unsigned char *npub, const unsigned char *k);
#include "skinny128.h"
#include "romulus.h"
#include <string.h>
//Encryption and authentication using Romulus-N1
/**
 * Romulus-N1 authenticated encryption (NIST LWC API).
 * Writes mlen + TAGBYTES bytes to c (ciphertext followed by the 16-byte tag)
 * and sets *clen accordingly. nsec is unused. Always returns 0.
 * The round tweakeys for (npub, k) are scheduled once during AD processing
 * and reused for every message block; only TK1 (56-bit block counter plus
 * domain byte) is rescheduled per block.
 */
int crypto_aead_encrypt
(unsigned char *c, unsigned long long *clen,
const unsigned char *m, unsigned long long mlen,
const unsigned char *ad, unsigned long long adlen,
const unsigned char *nsec,
const unsigned char *npub,
const unsigned char *k) {
u32 tmp;
skinny_128_384_tks tks;
u8 state[BLOCKBYTES], pad[BLOCKBYTES];
(void)nsec;
// ----------------- Initialization -----------------
*clen = mlen + TAGBYTES;
memset(tks.tk1, 0x00, KEYBYTES);
memset(state, 0x00, BLOCKBYTES);
tks.tk1[0] = 0x01; // Init 56-bit LFSR counter
// ----------------- Initialization -----------------
// ----------------- Process the associated data -----------------
if (adlen == 0) { // Handle the special case of no AD
UPDATE_CTR(tks.tk1);
SET_DOMAIN(tks, 0x1A);
tkschedule_lfsr(tks.rtk, npub, k, SKINNY128_384_ROUNDS);
tkschedule_perm(tks.rtk);
tkschedule_perm_tk1(tks.rtk1, tks.tk1);
skinny128_384(state, tks.rtk, state, tks.rtk1);
} else { // Process double blocks but the last
SET_DOMAIN(tks, 0x08);
while (adlen > 2*BLOCKBYTES) {
UPDATE_CTR(tks.tk1);
XOR_BLOCK(state, state, ad);
// 2nd AD block of the pair is absorbed through the tweakey (TK2 slot).
tkschedule_lfsr(tks.rtk, ad + BLOCKBYTES, k, SKINNY128_384_ROUNDS);
tkschedule_perm(tks.rtk);
tkschedule_perm_tk1(tks.rtk1, tks.tk1);
skinny128_384(state, tks.rtk, state, tks.rtk1);
UPDATE_CTR(tks.tk1);
ad += 2*BLOCKBYTES;
adlen -= 2*BLOCKBYTES;
}
// Pad and process the left-over blocks
UPDATE_CTR(tks.tk1);
if (adlen == 2*BLOCKBYTES) { // Left-over complete double block
XOR_BLOCK(state, state, ad);
tkschedule_lfsr(tks.rtk, ad + BLOCKBYTES, k, SKINNY128_384_ROUNDS);
tkschedule_perm(tks.rtk);
tkschedule_perm_tk1(tks.rtk1, tks.tk1);
skinny128_384(state, tks.rtk, state, tks.rtk1);
UPDATE_CTR(tks.tk1);
SET_DOMAIN(tks, 0x18);
} else if (adlen > BLOCKBYTES) { // Left-over partial double block
adlen -= BLOCKBYTES;
XOR_BLOCK(state, state, ad);
// 10*-style padding: zero fill, last byte records the leftover length.
memcpy(pad, ad + BLOCKBYTES, adlen);
memset(pad + adlen, 0x00, 15 - adlen);
pad[15] = adlen;
tkschedule_lfsr(tks.rtk, pad, k, SKINNY128_384_ROUNDS);
tkschedule_perm(tks.rtk);
tkschedule_perm_tk1(tks.rtk1, tks.tk1);
skinny128_384(state, tks.rtk, state, tks.rtk1);
UPDATE_CTR(tks.tk1);
SET_DOMAIN(tks, 0x1A);
} else if (adlen == BLOCKBYTES) { // Left-over complete single block
XOR_BLOCK(state, state, ad);
SET_DOMAIN(tks, 0x18);
} else { // Left-over partial single block
for(int i = 0; i < (int)adlen; i++)
state[i] ^= ad[i];
state[15] ^= adlen; // Padding
SET_DOMAIN(tks, 0x1A);
}
// Final AD call absorbs the nonce through the tweakey schedule.
tkschedule_lfsr(tks.rtk, npub, k, SKINNY128_384_ROUNDS);
tkschedule_perm(tks.rtk);
tkschedule_perm_tk1(tks.rtk1, tks.tk1);
skinny128_384(state, tks.rtk, state, tks.rtk1);
}
// ----------------- Process the associated data -----------------
// ----------------- Process the plaintext -----------------
// Reset only the counter/domain half of TK1 (bytes 0..7 are used).
memset(tks.tk1, 0x00, KEYBYTES/2);
tks.tk1[0] = 0x01; // Init the 56-bit LFSR counter
if (mlen == 0) {
UPDATE_CTR(tks.tk1);
SET_DOMAIN(tks, 0x15);
tkschedule_perm_tk1(tks.rtk1, tks.tk1);
skinny128_384(state, tks.rtk, state, tks.rtk1);
} else { // Process all blocks except the last
SET_DOMAIN(tks, 0x04);
while (mlen > BLOCKBYTES) {
RHO(state,c,m);
UPDATE_CTR(tks.tk1);
tkschedule_perm_tk1(tks.rtk1, tks.tk1);
skinny128_384(state, tks.rtk, state, tks.rtk1);
c += BLOCKBYTES;
m += BLOCKBYTES;
mlen -= BLOCKBYTES;
}
// Pad and process the last block
UPDATE_CTR(tks.tk1);
if (mlen < BLOCKBYTES) { // Last message single block is partial
// Byte-wise rho on the leftover bytes only (G applied per byte).
for(int i = 0; i < (int)mlen; i++) {
tmp = m[i]; // Use of tmp variable in case c = m
c[i] = m[i] ^ (state[i] >> 1) ^ (state[i] & 0x80) ^ (state[i] << 7);
state[i] ^= (u8)tmp;
}
state[15] ^= (u8)mlen; // Padding
SET_DOMAIN(tks, 0x15);
} else { // Last message single block is full
RHO(state,c,m);
SET_DOMAIN(tks, 0x14);
}
tkschedule_perm_tk1(tks.rtk1, tks.tk1);
skinny128_384(state, tks.rtk, state, tks.rtk1);
c += mlen;
}
// ----------------- Process the plaintext -----------------
// ----------------- Generate the tag -----------------
G(c,state);
// ----------------- Generate the tag -----------------
return 0;
}
//Decryption and tag verification using Romulus-N1
/**
 * Romulus-N1 authenticated decryption (NIST LWC API).
 * Recovers clen - TAGBYTES bytes of plaintext into m and sets *mlen.
 * Returns -1 if clen < TAGBYTES.
 * NOTE(review): on tag mismatch this returns the nonzero byte-OR difference
 * (1..255) rather than -1; callers must test the result against 0 — confirm
 * against the expected NIST API contract. The plaintext has already been
 * written to m at that point.
 */
int crypto_aead_decrypt
(unsigned char *m, unsigned long long *mlen,
unsigned char *nsec,
const unsigned char *c, unsigned long long clen,
const unsigned char *ad, unsigned long long adlen,
const unsigned char *npub,
const unsigned char *k) {
u32 tmp;
skinny_128_384_tks tks;
u8 state[BLOCKBYTES], pad[BLOCKBYTES];
(void)nsec;
if (clen < TAGBYTES)
return -1;
// ----------------- Initialization -----------------
*mlen = clen - TAGBYTES;
memset(tks.tk1, 0x00, KEYBYTES);
memset(state, 0x00, BLOCKBYTES);
tks.tk1[0] = 0x01; // Init 56-bit LFSR counter
// ----------------- Initialization -----------------
// ----------------- Process the associated data -----------------
// (identical to the AD phase of crypto_aead_encrypt)
if (adlen == 0) { // Handle the special case of no AD
UPDATE_CTR(tks.tk1);
SET_DOMAIN(tks, 0x1A);
tkschedule_lfsr(tks.rtk, npub, k, SKINNY128_384_ROUNDS);
tkschedule_perm(tks.rtk);
tkschedule_perm_tk1(tks.rtk1, tks.tk1);
skinny128_384(state, tks.rtk, state, tks.rtk1);
} else { // Process double blocks except the last
SET_DOMAIN(tks, 0x08);
while (adlen > 2*BLOCKBYTES) {
UPDATE_CTR(tks.tk1);
XOR_BLOCK(state, state, ad);
tkschedule_lfsr(tks.rtk, ad + BLOCKBYTES, k, SKINNY128_384_ROUNDS);
tkschedule_perm(tks.rtk);
tkschedule_perm_tk1(tks.rtk1, tks.tk1);
skinny128_384(state, tks.rtk, state, tks.rtk1);
UPDATE_CTR(tks.tk1);
ad += 2*BLOCKBYTES;
adlen -= 2*BLOCKBYTES;
}
// Pad and process the left-over blocks
UPDATE_CTR(tks.tk1);
if (adlen == 2*BLOCKBYTES) { // Left-over complete double block
XOR_BLOCK(state, state, ad);
tkschedule_lfsr(tks.rtk, ad + BLOCKBYTES, k, SKINNY128_384_ROUNDS);
tkschedule_perm(tks.rtk);
tkschedule_perm_tk1(tks.rtk1, tks.tk1);
skinny128_384(state, tks.rtk, state, tks.rtk1);
UPDATE_CTR(tks.tk1);
SET_DOMAIN(tks, 0x18);
} else if (adlen > BLOCKBYTES) { // Left-over partial double block
adlen -= BLOCKBYTES;
XOR_BLOCK(state, state, ad);
memcpy(pad, ad + BLOCKBYTES, adlen);
memset(pad + adlen, 0x00, 15 - adlen);
pad[15] = adlen;
tkschedule_lfsr(tks.rtk, pad, k, SKINNY128_384_ROUNDS);
tkschedule_perm(tks.rtk);
tkschedule_perm_tk1(tks.rtk1, tks.tk1);
skinny128_384(state, tks.rtk, state, tks.rtk1);
UPDATE_CTR(tks.tk1);
SET_DOMAIN(tks, 0x1A);
} else if (adlen == BLOCKBYTES) { // Left-over complete single block
XOR_BLOCK(state, state, ad);
SET_DOMAIN(tks, 0x18);
} else { // Left-over partial single block
for(int i = 0; i < (int)adlen; i++)
state[i] ^= ad[i];
state[15] ^= adlen; // Padding
SET_DOMAIN(tks, 0x1A);
}
tkschedule_lfsr(tks.rtk, npub, k, SKINNY128_384_ROUNDS);
tkschedule_perm(tks.rtk);
tkschedule_perm_tk1(tks.rtk1, tks.tk1);
skinny128_384(state, tks.rtk, state, tks.rtk1);
}
// ----------------- Process the associated data -----------------
// ----------------- Process the ciphertext -----------------
clen -= TAGBYTES;
// Reset only the counter/domain half of TK1 (bytes 0..7 are used).
memset(tks.tk1, 0x00, KEYBYTES/2);
tks.tk1[0] = 0x01; // Init the 56-bit LFSR counter
if (clen == 0) {
UPDATE_CTR(tks.tk1);
SET_DOMAIN(tks, 0x15);
tkschedule_perm_tk1(tks.rtk1, tks.tk1);
skinny128_384(state, tks.rtk, state, tks.rtk1);
} else { // Process all blocks except the last
SET_DOMAIN(tks, 0x04);
while (clen > BLOCKBYTES) {
RHO_INV(state,c,m);
UPDATE_CTR(tks.tk1);
tkschedule_perm_tk1(tks.rtk1, tks.tk1);
skinny128_384(state, tks.rtk, state, tks.rtk1);
c += BLOCKBYTES;
m += BLOCKBYTES;
clen -= BLOCKBYTES;
}
// Pad and process the last block
UPDATE_CTR(tks.tk1);
if (clen < BLOCKBYTES) { // Last message single block is partial
// Byte-wise inverse rho on the leftover bytes only.
for(int i = 0; i < (int)clen; i++) {
m[i] = c[i] ^ (state[i] >> 1) ^ (state[i] & 0x80) ^ (state[i] << 7);
state[i] ^= m[i];
}
state[15] ^= (u8)clen; // Padding
SET_DOMAIN(tks, 0x15);
} else { // Last message single block is full
RHO_INV(state,c,m);
SET_DOMAIN(tks, 0x14);
}
tkschedule_perm_tk1(tks.rtk1, tks.tk1);
skinny128_384(state, tks.rtk, state, tks.rtk1);
}
// ----------------- Process the ciphertext -----------------
// ----------------- Generate and check the tag -----------------
G(state,state);
tmp = 0;
// c was not advanced past the final partial block, so c[clen..] is the tag.
for(int i = 0; i < TAGBYTES; i++)
tmp |= state[i] ^ c[clen+i]; // Constant-time tag comparison
// ----------------- Generate and check the tag -----------------
return tmp;
}
\ No newline at end of file
#ifndef ROMULUSN1_H_
#define ROMULUSN1_H_
#include "skinny128.h"
typedef unsigned char u8;
typedef unsigned int u32;
// Tweakey context: byte-wise TK1 (counter + domain byte) plus the
// precomputed round tweakeys consumed by the SKINNY-128-384 routines.
typedef struct {
u8 tk1[16]; //to manipulate tk1 in a byte-wise manner
u32 rtk1[32]; //to avoid recomputation of the tk schedule
u32 rtk[4*SKINNY128_384_ROUNDS];//all round tweakeys
} skinny_128_384_tks;
#define TAGBYTES 16
#define KEYBYTES 16
#define BLOCKBYTES 16
// Domain-separation byte lives in TK1 byte 7, just past the 56-bit counter.
#define SET_DOMAIN(tks, domain) ((tks).tk1[7] = (domain))
//G as defined in the Romulus specification in a 32-bit word-wise manner
// Processes 4 bytes per word; the masks keep the operation byte-local, so
// it is endianness-neutral. Requires a variable `u32 tmp` in caller scope.
// NOTE(review): the u8* -> u32* casts here and below assume 4-byte-aligned
// buffers — true for the local state/pad arrays used by the callers.
#define G(x,y) ({ \
tmp = ((u32*)(y))[0]; \
((u32*)(x))[0] = (tmp >> 1 & 0x7f7f7f7f) ^ ((tmp ^ (tmp << 7)) & 0x80808080); \
tmp = ((u32*)(y))[1]; \
((u32*)(x))[1] = (tmp >> 1 & 0x7f7f7f7f) ^ ((tmp ^ (tmp << 7)) & 0x80808080); \
tmp = ((u32*)(y))[2]; \
((u32*)(x))[2] = (tmp >> 1 & 0x7f7f7f7f) ^ ((tmp ^ (tmp << 7)) & 0x80808080); \
tmp = ((u32*)(y))[3]; \
((u32*)(x))[3] = (tmp >> 1 & 0x7f7f7f7f) ^ ((tmp ^ (tmp << 7)) & 0x80808080); \
})
//update the counter in tk1 in a 32-bit word-wise manner
// Shifts the 56-bit LFSR in bytes 0..6 left by one with feedback 0x95,
// preserving the domain byte (tk1[7]) via the tmp & 0xff000000 term.
// Requires `u32 tmp` in caller scope; assumes little-endian word access —
// NOTE(review): confirm if this code is ever built for a big-endian target.
#define UPDATE_CTR(tk1) ({ \
tmp = ((u32*)(tk1))[1]; \
((u32*)(tk1))[1] = (tmp << 1) & 0x00ffffff; \
((u32*)(tk1))[1] |= (((u32*)(tk1))[0] >> 31); \
((u32*)(tk1))[1] |= tmp & 0xff000000; \
((u32*)(tk1))[0] <<= 1; \
if ((tmp >> 23) & 0x01) \
((u32*)(tk1))[0] ^= 0x95; \
})
//x <- y ^ z for 128-bit blocks
#define XOR_BLOCK(x,y,z) ({ \
((u32*)(x))[0] = ((u32*)(y))[0] ^ ((u32*)(z))[0]; \
((u32*)(x))[1] = ((u32*)(y))[1] ^ ((u32*)(z))[1]; \
((u32*)(x))[2] = ((u32*)(y))[2] ^ ((u32*)(z))[2]; \
((u32*)(x))[3] = ((u32*)(y))[3] ^ ((u32*)(z))[3]; \
})
//Rho as defined in the Romulus specification
//use pad as a tmp variable in case y = z
// Requires `u8 pad[BLOCKBYTES]` (and tmp, via G) in the caller's scope.
#define RHO(x,y,z) ({ \
G(pad,x); \
XOR_BLOCK(y, pad, z); \
XOR_BLOCK(x, x, z); \
})
//Rho inverse as defined in the Romulus specification
//use pad as a tmp variable in case y = z
#define RHO_INV(x, y, z) ({ \
G(pad, x); \
XOR_BLOCK(z, pad, y); \
XOR_BLOCK(x, x, z); \
})
#endif // ROMULUSN1_H_
\ No newline at end of file
#ifndef SKINNY128_H_
#define SKINNY128_H_
typedef unsigned char u8;
typedef unsigned int u32;
#define SKINNY128_128_ROUNDS 40
#define SKINNY128_256_ROUNDS 48
#define SKINNY128_384_ROUNDS 56
// SKINNY-128-384 block encryption plus tweakey-schedule helpers; declared
// extern as they are presumably provided by the accompanying ARM assembly
// file — TODO confirm which translation unit defines them in this build.
extern void skinny128_384(u8* ctext, const u32* tk, const u8* ptext, const u32* rtk1);
extern void tkschedule_lfsr(u32* rtk, const u8* tk2, const u8* tk3, const int rounds);
extern void tkschedule_perm(u32* rtk);
extern void tkschedule_perm_tk1(u32* rtk1, const u8* tk1);
#endif // SKINNY128_H_
\ No newline at end of file
/*******************************************************************************
* ARM assembly implementation of fixsliced SKINNY-128-384.
*
* For more details, see the paper at: https://
*
* @author Alexandre Adomnicai, Nanyang Technological University,
* alexandre.adomnicai@ntu.edu.sg
*
* @date May 2020
*******************************************************************************/
.syntax unified
.thumb
/*******************************************************************************
* applies P^2 on the tweakey state in a bitsliced manner
*******************************************************************************/
.align 2
// p2: apply P^2 to the 4 bitsliced tweakey words held in r6-r9.
// Same masks/shifts as permute_tk_2 in tk_schedule.c.
// Clobbers r1, r10 (masks) and r11, r12 (scratch).
p2:
movw r1, #0xcc00
movt r1, #0xcc00 //r1 <- 0xcc00cc00
movw r10, #0xcc00
movt r10, #0x0033 //r10<- 0x0033cc00
// --- permute r6 2 times
and r11, r1, r6, ror #14
bfi r11, r6, #16, #8
and r12, r6, #0xcc000000
orr r11, r11, r12, lsr #2
and r12, r10, r6
orr r11, r11, r12, lsr #8
and r12, r6, #0x00cc0000
orr r6, r11, r12, lsr #18
// --- permute r7 2 times
and r11, r1, r7, ror #14
bfi r11, r7, #16, #8
and r12, r7, #0xcc000000
orr r11, r11, r12, lsr #2
and r12, r10, r7
orr r11, r11, r12, lsr #8
and r12, r7, #0x00cc0000
orr r7, r11, r12, lsr #18
// --- permute r8 2 times
and r11, r1, r8, ror #14
bfi r11, r8, #16, #8
and r12, r8, #0xcc000000
orr r11, r11, r12, lsr #2
and r12, r10, r8
orr r11, r11, r12, lsr #8
and r12, r8, #0x00cc0000
orr r8, r11, r12, lsr #18
// --- permute r9 2 times
and r11, r1, r9, ror #14
bfi r11, r9, #16, #8
and r12, r9, #0xcc000000
orr r11, r11, r12, lsr #2
and r12, r10, r9
orr r11, r11, r12, lsr #8
and r12, r9, #0x00cc0000
orr r9, r11, r12, lsr #18
bx lr
/*******************************************************************************
* applies P^4 on the tweakey state in a bitsliced manner
*******************************************************************************/
.align 2
// p4: apply P^4 to the 4 bitsliced tweakey words held in r6-r9.
// Same masks/shifts as permute_tk_4 in tk_schedule.c.
// Clobbers r1, r10, r11, r12; spills/restores r14.
// NOTE(review): stores r14 at [sp] without adjusting sp — assumes the
// caller reserved that slot; confirm at the call sites.
p4:
str.w r14, [sp] //store r14 on the stack
movw r14, #0x00cc
movt r14, #0xcc00 //r14<- 0xcc0000cc
movw r12, #0xcc00
movt r12, #0x3300 //r12<- 0x3300cc00
movw r11, #0x00cc
movt r11, #0x00cc //r11<- 0x00cc00cc
// --- permute r6 4 times
and r10, r14, r6, ror #22
and r1, r12, r6, ror #16
orr r10, r10, r1
and r1, r6, r11
orr r10, r10, r1, lsr #2
movw r1, #0xcc33 //r1 <- 0x0000cc33
and r6, r6, r1
orr r6, r10, r6, ror #24
// --- permute r7 4 times
and r10, r14, r7, ror #22
and r1, r12, r7, ror #16
orr r10, r10, r1
and r1, r7, r11
orr r10, r10, r1, lsr #2
movw r1, #0xcc33 //r1 <- 0x0000cc33
and r7, r7, r1
orr r7, r10, r7, ror #24
// --- permute r8 4 times
and r10, r14, r8, ror #22
and r1, r12, r8, ror #16
orr r10, r10, r1
and r1, r8, r11
orr r10, r10, r1, lsr #2
movw r1, #0xcc33 //r1 <- 0x0000cc33
and r8, r8, r1
orr r8, r10, r8, ror #24
// --- permute r9 4 times (r12 reused as scratch after r14 is restored)
and r10, r14, r9, ror #22
ldr.w r14, [sp] //restore r14
and r12, r12, r9, ror #16
orr r10, r10, r12
and r12, r9, r11
orr r10, r10, r12, lsr #2
movw r12, #0xcc33 //r12<- 0x0000cc33
and r9, r9, r12
orr r9, r10, r9, ror #24
bx lr
/*******************************************************************************
* applies P^6 on the tweakey state in a bitsliced manner
*******************************************************************************/
.align 2
// p6: apply P^6 to the 4 bitsliced tweakey words held in r6-r9.
// Same masks/shifts as permute_tk_6 in tk_schedule.c.
// Clobbers r1, r12 (masks) and r10, r11 (scratch).
p6:
movw r1, #0x3333 //r1 <- 0x00003333
movw r12, #0x00cc
movt r12, #0x3300 //r12<- 0x330000cc
and r10, r6, r1, ror #8 // --- permute r6 6 times
and r11, r12, r6, ror #24
orr r11, r11, r10, ror #6
and r10, r1, r6, ror #10
orr r11, r11, r10
and r10, r6, #0x000000cc
orr r11, r11, r10, lsl #14
and r10, r6, #0x00003300
orr r6, r11, r10, lsl #2 // permute r6 6 times ---
and r10, r7, r1, ror #8 // --- permute r7 6 times
and r11, r12, r7, ror #24
orr r11, r11, r10, ror #6
and r10, r1, r7, ror #10
orr r11, r11, r10
and r10, r7, #0x000000cc
orr r11, r11, r10, lsl #14
and r10, r7, #0x00003300
orr r7, r11, r10, lsl #2 // permute r7 6 times ---
and r10, r8, r1, ror #8 // --- permute r8 6 times
and r11, r12, r8, ror #24
orr r11, r11, r10, ror #6
and r10, r1, r8, ror #10
orr r11, r11, r10
and r10, r8, #0x000000cc
orr r11, r11, r10, lsl #14
and r10, r8, #0x00003300
orr r8, r11, r10, lsl #2 // permute r8 6 times ---
and r10, r9, r1, ror #8 // --- permute r9 6 times
and r11, r12, r9, ror #24
orr r11, r11, r10, ror #6
and r10, r1, r9, ror #10
orr r11, r11, r10
and r10, r9, #0x000000cc
orr r11, r11, r10, lsl #14
and r10, r9, #0x00003300
orr r9, r11, r10, lsl #2 // permute r9 6 times ---
bx lr
/*******************************************************************************
* applies P^8 on the tweakey state in a bitsliced manner
*******************************************************************************/
.align 2
// p8: apply P^8 to the 4 bitsliced tweakey words held in r6-r9.
// Same masks/shifts as permute_tk_8 in tk_schedule.c.
// Clobbers r1, r12 (masks) and r10, r11 (scratch).
p8:
movw r12, #0x3333 //r12<- 0x00003333
movw r1, #0x0000
movt r1, #0x33cc //r1 <- 0x33cc0000
and r10, r6, r1 // --- permute r6 8 times
and r11, r1, r6, ror #8
orr r11, r11, r10, ror #24
and r10, r6, r12, lsl #2
orr r11, r11, r10, ror #26
and r10, r6, r12, lsl #8
orr r6, r11, r10, lsr #6 // permute r6 8 times ---
and r10, r7, r1 // --- permute r7 8 times
and r11, r1, r7, ror #8
orr r11, r11, r10, ror #24
and r10, r7, r12, lsl #2
orr r11, r11, r10, ror #26
and r10, r7, r12, lsl #8
orr r7, r11, r10, lsr #6 // permute r7 8 times ---
and r10, r8, r1 // --- permute r8 8 times
and r11, r1, r8, ror #8
orr r11, r11, r10, ror #24
and r10, r8, r12, lsl #2
orr r11, r11, r10, ror #26
and r10, r8, r12, lsl #8
orr r8, r11, r10, lsr #6 // permute r8 8 times ---
and r10, r9, r1 // --- permute r9 8 times
and r11, r1, r9, ror #8
orr r11, r11, r10, ror #24
and r10, r9, r12, lsl #2
orr r11, r11, r10, ror #26
and r10, r9, r12, lsl #8
orr r9, r11, r10, lsr #6 // permute r9 8 times ---
bx lr
/*******************************************************************************
* applies P^10 on the tweakey state in a bitsliced manner
*******************************************************************************/
.align 2
// p10: apply P^10 to the 4 bitsliced tweakey words held in r6-r9.
// Same masks/shifts as permute_tk_10 in tk_schedule.c.
// Clobbers r1, r12 (masks) and r10, r11 (scratch).
p10:
movw r12, #0x0033
movt r12, #0x3300 //r12<- 0x33000033
movw r1, #0xcc33 //r1 <- 0x0000cc33
and r10, r6, r1, ror #8 // --- permute r6 10 times
and r11, r12, r6, ror #26
orr r11, r11, r10, ror #8
and r10, r6, r12, ror #24
orr r11, r11, r10, ror #22
and r10, r6, #0x00330000
orr r11, r11, r10, lsr #14
and r10, r6, #0x0000cc00
orr r6, r11, r10, lsr #2 // permute r6 10 times ---
and r10, r7, r1, ror #8 // --- permute r7 10 times
and r11, r12, r7, ror #26
orr r11, r11, r10, ror #8
and r10, r7, r12, ror #24
orr r11, r11, r10, ror #22
and r10, r7, #0x00330000
orr r11, r11, r10, lsr #14
and r10, r7, #0x0000cc00
orr r7, r11, r10, lsr #2 // permute r7 10 times ---
and r10, r8, r1, ror #8 // --- permute r8 10 times
and r11, r12, r8, ror #26
orr r11, r11, r10, ror #8
and r10, r8, r12, ror #24
orr r11, r11, r10, ror #22
and r10, r8, #0x00330000
orr r11, r11, r10, lsr #14
and r10, r8, #0x0000cc00
orr r8, r11, r10, lsr #2 // permute r8 10 times ---
and r10, r9, r1, ror #8 // --- permute r9 10 times
and r11, r12, r9, ror #26
orr r11, r11, r10, ror #8
and r10, r9, r12, ror #24
orr r11, r11, r10, ror #22
and r10, r9, #0x00330000
orr r11, r11, r10, lsr #14
and r10, r9, #0x0000cc00
orr r9, r11, r10, lsr #2 // permute r9 10 times ---
bx lr
/*******************************************************************************
* applies P^12 on the tweakey state in a bitsliced manner
* in/out: bitsliced tweakey state in r6-r9 (one 32-bit slice per register)
* clobbers: r1, r10, r11, r12; r14 is used as an extra mask register and is
* saved/restored via [sp] (the caller reserves this slot with `sub.w sp, #4`)
*******************************************************************************/
.align 2
p12:
str.w r14, [sp] //store r14 on the stack (slot reserved by caller)
movw r14, #0xcc33 //r14<- 0x0000cc33 (bit-selection mask for P^12)
movw r12, #0x00cc
movt r12, #0x00cc //r12<- 0x00cc00cc (bit-selection mask for P^12)
movw r1, #0x3300
movt r1, #0xcc00 //r1 <- 0xcc003300 (bit-selection mask for P^12)
and r10, r14, r6, ror #8 // --- permute r6 12 times
and r11, r12, r6, ror #30
orr r11, r11, r10
and r10, r1, r6, ror #16
orr r11, r11, r10
movw r10, #0xcccc //r10<- 0x0000cccc
and r10, r6, r10, ror #8
orr r6, r11, r10, ror #10 // permute r6 12 times ---
and r10, r14, r7, ror #8 // --- permute r7 12 times
and r11, r12, r7, ror #30
orr r11, r11, r10
and r10, r1, r7, ror #16
orr r11, r11, r10
movw r10, #0xcccc //r10<- 0x0000cccc
and r10, r7, r10, ror #8
orr r7, r11, r10, ror #10 // permute r7 12 times ---
and r10, r14, r8, ror #8 // --- permute r8 12 times
and r11, r12, r8, ror #30
orr r11, r11, r10
and r10, r1, r8, ror #16
orr r11, r11, r10
movw r10, #0xcccc //r10<- 0x0000cccc
and r10, r8, r10, ror #8
orr r8, r11, r10, ror #10 // permute r8 12 times ---
and r10, r14, r9, ror #8 // --- permute r9 12 times
and r11, r12, r9, ror #30
orr r11, r11, r10
and r10, r1, r9, ror #16
ldr.w r14, [sp] //restore r14 (no longer needed as a mask)
orr r11, r11, r10
movw r10, #0xcccc //r10<- 0x0000cccc
and r10, r9, r10, ror #8
orr r9, r11, r10, ror #10 // permute r9 12 times ---
bx lr
/*******************************************************************************
* applies P^14 on the tweakey state in a bitsliced manner
* in/out: bitsliced tweakey state in r6-r9 (one 32-bit slice per register)
* clobbers: r1, r10, r11, r12 (mask and scratch registers)
*******************************************************************************/
.align 2
p14:
movw r1, #0xcc00
movt r1, #0x0033 //r1 <- 0x0033cc00 (bit-selection mask for P^14)
movw r12, #0xcc00
movt r12, #0xcc00 //r12<- 0xcc00cc00 (bit-selection mask for P^14)
and r10, r1, r6, ror #24 // --- permute r6 14 times
and r11, r6, #0x00000033
orr r11, r10, r11, ror #14
and r10, r6, #0x33000000
orr r11, r11, r10, ror #30
and r10, r6, #0x00ff0000
orr r11, r11, r10, ror #16
and r10, r6, r12
orr r6, r11, r10, ror #18 // permute r6 14 times ---
and r10, r1, r7, ror #24 // --- permute r7 14 times
and r11, r7, #0x00000033
orr r11, r10, r11, ror #14
and r10, r7, #0x33000000
orr r11, r11, r10, ror #30
and r10, r7, #0x00ff0000
orr r11, r11, r10, ror #16
and r10, r7, r12
orr r7, r11, r10, ror #18 // permute r7 14 times ---
and r10, r1, r8, ror #24 // --- permute r8 14 times
and r11, r8, #0x00000033
orr r11, r10, r11, ror #14
and r10, r8, #0x33000000
orr r11, r11, r10, ror #30
and r10, r8, #0x00ff0000
orr r11, r11, r10, ror #16
and r10, r8, r12
orr r8, r11, r10, ror #18 // permute r8 14 times ---
and r10, r1, r9, ror #24 // --- permute r9 14 times
and r11, r9, #0x00000033
orr r11, r10, r11, ror #14
and r10, r9, #0x33000000
orr r11, r11, r10, ror #30
and r10, r9, #0x00ff0000
orr r11, r11, r10, ror #16
and r10, r9, r12
orr r9, r11, r10, ror #18 // permute r9 14 times ---
bx lr
/*******************************************************************************
* packs the 128-bit state held in r2-r5 into the bitsliced representation,
* via a network of SWAPMOVE operations (SWAPMOVE(a, b, mask, n): swaps the
* bits of b selected by `mask` with the bits of a selected by `mask << n`).
* expects: r10 = 0x0a0a0a0a and r11 = 0x30303030 (set up by the caller)
* clobbers: r12 (scratch)
*******************************************************************************/
.align 2
packing:
eor r12, r2, r2, lsr #3
and r12, r12, r10
eor r2, r2, r12
eor r2, r2, r12, lsl #3 //SWAPMOVE(r2, r2, 0x0a0a0a0a, 3)
eor r12, r3, r3, lsr #3
and r12, r12, r10
eor r3, r3, r12
eor r3, r3, r12, lsl #3 //SWAPMOVE(r3, r3, 0x0a0a0a0a, 3)
eor r12, r4, r4, lsr #3
and r12, r12, r10
eor r4, r4, r12
eor r4, r4, r12, lsl #3 //SWAPMOVE(r4, r4, 0x0a0a0a0a, 3)
eor r12, r5, r5, lsr #3
and r12, r12, r10
eor r5, r5, r12
eor r5, r5, r12, lsl #3 //SWAPMOVE(r5, r5, 0x0a0a0a0a, 3)
eor r12, r2, r4, lsr #2
and r12, r12, r11
eor r2, r2, r12
eor r4, r4, r12, lsl #2 //SWAPMOVE(r4, r2, 0x30303030, 2)
eor r12, r2, r3, lsr #4
and r12, r12, r11, lsr #2 //mask 0x0c0c0c0c derived from r11 on the fly
eor r2, r2, r12
eor r3, r3, r12, lsl #4 //SWAPMOVE(r3, r2, 0x0c0c0c0c, 4)
eor r12, r2, r5, lsr #6
and r12, r12, r11, lsr #4 //mask 0x03030303 derived from r11 on the fly
eor r2, r2, r12
eor r5, r5, r12, lsl #6 //SWAPMOVE(r5, r2, 0x03030303, 6)
eor r12, r4, r3, lsr #2
and r12, r12, r11, lsr #2
eor r4, r4, r12
eor r3, r3, r12, lsl #2 //SWAPMOVE(r3, r4, 0x0c0c0c0c, 2)
eor r12, r4, r5, lsr #4
and r12, r12, r11, lsr #4
eor r4, r4, r12
eor r5, r5, r12, lsl #4 //SWAPMOVE(r5, r4, 0x03030303, 4)
eor r12, r3, r5, lsr #2
and r12, r12, r11, lsr #4
eor r3, r3, r12
eor r5, r5, r12, lsl #2 //SWAPMOVE(r5, r3, 0x03030303, 2)
bx lr
/******************************************************************************
* Compute LFSR2(TK2) ^ LFSR3(TK3) for all rounds.
* Performing both at the same time allows to save some memory accesses.
* Arguments (AAPCS): r0 = tk (output round-tweakey array), r1 = tk2,
* r2 = tk3, r3 = rounds. The rounds argument is recovered from the stacked
* r3 at [sp, #12] after the initial push. TK2 is kept bitsliced in r6-r9 and
* TK3 in r2-r5; the loop unrolls 8 rounds per iteration, applying the word-
* wise LFSR updates on the 0xaaaaaaaa-masked bit pairs and storing the
* XORed halves with 24-byte strides (16-byte gaps are left between pairs —
* presumably filled in later by the permutation schedule; confirm against
* tkschedule_perm).
******************************************************************************/
@ void tkschedule_lfsr(u32* tk, const u8* tk2, const u8* tk3, const int rounds)
.global tkschedule_lfsr
.type tkschedule_lfsr,%function
.align 2
tkschedule_lfsr:
push {r0-r12, r14}
ldr.w r3, [r1, #8] //load tk2 (3rd word)
ldr.w r4, [r1, #4] //load tk2 (2nd word)
ldr.w r5, [r1, #12] //load tk2 (4th word)
ldr.w r12, [r1] //load tk2 (1st word)
mov r1, r2 //move tk3 address in r1
mov r2, r12 //move 1st tk2 word in r2
movw r10, #0x0a0a
movt r10, #0x0a0a //r10 <- 0x0a0a0a0a (mask expected by packing)
movw r11, #0x3030
movt r11, #0x3030 //r11<- 0x30303030 (mask expected by packing)
bl packing //pack tk2
mov r6, r2 //move tk2 from r2-r5 to r6-r9
mov r7, r3 //move tk2 from r2-r5 to r6-r9
mov r8, r4 //move tk2 from r2-r5 to r6-r9
mov r9, r5 //move tk2 from r2-r5 to r6-r9
ldr.w r3, [r1, #8] //load tk3 (3rd word)
ldr.w r4, [r1, #4] //load tk3 (2nd word)
ldr.w r5, [r1, #12] //load tk3 (4th word)
ldr.w r2, [r1] //load tk3 (1st word)
bl packing //pack tk3
eor r10, r10, r10, lsl #4 //r10<- 0xaaaaaaaa (LFSR bit-pair mask)
ldr.w r1, [sp, #12] //load loop counter (stacked r3 = rounds) in r1
eor r11, r2, r6 //tk2 ^ tk3 (1st word)
eor r12, r3, r7 //tk2 ^ tk3 (2nd word)
strd r11, r12, [r0], #8 //store in tk
eor r11, r4, r8 //tk2 ^ tk3 (3rd word)
eor r12, r5, r9 //tk2 ^ tk3 (4th word)
strd r11, r12, [r0], #8 //store in tk
loop:
and r12, r8, r10 // --- apply LFSR2 to tk2
eor r12, r12, r6
and r14, r10, r12, lsl #1
and r12, r12, r10
orr r6, r14, r12, lsr #1 // apply LFSR2 to tk2 ---
and r12, r3, r10 // --- apply LFSR3 to tk3
eor r12, r5, r12, lsr #1
and r14, r10, r12, lsl #1
and r12, r12, r10
orr r5, r14, r12, lsr #1 // apply LFSR3 to tk3 ---
eor r11, r5, r7 //tk2 ^ tk3 (1st word)
eor r12, r2, r8 //tk2 ^ tk3 (2nd word)
strd r11, r12, [r0], #8 //store in tk
eor r11, r3, r9 //tk2 ^ tk3 (3rd word)
eor r12, r4, r6 //tk2 ^ tk3 (4th word)
strd r11, r12, [r0], #24 //store in tk (skip 16 bytes)
and r12, r9, r10 // --- apply LFSR2 to tk2
eor r12, r12, r7
and r14, r10, r12, lsl #1
and r12, r12, r10
orr r7, r14, r12, lsr #1 // apply LFSR2 to tk2 ---
and r12, r2, r10 // --- apply LFSR3 to tk3
eor r12, r4, r12, lsr #1
and r14, r10, r12, lsl #1
and r12, r12, r10
orr r4, r14, r12, lsr #1 // apply LFSR3 to tk3 ---
eor r11, r4, r8 //tk2 ^ tk3 (1st word)
eor r12, r5, r9 //tk2 ^ tk3 (2nd word)
strd r11, r12, [r0], #8 //store in tk
eor r11, r2, r6 //tk2 ^ tk3 (3rd word)
eor r12, r3, r7 //tk2 ^ tk3 (4th word)
strd r11, r12, [r0], #24 //store in tk (skip 16 bytes)
and r12, r6, r10 // --- apply LFSR2 to tk2
eor r12, r12, r8
and r14, r10, r12, lsl #1
and r12, r12, r10
orr r8, r14, r12, lsr #1 // apply LFSR2 to tk2 ---
and r12, r5, r10 // --- apply LFSR3 to tk3
eor r12, r3, r12, lsr #1
and r14, r10, r12, lsl #1
and r12, r12, r10
orr r3, r14, r12, lsr #1 // apply LFSR3 to tk3 ---
eor r11, r3, r9 //tk2 ^ tk3 (1st word)
eor r12, r4, r6 //tk2 ^ tk3 (2nd word)
strd r11, r12, [r0], #8 //store in tk
eor r11, r5, r7 //tk2 ^ tk3 (3rd word)
eor r12, r2, r8 //tk2 ^ tk3 (4th word)
strd r11, r12, [r0], #24 //store in tk (skip 16 bytes)
and r12, r7, r10 // --- apply LFSR2 to tk2
eor r12, r12, r9
and r14, r10, r12, lsl #1
and r12, r12, r10
orr r9, r14, r12, lsr #1 // apply LFSR2 to tk2 ---
and r12, r4, r10 // --- apply LFSR3 to tk3
eor r12, r2, r12, lsr #1
and r14, r10, r12, lsl #1
and r12, r12, r10
orr r2, r14, r12, lsr #1 // apply LFSR3 to tk3 ---
eor r11, r2, r6 //tk2 ^ tk3 (1st word)
eor r12, r3, r7 //tk2 ^ tk3 (2nd word)
strd r11, r12, [r0], #8 //store in tk
eor r11, r4, r8 //tk2 ^ tk3 (3rd word)
eor r12, r5, r9 //tk2 ^ tk3 (4th word)
strd r11, r12, [r0], #24 //store in tk (skip 16 bytes)
subs.w r1, r1, #8 //decrease loop counter by 8 (8 rounds per iteration)
bne loop
pop {r0-r12, r14}
bx lr
/******************************************************************************
* Applies the permutation P and add the round constants to all round tweakeys.
******************************************************************************/
@ void tkschedule_perm(u32* tk)
.global tkschedule_perm
.type tkschedule_perm,%function
.align 2
tkschedule_perm:
push {r0-r12, lr}
sub.w sp, #4 //to store r14 in subroutines
ldm r0, {r6-r9} //load tk
movw r10, #0xf0f0
movt r10, #0xf0f0 //r10<- 0xf0f0f0f0
and r6, r6, r10 //tk &= 0xf0f0f0f0 (1st word)
and r7, r7, r10 //tk &= 0xf0f0f0f0 (2nd word)
and r8, r8, r10 //tk &= 0xf0f0f0f0 (3rd word)
and r9, r9, r10 //tk &= 0xf0f0f0f0 (4th word)
eor r8, r8, #0x00000004 //add rconst
eor r9, r9, #0x00000040 //add rconst
mvn r9, r9 //to remove a NOT in sbox calculations
strd r8, r9, [r0], #8 //store 1st half tk for 1st round
strd r6, r7, [r0], #8 //store 2nd half tk for 1st round
ldm r0, {r6-r9} //load tk
bl p2 //apply the permutation twice
movw r10, #0xc3c3
movt r10, #0xc3c3 //r10<- 0xc3c3c3c3
and r11, r10, r6, ror #26 //ror and mask to match fixslicing
and r12, r10, r7, ror #26 //ror and mask to match fixslicing
strd r11, r12, [r0], #8 //store 1st half tk for 2nd round
and r11, r10, r8, ror #26 //ror and mask to match fixslicing
and r12, r10, r9, ror #26 //ror and mask to match fixslicing
eor r11, r11, #0x10000000 //add rconst
eor r11, r11, #0x00000100 //add rconst
eor r12, r12, #0x00000100 //add rconst
mvn r12, r12 //to save a NOT in sbox calculations
strd r11, r12, [r0], #8 //store 2nd half tk for 2nd round
and r10, r10, r10, lsr #6 //r10<- 0x03030303
and r11, r10, r6, ror #28 //--- ror and masks to match fixslicing
and r6, r6, r10, lsl #6
orr r6, r11, r6, ror #12
and r11, r10, r7, ror #28
and r7, r7, r10, lsl #6
orr r7, r11, r7, ror #12
and r11, r10, r8, ror #28
and r8, r8, r10, lsl #6
orr r8, r11, r8, ror #12
and r11, r10, r9, ror #28
and r9, r9, r10, lsl #6
orr r9, r11, r9, ror #12 //ror and masks to match fixslicing ---
eor r7, r7, #0x04000000 //add rconst
eor r8, r8, #0x44000000 //add rconst
eor r9, r9, #0x04000000 //add rconst
mvn r9, r9 //to save a NOT in sbox calculations
strd r8, r9, [r0], #8 //store 1st half tk for 3rd round
strd r6, r7, [r0], #8 //store 2nd half tk for 3rd round
ldm r0, {r6-r9} //load tk
bl p4 //apply the permutation 4 times
movw r10, #0xf0f0
movt r10, #0xf0f0 //r10<- 0xf0f0f0f0
and r11, r10, r6, ror #16 //ror and mask to match fixslicing
and r12, r10, r7, ror #16 //ror and mask to match fixslicing
eor r11, r11, #0x00400000 //add rconst
eor r12, r12, #0x00400000 //add rconst
strd r11, r12, [r0, #24] //store 2nd half tk for 5th round
and r11, r10, r8, ror #16 //ror and mask to match fixslicing
and r12, r10, r9, ror #16 //ror and mask to match fixslicing
eor r11, r11, #0x00440000 //add rconst
eor r12, r12, #0x00500000 //add rconst
mvn r12, r12 //to save a NOT in sbox calculations
strd r11, r12, [r0, #16] //store 1st half tk for 5th round
and r10, r10, r10, lsr #2 //r10<- 0x30303030
and r11, r10, r6, ror #14 //--- ror and masks to match fixslicing
and r6, r6, r10, ror #4
orr r6, r11, r6, ror #6
and r11, r10, r7, ror #14
and r7, r7, r10, ror #4
orr r7, r11, r7, ror #6
and r11, r10, r8, ror #14
and r8, r8, r10, ror #4
orr r8, r11, r8, ror #6
and r11, r10, r9, ror #14
and r9, r9, r10, ror #4
orr r9, r11, r9, ror #6 //ror and masks to match fixslicing ---
eor r6, r6, #0x00100000 //add rconst
eor r7, r7, #0x00100000 //add rconst
eor r8, r8, #0x00100000 //add rconst
eor r8, r8, #0x00000001 //add rconst
eor r9, r9, #0x00100000 //add rconst
mvn r9, r9 //to save a NOT in sbox calculations
strd r6, r7, [r0], #8 //store 1st half tk for 4th round
strd r8, r9, [r0], #24 //store 2nd half tk for 4th round
ldm r0, {r6-r9} //load tk
bl p6 //apply the permutation 6 times
movw r10, #0xc3c3
movt r10, #0xc3c3 //r10<- 0xc3c3c3c3
and r11, r10, r6, ror #10 //ror and mask to match fixslicing
and r12, r10, r7, ror #10 //ror and mask to match fixslicing
eor r11, r11, #0x01000000 //add rconst
eor r12, r12, #0x01000000 //add rconst
strd r11, r12, [r0], #8 //store 1st half tk for 6th round
and r11, r10, r8, ror #10 //ror and mask to match fixslicing
and r12, r10, r9, ror #10 //ror and mask to match fixslicing
eor r11, r11, #0x01400000 //add rconst
eor r11, r11, #0x00001000 //add rconst
eor r12, r12, #0x00400000 //add rconst
mvn r12, r12 //to save a NOT in sbox calculations
strd r11, r12, [r0], #8 //store 2nd half tk for 6th round
and r10, r10, r10, lsr #6 //r10<- 0x03030303
and r11, r10, r6, ror #12 //--- ror and masks to match fixslicing
and r6, r6, r10, lsl #6
orr r6, r11, r6, ror #28
and r11, r10, r7, ror #12
and r7, r7, r10, lsl #6
orr r7, r11, r7, ror #28
and r11, r10, r8, ror #12
and r8, r8, r10, lsl #6
orr r8, r11, r8, ror #28
and r11, r10, r9, ror #12
and r9, r9, r10, lsl #6
orr r9, r11, r9, ror #28 //ror and masks to match fixslicing ---
eor r6, r6, #0x00000400 //add rconst
eor r7, r7, #0x00000400 //add rconst
eor r8, r8, #0x01000000 //add rconst
eor r8, r8, #0x00004000 //add rconst
eor r9, r9, #0x01000000 //add rconst
eor r9, r9, #0x00000400 //add rconst
mvn r9, r9 //to save a NOT in sbox calculations
strd r8, r9, [r0], #8 //store 1st half tk for 7th round
strd r6, r7, [r0], #8 //store 2nd half tk for 7th round
ldm r0, {r6-r9} //load tk
bl p8 //apply the permutation 8 times
movw r10, #0xf0f0
movt r10, #0xf0f0 //r10<- 0xf0f0f0f0
and r11, r10, r6 //ror and mask to match fixslicing
and r12, r10, r7 //ror and mask to match fixslicing
eor r12, r12, #0x00000040 //add rconst
strd r11, r12, [r0, #24] //store 2nd half tk for 9th round
and r11, r10, r8 //ror and mask to match fixslicing
and r12, r10, r9 //ror and mask to match fixslicing
eor r11, r11, #0x00000054 //add rconst
eor r12, r12, #0x00000050 //add rconst
mvn r12, r12 //to save a NOT in sbox calculations
strd r11, r12, [r0, #16] //store 1st half tk for 9th round
and r10, r10, r10, lsr #2 //r10<- 0x30303030
and r11, r10, r6, ror #30 //--- ror and masks to match fixslicing
and r6, r6, r10, ror #4
orr r6, r11, r6, ror #22
and r11, r10, r7, ror #30
and r7, r7, r10, ror #4
orr r7, r11, r7, ror #22
and r11, r10, r8, ror #30
and r8, r8, r10, ror #4
orr r8, r11, r8, ror #22
and r11, r10, r9, ror #30
and r9, r9, r10, ror #4
orr r9, r11, r9, ror #22 //ror and masks to match fixslicing ---
eor r6 ,r6, #0x00000010
eor r8, r8, #0x00010000
eor r8, r8, #0x00000410
eor r9, r9, #0x00000410
mvn r9, r9 //to save a NOT in sbox calculations
strd r6, r7, [r0], #8 //store 1st half tk for 8th round
strd r8, r9, [r0], #24 //store 2nd half tk for 8th round
ldm r0, {r6-r9} //load tk
bl p10 //apply the permutation 10 times
movw r10, #0xc3c3
movt r10, #0xc3c3 //r10<- 0xc3c3c3c3
and r11, r10, r6, ror #26 //ror and mask to match fixslicing
and r12, r10, r7, ror #26 //ror and mask to match fixslicing
eor r11, r11, #0x00000100 //add rconst
eor r12, r12, #0x00000100 //add rconst
strd r11, r12, [r0], #8 //store 1st half tk for 10th round
and r11, r10, r8, ror #26 //ror and mask to match fixslicing
and r12, r10, r9, ror #26 //ror and mask to match fixslicing
eor r11, r11, #0x10000000 //add rconst
eor r11, r11, #0x00000140 //add rconst
eor r12, r12, #0x00000100 //add rconst
mvn r12, r12 //to save a NOT in sbox calculations
strd r11, r12, [r0], #8 //store 2nd half tk for 10th round
and r10, r10, r10, lsr #6 //r10<- 0x03030303
and r11, r10, r6, ror #28 //--- ror and masks to match fixslicing
and r6, r6, r10, lsl #6
orr r6, r11, r6, ror #12
and r11, r10, r7, ror #28
and r7, r7, r10, lsl #6
orr r7, r11, r7, ror #12
and r11, r10, r8, ror #28
and r8, r8, r10, lsl #6
orr r8, r11, r8, ror #12
and r11, r10, r9, ror #28
and r9, r9, r10, lsl #6
orr r9, r11, r9, ror #12 //ror and masks to match fixslicing ---
eor r6, r6, #0x04000000 //add rconst
eor r7, r7, #0x04000000 //add rconst
eor r8, r8, #0x44000000 //add rconst
eor r9, r9, #0x00000100 //add rconst
mvn r9, r9 //to save a NOT in sbox calculations
strd r8, r9, [r0], #8 //store 1st half tk for 11th round
strd r6, r7, [r0], #8 //store 2nd half tk for 11th round
ldm r0, {r6-r9} //load tk
bl p12 //apply the permutation 4 times
movw r10, #0xf0f0
movt r10, #0xf0f0 //r10<- 0xf0f0f0f0
and r11, r10, r6, ror #16 //ror and mask to match fixslicing
and r12, r10, r7, ror #16 //ror and mask to match fixslicing
eor r11, r11, #0x00400000 //add rconst
strd r11, r12, [r0, #24] //store 2nd half tk for 13th round
and r11, r10, r8, ror #16 //ror and mask to match fixslicing
and r12, r10, r9, ror #16 //ror and mask to match fixslicing
eor r11, r11, #0x00140000 //add rconst
eor r12, r12, #0x00500000 //add rconst
mvn r12, r12 //to save a NOT in sbox calculations
strd r11, r12, [r0, #16] //store 1st half tk for 13th round
and r10, r10, r10, lsr #2 //r10<- 0x30303030
and r11, r10, r6, ror #14 //--- ror and masks to match fixslicing
and r6, r6, r10, ror #4
orr r6, r11, r6, ror #6
and r11, r10, r7, ror #14
and r7, r7, r10, ror #4
orr r7, r11, r7, ror #6
and r11, r10, r8, ror #14
and r8, r8, r10, ror #4
orr r8, r11, r8, ror #6
and r11, r10, r9, ror #14
and r9, r9, r10, ror #4
orr r9, r11, r9, ror #6 //ror and masks to match fixslicing ---
eor r6, r6, #0x00100000 //add rconst
eor r7, r7, #0x00100000 //add rconst
eor r8, r8, #0x04000000 //add rconst
eor r8, r8, #0x00000001 //add rconst
eor r9, r9, #0x04000000 //add rconst
mvn r9, r9 //to save a NOT in sbox calculations
strd r6, r7, [r0], #8 //store 1st half tk for 12th round
strd r8, r9, [r0], #24 //store 2nd half tk for 12th round
ldm r0, {r6-r9} //load tk
bl p14 //apply the permutation 6 times
movw r10, #0xc3c3
movt r10, #0xc3c3 //r10<- 0xc3c3c3c3
and r11, r10, r6, ror #10 //ror and mask to match fixslicing
and r12, r10, r7, ror #10 //ror and mask to match fixslicing
strd r11, r12, [r0], #8 //store 1st half tk for 14th round
and r11, r10, r8, ror #10 //ror and mask to match fixslicing
and r12, r10, r9, ror #10 //ror and mask to match fixslicing
eor r11, r11, #0x01400000 //add rconst
eor r11, r11, #0x00001000 //add rconst
eor r12, r12, #0x01400000 //add rconst
mvn r12, r12 //to save a NOT in sbox calculations
strd r11, r12, [r0], #8 //store 2nd half tk for 14th round
and r10, r10, r10, lsr #6 //r10<- 0x03030303
and r11, r10, r6, ror #12 //--- ror and masks to match fixslicing
and r6, r6, r10, lsl #6
orr r6, r11, r6, ror #28
and r11, r10, r7, ror #12
and r7, r7, r10, lsl #6
orr r7, r11, r7, ror #28
and r11, r10, r8, ror #12
and r8, r8, r10, lsl #6
orr r8, r11, r8, ror #28
and r11, r10, r9, ror #12
and r9, r9, r10, lsl #6
orr r9, r11, r9, ror #28 //ror and masks to match fixslicing ---
eor r7, r7, #0x00000400 //add rconst
eor r8, r8, #0x01000000 //add rconst
eor r8, r8, #0x00004400 //add rconst
eor r9, r9, #0x00000400 //add const
mvn r9, r9 //to save a NOT in sbox calculations
strd r8, r9, [r0], #8 //store 1st half tk for 15th round
strd r6, r7, [r0], #8 //store 2nd half tk for 15th round
ldm r0, {r6-r9} //load tk
movw r10, #0xf0f0
movt r10, #0xf0f0 //r10<- 0xf0f0f0f0
and r11, r10, r6 //ror and mask to match fixslicing
and r12, r10, r7 //ror and mask to match fixslicing
eor r11, r11, #0x00000040 //add rconst
eor r12, r12, #0x00000040 //add rconst
strd r11, r12, [r0, #24] //store 2nd half tk for 17th round
and r11, r10, r8 //ror and mask to match fixslicing
and r12, r10, r9 //ror and mask to match fixslicing
eor r11, r11, #0x00000004 //add rconst
eor r12, r12, #0x00000050 //add rconst
mvn r12, r12 //to save a NOT in sbox calculations
strd r11, r12, [r0, #16] //store 1st half tk for 17th round
and r10, r10, r10, lsr #2 //r10<- 0x30303030
and r11, r10, r6, ror #30 //--- ror and masks to match fixslicing
and r6, r6, r10, ror #4
orr r6, r11, r6, ror #22
and r11, r10, r7, ror #30
and r7, r7, r10, ror #4
orr r7, r11, r7, ror #22
and r11, r10, r8, ror #30
and r8, r8, r10, ror #4
orr r8, r11, r8, ror #22
and r11, r10, r9, ror #30
and r9, r9, r10, ror #4
orr r9, r11, r9, ror #22 //ror and masks to match fixslicing ---
eor r6 ,r6, #0x00000010
eor r7 ,r7, #0x00000010
eor r8, r8, #0x00000010
eor r8, r8, #0x00010000
mvn r9, r9 //to save a NOT in sbox calculations
strd r6, r7, [r0], #8 //store 1st half tk for 16th round
strd r8, r9, [r0], #24 //store 2nd half tk for 16th round
ldm r0, {r6-r9} //load tk
bl p2 //apply the permutation twice
movw r10, #0xc3c3
movt r10, #0xc3c3 //r10<- 0xc3c3c3c3
and r11, r10, r6, ror #26 //ror and mask to match fixslicing
and r12, r10, r7, ror #26 //ror and mask to match fixslicing
eor r11, r11, #0x00000100 //add rconst
strd r11, r12, [r0], #8 //store 1st half tk for 18th round
and r11, r10, r8, ror #26 //ror and mask to match fixslicing
and r12, r10, r9, ror #26 //ror and mask to match fixslicing
eor r11, r11, #0x10000000 //add rconst
eor r11, r11, #0x00000140 //add rconst
eor r12, r12, #0x00000040 //add rconst
mvn r12, r12 //to save a NOT in sbox calculations
strd r11, r12, [r0], #8 //store 2nd half tk for 18th round
and r10, r10, r10, lsr #6 //r10<- 0x03030303
and r11, r10, r6, ror #28 //--- ror and masks to match fixslicing
and r6, r6, r10, lsl #6
orr r6, r11, r6, ror #12
and r11, r10, r7, ror #28
and r7, r7, r10, lsl #6
orr r7, r11, r7, ror #12
and r11, r10, r8, ror #28
and r8, r8, r10, lsl #6
orr r8, r11, r8, ror #12
and r11, r10, r9, ror #28
and r9, r9, r10, lsl #6
orr r9, r11, r9, ror #12 //ror and masks to match fixslicing ---
eor r7, r7, #0x04000000 //add rconst
eor r8, r8, #0x40000000 //add rconst
eor r8, r8, #0x00000100 //add rconst
eor r9, r9, #0x04000000 //add rconst
eor r9, r9, #0x00000100 //add rconst
mvn r9, r9 //to save a NOT in sbox calculations
strd r8, r9, [r0], #8 //store 1st half tk for 19th round
strd r6, r7, [r0], #8 //store 2nd half tk for 19th round
ldm r0, {r6-r9} //load tk
bl p4 //apply the permutation 4 times
movw r10, #0xf0f0
movt r10, #0xf0f0 //r10<- 0xf0f0f0f0
and r11, r10, r6, ror #16 //ror and mask to match fixslicing
and r12, r10, r7, ror #16 //ror and mask to match fixslicing
eor r12, r12, #0x00400000 //add rconst
strd r11, r12, [r0, #24] //store 2nd half tk for 21th round
and r11, r10, r8, ror #16 //ror and mask to match fixslicing
and r12, r10, r9, ror #16 //ror and mask to match fixslicing
eor r11, r11, #0x00440000 //add rconst
eor r12, r12, #0x00100000 //add rconst
mvn r12, r12 //to save a NOT in sbox calculations
strd r11, r12, [r0, #16] //store 1st half tk for 21th round
and r10, r10, r10, lsr #2 //r10<- 0x30303030
and r11, r10, r6, ror #14 //--- ror and masks to match fixslicing
and r6, r6, r10, ror #4
orr r6, r11, r6, ror #6
and r11, r10, r7, ror #14
and r7, r7, r10, ror #4
orr r7, r11, r7, ror #6
and r11, r10, r8, ror #14
and r8, r8, r10, ror #4
orr r8, r11, r8, ror #6
and r11, r10, r9, ror #14
and r9, r9, r10, ror #4
orr r9, r11, r9, ror #6 //ror and masks to match fixslicing ---
eor r6, r6, #0x00100000 //add rconst
eor r8, r8, #0x04100000 //add rconst
eor r8, r8, #0x00000001 //add rconst
eor r9, r9, #0x00100000 //add rconst
mvn r9, r9 //to save a NOT in sbox calculations
strd r6, r7, [r0], #8 //store 1st half tk for 20th round
strd r8, r9, [r0], #24 //store 2nd half tk for 20th round
ldm r0, {r6-r9} //load tk
bl p6 //apply the permutation 6 times
movw r10, #0xc3c3
movt r10, #0xc3c3 //r10<- 0xc3c3c3c3
and r11, r10, r6, ror #10 //ror and mask to match fixslicing
and r12, r10, r7, ror #10 //ror and mask to match fixslicing
eor r11, r11, #0x01000000 //add rconst
eor r12, r12, #0x01000000 //add rconst
strd r11, r12, [r0], #8 //store 1st half tk for 22th round
and r11, r10, r8, ror #10 //ror and mask to match fixslicing
and r12, r10, r9, ror #10 //ror and mask to match fixslicing
eor r11, r11, #0x00400000 //add rconst
eor r11, r11, #0x00001000 //add rconst
mvn r12, r12 //to save a NOT in sbox calculations
strd r11, r12, [r0], #8 //store 2nd half tk for 22th round
and r10, r10, r10, lsr #6 //r10<- 0x03030303
and r11, r10, r6, ror #12 //--- ror and masks to match fixslicing
and r6, r6, r10, lsl #6
orr r6, r11, r6, ror #28
and r11, r10, r7, ror #12
and r7, r7, r10, lsl #6
orr r7, r11, r7, ror #28
and r11, r10, r8, ror #12
and r8, r8, r10, lsl #6
orr r8, r11, r8, ror #28
and r11, r10, r9, ror #12
and r9, r9, r10, lsl #6
orr r9, r11, r9, ror #28 //ror and masks to match fixslicing ---
eor r6, r6, #0x00000400 //add rconst
eor r8, r8, #0x00004000 //add rconst
eor r9, r9, #0x01000000 //add rconst
mvn r9, r9 //to save a NOT in sbox calculations
strd r8, r9, [r0], #8 //store 1st half tk for 23th round
strd r6, r7, [r0], #8 //store 2nd half tk for 23th round
ldm r0, {r6-r9} //load tk
bl p8 //apply the permutation 8 times
movw r10, #0xf0f0
movt r10, #0xf0f0 //r10<- 0xf0f0f0f0
and r11, r10, r6 //ror and mask to match fixslicing
and r12, r10, r7 //ror and mask to match fixslicing
strd r11, r12, [r0, #24] //store 2nd half tk for 25th round
and r11, r10, r8 //ror and mask to match fixslicing
and r12, r10, r9 //ror and mask to match fixslicing
eor r11, r11, #0x00000014 //add rconst
eor r12, r12, #0x00000040 //add rconst
mvn r12, r12 //to save a NOT in sbox calculations
strd r11, r12, [r0, #16] //store 1st half tk for 25th round
and r10, r10, r10, lsr #2 //r10<- 0x30303030
and r11, r10, r6, ror #30 //--- ror and masks to match fixslicing
and r6, r6, r10, ror #4
orr r6, r11, r6, ror #22
and r11, r10, r7, ror #30
and r7, r7, r10, ror #4
orr r7, r11, r7, ror #22
and r11, r10, r8, ror #30
and r8, r8, r10, ror #4
orr r8, r11, r8, ror #22
and r11, r10, r9, ror #30
and r9, r9, r10, ror #4
orr r9, r11, r9, ror #22 //ror and masks to match fixslicing ---
eor r8, r8, #0x00010400
eor r9, r9, #0x00000400
mvn r9, r9 //to save a NOT in sbox calculations
strd r6, r7, [r0], #8 //store 1st half tk for 24th round
strd r8, r9, [r0], #24 //store 2nd half tk for 24th round
ldm r0, {r6-r9} //load tk
bl p10 //apply the permutation 10 times
movw r10, #0xc3c3
movt r10, #0xc3c3 //r10<- 0xc3c3c3c3
and r11, r10, r6, ror #26 //ror and mask to match fixslicing
and r12, r10, r7, ror #26 //ror and mask to match fixslicing
strd r11, r12, [r0], #8 //store 1st half tk for 26th round
and r11, r10, r8, ror #26 //ror and mask to match fixslicing
and r12, r10, r9, ror #26 //ror and mask to match fixslicing
eor r11, r11, #0x10000000 //add rconst
eor r11, r11, #0x00000100 //add rconst
mvn r12, r12 //to save a NOT in sbox calculations
strd r11, r12, [r0], #8 //store 2nd half tk for 26th round
and r10, r10, r10, lsr #6 //r10<- 0x03030303
and r11, r10, r6, ror #28 //--- ror and masks to match fixslicing
and r6, r6, r10, lsl #6
orr r6, r11, r6, ror #12
and r11, r10, r7, ror #28
and r7, r7, r10, lsl #6
orr r7, r11, r7, ror #12
and r11, r10, r8, ror #28
and r8, r8, r10, lsl #6
orr r8, r11, r8, ror #12
and r11, r10, r9, ror #28
and r9, r9, r10, lsl #6
orr r9, r11, r9, ror #12 //ror and masks to match fixslicing ---
eor r7, r7, #0x04000000 //add rconst
eor r8, r8, #0x40000000 //add rconst
eor r9, r9, #0x04000000 //add rconst
mvn r9, r9 //to save a NOT in sbox calculations
strd r8, r9, [r0], #8 //store 1st half tk for 27th round
strd r6, r7, [r0], #8 //store 2nd half tk for 27th round
ldm r0, {r6-r9} //load tk
bl p12 //apply the permutation 4 times
movw r10, #0xf0f0
movt r10, #0xf0f0 //r10<- 0xf0f0f0f0
and r11, r10, r6, ror #16 //ror and mask to match fixslicing
and r12, r10, r7, ror #16 //ror and mask to match fixslicing
eor r12, r12, #0x00400000 //add rconst
strd r11, r12, [r0, #24] //store 2nd half tk for 29th round
and r11, r10, r8, ror #16 //ror and mask to match fixslicing
and r12, r10, r9, ror #16 //ror and mask to match fixslicing
eor r11, r11, #0x00440000 //add rconst
eor r12, r12, #0x00500000 //add rconst
mvn r12, r12 //to save a NOT in sbox calculations
strd r11, r12, [r0, #16] //store 1st half tk for 29th round
and r10, r10, r10, lsr #2 //r10<- 0x30303030
and r11, r10, r6, ror #14 //--- ror and masks to match fixslicing
and r6, r6, r10, ror #4
orr r6, r11, r6, ror #6
and r11, r10, r7, ror #14
and r7, r7, r10, ror #4
orr r7, r11, r7, ror #6
and r11, r10, r8, ror #14
and r8, r8, r10, ror #4
orr r8, r11, r8, ror #6
and r11, r10, r9, ror #14
and r9, r9, r10, ror #4
orr r9, r11, r9, ror #6 //ror and masks to match fixslicing ---
eor r6, r6, #0x00100000 //add rconst
eor r8, r8, #0x00100000 //add rconst
eor r8, r8, #0x00000001 //add rconst
eor r9, r9, #0x00100000 //add rconst
mvn r9, r9 //to save a NOT in sbox calculations
strd r6, r7, [r0], #8 //store 1st half tk for 28th round
strd r8, r9, [r0], #24 //store 2nd half tk for 28th round
ldm r0, {r6-r9} //load tk
bl p14 //apply the permutation 6 times
movw r10, #0xc3c3
movt r10, #0xc3c3 //r10<- 0xc3c3c3c3
and r11, r10, r6, ror #10 //ror and mask to match fixslicing
and r12, r10, r7, ror #10 //ror and mask to match fixslicing
eor r11, r11, #0x01000000 //add rconst
eor r12, r12, #0x01000000 //add rconst
strd r11, r12, [r0], #8 //store 1st half tk for 30th round
and r11, r10, r8, ror #10 //ror and mask to match fixslicing
and r12, r10, r9, ror #10 //ror and mask to match fixslicing
eor r11, r11, #0x01400000 //add rconst
eor r11, r11, #0x00001000 //add rconst
mvn r12, r12 //to save a NOT in sbox calculations
strd r11, r12, [r0], #8 //store 2nd half tk for 30th round
and r10, r10, r10, lsr #6 //r10<- 0x03030303
and r11, r10, r6, ror #12 //--- ror and masks to match fixslicing
and r6, r6, r10, lsl #6
orr r6, r11, r6, ror #28
and r11, r10, r7, ror #12
and r7, r7, r10, lsl #6
orr r7, r11, r7, ror #28
and r11, r10, r8, ror #12
and r8, r8, r10, lsl #6
orr r8, r11, r8, ror #28
and r11, r10, r9, ror #12
and r9, r9, r10, lsl #6
orr r9, r11, r9, ror #28 //ror and masks to match fixslicing ---
eor r6, r6, #0x00000400 //add rconst
eor r7, r7, #0x00000400 //add rconst
eor r8, r8, #0x00004000 //add rconst
eor r9, r9, #0x01000000 //add rconst
mvn r9, r9 //to save a NOT in sbox calculations
strd r8, r9, [r0], #8 //store 1st half tk for 31th round
strd r6, r7, [r0], #8 //store 2nd half tk for 31th round
ldm r0, {r6-r9} //load tk
movw r10, #0xf0f0
movt r10, #0xf0f0 //r10<- 0xf0f0f0f0
and r11, r10, r6 //ror and mask to match fixslicing
and r12, r10, r7 //ror and mask to match fixslicing
strd r11, r12, [r0, #24] //store 2nd half tk for 33th round
and r11, r10, r8 //ror and mask to match fixslicing
and r12, r10, r9 //ror and mask to match fixslicing
eor r11, r11, #0x00000014 //add rconst
eor r12, r12, #0x00000050 //add rconst
mvn r12, r12 //to save a NOT in sbox calculations
strd r11, r12, [r0, #16] //store 1st half tk for 33th round
and r10, r10, r10, lsr #2 //r10<- 0x30303030
and r11, r10, r6, ror #30 //--- ror and masks to match fixslicing
and r6, r6, r10, ror #4
orr r6, r11, r6, ror #22
and r11, r10, r7, ror #30
and r7, r7, r10, ror #4
orr r7, r11, r7, ror #22
and r11, r10, r8, ror #30
and r8, r8, r10, ror #4
orr r8, r11, r8, ror #22
and r11, r10, r9, ror #30
and r9, r9, r10, ror #4
orr r9, r11, r9, ror #22 //ror and masks to match fixslicing ---
eor r6 ,r6, #0x00000010
eor r8, r8, #0x00010400
eor r9, r9, #0x00000400
mvn r9, r9 //to save a NOT in sbox calculations
strd r6, r7, [r0], #8 //store 1st half tk for 32th round
strd r8, r9, [r0], #24 //store 2nd half tk for 32th round
ldm r0, {r6-r9} //load tk
bl p2 //apply the permutation twice
movw r10, #0xc3c3
movt r10, #0xc3c3 //r10<- 0xc3c3c3c3
and r11, r10, r6, ror #26 //ror and mask to match fixslicing
and r12, r10, r7, ror #26 //ror and mask to match fixslicing
strd r11, r12, [r0], #8 //store 1st half tk for 34th round
and r11, r10, r8, ror #26 //ror and mask to match fixslicing
and r12, r10, r9, ror #26 //ror and mask to match fixslicing
eor r11, r11, #0x10000000 //add rconst
eor r11, r11, #0x00000140 //add rconst
eor r12, r12, #0x00000100 //add rconst
mvn r12, r12 //to save a NOT in sbox calculations
strd r11, r12, [r0], #8 //store 2nd half tk for 34th round
and r10, r10, r10, lsr #6 //r10<- 0x03030303
and r11, r10, r6, ror #28 //--- ror and masks to match fixslicing
and r6, r6, r10, lsl #6
orr r6, r11, r6, ror #12
and r11, r10, r7, ror #28
and r7, r7, r10, lsl #6
orr r7, r11, r7, ror #12
and r11, r10, r8, ror #28
and r8, r8, r10, lsl #6
orr r8, r11, r8, ror #12
and r11, r10, r9, ror #28
and r9, r9, r10, lsl #6
orr r9, r11, r9, ror #12 //ror and masks to match fixslicing ---
eor r7, r7, #0x04000000 //add rconst
eor r8, r8, #0x44000000 //add rconst
mvn r9, r9 //to save a NOT in sbox calculations
strd r8, r9, [r0], #8 //store 1st half tk for 35th round
strd r6, r7, [r0], #8 //store 2nd half tk for 35th round
ldm r0, {r6-r9} //load tk
bl p4 //apply the permutation 4 times
movw r10, #0xf0f0
movt r10, #0xf0f0 //r10<- 0xf0f0f0f0
and r11, r10, r6, ror #16 //ror and mask to match fixslicing
and r12, r10, r7, ror #16 //ror and mask to match fixslicing
eor r11, r11, #0x00400000 //add rconst
strd r11, r12, [r0, #24] //store 2nd half tk for 37th round
and r11, r10, r8, ror #16 //ror and mask to match fixslicing
and r12, r10, r9, ror #16 //ror and mask to match fixslicing
eor r11, r11, #0x00440000 //add rconst
eor r12, r12, #0x00500000 //add rconst
mvn r12, r12 //to save a NOT in sbox calculations
strd r11, r12, [r0, #16] //store 1st half tk for 37th round
and r10, r10, r10, lsr #2 //r10<- 0x30303030
and r11, r10, r6, ror #14 //--- ror and masks to match fixslicing
and r6, r6, r10, ror #4
orr r6, r11, r6, ror #6
and r11, r10, r7, ror #14
and r7, r7, r10, ror #4
orr r7, r11, r7, ror #6
and r11, r10, r8, ror #14
and r8, r8, r10, ror #4
orr r8, r11, r8, ror #6
and r11, r10, r9, ror #14
and r9, r9, r10, ror #4
orr r9, r11, r9, ror #6 //ror and masks to match fixslicing ---
eor r6, r6, #0x00100000 //add rconst
eor r7, r7, #0x00100000 //add rconst
eor r8, r8, #0x00000001 //add rconst
eor r9, r9, #0x00100000 //add rconst
mvn r9, r9 //to save a NOT in sbox calculations
strd r6, r7, [r0], #8 //store 1st half tk for 36th round
strd r8, r9, [r0], #24 //store 2nd half tk for 36th round
ldm r0, {r6-r9} //load tk
bl p6 //apply the permutation 6 times
movw r10, #0xc3c3
movt r10, #0xc3c3 //r10<- 0xc3c3c3c3
and r11, r10, r6, ror #10 //ror and mask to match fixslicing
and r12, r10, r7, ror #10 //ror and mask to match fixslicing
eor r12, r12, #0x01000000 //add rconst
strd r11, r12, [r0], #8 //store 1st half tk for 38th round
and r11, r10, r8, ror #10 //ror and mask to match fixslicing
and r12, r10, r9, ror #10 //ror and mask to match fixslicing
eor r11, r11, #0x01400000 //add rconst
eor r11, r11, #0x00001000 //add rconst
eor r12, r12, #0x00400000 //add rconst
mvn r12, r12 //to save a NOT in sbox calculations
strd r11, r12, [r0], #8 //store 2nd half tk for 38th round
and r10, r10, r10, lsr #6 //r10<- 0x03030303
and r11, r10, r6, ror #12 //--- ror and masks to match fixslicing
and r6, r6, r10, lsl #6
orr r6, r11, r6, ror #28
and r11, r10, r7, ror #12
and r7, r7, r10, lsl #6
orr r7, r11, r7, ror #28
and r11, r10, r8, ror #12
and r8, r8, r10, lsl #6
orr r8, r11, r8, ror #28
and r11, r10, r9, ror #12
and r9, r9, r10, lsl #6
orr r9, r11, r9, ror #28 //ror and masks to match fixslicing ---
eor r6, r6, #0x00000400 //add rconst
eor r7, r7, #0x00000400 //add rconst
eor r8, r8, #0x01000000
eor r8, r8, #0x00004000 //add rconst
eor r9, r9, #0x00000400 //add rconst
mvn r9, r9 //to save a NOT in sbox calculations
strd r8, r9, [r0], #8 //store 1st half tk for 39th round
strd r6, r7, [r0], #8 //store 2nd half tk for 39th round
ldm r0, {r6-r9} //load tk
bl p8 //apply the permutation 8 times
movw r10, #0xf0f0
movt r10, #0xf0f0 //r10<- 0xf0f0f0f0
and r11, r10, r6 //ror and mask to match fixslicing
and r12, r10, r7 //ror and mask to match fixslicing
eor r12, r12, #0x00000040 //add rconst
strd r11, r12, [r0, #24] //store 2nd half tk for 41th round
and r11, r10, r8 //ror and mask to match fixslicing
and r12, r10, r9 //ror and mask to match fixslicing
eor r11, r11, #0x00000014 //add rconst
eor r12, r12, #0x00000010 //add rconst
mvn r12, r12 //to save a NOT in sbox calculations
strd r11, r12, [r0, #16] //store 1st half tk for 41th round
and r10, r10, r10, lsr #2 //r10<- 0x30303030
and r11, r10, r6, ror #30 //--- ror and masks to match fixslicing
and r6, r6, r10, ror #4
orr r6, r11, r6, ror #22
and r11, r10, r7, ror #30
and r7, r7, r10, ror #4
orr r7, r11, r7, ror #22
and r11, r10, r8, ror #30
and r8, r8, r10, ror #4
orr r8, r11, r8, ror #22
and r11, r10, r9, ror #30
and r9, r9, r10, ror #4
orr r9, r11, r9, ror #22 //ror and masks to match fixslicing ---
eor r6, r6, #0x00000010
eor r8, r8, #0x00010000
eor r8, r8, #0x00000010
eor r9, r9, #0x00000400
mvn r9, r9 //to save a NOT in sbox calculations
strd r6, r7, [r0], #8 //store 1st half tk for 40th round
strd r8, r9, [r0], #24 //store 2nd half tk for 40th round
ldm r0, {r6-r9} //load tk
bl p10 //apply the permutation 10 times
movw r10, #0xc3c3
movt r10, #0xc3c3 //r10<- 0xc3c3c3c3
and r11, r10, r6, ror #26 //ror and mask to match fixslicing
and r12, r10, r7, ror #26 //ror and mask to match fixslicing
eor r11, r11, #0x00000100 //add rconst
strd r11, r12, [r0], #8 //store 1st half tk for 42th round
and r11, r10, r8, ror #26 //ror and mask to match fixslicing
and r12, r10, r9, ror #26 //ror and mask to match fixslicing
eor r11, r11, #0x10000000 //add rconst
eor r11, r11, #0x00000040 //add rconst
eor r12, r12, #0x00000100 //add rconst
mvn r12, r12 //to save a NOT in sbox calculations
strd r11, r12, [r0], #8 //store 2nd half tk for 42th round
and r10, r10, r10, lsr #6 //r10<- 0x03030303
and r11, r10, r6, ror #28 //--- ror and masks to match fixslicing
and r6, r6, r10, lsl #6
orr r6, r11, r6, ror #12
and r11, r10, r7, ror #28
and r7, r7, r10, lsl #6
orr r7, r11, r7, ror #12
and r11, r10, r8, ror #28
and r8, r8, r10, lsl #6
orr r8, r11, r8, ror #12
and r11, r10, r9, ror #28
and r9, r9, r10, lsl #6
orr r9, r11, r9, ror #12 //ror and masks to match fixslicing ---
eor r8, r8, #0x44000000 //add rconst
eor r9, r9, #0x00000100 //add rconst
mvn r9, r9 //to save a NOT in sbox calculations
strd r8, r9, [r0], #8 //store 1st half tk for 43th round
strd r6, r7, [r0], #8 //store 2nd half tk for 43th round
ldm r0, {r6-r9} //load tk
bl p12 //apply the permutation 4 times
movw r10, #0xf0f0
movt r10, #0xf0f0 //r10<- 0xf0f0f0f0
and r11, r10, r6, ror #16 //ror and mask to match fixslicing
and r12, r10, r7, ror #16 //ror and mask to match fixslicing
eor r11, r11, #0x00400000 //add rconst
strd r11, r12, [r0, #24] //store 2nd half tk for 45th round
and r11, r10, r8, ror #16 //ror and mask to match fixslicing
and r12, r10, r9, ror #16 //ror and mask to match fixslicing
eor r11, r11, #0x00040000 //add rconst
mvn r12, r12 //to save a NOT in sbox calculations
strd r11, r12, [r0, #16] //store 1st half tk for 45th round
and r10, r10, r10, lsr #2 //r10<- 0x30303030
and r11, r10, r6, ror #14 //--- ror and masks to match fixslicing
and r6, r6, r10, ror #4
orr r6, r11, r6, ror #6
and r11, r10, r7, ror #14
and r7, r7, r10, ror #4
orr r7, r11, r7, ror #6
and r11, r10, r8, ror #14
and r8, r8, r10, ror #4
orr r8, r11, r8, ror #6
and r11, r10, r9, ror #14
and r9, r9, r10, ror #4
orr r9, r11, r9, ror #6 //ror and masks to match fixslicing ---
eor r7, r7, #0x00100000 //add rconst
eor r8, r8, #0x04000000 //add rconst
eor r8, r8, #0x00000001 //add rconst
mvn r9, r9 //to save a NOT in sbox calculations
strd r6, r7, [r0], #8 //store 1st half tk for 44th round
strd r8, r9, [r0], #24 //store 2nd half tk for 44th round
ldm r0, {r6-r9} //load tk
bl p14 //apply the permutation 6 times
movw r10, #0xc3c3
movt r10, #0xc3c3 //r10<- 0xc3c3c3c3
and r11, r10, r6, ror #10 //ror and mask to match fixslicing
and r12, r10, r7, ror #10 //ror and mask to match fixslicing
strd r11, r12, [r0], #8 //store 1st half tk for 46th round
and r11, r10, r8, ror #10 //ror and mask to match fixslicing
and r12, r10, r9, ror #10 //ror and mask to match fixslicing
eor r11, r11, #0x00001000 //add rconst
eor r12, r12, #0x01400000 //add rconst
mvn r12, r12 //to save a NOT in sbox calculations
strd r11, r12, [r0], #8 //store 2nd half tk for 46th round
and r10, r10, r10, lsr #6 //r10<- 0x03030303
and r11, r10, r6, ror #12 //--- ror and masks to match fixslicing
and r6, r6, r10, lsl #6
orr r6, r11, r6, ror #28
and r11, r10, r7, ror #12
and r7, r7, r10, lsl #6
orr r7, r11, r7, ror #28
and r11, r10, r8, ror #12
and r8, r8, r10, lsl #6
orr r8, r11, r8, ror #28
and r11, r10, r9, ror #12
and r9, r9, r10, lsl #6
orr r9, r11, r9, ror #28 //ror and masks to match fixslicing ---
eor r8, r8, #0x01000000 //add rconst
eor r8, r8, #0x00004400 //add rconst
mvn r9, r9 //to save a NOT in sbox calculations
strd r8, r9, [r0], #8 //store 1st half tk for 47th round
strd r6, r7, [r0], #8 //store 2nd half tk for 47th round
ldm r0, {r6-r9} //load tk
movw r10, #0xf0f0
movt r10, #0xf0f0 //r10<- 0xf0f0f0f0
and r11, r10, r6 //ror and mask to match fixslicing
and r12, r10, r7 //ror and mask to match fixslicing
eor r11, r11, #0x00000040 //add rconst
strd r11, r12, [r0, #24] //store 2nd half tk for 49th round
and r11, r10, r8 //ror and mask to match fixslicing
and r12, r10, r9 //ror and mask to match fixslicing
eor r11, r11, #0x00000004 //add rconst
eor r12, r12, #0x00000040 //add rconst
mvn r12, r12 //to save a NOT in sbox calculations
strd r11, r12, [r0, #16] //store 1st half tk for 49th round
and r10, r10, r10, lsr #2 //r10<- 0x30303030
and r11, r10, r6, ror #30 //--- ror and masks to match fixslicing
and r6, r6, r10, ror #4
orr r6, r11, r6, ror #22
and r11, r10, r7, ror #30
and r7, r7, r10, ror #4
orr r7, r11, r7, ror #22
and r11, r10, r8, ror #30
and r8, r8, r10, ror #4
orr r8, r11, r8, ror #22
and r11, r10, r9, ror #30
and r9, r9, r10, ror #4
orr r9, r11, r9, ror #22 //ror and masks to match fixslicing ---
eor r7 ,r7, #0x00000010
eor r8, r8, #0x00010000
mvn r9, r9 //to save a NOT in sbox calculations
strd r6, r7, [r0], #8 //store 1st half tk for 48th round
strd r8, r9, [r0], #24 //store 2nd half tk for 48th round
ldm r0, {r6-r9} //load tk
bl p2 //apply the permutation twice
movw r10, #0xc3c3
movt r10, #0xc3c3 //r10<- 0xc3c3c3c3
and r11, r10, r6, ror #26 //ror and mask to match fixslicing
and r12, r10, r7, ror #26 //ror and mask to match fixslicing
strd r11, r12, [r0], #8 //store 1st half tk for 50th round
and r11, r10, r8, ror #26 //ror and mask to match fixslicing
and r12, r10, r9, ror #26 //ror and mask to match fixslicing
eor r11, r11, #0x10000000 //add rconst
eor r11, r11, #0x00000100 //add rconst
eor r12, r12, #0x00000140 //add rconst
mvn r12, r12 //to save a NOT in sbox calculations
strd r11, r12, [r0], #8 //store 2nd half tk for 50th round
and r10, r10, r10, lsr #6 //r10<- 0x03030303
and r11, r10, r6, ror #28 //--- ror and masks to match fixslicing
and r6, r6, r10, lsl #6
orr r6, r11, r6, ror #12
and r11, r10, r7, ror #28
and r7, r7, r10, lsl #6
orr r7, r11, r7, ror #12
and r11, r10, r8, ror #28
and r8, r8, r10, lsl #6
orr r8, r11, r8, ror #12
and r11, r10, r9, ror #28
and r9, r9, r10, lsl #6
orr r9, r11, r9, ror #12 //ror and masks to match fixslicing ---
eor r7, r7, #0x04000000 //add rconst
eor r8, r8, #0x44000000 //add rconst
eor r8, r8, #0x00000100 //add rconst
mvn r9, r9 //to save a NOT in sbox calculations
strd r8, r9, [r0], #8 //store 1st half tk for 51th round
strd r6, r7, [r0], #8 //store 2nd half tk for 51th round
ldm r0, {r6-r9} //load tk
bl p4 //apply the permutation 4 times
movw r10, #0xf0f0
movt r10, #0xf0f0 //r10<- 0xf0f0f0f0
and r11, r10, r6, ror #16 //ror and mask to match fixslicing
and r12, r10, r7, ror #16 //ror and mask to match fixslicing
eor r11, r11, #0x00400000 //add rconst
strd r11, r12, [r0, #24] //store 2nd half tk for 53th round
and r11, r10, r8, ror #16 //ror and mask to match fixslicing
and r12, r10, r9, ror #16 //ror and mask to match fixslicing
eor r11, r11, #0x00040000 //add rconst
eor r12, r12, #0x00500000 //add rconst
mvn r12, r12 //to save a NOT in sbox calculations
strd r11, r12, [r0, #16] //store 1st half tk for 53th round
and r10, r10, r10, lsr #2 //r10<- 0x30303030
and r11, r10, r6, ror #14 //--- ror and masks to match fixslicing
and r6, r6, r10, ror #4
orr r6, r11, r6, ror #6
and r11, r10, r7, ror #14
and r7, r7, r10, ror #4
orr r7, r11, r7, ror #6
and r11, r10, r8, ror #14
and r8, r8, r10, ror #4
orr r8, r11, r8, ror #6
and r11, r10, r9, ror #14
and r9, r9, r10, ror #4
orr r9, r11, r9, ror #6 //ror and masks to match fixslicing ---
eor r6, r6, #0x00100000 //add rconst
eor r7, r7, #0x00100000 //add rconst
eor r8, r8, #0x00000001 //add rconst
mvn r9, r9 //to save a NOT in sbox calculations
strd r6, r7, [r0], #8 //store 1st half tk for 52th round
strd r8, r9, [r0], #24 //store 2nd half tk for 52th round
ldm r0, {r6-r9} //load tk
bl p6 //apply the permutation 6 times
movw r10, #0xc3c3
movt r10, #0xc3c3 //r10<- 0xc3c3c3c3
and r11, r10, r6, ror #10 //ror and mask to match fixslicing
and r12, r10, r7, ror #10 //ror and mask to match fixslicing
strd r11, r12, [r0], #8 //store 1st half tk for 54th round
and r11, r10, r8, ror #10 //ror and mask to match fixslicing
and r12, r10, r9, ror #10 //ror and mask to match fixslicing
eor r11, r11, #0x01400000 //add rconst
eor r11, r11, #0x00001000 //add rconst
eor r12, r12, #0x00400000 //add rconst
mvn r12, r12 //to save a NOT in sbox calculations
strd r11, r12, [r0], #8 //store 2nd half tk for 54th round
and r10, r10, r10, lsr #6 //r10<- 0x03030303
and r11, r10, r6, ror #12 //--- ror and masks to match fixslicing
and r6, r6, r10, lsl #6
orr r6, r11, r6, ror #28
and r11, r10, r7, ror #12
and r7, r7, r10, lsl #6
orr r7, r11, r7, ror #28
and r11, r10, r8, ror #12
and r8, r8, r10, lsl #6
orr r8, r11, r8, ror #28
and r11, r10, r9, ror #12
and r9, r9, r10, lsl #6
orr r9, r11, r9, ror #28 //ror and masks to match fixslicing ---
eor r7, r7, #0x00000400 //add rconst
eor r8, r8, #0x01000000
eor r8, r8, #0x00004000 //add rconst
eor r9, r9, #0x00000400 //add rconst
mvn r9, r9 //to save a NOT in sbox calculations
strd r8, r9, [r0], #8 //store 1st half tk for 55th round
strd r6, r7, [r0], #8 //store 2nd half tk for 55th round
ldm r0, {r6-r9} //load tk
bl p8 //apply the permutation 8 times
movw r10, #0x3030
movt r10, #0x3030 //r10<- 0x30303030
and r11, r10, r6, ror #30 //--- ror and masks to match fixslicing
and r6, r6, r10, ror #4
orr r6, r11, r6, ror #22
and r11, r10, r7, ror #30
and r7, r7, r10, ror #4
orr r7, r11, r7, ror #22
and r11, r10, r8, ror #30
and r8, r8, r10, ror #4
orr r8, r11, r8, ror #22
and r11, r10, r9, ror #30
and r9, r9, r10, ror #4
orr r9, r11, r9, ror #22 //ror and masks to match fixslicing ---
eor r6, r6, #0x00000010
eor r8, r8, #0x00010000
eor r8, r8, #0x00000010
mvn r9, r9 //to save a NOT in sbox calculations
strd r6, r7, [r0], #8 //store 1st half tk for 56th round
strd r8, r9, [r0], #24 //store 2nd half tk for 56th round
add.w sp, #4
pop {r0-r12, lr}
bx lr
/******************************************************************************
* Precomputes the round tweakeys derived from TK1: applies the permutations
* P^2, ..., P^14 for rounds 0 to 16. Since P^16=Id, we don't need more
* calculations as no LFSR is applied to TK1.
*
* r0: output area for the precomputed rtk1 words (8 stores x 16 bytes = 128
*     bytes, covering 16 rounds; reused cyclically by skinny128_384)
* r1: pointer to the 16-byte TK1 key material
* NOTE(review): relies on the external subroutines 'packing' (bitsliced
* packing of r2-r5, presumably using the masks preloaded in r10/r11 — confirm
* against its definition) and 'p2' (tweakey permutation applied twice).
* All registers are preserved via push/pop.
******************************************************************************/
@ void tkschedule_perm_tk1(u32* tk, const u8* key)
.global tkschedule_perm_tk1
.type tkschedule_perm_tk1,%function
.align 2
tkschedule_perm_tk1:
push {r0-r12, lr}
ldr.w r3, [r1, #8] //load tk1 (3rd word)
ldr.w r4, [r1, #4] //load tk1 (2nd word)
ldr.w r5, [r1, #12] //load tk1 (4th word)
ldr.w r2, [r1] //load tk1 (1st word)
movw r10, #0x0a0a
movt r10, #0x0a0a //r10<- 0x0a0a0a0a
movw r11, #0x3030
movt r11, #0x3030 //r11<- 0x30303030
bl packing //pack tk1
mov r6, r2 //move tk1 from r2-r5 to r6-r9
mov r7, r3 //move tk1 from r2-r5 to r6-r9
mov r8, r4 //move tk1 from r2-r5 to r6-r9
mov r9, r5 //move tk1 from r2-r5 to r6-r9
movw r2, #0xf0f0
movt r2, #0xf0f0 //r2<- 0xf0f0f0f0
and r11, r8, r2 //tk &= 0xf0f0f0f0 (3rd word)
and r12, r9, r2 //tk &= 0xf0f0f0f0 (4th word)
strd r11, r12, [r0], #8 //store 1st half tk for 1st round
and r11, r6, r2 //tk &= 0xf0f0f0f0 (1st word)
and r12, r7, r2 //tk &= 0xf0f0f0f0 (2nd word)
strd r11, r12, [r0], #8 //store 2nd half tk for 1st round
bl p2 //apply the permutation twice
movw r3, #0x0303
movt r3, #0x0303 //r3<- 0x03030303
//--- tk for 3rd round ---
and r11, r3, r6, ror #28 //--- ror and masks to match fixslicing
and r12, r6, r3, lsl #6
orr r12, r11, r12, ror #12
str.w r12, [r0, #8]
and r11, r3, r7, ror #28
and r12, r7, r3, lsl #6
orr r12, r11, r12, ror #12
str.w r12, [r0, #12]
and r11, r3, r9, ror #28
and r12, r9, r3, lsl #6
orr r12, r11, r12, ror #12
str.w r12, [r0, #4]
and r11, r3, r8, ror #28
and r12, r8, r3, lsl #6
orr r12, r11, r12, ror #12
str.w r12, [r0], #16 //ror and masks to match fixslicing ---
bl p2 //apply the permutation 4 times
and r11, r2, r6, ror #16 //ror and mask to match fixslicing
and r12, r2, r7, ror #16 //ror and mask to match fixslicing
strd r11, r12, [r0, #8] //store 2nd half tk for 5th round
and r11, r2, r8, ror #16 //ror and mask to match fixslicing
and r12, r2, r9, ror #16 //ror and mask to match fixslicing
strd r11, r12, [r0], #16 //store 1st half tk for 5th round
bl p2 //apply the permutation 6 times
//--- tk for 7th round ---
and r11, r3, r6, ror #12 //--- ror and masks to match fixslicing
and r12, r6, r3, lsl #6
orr r12, r11, r12, ror #28
str.w r12, [r0, #8]
and r11, r3, r7, ror #12
and r12, r7, r3, lsl #6
orr r12, r11, r12, ror #28
str.w r12, [r0, #12]
and r11, r3, r9, ror #12
and r12, r9, r3, lsl #6
orr r12, r11, r12, ror #28
str.w r12, [r0, #4]
and r11, r3, r8, ror #12
and r12, r8, r3, lsl #6
orr r12, r11, r12, ror #28
str.w r12, [r0], #16 //ror and masks to match fixslicing ---
bl p2 //apply the permutation 8 times
and r11, r2, r6 //ror and mask to match fixslicing
and r12, r2, r7 //ror and mask to match fixslicing
strd r11, r12, [r0, #8] //store 2nd half tk for 9th round
and r11, r2, r8 //ror and mask to match fixslicing
and r12, r2, r9 //ror and mask to match fixslicing
strd r11, r12, [r0], #16 //store 1st half tk for 9th round
bl p2 //apply the permutation 10 times
//--- tk for 11th round ---
and r11, r3, r6, ror #28 //--- ror and masks to match fixslicing
and r12, r6, r3, lsl #6
orr r12, r11, r12, ror #12
str.w r12, [r0, #8]
and r11, r3, r7, ror #28
and r12, r7, r3, lsl #6
orr r12, r11, r12, ror #12
str.w r12, [r0, #12]
and r11, r3, r9, ror #28
and r12, r9, r3, lsl #6
orr r12, r11, r12, ror #12
str.w r12, [r0, #4]
and r11, r3, r8, ror #28
and r12, r8, r3, lsl #6
orr r12, r11, r12, ror #12
str.w r12, [r0], #16 //ror and masks to match fixslicing ---
bl p2 //apply the permutation 12 times
and r11, r2, r6, ror #16 //ror and mask to match fixslicing
and r12, r2, r7, ror #16 //ror and mask to match fixslicing
strd r11, r12, [r0, #8] //store 2nd half tk for 13th round
and r11, r2, r8, ror #16 //ror and mask to match fixslicing
and r12, r2, r9, ror #16 //ror and mask to match fixslicing
strd r11, r12, [r0], #16 //store 1st half tk for 13th round
bl p2 //apply the permutation 14 times
//--- tk for 15th round ---
and r11, r3, r6, ror #12 //--- ror and masks to match fixslicing
and r12, r6, r3, lsl #6
orr r12, r11, r12, ror #28
str.w r12, [r0, #8]
and r11, r3, r7, ror #12
and r12, r7, r3, lsl #6
orr r12, r11, r12, ror #28
str.w r12, [r0, #12]
and r11, r3, r9, ror #12
and r12, r9, r3, lsl #6
orr r12, r11, r12, ror #28
str.w r12, [r0, #4]
and r11, r3, r8, ror #12
and r12, r8, r3, lsl #6
orr r12, r11, r12, ror #28
str.w r12, [r0] //ror and masks to match fixslicing ---
pop {r0-r12, lr}
bx lr
/******************************************************************************
* Subroutine applying 4 consecutive rounds of fixsliced SKINNY-128-384 to the
* bitsliced internal state held in r2-r5.
* Expects: r0 = pointer to precomputed rtk1 words (advanced by 32 bytes)
*          r1 = pointer to precomputed rtk2_3 words (advanced by 64 bytes)
*          r6 = 0x55555555 (SWAPMOVE mask)
*          r7 = 0x30303030 (mixcolumns mask)
* Clobbers r8-r11 (round-key words; r8 also used as scratch).
* NOTE(review): rtk1 is loaded (from r0) only on the 1st and 3rd of the 4
* rounds, while rtk2_3 (from r1) is added every round — presumably the rtk1
* contribution of the other rounds is folded in at precomputation time;
* verify against the tweakey-schedule routines.
******************************************************************************/
.align 2
quadruple_round:
//--- 1st round: S-box layer ---
orr r8, r2, r3
eor r5, r5, r8
mvn r5, r5
eor r8, r3, r4, lsr #1
and r8, r8, r6
eor r3, r3, r8
eor r4, r4, r8, lsl #1 //SWAPMOVE(r4, r3, 0x55555555, 1);
eor r8, r4, r5, lsr #1
and r8, r8, r6
eor r4, r4, r8
eor r5, r5, r8, lsl #1 //SWAPMOVE(r5, r4, 0x55555555, 1);
orr r8, r4, r5
eor r3, r3, r8
mvn r3, r3
eor r8, r2, r3, lsr #1
and r8, r8, r6
eor r2, r2, r8
eor r3, r3, r8, lsl #1 //SWAPMOVE(r3, r2, 0x55555555, 1);
eor r8, r5, r2, lsr #1
and r8, r8, r6
eor r5, r5, r8
eor r2, r2, r8, lsl #1 //SWAPMOVE(r2, r5, 0x55555555, 1);
orr r8, r2, r3
eor r5, r5, r8
mvn r5, r5
eor r8, r3, r4, lsr #1
and r8, r8, r6
eor r3, r3, r8
eor r4, r4, r8, lsl #1 //SWAPMOVE(r4, r3, 0x55555555, 1);
eor r8, r4, r5, lsr #1
and r8, r8, r6
eor r4, r4, r8
eor r5, r5, r8, lsl #1 //SWAPMOVE(r5, r4, 0x55555555, 1);
orr r8, r4, r5
eor r3, r3, r8
eor r8, r2, r5
and r8, r8, r6
eor r2, r2, r8
eor r5, r5, r8 //SWAPMOVE(r5, r2, 0x55555555, 0);
ldmia.w r1!, {r8-r11} //load rkeys in r8,...,r11
eor r2, r2, r8 //add rtk_2_3 + rconst
eor r3, r3, r9 //add rtk_2_3 + rconst
eor r4, r4, r10 //add rtk_2_3 + rconst
eor r5, r5, r11 //add rtk_2_3 + rconst
ldmia.w r0!,{r8-r11} //load rtk_1 words
eor r2, r2, r8 //add rtk_1
eor r3, r3, r9 //add rtk_1
eor r4, r4, r10 //add rtk_1
eor r5, r5, r11 //add rtk_1
and r8, r7, r2, ror #30 // --- mixcolumns 0 ---
eor r2, r2, r8, ror #24
and r8, r7, r2, ror #18
eor r2, r2, r8, ror #2
and r8, r7, r2, ror #6
eor r2, r2, r8, ror #4
and r8, r7, r3, ror #30
eor r3, r3, r8, ror #24
and r8, r7, r3, ror #18
eor r3, r3, r8, ror #2
and r8, r7, r3, ror #6
eor r3, r3, r8, ror #4
and r8, r7, r4, ror #30
eor r4, r4, r8, ror #24
and r8, r7, r4, ror #18
eor r4, r4, r8, ror #2
and r8, r7, r4, ror #6
eor r4, r4, r8, ror #4
and r8, r7, r5, ror #30
eor r5, r5, r8, ror #24
and r8, r7, r5, ror #18
eor r5, r5, r8, ror #2
and r8, r7, r5, ror #6
eor r5, r5, r8, ror #4
//--- 2nd round: S-box layer ---
orr r8, r4, r5
eor r3, r3, r8
mvn r3, r3
eor r8, r2, r3, lsr #1
and r8, r8, r6
eor r2, r2, r8
eor r3, r3, r8, lsl #1 //SWAPMOVE(r3, r2, 0x55555555, 1);
eor r8, r5, r2, lsr #1
and r8, r8, r6
eor r5, r5, r8
eor r2, r2, r8, lsl #1 //SWAPMOVE(r2, r5, 0x55555555, 1);
orr r8, r2, r3
eor r5, r5, r8
mvn r5, r5
eor r8, r3, r4, lsr #1
and r8, r8, r6
eor r3, r3, r8
eor r4, r4, r8, lsl #1 //SWAPMOVE(r4, r3, 0x55555555, 1);
eor r8, r4, r5, lsr #1
and r8, r8, r6
eor r4, r4, r8
eor r5, r5, r8, lsl #1 //SWAPMOVE(r5, r4, 0x55555555, 1);
orr r8, r4, r5
eor r3, r3, r8
mvn r3, r3
eor r8, r2, r3, lsr #1
and r8, r8, r6
eor r2, r2, r8
eor r3, r3, r8, lsl #1 //SWAPMOVE(r3, r2, 0x55555555, 1);
eor r8, r5, r2, lsr #1
and r8, r8, r6
eor r5, r5, r8
eor r2, r2, r8, lsl #1 //SWAPMOVE(r2, r5, 0x55555555, 1);
orr r8, r2, r3
eor r5, r5, r8
eor r8, r3, r4
and r8, r8, r6
eor r3, r3, r8
eor r4, r4, r8 //SWAPMOVE(r4, r3, 0x55555555, 0);
ldmia.w r1!, {r8-r11} //load rkeys in r8,...,r11
eor r2, r2, r8 //add rkey + rconst
eor r3, r3, r9 //add rkey + rconst
eor r4, r4, r10 //add rkey + rconst
eor r5, r5, r11 //add rkey + rconst
and r8, r7, r2, ror #16 // --- mixcolumns 1 ---
eor r2, r2, r8, ror #30
and r8, r7, r2, ror #28
eor r2, r2, r8
and r8, r7, r2, ror #16
eor r2, r2, r8, ror #2
and r8, r7, r3, ror #16
eor r3, r3, r8, ror #30
and r8, r7, r3, ror #28
eor r3, r3, r8
and r8, r7, r3, ror #16
eor r3, r3, r8, ror #2
and r8, r7, r4, ror #16
eor r4, r4, r8, ror #30
and r8, r7, r4, ror #28
eor r4, r4, r8
and r8, r7, r4, ror #16
eor r4, r4, r8, ror #2
and r8, r7, r5, ror #16
eor r5, r5, r8, ror #30
and r8, r7, r5, ror #28
eor r5, r5, r8
and r8, r7, r5, ror #16
eor r5, r5, r8, ror #2
//--- 3rd round: S-box layer ---
orr r8, r2, r3
eor r5, r5, r8
mvn r5, r5
eor r8, r3, r4, lsr #1
and r8, r8, r6
eor r3, r3, r8
eor r4, r4, r8, lsl #1 //SWAPMOVE(r4, r3, 0x55555555, 1);
eor r8, r4, r5, lsr #1
and r8, r8, r6
eor r4, r4, r8
eor r5, r5, r8, lsl #1 //SWAPMOVE(r5, r4, 0x55555555, 1);
orr r8, r4, r5
eor r3, r3, r8
mvn r3, r3
eor r8, r2, r3, lsr #1
and r8, r8, r6
eor r2, r2, r8
eor r3, r3, r8, lsl #1 //SWAPMOVE(r3, r2, 0x55555555, 1);
eor r8, r5, r2, lsr #1
and r8, r8, r6
eor r5, r5, r8
eor r2, r2, r8, lsl #1 //SWAPMOVE(r2, r5, 0x55555555, 1);
orr r8, r2, r3
eor r5, r5, r8
mvn r5, r5
eor r8, r3, r4, lsr #1
and r8, r8, r6
eor r3, r3, r8
eor r4, r4, r8, lsl #1 //SWAPMOVE(r4, r3, 0x55555555, 1);
eor r8, r4, r5, lsr #1
and r8, r8, r6
eor r4, r4, r8
eor r5, r5, r8, lsl #1 //SWAPMOVE(r5, r4, 0x55555555, 1);
orr r8, r4, r5
eor r3, r3, r8
eor r8, r2, r5
and r8, r8, r6
eor r2, r2, r8
eor r5, r5, r8 //SWAPMOVE(r5, r2, 0x55555555, 0);
ldmia.w r1!, {r8-r11} //load rkeys in r8,...,r11
eor r2, r2, r8 //add rtk_2_3 + rconst
eor r3, r3, r9 //add rtk_2_3 + rconst
eor r4, r4, r10 //add rtk_2_3 + rconst
eor r5, r5, r11 //add rtk_2_3 + rconst
ldmia.w r0!,{r8-r11} //load rtk_1 words
eor r2, r2, r8 //add rtk_1
eor r3, r3, r9 //add rtk_1
eor r4, r4, r10 //add rtk_1
eor r5, r5, r11 //add rtk_1
and r8, r7, r2, ror #10 // --- mixcolumns 2 ---
eor r2, r2, r8, ror #4
and r8, r7, r2, ror #6
eor r2, r2, r8, ror #6
and r8, r7, r2, ror #26
eor r2, r2, r8
and r8, r7, r3, ror #10
eor r3, r3, r8, ror #4
and r8, r7, r3, ror #6
eor r3, r3, r8, ror #6
and r8, r7, r3, ror #26
eor r3, r3, r8
and r8, r7, r4, ror #10
eor r4, r4, r8, ror #4
and r8, r7, r4, ror #6
eor r4, r4, r8, ror #6
and r8, r7, r4, ror #26
eor r4, r4, r8
and r8, r7, r5, ror #10
eor r5, r5, r8, ror #4
and r8, r7, r5, ror #6
eor r5, r5, r8, ror #6
and r8, r7, r5, ror #26
eor r5, r5, r8
//--- 4th round: S-box layer ---
orr r8, r4, r5
eor r3, r3, r8
mvn r3, r3
eor r8, r2, r3, lsr #1
and r8, r8, r6
eor r2, r2, r8
eor r3, r3, r8, lsl #1 //SWAPMOVE(r3, r2, 0x55555555, 1);
eor r8, r5, r2, lsr #1
and r8, r8, r6
eor r5, r5, r8
eor r2, r2, r8, lsl #1 //SWAPMOVE(r2, r5, 0x55555555, 1);
orr r8, r2, r3
eor r5, r5, r8
mvn r5, r5
eor r8, r3, r4, lsr #1
and r8, r8, r6
eor r3, r3, r8
eor r4, r4, r8, lsl #1 //SWAPMOVE(r4, r3, 0x55555555, 1);
eor r8, r4, r5, lsr #1
and r8, r8, r6
eor r4, r4, r8
eor r5, r5, r8, lsl #1 //SWAPMOVE(r5, r4, 0x55555555, 1);
orr r8, r4, r5
eor r3, r3, r8
mvn r3, r3
eor r8, r2, r3, lsr #1
and r8, r8, r6
eor r2, r2, r8
eor r3, r3, r8, lsl #1 //SWAPMOVE(r3, r2, 0x55555555, 1);
eor r8, r5, r2, lsr #1
and r8, r8, r6
eor r5, r5, r8
eor r2, r2, r8, lsl #1 //SWAPMOVE(r2, r5, 0x55555555, 1);
orr r8, r2, r3
eor r5, r5, r8
eor r8, r3, r4
and r8, r8, r6
eor r3, r3, r8
eor r4, r4, r8 //SWAPMOVE(r4, r3, 0x55555555, 0);
ldmia r1!, {r8-r11} //load rkeys in r8,...,r11
eor r2, r2, r8 //add rkey + rconst
eor r3, r3, r9 //add rkey + rconst
eor r4, r4, r10 //add rkey + rconst
eor r5, r5, r11 //add rkey + rconst
and r8, r7, r2, ror #4 // --- mixcolumns 3 ---
eor r2, r2, r8, ror #26
and r8, r7, r2
eor r2, r2, r8, ror #4
and r8, r7, r2, ror #4
eor r2, r2, r8, ror #22
and r8, r7, r3, ror #4
eor r3, r3, r8, ror #26
and r8, r7, r3
eor r3, r3, r8, ror #4
and r8, r7, r3, ror #4
eor r3, r3, r8, ror #22
and r8, r7, r4, ror #4
eor r4, r4, r8, ror #26
and r8, r7, r4
eor r4, r4, r8, ror #4
and r8, r7, r4, ror #4
eor r4, r4, r8, ror #22
and r8, r7, r5, ror #4
eor r5, r5, r8, ror #26
and r8, r7, r5
eor r5, r5, r8, ror #4
and r8, r7, r5, ror #4
eor r5, r5, r8, ror #22
bx lr
/******************************************************************************
* Encrypt a single block using fixsliced SKINNY-128-384 (14 calls to
* quadruple_round = 56 rounds; the 128-byte rtk1 array repeats every 16
* rounds).
*
* r0: ctext - output ciphertext block (16 bytes; may alias ptext)
* r1: tk    - precomputed rtk2_3 round tweakeys (consumed by quadruple_round)
* r2: ptext - input plaintext block (16 bytes)
* r3: rtk1  - precomputed rtk1 round tweakeys (128 bytes, cycled 3 times)
******************************************************************************/
@ void skinny128_384(u8* ctext, const u32* tk, const u8* ptext, const u32* rtk1)
.global skinny128_384
.type skinny128_384,%function
.align 2
skinny128_384:
push {r0-r12, r14}
mov.w r0, r3 //r0<- rtk1 (ctext pointer stays on the stack)
ldr.w r3, [r2, #8] //load ptext (3rd word)
ldr.w r4, [r2, #4] //load ptext (2nd word)
ldr.w r5, [r2, #12] //load ptext (4th word)
ldr.w r2, [r2] //load ptext (1st word)
movw r6, #0x0a0a
movt r6, #0x0a0a //r6 <- 0x0a0a0a0a
movw r7, #0x3030
movt r7, #0x3030 //r7 <- 0x30303030
//--- pack the plaintext into the bitsliced representation ---
eor r12, r2, r2, lsr #3
and r12, r12, r6
eor r2, r2, r12
eor r2, r2, r12, lsl #3 //SWAPMOVE(r2, r2, 0x0a0a0a0a, 3)
eor r12, r3, r3, lsr #3
and r12, r12, r6
eor r3, r3, r12
eor r3, r3, r12, lsl #3 //SWAPMOVE(r3, r3, 0x0a0a0a0a, 3)
eor r12, r4, r4, lsr #3
and r12, r12, r6
eor r4, r4, r12
eor r4, r4, r12, lsl #3 //SWAPMOVE(r4, r4, 0x0a0a0a0a, 3)
eor r12, r5, r5, lsr #3
and r12, r12, r6
eor r5, r5, r12
eor r5, r5, r12, lsl #3 //SWAPMOVE(r5, r5, 0x0a0a0a0a, 3)
eor r12, r2, r4, lsr #2
and r12, r12, r7
eor r2, r2, r12
eor r4, r4, r12, lsl #2 //SWAPMOVE(r4, r2, 0x30303030, 2)
eor r12, r2, r3, lsr #4
and r12, r12, r7, lsr #2
eor r2, r2, r12
eor r3, r3, r12, lsl #4 //SWAPMOVE(r3, r2, 0x0c0c0c0c, 4)
eor r12, r2, r5, lsr #6
and r12, r12, r7, lsr #4
eor r2, r2, r12
eor r5, r5, r12, lsl #6 //SWAPMOVE(r5, r2, 0x03030303, 6)
eor r12, r4, r3, lsr #2
and r12, r12, r7, lsr #2
eor r4, r4, r12
eor r3, r3, r12, lsl #2 //SWAPMOVE(r3, r4, 0x0c0c0c0c, 2)
eor r12, r4, r5, lsr #4
and r12, r12, r7, lsr #4
eor r4, r4, r12
eor r5, r5, r12, lsl #4 //SWAPMOVE(r5, r4, 0x03030303, 4)
eor r12, r3, r5, lsr #2
and r12, r12, r7, lsr #4
eor r3, r3, r12
eor r5, r5, r12, lsl #2 //SWAPMOVE(r5, r3, 0x03030303, 2)
movw r6, #0x5555
movt r6, #0x5555 //r6 <- 0x55555555 (SWAPMOVE mask for quadruple_round)
bl quadruple_round //rounds 1-4
bl quadruple_round //rounds 5-8
bl quadruple_round //rounds 9-12
bl quadruple_round //rounds 13-16
sub.w r0, #128 // rtk1 repeats every 16 rounds
bl quadruple_round //rounds 17-20
bl quadruple_round //rounds 21-24
bl quadruple_round //rounds 25-28
bl quadruple_round //rounds 29-32
sub.w r0, #128 // rtk1 repeats every 16 rounds
bl quadruple_round //rounds 33-36
bl quadruple_round //rounds 37-40
bl quadruple_round //rounds 41-44
bl quadruple_round //rounds 45-48
sub.w r0, #128 // rtk1 repeats every 16 rounds
bl quadruple_round //rounds 49-52
bl quadruple_round //rounds 53-56
movw r6, #0x0a0a
movt r6, #0x0a0a //r6 <- 0x0a0a0a0a
//--- unpack the state (inverse of the packing SWAPMOVE sequence above) ---
eor r10, r3, r5, lsr #2
and r10, r10, r7, lsr #4
eor r3, r3, r10
eor r5, r5, r10, lsl #2 //SWAPMOVE(r5, r3, 0x03030303, 2)
eor r10, r4, r5, lsr #4
and r10, r10, r7, lsr #4
eor r4, r4, r10
eor r5, r5, r10, lsl #4 //SWAPMOVE(r5, r4, 0x03030303, 4)
eor r10, r4, r3, lsr #2
and r10, r10, r7, lsr #2
eor r4, r4, r10
eor r3, r3, r10, lsl #2 //SWAPMOVE(r3, r4, 0x0c0c0c0c, 2)
eor r10, r2, r5, lsr #6
and r10, r10, r7, lsr #4
eor r2, r2, r10
eor r5, r5, r10, lsl #6 //SWAPMOVE(r5, r2, 0x03030303, 6)
eor r10, r2, r3, lsr #4
and r10, r10, r7, lsr #2
eor r2, r2, r10
eor r3, r3, r10, lsl #4 //SWAPMOVE(r3, r2, 0x0c0c0c0c, 4)
eor r10, r2, r4, lsr #2
and r10, r10, r7
eor r2, r2, r10
eor r4, r4, r10, lsl #2 //SWAPMOVE(r4, r2, 0x30303030, 2)
eor r10, r5, r5, lsr #3
and r10, r10, r6
eor r5, r5, r10
eor r5, r5, r10, lsl #3 //SWAPMOVE(r5, r5, 0x0a0a0a0a, 3)
eor r10, r4, r4, lsr #3
and r10, r10, r6
eor r4, r4, r10
eor r4, r4, r10, lsl #3 //SWAPMOVE(r4, r4, 0x0a0a0a0a, 3)
eor r10, r3, r3, lsr #3
and r10, r10, r6
eor r3, r3, r10
eor r3, r3, r10, lsl #3 //SWAPMOVE(r3, r3, 0x0a0a0a0a, 3)
eor r10, r2, r2, lsr #3
and r10, r10, r6
eor r2, r2, r10
eor r2, r2, r10, lsl #3 //SWAPMOVE(r2, r2, 0x0a0a0a0a, 3)
ldr.w r0, [sp], #4 //restore the ctext pointer pushed at entry
strd r2, r4, [r0] //store ctext (1st and 2nd words)
strd r3, r5, [r0, #8] //store ctext (3rd and 4th words)
pop {r1-r12,r14} //r0 was already popped above
bx lr
\ No newline at end of file
#define CRYPTO_KEYBYTES 16
#define CRYPTO_NSECBYTES 0
#define CRYPTO_NPUBBYTES 16
#define CRYPTO_ABYTES 16
#define CRYPTO_NOOVERLAP 1
#define CRYPTO_KEYBYTES 16
#define CRYPTO_NSECBYTES 0
#define CRYPTO_NPUBBYTES 16
#define CRYPTO_ABYTES 16
#define CRYPTO_NOOVERLAP 1
int crypto_aead_encrypt(
unsigned char *c, unsigned long long *clen,
const unsigned char *m, unsigned long long mlen,
const unsigned char *ad, unsigned long long adlen,
const unsigned char *nsec,
const unsigned char *npub,
const unsigned char *k
);
int crypto_aead_decrypt(
unsigned char *m, unsigned long long *mlen,
unsigned char *nsec,
const unsigned char *c, unsigned long long clen,
const unsigned char *ad, unsigned long long adlen,
const unsigned char *npub,
const unsigned char *k
);
\ No newline at end of file
#include "skinny128.h"
#include "tk_schedule.h"
#include "romulus.h"
#include <string.h>
#include <stdio.h>
//Encryption and authentication using Romulus-N1.
//NIST LWC AEAD interface: encrypts mlen bytes of m under 128-bit key k and
//128-bit nonce npub, authenticating adlen bytes of ad. Writes mlen+TAGBYTES
//bytes of ciphertext||tag to c and sets *clen accordingly. Always returns 0.
//nsec is unused (CRYPTO_NSECBYTES == 0).
int crypto_aead_encrypt
(unsigned char *c, unsigned long long *clen,
    const unsigned char *m, unsigned long long mlen,
    const unsigned char *ad, unsigned long long adlen,
    const unsigned char *nsec,
    const unsigned char *npub,
    const unsigned char *k) {
    int i;
    u32 tmp;                                // scratch word required by the G/UPDATE_CTR/RHO macros
    skinny_128_384_tks tks;                 // TK1 counter + precomputed round tweakeys
    u8 state[BLOCKBYTES], pad[BLOCKBYTES];  // 128-bit rolling state; pad doubles as RHO scratch
    (void)nsec;
    // ----------------- Initialization -----------------
    *clen = mlen + TAGBYTES;
    memset(tks.tk1, 0x00, KEYBYTES);
    memset(state, 0x00, BLOCKBYTES);
    tks.tk1[0] = 0x01; //56-bit LFSR counter, initialized to 1
    // ----------------- Initialization -----------------
    // ----------------- Process the associated data -----------------
    //Handle the special case of no associated data: a single call with the
    //padded-empty-AD domain (0x1A) absorbs nothing but advances the state.
    if (adlen == 0) {
        UPDATE_CTR(tks.tk1);
        SET_DOMAIN(tks, 0x1A);
        precompute_rtk2_3(tks.rtk2_3, npub, k);
        precompute_rtk1(tks.rtk1, tks.tk1);
        skinny128_384(state, state, tks.rtk1, tks.rtk2_3);
    } else {
        // Process all double blocks except the last: the 1st block of each
        // pair is XORed into the state, the 2nd is used as the tweak (TK2).
        SET_DOMAIN(tks, 0x08);
        while (adlen > 2*BLOCKBYTES) {
            UPDATE_CTR(tks.tk1);
            XOR_BLOCK(state, state, ad);
            precompute_rtk2_3(tks.rtk2_3, ad + BLOCKBYTES, k);
            precompute_rtk1(tks.rtk1, tks.tk1);
            skinny128_384(state, state, tks.rtk1, tks.rtk2_3);
            UPDATE_CTR(tks.tk1);    // counter also advances over the tweak block
            ad += 2*BLOCKBYTES;
            adlen -= 2*BLOCKBYTES;
        }
        //Pad and process the left-over blocks; the domain encodes whether the
        //final (single or double) block is full or padded.
        UPDATE_CTR(tks.tk1);
        if (adlen == 2*BLOCKBYTES) {
            // Left-over complete double block
            XOR_BLOCK(state, state, ad);
            precompute_rtk2_3(tks.rtk2_3, ad + BLOCKBYTES, k);
            precompute_rtk1(tks.rtk1, tks.tk1);
            skinny128_384(state, state, tks.rtk1, tks.rtk2_3);
            UPDATE_CTR(tks.tk1);
            SET_DOMAIN(tks, 0x18);
        } else if (adlen > BLOCKBYTES) {
            // Left-over partial double block: 10*-style pad with the byte
            // length in the last byte (Romulus padding)
            adlen -= BLOCKBYTES;
            XOR_BLOCK(state, state, ad);
            memcpy(pad, ad + BLOCKBYTES, adlen);
            memset(pad + adlen, 0x00, 15 - adlen);
            pad[15] = adlen;    // length byte completes the padding
            precompute_rtk2_3(tks.rtk2_3, pad, k);
            precompute_rtk1(tks.rtk1, tks.tk1);
            skinny128_384(state, state, tks.rtk1, tks.rtk2_3);
            UPDATE_CTR(tks.tk1);
            SET_DOMAIN(tks, 0x1A);
        } else if (adlen == BLOCKBYTES) {
            // Left-over complete single block
            XOR_BLOCK(state, state, ad);
            SET_DOMAIN(tks, 0x18);
        } else {
            // Left-over partial single block, padded into the state directly
            for(i =0; i < (int)adlen; i++)
                state[i] ^= ad[i];
            state[15] ^= adlen;     // length byte of the padding
            SET_DOMAIN(tks, 0x1A);
        }
        // Final AD call uses the nonce as TK2
        precompute_rtk2_3(tks.rtk2_3, npub, k);
        precompute_rtk1(tks.rtk1, tks.tk1);
        skinny128_384(state, state, tks.rtk1, tks.rtk2_3);
    }
    // ----------------- Process the associated data -----------------
    // ----------------- Process the plaintext -----------------
    memset(tks.tk1, 0, KEYBYTES);
    tks.tk1[0] = 0x01; //re-init the 56-bit LFSR counter for the message phase
    if (mlen == 0) {
        UPDATE_CTR(tks.tk1);
        SET_DOMAIN(tks, 0x15);  // empty message domain
        precompute_rtk1(tks.rtk1, tks.tk1);
        skinny128_384(state, state, tks.rtk1, tks.rtk2_3);
    } else {
        //process all blocks except the last with the RHO output function
        SET_DOMAIN(tks, 0x04);
        while (mlen > BLOCKBYTES) {
            RHO(state,c,m);
            UPDATE_CTR(tks.tk1);
            precompute_rtk1(tks.rtk1, tks.tk1);
            skinny128_384(state, state, tks.rtk1, tks.rtk2_3);
            c += BLOCKBYTES;
            m += BLOCKBYTES;
            mlen -= BLOCKBYTES;
        }
        //pad and process the last block
        UPDATE_CTR(tks.tk1);
        if (mlen < BLOCKBYTES) {
            // byte-wise RHO on the partial block; per-byte G is
            // (x >> 1) ^ (x & 0x80) ^ (x << 7), i.e. an LFSR step on each byte
            for(i = 0; i < (int)mlen; i++) {
                tmp = m[i]; //use of tmp variable just in case 'c = m'
                c[i] = m[i] ^ (state[i] >> 1) ^ (state[i] & 0x80) ^ (state[i] << 7);
                state[i] ^= (u8)tmp;
            }
            state[15] ^= (u8)mlen; //padding
            SET_DOMAIN(tks, 0x15);
        } else {
            RHO(state,c,m);
            SET_DOMAIN(tks, 0x14);  // full final block
        }
        precompute_rtk1(tks.rtk1, tks.tk1);
        skinny128_384(state, state, tks.rtk1, tks.rtk2_3);
        c += mlen;  // advance past the last (possibly partial) ciphertext block
    }
    // ----------------- Process the plaintext -----------------
    // ----------------- Generate the tag -----------------
    G(state,state);
    memcpy(c, state, TAGBYTES);
    // ----------------- Generate the tag -----------------
    return 0;
}
//Decryption and tag verification using Romulus-N1.
//NIST LWC AEAD interface: decrypts clen-TAGBYTES bytes of c under 128-bit key
//k and nonce npub, re-authenticating ad, and writes the plaintext to m
//(setting *mlen). Returns 0 on success and -1 on authentication failure,
//per the NIST/SUPERCOP API convention. The tag comparison is constant-time.
//nsec is unused (CRYPTO_NSECBYTES == 0).
int crypto_aead_decrypt
(unsigned char *m, unsigned long long *mlen,
    unsigned char *nsec,
    const unsigned char *c, unsigned long long clen,
    const unsigned char *ad, unsigned long long adlen,
    const unsigned char *npub,
    const unsigned char *k) {
    int i;
    u32 tmp;                                // scratch word required by the G/UPDATE_CTR/RHO_INV macros
    skinny_128_384_tks tks;                 // TK1 counter + precomputed round tweakeys
    u8 state[BLOCKBYTES], pad[BLOCKBYTES];  // 128-bit rolling state; pad doubles as RHO_INV scratch
    (void)nsec;
    if (clen < TAGBYTES)    // ciphertext must at least contain the tag
        return -1;
    // ----------------- Initialization -----------------
    *mlen = clen - TAGBYTES;
    memset(tks.tk1, 0x00, KEYBYTES);
    memset(state, 0x00, BLOCKBYTES);
    tks.tk1[0] = 0x01; //56-bit LFSR counter, initialized to 1
    // ----------------- Initialization -----------------
    // ----------------- Process the associated data -----------------
    //Handle the special case of no associated data (mirrors encryption)
    if (adlen == 0) {
        UPDATE_CTR(tks.tk1);
        SET_DOMAIN(tks, 0x1A);
        precompute_rtk2_3(tks.rtk2_3, npub, k);
        precompute_rtk1(tks.rtk1, tks.tk1);
        skinny128_384(state, state, tks.rtk1, tks.rtk2_3);
    } else {
        // Process all double blocks except the last
        SET_DOMAIN(tks, 0x08);
        while (adlen > 2*BLOCKBYTES) {
            UPDATE_CTR(tks.tk1);
            XOR_BLOCK(state, state, ad);
            precompute_rtk2_3(tks.rtk2_3, ad + BLOCKBYTES, k);
            precompute_rtk1(tks.rtk1, tks.tk1);
            skinny128_384(state, state, tks.rtk1, tks.rtk2_3);
            UPDATE_CTR(tks.tk1);    // counter also advances over the tweak block
            ad += 2*BLOCKBYTES;
            adlen -= 2*BLOCKBYTES;
        }
        //Pad and process the left-over blocks
        UPDATE_CTR(tks.tk1);
        if (adlen == 2*BLOCKBYTES) {
            // Left-over complete double block
            XOR_BLOCK(state, state, ad);
            precompute_rtk2_3(tks.rtk2_3, ad + BLOCKBYTES, k);
            precompute_rtk1(tks.rtk1, tks.tk1);
            skinny128_384(state, state, tks.rtk1, tks.rtk2_3);
            UPDATE_CTR(tks.tk1);
            SET_DOMAIN(tks, 0x18);
        } else if (adlen > BLOCKBYTES) {
            // Left-over partial double block: pad with the length byte last
            adlen -= BLOCKBYTES;
            XOR_BLOCK(state, state, ad);
            memcpy(pad, ad + BLOCKBYTES, adlen);
            memset(pad + adlen, 0x00, 15 - adlen);
            pad[15] = adlen;
            precompute_rtk2_3(tks.rtk2_3, pad, k);
            precompute_rtk1(tks.rtk1, tks.tk1);
            skinny128_384(state, state, tks.rtk1, tks.rtk2_3);
            UPDATE_CTR(tks.tk1);
            SET_DOMAIN(tks, 0x1A);
        } else if (adlen == BLOCKBYTES) {
            // Left-over complete single block
            XOR_BLOCK(state, state, ad);
            SET_DOMAIN(tks, 0x18);
        } else {
            // Left-over partial single block, padded into the state directly
            for(i =0; i < (int)adlen; i++)
                state[i] ^= ad[i];
            state[15] ^= adlen;
            SET_DOMAIN(tks, 0x1A);
        }
        // Final AD call uses the nonce as TK2
        precompute_rtk2_3(tks.rtk2_3, npub, k);
        precompute_rtk1(tks.rtk1, tks.tk1);
        skinny128_384(state, state, tks.rtk1, tks.rtk2_3);
    }
    // ----------------- Process the associated data -----------------
    // ----------------- Process the ciphertext -----------------
    clen -= TAGBYTES;   // clen now counts ciphertext bytes only
    memset(tks.tk1, 0, KEYBYTES);
    tks.tk1[0] = 0x01; //re-init the 56-bit LFSR counter for the message phase
    if (clen == 0) {
        UPDATE_CTR(tks.tk1);
        SET_DOMAIN(tks, 0x15);  // empty message domain
        precompute_rtk1(tks.rtk1, tks.tk1);
        skinny128_384(state, state, tks.rtk1, tks.rtk2_3);
    } else {
        //process all blocks except the last with the inverse RHO function
        SET_DOMAIN(tks, 0x04);
        while (clen > BLOCKBYTES) {
            RHO_INV(state,c,m);
            UPDATE_CTR(tks.tk1);
            precompute_rtk1(tks.rtk1, tks.tk1);
            skinny128_384(state, state, tks.rtk1, tks.rtk2_3);
            c += BLOCKBYTES;
            m += BLOCKBYTES;
            clen -= BLOCKBYTES;
        }
        //pad and process the last block
        UPDATE_CTR(tks.tk1);
        if (clen < BLOCKBYTES) {
            // byte-wise inverse RHO on the partial block
            for(i = 0; i < (int)clen; i++) {
                m[i] = c[i] ^ (state[i] >> 1) ^ (state[i] & 0x80) ^ (state[i] << 7);
                state[i] ^= m[i];
            }
            state[15] ^= (u8)clen; //padding
            SET_DOMAIN(tks, 0x15);
        } else {
            RHO_INV(state,c,m);
            SET_DOMAIN(tks, 0x14);  // full final block
        }
        precompute_rtk1(tks.rtk1, tks.tk1);
        skinny128_384(state, state, tks.rtk1, tks.rtk2_3);
    }
    // ----------------- Process the ciphertext -----------------
    // ----------------- Generate and check the tag -----------------
    G(state,state);
    tmp = 0;
    for(i = 0; i < TAGBYTES; i++)
        tmp |= state[i] ^ c[clen+i]; //constant-time tag comparison
    // ----------------- Generate and check the tag -----------------
    // Return -1 (not an arbitrary nonzero value) on forgery, as required by
    // the NIST LWC / SUPERCOP crypto_aead API.
    return tmp ? -1 : 0;
}
\ No newline at end of file
#ifndef ROMULUSN1_H_
#define ROMULUSN1_H_
#include "skinny128.h"
typedef unsigned char u8;
typedef unsigned int u32;
// Tweakey material for Romulus-N1: the byte-addressable TK1 (domain byte +
// 56-bit block counter) plus the precomputed round tweakeys for
// SKINNY-128-384 so the key schedule is not recomputed per block.
typedef struct {
u8 tk1[16]; //to manipulate tk1 byte-wise
u32 rtk1[4*16]; //to avoid tk schedule recomputations
u32 rtk2_3[4*SKINNY128_384_ROUNDS]; //all round tweakeys
} skinny_128_384_tks;
#define TAGBYTES 16
#define KEYBYTES 16
#define BLOCKBYTES 16
// Write the domain-separation byte into TK1 byte 7 (just after the counter).
#define SET_DOMAIN(tks, domain) ((tks).tk1[7] = (domain))
//G as defined in the Romulus specification in a 32-bit word-wise manner.
//NOTE: all macros below use GNU statement expressions and rely on a 'u32 tmp'
//variable being declared in the calling scope.
#define G(x,y) ({ \
tmp = ((u32*)(y))[0]; \
((u32*)(x))[0] = (tmp >> 1 & 0x7f7f7f7f) ^ ((tmp ^ (tmp << 7)) & 0x80808080); \
tmp = ((u32*)(y))[1]; \
((u32*)(x))[1] = (tmp >> 1 & 0x7f7f7f7f) ^ ((tmp ^ (tmp << 7)) & 0x80808080); \
tmp = ((u32*)(y))[2]; \
((u32*)(x))[2] = (tmp >> 1 & 0x7f7f7f7f) ^ ((tmp ^ (tmp << 7)) & 0x80808080); \
tmp = ((u32*)(y))[3]; \
((u32*)(x))[3] = (tmp >> 1 & 0x7f7f7f7f) ^ ((tmp ^ (tmp << 7)) & 0x80808080); \
})
//update the counter in tk1 in a 32-bit word-wise manner: one step of the
//56-bit LFSR stored in tk1 bytes 0..6 (byte 7 is the domain byte).
#define UPDATE_CTR(tk1) ({ \
tmp = ((u32*)(tk1))[1]; \
((u32*)(tk1))[1] = (tmp << 1) & 0x00ffffff; \
((u32*)(tk1))[1] |= (((u32*)(tk1))[0] >> 31); \
((u32*)(tk1))[1] |= tmp & 0xff000000; \
((u32*)(tk1))[0] <<= 1; \
if ((tmp >> 23) & 0x01) \
((u32*)(tk1))[0] ^= 0x95; \
})
//x <- y ^ z for 128-bit blocks
#define XOR_BLOCK(x,y,z) ({ \
((u32*)(x))[0] = ((u32*)(y))[0] ^ ((u32*)(z))[0]; \
((u32*)(x))[1] = ((u32*)(y))[1] ^ ((u32*)(z))[1]; \
((u32*)(x))[2] = ((u32*)(y))[2] ^ ((u32*)(z))[2]; \
((u32*)(x))[3] = ((u32*)(y))[3] ^ ((u32*)(z))[3]; \
})
//Rho as defined in the Romulus specification: y = G(x) ^ z, then x ^= z.
//use pad as a tmp variable in case y = z
#define RHO(x,y,z) ({ \
G(pad,x); \
XOR_BLOCK(y, pad, z); \
XOR_BLOCK(x, x, z); \
})
//Rho inverse as defined in the Romulus specification: z = G(x) ^ y, then x ^= z.
//use pad as a tmp variable in case y = z
#define RHO_INV(x, y, z) ({ \
G(pad, x); \
XOR_BLOCK(z, pad, y); \
XOR_BLOCK(x, x, z); \
})
#endif // ROMULUSN1_H_
\ No newline at end of file
/******************************************************************************
* Constant-time implementation of the SKINNY tweakable block ciphers.
*
* This implementation doesn't compute the ShiftRows operation. Some masks and
* shifts are applied during the MixColumns operation so that the proper bits
* are XORed together. Moreover, the row permutation within the MixColumns
* is omitted, as well as the bit permutation at the end of the Sbox. The rows
* are synchronized with the classical after only 4 rounds. Therefore, this
* implementation relies on a "QUADRUPLE_ROUND" routine.
*
* The Sbox computation takes advantage of some symmetry in the 8-bit Sbox to
* turn it into a 4-bit S-box computation. Although the last bit permutation
* within the Sbox is not computed, the bit ordering is synchronized with the
* classical representation after 2 calls.
*
* @author Alexandre Adomnicai, Nanyang Technological University,
* alexandre.adomnicai@ntu.edu.sg
*
* @date May 2020
******************************************************************************/
#include <stdio.h>
#include <string.h>
#include "skinny128.h"
#include "tk_schedule.h"
/******************************************************************************
 * MixColumns for rounds i such that (i % 4) == 0.
 * Each 32-bit word is updated independently: three mask-rotate-XOR steps
 * implement the matrix multiplication in the fixsliced representation.
 ******************************************************************************/
void mixcolumns_0(u32* state) {
    for(int row = 0; row < 4; row++) {
        u32 w = state[row];
        w ^= ROR(ROR(w,24) & 0x0c0c0c0c, 30);
        w ^= ROR(ROR(w,16) & 0xc0c0c0c0, 4);
        w ^= ROR(ROR(w,8) & 0x0c0c0c0c, 2);
        state[row] = w;
    }
}
/******************************************************************************
 * MixColumns for rounds i such that (i % 4) == 1.
 * Each 32-bit word is updated independently: three mask-rotate-XOR steps
 * implement the matrix multiplication in the fixsliced representation.
 ******************************************************************************/
void mixcolumns_1(u32* state) {
    for(int row = 0; row < 4; row++) {
        u32 w = state[row];
        w ^= ROR(ROR(w,16) & 0x30303030, 30);
        w ^= ROR(w & 0x03030303, 28);
        w ^= ROR(ROR(w,16) & 0x30303030, 2);
        state[row] = w;
    }
}
/******************************************************************************
 * MixColumns for rounds i such that (i % 4) == 2.
 * Each 32-bit word is updated independently: three mask-rotate-XOR steps
 * implement the matrix multiplication in the fixsliced representation.
 ******************************************************************************/
void mixcolumns_2(u32* state) {
    for(int row = 0; row < 4; row++) {
        u32 w = state[row];
        w ^= ROR(ROR(w,8) & 0xc0c0c0c0, 6);
        w ^= ROR(ROR(w,16) & 0x0c0c0c0c, 28);
        w ^= ROR(ROR(w,24) & 0xc0c0c0c0, 2);
        state[row] = w;
    }
}
/******************************************************************************
 * MixColumns for rounds i such that (i % 4) == 3.
 * Each 32-bit word is updated independently: three mask-rotate-XOR steps
 * implement the matrix multiplication in the fixsliced representation.
 ******************************************************************************/
void mixcolumns_3(u32* state) {
    for(int row = 0; row < 4; row++) {
        u32 w = state[row];
        w ^= ROR(w & 0x03030303, 30);
        w ^= ROR(w & 0x30303030, 4);
        w ^= ROR(w & 0x03030303, 26);
        state[row] = w;
    }
}
/******************************************************************************
 * Encryption of a single block without any operation mode using SKINNY-128-384.
 * RTK1 and RTK2_3 are given separately to take advantage of the fact that
 * TK2 and TK3 remain the same through the entire data encryption/decryption.
 * 56 rounds are executed as 14 quadruple rounds; rtk1 only stores 16 rounds
 * of material (the TK1 schedule has period 16) and is reused cyclically,
 * while rtk2_3 holds all 56 round tweakeys.
 ******************************************************************************/
void skinny128_384(u8* ctext, const u8* ptext, const u32* rtk1, const u32* rtk2_3) {
    u32 tmp;        // scratch required by SWAPMOVE inside QUADRUPLE_ROUND
    u32 state[4];   // 128-bit state
    packing(state, ptext);  // from byte to bitsliced representation
    for(int r = 0; r < 14; r++)
        QUADRUPLE_ROUND(state, rtk1 + 16*(r & 3), rtk2_3 + 16*r);
    unpacking(ctext, state);    // from bitsliced to byte representation
}
\ No newline at end of file
#ifndef SKINNY128_H_
#define SKINNY128_H_
typedef unsigned char u8;
typedef unsigned int u32;
#define SKINNY128_384_ROUNDS 56
//Four consecutive SKINNY-128-384 rounds in the fixsliced representation.
//Each round is: bitsliced 8-bit Sbox (built from NOR/XOR layers and
//SWAPMOVE bit swaps), round-tweakey addition (rtk1 and rtk2_3 are XORed
//separately), then the MixColumns variant matching the round index mod 4.
//ShiftRows is never computed explicitly; it is absorbed into the four
//mixcolumns_* variants, and the state realigns with the classical
//representation every 4 rounds.
//NOTE: uses a GNU statement expression and relies on a 'u32 tmp' variable
//declared in the calling scope (consumed by SWAPMOVE).
#define QUADRUPLE_ROUND(state, rtk1, rtk2_3) ({ \
state[3] ^= ~(state[0] | state[1]); \
SWAPMOVE(state[2], state[1], 0x55555555, 1); \
SWAPMOVE(state[3], state[2], 0x55555555, 1); \
state[1] ^= ~(state[2] | state[3]); \
SWAPMOVE(state[1], state[0], 0x55555555, 1); \
SWAPMOVE(state[0], state[3], 0x55555555, 1); \
state[3] ^= ~(state[0] | state[1]); \
SWAPMOVE(state[2], state[1], 0x55555555, 1); \
SWAPMOVE(state[3], state[2], 0x55555555, 1); \
state[1] ^= (state[2] | state[3]); \
SWAPMOVE(state[3], state[0], 0x55555555, 0); \
state[0] ^= (rtk1)[0]; \
state[1] ^= (rtk1)[1]; \
state[2] ^= (rtk1)[2]; \
state[3] ^= (rtk1)[3]; \
state[0] ^= (rtk2_3)[0]; \
state[1] ^= (rtk2_3)[1]; \
state[2] ^= (rtk2_3)[2]; \
state[3] ^= (rtk2_3)[3]; \
mixcolumns_0(state); \
state[1] ^= ~(state[2] | state[3]); \
SWAPMOVE(state[1], state[0], 0x55555555, 1); \
SWAPMOVE(state[0], state[3], 0x55555555, 1); \
state[3] ^= ~(state[0] | state[1]); \
SWAPMOVE(state[2], state[1], 0x55555555, 1); \
SWAPMOVE(state[3], state[2], 0x55555555, 1); \
state[1] ^= ~(state[2] | state[3]); \
SWAPMOVE(state[1], state[0], 0x55555555, 1); \
SWAPMOVE(state[0], state[3], 0x55555555, 1); \
state[3] ^= (state[0] | state[1]); \
SWAPMOVE(state[1], state[2], 0x55555555, 0); \
state[0] ^= (rtk1)[4]; \
state[1] ^= (rtk1)[5]; \
state[2] ^= (rtk1)[6]; \
state[3] ^= (rtk1)[7]; \
state[0] ^= (rtk2_3)[4]; \
state[1] ^= (rtk2_3)[5]; \
state[2] ^= (rtk2_3)[6]; \
state[3] ^= (rtk2_3)[7]; \
mixcolumns_1(state); \
state[3] ^= ~(state[0] | state[1]); \
SWAPMOVE(state[2], state[1], 0x55555555, 1); \
SWAPMOVE(state[3], state[2], 0x55555555, 1); \
state[1] ^= ~(state[2] | state[3]); \
SWAPMOVE(state[1], state[0], 0x55555555, 1); \
SWAPMOVE(state[0], state[3], 0x55555555, 1); \
state[3] ^= ~(state[0] | state[1]); \
SWAPMOVE(state[2], state[1], 0x55555555, 1); \
SWAPMOVE(state[3], state[2], 0x55555555, 1); \
state[1] ^= (state[2] | state[3]); \
SWAPMOVE(state[3], state[0], 0x55555555, 0); \
state[0] ^= (rtk1)[8]; \
state[1] ^= (rtk1)[9]; \
state[2] ^= (rtk1)[10]; \
state[3] ^= (rtk1)[11]; \
state[0] ^= (rtk2_3)[8]; \
state[1] ^= (rtk2_3)[9]; \
state[2] ^= (rtk2_3)[10]; \
state[3] ^= (rtk2_3)[11]; \
mixcolumns_2(state); \
state[1] ^= ~(state[2] | state[3]); \
SWAPMOVE(state[1], state[0], 0x55555555, 1); \
SWAPMOVE(state[0], state[3], 0x55555555, 1); \
state[3] ^= ~(state[0] | state[1]); \
SWAPMOVE(state[2], state[1], 0x55555555, 1); \
SWAPMOVE(state[3], state[2], 0x55555555, 1); \
state[1] ^= ~(state[2] | state[3]); \
SWAPMOVE(state[1], state[0], 0x55555555, 1); \
SWAPMOVE(state[0], state[3], 0x55555555, 1); \
state[3] ^= (state[0] | state[1]); \
SWAPMOVE(state[1], state[2], 0x55555555, 0); \
state[0] ^= (rtk1)[12]; \
state[1] ^= (rtk1)[13]; \
state[2] ^= (rtk1)[14]; \
state[3] ^= (rtk1)[15]; \
state[0] ^= (rtk2_3)[12]; \
state[1] ^= (rtk2_3)[13]; \
state[2] ^= (rtk2_3)[14]; \
state[3] ^= (rtk2_3)[15]; \
mixcolumns_3(state); \
})
//Single-block SKINNY-128-384 encryption; rtk1 holds 16 rounds of TK1
//material (reused cyclically), rtk2_3 all 56 rounds of TK2^TK3^rconst.
void skinny128_384(u8* ctext, const u8* ptext, const u32* rtk1, const u32* rtk2_3);
#endif // SKINNY128_H_
\ No newline at end of file
/******************************************************************************
* Implementation of the SKINNY tweakey schedule to match fixslicing.
*
* @author Alexandre Adomnicai, Nanyang Technological University,
* alexandre.adomnicai@ntu.edu.sg
*
* @date May 2020
******************************************************************************/
#include <stdio.h>
#include <string.h> //for memcmp
#include "tk_schedule.h"
#include "skinny128.h"
typedef unsigned char u8;
typedef unsigned int u32;
/******************************************************************************
 * The round constants according to the new (fixsliced/bitsliced)
 * representation: 4 precomputed 32-bit words per round for all 56 rounds
 * (224 words total). They are XORed into the round tweakeys once in
 * precompute_rtk2_3, so the round function never adds constants itself.
 * Words with high Hamming weight (0xff...-style) encode the NOT gates that
 * were folded out of the Sbox layer.
 ******************************************************************************/
u32 rconst_32_bs[224] = {
0x00000004, 0xffffffbf, 0x00000000, 0x00000000, 0x00000000, 0x00000000,
0x10000100, 0xfffffeff, 0x44000000, 0xfbffffff, 0x00000000, 0x04000000,
0x00100000, 0x00100000, 0x00100001, 0xffefffff, 0x00440000, 0xffafffff,
0x00400000, 0x00400000, 0x01000000, 0x01000000, 0x01401000, 0xffbfffff,
0x01004000, 0xfefffbff, 0x00000400, 0x00000400, 0x00000010, 0x00000000,
0x00010410, 0xfffffbef, 0x00000054, 0xffffffaf, 0x00000000, 0x00000040,
0x00000100, 0x00000100, 0x10000140, 0xfffffeff, 0x44000000, 0xfffffeff,
0x04000000, 0x04000000, 0x00100000, 0x00100000, 0x04000001, 0xfbffffff,
0x00140000, 0xffafffff, 0x00400000, 0x00000000, 0x00000000, 0x00000000,
0x01401000, 0xfebfffff, 0x01004400, 0xfffffbff, 0x00000000, 0x00000400,
0x00000010, 0x00000010, 0x00010010, 0xffffffff, 0x00000004, 0xffffffaf,
0x00000040, 0x00000040, 0x00000100, 0x00000000, 0x10000140, 0xffffffbf,
0x40000100, 0xfbfffeff, 0x00000000, 0x04000000, 0x00100000, 0x00000000,
0x04100001, 0xffefffff, 0x00440000, 0xffefffff, 0x00000000, 0x00400000,
0x01000000, 0x01000000, 0x00401000, 0xffffffff, 0x00004000, 0xfeffffff,
0x00000400, 0x00000000, 0x00000000, 0x00000000, 0x00010400, 0xfffffbff,
0x00000014, 0xffffffbf, 0x00000000, 0x00000000, 0x00000000, 0x00000000,
0x10000100, 0xffffffff, 0x40000000, 0xfbffffff, 0x00000000, 0x04000000,
0x00100000, 0x00000000, 0x00100001, 0xffefffff, 0x00440000, 0xffafffff,
0x00000000, 0x00400000, 0x01000000, 0x01000000, 0x01401000, 0xffffffff,
0x00004000, 0xfeffffff, 0x00000400, 0x00000400, 0x00000010, 0x00000000,
0x00010400, 0xfffffbff, 0x00000014, 0xffffffaf, 0x00000000, 0x00000000,
0x00000000, 0x00000000, 0x10000140, 0xfffffeff, 0x44000000, 0xffffffff,
0x00000000, 0x04000000, 0x00100000, 0x00100000, 0x00000001, 0xffefffff,
0x00440000, 0xffafffff, 0x00400000, 0x00000000, 0x00000000, 0x01000000,
0x01401000, 0xffbfffff, 0x01004000, 0xfffffbff, 0x00000400, 0x00000400,
0x00000010, 0x00000000, 0x00010010, 0xfffffbff, 0x00000014, 0xffffffef,
0x00000000, 0x00000040, 0x00000100, 0x00000000, 0x10000040, 0xfffffeff,
0x44000000, 0xfffffeff, 0x00000000, 0x00000000, 0x00000000, 0x00100000,
0x04000001, 0xffffffff, 0x00040000, 0xffffffff, 0x00400000, 0x00000000,
0x00000000, 0x00000000, 0x00001000, 0xfebfffff, 0x01004400, 0xffffffff,
0x00000000, 0x00000000, 0x00000000, 0x00000010, 0x00010000, 0xffffffff,
0x00000004, 0xffffffbf, 0x00000040, 0x00000000, 0x00000000, 0x00000000,
0x10000100, 0xfffffebf, 0x44000100, 0xffffffff, 0x00000000, 0x04000000,
0x00100000, 0x00100000, 0x00000001, 0xffffffff, 0x00040000, 0xffafffff,
0x00400000, 0x00000000, 0x00000000, 0x00000000, 0x01401000, 0xffbfffff,
0x01004000, 0xfffffbff, 0x00000000, 0x00000400, 0x00000010, 0x00000000,
0x00010010, 0xffffffff
};
/******************************************************************************
 * Pack the input into the bitsliced representation.
 * Resulting bit layout (bit indices of the original 128-bit block):
 * 24 28 56 60 88 92 120 124 | ... | 0 4 32 36 64 68 96 100
 * 25 29 57 61 89 93 121 125 | ... | 1 5 33 37 65 69 97 101
 * 26 30 58 62 90 94 122 126 | ... | 2 6 34 38 66 70 98 102
 * 27 31 59 63 91 95 123 127 | ... | 3 7 35 39 67 71 99 103
 * The SWAPMOVE sequence is order-dependent; 'tmp' is consumed by the macro.
 ******************************************************************************/
void packing(u32* out, const u8* in) {
u32 tmp;
// little-endian loads; note rows 1 and 2 are swapped (words 8.. and 4..)
LE_LOAD(out, in);
LE_LOAD(out + 1, in + 8);
LE_LOAD(out + 2, in + 4);
LE_LOAD(out + 3, in + 12);
// intra-word bit shuffle, then inter-word 2-bit group transposition
SWAPMOVE(out[0], out[0], 0x0a0a0a0a, 3);
SWAPMOVE(out[1], out[1], 0x0a0a0a0a, 3);
SWAPMOVE(out[2], out[2], 0x0a0a0a0a, 3);
SWAPMOVE(out[3], out[3], 0x0a0a0a0a, 3);
SWAPMOVE(out[2], out[0], 0x30303030, 2);
SWAPMOVE(out[1], out[0], 0x0c0c0c0c, 4);
SWAPMOVE(out[3], out[0], 0x03030303, 6);
SWAPMOVE(out[1], out[2], 0x0c0c0c0c, 2);
SWAPMOVE(out[3], out[2], 0x03030303, 4);
SWAPMOVE(out[3], out[1], 0x03030303, 2);
}
/******************************************************************************
 * Unpack the input to a byte-wise representation.
 * Exact inverse of packing(): the SWAPMOVE sequence is run in reverse order
 * (SWAPMOVE is an involution), then the words are stored little-endian with
 * rows 1 and 2 swapped back. Note: 'in' is modified in place.
 ******************************************************************************/
void unpacking(u8* out, u32 *in) {
u32 tmp;
SWAPMOVE(in[3], in[1], 0x03030303, 2);
SWAPMOVE(in[3], in[2], 0x03030303, 4);
SWAPMOVE(in[1], in[2], 0x0c0c0c0c, 2);
SWAPMOVE(in[3], in[0], 0x03030303, 6);
SWAPMOVE(in[1], in[0], 0x0c0c0c0c, 4);
SWAPMOVE(in[2], in[0], 0x30303030, 2);
SWAPMOVE(in[0], in[0], 0x0a0a0a0a, 3);
SWAPMOVE(in[1], in[1], 0x0a0a0a0a, 3);
SWAPMOVE(in[2], in[2], 0x0a0a0a0a, 3);
SWAPMOVE(in[3], in[3], 0x0a0a0a0a, 3);
LE_STORE(out, in[0]);
LE_STORE(out + 8, in[1]);
LE_STORE(out + 4, in[2]);
LE_STORE(out + 12, in[3]);
}
/******************************************************************************
 * Bitsliced LFSR for TK2. Slice rotation:
 * 0 4      1 5
 * 1 5 ---> 2 6
 * 2 6      3 7
 * 3 7      4 0
 * The feedback slice is computed first, the remaining slices shift down.
 ******************************************************************************/
void lfsr2_bs(u32* tk) {
    u32 feedback = tk[0] ^ (tk[2] & 0xaaaaaaaa);
    // swap even/odd bit lanes of the feedback word
    feedback = ((feedback & 0xaaaaaaaa) >> 1) | ((feedback << 1) & 0xaaaaaaaa);
    memmove(tk, tk + 1, 3 * sizeof(u32));   // tk[0..2] <- tk[1..3]
    tk[3] = feedback;
}
/******************************************************************************
 * Bitsliced LFSR for TK3. Slice rotation:
 * 0 4      7 3
 * 1 5 ---> 0 4
 * 2 6      1 5
 * 3 7      2 6
 * The feedback slice is computed first, the remaining slices shift up.
 ******************************************************************************/
void lfsr3_bs(u32* tk) {
    u32 feedback = tk[3] ^ ((tk[1] & 0xaaaaaaaa) >> 1);
    // swap even/odd bit lanes of the feedback word
    feedback = ((feedback & 0xaaaaaaaa) >> 1) | ((feedback << 1) & 0xaaaaaaaa);
    memmove(tk + 1, tk, 3 * sizeof(u32));   // tk[1..3] <- tk[0..2]
    tk[0] = feedback;
}
/******************************************************************************
 * Apply the tweakey permutation P in a bitsliced manner, twice (P^2).
 * Pure bit relocation per 32-bit word; the mask/shift constants are exact
 * and must not be altered.
 ******************************************************************************/
void permute_tk_2(u32* tk) {
u32 tmp;
for(int i =0; i < 4; i++) {
tmp = tk[i];
tk[i] = ROR(tmp,14) & 0xcc00cc00;
tk[i] |= (tmp & 0x000000ff) << 16;
tk[i] |= (tmp & 0xcc000000)>> 2;
tk[i] |= (tmp & 0x0033cc00) >> 8;
tk[i] |= (tmp & 0x00cc0000) >>18;
}
}
/******************************************************************************
 * Apply the tweakey permutation P in a bitsliced manner, 4 times (P^4).
 * Pure bit relocation per 32-bit word; the mask/shift constants are exact
 * and must not be altered.
 ******************************************************************************/
void permute_tk_4(u32* tk) {
u32 tmp;
for(int i =0; i < 4; i++) {
tmp = tk[i];
tk[i] = ROR(tmp,22) & 0xcc0000cc;
tk[i] |= ROR(tmp,16) & 0x3300cc00;
tk[i] |= ROR(tmp, 24) & 0x00cc3300;
tk[i] |= (tmp & 0x00cc00cc) >> 2;
}
}
/******************************************************************************
 * Apply the tweakey permutation P in a bitsliced manner, 6 times (P^6).
 * Pure bit relocation per 32-bit word; the mask/shift constants are exact
 * and must not be altered.
 ******************************************************************************/
void permute_tk_6(u32* tk) {
u32 tmp;
for(int i =0; i < 4; i++) {
tmp = tk[i];
tk[i] = ROR(tmp,6) & 0xcccc0000;
tk[i] |= ROR(tmp,24) & 0x330000cc;
tk[i] |= ROR(tmp,10) & 0x3333;
tk[i] |= (tmp & 0xcc) << 14;
tk[i] |= (tmp & 0x3300) << 2;
}
}
/******************************************************************************
 * Apply the tweakey permutation P in a bitsliced manner, 8 times (P^8).
 * Pure bit relocation per 32-bit word; the mask/shift constants are exact
 * and must not be altered.
 ******************************************************************************/
void permute_tk_8(u32* tk) {
u32 tmp;
for(int i =0; i < 4; i++) {
tmp = tk[i];
tk[i] = ROR(tmp,24) & 0xcc000033;
tk[i] |= ROR(tmp,8) & 0x33cc0000;
tk[i] |= ROR(tmp,26) & 0x00333300;
tk[i] |= (tmp & 0x00333300) >> 6;
}
}
/******************************************************************************
 * Apply the tweakey permutation P in a bitsliced manner, 10 times (P^10).
 * Pure bit relocation per 32-bit word; the mask/shift constants are exact
 * and must not be altered.
 ******************************************************************************/
void permute_tk_10(u32* tk) {
u32 tmp;
for(int i =0; i < 4; i++) {
tmp = tk[i];
tk[i] = ROR(tmp,8) & 0xcc330000;
tk[i] |= ROR(tmp,26) & 0x33000033;
tk[i] |= ROR(tmp,22) & 0x00cccc00;
tk[i] |= (tmp & 0x00330000) >> 14;
tk[i] |= (tmp & 0xcc00) >> 2;
}
}
/******************************************************************************
 * Apply the tweakey permutation P in a bitsliced manner, 12 times (P^12).
 * Pure bit relocation per 32-bit word; the mask/shift constants are exact
 * and must not be altered.
 ******************************************************************************/
void permute_tk_12(u32* tk) {
u32 tmp;
for(int i =0; i < 4; i++) {
tmp = tk[i];
tk[i] = ROR(tmp,8) & 0xcc33;
tk[i] |= ROR(tmp,30) & 0x00cc00cc;
tk[i] |= ROR(tmp,10) & 0x33330000;
tk[i] |= ROR(tmp,16) & 0xcc003300;
}
}
/******************************************************************************
 * Apply the tweakey permutation P in a bitsliced manner, 14 times (P^14).
 * Pure bit relocation per 32-bit word; the mask/shift constants are exact
 * and must not be altered.
 ******************************************************************************/
void permute_tk_14(u32* tk) {
u32 tmp;
for(int i =0; i < 4; i++) {
tmp = tk[i];
tk[i] = ROR(tmp,24) & 0x0033cc00;
tk[i] |= ROR(tmp,14) & 0x00cc0000;
tk[i] |= ROR(tmp,30) & 0xcc000000;
tk[i] |= ROR(tmp,16) & 0x000000ff;
tk[i] |= ROR(tmp,18) & 0x33003300;
}
}
/******************************************************************************
 * Precompute all LFSR iterations on TK2 into 'tk'.
 * The LFSR advances every 2 rounds, so the loop strides by 2 and writes the
 * 16-byte result at word offset i*4+4 (i.e. offsets 4, 12, 20, ...). The
 * 4-word slots in between (8, 16, 24, ...) are deliberately left untouched;
 * precompute_rtk2_3 relies on slot 8 remaining all-zero.
 ******************************************************************************/
void precompute_lfsr_tk2(u32* tk, const u8* key, const int rounds) {
u32 tk2[4];
packing(tk2, key);
memcpy(tk, tk2, 16);
for(int i = 0 ; i < rounds; i+=2) {
lfsr2_bs(tk2);
memcpy(tk+i*4+4, tk2, 16);
}
}
/******************************************************************************
 * Precompute all LFSR iterations on TK3, XORing them on top of the TK2
 * values written by precompute_lfsr_tk2 (same word offsets: 0, then
 * i*4+4 for even i). 'tk' therefore accumulates LFSR2(TK2) ^ LFSR3(TK3).
 ******************************************************************************/
void precompute_lfsr_tk3(u32* tk, const u8* key, const int rounds) {
u32 tk3[4];
packing(tk3, key);
tk[0] ^= tk3[0];
tk[1] ^= tk3[1];
tk[2] ^= tk3[2];
tk[3] ^= tk3[3];
for(int i = 0 ; i < rounds; i+=2) {
lfsr3_bs(tk3);
tk[i*4+4] ^= tk3[0];
tk[i*4+5] ^= tk3[1];
tk[i*4+6] ^= tk3[2];
tk[i*4+7] ^= tk3[3];
}
}
/******************************************************************************
 * XOR TK with TK1 before applying the permutations.
 * The key is then rearranged to match the barrel shiftrows representation.
 * Processes 8 rounds per iteration: for each pair of rounds it XORs in the
 * packed TK1, applies the appropriate power of the tweakey permutation P
 * (P^2/P^4/P^6/P^8 on even 16-round halves, P^10/P^12/P^14 on odd ones),
 * and extracts only the tweakey rows actually consumed by the fixsliced
 * round function (hence the 0xf0f0f0f0 / 0xc3c3c3c3 / ... row masks).
 * The mask and rotation constants are exact and order-critical.
 ******************************************************************************/
void permute_tk(u32* tk, const u8* key, const int rounds) {
u32 test;   // 1 while inside the first half of a 16-round period (P^0..P^8)
u32 tk1[4], tmp[4];
packing(tk1, key);
memcpy(tmp, tk, 16);
tmp[0] ^= tk1[0];
tmp[1] ^= tk1[1];
tmp[2] ^= tk1[2];
tmp[3] ^= tk1[3];
for(int i = 0 ; i < rounds; i += 8) {
test = (i % 16 < 8) ? 1 : 0; //to apply the right power of P
// rounds i, i+1: P^0 (tmp already holds the XORed value)
tk[i*4] = tmp[2] & 0xf0f0f0f0;
tk[i*4+1] = tmp[3] & 0xf0f0f0f0;
tk[i*4+2] = tmp[0] & 0xf0f0f0f0;
tk[i*4+3] = tmp[1] & 0xf0f0f0f0;
// rounds i+2, i+3
memcpy(tmp, tk+i*4+4, 16);
XOR_BLOCKS(tmp, tk1);
if (test)
permute_tk_2(tmp); // applies P^2
else
permute_tk_10(tmp); // applies P^10
tk[i*4+4] = ROR(tmp[0],26) & 0xc3c3c3c3;
tk[i*4+5] = ROR(tmp[1],26) & 0xc3c3c3c3;
tk[i*4+6] = ROR(tmp[2],26) & 0xc3c3c3c3;
tk[i*4+7] = ROR(tmp[3],26) & 0xc3c3c3c3;
tk[i*4+8] = ROR(tmp[2],28) & 0x03030303;
tk[i*4+8] |= ROR(tmp[2],12) & 0x0c0c0c0c;
tk[i*4+9] = ROR(tmp[3],28) & 0x03030303;
tk[i*4+9] |= ROR(tmp[3],12) & 0x0c0c0c0c;
tk[i*4+10] = ROR(tmp[0],28) & 0x03030303;
tk[i*4+10] |= ROR(tmp[0],12) & 0x0c0c0c0c;
tk[i*4+11] = ROR(tmp[1],28) & 0x03030303;
tk[i*4+11] |= ROR(tmp[1],12) & 0x0c0c0c0c;
// rounds i+4, i+5
memcpy(tmp, tk+i*4+12, 16);
XOR_BLOCKS(tmp, tk1);
if (test)
permute_tk_4(tmp); // applies P^4
else
permute_tk_12(tmp); // applies P^12
for(int j = 0; j < 4; j++) {
tk[i*4+12+j] = ROR(tmp[j],14) & 0x30303030;
tk[i*4+12+j] |= ROR(tmp[j],6) & 0x0c0c0c0c;
}
tk[i*4+16] = ROR(tmp[2], 16) & 0xf0f0f0f0;
tk[i*4+17] = ROR(tmp[3], 16) & 0xf0f0f0f0;
tk[i*4+18] = ROR(tmp[0], 16) & 0xf0f0f0f0;
tk[i*4+19] = ROR(tmp[1], 16) & 0xf0f0f0f0;
// rounds i+6, i+7
memcpy(tmp, tk+i*4+20, 16);
XOR_BLOCKS(tmp, tk1);
if (test)
permute_tk_6(tmp); // applies P^6
else
permute_tk_14(tmp); // applies P^14
tk[i*4+20] = ROR(tmp[0], 10) & 0xc3c3c3c3;
tk[i*4+21] = ROR(tmp[1], 10) & 0xc3c3c3c3;
tk[i*4+22] = ROR(tmp[2], 10) & 0xc3c3c3c3;
tk[i*4+23] = ROR(tmp[3], 10) & 0xc3c3c3c3;
tk[i*4+24] = ROR(tmp[2],12) & 0x03030303;
tk[i*4+24] |= ROR(tmp[2],28) & 0x0c0c0c0c;
tk[i*4+25] = ROR(tmp[3],12) & 0x03030303;
tk[i*4+25] |= ROR(tmp[3],28) & 0x0c0c0c0c;
tk[i*4+26] = ROR(tmp[0],12) & 0x03030303;
tk[i*4+26] |= ROR(tmp[0],28) & 0x0c0c0c0c;
tk[i*4+27] = ROR(tmp[1],12) & 0x03030303;
tk[i*4+27] |= ROR(tmp[1],28) & 0x0c0c0c0c;
// rounds i+8, i+9 (P^8 only in the first half; second half is P^16 = P^0)
memcpy(tmp, tk+i*4+28, 16);
XOR_BLOCKS(tmp, tk1);
if (test)
permute_tk_8(tmp); // applies P^8
for(int j = 0; j < 4; j++) {
tk[i*4+28+j] = ROR(tmp[j],30) & 0x30303030;
tk[i*4+28+j] |= ROR(tmp[j],22) & 0x0c0c0c0c;
}
if (test && (i+8 < rounds)) { //only if next loop iteration
tk[i*4+32] = tmp[2] & 0xf0f0f0f0;
tk[i*4+33] = tmp[3] & 0xf0f0f0f0;
tk[i*4+34] = tmp[0] & 0xf0f0f0f0;
tk[i*4+35] = tmp[1] & 0xf0f0f0f0;
}
}
}
/******************************************************************************
 * Precompute LFSR2(TK2) ^ LFSR3(TK3) ^ rconst for all rounds.
 ******************************************************************************/
void precompute_rtk2_3(u32* rtk, const u8* tk2, const u8 * tk3) {
memset(rtk, 0x00, 16*SKINNY128_384_ROUNDS);
precompute_lfsr_tk2(rtk, tk2, SKINNY128_384_ROUNDS);
precompute_lfsr_tk3(rtk, tk3, SKINNY128_384_ROUNDS);
// rtk+8 points at a 16-byte slot the two LFSR precomputations never write
// (they fill offsets 0, 4, 12, 20, ...), so after the memset it still holds
// zeros; packing an all-zero "TK1" makes permute_tk rearrange rtk in place
// without XORing in any extra key material.
permute_tk(rtk, (u8*)(rtk+8), SKINNY128_384_ROUNDS);
for(int i = 0; i < SKINNY128_384_ROUNDS; i++) { // add rconsts
for(int j = 0; j < 4; j++)
rtk[i*4+j] ^= rconst_32_bs[i*4+j];
}
}
/******************************************************************************
 * Precompute RTK1: the TK1 round-tweakey material for 16 rounds (the TK1
 * schedule has period 16, so callers reuse this array cyclically). TK1 has
 * no LFSR, hence only the permutation/rearrangement step is needed.
 ******************************************************************************/
void precompute_rtk1(u32* rtk1, const u8* tk1) {
memset(rtk1, 0x00, 16*16);
permute_tk(rtk1, tk1, 16);
}
\ No newline at end of file
#ifndef TK_SCHEDULE_H_
#define TK_SCHEDULE_H_
// Helpers for the fixsliced SKINNY-128-384 tweakey schedule.
typedef unsigned char u8;
typedef unsigned int u32;
void packing(u32* out, const u8* in);
void unpacking(u8* out, u32 *in);
void precompute_rtk2_3(u32* rtk, const u8* tk2, const u8* tk3);
void precompute_rtk1(u32* rtk1, const u8* tk1);
// 32-bit right rotation (y must be in 1..31: a 32-bit shift would be UB)
#define ROR(x,y) (((x) >> (y)) | ((x) << (32 - (y))))
// x ^= y for 4-word (128-bit) blocks; GNU statement expression
#define XOR_BLOCKS(x,y) ({ \
(x)[0] ^= (y)[0]; \
(x)[1] ^= (y)[1]; \
(x)[2] ^= (y)[2]; \
(x)[3] ^= (y)[3]; \
})
// Swap the bits of 'a' and 'b' selected by 'mask' at distance 'n'
// (classic bitslicing primitive). Relies on a 'u32 tmp' in the caller scope.
#define SWAPMOVE(a, b, mask, n) ({ \
tmp = (b ^ (a >> n)) & mask; \
b ^= tmp; \
a ^= (tmp << n); \
})
// Little-endian load of 4 bytes from y into *x.
// Wrapped in do/while(0) so the macro behaves as a single statement.
#define LE_LOAD(x, y) do { \
*(x) = (((u32)(y)[3] << 24) | \
((u32)(y)[2] << 16) | \
((u32)(y)[1] << 8) | \
(y)[0]); \
} while(0)
// Little-endian store of y into 4 bytes at x.
// do/while(0) is required: the body is 4 statements, and without the wrapper
// 'if (c) LE_STORE(...);' would only guard the first byte (CERT PRE10-C).
#define LE_STORE(x, y) do { \
(x)[0] = (y) & 0xff; \
(x)[1] = ((y) >> 8) & 0xff; \
(x)[2] = ((y) >> 16) & 0xff; \
(x)[3] = (y) >> 24; \
} while(0)
#endif // TK_SCHEDULE_H_
\ No newline at end of file
This source diff could not be displayed because it is too large. You can view the blob instead.
// NIST LWC / SUPERCOP API parameters: 128-bit key, nonce and tag,
// no secret message number, and no overlap between input/output buffers.
#define CRYPTO_KEYBYTES 16
#define CRYPTO_NSECBYTES 0
#define CRYPTO_NPUBBYTES 16
#define CRYPTO_ABYTES 16
#define CRYPTO_NOOVERLAP 1
// AEAD encryption: writes mlen + CRYPTO_ABYTES bytes to c, sets *clen.
// Returns 0 on success.
int crypto_aead_encrypt(
unsigned char *c, unsigned long long *clen,
const unsigned char *m, unsigned long long mlen,
const unsigned char *ad, unsigned long long adlen,
const unsigned char *nsec,
const unsigned char *npub,
const unsigned char *k
);
// AEAD decryption with tag verification: writes clen - CRYPTO_ABYTES bytes
// to m, sets *mlen. Returns 0 on success, nonzero on authentication failure.
int crypto_aead_decrypt(
unsigned char *m, unsigned long long *mlen,
unsigned char *nsec,
const unsigned char *c, unsigned long long clen,
const unsigned char *ad, unsigned long long adlen,
const unsigned char *npub,
const unsigned char *k
);
/******************************************************************************
* Constant-time implementation of SKINNY-AEAD-M1 (v1.1).
*
* Two blocks are treated in parallel with SKINNY-128-384 whenever possible.
*
* For more details, see the paper at: https://
*
* @author Alexandre Adomnicai, Nanyang Technological University,
* alexandre.adomnicai@ntu.edu.sg
*
* @date May 2020
******************************************************************************/
#include "skinny128.h"
#include "skinnyaead.h"
#include <string.h>
#include <stdio.h>
/******************************************************************************
 * x ^= y where x, y are 128-bit blocks (16 bytes array).
 ******************************************************************************/
static void xor_block(u8 * x, const u8* y) {
    const u8 *stop = y + BLOCKBYTES;
    while (y != stop)
        *x++ ^= *y++;
}
/******************************************************************************
 * Encryption and authentication using SKINNY-AEAD-M1.
 *
 * c     output: ciphertext followed by the TAGBYTES-byte tag
 * clen  output: set to mlen + TAGBYTES
 * m     input plaintext (mlen bytes)
 * ad    associated data (adlen bytes), authenticated but not encrypted
 * nsec  unused (CRYPTO_NSECBYTES == 0)
 * npub  public nonce (loaded into TK2 by the tweakey schedule)
 * k     key (loaded into TK3 by the tweakey schedule)
 *
 * Returns 0.
 ******************************************************************************/
int crypto_aead_encrypt (unsigned char *c, unsigned long long *clen,
    const unsigned char *m, unsigned long long mlen,
    const unsigned char *ad, unsigned long long adlen,
    const unsigned char *nsec,
    const unsigned char *npub,
    const unsigned char *k) {
    u64 i,lfsr = 1;                         // block-counter LFSR, fed into TK1
    u32 rtk1[4*16];                         // round tweakeys derived from TK1
    u32 rtk2_3[4*SKINNY128_384_ROUNDS];     // precomputed LFSR2(TK2)^LFSR3(TK3)
    u8 tmp[BLOCKBYTES], auth[BLOCKBYTES], sum[BLOCKBYTES];
    (void)nsec;                             // no secret message number
    // ----------------- Initialization -----------------
    *clen = mlen + TAGBYTES;
    tkschedule_lfsr(rtk2_3, npub, k, SKINNY128_384_ROUNDS);
    tkschedule_perm(rtk2_3);
    memset(tmp, 0x00, BLOCKBYTES);
    memset(auth, 0x00, BLOCKBYTES);
    memset(sum, 0x00, BLOCKBYTES);
    // ----------------- Initialization -----------------
    // ----------------- Process the plaintext -----------------
    while (mlen >= BLOCKBYTES) { // while entire blocks to process
        LE_STR_64(tmp, lfsr);
        tkschedule_perm_tk1(rtk1, tmp); // precompute RTK1 given the LFSR
        skinny128_384(c, rtk2_3, m, rtk1);
        xor_block(sum, m); // sum for tag computation
        mlen -= BLOCKBYTES;
        c += BLOCKBYTES;
        m += BLOCKBYTES;
        UPDATE_LFSR(lfsr); // update lfsr for next block
    }
    SET_DOMAIN(tmp, 0x04); // domain for tag computation (full final block)
    if (mlen > 0) { // last block is partial
        LE_STR_64(tmp, lfsr); // lfsr for last block
        SET_DOMAIN(tmp, 0x01); // domain for padding
        for(i = 0; i < mlen; i++)
            sum[i] ^= m[i]; // sum for tag computation
        sum[i] ^= 0x80; // 10* padding of the checksum
        tkschedule_perm_tk1(rtk1, tmp);
        skinny128_384(auth, rtk2_3, auth, rtk1); // encrypt 'auth' = 0^16
        for(i = 0; i < mlen; i++)
            c[i] = auth[i] ^ m[i]; // encrypted padded block (stream-cipher style)
        c += mlen;
        SET_DOMAIN(tmp, 0x05); // domain for tag computation (partial final block)
        UPDATE_LFSR(lfsr);
    }
    LE_STR_64(tmp, lfsr); // lfsr for tag computation
    tkschedule_perm_tk1(rtk1, tmp);
    skinny128_384(sum, rtk2_3, sum, rtk1); // compute the tag
    memcpy(c, sum, TAGBYTES); // c now points right past the ciphertext
    // ----------------- Process the plaintext -----------------
    // ----------------- Process the associated data -----------------
    lfsr = 1; // restart the block counter for the AD
    SET_DOMAIN(tmp, 0x02);
    memset(auth, 0x00, BLOCKBYTES);
    while (adlen >= BLOCKBYTES) {
        LE_STR_64(tmp, lfsr);
        tkschedule_perm_tk1(rtk1, tmp);
        skinny128_384(sum, rtk2_3, ad, rtk1); // use 'sum' as tmp array
        xor_block(auth, sum);
        adlen -= BLOCKBYTES;
        ad += BLOCKBYTES;
        UPDATE_LFSR(lfsr);
    }
    if (adlen > 0) {
        LE_STR_64(tmp, lfsr);
        SET_DOMAIN(tmp, 0x03); // domain for padding ad
        tkschedule_perm_tk1(rtk1, tmp); // rtk1 fixed before tmp is reused below
        memset(tmp, 0x00, BLOCKBYTES); // padding
        memcpy(tmp, ad, adlen); // padding
        tmp[adlen] = 0x80; // 10* padding
        skinny128_384(tmp, rtk2_3, tmp, rtk1);
        xor_block(auth, tmp);
    }
    xor_block(c, auth); // fold the AD authenticator into the stored tag
    // ----------------- Process the associated data -----------------
    return 0;
}
/******************************************************************************
 * Decryption and verification using SKINNY-AEAD-M1.
 *
 * m     output plaintext (clen - TAGBYTES bytes)
 * mlen  output: set to clen - TAGBYTES
 * nsec  unused (CRYPTO_NSECBYTES == 0)
 * c     input: ciphertext followed by the TAGBYTES-byte tag (clen bytes)
 * ad    associated data (adlen bytes)
 * npub  public nonce
 * k     key
 *
 * Returns 0 on success, -1 if clen is too short or if the tag is invalid.
 * Tag comparison is constant-time.
 ******************************************************************************/
int crypto_aead_decrypt (unsigned char *m, unsigned long long *mlen,
    unsigned char *nsec,
    const unsigned char *c, unsigned long long clen,
    const unsigned char *ad, unsigned long long adlen,
    const unsigned char *npub,
    const unsigned char *k) {
    u64 i,lfsr = 1;                         // block-counter LFSR, fed into TK1
    u8 feedback;                            // accumulates tag-comparison differences
    u32 rtk1[4*16];                         // round tweakeys derived from TK1
    u32 rtk2_3[4*SKINNY128_384_ROUNDS];     // precomputed LFSR2(TK2)^LFSR3(TK3)
    u8 tmp[2*BLOCKBYTES], auth[BLOCKBYTES], sum[BLOCKBYTES];
    (void)nsec;                             // no secret message number
    if (clen < TAGBYTES)                    // need at least room for the tag
        return -1;
    // ----------------- Initialization -----------------
    clen -= TAGBYTES;
    *mlen = clen;
    tkschedule_lfsr(rtk2_3, npub, k, SKINNY128_384_ROUNDS);
    tkschedule_perm(rtk2_3);
    memset(tmp, 0x00, 2*BLOCKBYTES);
    memset(auth, 0x00, BLOCKBYTES);
    memset(sum, 0x00, BLOCKBYTES);
    // ----------------- Initialization -----------------
    // ----------------- Process the ciphertext -----------------
    while (clen >= BLOCKBYTES) { // while entire blocks to process
        LE_STR_64(tmp, lfsr);
        tkschedule_perm_tk1(rtk1, tmp); // precompute RTK1 given the LFSR
        skinny128_384_inv(m, rtk2_3, c, rtk1);
        xor_block(sum, m); // sum for tag computation
        clen -= BLOCKBYTES;
        c += BLOCKBYTES;
        m += BLOCKBYTES;
        UPDATE_LFSR(lfsr); // update LFSR for the next block
    }
    SET_DOMAIN(tmp, 0x04); // domain for tag computation (full final block)
    if (clen > 0) { // last block is partial
        LE_STR_64(tmp, lfsr); // lfsr for last block
        SET_DOMAIN(tmp, 0x01); // domain for padding
        tkschedule_perm_tk1(rtk1, tmp);
        skinny128_384(auth, rtk2_3, auth, rtk1); // encrypt 'auth' = 0^16
        for(i = 0; i < clen; i++) {
            m[i] = auth[i] ^ c[i]; // decrypt the padded block
            sum[i] ^= m[i]; // sum for tag computation
        }
        sum[i] ^= 0x80; // 10* padding of the checksum
        c += clen;
        SET_DOMAIN(tmp, 0x05); // domain for tag computation (partial final block)
        UPDATE_LFSR(lfsr);
    }
    LE_STR_64(tmp, lfsr); // lfsr for tag computation
    tkschedule_perm_tk1(rtk1, tmp);
    skinny128_384(sum, rtk2_3, sum, rtk1); // compute the tag
    // ----------------- Process the ciphertext -----------------
    // ----------------- Process the associated data -----------------
    lfsr = 1; // restart the block counter for the AD
    SET_DOMAIN(tmp, 0x02);
    memset(auth, 0x00, BLOCKBYTES);
    while (adlen >= BLOCKBYTES) {
        LE_STR_64(tmp, lfsr);
        tkschedule_perm_tk1(rtk1, tmp);
        skinny128_384(tmp + BLOCKBYTES, rtk2_3, ad, rtk1); // 2nd half of tmp as scratch
        xor_block(auth, tmp + BLOCKBYTES);
        adlen -= BLOCKBYTES;
        ad += BLOCKBYTES;
        UPDATE_LFSR(lfsr);
    }
    if (adlen > 0) {
        LE_STR_64(tmp, lfsr);
        SET_DOMAIN(tmp, 0x03); // domain for padding ad
        tkschedule_perm_tk1(rtk1, tmp); // rtk1 fixed before tmp is reused below
        memset(tmp, 0x00, BLOCKBYTES); // padding
        memcpy(tmp, ad, adlen); // padding
        tmp[adlen] ^= 0x80; // 10* padding
        skinny128_384(tmp, rtk2_3, tmp, rtk1);
        xor_block(auth, tmp);
    }
    xor_block(sum, auth); // fold the AD authenticator into the tag
    // ----------------- Process the associated data -----------------
    feedback = 0;
    for(i = 0; i < TAGBYTES; i++)
        feedback |= sum[i] ^ c[i]; // constant-time tag comparison
    return feedback ? -1 : 0; // NIST LWC API: -1 on authentication failure
}
\ No newline at end of file
#ifndef SKINNY128_H_
#define SKINNY128_H_
typedef unsigned char u8; // byte type
typedef unsigned int u32; // 32-bit word
#define SKINNY128_384_ROUNDS 40 // number of rounds of SKINNY-128-384
// Encrypt one 128-bit block with precomputed round tweakeys (assembly).
extern void skinny128_384(u8* ctext, const u32* rtk2_3, const u8* ptext, const u32* rtk1);
// Decrypt one 128-bit block with precomputed round tweakeys (assembly).
extern void skinny128_384_inv(u8* ptext, const u32* rtk2_3, const u8* ctext, const u32* rtk1);
// Precompute LFSR2(TK2) ^ LFSR3(TK3) for 'rounds' rounds (assembly).
extern void tkschedule_lfsr(u32* rtk2_3, const u8* tk2, const u8* tk3, const int rounds);
// Apply the tweakey permutation and add round constants to all round tweakeys (assembly).
extern void tkschedule_perm(u32* rtk2_3);
// Compute the round tweakeys derived from TK1 (assembly).
extern void tkschedule_perm_tk1(u32* rtk1, const u8* tk1);
#endif // SKINNY128_H_
\ No newline at end of file
/*******************************************************************************
* ARM assembly implementation of fixsliced SKINNY-128-384.
*
* For more details, see the paper at: https://
*
* @author Alexandre Adomnicai, Nanyang Technological University,
* alexandre.adomnicai@ntu.edu.sg
*
* @date May 2020
*******************************************************************************/
.syntax unified
.thumb
/*******************************************************************************
 * applies P^2 on the tweakey state in a bitsliced manner
 * In/out: tweakey state in r6-r9. Clobbers r1, r10, r11, r12.
 * The same mask/rotate sequence is applied to each of r6, r7, r8, r9.
 *******************************************************************************/
.align 2
p2:
movw r1, #0xcc00
movt r1, #0xcc00 //r1 <- 0xcc00cc00
movw r10, #0xcc00
movt r10, #0x0033 //r10<- 0xcc000033
and r11, r1, r6, ror #14 // --- permute r6 twice
bfi r11, r6, #16, #8
and r12, r6, #0xcc000000
orr r11, r11, r12, lsr #2
and r12, r10, r6
orr r11, r11, r12, lsr #8
and r12, r6, #0x00cc0000
orr r6, r11, r12, lsr #18 // permute r6 twice ---
and r11, r1, r7, ror #14 // --- permute r7 twice
bfi r11, r7, #16, #8
and r12, r7, #0xcc000000
orr r11, r11, r12, lsr #2
and r12, r10, r7
orr r11, r11, r12, lsr #8
and r12, r7, #0x00cc0000
orr r7, r11, r12, lsr #18 // permute r7 twice ---
and r11, r1, r8, ror #14 // --- permute r8 twice
bfi r11, r8, #16, #8
and r12, r8, #0xcc000000
orr r11, r11, r12, lsr #2
and r12, r10, r8
orr r11, r11, r12, lsr #8
and r12, r8, #0x00cc0000
orr r8, r11, r12, lsr #18 // permute r8 twice ---
and r11, r1, r9, ror #14 // --- permute r9 twice
bfi r11, r9, #16, #8
and r12, r9, #0xcc000000
orr r11, r11, r12, lsr #2
and r12, r10, r9
orr r11, r11, r12, lsr #8
and r12, r9, #0x00cc0000
orr r9, r11, r12, lsr #18 // permute r9 twice ---
bx lr
/*******************************************************************************
 * applies P^4 on the tweakey state in a bitsliced manner
 * In/out: tweakey state in r6-r9. Clobbers r1, r10, r11, r12.
 * r14 is spilled to [sp] and restored (caller reserved that slot).
 *******************************************************************************/
.align 2
p4:
str.w r14, [sp] //store r14 on the stack
movw r14, #0x00cc
movt r14, #0xcc00 //r14<- 0xcc0000cc
movw r12, #0xcc00
movt r12, #0x3300 //r12<- 0x3300cc00
movw r11, #0x00cc
movt r11, #0x00cc //r11<- 0x00cc00cc
and r10, r14, r6, ror #22 // --- permute r6 4 times
and r1, r12, r6, ror #16
orr r10, r10, r1
and r1, r6, r11
orr r10, r10, r1, lsr #2
movw r1, #0xcc33 //r1 <- 0x0000cc33
and r6, r6, r1
orr r6, r10, r6, ror #24 // permute r6 4 times ---
and r10, r14, r7, ror #22 // --- permute r7 4 times
and r1, r12, r7, ror #16
orr r10, r10, r1
and r1, r7, r11
orr r10, r10, r1, lsr #2
movw r1, #0xcc33 //r1 <- 0x0000cc33
and r7, r7, r1
orr r7, r10, r7, ror #24 // permute r7 4 times ---
and r10, r14, r8, ror #22 // --- permute r8 4 times
and r1, r12, r8, ror #16
orr r10, r10, r1
and r1, r8, r11
orr r10, r10, r1, lsr #2
movw r1, #0xcc33 //r1 <- 0x0000cc33
and r8, r8, r1
orr r8, r10, r8, ror #24 // permute r8 4 times ---
and r10, r14, r9, ror #22 // --- permute r9 4 times (r12 reused as temp below)
ldr.w r14, [sp] //restore r14
and r12, r12, r9, ror #16
orr r10, r10, r12
and r12, r9, r11
orr r10, r10, r12, lsr #2
movw r12, #0xcc33 //r12<- 0x0000cc33
and r9, r9, r12
orr r9, r10, r9, ror #24 // permute r9 4 times ---
bx lr
/*******************************************************************************
 * applies P^6 on the tweakey state in a bitsliced manner
 * In/out: tweakey state in r6-r9. Clobbers r1, r10, r11, r12.
 *******************************************************************************/
.align 2
p6:
movw r1, #0x3333 //r1 <- 0x00003333
movw r12, #0x00cc
movt r12, #0x3300 //r12<- 0x330000cc
and r10, r6, r1, ror #8 // --- permute r6 6 times
and r11, r12, r6, ror #24
orr r11, r11, r10, ror #6
and r10, r1, r6, ror #10
orr r11, r11, r10
and r10, r6, #0x000000cc
orr r11, r11, r10, lsl #14
and r10, r6, #0x00003300
orr r6, r11, r10, lsl #2 // permute r6 6 times ---
and r10, r7, r1, ror #8 // --- permute r7 6 times
and r11, r12, r7, ror #24
orr r11, r11, r10, ror #6
and r10, r1, r7, ror #10
orr r11, r11, r10
and r10, r7, #0x000000cc
orr r11, r11, r10, lsl #14
and r10, r7, #0x00003300
orr r7, r11, r10, lsl #2 // permute r7 6 times ---
and r10, r8, r1, ror #8 // --- permute r8 6 times
and r11, r12, r8, ror #24
orr r11, r11, r10, ror #6
and r10, r1, r8, ror #10
orr r11, r11, r10
and r10, r8, #0x000000cc
orr r11, r11, r10, lsl #14
and r10, r8, #0x00003300
orr r8, r11, r10, lsl #2 // permute r8 6 times ---
and r10, r9, r1, ror #8 // --- permute r9 6 times
and r11, r12, r9, ror #24
orr r11, r11, r10, ror #6
and r10, r1, r9, ror #10
orr r11, r11, r10
and r10, r9, #0x000000cc
orr r11, r11, r10, lsl #14
and r10, r9, #0x00003300
orr r9, r11, r10, lsl #2 // permute r9 6 times ---
bx lr
/*******************************************************************************
 * applies P^8 on the tweakey state in a bitsliced manner
 * In/out: tweakey state in r6-r9. Clobbers r1, r10, r11, r12.
 *******************************************************************************/
.align 2
p8:
movw r12, #0x3333 //r12<- 0x00003333
movw r1, #0x0000
movt r1, #0x33cc //r1 <- 0x33cc0000
and r10, r6, r1 // --- permute r6 8 times
and r11, r1, r6, ror #8
orr r11, r11, r10, ror #24
and r10, r6, r12, lsl #2
orr r11, r11, r10, ror #26
and r10, r6, r12, lsl #8
orr r6, r11, r10, lsr #6 // permute r6 8 times ---
and r10, r7, r1 // --- permute r7 8 times
and r11, r1, r7, ror #8
orr r11, r11, r10, ror #24
and r10, r7, r12, lsl #2
orr r11, r11, r10, ror #26
and r10, r7, r12, lsl #8
orr r7, r11, r10, lsr #6 // permute r7 8 times ---
and r10, r8, r1 // --- permute r8 8 times
and r11, r1, r8, ror #8
orr r11, r11, r10, ror #24
and r10, r8, r12, lsl #2
orr r11, r11, r10, ror #26
and r10, r8, r12, lsl #8
orr r8, r11, r10, lsr #6 // permute r8 8 times ---
and r10, r9, r1 // --- permute r9 8 times
and r11, r1, r9, ror #8
orr r11, r11, r10, ror #24
and r10, r9, r12, lsl #2
orr r11, r11, r10, ror #26
and r10, r9, r12, lsl #8
orr r9, r11, r10, lsr #6 // permute r9 8 times ---
bx lr
/*******************************************************************************
 * applies P^10 on the tweakey state in a bitsliced manner
 * In/out: tweakey state in r6-r9. Clobbers r1, r10, r11, r12.
 *******************************************************************************/
.align 2
p10:
movw r12, #0x0033
movt r12, #0x3300 //r12<- 0x33000033
movw r1, #0xcc33 //r1 <- 0x0000cc33
and r10, r6, r1, ror #8 // --- permute r6 10 times
and r11, r12, r6, ror #26
orr r11, r11, r10, ror #8
and r10, r6, r12, ror #24
orr r11, r11, r10, ror #22
and r10, r6, #0x00330000
orr r11, r11, r10, lsr #14
and r10, r6, #0x0000cc00
orr r6, r11, r10, lsr #2 // permute r6 10 times ---
and r10, r7, r1, ror #8 // --- permute r7 10 times
and r11, r12, r7, ror #26
orr r11, r11, r10, ror #8
and r10, r7, r12, ror #24
orr r11, r11, r10, ror #22
and r10, r7, #0x00330000
orr r11, r11, r10, lsr #14
and r10, r7, #0x0000cc00
orr r7, r11, r10, lsr #2 // permute r7 10 times ---
and r10, r8, r1, ror #8 // --- permute r8 10 times
and r11, r12, r8, ror #26
orr r11, r11, r10, ror #8
and r10, r8, r12, ror #24
orr r11, r11, r10, ror #22
and r10, r8, #0x00330000
orr r11, r11, r10, lsr #14
and r10, r8, #0x0000cc00
orr r8, r11, r10, lsr #2 // permute r8 10 times ---
and r10, r9, r1, ror #8 // --- permute r9 10 times
and r11, r12, r9, ror #26
orr r11, r11, r10, ror #8
and r10, r9, r12, ror #24
orr r11, r11, r10, ror #22
and r10, r9, #0x00330000
orr r11, r11, r10, lsr #14
and r10, r9, #0x0000cc00
orr r9, r11, r10, lsr #2 // permute r9 10 times ---
bx lr
/*******************************************************************************
 * applies P^12 on the tweakey state in a bitsliced manner
 * In/out: tweakey state in r6-r9. Clobbers r1, r10, r11, r12.
 * r14 is spilled to [sp] and restored (caller reserved that slot).
 *******************************************************************************/
.align 2
p12:
str.w r14, [sp] //store r14 on the stack
movw r14, #0xcc33 //r14<- 0x0000cc33
movw r12, #0x00cc
movt r12, #0x00cc //r12<- 0x00cc00cc
movw r1, #0x3300
movt r1, #0xcc00 //r1 <- 0xcc003300
and r10, r14, r6, ror #8 // --- permute r6 12 times
and r11, r12, r6, ror #30
orr r11, r11, r10
and r10, r1, r6, ror #16
orr r11, r11, r10
movw r10, #0xcccc //r10<- 0x0000cccc
and r10, r6, r10, ror #8
orr r6, r11, r10, ror #10 // permute r6 12 times ---
and r10, r14, r7, ror #8 // --- permute r7 12 times
and r11, r12, r7, ror #30
orr r11, r11, r10
and r10, r1, r7, ror #16
orr r11, r11, r10
movw r10, #0xcccc //r10<- 0x0000cccc
and r10, r7, r10, ror #8
orr r7, r11, r10, ror #10 // permute r7 12 times ---
and r10, r14, r8, ror #8 // --- permute r8 12 times
and r11, r12, r8, ror #30
orr r11, r11, r10
and r10, r1, r8, ror #16
orr r11, r11, r10
movw r10, #0xcccc //r10<- 0x0000cccc
and r10, r8, r10, ror #8
orr r8, r11, r10, ror #10 // permute r8 12 times ---
and r10, r14, r9, ror #8 // --- permute r9 12 times
and r11, r12, r9, ror #30
orr r11, r11, r10
and r10, r1, r9, ror #16
ldr.w r14, [sp] //restore r14
orr r11, r11, r10
movw r10, #0xcccc //r10<- 0x0000cccc
and r10, r9, r10, ror #8
orr r9, r11, r10, ror #10 // permute r9 12 times ---
bx lr
/*******************************************************************************
 * applies P^14 on the tweakey state in a bitsliced manner
 * In/out: tweakey state in r6-r9. Clobbers r1, r10, r11, r12.
 *******************************************************************************/
.align 2
p14:
movw r1, #0xcc00
movt r1, #0x0033 //r1 <- 0x0033cc00
movw r12, #0xcc00
movt r12, #0xcc00 //r12<- 0xcc00cc00
and r10, r1, r6, ror #24 // --- permute r6 14 times
and r11, r6, #0x00000033
orr r11, r10, r11, ror #14
and r10, r6, #0x33000000
orr r11, r11, r10, ror #30
and r10, r6, #0x00ff0000
orr r11, r11, r10, ror #16
and r10, r6, r12
orr r6, r11, r10, ror #18 // permute r6 14 times ---
and r10, r1, r7, ror #24 // --- permute r7 14 times
and r11, r7, #0x00000033
orr r11, r10, r11, ror #14
and r10, r7, #0x33000000
orr r11, r11, r10, ror #30
and r10, r7, #0x00ff0000
orr r11, r11, r10, ror #16
and r10, r7, r12
orr r7, r11, r10, ror #18 // permute r7 14 times ---
and r10, r1, r8, ror #24 // --- permute r8 14 times
and r11, r8, #0x00000033
orr r11, r10, r11, ror #14
and r10, r8, #0x33000000
orr r11, r11, r10, ror #30
and r10, r8, #0x00ff0000
orr r11, r11, r10, ror #16
and r10, r8, r12
orr r8, r11, r10, ror #18 // permute r8 14 times ---
and r10, r1, r9, ror #24 // --- permute r9 14 times
and r11, r9, #0x00000033
orr r11, r10, r11, ror #14
and r10, r9, #0x33000000
orr r11, r11, r10, ror #30
and r10, r9, #0x00ff0000
orr r11, r11, r10, ror #16
and r10, r9, r12
orr r9, r11, r10, ror #18 // permute r9 14 times ---
bx lr
.align 2
/*******************************************************************************
 * packing: bitslice the 128-bit block held in r2-r5 via SWAPMOVE operations.
 * Preconditions (set by the caller, see tkschedule_lfsr):
 *   r10 = 0x0a0a0a0a, r11 = 0x30303030. Clobbers r12.
 *******************************************************************************/
packing:
eor r12, r2, r2, lsr #3
and r12, r12, r10
eor r2, r2, r12
eor r2, r2, r12, lsl #3 //SWAPMOVE(r2, r2, 0x0a0a0a0a, 3)
eor r12, r3, r3, lsr #3
and r12, r12, r10
eor r3, r3, r12
eor r3, r3, r12, lsl #3 //SWAPMOVE(r3, r3, 0x0a0a0a0a, 3)
eor r12, r4, r4, lsr #3
and r12, r12, r10
eor r4, r4, r12
eor r4, r4, r12, lsl #3 //SWAPMOVE(r4, r4, 0x0a0a0a0a, 3)
eor r12, r5, r5, lsr #3
and r12, r12, r10
eor r5, r5, r12
eor r5, r5, r12, lsl #3 //SWAPMOVE(r5, r5, 0x0a0a0a0a, 3)
eor r12, r2, r4, lsr #2
and r12, r12, r11
eor r2, r2, r12
eor r4, r4, r12, lsl #2 //SWAPMOVE(r4, r2, 0x30303030, 2)
eor r12, r2, r3, lsr #4
and r12, r12, r11, lsr #2
eor r2, r2, r12
eor r3, r3, r12, lsl #4 //SWAPMOVE(r3, r2, 0x0c0c0c0c, 4)
eor r12, r2, r5, lsr #6
and r12, r12, r11, lsr #4
eor r2, r2, r12
eor r5, r5, r12, lsl #6 //SWAPMOVE(r5, r2, 0x03030303, 6)
eor r12, r4, r3, lsr #2
and r12, r12, r11, lsr #2
eor r4, r4, r12
eor r3, r3, r12, lsl #2 //SWAPMOVE(r3, r4, 0x0c0c0c0c, 2)
eor r12, r4, r5, lsr #4
and r12, r12, r11, lsr #4
eor r4, r4, r12
eor r5, r5, r12, lsl #4 //SWAPMOVE(r5, r4, 0x03030303, 4)
eor r12, r3, r5, lsr #2
and r12, r12, r11, lsr #4
eor r3, r3, r12
eor r5, r5, r12, lsl #2 //SWAPMOVE(r5, r3, 0x03030303, 2)
bx lr
.align 2
/*******************************************************************************
 * unpacking: inverse of 'packing' — undo the bitslicing of the 128-bit block
 * held in r2-r5 (same SWAPMOVE steps in reverse order).
 * Builds the 0x0a0a0a0a mask in r6 itself; uses r7 as the 0x30303030 mask —
 * NOTE(review): r7 is assumed to be preset by the caller (callers of
 * 'unpacking' are outside this chunk — confirm). Clobbers r6, r10.
 *******************************************************************************/
unpacking:
movw r6, #0x0a0a
movt r6, #0x0a0a //r6 <- 0x0a0a0a0a
eor r10, r3, r5, lsr #2
and r10, r10, r7, lsr #4
eor r3, r3, r10
eor r5, r5, r10, lsl #2 //SWAPMOVE(r5, r3, 0x03030303, 2)
eor r10, r4, r5, lsr #4
and r10, r10, r7, lsr #4
eor r4, r4, r10
eor r5, r5, r10, lsl #4 //SWAPMOVE(r5, r4, 0x03030303, 4)
eor r10, r4, r3, lsr #2
and r10, r10, r7, lsr #2
eor r4, r4, r10
eor r3, r3, r10, lsl #2 //SWAPMOVE(r3, r4, 0x0c0c0c0c, 2)
eor r10, r2, r5, lsr #6
and r10, r10, r7, lsr #4
eor r2, r2, r10
eor r5, r5, r10, lsl #6 //SWAPMOVE(r5, r2, 0x03030303, 6)
eor r10, r2, r3, lsr #4
and r10, r10, r7, lsr #2
eor r2, r2, r10
eor r3, r3, r10, lsl #4 //SWAPMOVE(r3, r2, 0x0c0c0c0c, 4)
eor r10, r2, r4, lsr #2
and r10, r10, r7
eor r2, r2, r10
eor r4, r4, r10, lsl #2 //SWAPMOVE(r4, r2, 0x30303030, 2)
eor r10, r5, r5, lsr #3
and r10, r10, r6
eor r5, r5, r10
eor r5, r5, r10, lsl #3 //SWAPMOVE(r5, r5, 0x0a0a0a0a, 3)
eor r10, r4, r4, lsr #3
and r10, r10, r6
eor r4, r4, r10
eor r4, r4, r10, lsl #3 //SWAPMOVE(r4, r4, 0x0a0a0a0a, 3)
eor r10, r3, r3, lsr #3
and r10, r10, r6
eor r3, r3, r10
eor r3, r3, r10, lsl #3 //SWAPMOVE(r3, r3, 0x0a0a0a0a, 3)
eor r10, r2, r2, lsr #3
and r10, r10, r6
eor r2, r2, r10
eor r2, r2, r10, lsl #3 //SWAPMOVE(r2, r2, 0x0a0a0a0a, 3)
bx lr
/******************************************************************************
 * Compute LFSR2(TK2) ^ LFSR3(TK3) for all rounds.
 * Performing both at the same time allows to save some memory accesses.
 * r0 = output array, r1 = TK2 (16 bytes), r2 = TK3 (16 bytes), r3 = rounds.
 * The main loop processes 8 rounds per iteration (4 double-round LFSR steps).
 ******************************************************************************/
@ void tkschedule_lfsr(u32* tk, const u8* tk2, const u8* tk3, const int rounds)
.global tkschedule_lfsr
.type tkschedule_lfsr,%function
.align 2
tkschedule_lfsr:
push {r0-r12, r14}
ldr.w r3, [r1, #8] //load tk2 (3rd word)
ldr.w r4, [r1, #4] //load tk2 (2nd word)
ldr.w r5, [r1, #12] //load tk2 (4th word)
ldr.w r12, [r1] //load tk2 (1st word)
mov r1, r2 //move tk3 address in r1
mov r2, r12 //move 1st tk2 word in r2
movw r10, #0x0a0a
movt r10, #0x0a0a //r10 <- 0x0a0a0a0a
movw r11, #0x3030
movt r11, #0x3030 //r11<- 0x30303030
bl packing //pack tk2
mov r6, r2 //move tk2 from r2-r5 to r6-r9
mov r7, r3 //move tk2 from r2-r5 to r6-r9
mov r8, r4 //move tk2 from r2-r5 to r6-r9
mov r9, r5 //move tk2 from r2-r5 to r6-r9
ldr.w r3, [r1, #8] //load tk3 (3rd word)
ldr.w r4, [r1, #4] //load tk3 (2nd word)
ldr.w r5, [r1, #12] //load tk3 (4th word)
ldr.w r2, [r1] //load tk3 (1st word)
bl packing //pack tk3
eor r10, r10, r10, lsl #4 //r10<- 0xaaaaaaaa (LFSR mask)
ldr.w r1, [sp, #12] //load loop counter (rounds, stacked r3) in r1
eor r11, r2, r6 //tk2 ^ tk3 (1st word)
eor r12, r3, r7 //tk2 ^ tk3 (2nd word)
strd r11, r12, [r0], #8 //store in tk
eor r11, r4, r8 //tk2 ^ tk3 (3rd word)
eor r12, r5, r9 //tk2 ^ tk3 (4th word)
strd r11, r12, [r0], #8 //store in tk
loop: //each iteration covers 8 rounds
and r12, r8, r10 // --- apply LFSR2 to tk2
eor r12, r12, r6
and r14, r10, r12, lsl #1
and r12, r12, r10
orr r6, r14, r12, lsr #1 // apply LFSR2 to tk2 ---
and r12, r3, r10 // --- apply LFSR3 to tk3
eor r12, r5, r12, lsr #1
and r14, r10, r12, lsl #1
and r12, r12, r10
orr r5, r14, r12, lsr #1 // apply LFSR3 to tk3 ---
eor r11, r5, r7 //tk2 ^ tk3 (1st word)
eor r12, r2, r8 //tk2 ^ tk3 (2nd word)
strd r11, r12, [r0], #8 //store in tk
eor r11, r3, r9 //tk2 ^ tk3 (3rd word)
eor r12, r4, r6 //tk2 ^ tk3 (4th word)
strd r11, r12, [r0], #24 //store in tk (skip the rtk1 slots)
and r12, r9, r10 // --- apply LFSR2 to tk2
eor r12, r12, r7
and r14, r10, r12, lsl #1
and r12, r12, r10
orr r7, r14, r12, lsr #1 // apply LFSR2 to tk2 ---
and r12, r2, r10 // --- apply LFSR3 to tk3
eor r12, r4, r12, lsr #1
and r14, r10, r12, lsl #1
and r12, r12, r10
orr r4, r14, r12, lsr #1 // apply LFSR3 to tk3 ---
eor r11, r4, r8 //tk2 ^ tk3 (1st word)
eor r12, r5, r9 //tk2 ^ tk3 (2nd word)
strd r11, r12, [r0], #8 //store in tk
eor r11, r2, r6 //tk2 ^ tk3 (3rd word)
eor r12, r3, r7 //tk2 ^ tk3 (4th word)
strd r11, r12, [r0], #24 //store in tk (skip the rtk1 slots)
and r12, r6, r10 // --- apply LFSR2 to tk2
eor r12, r12, r8
and r14, r10, r12, lsl #1
and r12, r12, r10
orr r8, r14, r12, lsr #1 // apply LFSR2 to tk2 ---
and r12, r5, r10 // --- apply LFSR3 to tk3
eor r12, r3, r12, lsr #1
and r14, r10, r12, lsl #1
and r12, r12, r10
orr r3, r14, r12, lsr #1 // apply LFSR3 to tk3 ---
eor r11, r3, r9 //tk2 ^ tk3 (1st word)
eor r12, r4, r6 //tk2 ^ tk3 (2nd word)
strd r11, r12, [r0], #8 //store in tk
eor r11, r5, r7 //tk2 ^ tk3 (3rd word)
eor r12, r2, r8 //tk2 ^ tk3 (4th word)
strd r11, r12, [r0], #24 //store in tk (skip the rtk1 slots)
and r12, r7, r10 // --- apply LFSR2 to tk2
eor r12, r12, r9
and r14, r10, r12, lsl #1
and r12, r12, r10
orr r9, r14, r12, lsr #1 // apply LFSR2 to tk2 ---
and r12, r4, r10 // --- apply LFSR3 to tk3
eor r12, r2, r12, lsr #1
and r14, r10, r12, lsl #1
and r12, r12, r10
orr r2, r14, r12, lsr #1 // apply LFSR3 to tk3 ---
eor r11, r2, r6 //tk2 ^ tk3 (1st word)
eor r12, r3, r7 //tk2 ^ tk3 (2nd word)
strd r11, r12, [r0], #8 //store in tk
eor r11, r4, r8 //tk2 ^ tk3 (3rd word)
eor r12, r5, r9 //tk2 ^ tk3 (4th word)
strd r11, r12, [r0], #24 //store in tk (skip the rtk1 slots)
subs.w r1, r1, #8 //decrease loop counter by 8
bne loop
pop {r0-r12, r14}
bx lr
/******************************************************************************
* Applies the permutation P and add the round constants to all round tweakeys.
******************************************************************************/
@ void tkschedule_perm(u32* tk)
.global tkschedule_perm
.type tkschedule_perm,%function
.align 2
tkschedule_perm:
push {r0-r12, lr}
sub.w sp, #4 //to store r14 in subroutines
ldm r0, {r6-r9} //load tk
movw r10, #0xf0f0
movt r10, #0xf0f0 //r10<- 0xf0f0f0f0
and r6, r6, r10 //tk &= 0xf0f0f0f0 (1st word)
and r7, r7, r10 //tk &= 0xf0f0f0f0 (2nd word)
and r8, r8, r10 //tk &= 0xf0f0f0f0 (3rd word)
and r9, r9, r10 //tk &= 0xf0f0f0f0 (4th word)
eor r8, r8, #0x00000004 //add rconst
eor r9, r9, #0x00000040 //add rconst
mvn r9, r9 //to remove a NOT in sbox calculations
strd r8, r9, [r0], #8 //store 1st half tk for 1st round
strd r6, r7, [r0], #8 //store 2nd half tk for 1st round
ldm r0, {r6-r9} //load tk
bl p2 //apply the permutation twice
movw r10, #0xc3c3
movt r10, #0xc3c3 //r10<- 0xc3c3c3c3
and r11, r10, r6, ror #26 //ror and mask to match fixslicing
and r12, r10, r7, ror #26 //ror and mask to match fixslicing
strd r11, r12, [r0], #8 //store 1st half tk for 2nd round
and r11, r10, r8, ror #26 //ror and mask to match fixslicing
and r12, r10, r9, ror #26 //ror and mask to match fixslicing
eor r11, r11, #0x10000000 //add rconst
eor r11, r11, #0x00000100 //add rconst
eor r12, r12, #0x00000100 //add rconst
mvn r12, r12 //to save a NOT in sbox calculations
strd r11, r12, [r0], #8 //store 2nd half tk for 2nd round
and r10, r10, r10, lsr #6 //r10<- 0x03030303
and r11, r10, r6, ror #28 //--- ror and masks to match fixslicing
and r6, r6, r10, lsl #6
orr r6, r11, r6, ror #12
and r11, r10, r7, ror #28
and r7, r7, r10, lsl #6
orr r7, r11, r7, ror #12
and r11, r10, r8, ror #28
and r8, r8, r10, lsl #6
orr r8, r11, r8, ror #12
and r11, r10, r9, ror #28
and r9, r9, r10, lsl #6
orr r9, r11, r9, ror #12 //ror and masks to match fixslicing ---
eor r7, r7, #0x04000000 //add rconst
eor r8, r8, #0x44000000 //add rconst
eor r9, r9, #0x04000000 //add rconst
mvn r9, r9 //to save a NOT in sbox calculations
strd r8, r9, [r0], #8 //store 1st half tk for 3rd round
strd r6, r7, [r0], #8 //store 2nd half tk for 3rd round
ldm r0, {r6-r9} //load tk
bl p4 //apply the permutation 4 times
movw r10, #0xf0f0
movt r10, #0xf0f0 //r10<- 0xf0f0f0f0
and r11, r10, r6, ror #16 //ror and mask to match fixslicing
and r12, r10, r7, ror #16 //ror and mask to match fixslicing
eor r11, r11, #0x00400000 //add rconst
eor r12, r12, #0x00400000 //add rconst
strd r11, r12, [r0, #24] //store 2nd half tk for 5th round
and r11, r10, r8, ror #16 //ror and mask to match fixslicing
and r12, r10, r9, ror #16 //ror and mask to match fixslicing
eor r11, r11, #0x00440000 //add rconst
eor r12, r12, #0x00500000 //add rconst
mvn r12, r12 //to save a NOT in sbox calculations
strd r11, r12, [r0, #16] //store 1st half tk for 5th round
and r10, r10, r10, lsr #2 //r10<- 0x30303030
and r11, r10, r6, ror #14 //--- ror and masks to match fixslicing
and r6, r6, r10, ror #4
orr r6, r11, r6, ror #6
and r11, r10, r7, ror #14
and r7, r7, r10, ror #4
orr r7, r11, r7, ror #6
and r11, r10, r8, ror #14
and r8, r8, r10, ror #4
orr r8, r11, r8, ror #6
and r11, r10, r9, ror #14
and r9, r9, r10, ror #4
orr r9, r11, r9, ror #6 //ror and masks to match fixslicing ---
eor r6, r6, #0x00100000 //add rconst
eor r7, r7, #0x00100000 //add rconst
eor r8, r8, #0x00100000 //add rconst
eor r8, r8, #0x00000001 //add rconst
eor r9, r9, #0x00100000 //add rconst
mvn r9, r9 //to save a NOT in sbox calculations
strd r6, r7, [r0], #8 //store 1st half tk for 4th round
strd r8, r9, [r0], #24 //store 2nd half tk for 4th round
ldm r0, {r6-r9} //load tk
bl p6 //apply the permutation 6 times
movw r10, #0xc3c3
movt r10, #0xc3c3 //r10<- 0xc3c3c3c3
and r11, r10, r6, ror #10 //ror and mask to match fixslicing
and r12, r10, r7, ror #10 //ror and mask to match fixslicing
eor r11, r11, #0x01000000 //add rconst
eor r12, r12, #0x01000000 //add rconst
strd r11, r12, [r0], #8 //store 1st half tk for 6th round
and r11, r10, r8, ror #10 //ror and mask to match fixslicing
and r12, r10, r9, ror #10 //ror and mask to match fixslicing
eor r11, r11, #0x01400000 //add rconst
eor r11, r11, #0x00001000 //add rconst
eor r12, r12, #0x00400000 //add rconst
mvn r12, r12 //to save a NOT in sbox calculations
strd r11, r12, [r0], #8 //store 2nd half tk for 6th round
and r10, r10, r10, lsr #6 //r10<- 0x03030303
and r11, r10, r6, ror #12 //--- ror and masks to match fixslicing
and r6, r6, r10, lsl #6
orr r6, r11, r6, ror #28
and r11, r10, r7, ror #12
and r7, r7, r10, lsl #6
orr r7, r11, r7, ror #28
and r11, r10, r8, ror #12
and r8, r8, r10, lsl #6
orr r8, r11, r8, ror #28
and r11, r10, r9, ror #12
and r9, r9, r10, lsl #6
orr r9, r11, r9, ror #28 //ror and masks to match fixslicing ---
eor r6, r6, #0x00000400 //add rconst
eor r7, r7, #0x00000400 //add rconst
eor r8, r8, #0x01000000 //add rconst
eor r8, r8, #0x00004000 //add rconst
eor r9, r9, #0x01000000 //add rconst
eor r9, r9, #0x00000400 //add rconst
mvn r9, r9 //to save a NOT in sbox calculations
strd r8, r9, [r0], #8 //store 1st half tk for 7th round
strd r6, r7, [r0], #8 //store 2nd half tk for 7th round
ldm r0, {r6-r9} //load tk
bl p8 //apply the permutation 8 times
movw r10, #0xf0f0
movt r10, #0xf0f0 //r10<- 0xf0f0f0f0
and r11, r10, r6 //ror and mask to match fixslicing
and r12, r10, r7 //ror and mask to match fixslicing
eor r12, r12, #0x00000040 //add rconst
strd r11, r12, [r0, #24] //store 2nd half tk for 9th round
and r11, r10, r8 //ror and mask to match fixslicing
and r12, r10, r9 //ror and mask to match fixslicing
eor r11, r11, #0x00000054 //add rconst
eor r12, r12, #0x00000050 //add rconst
mvn r12, r12 //to save a NOT in sbox calculations
strd r11, r12, [r0, #16] //store 1st half tk for 9th round
and r10, r10, r10, lsr #2 //r10<- 0x30303030
and r11, r10, r6, ror #30 //--- ror and masks to match fixslicing
and r6, r6, r10, ror #4
orr r6, r11, r6, ror #22
and r11, r10, r7, ror #30
and r7, r7, r10, ror #4
orr r7, r11, r7, ror #22
and r11, r10, r8, ror #30
and r8, r8, r10, ror #4
orr r8, r11, r8, ror #22
and r11, r10, r9, ror #30
and r9, r9, r10, ror #4
orr r9, r11, r9, ror #22 //ror and masks to match fixslicing ---
eor r6 ,r6, #0x00000010
eor r8, r8, #0x00010000
eor r8, r8, #0x00000410
eor r9, r9, #0x00000410
mvn r9, r9 //to save a NOT in sbox calculations
strd r6, r7, [r0], #8 //store 1st half tk for 8th round
strd r8, r9, [r0], #24 //store 2nd half tk for 8th round
ldm r0, {r6-r9} //load tk
bl p10 //apply the permutation 10 times
movw r10, #0xc3c3
movt r10, #0xc3c3 //r10<- 0xc3c3c3c3
and r11, r10, r6, ror #26 //ror and mask to match fixslicing
and r12, r10, r7, ror #26 //ror and mask to match fixslicing
eor r11, r11, #0x00000100 //add rconst
eor r12, r12, #0x00000100 //add rconst
strd r11, r12, [r0], #8 //store 1st half tk for 10th round
and r11, r10, r8, ror #26 //ror and mask to match fixslicing
and r12, r10, r9, ror #26 //ror and mask to match fixslicing
eor r11, r11, #0x10000000 //add rconst
eor r11, r11, #0x00000140 //add rconst
eor r12, r12, #0x00000100 //add rconst
mvn r12, r12 //to save a NOT in sbox calculations
strd r11, r12, [r0], #8 //store 2nd half tk for 10th round
and r10, r10, r10, lsr #6 //r10<- 0x03030303
and r11, r10, r6, ror #28 //--- ror and masks to match fixslicing
and r6, r6, r10, lsl #6
orr r6, r11, r6, ror #12
and r11, r10, r7, ror #28
and r7, r7, r10, lsl #6
orr r7, r11, r7, ror #12
and r11, r10, r8, ror #28
and r8, r8, r10, lsl #6
orr r8, r11, r8, ror #12
and r11, r10, r9, ror #28
and r9, r9, r10, lsl #6
orr r9, r11, r9, ror #12 //ror and masks to match fixslicing ---
eor r6, r6, #0x04000000 //add rconst
eor r7, r7, #0x04000000 //add rconst
eor r8, r8, #0x44000000 //add rconst
eor r9, r9, #0x00000100 //add rconst
mvn r9, r9 //to save a NOT in sbox calculations
strd r8, r9, [r0], #8 //store 1st half tk for 11th round
strd r6, r7, [r0], #8 //store 2nd half tk for 11th round
ldm r0, {r6-r9} //load tk
bl p12 //apply the permutation 4 times
movw r10, #0xf0f0
movt r10, #0xf0f0 //r10<- 0xf0f0f0f0
and r11, r10, r6, ror #16 //ror and mask to match fixslicing
and r12, r10, r7, ror #16 //ror and mask to match fixslicing
eor r11, r11, #0x00400000 //add rconst
strd r11, r12, [r0, #24] //store 2nd half tk for 13th round
and r11, r10, r8, ror #16 //ror and mask to match fixslicing
and r12, r10, r9, ror #16 //ror and mask to match fixslicing
eor r11, r11, #0x00140000 //add rconst
eor r12, r12, #0x00500000 //add rconst
mvn r12, r12 //to save a NOT in sbox calculations
strd r11, r12, [r0, #16] //store 1st half tk for 13th round
and r10, r10, r10, lsr #2 //r10<- 0x30303030
and r11, r10, r6, ror #14 //--- ror and masks to match fixslicing
and r6, r6, r10, ror #4
orr r6, r11, r6, ror #6
and r11, r10, r7, ror #14
and r7, r7, r10, ror #4
orr r7, r11, r7, ror #6
and r11, r10, r8, ror #14
and r8, r8, r10, ror #4
orr r8, r11, r8, ror #6
and r11, r10, r9, ror #14
and r9, r9, r10, ror #4
orr r9, r11, r9, ror #6 //ror and masks to match fixslicing ---
eor r6, r6, #0x00100000 //add rconst
eor r7, r7, #0x00100000 //add rconst
eor r8, r8, #0x04000000 //add rconst
eor r8, r8, #0x00000001 //add rconst
eor r9, r9, #0x04000000 //add rconst
mvn r9, r9 //to save a NOT in sbox calculations
strd r6, r7, [r0], #8 //store 1st half tk for 12th round
strd r8, r9, [r0], #24 //store 2nd half tk for 12th round
ldm r0, {r6-r9} //load tk
bl p14 //apply the permutation 6 times
movw r10, #0xc3c3
movt r10, #0xc3c3 //r10<- 0xc3c3c3c3
and r11, r10, r6, ror #10 //ror and mask to match fixslicing
and r12, r10, r7, ror #10 //ror and mask to match fixslicing
strd r11, r12, [r0], #8 //store 1st half tk for 14th round
and r11, r10, r8, ror #10 //ror and mask to match fixslicing
and r12, r10, r9, ror #10 //ror and mask to match fixslicing
eor r11, r11, #0x01400000 //add rconst
eor r11, r11, #0x00001000 //add rconst
eor r12, r12, #0x01400000 //add rconst
mvn r12, r12 //to save a NOT in sbox calculations
strd r11, r12, [r0], #8 //store 2nd half tk for 14th round
and r10, r10, r10, lsr #6 //r10<- 0x03030303
and r11, r10, r6, ror #12 //--- ror and masks to match fixslicing
and r6, r6, r10, lsl #6
orr r6, r11, r6, ror #28
and r11, r10, r7, ror #12
and r7, r7, r10, lsl #6
orr r7, r11, r7, ror #28
and r11, r10, r8, ror #12
and r8, r8, r10, lsl #6
orr r8, r11, r8, ror #28
and r11, r10, r9, ror #12
and r9, r9, r10, lsl #6
orr r9, r11, r9, ror #28 //ror and masks to match fixslicing ---
eor r7, r7, #0x00000400 //add rconst
eor r8, r8, #0x01000000 //add rconst
eor r8, r8, #0x00004400 //add rconst
eor r9, r9, #0x00000400 //add const
mvn r9, r9 //to save a NOT in sbox calculations
strd r8, r9, [r0], #8 //store 1st half tk for 15th round
strd r6, r7, [r0], #8 //store 2nd half tk for 15th round
ldm r0, {r6-r9} //load tk
movw r10, #0xf0f0
movt r10, #0xf0f0 //r10<- 0xf0f0f0f0
and r11, r10, r6 //ror and mask to match fixslicing
and r12, r10, r7 //ror and mask to match fixslicing
eor r11, r11, #0x00000040 //add rconst
eor r12, r12, #0x00000040 //add rconst
strd r11, r12, [r0, #24] //store 2nd half tk for 17th round
and r11, r10, r8 //ror and mask to match fixslicing
and r12, r10, r9 //ror and mask to match fixslicing
eor r11, r11, #0x00000004 //add rconst
eor r12, r12, #0x00000050 //add rconst
mvn r12, r12 //to save a NOT in sbox calculations
strd r11, r12, [r0, #16] //store 1st half tk for 17th round
and r10, r10, r10, lsr #2 //r10<- 0x30303030
and r11, r10, r6, ror #30 //--- ror and masks to match fixslicing
and r6, r6, r10, ror #4
orr r6, r11, r6, ror #22
and r11, r10, r7, ror #30
and r7, r7, r10, ror #4
orr r7, r11, r7, ror #22
and r11, r10, r8, ror #30
and r8, r8, r10, ror #4
orr r8, r11, r8, ror #22
and r11, r10, r9, ror #30
and r9, r9, r10, ror #4
orr r9, r11, r9, ror #22 //ror and masks to match fixslicing ---
eor r6 ,r6, #0x00000010
eor r7 ,r7, #0x00000010
eor r8, r8, #0x00000010
eor r8, r8, #0x00010000
mvn r9, r9 //to save a NOT in sbox calculations
strd r6, r7, [r0], #8 //store 1st half tk for 16th round
strd r8, r9, [r0], #24 //store 2nd half tk for 16th round
ldm r0, {r6-r9} //load tk
bl p2 //apply the permutation twice
movw r10, #0xc3c3
movt r10, #0xc3c3 //r10<- 0xc3c3c3c3
and r11, r10, r6, ror #26 //ror and mask to match fixslicing
and r12, r10, r7, ror #26 //ror and mask to match fixslicing
eor r11, r11, #0x00000100 //add rconst
strd r11, r12, [r0], #8 //store 1st half tk for 18th round
and r11, r10, r8, ror #26 //ror and mask to match fixslicing
and r12, r10, r9, ror #26 //ror and mask to match fixslicing
eor r11, r11, #0x10000000 //add rconst
eor r11, r11, #0x00000140 //add rconst
eor r12, r12, #0x00000040 //add rconst
mvn r12, r12 //to save a NOT in sbox calculations
strd r11, r12, [r0], #8 //store 2nd half tk for 18th round
and r10, r10, r10, lsr #6 //r10<- 0x03030303
and r11, r10, r6, ror #28 //--- ror and masks to match fixslicing
and r6, r6, r10, lsl #6
orr r6, r11, r6, ror #12
and r11, r10, r7, ror #28
and r7, r7, r10, lsl #6
orr r7, r11, r7, ror #12
and r11, r10, r8, ror #28
and r8, r8, r10, lsl #6
orr r8, r11, r8, ror #12
and r11, r10, r9, ror #28
and r9, r9, r10, lsl #6
orr r9, r11, r9, ror #12 //ror and masks to match fixslicing ---
eor r7, r7, #0x04000000 //add rconst
eor r8, r8, #0x40000000 //add rconst
eor r8, r8, #0x00000100 //add rconst
eor r9, r9, #0x04000000 //add rconst
eor r9, r9, #0x00000100 //add rconst
mvn r9, r9 //to save a NOT in sbox calculations
strd r8, r9, [r0], #8 //store 1st half tk for 19th round
strd r6, r7, [r0], #8 //store 2nd half tk for 19th round
ldm r0, {r6-r9} //load tk
bl p4 //apply the permutation 4 times
movw r10, #0xf0f0
movt r10, #0xf0f0 //r10<- 0xf0f0f0f0
and r11, r10, r6, ror #16 //ror and mask to match fixslicing
and r12, r10, r7, ror #16 //ror and mask to match fixslicing
eor r12, r12, #0x00400000 //add rconst
strd r11, r12, [r0, #24] //store 2nd half tk for 21th round
and r11, r10, r8, ror #16 //ror and mask to match fixslicing
and r12, r10, r9, ror #16 //ror and mask to match fixslicing
eor r11, r11, #0x00440000 //add rconst
eor r12, r12, #0x00100000 //add rconst
mvn r12, r12 //to save a NOT in sbox calculations
strd r11, r12, [r0, #16] //store 1st half tk for 21th round
and r10, r10, r10, lsr #2 //r10<- 0x30303030
and r11, r10, r6, ror #14 //--- ror and masks to match fixslicing
and r6, r6, r10, ror #4
orr r6, r11, r6, ror #6
and r11, r10, r7, ror #14
and r7, r7, r10, ror #4
orr r7, r11, r7, ror #6
and r11, r10, r8, ror #14
and r8, r8, r10, ror #4
orr r8, r11, r8, ror #6
and r11, r10, r9, ror #14
and r9, r9, r10, ror #4
orr r9, r11, r9, ror #6 //ror and masks to match fixslicing ---
eor r6, r6, #0x00100000 //add rconst
eor r8, r8, #0x04100000 //add rconst
eor r8, r8, #0x00000001 //add rconst
eor r9, r9, #0x00100000 //add rconst
mvn r9, r9 //to save a NOT in sbox calculations
strd r6, r7, [r0], #8 //store 1st half tk for 20th round
strd r8, r9, [r0], #24 //store 2nd half tk for 20th round
ldm r0, {r6-r9} //load tk
bl p6 //apply the permutation 6 times
movw r10, #0xc3c3
movt r10, #0xc3c3 //r10<- 0xc3c3c3c3
and r11, r10, r6, ror #10 //ror and mask to match fixslicing
and r12, r10, r7, ror #10 //ror and mask to match fixslicing
eor r11, r11, #0x01000000 //add rconst
eor r12, r12, #0x01000000 //add rconst
strd r11, r12, [r0], #8 //store 1st half tk for 22th round
and r11, r10, r8, ror #10 //ror and mask to match fixslicing
and r12, r10, r9, ror #10 //ror and mask to match fixslicing
eor r11, r11, #0x00400000 //add rconst
eor r11, r11, #0x00001000 //add rconst
mvn r12, r12 //to save a NOT in sbox calculations
strd r11, r12, [r0], #8 //store 2nd half tk for 22th round
and r10, r10, r10, lsr #6 //r10<- 0x03030303
and r11, r10, r6, ror #12 //--- ror and masks to match fixslicing
and r6, r6, r10, lsl #6
orr r6, r11, r6, ror #28
and r11, r10, r7, ror #12
and r7, r7, r10, lsl #6
orr r7, r11, r7, ror #28
and r11, r10, r8, ror #12
and r8, r8, r10, lsl #6
orr r8, r11, r8, ror #28
and r11, r10, r9, ror #12
and r9, r9, r10, lsl #6
orr r9, r11, r9, ror #28 //ror and masks to match fixslicing ---
eor r6, r6, #0x00000400 //add rconst
eor r8, r8, #0x00004000 //add rconst
eor r9, r9, #0x01000000 //add rconst
mvn r9, r9 //to save a NOT in sbox calculations
strd r8, r9, [r0], #8 //store 1st half tk for 23th round
strd r6, r7, [r0], #8 //store 2nd half tk for 23th round
ldm r0, {r6-r9} //load tk
bl p8 //apply the permutation 8 times
movw r10, #0xf0f0
movt r10, #0xf0f0 //r10<- 0xf0f0f0f0
and r11, r10, r6 //ror and mask to match fixslicing
and r12, r10, r7 //ror and mask to match fixslicing
strd r11, r12, [r0, #24] //store 2nd half tk for 25th round
and r11, r10, r8 //ror and mask to match fixslicing
and r12, r10, r9 //ror and mask to match fixslicing
eor r11, r11, #0x00000014 //add rconst
eor r12, r12, #0x00000040 //add rconst
mvn r12, r12 //to save a NOT in sbox calculations
strd r11, r12, [r0, #16] //store 1st half tk for 25th round
and r10, r10, r10, lsr #2 //r10<- 0x30303030
and r11, r10, r6, ror #30 //--- ror and masks to match fixslicing
and r6, r6, r10, ror #4
orr r6, r11, r6, ror #22
and r11, r10, r7, ror #30
and r7, r7, r10, ror #4
orr r7, r11, r7, ror #22
and r11, r10, r8, ror #30
and r8, r8, r10, ror #4
orr r8, r11, r8, ror #22
and r11, r10, r9, ror #30
and r9, r9, r10, ror #4
orr r9, r11, r9, ror #22 //ror and masks to match fixslicing ---
eor r8, r8, #0x00010400
eor r9, r9, #0x00000400
mvn r9, r9 //to save a NOT in sbox calculations
strd r6, r7, [r0], #8 //store 1st half tk for 24th round
strd r8, r9, [r0], #24 //store 2nd half tk for 24th round
ldm r0, {r6-r9} //load tk
bl p10 //apply the permutation 10 times
movw r10, #0xc3c3
movt r10, #0xc3c3 //r10<- 0xc3c3c3c3
and r11, r10, r6, ror #26 //ror and mask to match fixslicing
and r12, r10, r7, ror #26 //ror and mask to match fixslicing
strd r11, r12, [r0], #8 //store 1st half tk for 26th round
and r11, r10, r8, ror #26 //ror and mask to match fixslicing
and r12, r10, r9, ror #26 //ror and mask to match fixslicing
eor r11, r11, #0x10000000 //add rconst
eor r11, r11, #0x00000100 //add rconst
mvn r12, r12 //to save a NOT in sbox calculations
strd r11, r12, [r0], #8 //store 2nd half tk for 26th round
and r10, r10, r10, lsr #6 //r10<- 0x03030303
and r11, r10, r6, ror #28 //--- ror and masks to match fixslicing
and r6, r6, r10, lsl #6
orr r6, r11, r6, ror #12
and r11, r10, r7, ror #28
and r7, r7, r10, lsl #6
orr r7, r11, r7, ror #12
and r11, r10, r8, ror #28
and r8, r8, r10, lsl #6
orr r8, r11, r8, ror #12
and r11, r10, r9, ror #28
and r9, r9, r10, lsl #6
orr r9, r11, r9, ror #12 //ror and masks to match fixslicing ---
eor r7, r7, #0x04000000 //add rconst
eor r8, r8, #0x40000000 //add rconst
eor r9, r9, #0x04000000 //add rconst
mvn r9, r9 //to save a NOT in sbox calculations
strd r8, r9, [r0], #8 //store 1st half tk for 27th round
strd r6, r7, [r0], #8 //store 2nd half tk for 27th round
ldm r0, {r6-r9} //load tk
bl p12 //apply the permutation 4 times
movw r10, #0xf0f0
movt r10, #0xf0f0 //r10<- 0xf0f0f0f0
and r11, r10, r6, ror #16 //ror and mask to match fixslicing
and r12, r10, r7, ror #16 //ror and mask to match fixslicing
eor r12, r12, #0x00400000 //add rconst
strd r11, r12, [r0, #24] //store 2nd half tk for 29th round
and r11, r10, r8, ror #16 //ror and mask to match fixslicing
and r12, r10, r9, ror #16 //ror and mask to match fixslicing
eor r11, r11, #0x00440000 //add rconst
eor r12, r12, #0x00500000 //add rconst
mvn r12, r12 //to save a NOT in sbox calculations
strd r11, r12, [r0, #16] //store 1st half tk for 29th round
and r10, r10, r10, lsr #2 //r10<- 0x30303030
and r11, r10, r6, ror #14 //--- ror and masks to match fixslicing
and r6, r6, r10, ror #4
orr r6, r11, r6, ror #6
and r11, r10, r7, ror #14
and r7, r7, r10, ror #4
orr r7, r11, r7, ror #6
and r11, r10, r8, ror #14
and r8, r8, r10, ror #4
orr r8, r11, r8, ror #6
and r11, r10, r9, ror #14
and r9, r9, r10, ror #4
orr r9, r11, r9, ror #6 //ror and masks to match fixslicing ---
eor r6, r6, #0x00100000 //add rconst
eor r8, r8, #0x00100000 //add rconst
eor r8, r8, #0x00000001 //add rconst
eor r9, r9, #0x00100000 //add rconst
mvn r9, r9 //to save a NOT in sbox calculations
strd r6, r7, [r0], #8 //store 1st half tk for 28th round
strd r8, r9, [r0], #24 //store 2nd half tk for 28th round
ldm r0, {r6-r9} //load tk
bl p14 //apply the permutation 6 times
movw r10, #0xc3c3
movt r10, #0xc3c3 //r10<- 0xc3c3c3c3
and r11, r10, r6, ror #10 //ror and mask to match fixslicing
and r12, r10, r7, ror #10 //ror and mask to match fixslicing
eor r11, r11, #0x01000000 //add rconst
eor r12, r12, #0x01000000 //add rconst
strd r11, r12, [r0], #8 //store 1st half tk for 30th round
and r11, r10, r8, ror #10 //ror and mask to match fixslicing
and r12, r10, r9, ror #10 //ror and mask to match fixslicing
eor r11, r11, #0x01400000 //add rconst
eor r11, r11, #0x00001000 //add rconst
mvn r12, r12 //to save a NOT in sbox calculations
strd r11, r12, [r0], #8 //store 2nd half tk for 30th round
and r10, r10, r10, lsr #6 //r10<- 0x03030303
and r11, r10, r6, ror #12 //--- ror and masks to match fixslicing
and r6, r6, r10, lsl #6
orr r6, r11, r6, ror #28
and r11, r10, r7, ror #12
and r7, r7, r10, lsl #6
orr r7, r11, r7, ror #28
and r11, r10, r8, ror #12
and r8, r8, r10, lsl #6
orr r8, r11, r8, ror #28
and r11, r10, r9, ror #12
and r9, r9, r10, lsl #6
orr r9, r11, r9, ror #28 //ror and masks to match fixslicing ---
eor r6, r6, #0x00000400 //add rconst
eor r7, r7, #0x00000400 //add rconst
eor r8, r8, #0x00004000 //add rconst
eor r9, r9, #0x01000000 //add rconst
mvn r9, r9 //to save a NOT in sbox calculations
strd r8, r9, [r0], #8 //store 1st half tk for 31th round
strd r6, r7, [r0], #8 //store 2nd half tk for 31th round
ldm r0, {r6-r9} //load tk
movw r10, #0xf0f0
movt r10, #0xf0f0 //r10<- 0xf0f0f0f0
and r11, r10, r6 //ror and mask to match fixslicing
and r12, r10, r7 //ror and mask to match fixslicing
strd r11, r12, [r0, #24] //store 2nd half tk for 33th round
and r11, r10, r8 //ror and mask to match fixslicing
and r12, r10, r9 //ror and mask to match fixslicing
eor r11, r11, #0x00000014 //add rconst
eor r12, r12, #0x00000050 //add rconst
mvn r12, r12 //to save a NOT in sbox calculations
strd r11, r12, [r0, #16] //store 1st half tk for 33th round
and r10, r10, r10, lsr #2 //r10<- 0x30303030
and r11, r10, r6, ror #30 //--- ror and masks to match fixslicing
and r6, r6, r10, ror #4
orr r6, r11, r6, ror #22
and r11, r10, r7, ror #30
and r7, r7, r10, ror #4
orr r7, r11, r7, ror #22
and r11, r10, r8, ror #30
and r8, r8, r10, ror #4
orr r8, r11, r8, ror #22
and r11, r10, r9, ror #30
and r9, r9, r10, ror #4
orr r9, r11, r9, ror #22 //ror and masks to match fixslicing ---
eor r6 ,r6, #0x00000010
eor r8, r8, #0x00010400
eor r9, r9, #0x00000400
mvn r9, r9 //to save a NOT in sbox calculations
strd r6, r7, [r0], #8 //store 1st half tk for 32th round
strd r8, r9, [r0], #24 //store 2nd half tk for 32th round
ldm r0, {r6-r9} //load tk
bl p2 //apply the permutation twice
movw r10, #0xc3c3
movt r10, #0xc3c3 //r10<- 0xc3c3c3c3
and r11, r10, r6, ror #26 //ror and mask to match fixslicing
and r12, r10, r7, ror #26 //ror and mask to match fixslicing
strd r11, r12, [r0], #8 //store 1st half tk for 34th round
and r11, r10, r8, ror #26 //ror and mask to match fixslicing
and r12, r10, r9, ror #26 //ror and mask to match fixslicing
eor r11, r11, #0x10000000 //add rconst
eor r11, r11, #0x00000140 //add rconst
eor r12, r12, #0x00000100 //add rconst
mvn r12, r12 //to save a NOT in sbox calculations
strd r11, r12, [r0], #8 //store 2nd half tk for 34th round
and r10, r10, r10, lsr #6 //r10<- 0x03030303
and r11, r10, r6, ror #28 //--- ror and masks to match fixslicing
and r6, r6, r10, lsl #6
orr r6, r11, r6, ror #12
and r11, r10, r7, ror #28
and r7, r7, r10, lsl #6
orr r7, r11, r7, ror #12
and r11, r10, r8, ror #28
and r8, r8, r10, lsl #6
orr r8, r11, r8, ror #12
and r11, r10, r9, ror #28
and r9, r9, r10, lsl #6
orr r9, r11, r9, ror #12 //ror and masks to match fixslicing ---
eor r7, r7, #0x04000000 //add rconst
eor r8, r8, #0x44000000 //add rconst
mvn r9, r9 //to save a NOT in sbox calculations
strd r8, r9, [r0], #8 //store 1st half tk for 35th round
strd r6, r7, [r0], #8 //store 2nd half tk for 35th round
ldm r0, {r6-r9} //load tk
bl p4 //apply the permutation 4 times
movw r10, #0xf0f0
movt r10, #0xf0f0 //r10<- 0xf0f0f0f0
and r11, r10, r6, ror #16 //ror and mask to match fixslicing
and r12, r10, r7, ror #16 //ror and mask to match fixslicing
eor r11, r11, #0x00400000 //add rconst
strd r11, r12, [r0, #24] //store 2nd half tk for 37th round
and r11, r10, r8, ror #16 //ror and mask to match fixslicing
and r12, r10, r9, ror #16 //ror and mask to match fixslicing
eor r11, r11, #0x00440000 //add rconst
eor r12, r12, #0x00500000 //add rconst
mvn r12, r12 //to save a NOT in sbox calculations
strd r11, r12, [r0, #16] //store 1st half tk for 37th round
and r10, r10, r10, lsr #2 //r10<- 0x30303030
and r11, r10, r6, ror #14 //--- ror and masks to match fixslicing
and r6, r6, r10, ror #4
orr r6, r11, r6, ror #6
and r11, r10, r7, ror #14
and r7, r7, r10, ror #4
orr r7, r11, r7, ror #6
and r11, r10, r8, ror #14
and r8, r8, r10, ror #4
orr r8, r11, r8, ror #6
and r11, r10, r9, ror #14
and r9, r9, r10, ror #4
orr r9, r11, r9, ror #6 //ror and masks to match fixslicing ---
eor r6, r6, #0x00100000 //add rconst
eor r7, r7, #0x00100000 //add rconst
eor r8, r8, #0x00000001 //add rconst
eor r9, r9, #0x00100000 //add rconst
mvn r9, r9 //to save a NOT in sbox calculations
strd r6, r7, [r0], #8 //store 1st half tk for 36th round
strd r8, r9, [r0], #24 //store 2nd half tk for 36th round
ldm r0, {r6-r9} //load tk
bl p6 //apply the permutation 6 times
movw r10, #0xc3c3
movt r10, #0xc3c3 //r10<- 0xc3c3c3c3
and r11, r10, r6, ror #10 //ror and mask to match fixslicing
and r12, r10, r7, ror #10 //ror and mask to match fixslicing
eor r12, r12, #0x01000000 //add rconst
strd r11, r12, [r0], #8 //store 1st half tk for 38th round
and r11, r10, r8, ror #10 //ror and mask to match fixslicing
and r12, r10, r9, ror #10 //ror and mask to match fixslicing
eor r11, r11, #0x01400000 //add rconst
eor r11, r11, #0x00001000 //add rconst
eor r12, r12, #0x00400000 //add rconst
mvn r12, r12 //to save a NOT in sbox calculations
strd r11, r12, [r0], #8 //store 2nd half tk for 38th round
and r10, r10, r10, lsr #6 //r10<- 0x03030303
and r11, r10, r6, ror #12 //--- ror and masks to match fixslicing
and r6, r6, r10, lsl #6
orr r6, r11, r6, ror #28
and r11, r10, r7, ror #12
and r7, r7, r10, lsl #6
orr r7, r11, r7, ror #28
and r11, r10, r8, ror #12
and r8, r8, r10, lsl #6
orr r8, r11, r8, ror #28
and r11, r10, r9, ror #12
and r9, r9, r10, lsl #6
orr r9, r11, r9, ror #28 //ror and masks to match fixslicing ---
eor r6, r6, #0x00000400 //add rconst
eor r7, r7, #0x00000400 //add rconst
eor r8, r8, #0x01000000
eor r8, r8, #0x00004000 //add rconst
eor r9, r9, #0x00000400 //add rconst
mvn r9, r9 //to save a NOT in sbox calculations
strd r8, r9, [r0], #8 //store 1st half tk for 39th round
strd r6, r7, [r0], #8 //store 2nd half tk for 39th round
ldm r0, {r6-r9} //load tk
bl p8 //apply the permutation 8 times
movw r10, #0x3030
movt r10, #0x3030 //r10<- 0x30303030
and r11, r10, r6, ror #30 //--- ror and masks to match fixslicing
and r6, r6, r10, ror #4
orr r6, r11, r6, ror #22
and r11, r10, r7, ror #30
and r7, r7, r10, ror #4
orr r7, r11, r7, ror #22
and r11, r10, r8, ror #30
and r8, r8, r10, ror #4
orr r8, r11, r8, ror #22
and r11, r10, r9, ror #30
and r9, r9, r10, ror #4
orr r9, r11, r9, ror #22 //ror and masks to match fixslicing ---
eor r6, r6, #0x00000010
eor r8, r8, #0x00010000
eor r8, r8, #0x00000010
eor r9, r9, #0x00000400
mvn r9, r9 //to save a NOT in sbox calculations
strd r6, r7, [r0], #8 //store 1st half tk for 39th round
strd r8, r9, [r0] //store 2nd half tk for 39th round
add.w sp, #4 //restore stack pointer
pop {r0-r12, lr}
bx lr
/******************************************************************************
* Applies the permutations P^2, ..., P^14 for rounds 0 to 16. Since P^16 = Id,
* we don't need any further calculations, as no LFSR is applied to TK1.
******************************************************************************/
@ void tkschedule_perm_tk1(u32* tk, const u8* key)
@ Precomputes the round tweakeys derived from TK1 only: packs the 16-byte key
@ into bitsliced form, then for each pair of rounds applies the tweakey
@ permutation P twice (bl p2) and stores the masked/rotated words in the
@ fixsliced representation expected by the round function. Only 16 rounds of
@ material are produced since P^16 = Id and TK1 has no LFSR.
@ r0 = output tk array (advanced as we store), r1 = input key.
.global tkschedule_perm_tk1
.type tkschedule_perm_tk1,%function
.align 2
tkschedule_perm_tk1:
push {r0-r12, lr}
ldr.w r3, [r1, #8] //load tk1 (3rd word)
ldr.w r4, [r1, #4] //load tk1 (2nd word)
ldr.w r5, [r1, #12] //load tk1 (4th word)
ldr.w r2, [r1] //load tk1 (1st word)
movw r10, #0x0a0a
movt r10, #0x0a0a //r10<- 0x0a0a0a0a (mask used by 'packing')
movw r11, #0x3030
movt r11, #0x3030 //r11<- 0x30303030 (mask used by 'packing')
bl packing //pack tk1 into bitsliced representation (r2-r5)
mov r6, r2 //move tk1 from r2-r5 to r6-r9
mov r7, r3 //move tk1 from r2-r5 to r6-r9
mov r8, r4 //move tk1 from r2-r5 to r6-r9
mov r9, r5 //move tk1 from r2-r5 to r6-r9
movw r2, #0xf0f0
movt r2, #0xf0f0 //r2<- 0xf0f0f0f0 (kept live for all ror #16/ror #0 rounds)
and r11, r8, r2 //tk &= 0xf0f0f0f0 (3rd word)
and r12, r9, r2 //tk &= 0xf0f0f0f0 (4th word)
strd r11, r12, [r0], #8 //store 1st half tk for 1st round
and r11, r6, r2 //tk &= 0xf0f0f0f0 (1st word)
and r12, r7, r2 //tk &= 0xf0f0f0f0 (2nd word)
strd r11, r12, [r0], #8 //store 2nd half tk for 1st round
movw r3, #0x3030
movt r3, #0x3030 //r3 <- 0x30303030
and r11, r3, r6, ror #30 //--- ror and masks to match fixslicing
and r12, r6, r3, ror #4
orr r12, r11, r12, ror #22
str.w r12, [r0, #224]
and r11, r3, r7, ror #30
and r12, r7, r3, ror #4
orr r12, r11, r12, ror #22
str.w r12, [r0, #228]
and r11, r3, r8, ror #30
and r12, r8, r3, ror #4
orr r12, r11, r12, ror #22
str.w r12, [r0, #232]
and r11, r3, r9, ror #30
and r12, r9, r3, ror #4
orr r12, r11, r12, ror #22 //ror and masks to match fixslicing ---
str.w r12, [r0, #236]
bl p2 //apply the permutation twice (P^2 overall)
movw r3, #0xc3c3
movt r3, #0xc3c3 //r3 <- 0xc3c3c3c3
and r11, r3, r6, ror #26 //ror and mask to match fixslicing
and r12, r3, r7, ror #26 //ror and mask to match fixslicing
strd r11, r12, [r0], #8 //store 1st half tk for 2nd round
and r11, r3, r8, ror #26 //ror and mask to match fixslicing
and r12, r3, r9, ror #26 //ror and mask to match fixslicing
strd r11, r12, [r0], #8 //store 2nd half tk for 2nd round
and r3, r3, r3, lsr #6 //r3<- 0x03030303
and r11, r3, r6, ror #28 //--- ror and masks to match fixslicing
and r12, r6, r3, lsl #6
orr r12, r11, r12, ror #12
str.w r12, [r0, #8]
and r11, r3, r7, ror #28
and r12, r7, r3, lsl #6
orr r12, r11, r12, ror #12
str.w r12, [r0, #12]
and r11, r3, r9, ror #28
and r12, r9, r3, lsl #6
orr r12, r11, r12, ror #12
str.w r12, [r0, #4]
and r11, r3, r8, ror #28
and r12, r8, r3, lsl #6
orr r12, r11, r12, ror #12
str.w r12, [r0], #16 //ror and masks to match fixslicing ---
bl p2 //apply the permutation twice more (P^4 overall)
lsl r3, r3, #4 //r3 <- 0x30303030
and r11, r3, r6, ror #14 //--- ror and masks to match fixslicing
and r12, r6, r3, ror #4
orr r12, r11, r12, ror #6
str.w r12, [r0], #4
and r11, r3, r7, ror #14
and r12, r7, r3, ror #4
orr r12, r11, r12, ror #6
str.w r12, [r0], #4
and r11, r3, r8, ror #14
and r12, r8, r3, ror #4
orr r12, r11, r12, ror #6
str.w r12, [r0], #4
and r11, r3, r9, ror #14
and r12, r9, r3, ror #4
orr r12, r11, r12, ror #6 //ror and masks to match fixslicing ---
str.w r12, [r0], #4
and r11, r2, r6, ror #16 //ror and mask to match fixslicing
and r12, r2, r7, ror #16 //ror and mask to match fixslicing
strd r11, r12, [r0, #8] //store 2nd half tk for 5th round
and r11, r2, r8, ror #16 //ror and mask to match fixslicing
and r12, r2, r9, ror #16 //ror and mask to match fixslicing
strd r11, r12, [r0], #16 //store 1st half tk for 5th round
bl p2 //apply the permutation twice more (P^6 overall)
movw r3, #0xc3c3
movt r3, #0xc3c3 //r3<- 0xc3c3c3c3
and r11, r3, r6, ror #10 //ror and mask to match fixslicing
and r12, r3, r7, ror #10 //ror and mask to match fixslicing
strd r11, r12, [r0], #8 //store 1st half tk for 6th round
and r11, r3, r8, ror #10 //ror and mask to match fixslicing
and r12, r3, r9, ror #10 //ror and mask to match fixslicing
strd r11, r12, [r0], #8 //store 2nd half tk for 6th round
and r3, r3, r3, lsr #6 //r3<- 0x03030303
and r11, r3, r6, ror #12 //--- ror and masks to match fixslicing
and r12, r6, r3, lsl #6
orr r12, r11, r12, ror #28
str.w r12, [r0, #8]
and r11, r3, r7, ror #12
and r12, r7, r3, lsl #6
orr r12, r11, r12, ror #28
str.w r12, [r0, #12]
and r11, r3, r9, ror #12
and r12, r9, r3, lsl #6
orr r12, r11, r12, ror #28
str.w r12, [r0, #4]
and r11, r3, r8, ror #12
and r12, r8, r3, lsl #6
orr r12, r11, r12, ror #28
str.w r12, [r0], #16 //ror and masks to match fixslicing ---
bl p2 //apply the permutation twice more (P^8 overall)
lsl r3, r3, #4 //r3 <- 0x30303030
and r11, r3, r6, ror #30 //--- ror and masks to match fixslicing
and r12, r6, r3, ror #4
orr r12, r11, r12, ror #22
str.w r12, [r0], #4
and r11, r3, r7, ror #30
and r12, r7, r3, ror #4
orr r12, r11, r12, ror #22
str.w r12, [r0], #4
and r11, r3, r8, ror #30
and r12, r8, r3, ror #4
orr r12, r11, r12, ror #22
str.w r12, [r0], #4
and r11, r3, r9, ror #30
and r12, r9, r3, ror #4
orr r12, r11, r12, ror #22 //ror and masks to match fixslicing ---
str.w r12, [r0], #4
and r11, r2, r6 //ror and mask to match fixslicing
and r12, r2, r7 //ror and mask to match fixslicing
strd r11, r12, [r0, #8] //store 2nd half tk for 9th round
and r11, r2, r8 //ror and mask to match fixslicing
and r12, r2, r9 //ror and mask to match fixslicing
strd r11, r12, [r0], #16 //store 1st half tk for 9th round
bl p2 //apply the permutation twice more (P^10 overall)
movw r3, #0xc3c3
movt r3, #0xc3c3 //r3 <- 0xc3c3c3c3
and r11, r3, r6, ror #26 //ror and mask to match fixslicing
and r12, r3, r7, ror #26 //ror and mask to match fixslicing
strd r11, r12, [r0], #8 //store 1st half tk for 10th round
and r11, r3, r8, ror #26 //ror and mask to match fixslicing
and r12, r3, r9, ror #26 //ror and mask to match fixslicing
strd r11, r12, [r0], #8 //store 2nd half tk for 10th round
and r3, r3, r3, lsr #6 //r3 <- 0x03030303
and r11, r3, r6, ror #28 //--- ror and masks to match fixslicing
and r12, r6, r3, lsl #6
orr r12, r11, r12, ror #12
str.w r12, [r0, #8]
and r11, r3, r7, ror #28
and r12, r7, r3, lsl #6
orr r12, r11, r12, ror #12
str.w r12, [r0, #12]
and r11, r3, r9, ror #28
and r12, r9, r3, lsl #6
orr r12, r11, r12, ror #12
str.w r12, [r0, #4]
and r11, r3, r8, ror #28
and r12, r8, r3, lsl #6
orr r12, r11, r12, ror #12
str.w r12, [r0], #16 //ror and masks to match fixslicing ---
bl p2 //apply the permutation twice more (P^12 overall)
lsl r3, r3, #4 //r3 <- 0x30303030
and r11, r3, r6, ror #14 //--- ror and masks to match fixslicing
and r12, r6, r3, ror #4
orr r12, r11, r12, ror #6
str.w r12, [r0], #4
and r11, r3, r7, ror #14
and r12, r7, r3, ror #4
orr r12, r11, r12, ror #6
str.w r12, [r0], #4
and r11, r3, r8, ror #14
and r12, r8, r3, ror #4
orr r12, r11, r12, ror #6
str.w r12, [r0], #4
and r11, r3, r9, ror #14
and r12, r9, r3, ror #4
orr r12, r11, r12, ror #6 //ror and masks to match fixslicing ---
str.w r12, [r0], #4
and r11, r2, r6, ror #16 //ror and mask to match fixslicing
and r12, r2, r7, ror #16 //ror and mask to match fixslicing
strd r11, r12, [r0, #8] //store 2nd half tk for 13th round
and r11, r2, r8, ror #16 //ror and mask to match fixslicing
and r12, r2, r9, ror #16 //ror and mask to match fixslicing
strd r11, r12, [r0], #16 //store 1st half tk for 13th round
bl p2 //apply the permutation twice more (P^14 overall)
movw r3, #0xc3c3
movt r3, #0xc3c3 //r3 <- 0xc3c3c3c3
and r11, r3, r6, ror #10 //ror and mask to match fixslicing
and r12, r3, r7, ror #10 //ror and mask to match fixslicing
strd r11, r12, [r0], #8 //store 1st half tk for 14th round
and r11, r3, r8, ror #10 //ror and mask to match fixslicing
and r12, r3, r9, ror #10 //ror and mask to match fixslicing
strd r11, r12, [r0], #8 //store 2nd half tk for 14th round
and r3, r3, r3, lsr #6 //r3 <- 0x03030303
and r11, r3, r6, ror #12 //--- ror and masks to match fixslicing
and r12, r6, r3, lsl #6
orr r12, r11, r12, ror #28
str.w r12, [r0, #8]
and r11, r3, r7, ror #12
and r12, r7, r3, lsl #6
orr r12, r11, r12, ror #28
str.w r12, [r0, #12]
and r11, r3, r9, ror #12
and r12, r9, r3, lsl #6
orr r12, r11, r12, ror #28
str.w r12, [r0, #4]
and r11, r3, r8, ror #12
and r12, r8, r3, lsl #6
orr r12, r11, r12, ror #28
str.w r12, [r0], #16 //ror and masks to match fixslicing ---
pop {r0-r12, lr}
bx lr
/******************************************************************************
* Quadruple round of fixsliced SKINNY-128: four consecutive rounds, each
* consisting of the bitsliced S-box layer (SWAPMOVE-based), the addition of
* the precomputed round tweakeys, and one of the four fixsliced mixcolumns
* variants (the representation rotates, so each of the 4 rounds uses a
* different mixcolumns pattern).
* Register contract (as used below):
*   r0 - pointer to rtk_1 words, post-incremented 16 bytes per round
*   r1 - pointer to rtk_2_3 + rconst words, post-incremented likewise
*   r2-r5 - cipher state (bitsliced)
*   r6 - 0x55555555 SWAPMOVE mask
*   r7 - mixcolumns mask (set by caller; value not visible here - TODO confirm)
*   r8-r11 - clobbered as temporaries
******************************************************************************/
.align 2
quadruple_round:
//--- round 1: S-box layer via SWAPMOVE sequences ---
orr r8, r2, r3
eor r5, r5, r8
mvn r5, r5
eor r8, r3, r4, lsr #1
and r8, r8, r6
eor r3, r3, r8
eor r4, r4, r8, lsl #1 //SWAPMOVE(r4, r3, 0x55555555, 1);
eor r8, r4, r5, lsr #1
and r8, r8, r6
eor r4, r4, r8
eor r5, r5, r8, lsl #1 //SWAPMOVE(r5, r4, 0x55555555, 1);
orr r8, r4, r5
eor r3, r3, r8
mvn r3, r3
eor r8, r2, r3, lsr #1
and r8, r8, r6
eor r2, r2, r8
eor r3, r3, r8, lsl #1 //SWAPMOVE(r3, r2, 0x55555555, 1);
eor r8, r5, r2, lsr #1
and r8, r8, r6
eor r5, r5, r8
eor r2, r2, r8, lsl #1 //SWAPMOVE(r2, r5, 0x55555555, 1);
orr r8, r2, r3
eor r5, r5, r8
mvn r5, r5
eor r8, r3, r4, lsr #1
and r8, r8, r6
eor r3, r3, r8
eor r4, r4, r8, lsl #1 //SWAPMOVE(r4, r3, 0x55555555, 1);
eor r8, r4, r5, lsr #1
and r8, r8, r6
eor r4, r4, r8
eor r5, r5, r8, lsl #1 //SWAPMOVE(r5, r4, 0x55555555, 1);
orr r8, r4, r5
eor r3, r3, r8
eor r8, r2, r5
and r8, r8, r6
eor r2, r2, r8
eor r5, r5, r8 //SWAPMOVE(r5, r2, 0x55555555, 0);
ldmia.w r1!, {r8-r11} //load rkeys in r8,...,r11
eor r2, r2, r8 //add rtk_2_3 + rconst
eor r3, r3, r9 //add rtk_2_3 + rconst
eor r4, r4, r10 //add rtk_2_3 + rconst
eor r5, r5, r11 //add rtk_2_3 + rconst
ldmia.w r0!,{r8-r11}
eor r2, r2, r8 //add rtk_1
eor r3, r3, r9 //add rtk_1
eor r4, r4, r10 //add rtk_1
eor r5, r5, r11 //add rtk_1
and r8, r7, r2, ror #30 // --- mixcolumns 0 ---
eor r2, r2, r8, ror #24
and r8, r7, r2, ror #18
eor r2, r2, r8, ror #2
and r8, r7, r2, ror #6
eor r2, r2, r8, ror #4
and r8, r7, r3, ror #30
eor r3, r3, r8, ror #24
and r8, r7, r3, ror #18
eor r3, r3, r8, ror #2
and r8, r7, r3, ror #6
eor r3, r3, r8, ror #4
and r8, r7, r4, ror #30
eor r4, r4, r8, ror #24
and r8, r7, r4, ror #18
eor r4, r4, r8, ror #2
and r8, r7, r4, ror #6
eor r4, r4, r8, ror #4
and r8, r7, r5, ror #30
eor r5, r5, r8, ror #24
and r8, r7, r5, ror #18
eor r5, r5, r8, ror #2
and r8, r7, r5, ror #6
eor r5, r5, r8, ror #4
//--- round 2: S-box layer ---
orr r8, r4, r5
eor r3, r3, r8
mvn r3, r3
eor r8, r2, r3, lsr #1
and r8, r8, r6
eor r2, r2, r8
eor r3, r3, r8, lsl #1 //SWAPMOVE(r3, r2, 0x55555555, 1);
eor r8, r5, r2, lsr #1
and r8, r8, r6
eor r5, r5, r8
eor r2, r2, r8, lsl #1 //SWAPMOVE(r2, r5, 0x55555555, 1);
orr r8, r2, r3
eor r5, r5, r8
mvn r5, r5
eor r8, r3, r4, lsr #1
and r8, r8, r6
eor r3, r3, r8
eor r4, r4, r8, lsl #1 //SWAPMOVE(r4, r3, 0x55555555, 1);
eor r8, r4, r5, lsr #1
and r8, r8, r6
eor r4, r4, r8
eor r5, r5, r8, lsl #1 //SWAPMOVE(r5, r4, 0x55555555, 1);
orr r8, r4, r5
eor r3, r3, r8
mvn r3, r3
eor r8, r2, r3, lsr #1
and r8, r8, r6
eor r2, r2, r8
eor r3, r3, r8, lsl #1 //SWAPMOVE(r3, r2, 0x55555555, 1);
eor r8, r5, r2, lsr #1
and r8, r8, r6
eor r5, r5, r8
eor r2, r2, r8, lsl #1 //SWAPMOVE(r2, r5, 0x55555555, 1);
orr r8, r2, r3
eor r5, r5, r8
eor r8, r3, r4
and r8, r8, r6
eor r3, r3, r8
eor r4, r4, r8 //SWAPMOVE(r4, r3, 0x55555555, 0);
ldmia.w r1!, {r8-r11} //load rkeys in r8,...,r11
eor r2, r2, r8 //add rkey + rconst
eor r3, r3, r9 //add rkey + rconst
eor r4, r4, r10 //add rkey + rconst
eor r5, r5, r11 //add rkey + rconst
ldmia.w r0!,{r8-r11}
eor r2, r2, r8 //add rtk_1
eor r3, r3, r9 //add rtk_1
eor r4, r4, r10 //add rtk_1
eor r5, r5, r11 //add rtk_1
and r8, r7, r2, ror #16 // --- mixcolumns 1 ---
eor r2, r2, r8, ror #30
and r8, r7, r2, ror #28
eor r2, r2, r8
and r8, r7, r2, ror #16
eor r2, r2, r8, ror #2
and r8, r7, r3, ror #16
eor r3, r3, r8, ror #30
and r8, r7, r3, ror #28
eor r3, r3, r8
and r8, r7, r3, ror #16
eor r3, r3, r8, ror #2
and r8, r7, r4, ror #16
eor r4, r4, r8, ror #30
and r8, r7, r4, ror #28
eor r4, r4, r8
and r8, r7, r4, ror #16
eor r4, r4, r8, ror #2
and r8, r7, r5, ror #16
eor r5, r5, r8, ror #30
and r8, r7, r5, ror #28
eor r5, r5, r8
and r8, r7, r5, ror #16
eor r5, r5, r8, ror #2
//--- round 3: S-box layer ---
orr r8, r2, r3
eor r5, r5, r8
mvn r5, r5
eor r8, r3, r4, lsr #1
and r8, r8, r6
eor r3, r3, r8
eor r4, r4, r8, lsl #1 //SWAPMOVE(r4, r3, 0x55555555, 1);
eor r8, r4, r5, lsr #1
and r8, r8, r6
eor r4, r4, r8
eor r5, r5, r8, lsl #1 //SWAPMOVE(r5, r4, 0x55555555, 1);
orr r8, r4, r5
eor r3, r3, r8
mvn r3, r3
eor r8, r2, r3, lsr #1
and r8, r8, r6
eor r2, r2, r8
eor r3, r3, r8, lsl #1 //SWAPMOVE(r3, r2, 0x55555555, 1);
eor r8, r5, r2, lsr #1
and r8, r8, r6
eor r5, r5, r8
eor r2, r2, r8, lsl #1 //SWAPMOVE(r2, r5, 0x55555555, 1);
orr r8, r2, r3
eor r5, r5, r8
mvn r5, r5
eor r8, r3, r4, lsr #1
and r8, r8, r6
eor r3, r3, r8
eor r4, r4, r8, lsl #1 //SWAPMOVE(r4, r3, 0x55555555, 1);
eor r8, r4, r5, lsr #1
and r8, r8, r6
eor r4, r4, r8
eor r5, r5, r8, lsl #1 //SWAPMOVE(r5, r4, 0x55555555, 1);
orr r8, r4, r5
eor r3, r3, r8
eor r8, r2, r5
and r8, r8, r6
eor r2, r2, r8
eor r5, r5, r8 //SWAPMOVE(r5, r2, 0x55555555, 0);
ldmia.w r1!, {r8-r11} //load rkeys in r8,...,r11
eor r2, r2, r8 //add rtk_2_3 + rconst
eor r3, r3, r9 //add rtk_2_3 + rconst
eor r4, r4, r10 //add rtk_2_3 + rconst
eor r5, r5, r11 //add rtk_2_3 + rconst
ldmia.w r0!,{r8-r11}
eor r2, r2, r8 //add rtk_1
eor r3, r3, r9 //add rtk_1
eor r4, r4, r10 //add rtk_1
eor r5, r5, r11 //add rtk_1
and r8, r7, r2, ror #10 // --- mixcolumns 2 ---
eor r2, r2, r8, ror #4
and r8, r7, r2, ror #6
eor r2, r2, r8, ror #6
and r8, r7, r2, ror #26
eor r2, r2, r8
and r8, r7, r3, ror #10
eor r3, r3, r8, ror #4
and r8, r7, r3, ror #6
eor r3, r3, r8, ror #6
and r8, r7, r3, ror #26
eor r3, r3, r8
and r8, r7, r4, ror #10
eor r4, r4, r8, ror #4
and r8, r7, r4, ror #6
eor r4, r4, r8, ror #6
and r8, r7, r4, ror #26
eor r4, r4, r8
and r8, r7, r5, ror #10
eor r5, r5, r8, ror #4
and r8, r7, r5, ror #6
eor r5, r5, r8, ror #6
and r8, r7, r5, ror #26
eor r5, r5, r8
//--- round 4: S-box layer ---
orr r8, r4, r5
eor r3, r3, r8
mvn r3, r3
eor r8, r2, r3, lsr #1
and r8, r8, r6
eor r2, r2, r8
eor r3, r3, r8, lsl #1 //SWAPMOVE(r3, r2, 0x55555555, 1);
eor r8, r5, r2, lsr #1
and r8, r8, r6
eor r5, r5, r8
eor r2, r2, r8, lsl #1 //SWAPMOVE(r2, r5, 0x55555555, 1);
orr r8, r2, r3
eor r5, r5, r8
mvn r5, r5
eor r8, r3, r4, lsr #1
and r8, r8, r6
eor r3, r3, r8
eor r4, r4, r8, lsl #1 //SWAPMOVE(r4, r3, 0x55555555, 1);
eor r8, r4, r5, lsr #1
and r8, r8, r6
eor r4, r4, r8
eor r5, r5, r8, lsl #1 //SWAPMOVE(r5, r4, 0x55555555, 1);
orr r8, r4, r5
eor r3, r3, r8
mvn r3, r3
eor r8, r2, r3, lsr #1
and r8, r8, r6
eor r2, r2, r8
eor r3, r3, r8, lsl #1 //SWAPMOVE(r3, r2, 0x55555555, 1);
eor r8, r5, r2, lsr #1
and r8, r8, r6
eor r5, r5, r8
eor r2, r2, r8, lsl #1 //SWAPMOVE(r2, r5, 0x55555555, 1);
orr r8, r2, r3
eor r5, r5, r8
eor r8, r3, r4
and r8, r8, r6
eor r3, r3, r8
eor r4, r4, r8 //SWAPMOVE(r4, r3, 0x55555555, 0);
ldmia.w r1!, {r8-r11} //load rkeys in r8,...,r11
eor r2, r2, r8 //add rkey + rconst
eor r3, r3, r9 //add rkey + rconst
eor r4, r4, r10 //add rkey + rconst
eor r5, r5, r11 //add rkey + rconst
ldmia.w r0!,{r8-r11}
eor r2, r2, r8 //add rtk_1
eor r3, r3, r9 //add rtk_1
eor r4, r4, r10 //add rtk_1
eor r5, r5, r11 //add rtk_1
and r8, r7, r2, ror #4 // --- mixcolumns 3 ---
eor r2, r2, r8, ror #26
and r8, r7, r2
eor r2, r2, r8, ror #4
and r8, r7, r2, ror #4
eor r2, r2, r8, ror #22
and r8, r7, r3, ror #4
eor r3, r3, r8, ror #26
and r8, r7, r3
eor r3, r3, r8, ror #4
and r8, r7, r3, ror #4
eor r3, r3, r8, ror #22
and r8, r7, r4, ror #4
eor r4, r4, r8, ror #26
and r8, r7, r4
eor r4, r4, r8, ror #4
and r8, r7, r4, ror #4
eor r4, r4, r8, ror #22
and r8, r7, r5, ror #4
eor r5, r5, r8, ror #26
and r8, r7, r5
eor r5, r5, r8, ror #4
and r8, r7, r5, ror #4
eor r5, r5, r8, ror #22
bx lr
/******************************************************************************
* Inverse quadruple round of fixsliced SKINNY-128.
*
* Undoes four rounds in one pass: for each round it applies the matching
* inverse MixColumns variant (3, 2, 1, 0 — the reverse order of the forward
* routine), XORs in the round tweakeys (loaded backwards), then reverses the
* forward round's SWAPMOVE/OR/XOR S-box sequence.
*
* Register contract (established by skinny128_384 / skinny128_384_inv):
*   r0  -> rtk1 words, walked backwards via post-decrement loads
*   r1  -> rtk2_3 (+ round constants) words, walked backwards
*   r2-r5  packed 128-bit state
*   r6  =  0x55555555 (SWAPMOVE mask)
*   r7  =  0x30303030 (MixColumns mask)
*   r8-r11 scratch (clobbered)
******************************************************************************/
.align 2
inv_quadruple_round:
and r8, r7, r2, ror #4 // --- mixcolumns 3 ---
eor r2, r2, r8, ror #22
and r8, r7, r2
eor r2, r2, r8, ror #4
and r8, r7, r2, ror #4
eor r2, r2, r8, ror #26
and r8, r7, r3, ror #4
eor r3, r3, r8, ror #22
and r8, r7, r3
eor r3, r3, r8, ror #4
and r8, r7, r3, ror #4
eor r3, r3, r8, ror #26
and r8, r7, r4, ror #4
eor r4, r4, r8, ror #22
and r8, r7, r4
eor r4, r4, r8, ror #4
and r8, r7, r4, ror #4
eor r4, r4, r8, ror #26
and r8, r7, r5, ror #4
eor r5, r5, r8, ror #22
and r8, r7, r5
eor r5, r5, r8, ror #4
and r8, r7, r5, ror #4
eor r5, r5, r8, ror #26
// round tweakeys are consumed in reverse: high pair first, then low pair
ldrd r10, r11, [r1], #-8
ldrd r8, r9, [r1], #-8
eor r2, r2, r8 //add rkey + rconst
eor r3, r3, r9 //add rkey + rconst
eor r4, r4, r10 //add rkey + rconst
eor r5, r5, r11 //add rkey + rconst
ldrd r10, r11, [r0], #-8
ldrd r8, r9, [r0], #-8
eor r2, r2, r8 //add rtk1
eor r3, r3, r9 //add rtk1
eor r4, r4, r10 //add rtk1
eor r5, r5, r11 //add rtk1
// undo the S-box layer (forward round's SWAPMOVE/OR-XOR sequence, reversed)
eor r8, r3, r4
and r8, r8, r6
eor r3, r3, r8
eor r4, r4, r8 //SWAPMOVE(r4, r3, 0x55555555, 0);
orr r8, r2, r3
eor r5, r5, r8
eor r8, r5, r2, lsr #1
and r8, r8, r6
eor r5, r5, r8
eor r2, r2, r8, lsl #1 //SWAPMOVE(r2, r5, 0x55555555, 1);
eor r8, r2, r3, lsr #1
and r8, r8, r6
eor r2, r2, r8
eor r3, r3, r8, lsl #1 //SWAPMOVE(r3, r2, 0x55555555, 1);
orr r8, r4, r5
eor r3, r3, r8
mvn r3, r3
eor r8, r4, r5, lsr #1
and r8, r8, r6
eor r4, r4, r8
eor r5, r5, r8, lsl #1 //SWAPMOVE(r5, r4, 0x55555555, 1);
eor r8, r3, r4, lsr #1
and r8, r8, r6
eor r3, r3, r8
eor r4, r4, r8, lsl #1 //SWAPMOVE(r4, r3, 0x55555555, 1);
orr r8, r2, r3
eor r5, r5, r8
mvn r5, r5
eor r8, r5, r2, lsr #1
and r8, r8, r6
eor r5, r5, r8
eor r2, r2, r8, lsl #1 //SWAPMOVE(r2, r5, 0x55555555, 1);
eor r8, r2, r3, lsr #1
and r8, r8, r6
eor r2, r2, r8
eor r3, r3, r8, lsl #1 //SWAPMOVE(r3, r2, 0x55555555, 1);
orr r8, r4, r5
eor r3, r3, r8
mvn r3, r3
and r8, r7, r2, ror #26 // --- mixcolumns 2 ---
eor r2, r2, r8
and r8, r7, r2, ror #6
eor r2, r2, r8, ror #6
and r8, r7, r2, ror #10
eor r2, r2, r8, ror #4
and r8, r7, r3, ror #26
eor r3, r3, r8
and r8, r7, r3, ror #6
eor r3, r3, r8, ror #6
and r8, r7, r3, ror #10
eor r3, r3, r8, ror #4
and r8, r7, r4, ror #26
eor r4, r4, r8
and r8, r7, r4, ror #6
eor r4, r4, r8, ror #6
and r8, r7, r4, ror #10
eor r4, r4, r8, ror #4
and r8, r7, r5, ror #26
eor r5, r5, r8
and r8, r7, r5, ror #6
eor r5, r5, r8, ror #6
and r8, r7, r5, ror #10
eor r5, r5, r8, ror #4
ldrd r10, r11, [r1], #-8
ldrd r8, r9, [r1], #-8
eor r2, r2, r8 //add rk2_3 + rconst
eor r3, r3, r9 //add rk2_3 + rconst
eor r4, r4, r10 //add rk2_3 + rconst
eor r5, r5, r11 //add rk2_3 + rconst
ldrd r10, r11, [r0], #-8
ldrd r8, r9, [r0], #-8
eor r2, r2, r8 //add rtk1
eor r3, r3, r9 //add rtk1
eor r4, r4, r10 //add rtk1
eor r5, r5, r11 //add rtk1
// undo the S-box layer (variant used on odd-position rounds)
eor r8, r2, r5
and r8, r8, r6
eor r2, r2, r8
eor r5, r5, r8 //SWAPMOVE(r5, r2, 0x55555555, 0);
orr r8, r4, r5
eor r3, r3, r8
eor r8, r4, r5, lsr #1
and r8, r8, r6
eor r4, r4, r8
eor r5, r5, r8, lsl #1 //SWAPMOVE(r5, r4, 0x55555555, 1);
eor r8, r3, r4, lsr #1
and r8, r8, r6
eor r3, r3, r8
eor r4, r4, r8, lsl #1 //SWAPMOVE(r4, r3, 0x55555555, 1);
orr r8, r2, r3
eor r5, r5, r8
mvn r5, r5
eor r8, r5, r2, lsr #1
and r8, r8, r6
eor r5, r5, r8
eor r2, r2, r8, lsl #1 //SWAPMOVE(r2, r5, 0x55555555, 1);
eor r8, r2, r3, lsr #1
and r8, r8, r6
eor r2, r2, r8
eor r3, r3, r8, lsl #1 //SWAPMOVE(r3, r2, 0x55555555, 1);
orr r8, r4, r5
eor r3, r3, r8
mvn r3, r3
eor r8, r4, r5, lsr #1
and r8, r8, r6
eor r4, r4, r8
eor r5, r5, r8, lsl #1 //SWAPMOVE(r5, r4, 0x55555555, 1);
eor r8, r3, r4, lsr #1
and r8, r8, r6
eor r3, r3, r8
eor r4, r4, r8, lsl #1 //SWAPMOVE(r4, r3, 0x55555555, 1);
orr r8, r2, r3
eor r5, r5, r8
mvn r5, r5
and r8, r7, r2, ror #16 // --- mixcolumns 1 ---
eor r2, r2, r8, ror #2
and r8, r7, r2, ror #28
eor r2, r2, r8
and r8, r7, r2, ror #16
eor r2, r2, r8, ror #30
and r8, r7, r3, ror #16
eor r3, r3, r8, ror #2
and r8, r7, r3, ror #28
eor r3, r3, r8
and r8, r7, r3, ror #16
eor r3, r3, r8, ror #30
and r8, r7, r4, ror #16
eor r4, r4, r8, ror #2
and r8, r7, r4, ror #28
eor r4, r4, r8
and r8, r7, r4, ror #16
eor r4, r4, r8, ror #30
and r8, r7, r5, ror #16
eor r5, r5, r8, ror #2
and r8, r7, r5, ror #28
eor r5, r5, r8
and r8, r7, r5, ror #16
eor r5, r5, r8, ror #30
ldrd r10, r11, [r1], #-8
ldrd r8, r9, [r1], #-8
eor r2, r2, r8 //add rkey + rconst
eor r3, r3, r9 //add rkey + rconst
eor r4, r4, r10 //add rkey + rconst
eor r5, r5, r11 //add rkey + rconst
ldrd r10, r11, [r0], #-8
ldrd r8, r9, [r0], #-8
eor r2, r2, r8 //add rtk1
eor r3, r3, r9 //add rtk1
eor r4, r4, r10 //add rtk1
eor r5, r5, r11 //add rtk1
// undo the S-box layer (same variant as after mixcolumns 3)
eor r8, r3, r4
and r8, r8, r6
eor r3, r3, r8
eor r4, r4, r8 //SWAPMOVE(r4, r3, 0x55555555, 0);
orr r8, r2, r3
eor r5, r5, r8
eor r8, r5, r2, lsr #1
and r8, r8, r6
eor r5, r5, r8
eor r2, r2, r8, lsl #1 //SWAPMOVE(r2, r5, 0x55555555, 1);
eor r8, r2, r3, lsr #1
and r8, r8, r6
eor r2, r2, r8
eor r3, r3, r8, lsl #1 //SWAPMOVE(r3, r2, 0x55555555, 1);
orr r8, r4, r5
eor r3, r3, r8
mvn r3, r3
eor r8, r4, r5, lsr #1
and r8, r8, r6
eor r4, r4, r8
eor r5, r5, r8, lsl #1 //SWAPMOVE(r5, r4, 0x55555555, 1);
eor r8, r3, r4, lsr #1
and r8, r8, r6
eor r3, r3, r8
eor r4, r4, r8, lsl #1 //SWAPMOVE(r4, r3, 0x55555555, 1);
orr r8, r2, r3
eor r5, r5, r8
mvn r5, r5
eor r8, r5, r2, lsr #1
and r8, r8, r6
eor r5, r5, r8
eor r2, r2, r8, lsl #1 //SWAPMOVE(r2, r5, 0x55555555, 1);
eor r8, r2, r3, lsr #1
and r8, r8, r6
eor r2, r2, r8
eor r3, r3, r8, lsl #1 //SWAPMOVE(r3, r2, 0x55555555, 1);
orr r8, r4, r5
eor r3, r3, r8
mvn r3, r3
and r8, r7, r2, ror #6 // --- mixcolumns 0 ---
eor r2, r2, r8, ror #4
and r8, r7, r2, ror #18
eor r2, r2, r8, ror #2
and r8, r7, r2, ror #30
eor r2, r2, r8, ror #24
and r8, r7, r3, ror #6
eor r3, r3, r8, ror #4
and r8, r7, r3, ror #18
eor r3, r3, r8, ror #2
and r8, r7, r3, ror #30
eor r3, r3, r8, ror #24
and r8, r7, r4, ror #6
eor r4, r4, r8, ror #4
and r8, r7, r4, ror #18
eor r4, r4, r8, ror #2
and r8, r7, r4, ror #30
eor r4, r4, r8, ror #24
and r8, r7, r5, ror #6
eor r5, r5, r8, ror #4
and r8, r7, r5, ror #18
eor r5, r5, r8, ror #2
and r8, r7, r5, ror #30
eor r5, r5, r8, ror #24
ldrd r10, r11, [r1], #-8
ldrd r8, r9, [r1], #-8
eor r2, r2, r8 //add rkey + rconst
eor r3, r3, r9 //add rkey + rconst
eor r4, r4, r10 //add rkey + rconst
eor r5, r5, r11 //add rkey + rconst
ldrd r10, r11, [r0], #-8
ldrd r8, r9, [r0], #-8
eor r2, r2, r8 //add rtk1
eor r3, r3, r9 //add rtk1
eor r4, r4, r10 //add rtk1
eor r5, r5, r11 //add rtk1
// undo the S-box layer (same variant as after mixcolumns 2)
eor r8, r2, r5
and r8, r8, r6
eor r2, r2, r8
eor r5, r5, r8 //SWAPMOVE(r5, r2, 0x55555555, 0);
orr r8, r4, r5
eor r3, r3, r8
eor r8, r4, r5, lsr #1
and r8, r8, r6
eor r4, r4, r8
eor r5, r5, r8, lsl #1 //SWAPMOVE(r5, r4, 0x55555555, 1);
eor r8, r3, r4, lsr #1
and r8, r8, r6
eor r3, r3, r8
eor r4, r4, r8, lsl #1 //SWAPMOVE(r4, r3, 0x55555555, 1);
orr r8, r2, r3
eor r5, r5, r8
mvn r5, r5
eor r8, r5, r2, lsr #1
and r8, r8, r6
eor r5, r5, r8
eor r2, r2, r8, lsl #1 //SWAPMOVE(r2, r5, 0x55555555, 1);
eor r8, r2, r3, lsr #1
and r8, r8, r6
eor r2, r2, r8
eor r3, r3, r8, lsl #1 //SWAPMOVE(r3, r2, 0x55555555, 1);
orr r8, r4, r5
eor r3, r3, r8
mvn r3, r3
eor r8, r4, r5, lsr #1
and r8, r8, r6
eor r4, r4, r8
eor r5, r5, r8, lsl #1 //SWAPMOVE(r5, r4, 0x55555555, 1);
eor r8, r3, r4, lsr #1
and r8, r8, r6
eor r3, r3, r8
eor r4, r4, r8, lsl #1 //SWAPMOVE(r4, r3, 0x55555555, 1);
orr r8, r2, r3
eor r5, r5, r8
mvn r5, r5
bx lr
/******************************************************************************
* Encrypt a single block using fixsliced SKINNY-128-384+.
*
* r0 = ctext out (saved on the stack, restored before the final store),
* r1 = rtk2_3 (precomputed TK2/TK3 round tweakeys + round constants),
* r2 = ptext in, r3 = rtk1 (per-block TK1 round tweakeys).
* 40 rounds = 10 calls to quadruple_round; rtk1 only covers 16 rounds, so
* r0 is rewound by 256 bytes after every 4th call.
******************************************************************************/
@ void skinny128_384(u8* ctext, const u32* tk, const u8* ptext, const u32* rtk1)
.global skinny128_384
.type skinny128_384,%function
.align 2
skinny128_384:
push {r0-r12, r14} //saves r0 (ctext) at the top of the stack
mov.w r0, r3 //r0 now walks the rtk1 array
ldr.w r3, [r2, #8] //load plaintext words into r2-r5 (packed order)
ldr.w r4, [r2, #4]
ldr.w r5, [r2, #12]
ldr.w r2, [r2]
movw r10, #0x0a0a
movt r10, #0x0a0a //r10 <- 0x0a0a0a0a (mask used by packing)
movw r11, #0x3030
movt r11, #0x3030 //r11 <- 0x30303030
bl packing
mov r7, r11 //r7 <- 0x30303030 (MixColumns mask)
movw r6, #0x5555
movt r6, #0x5555 //r6 <- 0x55555555 (SWAPMOVE mask)
bl quadruple_round
bl quadruple_round
bl quadruple_round
bl quadruple_round
sub.w r0, #256 // rtk1 repeats every 16 rounds
bl quadruple_round
bl quadruple_round
bl quadruple_round
bl quadruple_round
sub.w r0, #256 // rtk1 repeats every 16 rounds
bl quadruple_round
bl quadruple_round
bl unpacking
ldr.w r0, [sp], #4 //recover the saved ctext pointer
strd r2, r4, [r0] //store in the same word order the input was loaded
strd r3, r5, [r0, #8]
pop {r1-r12,r14}
bx lr
/******************************************************************************
* Decrypt a single block using fixsliced SKINNY-128-384+.
*
* Mirror of skinny128_384: the tweakey pointers are first advanced to the
* END of their arrays, then inv_quadruple_round walks them backwards with
* post-decrement loads. rtk1 only covers 16 rounds, so r0 is advanced by
* 256 bytes again after every 4th call.
******************************************************************************/
@ void skinny128_384_inv(u8* ctext, const u32* tk, const u8* ptext, const u32* rtk1)
.global skinny128_384_inv
.type skinny128_384_inv,%function
.align 2
skinny128_384_inv:
push {r0-r12, r14} //saves r0 (output ptr) at the top of the stack
mov.w r0, r3 //r0 now walks the rtk1 array
ldr.w r3, [r2, #8] //load ciphertext words into r2-r5 (packed order)
ldr.w r4, [r2, #4]
ldr.w r5, [r2, #12]
ldr.w r2, [r2]
movw r10, #0x0a0a
movt r10, #0x0a0a //r10 <- 0x0a0a0a0a (mask used by packing)
movw r11, #0x3030
movt r11, #0x3030 //r11 <- 0x30303030
bl packing
mov r7, r11 //r7 <- 0x30303030 (MixColumns mask)
movw r6, #0x5555
movt r6, #0x5555 //r6 <- 0x55555555 (SWAPMOVE mask)
add.w r0, #120 // points to the right rtk1
add.w r1, #632 // points to the last rtk2_3
bl inv_quadruple_round
bl inv_quadruple_round
add.w r0, #256 // rtk1 repeats every 16 rounds
bl inv_quadruple_round
bl inv_quadruple_round
bl inv_quadruple_round
bl inv_quadruple_round
add.w r0, #256 // rtk1 repeats every 16 rounds
bl inv_quadruple_round
bl inv_quadruple_round
bl inv_quadruple_round
bl inv_quadruple_round
bl unpacking
ldr.w r0, [sp], #4 //recover the saved output pointer
strd r2, r4, [r0] //store in the same word order the input was loaded
strd r3, r5, [r0, #8]
pop {r1-r12,r14}
bx lr
#ifndef SKINNYAEADM1_H_
#define SKINNYAEADM1_H_
#include "skinny128.h"
// Shorthand integer types used throughout the AEAD code.
typedef unsigned char u8;
typedef unsigned int u32;
typedef unsigned long long u64;
#define TAGBYTES 16 // authentication tag length in bytes
#define KEYBYTES 16 // key length in bytes
#define BLOCKBYTES 16 // SKINNY-128 block length in bytes
// Write the domain-separation byte into the last byte of a 16-byte block.
#define SET_DOMAIN(ptr, domain) ((ptr)[15] = (domain))
// Clock the 64-bit block-counter LFSR once: shift left, XOR 0x1B into the
// low byte when the top bit was set.
// NOTE: relies on a variable `feedback` declared in the calling scope.
#define UPDATE_LFSR(lfsr) ({ \
feedback = ((lfsr) & (1ULL << 63)) ? 0x1B : 0x00; \
(lfsr) = ((lfsr) << 1) ^ feedback; \
})
// Store the 64-bit value x little-endian into ptr[0..7].
#define LE_STR_64(ptr, x) ({ \
(ptr)[0] = (u8)(x); \
(ptr)[1] = (u8)((x) >> 8); \
(ptr)[2] = (u8)((x) >> 16); \
(ptr)[3] = (u8)((x) >> 24); \
(ptr)[4] = (u8)((x) >> 32); \
(ptr)[5] = (u8)((x) >> 40); \
(ptr)[6] = (u8)((x) >> 48); \
(ptr)[7] = (u8)((x) >> 56); \
})
//x ^= y with x, y 128-bit blocks
// NOTE(review): accesses the buffers as u32 words, so both pointers are
// assumed 4-byte aligned — confirm at call sites.
#define XOR_BLOCK(x,y) ({ \
((u32*)(x))[0] ^= ((u32*)(y))[0]; \
((u32*)(x))[1] ^= ((u32*)(y))[1]; \
((u32*)(x))[2] ^= ((u32*)(y))[2]; \
((u32*)(x))[3] ^= ((u32*)(y))[3]; \
})
#endif // SKINNYAEADM1_H_
\ No newline at end of file
// Parameter sizes (in bytes) advertised to the NIST LWC test harness.
#define CRYPTO_KEYBYTES 16
#define CRYPTO_NSECBYTES 0
#define CRYPTO_NPUBBYTES 16
#define CRYPTO_ABYTES 16
#define CRYPTO_NOOVERLAP 1
//API required by the NIST for the LWC competition
// Encrypts m (mlen bytes) with associated data ad under key k and nonce
// npub; writes ciphertext||tag to c and its length to *clen.
int crypto_aead_encrypt(unsigned char *c, unsigned long long *clen,
const unsigned char *m, unsigned long long mlen,
const unsigned char *ad, unsigned long long adlen,
const unsigned char *nsec, const unsigned char *npub,
const unsigned char *k);
//API required by the NIST for the LWC competition
// Decrypts c (clen bytes, ciphertext||tag) and verifies the tag; on success
// writes the plaintext to m and its length to *outputmlen.
int crypto_aead_decrypt(unsigned char *m, unsigned long long *outputmlen,
unsigned char *nsec,
const unsigned char *c, unsigned long long clen,
const unsigned char *ad, unsigned long long adlen,
const unsigned char *npub, const unsigned char *k);
/******************************************************************************
* Constant-time implementation of SKINNY-AEAD-M1 (v1.1).
*
* Two blocks are treated in parallel with SKINNY-128-384 whenever possible.
*
* For more details, see the paper at: https://
*
* @author Alexandre Adomnicai, Nanyang Technological University,
* alexandre.adomnicai@ntu.edu.sg
*
* @date May 2020
******************************************************************************/
#include "skinny128.h"
#include "skinnyaead.h"
#include <string.h>
#include <stdio.h>
/******************************************************************************
* x ^= y where x, y are 128-bit blocks (16 bytes array).
******************************************************************************/
static void xor_block(u8 * x, const u8* y) {
    // Fold the 16-byte block y into x, byte by byte.
    for (int idx = BLOCKBYTES; idx-- > 0; )
        x[idx] ^= y[idx];
}
/******************************************************************************
* Process the associated data. Common to SKINNY-AEAD-M1 encrypt and decrypt
* functions.
*
* auth    (out) 16-byte buffer receiving the AD authentication value.
* c, tag  caller buffers: when mlen == 0 the tag has not been computed yet,
*         so the final AD block is processed in parallel with the tag block
*         (tag and c are fed into the paired skinny128_384 call).
* rtk1    scratch for the per-block TK1 round tweakeys.
* rtk2_3  precomputed TK2/TK3 round tweakeys (with round constants).
* mlen    message length; only used to decide whether the tag still needs
*         to be computed alongside the last AD block.
* ad, adlen  associated data and its byte length.
******************************************************************************/
static void skinny_aead_m1_auth(u8* auth, u8* c, u8* tag, u32* rtk1,
        u32* rtk2_3, u64 mlen, const u8* ad, u64 adlen) {
    u64 lfsr = 1;            // 64-bit block counter
    u8 feedback;             // required by the UPDATE_LFSR macro
    u8 tmp[2*BLOCKBYTES];
    memset(tmp, 0x00, 2*BLOCKBYTES);
    memset(auth, 0x00, BLOCKBYTES);
    SET_DOMAIN(tmp, 0x02);   // domain for full AD blocks
    while (adlen >= 2*BLOCKBYTES) {   // process 2 full AD blocks in parallel
        LE_STR_64(tmp, lfsr);
        UPDATE_LFSR(lfsr);
        LE_STR_64(tmp + BLOCKBYTES, lfsr);
        SET_DOMAIN(tmp + BLOCKBYTES, 0x02);
        tkschedule_perm_tk1(rtk1, tmp, tmp+BLOCKBYTES);
        skinny128_384(tmp, tmp+BLOCKBYTES, ad, ad+BLOCKBYTES, rtk1, rtk2_3);
        xor_block(auth, tmp);
        xor_block(auth, tmp + BLOCKBYTES);
        adlen -= 2*BLOCKBYTES;
        ad += 2*BLOCKBYTES;
        UPDATE_LFSR(lfsr);
    }
    if (adlen > BLOCKBYTES) { // pad and process 2 blocks: one full, one partial
        LE_STR_64(tmp, lfsr);
        UPDATE_LFSR(lfsr);
        LE_STR_64(tmp + BLOCKBYTES, lfsr);
        SET_DOMAIN(tmp + BLOCKBYTES, 0x03); // domain for padded AD
        tkschedule_perm_tk1(rtk1, tmp, tmp + BLOCKBYTES);
        adlen -= BLOCKBYTES;
        memset(tmp, 0x00, BLOCKBYTES);      // 10* padding of the partial block
        memcpy(tmp, ad + BLOCKBYTES, adlen);
        tmp[adlen] ^= 0x80; // padding
        skinny128_384(tmp + BLOCKBYTES, tmp, ad, tmp, rtk1, rtk2_3);
        xor_block(auth, tmp);
        xor_block(auth, tmp + BLOCKBYTES);
    } else if (adlen == BLOCKBYTES) { // one last full AD block
        LE_STR_64(tmp, lfsr);
        if (mlen == 0) { // tag has *NOT* been computed yet: pair it with the AD block
            tkschedule_perm_tk1(rtk1, tmp, tag);
            skinny128_384(auth, c, ad, c, rtk1, rtk2_3);
        } else { // tag already computed: process the last AD block alone
            tkschedule_perm_tk1(rtk1, tmp, tmp);
            skinny128_384(auth, auth, ad, ad, rtk1, rtk2_3);
        }
    } else if (adlen > 0) { // one last partial AD block
        LE_STR_64(tmp, lfsr);
        SET_DOMAIN(tmp, 0x03); // domain for padded AD
        memset(tmp + BLOCKBYTES, 0x00, BLOCKBYTES); // 10* padding
        memcpy(tmp + BLOCKBYTES, ad, adlen);
        tmp[BLOCKBYTES + adlen] ^= 0x80; // padding
        if (mlen == 0) { // tag has *NOT* been computed yet: pair it with the AD block
            tkschedule_perm_tk1(rtk1, tmp, tag); // compute the tag
            skinny128_384(auth, c, tmp + BLOCKBYTES, c, rtk1, rtk2_3);
        } else { // tag already computed: process the last AD block alone
            tkschedule_perm_tk1(rtk1, tmp, tmp);
            skinny128_384(auth, auth, tmp + BLOCKBYTES, tmp + BLOCKBYTES, rtk1, rtk2_3);
        }
    }
}
/******************************************************************************
* Encryption and authentication using SKINNY-AEAD-M1.
*
* Writes ciphertext||tag to c and mlen + TAGBYTES to *clen; returns 0.
* Message blocks are processed two at a time through the 2-block
* skinny128_384 primitive; the checksum of plaintext blocks (sum) is
* encrypted under a tag-domain tweak to produce the tag, either here or —
* when the last message block pairs up with the last AD block — inside
* skinny_aead_m1_auth.
******************************************************************************/
int crypto_aead_encrypt (unsigned char *c, unsigned long long *clen,
        const unsigned char *m, unsigned long long mlen,
        const unsigned char *ad, unsigned long long adlen,
        const unsigned char *nsec,
        const unsigned char *npub,
        const unsigned char *k) {
    u8 feedback;            // required by the UPDATE_LFSR macro
    u64 i,lfsr = 1;         // 64-bit block counter
    u32 rtk1[8*16];         // per-block TK1 round tweakeys
    u32 rtk2_3[8*SKINNY128_384_ROUNDS]; // precomputed TK2/TK3 round tweakeys
    u8 tmp[2*BLOCKBYTES], tag[BLOCKBYTES], auth[BLOCKBYTES], sum[BLOCKBYTES];
    (void)nsec;             // unused (CRYPTO_NSECBYTES == 0)
    // ----------------- Initialization -----------------
    *clen = mlen + TAGBYTES;
    tkschedule_lfsr_2(rtk2_3, npub, npub, SKINNY128_384_ROUNDS);
    tkschedule_lfsr_3(rtk2_3, k, k, SKINNY128_384_ROUNDS);
    tkschedule_perm(rtk2_3);
    memset(tmp, 0x00, 2*BLOCKBYTES);
    memset(tag, 0x00, BLOCKBYTES);
    memset(auth, 0x00, BLOCKBYTES);
    memset(sum, 0x00, BLOCKBYTES);
    // ----------------- Initialization -----------------
    // ----------------- Process the plaintext -----------------
    while (mlen >= 2*BLOCKBYTES) { // process 2 blocks in parallel
        LE_STR_64(tmp, lfsr);
        UPDATE_LFSR(lfsr);
        LE_STR_64(tmp + BLOCKBYTES, lfsr);
        tkschedule_perm_tk1(rtk1, tmp, tmp + BLOCKBYTES);
        skinny128_384(c, c + BLOCKBYTES, m, m + BLOCKBYTES, rtk1, rtk2_3);
        xor_block(sum, m); // sum for tag computation
        xor_block(sum, m + BLOCKBYTES); // sum for tag computation
        mlen -= 2*BLOCKBYTES;
        c += 2*BLOCKBYTES;
        m += 2*BLOCKBYTES;
        UPDATE_LFSR(lfsr);
    }
    SET_DOMAIN(tag, 0x04); // domain for tag computation
    if (mlen > BLOCKBYTES) { // one full block + one partial: pad and process in parallel
        LE_STR_64(tmp, lfsr);
        UPDATE_LFSR(lfsr);
        LE_STR_64(tmp + BLOCKBYTES, lfsr);
        SET_DOMAIN(tmp + BLOCKBYTES, 0x01); // domain for padded m
        tkschedule_perm_tk1(rtk1, tmp, tmp + BLOCKBYTES);
        // full block -> c; keystream for the partial block -> auth
        skinny128_384(c, auth, m, auth, rtk1, rtk2_3);
        xor_block(sum, m);
        for(i = 0; i < mlen - BLOCKBYTES; i++) {
            c[BLOCKBYTES + i] = auth[i] ^ m[BLOCKBYTES + i];
            sum[i] ^= m[BLOCKBYTES + i];
        }
        sum[i] ^= 0x80; // padding
        SET_DOMAIN(tag, 0x05); // domain for tag computation
        m += mlen;
        c += mlen;
        mlen = 0;
        UPDATE_LFSR(lfsr);
    } else if (mlen == BLOCKBYTES) { // last block is full
        LE_STR_64(tmp, lfsr);
        UPDATE_LFSR(lfsr);
        LE_STR_64(tmp + BLOCKBYTES, lfsr);
        SET_DOMAIN(tmp + BLOCKBYTES, 0x04); // domain for tag computation
        xor_block(sum, m); // sum for tag computation
        tkschedule_perm_tk1(rtk1, tmp, tmp + BLOCKBYTES);
        // encrypt the last block and the checksum (tag) in one paired call
        skinny128_384(c, sum, m, sum, rtk1, rtk2_3);
        c += BLOCKBYTES;
    } else if (mlen > 0) { // last block is partial
        LE_STR_64(tmp, lfsr);
        SET_DOMAIN(tmp, 0x01); // domain for padding
        UPDATE_LFSR(lfsr);
        LE_STR_64(tmp + BLOCKBYTES, lfsr);
        SET_DOMAIN(tmp + BLOCKBYTES, 0x05); // domain for tag computation
        for(i = 0; i < mlen; i++) // sum for tag computation
            sum[i] ^= m[i];
        sum[i] ^= 0x80; // padding
        tkschedule_perm_tk1(rtk1, tmp, tmp + BLOCKBYTES);
        // keystream for the partial block and the tag, in one paired call
        skinny128_384(auth, sum, auth, sum, rtk1, rtk2_3);
        for(i = 0; i < mlen; i++)
            c[i] = auth[i] ^ m[i]; // encrypted padded block
        c += mlen;
    }
    if (mlen == 0) { // tag has *NOT* been computed in the branches above
        LE_STR_64(tag, lfsr); // lfsr for tag computation
        // When the AD length leaves no lone final block to pair with, the
        // tag block is encrypted here; otherwise skinny_aead_m1_auth pairs
        // it with the last AD block.
        if((adlen % 32) == 0 || (adlen % 32) > BLOCKBYTES) {
            tkschedule_perm_tk1(rtk1, tag, tag);
            skinny128_384(sum, sum, sum, sum, rtk1, rtk2_3); // compute the tag
        }
    }
    // ----------------- Process the plaintext -----------------
    // ----------------- Process the associated data -----------------
    skinny_aead_m1_auth(auth, sum, tag, rtk1, rtk2_3, mlen, ad, adlen);
    xor_block(sum, auth);
    memcpy(c, sum, TAGBYTES); // append the tag
    // ----------------- Process the associated data -----------------
    return 0;
}
/******************************************************************************
* Decryption and authentication using SKINNY-AEAD-M1.
*
* Mirrors crypto_aead_encrypt: ciphertext blocks are decrypted (or, for the
* padded partial block, the keystream is regenerated with the forward
* cipher), the plaintext checksum is encrypted into the candidate tag, and
* the result is compared against the transmitted tag in constant time.
* Returns 0 on success, -1 on verification failure.
*
* NOTE(review): the plaintext is written to m before the tag is verified —
* callers must discard m when the return value is nonzero.
******************************************************************************/
int crypto_aead_decrypt (unsigned char *m, unsigned long long *mlen,
        unsigned char *nsec,
        const unsigned char *c, unsigned long long clen,
        const unsigned char *ad, unsigned long long adlen,
        const unsigned char *npub,
        const unsigned char *k) {
    u8 feedback;            // required by the UPDATE_LFSR macro
    u64 i,lfsr = 1;         // 64-bit block counter
    u32 rtk1[8*16];         // per-block TK1 round tweakeys
    u32 rtk2_3[8*SKINNY128_384_ROUNDS]; // precomputed TK2/TK3 round tweakeys
    u8 tmp[2*BLOCKBYTES];
    u8 sum[BLOCKBYTES], tag[BLOCKBYTES], auth[BLOCKBYTES];
    (void)nsec;             // unused (CRYPTO_NSECBYTES == 0)
    if (clen < TAGBYTES)
        return -1;          // input too short to even hold a tag
    // ----------------- Initialization -----------------
    clen -= TAGBYTES;
    *mlen = clen;
    tkschedule_lfsr_2(rtk2_3, npub, npub, SKINNY128_384_ROUNDS);
    tkschedule_lfsr_3(rtk2_3, k, k, SKINNY128_384_ROUNDS);
    tkschedule_perm(rtk2_3);
    memset(tmp, 0x00, 2*BLOCKBYTES);
    memset(tag, 0x00, BLOCKBYTES);
    memset(auth, 0x00, BLOCKBYTES);
    memset(sum, 0x00, BLOCKBYTES);
    // ----------------- Initialization -----------------
    // ----------------- Process the ciphertext -----------------
    while (clen >= 2*BLOCKBYTES) { // process 2 blocks in parallel
        LE_STR_64(tmp, lfsr);
        UPDATE_LFSR(lfsr);
        LE_STR_64(tmp + BLOCKBYTES, lfsr);
        tkschedule_perm_tk1(rtk1, tmp, tmp + BLOCKBYTES);
        skinny128_384_inv(m, m + BLOCKBYTES, c, c + BLOCKBYTES, rtk1, rtk2_3);
        xor_block(sum, m); // sum for tag computation
        xor_block(sum, m + BLOCKBYTES); // sum for tag computation
        clen -= 2*BLOCKBYTES;
        c += 2*BLOCKBYTES;
        m += 2*BLOCKBYTES;
        UPDATE_LFSR(lfsr);
    }
    SET_DOMAIN(tag, 0x04); // domain for tag computation
    if (clen > BLOCKBYTES) { // one full block + one partial
        LE_STR_64(tmp, lfsr);
        tkschedule_perm_tk1(rtk1, tmp, tmp);
        skinny128_384_inv(m, m, c, c, rtk1, rtk2_3); // decrypt the full block
        xor_block(sum, m);
        UPDATE_LFSR(lfsr);
        LE_STR_64(tmp, lfsr);
        SET_DOMAIN(tmp, 0x01); // domain for padded m
        tkschedule_perm_tk1(rtk1, tmp, tmp);
        // regenerate the keystream for the padded partial block
        skinny128_384(auth, auth, auth, auth, rtk1, rtk2_3);
        for(i = 0; i < clen - BLOCKBYTES; i++) {
            m[BLOCKBYTES + i] = auth[i] ^ c[BLOCKBYTES + i];
            sum[i] ^= m[BLOCKBYTES + i];
        }
        sum[i] ^= 0x80; // padding
        SET_DOMAIN(tag, 0x05); // domain for tag computation
        c += clen;
        clen = 0;
        UPDATE_LFSR(lfsr);
    } else if (clen == BLOCKBYTES) { // last block is full
        LE_STR_64(tmp, lfsr);
        tkschedule_perm_tk1(rtk1, tmp, tmp);
        skinny128_384_inv(m, m, c, c, rtk1, rtk2_3);
        xor_block(sum, m); // sum for tag computation
        SET_DOMAIN(tag, 0x04); // domain for tag computation
        UPDATE_LFSR(lfsr);
        c += BLOCKBYTES;
        clen = 0;
    } else if (clen > 0) { // last block is partial
        LE_STR_64(tmp, lfsr);
        SET_DOMAIN(tmp, 0x01); // domain for padding
        tkschedule_perm_tk1(rtk1, tmp, tmp);
        // regenerate the keystream for the padded partial block
        skinny128_384(auth, auth, auth, auth, rtk1, rtk2_3);
        for(i = 0; i < clen; i++) {
            m[i] = auth[i] ^ c[i]; // decrypted padded block
            sum[i] ^= m[i]; // sum for tag computation
        }
        sum[i] ^= 0x80; // padding
        SET_DOMAIN(tag, 0x05); // domain for tag computation
        UPDATE_LFSR(lfsr);
        c += clen;
        clen = 0;
    }
    if (clen == 0) { // tag has *NOT* been computed yet
        LE_STR_64(tag, lfsr);
        // Pair the tag with the last AD block when possible (see encrypt).
        if((adlen % 32) == 0 || (adlen % 32) > BLOCKBYTES) {
            tkschedule_perm_tk1(rtk1, tag, tag); //if AD can be processed in parallel
            skinny128_384(sum, sum, sum, sum, rtk1, rtk2_3); // compute the tag
        }
    }
    // ----------------- Process the associated data -----------------
    skinny_aead_m1_auth(auth, sum, tag, rtk1, rtk2_3, clen, ad, adlen);
    xor_block(sum, auth);
    feedback = 0;
    for(i = 0; i < TAGBYTES; i++)
        feedback |= sum[i] ^ c[i]; // constant-time tag verification
    // Normalize to the NIST LWC API convention: 0 on success, -1 on failure.
    // (Previously the raw accumulated byte was returned, which is nonzero on
    // mismatch but not the specified -1.)
    return feedback ? -1 : 0;
    // ----------------- Process the associated data -----------------
}
\ No newline at end of file
#ifndef SKINNY128_H_
#define SKINNY128_H_
typedef unsigned char u8;
typedef unsigned int u32;
#define SKINNY128_384_ROUNDS 40
extern void skinny128_384(u8* ctext, u8* ctext_bis, const u8* ptext, const u8* ptext_bis, const u32* rtk1, const u32* rtk2_3);
extern void skinny128_384_inv(u8* ptext, u8* ptext_bis, const u8* ctext, const u8* ctext_bis, const u32* rtk1, const u32* rtk2_3);
extern void tkschedule_lfsr_2(u32* rtk, const u8* tk2, const u8* tk2_bis, const int rounds);
extern void pack_tk1(u32* rtk, const u8* tk2, const u8* tk2_bis, const int rounds);
extern void tkschedule_lfsr_3(u32* rtk, const u8* tk3, const u8* tk3_bis, const int rounds);
extern void tkschedule_perm(u32* rtk);
extern void tkschedule_perm_tk1(u32* rtk1, const u8* tk1, const u8* tk1_bis);
#endif // SKINNY128_H_
\ No newline at end of file
This source diff could not be displayed because it is too large. You can view the blob instead.
#ifndef SKINNYAEADM1_H_
#define SKINNYAEADM1_H_
#include "skinny128.h"
// Integer shorthands shared by the AEAD implementation.
typedef unsigned char u8;
typedef unsigned int u32;
typedef unsigned long long u64;
#define TAGBYTES 16 // tag size in bytes
#define KEYBYTES 16 // key size in bytes
#define BLOCKBYTES 16 // cipher block size in bytes
// Set the domain-separation byte (last byte of a 16-byte tweak block).
#define SET_DOMAIN(ptr, domain) ((ptr)[15] = (domain))
// One step of the 64-bit counter LFSR: left shift, folding 0x1B back in
// when the MSB falls out. Requires a `feedback` variable in caller scope.
#define UPDATE_LFSR(lfsr) ({ \
feedback = ((lfsr) & (1ULL << 63)) ? 0x1B : 0x00; \
(lfsr) = ((lfsr) << 1) ^ feedback; \
})
// Serialize the 64-bit value x into ptr[0..7], least-significant byte first.
#define LE_STR_64(ptr, x) ({ \
(ptr)[0] = (u8)(x); \
(ptr)[1] = (u8)((x) >> 8); \
(ptr)[2] = (u8)((x) >> 16); \
(ptr)[3] = (u8)((x) >> 24); \
(ptr)[4] = (u8)((x) >> 32); \
(ptr)[5] = (u8)((x) >> 40); \
(ptr)[6] = (u8)((x) >> 48); \
(ptr)[7] = (u8)((x) >> 56); \
})
//x ^= y with x, y 128-bit blocks
// NOTE(review): word-wise u32 access — assumes 4-byte-aligned buffers.
#define XOR_BLOCK(x,y) ({ \
((u32*)(x))[0] ^= ((u32*)(y))[0]; \
((u32*)(x))[1] ^= ((u32*)(y))[1]; \
((u32*)(x))[2] ^= ((u32*)(y))[2]; \
((u32*)(x))[3] ^= ((u32*)(y))[3]; \
})
#endif // SKINNYAEADM1_H_
\ No newline at end of file
// Sizes (bytes) reported to the NIST LWC benchmarking/test framework.
#define CRYPTO_KEYBYTES 16
#define CRYPTO_NSECBYTES 0
#define CRYPTO_NPUBBYTES 16
#define CRYPTO_ABYTES 16
#define CRYPTO_NOOVERLAP 1
// NIST LWC AEAD encryption entry point: writes ciphertext||tag to c,
// sets *clen = mlen + CRYPTO_ABYTES.
int crypto_aead_encrypt(
    unsigned char *c, unsigned long long *clen,
    const unsigned char *m, unsigned long long mlen,
    const unsigned char *ad, unsigned long long adlen,
    const unsigned char *nsec,
    const unsigned char *npub,
    const unsigned char *k
);
// NIST LWC AEAD decryption entry point: verifies the tag and, on success,
// writes the plaintext to m and sets *mlen.
int crypto_aead_decrypt(
    unsigned char *m, unsigned long long *mlen,
    unsigned char *nsec,
    const unsigned char *c, unsigned long long clen,
    const unsigned char *ad, unsigned long long adlen,
    const unsigned char *npub,
    const unsigned char *k
);
\ No newline at end of file
/******************************************************************************
* Constant-time implementation of SKINNY-AEAD-M1 (v1.1).
*
* Two blocks are treated in parallel with SKINNY-128-384 whenever possible.
*
* For more details, see the paper at: https://
*
* @author Alexandre Adomnicai, Nanyang Technological University,
* alexandre.adomnicai@ntu.edu.sg
*
* @date May 2020
******************************************************************************/
#include "skinny128.h"
#include "skinnyaead.h"
#include <string.h>
#include <stdio.h>
/******************************************************************************
* x ^= y where x, y are 128-bit blocks (16 bytes array).
******************************************************************************/
static void xor_block(u8 * x, const u8* y) {
    // x ^= y over one 16-byte block, walking both buffers with pointers.
    const u8 *stop = y + BLOCKBYTES;
    while (y != stop)
        *x++ ^= *y++;
}
/******************************************************************************
* Encryption and authentication using SKINNY-AEAD-M1.
*
* Single-block (SKINNY-128-384+) variant: message blocks are encrypted one
* at a time; the plaintext checksum is accumulated directly in the tag slot
* c + mlen and then encrypted under the tag-domain tweak. The AD is hashed
* afterwards and XORed into the tag. Returns 0.
******************************************************************************/
int crypto_aead_encrypt (unsigned char *c, unsigned long long *clen,
        const unsigned char *m, unsigned long long mlen,
        const unsigned char *ad, unsigned long long adlen,
        const unsigned char *nsec,
        const unsigned char *npub,
        const unsigned char *k) {
    u64 i,lfsr = 1;         // 64-bit block counter
    u8 feedback;            // required by the UPDATE_LFSR macro
    u32 rtk1[4*16];         // per-block TK1 round tweakeys
    u32 rtk2_3[4*SKINNY128_384_ROUNDS]; // precomputed TK2/TK3 round tweakeys
    u8 tmp[2*BLOCKBYTES], auth[BLOCKBYTES];
    (void)nsec;             // unused (CRYPTO_NSECBYTES == 0)
    // ----------------- Initialization -----------------
    *clen = mlen + TAGBYTES;
    precompute_rtk2_3(rtk2_3, npub, k);
    memset(tmp, 0x00, 2*BLOCKBYTES);
    memset(auth, 0x00, BLOCKBYTES);
    memset(c + mlen, 0x00, BLOCKBYTES); // tag slot doubles as checksum buffer
    // ----------------- Initialization -----------------
    // ----------------- Process the plaintext -----------------
    while (mlen >= BLOCKBYTES) { // while entire blocks to process
        LE_STR_64(tmp, lfsr);
        precompute_rtk1(rtk1, tmp); // precompute RTK1 given the LFSR
        skinny128_384_plus_encrypt(c, m, rtk1, rtk2_3);
        xor_block(c + mlen, m); // sum for tag computation
        mlen -= BLOCKBYTES;
        c += BLOCKBYTES;
        m += BLOCKBYTES;
        UPDATE_LFSR(lfsr); // update lfsr for next block
    }
    SET_DOMAIN(tmp, 0x04); // domain for tag computation
    if (mlen > 0) { // last block is partial
        LE_STR_64(tmp, lfsr); // lfsr for last block
        SET_DOMAIN(tmp, 0x01); // domain for padding
        for(i = 0; i < mlen; i++)
            c[mlen + i] ^= m[i]; // sum for tag computation
        c[mlen + i] ^= 0x80; // padding bit of the checksum
        precompute_rtk1(rtk1, tmp);
        // encrypt the zero block to get the keystream for the partial block
        skinny128_384_plus_encrypt(auth, auth, rtk1, rtk2_3);
        for(i = 0; i < mlen; i++)
            c[i] = auth[i] ^ m[i]; // encrypted padded block
        c += mlen;
        SET_DOMAIN(tmp, 0x05); // domain for tag computation
        UPDATE_LFSR(lfsr);
    }
    LE_STR_64(tmp, lfsr); // lfsr for tag computation
    precompute_rtk1(rtk1, tmp);
    skinny128_384_plus_encrypt(c, c, rtk1, rtk2_3); // encrypt the checksum in place
    // ----------------- Process the plaintext -----------------
    // ----------------- Process the associated data -----------------
    lfsr = 1; // AD uses its own counter sequence
    SET_DOMAIN(tmp, 0x02); // domain for full AD blocks
    memset(auth, 0x00, BLOCKBYTES);
    while (adlen >= BLOCKBYTES) {
        LE_STR_64(tmp, lfsr);
        precompute_rtk1(rtk1, tmp);
        skinny128_384_plus_encrypt(tmp + BLOCKBYTES, ad, rtk1, rtk2_3);
        xor_block(auth, tmp + BLOCKBYTES);
        adlen -= BLOCKBYTES;
        ad += BLOCKBYTES;
        UPDATE_LFSR(lfsr);
    }
    if (adlen > 0) { // last AD block is partial
        LE_STR_64(tmp, lfsr);
        SET_DOMAIN(tmp, 0x03); // domain for padded AD
        precompute_rtk1(rtk1, tmp);
        memset(tmp, 0x00, BLOCKBYTES); // 10* padding
        memcpy(tmp, ad, adlen);
        tmp[adlen] ^= 0x80; // padding
        skinny128_384_plus_encrypt(tmp, tmp, rtk1, rtk2_3);
        xor_block(auth, tmp);
    }
    xor_block(c, auth); // fold the AD hash into the tag
    // ----------------- Process the associated data -----------------
    return 0;
}
/******************************************************************************
* Encryption and authentication using SKINNY-AEAD-M1
******************************************************************************/
int crypto_aead_decrypt (unsigned char *m, unsigned long long *mlen,
        unsigned char *nsec,
        const unsigned char *c, unsigned long long clen,
        const unsigned char *ad, unsigned long long adlen,
        const unsigned char *npub,
        const unsigned char *k) {
    // Decryption and tag verification for SKINNY-AEAD-M1 (single-block
    // SKINNY-128-384+ variant). Returns 0 on success, -1 on tag mismatch.
    // NOTE(review): plaintext is written to m before the tag is checked —
    // callers must discard m when the return value is nonzero.
    u64 i,lfsr = 1;         // 64-bit block counter
    u8 feedback;            // required by the UPDATE_LFSR macro
    u32 rtk1[4*16];         // per-block TK1 round tweakeys
    u32 rtk2_3[4*SKINNY128_384_ROUNDS]; // precomputed TK2/TK3 round tweakeys
    u8 tmp[2*BLOCKBYTES], auth[BLOCKBYTES], sum[BLOCKBYTES];
    (void)nsec;             // unused (CRYPTO_NSECBYTES == 0)
    if (clen < TAGBYTES)
        return -1;          // too short to contain a tag
    // ----------------- Initialization -----------------
    clen -= TAGBYTES;
    *mlen = clen;
    precompute_rtk2_3(rtk2_3, npub, k);
    memset(tmp, 0x00, 2*BLOCKBYTES);
    memset(auth, 0x00, BLOCKBYTES);
    memset(sum, 0x00, BLOCKBYTES);
    // ----------------- Initialization -----------------
    // ----------------- Process the ciphertext -----------------
    while (clen >= BLOCKBYTES) { // while entire blocks to process
        LE_STR_64(tmp, lfsr);
        precompute_rtk1(rtk1, tmp); // precompute RTK1 given the LFSR
        skinny128_384_plus_decrypt(m, c, rtk1, rtk2_3);
        xor_block(sum, m); // sum for tag computation
        clen -= BLOCKBYTES;
        c += BLOCKBYTES;
        m += BLOCKBYTES;
        UPDATE_LFSR(lfsr); // update LFSR for the next block
    }
    SET_DOMAIN(tmp, 0x04); // domain for tag computation
    if (clen > 0) { // last block is partial
        LE_STR_64(tmp, lfsr); // lfsr for last block
        SET_DOMAIN(tmp, 0x01); // domain for padding
        precompute_rtk1(rtk1, tmp);
        // regenerate the keystream used for the padded partial block
        skinny128_384_plus_encrypt(auth, auth, rtk1, rtk2_3);
        for(i = 0; i < clen; i++) {
            m[i] = auth[i] ^ c[i]; // decrypted padded block
            sum[i] ^= m[i]; // sum for tag computation
        }
        sum[i] ^= 0x80; // padding
        c += clen;
        SET_DOMAIN(tmp, 0x05); // domain for tag computation
        UPDATE_LFSR(lfsr);
    }
    LE_STR_64(tmp, lfsr); // lfsr for tag computation
    precompute_rtk1(rtk1, tmp);
    skinny128_384_plus_encrypt(sum, sum, rtk1, rtk2_3); // compute the tag
    // ----------------- Process the ciphertext -----------------
    // ----------------- Process the associated data -----------------
    lfsr = 1; // AD uses its own counter sequence
    SET_DOMAIN(tmp, 0x02); // domain for full AD blocks
    memset(auth, 0x00, BLOCKBYTES);
    while (adlen >= BLOCKBYTES) {
        LE_STR_64(tmp, lfsr);
        precompute_rtk1(rtk1, tmp);
        skinny128_384_plus_encrypt(tmp + BLOCKBYTES, ad, rtk1, rtk2_3);
        xor_block(auth, tmp + BLOCKBYTES);
        adlen -= BLOCKBYTES;
        ad += BLOCKBYTES;
        UPDATE_LFSR(lfsr);
    }
    if (adlen > 0) { // last AD block is partial
        LE_STR_64(tmp, lfsr);
        SET_DOMAIN(tmp, 0x03); // domain for padded AD
        precompute_rtk1(rtk1, tmp);
        memset(tmp, 0x00, BLOCKBYTES); // 10* padding
        memcpy(tmp, ad, adlen);
        tmp[adlen] ^= 0x80; // padding
        skinny128_384_plus_encrypt(tmp, tmp, rtk1, rtk2_3);
        xor_block(auth, tmp);
    }
    xor_block(sum, auth); // fold the AD hash into the candidate tag
    feedback = 0;
    for(i = 0; i < TAGBYTES; i++)
        feedback |= sum[i] ^ c[i]; // constant-time tag verification
    // Normalize to the NIST LWC API convention: 0 on success, -1 on failure.
    // (Previously the raw accumulated byte was returned, which is nonzero on
    // mismatch but not the specified -1.)
    return feedback ? -1 : 0;
    // ----------------- Process the associated data -----------------
}
\ No newline at end of file
/******************************************************************************
* Constant-time implementation of the SKINNY tweakable block ciphers.
*
* This implementation doesn't compute the ShiftRows operation. Some masks and
* shifts are applied during the MixColumns operation so that the proper bits
* are XORed together. Moreover, the row permutation within the MixColumns
* is omitted, as well as the bit permutation at the end of the Sbox. The rows
* are synchronized with the classical representation after only 4 rounds. Therefore, this
* implementation relies on a "QUADRUPLE_ROUND" routine.
*
* The Sbox computation takes advantage of some symmetry in the 8-bit Sbox to
* turn it into a 4-bit S-box computation. Although the last bit permutation
* within the Sbox is not computed, the bit ordering is synchronized with the
* classical representation after 2 calls.
*
* @author Alexandre Adomnicai, Nanyang Technological University,
* alexandre.adomnicai@ntu.edu.sg
*
* @date May 2020
******************************************************************************/
#include <stdio.h>
#include <string.h>
#include "skinny128.h"
#include "tk_schedule.h"
/******************************************************************************
* The MixColumns computation for rounds i such that (i % 4) == 0.
* Three masked rotate-and-XOR steps on each 32-bit slice; rotations are
* written out explicitly instead of via the ROR macro.
******************************************************************************/
void mixcolumns_0(u32* state) {
    u32 t;
    for(int i = 0; i < 4; i++) {
        u32 w = state[i];
        t = ((w >> 24) | (w << 8)) & 0x0c0c0c0c;
        w ^= (t >> 30) | (t << 2);
        t = ((w >> 16) | (w << 16)) & 0xc0c0c0c0;
        w ^= (t >> 4) | (t << 28);
        t = ((w >> 8) | (w << 24)) & 0x0c0c0c0c;
        w ^= (t >> 2) | (t << 30);
        state[i] = w;
    }
}
/******************************************************************************
* The MixColumns computation for rounds i such that (i % 4) == 1.
* Three masked rotate-and-XOR steps on each 32-bit slice; rotations are
* written out explicitly instead of via the ROR macro.
******************************************************************************/
void mixcolumns_1(u32* state) {
    u32 t;
    for(int i = 0; i < 4; i++) {
        u32 w = state[i];
        t = ((w >> 16) | (w << 16)) & 0x30303030;
        w ^= (t >> 30) | (t << 2);
        t = w & 0x03030303;
        w ^= (t >> 28) | (t << 4);
        t = ((w >> 16) | (w << 16)) & 0x30303030;
        w ^= (t >> 2) | (t << 30);
        state[i] = w;
    }
}
/******************************************************************************
* The MixColumns computation for rounds i such that (i % 4) == 2.
* Three masked rotate-and-XOR steps on each 32-bit slice; rotations are
* written out explicitly instead of via the ROR macro.
******************************************************************************/
void mixcolumns_2(u32* state) {
    u32 t;
    for(int i = 0; i < 4; i++) {
        u32 w = state[i];
        t = ((w >> 8) | (w << 24)) & 0xc0c0c0c0;
        w ^= (t >> 6) | (t << 26);
        t = ((w >> 16) | (w << 16)) & 0x0c0c0c0c;
        w ^= (t >> 28) | (t << 4);
        t = ((w >> 24) | (w << 8)) & 0xc0c0c0c0;
        w ^= (t >> 2) | (t << 30);
        state[i] = w;
    }
}
/******************************************************************************
* The MixColumns computation for rounds i such that (i % 4) == 3.
* Three mask-and-XOR steps on each 32-bit slice; the selected bits need no
* pre-rotation in this round alignment, only the rotated XOR back in.
******************************************************************************/
void mixcolumns_3(u32* state) {
    u32 t;
    for(int i = 0; i < 4; i++) {
        u32 w = state[i];
        t = w & 0x03030303;
        w ^= (t >> 30) | (t << 2);
        t = w & 0x30303030;
        w ^= (t >> 4) | (t << 28);
        t = w & 0x03030303;
        w ^= (t >> 26) | (t << 6);
        state[i] = w;
    }
}
/******************************************************************************
* The inverse MixColumns operation for rounds i such that (i % 4) == 0.
* Undoes the three steps of mixcolumns_0 in reverse order; rotations are
* written out explicitly instead of via the ROR macro.
******************************************************************************/
void inv_mixcolumns_0(u32* state) {
    u32 t;
    for(int i = 0; i < 4; i++) {
        u32 w = state[i];
        t = ((w >> 8) | (w << 24)) & 0x0c0c0c0c;
        w ^= (t >> 2) | (t << 30);
        t = ((w >> 16) | (w << 16)) & 0xc0c0c0c0;
        w ^= (t >> 4) | (t << 28);
        t = ((w >> 24) | (w << 8)) & 0x0c0c0c0c;
        w ^= (t >> 30) | (t << 2);
        state[i] = w;
    }
}
/******************************************************************************
* The inverse MixColumns operation for rounds i such that (i % 4) == 1.
* (The original header said "== 0" — a copy-paste slip.) Undoes the three
* steps of mixcolumns_1 in reverse order.
******************************************************************************/
void inv_mixcolumns_1(u32* state) {
    u32 t;
    for(int i = 0; i < 4; i++) {
        u32 w = state[i];
        t = ((w >> 16) | (w << 16)) & 0x30303030;
        w ^= (t >> 2) | (t << 30);
        t = w & 0x03030303;
        w ^= (t >> 28) | (t << 4);
        t = ((w >> 16) | (w << 16)) & 0x30303030;
        w ^= (t >> 30) | (t << 2);
        state[i] = w;
    }
}
/******************************************************************************
* The inverse MixColumns operation for rounds i such that (i % 4) == 2.
* (The original header said "== 0" — a copy-paste slip.) Undoes the three
* steps of mixcolumns_2 in reverse order.
******************************************************************************/
void inv_mixcolumns_2(u32* state) {
    u32 t;
    for(int i = 0; i < 4; i++) {
        u32 w = state[i];
        t = ((w >> 24) | (w << 8)) & 0xc0c0c0c0;
        w ^= (t >> 2) | (t << 30);
        t = ((w >> 16) | (w << 16)) & 0x0c0c0c0c;
        w ^= (t >> 28) | (t << 4);
        t = ((w >> 8) | (w << 24)) & 0xc0c0c0c0;
        w ^= (t >> 6) | (t << 26);
        state[i] = w;
    }
}
/******************************************************************************
* The inverse MixColumns operation for rounds i such that (i % 4) == 3.
* (The original header said "== 0" — a copy-paste slip.) Undoes the three
* steps of mixcolumns_3 in reverse order.
******************************************************************************/
void inv_mixcolumns_3(u32* state) {
    u32 t;
    for(int i = 0; i < 4; i++) {
        u32 w = state[i];
        t = w & 0x03030303;
        w ^= (t >> 26) | (t << 6);
        t = w & 0x30303030;
        w ^= (t >> 4) | (t << 28);
        t = w & 0x03030303;
        w ^= (t >> 30) | (t << 2);
        state[i] = w;
    }
}
/******************************************************************************
* Encryption of a single block without any operation mode using
* SKINNY-128-384+. RTK1 (16 precomputed round tweakeys, reused cyclically)
* and RTK2_3 (all 40 rounds) are given separately to take advantage of the
* fact that TK2 and TK3 remain the same through the entire data
* encryption/decryption.
******************************************************************************/
void skinny128_384_plus_encrypt(u8* ctext, const u8* ptext, const u32* rtk1,
                const u32* rtk2_3) {
    u32 tmp;                        // scratch word required by the SWAPMOVE macro
    u32 state[4];                   // 128-bit state, bitsliced
    packing(state, ptext);          // from byte to bitsliced representation
    // 10 quadruple rounds = 40 rounds; rtk1 repeats with period 4 groups
    for(int i = 0; i < 10; i++)
        QUADRUPLE_ROUND(state, rtk1 + 16*(i & 3), rtk2_3 + 16*i);
    unpacking(ctext, state);        // from bitsliced to byte representation
}
/******************************************************************************
* Decryption of a single block without any operation mode using
* SKINNY-128-384+. Mirrors skinny128_384_plus_encrypt by running the
* quadruple rounds in reverse order. Note: the first argument receives the
* output and the second is the input, exactly as in the encrypt routine.
******************************************************************************/
void skinny128_384_plus_decrypt(u8* ctext, const u8* ptext, const u32* rtk1,
                const u32* rtk2_3) {
    u32 tmp;                        // scratch word required by the SWAPMOVE macro
    u32 state[4];                   // 128-bit state, bitsliced
    packing(state, ptext);          // from byte to bitsliced representation
    // same schedule as encryption, walked backwards (i = 9 .. 0)
    for(int i = 9; i >= 0; i--)
        INV_QUADRUPLE_ROUND(state, rtk1 + 16*(i & 3), rtk2_3 + 16*i);
    unpacking(ctext, state);        // from bitsliced to byte representation
}
\ No newline at end of file
#ifndef SKINNY128_H_
#define SKINNY128_H_
#include "tk_schedule.h"
void skinny128_384_plus_encrypt(u8* ctext, const u8* ptext, const u32* rtk1, const u32* rtk2_3);
void skinny128_384_plus_decrypt(u8* ctext, const u8* ptext, const u32* rtk1, const u32* rtk2_3);
#define SKINNY128_384_ROUNDS 40
/******************************************************************************
* Four consecutive SKINNY-128-384+ rounds on the bitsliced state.
* Each round is: S-box layer (the NOR/XOR + SWAPMOVE network), addition of
* one round tweakey from RTK1 and one from RTK2_3 (4 words per round), then
* the fixsliced MixColumns variant matching the round index (mixcolumns_0..3).
* NOTE: relies on a variable 'u32 tmp;' declared in the calling scope (it is
* used by the SWAPMOVE macro). Implemented as a GNU statement expression.
******************************************************************************/
#define QUADRUPLE_ROUND(state, rtk1, rtk2_3) ({ \
state[3] ^= ~(state[0] | state[1]); \
SWAPMOVE(state[2], state[1], 0x55555555, 1); \
SWAPMOVE(state[3], state[2], 0x55555555, 1); \
state[1] ^= ~(state[2] | state[3]); \
SWAPMOVE(state[1], state[0], 0x55555555, 1); \
SWAPMOVE(state[0], state[3], 0x55555555, 1); \
state[3] ^= ~(state[0] | state[1]); \
SWAPMOVE(state[2], state[1], 0x55555555, 1); \
SWAPMOVE(state[3], state[2], 0x55555555, 1); \
state[1] ^= (state[2] | state[3]); \
SWAPMOVE(state[3], state[0], 0x55555555, 0); \
state[0] ^= (rtk1)[0]; \
state[1] ^= (rtk1)[1]; \
state[2] ^= (rtk1)[2]; \
state[3] ^= (rtk1)[3]; \
state[0] ^= (rtk2_3)[0]; \
state[1] ^= (rtk2_3)[1]; \
state[2] ^= (rtk2_3)[2]; \
state[3] ^= (rtk2_3)[3]; \
mixcolumns_0(state); \
state[1] ^= ~(state[2] | state[3]); \
SWAPMOVE(state[1], state[0], 0x55555555, 1); \
SWAPMOVE(state[0], state[3], 0x55555555, 1); \
state[3] ^= ~(state[0] | state[1]); \
SWAPMOVE(state[2], state[1], 0x55555555, 1); \
SWAPMOVE(state[3], state[2], 0x55555555, 1); \
state[1] ^= ~(state[2] | state[3]); \
SWAPMOVE(state[1], state[0], 0x55555555, 1); \
SWAPMOVE(state[0], state[3], 0x55555555, 1); \
state[3] ^= (state[0] | state[1]); \
SWAPMOVE(state[1], state[2], 0x55555555, 0); \
state[0] ^= (rtk1)[4]; \
state[1] ^= (rtk1)[5]; \
state[2] ^= (rtk1)[6]; \
state[3] ^= (rtk1)[7]; \
state[0] ^= (rtk2_3)[4]; \
state[1] ^= (rtk2_3)[5]; \
state[2] ^= (rtk2_3)[6]; \
state[3] ^= (rtk2_3)[7]; \
mixcolumns_1(state); \
state[3] ^= ~(state[0] | state[1]); \
SWAPMOVE(state[2], state[1], 0x55555555, 1); \
SWAPMOVE(state[3], state[2], 0x55555555, 1); \
state[1] ^= ~(state[2] | state[3]); \
SWAPMOVE(state[1], state[0], 0x55555555, 1); \
SWAPMOVE(state[0], state[3], 0x55555555, 1); \
state[3] ^= ~(state[0] | state[1]); \
SWAPMOVE(state[2], state[1], 0x55555555, 1); \
SWAPMOVE(state[3], state[2], 0x55555555, 1); \
state[1] ^= (state[2] | state[3]); \
SWAPMOVE(state[3], state[0], 0x55555555, 0); \
state[0] ^= (rtk1)[8]; \
state[1] ^= (rtk1)[9]; \
state[2] ^= (rtk1)[10]; \
state[3] ^= (rtk1)[11]; \
state[0] ^= (rtk2_3)[8]; \
state[1] ^= (rtk2_3)[9]; \
state[2] ^= (rtk2_3)[10]; \
state[3] ^= (rtk2_3)[11]; \
mixcolumns_2(state); \
state[1] ^= ~(state[2] | state[3]); \
SWAPMOVE(state[1], state[0], 0x55555555, 1); \
SWAPMOVE(state[0], state[3], 0x55555555, 1); \
state[3] ^= ~(state[0] | state[1]); \
SWAPMOVE(state[2], state[1], 0x55555555, 1); \
SWAPMOVE(state[3], state[2], 0x55555555, 1); \
state[1] ^= ~(state[2] | state[3]); \
SWAPMOVE(state[1], state[0], 0x55555555, 1); \
SWAPMOVE(state[0], state[3], 0x55555555, 1); \
state[3] ^= (state[0] | state[1]); \
SWAPMOVE(state[1], state[2], 0x55555555, 0); \
state[0] ^= (rtk1)[12]; \
state[1] ^= (rtk1)[13]; \
state[2] ^= (rtk1)[14]; \
state[3] ^= (rtk1)[15]; \
state[0] ^= (rtk2_3)[12]; \
state[1] ^= (rtk2_3)[13]; \
state[2] ^= (rtk2_3)[14]; \
state[3] ^= (rtk2_3)[15]; \
mixcolumns_3(state); \
})
/******************************************************************************
* Inverse of QUADRUPLE_ROUND: undoes four SKINNY-128-384+ rounds by running
* each round's steps in reverse order — inverse MixColumns first, then the
* round-tweakey addition (XOR is self-inverse), then the inverse S-box layer.
* NOTE: relies on a variable 'u32 tmp;' declared in the calling scope (it is
* used by the SWAPMOVE macro). Implemented as a GNU statement expression.
******************************************************************************/
#define INV_QUADRUPLE_ROUND(state, rtk1, rtk2_3) ({ \
inv_mixcolumns_3(state); \
state[0] ^= (rtk1)[12]; \
state[1] ^= (rtk1)[13]; \
state[2] ^= (rtk1)[14]; \
state[3] ^= (rtk1)[15]; \
state[0] ^= (rtk2_3)[12]; \
state[1] ^= (rtk2_3)[13]; \
state[2] ^= (rtk2_3)[14]; \
state[3] ^= (rtk2_3)[15]; \
SWAPMOVE(state[1], state[2], 0x55555555, 0); \
state[3] ^= (state[0] | state[1]); \
SWAPMOVE(state[0], state[3], 0x55555555, 1); \
SWAPMOVE(state[1], state[0], 0x55555555, 1); \
state[1] ^= ~(state[2] | state[3]); \
SWAPMOVE(state[3], state[2], 0x55555555, 1); \
SWAPMOVE(state[2], state[1], 0x55555555, 1); \
state[3] ^= ~(state[0] | state[1]); \
SWAPMOVE(state[0], state[3], 0x55555555, 1); \
SWAPMOVE(state[1], state[0], 0x55555555, 1); \
state[1] ^= ~(state[2] | state[3]); \
inv_mixcolumns_2(state); \
state[0] ^= (rtk1)[8]; \
state[1] ^= (rtk1)[9]; \
state[2] ^= (rtk1)[10]; \
state[3] ^= (rtk1)[11]; \
state[0] ^= (rtk2_3)[8]; \
state[1] ^= (rtk2_3)[9]; \
state[2] ^= (rtk2_3)[10]; \
state[3] ^= (rtk2_3)[11]; \
SWAPMOVE(state[3], state[0], 0x55555555, 0); \
state[1] ^= (state[2] | state[3]); \
SWAPMOVE(state[3], state[2], 0x55555555, 1); \
SWAPMOVE(state[2], state[1], 0x55555555, 1); \
state[3] ^= ~(state[0] | state[1]); \
SWAPMOVE(state[0], state[3], 0x55555555, 1); \
SWAPMOVE(state[1], state[0], 0x55555555, 1); \
state[1] ^= ~(state[2] | state[3]); \
SWAPMOVE(state[3], state[2], 0x55555555, 1); \
SWAPMOVE(state[2], state[1], 0x55555555, 1); \
state[3] ^= ~(state[0] | state[1]); \
inv_mixcolumns_1(state); \
state[0] ^= (rtk1)[4]; \
state[1] ^= (rtk1)[5]; \
state[2] ^= (rtk1)[6]; \
state[3] ^= (rtk1)[7]; \
state[0] ^= (rtk2_3)[4]; \
state[1] ^= (rtk2_3)[5]; \
state[2] ^= (rtk2_3)[6]; \
state[3] ^= (rtk2_3)[7]; \
SWAPMOVE(state[1], state[2], 0x55555555, 0); \
state[3] ^= (state[0] | state[1]); \
SWAPMOVE(state[0], state[3], 0x55555555, 1); \
SWAPMOVE(state[1], state[0], 0x55555555, 1); \
state[1] ^= ~(state[2] | state[3]); \
SWAPMOVE(state[3], state[2], 0x55555555, 1); \
SWAPMOVE(state[2], state[1], 0x55555555, 1); \
state[3] ^= ~(state[0] | state[1]); \
SWAPMOVE(state[0], state[3], 0x55555555, 1); \
SWAPMOVE(state[1], state[0], 0x55555555, 1); \
state[1] ^= ~(state[2] | state[3]); \
inv_mixcolumns_0(state); \
state[0] ^= (rtk1)[0]; \
state[1] ^= (rtk1)[1]; \
state[2] ^= (rtk1)[2]; \
state[3] ^= (rtk1)[3]; \
state[0] ^= (rtk2_3)[0]; \
state[1] ^= (rtk2_3)[1]; \
state[2] ^= (rtk2_3)[2]; \
state[3] ^= (rtk2_3)[3]; \
SWAPMOVE(state[3], state[0], 0x55555555, 0); \
state[1] ^= (state[2] | state[3]); \
SWAPMOVE(state[3], state[2], 0x55555555, 1); \
SWAPMOVE(state[2], state[1], 0x55555555, 1); \
state[3] ^= ~(state[0] | state[1]); \
SWAPMOVE(state[0], state[3], 0x55555555, 1); \
SWAPMOVE(state[1], state[0], 0x55555555, 1); \
state[1] ^= ~(state[2] | state[3]); \
SWAPMOVE(state[3], state[2], 0x55555555, 1); \
SWAPMOVE(state[2], state[1], 0x55555555, 1); \
state[3] ^= ~(state[0] | state[1]); \
})
#endif // SKINNY128_H_
\ No newline at end of file
#ifndef SKINNYAEADM1_H_
#define SKINNYAEADM1_H_
#include "skinny128.h"
typedef unsigned char u8;
typedef unsigned int u32;
typedef unsigned long long u64;
#define TAGBYTES 16 // 128-bit authentication tag
#define KEYBYTES 16 // 128-bit key
#define BLOCKBYTES 16 // 128-bit cipher block
/* Write the domain-separation byte into byte 15 of a 16-byte tweak block. */
#define SET_DOMAIN(ptr, domain) ((ptr)[15] = (domain))
/* Clock the 64-bit block-counter LFSR once (feedback constant 0x1B).
* NOTE: expects a variable named 'feedback' to be declared in the calling
* scope — it is used as scratch, not read beforehand. */
#define UPDATE_LFSR(lfsr) ({ \
feedback = ((lfsr) & (1ULL << 63)) ? 0x1B : 0x00; \
(lfsr) = ((lfsr) << 1) ^ feedback; \
})
/* Store the 64-bit value x little-endian into the first 8 bytes of ptr. */
#define LE_STR_64(ptr, x) ({ \
(ptr)[0] = (u8)(x); \
(ptr)[1] = (u8)((x) >> 8); \
(ptr)[2] = (u8)((x) >> 16); \
(ptr)[3] = (u8)((x) >> 24); \
(ptr)[4] = (u8)((x) >> 32); \
(ptr)[5] = (u8)((x) >> 40); \
(ptr)[6] = (u8)((x) >> 48); \
(ptr)[7] = (u8)((x) >> 56); \
})
#endif // SKINNYAEADM1_H_
\ No newline at end of file
/******************************************************************************
* Implementation of the SKINNY tweakey schedule to match fixslicing.
*
* @author Alexandre Adomnicai, Nanyang Technological University,
* alexandre.adomnicai@ntu.edu.sg
*
* @date May 2020
******************************************************************************/
#include <stdio.h>
#include <string.h> //for memcmp
#include "tk_schedule.h"
#include "skinny128.h"
typedef unsigned char u8;
typedef unsigned int u32;
/******************************************************************************
* The round constants according to the new (bitsliced/fixsliced)
* representation: 40 rounds x 4 32-bit words = 160 entries, consumed as
* rconst_32_bs[round*4 + word] in precompute_rtk2_3.
******************************************************************************/
u32 rconst_32_bs[160] = {
0x00000004, 0xffffffbf, 0x00000000, 0x00000000, 0x00000000, 0x00000000,
0x10000100, 0xfffffeff, 0x44000000, 0xfbffffff, 0x00000000, 0x04000000,
0x00100000, 0x00100000, 0x00100001, 0xffefffff, 0x00440000, 0xffafffff,
0x00400000, 0x00400000, 0x01000000, 0x01000000, 0x01401000, 0xffbfffff,
0x01004000, 0xfefffbff, 0x00000400, 0x00000400, 0x00000010, 0x00000000,
0x00010410, 0xfffffbef, 0x00000054, 0xffffffaf, 0x00000000, 0x00000040,
0x00000100, 0x00000100, 0x10000140, 0xfffffeff, 0x44000000, 0xfffffeff,
0x04000000, 0x04000000, 0x00100000, 0x00100000, 0x04000001, 0xfbffffff,
0x00140000, 0xffafffff, 0x00400000, 0x00000000, 0x00000000, 0x00000000,
0x01401000, 0xfebfffff, 0x01004400, 0xfffffbff, 0x00000000, 0x00000400,
0x00000010, 0x00000010, 0x00010010, 0xffffffff, 0x00000004, 0xffffffaf,
0x00000040, 0x00000040, 0x00000100, 0x00000000, 0x10000140, 0xffffffbf,
0x40000100, 0xfbfffeff, 0x00000000, 0x04000000, 0x00100000, 0x00000000,
0x04100001, 0xffefffff, 0x00440000, 0xffefffff, 0x00000000, 0x00400000,
0x01000000, 0x01000000, 0x00401000, 0xffffffff, 0x00004000, 0xfeffffff,
0x00000400, 0x00000000, 0x00000000, 0x00000000, 0x00010400, 0xfffffbff,
0x00000014, 0xffffffbf, 0x00000000, 0x00000000, 0x00000000, 0x00000000,
0x10000100, 0xffffffff, 0x40000000, 0xfbffffff, 0x00000000, 0x04000000,
0x00100000, 0x00000000, 0x00100001, 0xffefffff, 0x00440000, 0xffafffff,
0x00000000, 0x00400000, 0x01000000, 0x01000000, 0x01401000, 0xffffffff,
0x00004000, 0xfeffffff, 0x00000400, 0x00000400, 0x00000010, 0x00000000,
0x00010400, 0xfffffbff, 0x00000014, 0xffffffaf, 0x00000000, 0x00000000,
0x00000000, 0x00000000, 0x10000140, 0xfffffeff, 0x44000000, 0xffffffff,
0x00000000, 0x04000000, 0x00100000, 0x00100000, 0x00000001, 0xffefffff,
0x00440000, 0xffafffff, 0x00400000, 0x00000000, 0x00000000, 0x01000000,
0x01401000, 0xffbfffff, 0x01004000, 0xfffffbff, 0x00000400, 0x00000400,
0x00000010, 0x00000000, 0x00010010, 0xfffffbff
};
/******************************************************************************
* Pack the input into the bitsliced representation
* 24 28 56 60 88 92 120 124 | ... | 0 4 32 36 64 68 96 100
* 25 29 57 61 89 93 121 125 | ... | 1 5 33 37 65 69 97 101
* 26 30 58 62 90 94 122 126 | ... | 2 6 34 38 66 70 98 102
* 27 31 59 63 91 95 123 127 | ... | 3 7 35 39 67 71 99 103
******************************************************************************/
void packing(u32* out, const u8* in) {
    u32 tmp;    // scratch required by SWAPMOVE
    // load 16 bytes as four little-endian words, middle pair swapped
    LE_LOAD(out, in);
    LE_LOAD(out + 1, in + 8);
    LE_LOAD(out + 2, in + 4);
    LE_LOAD(out + 3, in + 12);
    // in-word bit swaps, identical for every word
    for(int i = 0; i < 4; i++)
        SWAPMOVE(out[i], out[i], 0x0a0a0a0a, 3);
    // cross-word bit swaps (order matters)
    SWAPMOVE(out[2], out[0], 0x30303030, 2);
    SWAPMOVE(out[1], out[0], 0x0c0c0c0c, 4);
    SWAPMOVE(out[3], out[0], 0x03030303, 6);
    SWAPMOVE(out[1], out[2], 0x0c0c0c0c, 2);
    SWAPMOVE(out[3], out[2], 0x03030303, 4);
    SWAPMOVE(out[3], out[1], 0x03030303, 2);
}
/******************************************************************************
* Unpack the input to a byte-wise representation — exact inverse of packing:
* the cross-word swaps are undone in reverse order, then the in-word swaps,
* then the words are stored back little-endian.
******************************************************************************/
void unpacking(u8* out, u32 *in) {
    u32 tmp;    // scratch required by SWAPMOVE
    // undo the cross-word bit swaps (reverse of packing order)
    SWAPMOVE(in[3], in[1], 0x03030303, 2);
    SWAPMOVE(in[3], in[2], 0x03030303, 4);
    SWAPMOVE(in[1], in[2], 0x0c0c0c0c, 2);
    SWAPMOVE(in[3], in[0], 0x03030303, 6);
    SWAPMOVE(in[1], in[0], 0x0c0c0c0c, 4);
    SWAPMOVE(in[2], in[0], 0x30303030, 2);
    // undo the in-word bit swaps (self-inverse transform)
    for(int i = 0; i < 4; i++)
        SWAPMOVE(in[i], in[i], 0x0a0a0a0a, 3);
    LE_STORE(out, in[0]);
    LE_STORE(out + 8, in[1]);
    LE_STORE(out + 4, in[2]);
    LE_STORE(out + 12, in[3]);
}
/******************************************************************************
* Bitsliced TK2 LFSR:
* 0 4 1 5
* 1 5 ---> 2 6
* 2 6 3 7
* 3 7 4 0
* Shifts the four slices up by one and feeds the new bottom slice.
******************************************************************************/
void lfsr2_bs(u32* tk) {
    u32 fb = tk[0] ^ (tk[2] & 0xaaaaaaaa);
    fb = ((fb & 0xaaaaaaaa) >> 1) | ((fb << 1) & 0xaaaaaaaa);
    for(int i = 0; i < 3; i++)
        tk[i] = tk[i+1];
    tk[3] = fb;
}
/******************************************************************************
* Bitsliced TK3 LFSR:
* 0 4 7 3
* 1 5 ---> 0 4
* 2 6 1 5
* 3 7 2 6
* Shifts the four slices down by one and feeds the new top slice.
******************************************************************************/
void lfsr3_bs(u32* tk) {
    u32 fb = tk[3] ^ ((tk[1] & 0xaaaaaaaa) >> 1);
    fb = ((fb & 0xaaaaaaaa) >> 1) | ((fb << 1) & 0xaaaaaaaa);
    for(int i = 3; i > 0; i--)
        tk[i] = tk[i-1];
    tk[0] = fb;
}
/******************************************************************************
* Apply the tweakey permutation P twice (P^2) in a bitsliced manner.
* Each rotate/shift term is masked to a disjoint group of bits, so the
* successive |= operations assemble the fully permuted word.
******************************************************************************/
void permute_tk_2(u32* tk) {
u32 tmp;
for(int i =0; i < 4; i++) {
tmp = tk[i];
tk[i] = ROR(tmp,14) & 0xcc00cc00;
tk[i] |= (tmp & 0x000000ff) << 16;
tk[i] |= (tmp & 0xcc000000)>> 2;
tk[i] |= (tmp & 0x0033cc00) >> 8;
tk[i] |= (tmp & 0x00cc0000) >>18;
}
}
/******************************************************************************
* Apply the tweakey permutation P four times (P^4) in a bitsliced manner.
* Each rotate/shift term is masked to a disjoint group of bits, so the
* successive |= operations assemble the fully permuted word.
******************************************************************************/
void permute_tk_4(u32* tk) {
u32 tmp;
for(int i =0; i < 4; i++) {
tmp = tk[i];
tk[i] = ROR(tmp,22) & 0xcc0000cc;
tk[i] |= ROR(tmp,16) & 0x3300cc00;
tk[i] |= ROR(tmp, 24) & 0x00cc3300;
tk[i] |= (tmp & 0x00cc00cc) >> 2;
}
}
/******************************************************************************
* Apply the tweakey permutation P six times (P^6) in a bitsliced manner.
* Each rotate/shift term is masked to a disjoint group of bits, so the
* successive |= operations assemble the fully permuted word.
******************************************************************************/
void permute_tk_6(u32* tk) {
u32 tmp;
for(int i =0; i < 4; i++) {
tmp = tk[i];
tk[i] = ROR(tmp,6) & 0xcccc0000;
tk[i] |= ROR(tmp,24) & 0x330000cc;
tk[i] |= ROR(tmp,10) & 0x3333;
tk[i] |= (tmp & 0xcc) << 14;
tk[i] |= (tmp & 0x3300) << 2;
}
}
/******************************************************************************
* Apply the tweakey permutation P eight times (P^8) in a bitsliced manner.
* Each rotate/shift term is masked to a disjoint group of bits, so the
* successive |= operations assemble the fully permuted word.
******************************************************************************/
void permute_tk_8(u32* tk) {
u32 tmp;
for(int i =0; i < 4; i++) {
tmp = tk[i];
tk[i] = ROR(tmp,24) & 0xcc000033;
tk[i] |= ROR(tmp,8) & 0x33cc0000;
tk[i] |= ROR(tmp,26) & 0x00333300;
tk[i] |= (tmp & 0x00333300) >> 6;
}
}
/******************************************************************************
* Apply the tweakey permutation P ten times (P^10) in a bitsliced manner.
* Each rotate/shift term is masked to a disjoint group of bits, so the
* successive |= operations assemble the fully permuted word.
******************************************************************************/
void permute_tk_10(u32* tk) {
u32 tmp;
for(int i =0; i < 4; i++) {
tmp = tk[i];
tk[i] = ROR(tmp,8) & 0xcc330000;
tk[i] |= ROR(tmp,26) & 0x33000033;
tk[i] |= ROR(tmp,22) & 0x00cccc00;
tk[i] |= (tmp & 0x00330000) >> 14;
tk[i] |= (tmp & 0xcc00) >> 2;
}
}
/******************************************************************************
* Apply the tweakey permutation P twelve times (P^12) in a bitsliced manner.
* Each rotate term is masked to a disjoint group of bits, so the successive
* |= operations assemble the fully permuted word.
******************************************************************************/
void permute_tk_12(u32* tk) {
u32 tmp;
for(int i =0; i < 4; i++) {
tmp = tk[i];
tk[i] = ROR(tmp,8) & 0xcc33;
tk[i] |= ROR(tmp,30) & 0x00cc00cc;
tk[i] |= ROR(tmp,10) & 0x33330000;
tk[i] |= ROR(tmp,16) & 0xcc003300;
}
}
/******************************************************************************
* Apply the tweakey permutation P fourteen times (P^14) in a bitsliced
* manner. Each rotate term is masked to a disjoint group of bits, so the
* successive |= operations assemble the fully permuted word.
******************************************************************************/
void permute_tk_14(u32* tk) {
u32 tmp;
for(int i =0; i < 4; i++) {
tmp = tk[i];
tk[i] = ROR(tmp,24) & 0x0033cc00;
tk[i] |= ROR(tmp,14) & 0x00cc0000;
tk[i] |= ROR(tmp,30) & 0xcc000000;
tk[i] |= ROR(tmp,16) & 0x000000ff;
tk[i] |= ROR(tmp,18) & 0x33003300;
}
}
/******************************************************************************
* Precompute all LFSR states of TK2: the packed key goes into tk[0..3], and
* each subsequent group of 4 words receives the LFSR advanced once (one
* advance covers two rounds, hence the step of 2).
******************************************************************************/
void precompute_lfsr_tk2(u32* tk, const u8* key, const int rounds) {
    u32 tk2[4];
    packing(tk2, key);
    for(int j = 0; j < 4; j++)
        tk[j] = tk2[j];
    for(int i = 0; i < rounds; i += 2) {
        lfsr2_bs(tk2);
        for(int j = 0; j < 4; j++)
            tk[i*4 + 4 + j] = tk2[j];
    }
}
/******************************************************************************
* Precompute all LFSR states of TK3, XORing them on top of the TK2 states
* already written by precompute_lfsr_tk2 (same word layout, same step of 2).
******************************************************************************/
void precompute_lfsr_tk3(u32* tk, const u8* key, const int rounds) {
    u32 tk3[4];
    packing(tk3, key);
    for(int j = 0; j < 4; j++)
        tk[j] ^= tk3[j];
    for(int i = 0; i < rounds; i += 2) {
        lfsr3_bs(tk3);
        for(int j = 0; j < 4; j++)
            tk[i*4 + 4 + j] ^= tk3[j];
    }
}
/******************************************************************************
* XOR TK with TK1 before applying the permutations.
* The key is then rearranged to match the barrel shiftrows representation.
* Processes 8 rounds per loop iteration; the call sites use rounds = 16 and
* rounds = 40, both multiples of 8 — NOTE(review): the loop body assumes
* this, confirm before calling with other values. Alternates between powers
* P^2..P^8 and P^10..P^16 (== identity shifted) every 8 rounds since P has
* order 16.
******************************************************************************/
void permute_tk(u32* tk, const u8* key, const int rounds) {
u32 test; // 1 when this 8-round group uses P^2/P^4/P^6/P^8, else P^10..P^14
u32 tk1[4], tmp[4];
packing(tk1, key);
memcpy(tmp, tk, 16);
tmp[0] ^= tk1[0]; // fold TK1 into the round-0 tweakey
tmp[1] ^= tk1[1];
tmp[2] ^= tk1[2];
tmp[3] ^= tk1[3];
for(int i = 0 ; i < rounds; i += 8) {
test = (i % 16 < 8) ? 1 : 0; //to apply the right power of P
tk[i*4] = tmp[2] & 0xf0f0f0f0; // rounds 8i, 8i+1 (rows rotated 2)
tk[i*4+1] = tmp[3] & 0xf0f0f0f0;
tk[i*4+2] = tmp[0] & 0xf0f0f0f0;
tk[i*4+3] = tmp[1] & 0xf0f0f0f0;
memcpy(tmp, tk+i*4+4, 16);
XOR_BLOCKS(tmp, tk1);
if (test)
permute_tk_2(tmp); // applies P^2
else
permute_tk_10(tmp); // applies P^10
tk[i*4+4] = ROR(tmp[0],26) & 0xc3c3c3c3; // rounds 8i+2, 8i+3
tk[i*4+5] = ROR(tmp[1],26) & 0xc3c3c3c3;
tk[i*4+6] = ROR(tmp[2],26) & 0xc3c3c3c3;
tk[i*4+7] = ROR(tmp[3],26) & 0xc3c3c3c3;
tk[i*4+8] = ROR(tmp[2],28) & 0x03030303;
tk[i*4+8] |= ROR(tmp[2],12) & 0x0c0c0c0c;
tk[i*4+9] = ROR(tmp[3],28) & 0x03030303;
tk[i*4+9] |= ROR(tmp[3],12) & 0x0c0c0c0c;
tk[i*4+10] = ROR(tmp[0],28) & 0x03030303;
tk[i*4+10] |= ROR(tmp[0],12) & 0x0c0c0c0c;
tk[i*4+11] = ROR(tmp[1],28) & 0x03030303;
tk[i*4+11] |= ROR(tmp[1],12) & 0x0c0c0c0c;
memcpy(tmp, tk+i*4+12, 16);
XOR_BLOCKS(tmp, tk1);
if (test)
permute_tk_4(tmp); // applies P^4
else
permute_tk_12(tmp); // applies P^12
for(int j = 0; j < 4; j++) {
tk[i*4+12+j] = ROR(tmp[j],14) & 0x30303030;
tk[i*4+12+j] |= ROR(tmp[j],6) & 0x0c0c0c0c;
}
tk[i*4+16] = ROR(tmp[2], 16) & 0xf0f0f0f0; // rounds 8i+4, 8i+5
tk[i*4+17] = ROR(tmp[3], 16) & 0xf0f0f0f0;
tk[i*4+18] = ROR(tmp[0], 16) & 0xf0f0f0f0;
tk[i*4+19] = ROR(tmp[1], 16) & 0xf0f0f0f0;
memcpy(tmp, tk+i*4+20, 16);
XOR_BLOCKS(tmp, tk1);
if (test)
permute_tk_6(tmp); // applies P^6
else
permute_tk_14(tmp); // applies P^14
tk[i*4+20] = ROR(tmp[0], 10) & 0xc3c3c3c3; // rounds 8i+6, 8i+7
tk[i*4+21] = ROR(tmp[1], 10) & 0xc3c3c3c3;
tk[i*4+22] = ROR(tmp[2], 10) & 0xc3c3c3c3;
tk[i*4+23] = ROR(tmp[3], 10) & 0xc3c3c3c3;
tk[i*4+24] = ROR(tmp[2],12) & 0x03030303;
tk[i*4+24] |= ROR(tmp[2],28) & 0x0c0c0c0c;
tk[i*4+25] = ROR(tmp[3],12) & 0x03030303;
tk[i*4+25] |= ROR(tmp[3],28) & 0x0c0c0c0c;
tk[i*4+26] = ROR(tmp[0],12) & 0x03030303;
tk[i*4+26] |= ROR(tmp[0],28) & 0x0c0c0c0c;
tk[i*4+27] = ROR(tmp[1],12) & 0x03030303;
tk[i*4+27] |= ROR(tmp[1],28) & 0x0c0c0c0c;
memcpy(tmp, tk+i*4+28, 16);
XOR_BLOCKS(tmp, tk1);
if (test)
permute_tk_8(tmp); // applies P^8 (second half needs no extra permutation)
for(int j = 0; j < 4; j++) {
tk[i*4+28+j] = ROR(tmp[j],30) & 0x30303030;
tk[i*4+28+j] |= ROR(tmp[j],22) & 0x0c0c0c0c;
}
if (test && (i+8 < rounds)) { //only if next loop iteration
tk[i*4+32] = tmp[2] & 0xf0f0f0f0; // seed the next group's first rounds
tk[i*4+33] = tmp[3] & 0xf0f0f0f0;
tk[i*4+34] = tmp[0] & 0xf0f0f0f0;
tk[i*4+35] = tmp[1] & 0xf0f0f0f0;
}
}
}
/******************************************************************************
* Precompute LFSR2(TK2) ^ LFSR3(TK3) ^ rconst for all 40 rounds.
* permute_tk is given rtk+8 as the "TK1 key": the two LFSR precomputations
* only write every other 4-word group, so words 8..11 are still zero from
* the memset — i.e. the permutation is applied with an all-zero TK1.
******************************************************************************/
void precompute_rtk2_3(u32* rtk, const u8* tk2, const u8 * tk3) {
    memset(rtk, 0x00, 16*SKINNY128_384_ROUNDS);
    precompute_lfsr_tk2(rtk, tk2, SKINNY128_384_ROUNDS);
    precompute_lfsr_tk3(rtk, tk3, SKINNY128_384_ROUNDS);
    permute_tk(rtk, (u8*)(rtk+8), SKINNY128_384_ROUNDS); // all-zero TK1
    // finally add the round constants (4 words per round)
    for(int i = 0; i < 4*SKINNY128_384_ROUNDS; i++)
        rtk[i] ^= rconst_32_bs[i];
}
/******************************************************************************
* Precompute RTK1. Only 16 round tweakeys are stored since TK1 is not
* clocked by an LFSR and the permutation P has order 16: the cipher reuses
* these 16 entries cyclically.
******************************************************************************/
void precompute_rtk1(u32* rtk1, const u8* tk1) {
    const int rtk1_rounds = 16;             // RTK1 repeats with period 16
    memset(rtk1, 0x00, rtk1_rounds * 16);   // 16 rounds x 16 bytes
    permute_tk(rtk1, tk1, rtk1_rounds);
}
\ No newline at end of file
#ifndef TK_SCHEDULE_H_
#define TK_SCHEDULE_H_
typedef unsigned char u8;
typedef unsigned int u32;
void packing(u32* out, const u8* in);
void unpacking(u8* out, u32 *in);
void precompute_rtk2_3(u32* rtk, const u8* tk2, const u8* tk3);
void precompute_rtk1(u32* rtk1, const u8* tk1);
/* 32-bit right rotation; y must be in 1..31 (y == 0 or 32 would shift by
* the full word width, which is undefined behavior). */
#define ROR(x,y) (((x) >> (y)) | ((x) << (32 - (y))))
/* XOR the 128-bit block y into x (both arrays of four u32 words). */
#define XOR_BLOCKS(x,y) ({ \
(x)[0] ^= (y)[0]; \
(x)[1] ^= (y)[1]; \
(x)[2] ^= (y)[2]; \
(x)[3] ^= (y)[3]; \
})
/* Swap the bits of b and (a >> n) selected by mask.
* NOTE: requires a variable 'u32 tmp;' declared in the calling scope. */
#define SWAPMOVE(a, b, mask, n) ({ \
tmp = (b ^ (a >> n)) & mask; \
b ^= tmp; \
a ^= (tmp << n); \
})
/* Little-endian load of 4 bytes at y into the u32 pointed to by x.
* Wrapped in do/while(0) so the macro is a single statement and stays safe
* inside unbraced if/else bodies (CERT PRE10-C). */
#define LE_LOAD(x, y) do { \
*(x) = (((u32)(y)[3] << 24) | \
((u32)(y)[2] << 16) | \
((u32)(y)[1] << 8) | \
(y)[0]); \
} while(0)
/* Little-endian store of the u32 value y into 4 bytes at x.
* do/while(0) keeps the multi-statement expansion from silently breaking
* when used as the body of an unbraced if/else (CERT PRE10-C). */
#define LE_STORE(x, y) do { \
(x)[0] = (y) & 0xff; \
(x)[1] = ((y) >> 8) & 0xff; \
(x)[2] = ((y) >> 16) & 0xff; \
(x)[3] = (y) >> 24; \
} while(0)
#endif // TK_SCHEDULE_H_
\ No newline at end of file
// Parameter sizes required by the NIST LWC AEAD API.
#define CRYPTO_KEYBYTES 16 // 128-bit key
#define CRYPTO_NSECBYTES 0 // no secret message number
#define CRYPTO_NPUBBYTES 16 // 128-bit public nonce
#define CRYPTO_ABYTES 16 // ciphertext expansion (tag) in bytes
#define CRYPTO_NOOVERLAP 1
int crypto_aead_encrypt(
unsigned char *c, unsigned long long *clen,
const unsigned char *m, unsigned long long mlen,
const unsigned char *ad, unsigned long long adlen,
const unsigned char *nsec,
const unsigned char *npub,
const unsigned char *k
);
int crypto_aead_decrypt(
unsigned char *m, unsigned long long *mlen,
unsigned char *nsec,
const unsigned char *c, unsigned long long clen,
const unsigned char *ad, unsigned long long adlen,
const unsigned char *npub,
const unsigned char *k
);
\ No newline at end of file
/******************************************************************************
* Constant-time implementation of SKINNY-AEAD-M1 (v1.1).
*
* Two blocks are treated in parallel with SKINNY-128-384 whenever possible.
*
* For more details, see the author's paper on fixsliced SKINNY implementations (IACR ePrint).
*
* @author Alexandre Adomnicai, Nanyang Technological University,
* alexandre.adomnicai@ntu.edu.sg
*
* @date May 2020
******************************************************************************/
#include "skinny128.h"
#include "skinnyaead.h"
#include <string.h>
#include <stdio.h>
/******************************************************************************
* x ^= y where x, y are 128-bit blocks (16-byte arrays).
******************************************************************************/
static void xor_block(u8 * x, const u8* y) {
    const u8 *end = y + BLOCKBYTES;
    while (y != end)
        *x++ ^= *y++;
}
/******************************************************************************
* Process the associated data. Common to SKINNY-AEAD-M1 encrypt and decrypt
* functions. Absorbs 'ad' two blocks at a time into 'auth'; when mlen == 0
* the final cipher call also encrypts the running sum into the tag buffer
* (via 'c'/'tag') instead of a dummy block, saving one cipher invocation.
* NOTE(review): this variant uses the 2-block-parallel cipher API
* (5-argument skinny128_384_plus_encrypt, 3-argument precompute_rtk1) and a
* project 'tweakey' struct — both defined elsewhere in this package.
******************************************************************************/
static void skinny_aead_m1_auth(u8* auth, u8* c, u8* tag, tweakey* tk,
u64 mlen, const u8* ad, u64 adlen) {
u64 lfsr = 1; // block-counter LFSR, one step per single block
u8 feedback; // scratch consumed by the UPDATE_LFSR macro
u8 tmp[2*BLOCKBYTES]; // two tweak blocks processed in parallel
memset(tmp, 0x00, 2*BLOCKBYTES);
memset(auth, 0x00, BLOCKBYTES);
SET_DOMAIN(tmp, 0x02); // domain for full ad blocks
while (adlen >= 2*BLOCKBYTES) { // process 2 full blocks in parallel
LE_STR_64(tmp, lfsr); // counter for the 1st block
UPDATE_LFSR(lfsr);
LE_STR_64(tmp + BLOCKBYTES, lfsr); // counter for the 2nd block
SET_DOMAIN(tmp + BLOCKBYTES, 0x02);
precompute_rtk1(tk->rtk1, tmp, tmp+BLOCKBYTES);
skinny128_384_plus_encrypt(tmp, tmp+BLOCKBYTES, ad, ad+BLOCKBYTES, *tk);
xor_block(auth, tmp); // accumulate into the auth sum
xor_block(auth, tmp + BLOCKBYTES);
adlen -= 2*BLOCKBYTES;
ad += 2*BLOCKBYTES;
UPDATE_LFSR(lfsr);
}
if (adlen > BLOCKBYTES) { // one full + one partial block: pad, run in parallel
LE_STR_64(tmp, lfsr);
UPDATE_LFSR(lfsr);
LE_STR_64(tmp + BLOCKBYTES, lfsr);
SET_DOMAIN(tmp + BLOCKBYTES, 0x03); // domain for padded ad
precompute_rtk1(tk->rtk1, tmp, tmp + BLOCKBYTES);
adlen -= BLOCKBYTES;
memset(tmp, 0x00, BLOCKBYTES); // 10* padding of the partial block
memcpy(tmp, ad + BLOCKBYTES, adlen);
tmp[adlen] ^= 0x80; // padding bit
skinny128_384_plus_encrypt(tmp + BLOCKBYTES, tmp, ad, tmp, *tk);
xor_block(auth, tmp);
xor_block(auth, tmp + BLOCKBYTES);
} else if (adlen == BLOCKBYTES) { // exactly one full block left
LE_STR_64(tmp, lfsr);
if (mlen == 0) { // if the tag has *NOT* been computed yet
precompute_rtk1(tk->rtk1, tmp, tag); // pair ad block with tag computation
skinny128_384_plus_encrypt(auth, c, ad, c, *tk);
} else { // if the tag has already been computed
precompute_rtk1(tk->rtk1, tmp, tmp); // process last ad block alone
skinny128_384_plus_encrypt(auth, auth, ad, ad, *tk);
}
} else if (adlen > 0) { // one partial block left
LE_STR_64(tmp, lfsr);
SET_DOMAIN(tmp, 0x03); // domain for padded ad
memset(tmp + BLOCKBYTES, 0x00, BLOCKBYTES); // 10* padding
memcpy(tmp + BLOCKBYTES, ad, adlen);
tmp[BLOCKBYTES + adlen] ^= 0x80; // padding bit
if (mlen == 0) { // if the tag has *NOT* been computed yet
precompute_rtk1(tk->rtk1, tmp, tag); // pair ad block with tag computation
skinny128_384_plus_encrypt(auth, c, tmp + BLOCKBYTES, c, *tk);
} else { // if the tag has already been computed
precompute_rtk1(tk->rtk1, tmp, tmp); // process last ad block alone
skinny128_384_plus_encrypt(auth, auth, tmp + BLOCKBYTES, tmp + BLOCKBYTES, *tk);
}
}
}
/******************************************************************************
 * Encryption and authentication using SKINNY-AEAD-M1 (NIST LWC API).
 *
 * Output layout: mlen bytes of ciphertext followed by a TAGBYTES tag,
 * so *clen = mlen + TAGBYTES. 'nsec' is unused.
 *
 * Message blocks are processed two at a time by the fixsliced
 * SKINNY-128-384+ primitive. The tag checksum is accumulated directly in
 * the tag area at c + mlen; inside the main loop c grows while mlen
 * shrinks by the same amount, so c + mlen keeps pointing at that area.
 ******************************************************************************/
int crypto_aead_encrypt (unsigned char *c, unsigned long long *clen,
                         const unsigned char *m, unsigned long long mlen,
                         const unsigned char *ad, unsigned long long adlen,
                         const unsigned char *nsec,
                         const unsigned char *npub,
                         const unsigned char *k) {
    u64 i,lfsr = 1;                 // 64-bit block counter, clocked by UPDATE_LFSR
    u8 feedback;                    // scratch byte required by the UPDATE_LFSR macro
    tweakey tk;
    u8 tmp[2*BLOCKBYTES], tag[BLOCKBYTES], auth[BLOCKBYTES];
    (void)nsec;
    // ----------------- Initialization -----------------
    *clen = mlen + TAGBYTES;
    precompute_rtk2_3(tk.rtk2_3, npub, k, SKINNY128_384_ROUNDS);
    memset(tmp, 0x00, 2*BLOCKBYTES);
    memset(tag, 0x00, BLOCKBYTES);
    memset(auth, 0x00, BLOCKBYTES);
    memset(c + mlen, 0x00, BLOCKBYTES);     // clear the tag/checksum area
    // ----------------- Initialization -----------------
    // ----------------- Process the plaintext -----------------
    while (mlen >= 2*BLOCKBYTES) {          // process 2 blocks in //
        LE_STR_64(tmp, lfsr);               // lfsr for 1st block
        UPDATE_LFSR(lfsr);
        LE_STR_64(tmp + BLOCKBYTES, lfsr);  // lfsr for 2nd block
        precompute_rtk1(tk.rtk1, tmp, tmp + BLOCKBYTES);
        skinny128_384_plus_encrypt(c, c + BLOCKBYTES, m, m + BLOCKBYTES, tk);
        xor_block(c + mlen, m);             // sum for tag computation
        xor_block(c + mlen, m + BLOCKBYTES);    // sum for tag computation
        mlen -= 2*BLOCKBYTES;
        c += 2*BLOCKBYTES;
        m += 2*BLOCKBYTES;
        UPDATE_LFSR(lfsr);
    }
    SET_DOMAIN(tag, 0x04);                  // domain for tag computation
    if (mlen > BLOCKBYTES) {                // 1 full block + 1 partial: pad and process 2 blocs in //
        LE_STR_64(tmp, lfsr);               // lfsr for 1st block
        UPDATE_LFSR(lfsr);
        LE_STR_64(tmp + BLOCKBYTES, lfsr);  // lfsr for 2nd block
        SET_DOMAIN(tmp + BLOCKBYTES, 0x01); // domain for padding m
        precompute_rtk1(tk.rtk1, tmp, tmp + BLOCKBYTES);
        // 2nd slot encrypts 'auth' (zero block) to build the pad keystream
        skinny128_384_plus_encrypt(c, auth, m, auth, tk);
        xor_block(c + mlen, m);
        for(i = 0; i < mlen - BLOCKBYTES; i++) {
            c[BLOCKBYTES + i] = auth[i] ^ m[BLOCKBYTES + i];    // CTR-style partial block
            c[mlen + i] ^= m[BLOCKBYTES + i];                   // checksum of partial block
        }
        c[mlen + i] ^= 0x80;                // padding
        SET_DOMAIN(tag, 0x05);              // domain for tag computation
        m += mlen;
        c += mlen;
        mlen = 0;
        UPDATE_LFSR(lfsr);
    } else if (mlen == BLOCKBYTES) {        // last block is full
        LE_STR_64(tmp, lfsr);               // lfsr for last full block
        UPDATE_LFSR(lfsr);
        LE_STR_64(tmp + BLOCKBYTES, lfsr);  // lfsr for tag computation
        SET_DOMAIN(tmp + BLOCKBYTES, 0x04); // domain for tag computation
        xor_block(c + mlen, m);             // sum for tag computation
        precompute_rtk1(tk.rtk1, tmp, tmp + BLOCKBYTES);
        // tag is computed here, in parallel with the last message block
        skinny128_384_plus_encrypt(c, c + mlen, m, c + mlen, tk);
        c += BLOCKBYTES;                    // mlen stays == BLOCKBYTES: flags "tag done" below
    } else if (mlen > 0) {                  // last block is partial
        LE_STR_64(tmp, lfsr);               // lfsr for last block
        SET_DOMAIN(tmp, 0x01);              // domain for padding
        UPDATE_LFSR(lfsr);
        LE_STR_64(tmp + BLOCKBYTES, lfsr);  // lfsr for tag computation
        SET_DOMAIN(tmp + BLOCKBYTES, 0x05); // domain for tag computation
        for(i = 0; i < mlen; i++)           // sum for tag computation
            c[mlen + i] ^= m[i];            // sum for tag computation
        c[mlen + i] ^= 0x80;                // padding
        precompute_rtk1(tk.rtk1, tmp, tmp + BLOCKBYTES);
        // 1st slot builds pad keystream from zero block, 2nd slot computes the tag
        skinny128_384_plus_encrypt(auth, c + mlen, auth, c + mlen, tk);
        for(i = 0; i < mlen; i++)
            c[i] = auth[i] ^ m[i];          // encrypted padded block
        c += mlen;
    }
    if (mlen == 0) {                        // if tag has *NOT* been calculated yet
        LE_STR_64(tag, lfsr);               // lfsr for tag computation
        if((adlen % 32) == 0 || (adlen % 32) > BLOCKBYTES) { //if all AD can be processed in //
            precompute_rtk1(tk.rtk1, tag, tag);
            skinny128_384_plus_encrypt(c, c, c, c, tk); // compute the tag (c points at checksum)
        }
        // otherwise skinny_aead_m1_auth pairs the tag with the last AD block
    }
    // ----------------- Process the plaintext -----------------
    // ----------------- Process the associated data -----------------
    skinny_aead_m1_auth(auth, c, tag, &tk, mlen, ad, adlen);
    xor_block(c, auth);                     // fold AD authenticator into the tag
    // ----------------- Process the associated data -----------------
    return 0;
}
/******************************************************************************
 * Decryption and authentication using SKINNY-AEAD-M1 (NIST LWC API).
 *
 * Mirrors crypto_aead_encrypt: decrypts two blocks at a time, accumulates
 * the plaintext checksum in 'sum', recomputes the tag and compares it
 * (constant-time) against the TAGBYTES trailing 'c'.
 * Returns 0 on success, -1 on authentication failure or short input.
 *
 * NOTE(review): the plaintext is written to 'm' before the tag check, as in
 * the reference implementation; callers must discard 'm' on failure.
 ******************************************************************************/
int crypto_aead_decrypt (unsigned char *m, unsigned long long *mlen,
                         unsigned char *nsec,
                         const unsigned char *c, unsigned long long clen,
                         const unsigned char *ad, unsigned long long adlen,
                         const unsigned char *npub,
                         const unsigned char *k) {
    u64 i,lfsr = 1;                 // 64-bit block counter, clocked by UPDATE_LFSR
    u8 feedback;                    // scratch byte required by the UPDATE_LFSR macro
    tweakey tk;
    u8 tmp[2*BLOCKBYTES];
    u8 sum[BLOCKBYTES], tag[BLOCKBYTES], auth[BLOCKBYTES];
    (void)nsec;
    if (clen < TAGBYTES)            // reject inputs too short to carry a tag
        return -1;
    // ----------------- Initialization -----------------
    clen -= TAGBYTES;
    *mlen = clen;
    precompute_rtk2_3(tk.rtk2_3, npub, k, SKINNY128_384_ROUNDS);
    memset(tmp, 0x00, 2*BLOCKBYTES);
    memset(tag, 0x00, BLOCKBYTES);
    memset(auth, 0x00, BLOCKBYTES);
    memset(sum, 0x00, BLOCKBYTES);
    // ----------------- Initialization -----------------
    // ----------------- Process the ciphertext -----------------
    while (clen >= 2*BLOCKBYTES) {          // process 2 blocks in //
        LE_STR_64(tmp, lfsr);               // lfsr for 1st block
        UPDATE_LFSR(lfsr);
        LE_STR_64(tmp + BLOCKBYTES, lfsr);  // lfsr for 2nd block
        precompute_rtk1(tk.rtk1, tmp, tmp + BLOCKBYTES);
        skinny128_384_plus_decrypt(m, m + BLOCKBYTES, c, c + BLOCKBYTES, tk);
        xor_block(sum, m);                  // sum for tag computation
        xor_block(sum, m + BLOCKBYTES);     // sum for tag computation
        clen -= 2*BLOCKBYTES;
        c += 2*BLOCKBYTES;
        m += 2*BLOCKBYTES;
        UPDATE_LFSR(lfsr);
    }
    SET_DOMAIN(tag, 0x04);                  // domain for tag computation
    if (clen > BLOCKBYTES) {                // 1 full block + 1 partial (sequential here)
        LE_STR_64(tmp, lfsr);               // lfsr for 1st block
        precompute_rtk1(tk.rtk1, tmp, tmp);
        skinny128_384_plus_decrypt(m, m, c, c, tk);
        xor_block(sum, m);
        UPDATE_LFSR(lfsr);
        LE_STR_64(tmp, lfsr);               // lfsr for 2nd block
        SET_DOMAIN(tmp, 0x01);              // domain for padding m
        precompute_rtk1(tk.rtk1, tmp, tmp);
        // encrypt the zero block to rebuild the pad keystream
        skinny128_384_plus_encrypt(auth, auth, auth, auth, tk);
        for(i = 0; i < clen - BLOCKBYTES; i++) {
            m[BLOCKBYTES + i] = auth[i] ^ c[BLOCKBYTES + i];
            sum[i] ^= m[BLOCKBYTES + i];    // checksum of partial block
        }
        sum[i] ^= 0x80;                     // padding
        SET_DOMAIN(tag, 0x05);              // domain for tag computation
        m += clen;
        c += clen;
        clen = 0;
        UPDATE_LFSR(lfsr);
    } else if (clen == BLOCKBYTES) {        // last block is full
        LE_STR_64(tmp, lfsr);               // lfsr for last full block
        precompute_rtk1(tk.rtk1, tmp, tmp);
        skinny128_384_plus_decrypt(m, m, c, c, tk);
        xor_block(sum, m);                  // sum for tag computation
        SET_DOMAIN(tag, 0x04);              // domain for tag computation (already set above)
        UPDATE_LFSR(lfsr);
        c += BLOCKBYTES;
        clen = 0;
    } else if (clen > 0) {                  // last block is partial
        LE_STR_64(tmp, lfsr);               // lfsr for last block
        SET_DOMAIN(tmp, 0x01);              // domain for padding
        precompute_rtk1(tk.rtk1, tmp, tmp);
        // encrypt the zero block to rebuild the pad keystream
        skinny128_384_plus_encrypt(auth, auth, auth, auth, tk);
        for(i = 0; i < clen; i++) {
            m[i] = auth[i] ^ c[i];          // decrypted padded block
            sum[i] ^= m[i];                 // sum for tag computation
        }
        sum[i] ^= 0x80;                     // padding
        SET_DOMAIN(tag, 0x05);              // domain for tag computation
        UPDATE_LFSR(lfsr);
        m += clen;
        c += clen;
        clen = 0;
    }
    if (clen == 0) {                        // if tag has *NOT* been calculated yet
        LE_STR_64(tag, lfsr);               // lfsr for tag computation
        if((adlen % 32) == 0 || (adlen % 32) > BLOCKBYTES) {
            precompute_rtk1(tk.rtk1, tag, tag); //if AD can be processed in //
            skinny128_384_plus_encrypt(sum, sum, sum, sum, tk); // compute the tag
        }
    }
    // ----------------- Process the associated data -----------------
    skinny_aead_m1_auth(auth, sum, tag, &tk, clen, ad, adlen);
    xor_block(sum, auth);                   // final tag in 'sum'; c now points at received tag
    feedback = 0;
    for(i = 0; i < TAGBYTES; i++)
        feedback |= sum[i] ^ c[i];          // constant-time tag verification
    // Fix: the NIST LWC API specifies -1 on failure; the original returned the
    // raw accumulator (an arbitrary nonzero value) on mismatch.
    return feedback ? -1 : 0;
    // ----------------- Process the associated data -----------------
}
\ No newline at end of file
/******************************************************************************
* Fixsliced implementation of SKINNY-128-384.
* Two blocks are processed in parallel.
*
* This implementation doesn't compute the ShiftRows operation. Some masks and
* shifts are applied during the MixColumns operation so that the proper bits
* are XORed together. Moreover, the row permutation within the MixColumns
* is omitted, as well as the bit permutation at the end of the Sbox. The rows
* are synchronized with the classical after only 4 rounds. However, the Sbox
* permutation requires 8 rounds for a synchronization. To limit the impact
* on code size, we compute the permutation every 4 rounds. Therefore, this
* implementation relies on a "QUADRUPLE_ROUND" routine.
*
* For more details, see the paper at: https://
*
* @author Alexandre Adomnicai, Nanyang Technological University,
* alexandre.adomnicai@ntu.edu.sg
*
* @date May 2020
******************************************************************************/
#include <stdio.h>
#include <string.h>
#include "skinny128.h"
#include "tk_schedule.h"
/****************************************************************************
 * Fixsliced MixColumns for rounds i with (i % 4) == 0.
 ****************************************************************************/
void mixcolumns_0(u32* state) {
    for(int i = 0; i < 8; i++) {
        u32 w = state[i];
        u32 t = ROR(w,24) & 0x0c0c0c0c;
        w ^= ROR(t,30);
        t = ROR(w,16) & 0xc0c0c0c0;
        w ^= ROR(t,4);
        t = ROR(w,8) & 0x0c0c0c0c;
        w ^= ROR(t,2);
        state[i] = w;
    }
}
/****************************************************************************
 * Fixsliced MixColumns for rounds i with (i % 4) == 1.
 ****************************************************************************/
void mixcolumns_1(u32* state) {
    for(int i = 0; i < 8; i++) {
        u32 w = state[i];
        u32 t = ROR(w,16) & 0x30303030;
        w ^= ROR(t,30);
        t = w & 0x03030303;
        w ^= ROR(t,28);
        t = ROR(w,16) & 0x30303030;
        w ^= ROR(t,2);
        state[i] = w;
    }
}
/****************************************************************************
 * Fixsliced MixColumns for rounds i with (i % 4) == 2.
 ****************************************************************************/
void mixcolumns_2(u32* state) {
    for(int i = 0; i < 8; i++) {
        u32 w = state[i];
        u32 t = ROR(w,8) & 0xc0c0c0c0;
        w ^= ROR(t,6);
        t = ROR(w,16) & 0x0c0c0c0c;
        w ^= ROR(t,28);
        t = ROR(w,24) & 0xc0c0c0c0;
        w ^= ROR(t,2);
        state[i] = w;
    }
}
/****************************************************************************
 * Fixsliced MixColumns for rounds i with (i % 4) == 3.
 ****************************************************************************/
void mixcolumns_3(u32* state) {
    for(int i = 0; i < 8; i++) {
        u32 w = state[i];
        u32 t = w & 0x03030303;
        w ^= ROR(t,30);
        t = w & 0x30303030;
        w ^= ROR(t,4);
        t = w & 0x03030303;
        w ^= ROR(t,26);
        state[i] = w;
    }
}
/****************************************************************************
 * Inverse fixsliced MixColumns for rounds i with (i % 4) == 0:
 * the three steps of mixcolumns_0 undone in reverse order.
 ****************************************************************************/
void inv_mixcolumns_0(u32* state) {
    for(int i = 0; i < 8; i++) {
        u32 w = state[i];
        u32 t = ROR(w,8) & 0x0c0c0c0c;
        w ^= ROR(t,2);
        t = ROR(w,16) & 0xc0c0c0c0;
        w ^= ROR(t,4);
        t = ROR(w,24) & 0x0c0c0c0c;
        w ^= ROR(t,30);
        state[i] = w;
    }
}
/****************************************************************************
 * Inverse fixsliced MixColumns for rounds i with (i % 4) == 1:
 * the three steps of mixcolumns_1 undone in reverse order.
 ****************************************************************************/
void inv_mixcolumns_1(u32* state) {
    for(int i = 0; i < 8; i++) {
        u32 w = state[i];
        u32 t = ROR(w,16) & 0x30303030;
        w ^= ROR(t,2);
        t = w & 0x03030303;
        w ^= ROR(t,28);
        t = ROR(w,16) & 0x30303030;
        w ^= ROR(t,30);
        state[i] = w;
    }
}
/****************************************************************************
 * Inverse fixsliced MixColumns for rounds i with (i % 4) == 2:
 * the three steps of mixcolumns_2 undone in reverse order.
 ****************************************************************************/
void inv_mixcolumns_2(u32* state) {
    for(int i = 0; i < 8; i++) {
        u32 w = state[i];
        u32 t = ROR(w,24) & 0xc0c0c0c0;
        w ^= ROR(t,2);
        t = ROR(w,16) & 0x0c0c0c0c;
        w ^= ROR(t,28);
        t = ROR(w,8) & 0xc0c0c0c0;
        w ^= ROR(t,6);
        state[i] = w;
    }
}
/****************************************************************************
 * Inverse fixsliced MixColumns for rounds i with (i % 4) == 3:
 * the three steps of mixcolumns_3 undone in reverse order.
 ****************************************************************************/
void inv_mixcolumns_3(u32* state) {
    for(int i = 0; i < 8; i++) {
        u32 w = state[i];
        u32 t = w & 0x03030303;
        w ^= ROR(t,26);
        t = w & 0x30303030;
        w ^= ROR(t,4);
        t = w & 0x03030303;
        w ^= ROR(t,30);
        state[i] = w;
    }
}
/****************************************************************************
 * XORs the round tweakey words (TK1 and combined TK2/TK3, round constants
 * already folded in) into the 8-word bitsliced state.
 ****************************************************************************/
void add_tweakey(u32* state, const u32* rtk1, const u32* rtk2_3) {
    for (int i = 0; i < 8; i++)
        state[i] ^= rtk1[i] ^ rtk2_3[i];
}
/****************************************************************************
 * Encryption of 2 blocks in parallel using SKINNY-128-384.
 * 'rtk1' and 'rtk2_3' arrive precomputed inside 'tk' to avoid recomputing
 * the whole tweakey schedule for every call during SKINNY-AEAD-M1.
 ****************************************************************************/
void skinny128_384_plus_encrypt(u8* ctext, u8* ctext_bis, const u8* ptext,
                const u8* ptext_bis, const tweakey tk) {
    u32 state[8];
    packing(state, ptext, ptext_bis);
    // 40 rounds = 10 quadruple rounds. rtk1 repeats with period 16 rounds
    // (4 quadruple rounds) while rtk2_3 supplies fresh words every round.
    for (int r = 0; r < 10; r++)
        QUADRUPLE_ROUND(state, tk.rtk1 + 32 * (r % 4), tk.rtk2_3 + 32 * r);
    unpacking(ctext, ctext_bis, state);
}
/****************************************************************************
 * Decryption of 2 blocks in parallel using SKINNY-128-384.
 * 'rtk1' and 'rtk2_3' arrive precomputed inside 'tk' to avoid recomputing
 * the whole tweakey schedule for every call during SKINNY-AEAD-M1.
 ****************************************************************************/
void skinny128_384_plus_decrypt(u8* ptext, u8* ptext_bis, const u8* ctext,
                const u8* ctext_bis, const tweakey tk) {
    u32 state[8];
    packing(state, ctext, ctext_bis);
    // Same 10 quadruple rounds as encryption, undone in reverse order.
    for (int r = 9; r >= 0; r--)
        INV_QUADRUPLE_ROUND(state, tk.rtk1 + 32 * (r % 4), tk.rtk2_3 + 32 * r);
    unpacking(ptext, ptext_bis, state);
}
\ No newline at end of file
#ifndef SKINNY128_H_
#define SKINNY128_H_

#include "tk_schedule.h"

/* Encrypt/decrypt 2 blocks in parallel with fixsliced SKINNY-128-384+.
 * 'tk' carries the precomputed round tweakeys (see tk_schedule.h). */
void skinny128_384_plus_encrypt(u8* ctext, u8* ctext_bis, const u8* ptext,
				const u8* ptext_bis, const tweakey tk);
/* Fix: parameter names now match the definition -- the prototype previously
 * labelled the plaintext outputs 'ctext' and the ciphertext inputs 'ptext'. */
void skinny128_384_plus_decrypt(u8* ptext, u8* ptext_bis, const u8* ctext,
				const u8* ctext_bis, const tweakey tk);

#define SKINNY128_384_ROUNDS	40

/* 32-bit right rotation; requires 0 < y < 32 (y == 0 or 32 would shift by
 * the type width, which is undefined behavior). All call sites comply. */
#define ROR(x,y) 			(((x) >> (y)) | ((x) << (32 - (y))))
/*
 * Four consecutive SKINNY rounds on the bitsliced 2-block state:
 * bitsliced Sbox layer, tweakey addition, then the fixsliced MixColumns
 * variant for that round index. ShiftRows is absorbed into the
 * MixColumns masks. The trailing XOR-swap triples realign the slice
 * ordering every 4 rounds. GNU statement-expression ({ ... }) extension.
 */
#define QUADRUPLE_ROUND(state, rtk1, rtk2_3) ({ \
	state[3] ^= (state[0] | state[1]); \
	state[7] ^= (state[4] | state[5]); \
	state[1] ^= (state[6] | state[5]); \
	state[2] ^= (state[3] & state[7]); \
	state[6] ^= (~state[7] | state[4]); \
	state[0] ^= (state[2] | ~state[1]); \
	state[4] ^= (~state[3] | state[2]); \
	state[5] ^= (state[6] & state[0]); \
	add_tweakey(state, rtk1, rtk2_3); \
	mixcolumns_0(state); \
	state[4] ^= (state[2] | state[3]); \
	state[5] ^= (state[6] | state[1]); \
	state[3] ^= (state[0] | state[1]); \
	state[7] ^= (state[4] & state[5]); \
	state[0] ^= (~state[5] | state[6]); \
	state[2] ^= (state[7] | ~state[3]); \
	state[6] ^= (~state[4] | state[7]); \
	state[1] ^= (state[0] & state[2]); \
	add_tweakey(state, rtk1+8, rtk2_3+8); \
	mixcolumns_1(state); \
	state[6] ^= (state[7] | state[4]); \
	state[1] ^= (state[0] | state[3]); \
	state[4] ^= (state[2] | state[3]); \
	state[5] ^= (state[6] & state[1]); \
	state[2] ^= (~state[1] | state[0]); \
	state[7] ^= (state[5] | ~state[4]); \
	state[0] ^= (~state[6] | state[5]); \
	state[3] ^= (state[2] & state[7]); \
	add_tweakey(state, rtk1+16, rtk2_3+16); \
	mixcolumns_2(state); \
	state[0] ^= (state[5] | state[6]); \
	state[3] ^= (state[2] | state[4]); \
	state[6] ^= (state[7] | state[4]); \
	state[1] ^= (state[0] & state[3]); \
	state[7] ^= (~state[3] | state[2]); \
	state[5] ^= (state[1] | ~state[6]); \
	state[2] ^= (~state[0] | state[1]); \
	state[4] ^= (state[7] & state[5]); \
	add_tweakey(state, rtk1+24, rtk2_3+24); \
	mixcolumns_3(state); \
	state[0] ^= state[1]; /* XOR-swap state[0] <-> state[1] */ \
	state[1] ^= state[0]; \
	state[0] ^= state[1]; \
	state[2] ^= state[3]; /* XOR-swap state[2] <-> state[3] */ \
	state[3] ^= state[2]; \
	state[2] ^= state[3]; \
	state[4] ^= state[7]; /* XOR-swap state[4] <-> state[7] */ \
	state[7] ^= state[4]; \
	state[4] ^= state[7]; \
	state[5] ^= state[6]; /* XOR-swap state[5] <-> state[6] */ \
	state[6] ^= state[5]; \
	state[5] ^= state[6]; \
})
/*
 * Exact inverse of QUADRUPLE_ROUND: slice swaps first, then each of the
 * four rounds undone in reverse order (inverse MixColumns, tweakey
 * addition, inverse bitsliced Sbox). GNU statement-expression extension.
 */
#define INV_QUADRUPLE_ROUND(state, rtk1, rtk2_3) ({ \
	state[0] ^= state[1]; /* XOR-swap state[0] <-> state[1] */ \
	state[1] ^= state[0]; \
	state[0] ^= state[1]; \
	state[2] ^= state[3]; /* XOR-swap state[2] <-> state[3] */ \
	state[3] ^= state[2]; \
	state[2] ^= state[3]; \
	state[4] ^= state[7]; /* XOR-swap state[4] <-> state[7] */ \
	state[7] ^= state[4]; \
	state[4] ^= state[7]; \
	state[5] ^= state[6]; /* XOR-swap state[5] <-> state[6] */ \
	state[6] ^= state[5]; \
	state[5] ^= state[6]; \
	inv_mixcolumns_3(state); \
	add_tweakey(state, rtk1+24, rtk2_3+24); \
	state[4] ^= (state[7] & state[5]); \
	state[2] ^= (~state[0] | state[1]); \
	state[5] ^= (state[1] | ~state[6]); \
	state[7] ^= (~state[3] | state[2]); \
	state[1] ^= (state[0] & state[3]); \
	state[6] ^= (state[7] | state[4]); \
	state[3] ^= (state[2] | state[4]); \
	state[0] ^= (state[5] | state[6]); \
	inv_mixcolumns_2(state); \
	add_tweakey(state, rtk1+16, rtk2_3+16); \
	state[3] ^= (state[2] & state[7]); \
	state[0] ^= (~state[6] | state[5]); \
	state[7] ^= (state[5] | ~state[4]); \
	state[2] ^= (~state[1] | state[0]); \
	state[5] ^= (state[6] & state[1]); \
	state[4] ^= (state[2] | state[3]); \
	state[1] ^= (state[0] | state[3]); \
	state[6] ^= (state[7] | state[4]); \
	inv_mixcolumns_1(state); \
	add_tweakey(state, rtk1+8, rtk2_3+8); \
	state[1] ^= (state[0] & state[2]); \
	state[6] ^= (~state[4] | state[7]); \
	state[2] ^= (state[7] | ~state[3]); \
	state[0] ^= (~state[5] | state[6]); \
	state[7] ^= (state[4] & state[5]); \
	state[3] ^= (state[0] | state[1]); \
	state[5] ^= (state[6] | state[1]); \
	state[4] ^= (state[2] | state[3]); \
	inv_mixcolumns_0(state); \
	add_tweakey(state, rtk1, rtk2_3); \
	state[5] ^= (state[6] & state[0]); \
	state[4] ^= (~state[3] | state[2]); \
	state[0] ^= (state[2] | ~state[1]); \
	state[6] ^= (~state[7] | state[4]); \
	state[2] ^= (state[3] & state[7]); \
	state[1] ^= (state[6] | state[5]); \
	state[7] ^= (state[4] | state[5]); \
	state[3] ^= (state[0] | state[1]); \
})
#endif  // SKINNY128_H_
\ No newline at end of file
#ifndef SKINNYAEADM1_H_
#define SKINNYAEADM1_H_
#include "skinny128.h"
typedef unsigned char u8;
typedef unsigned int u32;
typedef unsigned long long u64;
// SKINNY-AEAD-M1 parameters: 128-bit tag, key and cipher block.
#define TAGBYTES    16
#define KEYBYTES    16
#define BLOCKBYTES  16
// Writes the domain-separation byte into the last byte of a 16-byte tweak block.
#define SET_DOMAIN(ptr, domain) ((ptr)[15] = (domain))
// Clocks the 64-bit block-counter LFSR once: shift left, XOR 0x1B into the
// low byte when the top bit was set (Galois-style feedback).
// NOTE(review): relies on a local u8 variable literally named 'feedback'
// being in scope at every call site (declared in crypto_aead_encrypt/decrypt).
// Uses the GNU statement-expression ({ ... }) extension.
#define UPDATE_LFSR(lfsr) ({ \
	feedback = ((lfsr) & (1ULL << 63)) ? 0x1B : 0x00; \
	(lfsr) = ((lfsr) << 1) ^ feedback; \
})
// Stores the 64-bit value 'x' little-endian into ptr[0..7].
#define LE_STR_64(ptr, x) ({ \
	(ptr)[0] = (u8)(x); \
	(ptr)[1] = (u8)((x) >> 8); \
	(ptr)[2] = (u8)((x) >> 16); \
	(ptr)[3] = (u8)((x) >> 24); \
	(ptr)[4] = (u8)((x) >> 32); \
	(ptr)[5] = (u8)((x) >> 40); \
	(ptr)[6] = (u8)((x) >> 48); \
	(ptr)[7] = (u8)((x) >> 56); \
})
#endif  // SKINNYAEADM1_H_
\ No newline at end of file
/*******************************************************************************
* Implementation of the tweakey schedule according to the fixsliced
* representation.
*
* For more details, see the paper at: https://
*
* @author Alexandre Adomnicai, Nanyang Technological University,
* alexandre.adomnicai@ntu.edu.sg
*
* @date May 2020
*******************************************************************************/
#include <stdio.h>
#include <string.h>
#include "tk_schedule.h"
typedef unsigned char u8;
typedef unsigned int u32;
/****************************************************************************
* The round constants according to the fixsliced representation.
****************************************************************************/
u32 rconst_32_bs[320] = {
0xfffffff3, 0xffffffff, 0x00000000, 0xffffffff,
0xffffffff, 0x000000c0, 0xffffffff, 0xffffffff,
0xffffffff, 0x00000300, 0xcffffcff, 0xffffffff,
0xffffffff, 0xffffffff, 0xffffffff, 0x00000000,
0xffffffff, 0xffffffff, 0xffffffff, 0x0c000000,
0xf3ffffff, 0x00000000, 0xffffffff, 0x33ffffff,
0xffffffff, 0x00000000, 0xffffffff, 0xffffffff,
0x00300000, 0xffcffffc, 0xffcfffff, 0xffcfffff,
0xff33ffff, 0xff3fffff, 0x00000000, 0xffffffff,
0xffffffff, 0x00f00000, 0xff3fffff, 0xffffffff,
0xfcffffff, 0x00c00000, 0xfc3fcfff, 0xfcffffff,
0xffffffff, 0xffffffff, 0xffffffff, 0x00000000,
0xffffffff, 0xffffffff, 0xfffff3ff, 0x03000c00,
0xfffff3ff, 0x00000000, 0xffffffff, 0xfcff3fff,
0xffffffff, 0x00000000, 0xffffffff, 0xffffffff,
0x00000c30, 0xfffcf3cf, 0xffffffff, 0xffffffcf,
0xffffff03, 0xffffff3f, 0x00000000, 0xffffffff,
0xffffffff, 0x000000f0, 0xffffffff, 0xffffffff,
0xfffffcff, 0x00000300, 0xcffffc3f, 0xfffffcff,
0xffffffff, 0xffffffff, 0xffffffff, 0x00000000,
0xffffffff, 0xffffffff, 0xf3ffffff, 0x00000300,
0xf3ffffff, 0x00000000, 0xffffffff, 0x33ffffff,
0xffffffff, 0x00000000, 0xffffffff, 0xffffffff,
0x0c000000, 0xf3fffffc, 0xffcfffff, 0xffcfffff,
0xffc3ffff, 0xffffffff, 0x00000000, 0xffffffff,
0xffffffff, 0x00f00000, 0xff3fffff, 0xffffffff,
0xffffffff, 0x03c00000, 0xfc3fcfff, 0xffffffff,
0xffffffff, 0xffffffff, 0xffffffff, 0x00000000,
0xffffffff, 0xffffffff, 0xffffffff, 0x00000c00,
0xfffff3ff, 0x00000000, 0xffffffff, 0xfcff33ff,
0xffffffff, 0x00000000, 0xffffffff, 0xffffffff,
0x00000000, 0xfffcffcf, 0xffffffcf, 0xffffffcf,
0xfffffff3, 0xffffff3f, 0x00000000, 0xffffffff,
0xffffffff, 0x000000f0, 0xffffff3f, 0xffffffff,
0xfffffcff, 0x000000c0, 0xcffffc3f, 0xffffffff,
0xffffffff, 0xffffffff, 0xffffffff, 0x00000000,
0xffffffff, 0xffffffff, 0xffffffff, 0x0c000300,
0xf3ffffff, 0x00000000, 0xffffffff, 0x3ffffcff,
0xffffffff, 0x00000000, 0xffffffff, 0xffffffff,
0x00300000, 0xf3cffffc, 0xffffffff, 0xffcfffff,
0xff33ffff, 0xff3fffff, 0x00000000, 0xffffffff,
0xffffffff, 0x00300000, 0xffffffff, 0xffffffff,
0xfcffffff, 0x00000000, 0xff3fcfff, 0xfcffffff,
0xffffffff, 0xffffffff, 0xffffffff, 0x00000000,
0xffffffff, 0xffffffff, 0xfffff3ff, 0x03000000,
0xffffffff, 0x00000000, 0xffffffff, 0xffff3fff,
0xffffffff, 0x00000000, 0xffffffff, 0xffffffff,
0x00000c00, 0xfffcf3ff, 0xffffffff, 0xffffffff,
0xffffffc3, 0xffffffff, 0x00000000, 0xffffffff,
0xffffffff, 0x000000c0, 0xffffffff, 0xffffffff,
0xffffffff, 0x00000000, 0xcffffcff, 0xffffffff,
0xffffffff, 0xffffffff, 0xffffffff, 0x00000000,
0xffffffff, 0xffffffff, 0xffffffff, 0x0c000000,
0xf3ffffff, 0x00000000, 0xffffffff, 0x3fffffff,
0xffffffff, 0x00000000, 0xffffffff, 0xffffffff,
0x00300000, 0xffcffffc, 0xffffffff, 0xffcfffff,
0xff33ffff, 0xff3fffff, 0x00000000, 0xffffffff,
0xffffffff, 0x00f00000, 0xffffffff, 0xffffffff,
0xfcffffff, 0x00000000, 0xfc3fcfff, 0xfcffffff,
0xffffffff, 0xffffffff, 0xffffffff, 0x00000000,
0xffffffff, 0xffffffff, 0xfffff3ff, 0x03000000,
0xfffff3ff, 0x00000000, 0xffffffff, 0xffff3fff,
0xffffffff, 0x00000000, 0xffffffff, 0xffffffff,
0x00000c00, 0xfffcf3ff, 0xffffffff, 0xffffffcf,
0xffffffc3, 0xffffffff, 0x00000000, 0xffffffff,
0xffffffff, 0x000000f0, 0xffffffff, 0xffffffff,
0xffffffff, 0x00000300, 0xcffffc3f, 0xffffffff,
0xffffffff, 0xffffffff, 0xffffffff, 0x00000000,
0xffffffff, 0xffffffff, 0xffffffff, 0x00000000,
0xf3ffffff, 0x00000000, 0xffffffff, 0x33ffffff,
0xffffffff, 0x00000000, 0xffffffff, 0xffffffff,
0x00300000, 0xfffffffc, 0xffcfffff, 0xffcfffff,
0xff33ffff, 0xffffffff, 0x00000000, 0xffffffff,
0xffffffff, 0x00f00000, 0xff3fffff, 0xffffffff,
0xffffffff, 0x00c00000, 0xfc3fcfff, 0xfcffffff,
0xffffffff, 0xffffffff, 0xffffffff, 0x00000000,
0xffffffff, 0xffffffff, 0xfffff3ff, 0x00000c00,
0xfffff3ff, 0x00000000, 0xffffffff, 0xfcff3fff,
0xffffffff, 0x00000000, 0xffffffff, 0xffffffff,
0x00000c00, 0xfffcffcf, 0xffffffff, 0xffffffcf
};
/****************************************************************************
* Packs 2 input blocks B, B' into the state using a bitsliced representation.
* Once the packing process is complete, the 256-bit state consists of 8
 * 32-bit words and the input blocks bit positioning is as follows:
*
* 24 24' 56 56' 88 88' 120 120' | ... | 0 0' 32 32' 64 64' 96 96'
* 25 25' 57 57' 89 89' 121 121' | ... | 1 1' 33 33' 65 65' 97 97'
* 26 26' 58 58' 90 90' 122 122' | ... | 2 2' 34 34' 66 66' 98 98'
* 27 27' 59 59' 91 91' 123 123' | ... | 3 3' 35 35' 67 67' 99 99'
* 28 28' 60 60' 92 92' 124 124' | ... | 4 4' 36 36' 68 68' 100 100'
* 29 29' 61 61' 93 93' 125 125' | ... | 5 5' 37 37' 69 69' 101 101'
* 30 30' 62 62' 94 94' 126 126' | ... | 6 6' 38 38' 70 70' 102 102'
* 31 31' 63 63' 95 95' 127 127' | ... | 7 7' 39 39' 71 71' 103 103'
****************************************************************************/
void packing(u32* out, const u8* block0, const u8* block1) {
	u32 tmp;    // scratch word; presumably consumed by the SWAPMOVE macro -- TODO confirm
	// Load both 16-byte blocks as little-endian 32-bit words, interleaved
	// (even indices from block0, odd indices from block1).
	LE_LOAD(out, block0);
	LE_LOAD(out + 1, block1);
	LE_LOAD(out + 2, block0 + 4);
	LE_LOAD(out + 3, block1 + 4);
	LE_LOAD(out + 4, block0 + 8);
	LE_LOAD(out + 5, block1 + 8);
	LE_LOAD(out + 6, block0 + 12);
	LE_LOAD(out + 7, block1 + 12);
	// Bit-matrix transposition into the slice layout (see the diagram in
	// the comment above) via successive swapmove steps.
	SWAPMOVE(out[1], out[0], 0x55555555, 1);
	SWAPMOVE(out[3], out[2], 0x55555555, 1);
	SWAPMOVE(out[5], out[4], 0x55555555, 1);
	SWAPMOVE(out[7], out[6], 0x55555555, 1);
	SWAPMOVE(out[2], out[0], 0x30303030, 2);
	SWAPMOVE(out[4], out[0], 0x0c0c0c0c, 4);
	SWAPMOVE(out[6], out[0], 0x03030303, 6);
	SWAPMOVE(out[3], out[1], 0x30303030, 2);
	SWAPMOVE(out[5], out[1], 0x0c0c0c0c, 4);
	SWAPMOVE(out[7], out[1], 0x03030303, 6);
	SWAPMOVE(out[4], out[2], 0x0c0c0c0c, 2);
	SWAPMOVE(out[6], out[2], 0x03030303, 4);
	SWAPMOVE(out[5], out[3], 0x0c0c0c0c, 2);
	SWAPMOVE(out[7], out[3], 0x03030303, 4);
	SWAPMOVE(out[6], out[4], 0x03030303, 2);
	SWAPMOVE(out[7], out[5], 0x03030303, 2);
}
/****************************************************************************
 * Unpacks the 256-bit bitsliced state back into two 16-byte output blocks
 * ('out' and 'out_bis'); exact inverse of packing(): the same swapmove
 * steps applied in reverse order, then little-endian stores.
 ****************************************************************************/
void unpacking(u8* out, u8* out_bis, u32 *in) {
	u32 tmp;    // scratch word; presumably consumed by the SWAPMOVE macro -- TODO confirm
	SWAPMOVE(in[6], in[4], 0x03030303, 2);
	SWAPMOVE(in[7], in[5], 0x03030303, 2);
	SWAPMOVE(in[5], in[3], 0x0c0c0c0c, 2);
	SWAPMOVE(in[7], in[3], 0x03030303, 4);
	SWAPMOVE(in[4], in[2], 0x0c0c0c0c, 2);
	SWAPMOVE(in[6], in[2], 0x03030303, 4);
	SWAPMOVE(in[7], in[1], 0x03030303, 6);
	SWAPMOVE(in[5], in[1], 0x0c0c0c0c, 4);
	SWAPMOVE(in[3], in[1], 0x30303030, 2);
	SWAPMOVE(in[6], in[0], 0x03030303, 6);
	SWAPMOVE(in[4], in[0], 0x0c0c0c0c, 4);
	SWAPMOVE(in[2], in[0], 0x30303030, 2);
	SWAPMOVE(in[1], in[0], 0x55555555, 1);
	SWAPMOVE(in[3], in[2], 0x55555555, 1);
	SWAPMOVE(in[5], in[4], 0x55555555, 1);
	SWAPMOVE(in[7], in[6], 0x55555555, 1);
	// Store little-endian, de-interleaving the two blocks.
	LE_STORE(out, in[0]);
	LE_STORE(out_bis, in[1]);
	LE_STORE(out + 4, in[2]);
	LE_STORE(out_bis + 4, in[3]);
	LE_STORE(out + 8, in[4]);
	LE_STORE(out_bis + 8, in[5]);
	LE_STORE(out + 12, in[6]);
	LE_STORE(out_bis + 12, in[7]);
}
// Applies the tweakey permutation twice (P^2) in bitsliced form.
void permute_tk_2(u32* tk) {
    for (int i = 0; i < 8; i++) {
        u32 w = tk[i];
        u32 r = ROR(w,14) & 0xcc00cc00;
        r |= (w & 0x000000ff) << 16;
        r |= (w & 0xcc000000) >> 2;
        r |= (w & 0x0033cc00) >> 8;
        r |= (w & 0x00cc0000) >> 18;
        tk[i] = r;
    }
}
// Applies the tweakey permutation 4 times (P^4) in bitsliced form.
void permute_tk_4(u32* tk) {
    for (int i = 0; i < 8; i++) {
        u32 w = tk[i];
        u32 r = ROR(w,22) & 0xcc0000cc;
        r |= ROR(w,16) & 0x3300cc00;
        r |= ROR(w,24) & 0x00cc3300;
        r |= (w & 0x00cc00cc) >> 2;
        tk[i] = r;
    }
}
// Applies the tweakey permutation 6 times (P^6) in bitsliced form.
void permute_tk_6(u32* tk) {
    for (int i = 0; i < 8; i++) {
        u32 w = tk[i];
        u32 r = ROR(w,6) & 0xcccc0000;
        r |= ROR(w,24) & 0x330000cc;
        r |= ROR(w,10) & 0x3333;
        r |= (w & 0xcc) << 14;
        r |= (w & 0x3300) << 2;
        tk[i] = r;
    }
}
// Applies the tweakey permutation 8 times (P^8) in bitsliced form.
void permute_tk_8(u32* tk) {
    for (int i = 0; i < 8; i++) {
        u32 w = tk[i];
        u32 r = ROR(w,24) & 0xcc000033;
        r |= ROR(w,8) & 0x33cc0000;
        r |= ROR(w,26) & 0x00333300;
        r |= (w & 0x00333300) >> 6;
        tk[i] = r;
    }
}
// Applies the tweakey permutation 10 times (P^10) in bitsliced form.
void permute_tk_10(u32* tk) {
    for (int i = 0; i < 8; i++) {
        u32 w = tk[i];
        u32 r = ROR(w,8) & 0xcc330000;
        r |= ROR(w,26) & 0x33000033;
        r |= ROR(w,22) & 0x00cccc00;
        r |= (w & 0x00330000) >> 14;
        r |= (w & 0xcc00) >> 2;
        tk[i] = r;
    }
}
// Applies the tweakey permutation 12 times (P^12) in bitsliced form.
void permute_tk_12(u32* tk) {
    for (int i = 0; i < 8; i++) {
        u32 w = tk[i];
        u32 r = ROR(w,8) & 0xcc33;
        r |= ROR(w,30) & 0x00cc00cc;
        r |= ROR(w,10) & 0x33330000;
        r |= ROR(w,16) & 0xcc003300;
        tk[i] = r;
    }
}
// Applies the tweakey permutation 14 times (P^14) in bitsliced form.
void permute_tk_14(u32* tk) {
    for (int i = 0; i < 8; i++) {
        u32 w = tk[i];
        u32 r = ROR(w,24) & 0x0033cc00;
        r |= ROR(w,14) & 0x00cc0000;
        r |= ROR(w,30) & 0xcc000000;
        r |= ROR(w,16) & 0x000000ff;
        r |= ROR(w,18) & 0x33003300;
        tk[i] = r;
    }
}
/****************************************************************************
 * Packs the two TK2 inputs and precomputes their LFSR2 stream for 'rounds'
 * rounds: the packed state is clocked once per pair of rounds and each
 * 32-byte snapshot is stored consecutively into 'tk'.
 ****************************************************************************/
void precompute_lfsr_tk2(u32* tk, const u8* tk2_0,
		const u8* tk2_1, const int rounds) {
	u32 tmp;    // scratch; presumably required by the LFSR2 macro -- TODO confirm
	u32 state[8];
	packing(state, tk2_0, tk2_1);
	memcpy(tk, state, 32);              // rounds 0-1 use the unclocked state
	for(int i = 0 ; i < rounds; i+=2) {
		LFSR2(state);
		memcpy(tk+i*8+8, state, 32);    // snapshot for rounds i+2, i+3
	}
}
/****************************************************************************
 * Packs the two TK3 inputs and XORs their LFSR3 stream on top of the words
 * already in 'tk' (presumably the TK2 stream written by
 * precompute_lfsr_tk2 -- it accumulates with ^=, it does not overwrite).
 * One clock per pair of rounds, 32 bytes per snapshot, as for TK2.
 ****************************************************************************/
void precompute_lfsr_tk3(u32* tk, const u8* tk3_0,
		const u8* tk3_1, const int rounds) {
	u32 tmp;    // scratch; presumably required by the LFSR3 macro -- TODO confirm
	u32 state[8];
	packing(state, tk3_0, tk3_1);
	for(int i = 0; i < 8; i++)
		tk[i] ^= state[i];              // rounds 0-1 use the unclocked state
	for(int i = 0 ; i < rounds; i+=2) {
		LFSR3(state);
		tk[i*8+8] ^= state[0];
		tk[i*8+9] ^= state[1];
		tk[i*8+10] ^= state[2];
		tk[i*8+11] ^= state[3];
		tk[i*8+12] ^= state[4];
		tk[i*8+13] ^= state[5];
		tk[i*8+14] ^= state[6];
		tk[i*8+15] ^= state[7];
	}
}
/****************************************************************************
 * XOR TK with TK1 before applying the permutations.
 * The key is then rearranged to match the fixsliced representation.
 *
 * tk     : in/out, 8 bitsliced words per round (LFSR2(TK2)^LFSR3(TK3) on
 *          entry, full permuted round tweakey on exit)
 * tk1_0/1: the two raw 16-byte TK1 blocks (packed here)
 * rounds : number of rounds; the loop handles 8 rounds per iteration
 ****************************************************************************/
void permute_tk(u32* tk, const u8* tk1_0, const u8* tk1_1, const int rounds) {
	u32 test; //selects which powers of P to apply (P is periodic with 16)
	u32 tk1[8], tmp[8];
	packing(tk1, tk1_0, tk1_1); //bitslice the two TK1 blocks
	memcpy(tmp, tk, 32);
	XOR_BLOCK(tmp, tk1);
	tk[0] = tmp[6] & 0xf0f0f0f0; //mask to extract rows 1&2 only
	tk[1] = tmp[5] & 0xf0f0f0f0;
	tk[2] = tmp[0] & 0xf0f0f0f0;
	tk[3] = tmp[1] & 0xf0f0f0f0;
	tk[4] = tmp[3] & 0xf0f0f0f0;
	tk[5] = tmp[7] & 0xf0f0f0f0;
	tk[6] = tmp[4] & 0xf0f0f0f0;
	tk[7] = tmp[2] & 0xf0f0f0f0;
	for(int i = 0 ; i < rounds; i+=8) {
		test = (i % 16 < 8) ? 1 : 0; //to apply the right power of P
		memcpy(tmp, tk+i*8+8, 32);
		XOR_BLOCK(tmp, tk1);
		if (test)
			permute_tk_2(tmp); // applies P^2
		else
			permute_tk_10(tmp); // applies P^10
		tk[i*8+8] = ROR(tmp[4],26) & 0xc3c3c3c3; //mask to extract rows 1&2 only
		tk[i*8+9] = ROR(tmp[7],26) & 0xc3c3c3c3; //rotation to match fixslicing
		tk[i*8+10] = ROR(tmp[6],26) & 0xc3c3c3c3;
		tk[i*8+11] = ROR(tmp[5],26) & 0xc3c3c3c3;
		tk[i*8+12] = ROR(tmp[1],26) & 0xc3c3c3c3;
		tk[i*8+13] = ROR(tmp[2],26) & 0xc3c3c3c3;
		tk[i*8+14] = ROR(tmp[3],26) & 0xc3c3c3c3;
		tk[i*8+15] = ROR(tmp[0],26) & 0xc3c3c3c3;
		tk[i*8+16] = ROR(tmp[3],28) & 0x03030303; //mask to extract rows 1&2 only
		tk[i*8+16] |= ROR(tmp[3],12) & 0x0c0c0c0c; //rotation to match fixslicing
		tk[i*8+17] = ROR(tmp[2],28) & 0x03030303;
		tk[i*8+17] |= ROR(tmp[2],12) & 0x0c0c0c0c;
		tk[i*8+18] = ROR(tmp[4],28) & 0x03030303;
		tk[i*8+18] |= ROR(tmp[4],12) & 0x0c0c0c0c;
		tk[i*8+19] = ROR(tmp[7],28) & 0x03030303;
		tk[i*8+19] |= ROR(tmp[7],12) & 0x0c0c0c0c;
		tk[i*8+20] = ROR(tmp[5],28) & 0x03030303;
		tk[i*8+20] |= ROR(tmp[5],12) & 0x0c0c0c0c;
		tk[i*8+21] = ROR(tmp[0],28) & 0x03030303;
		tk[i*8+21] |= ROR(tmp[0],12) & 0x0c0c0c0c;
		tk[i*8+22] = ROR(tmp[1],28) & 0x03030303;
		tk[i*8+22] |= ROR(tmp[1],12) & 0x0c0c0c0c;
		tk[i*8+23] = ROR(tmp[6],28) & 0x03030303;
		tk[i*8+23] |= ROR(tmp[6],12) & 0x0c0c0c0c;
		memcpy(tmp, tk+i*8+24, 32);
		XOR_BLOCK(tmp, tk1);
		if (test)
			permute_tk_4(tmp); // applies P^4
		else
			permute_tk_12(tmp); // applies P^12
		tk[i*8+24] = ROR(tmp[1],14) & 0x30303030; //mask to extract rows 1&2 only
		tk[i*8+24] |= ROR(tmp[1],6) & 0x0c0c0c0c; //rotation to match fixslicing
		tk[i*8+25] = ROR(tmp[0],14) & 0x30303030;
		tk[i*8+25] |= ROR(tmp[0],6) & 0x0c0c0c0c;
		tk[i*8+26] = ROR(tmp[3],14) & 0x30303030;
		tk[i*8+26] |= ROR(tmp[3],6) & 0x0c0c0c0c;
		tk[i*8+27] = ROR(tmp[2],14) & 0x30303030;
		tk[i*8+27] |= ROR(tmp[2],6) & 0x0c0c0c0c;
		tk[i*8+28] = ROR(tmp[7],14) & 0x30303030;
		tk[i*8+28] |= ROR(tmp[7],6) & 0x0c0c0c0c;
		tk[i*8+29] = ROR(tmp[6],14) & 0x30303030;
		tk[i*8+29] |= ROR(tmp[6],6) & 0x0c0c0c0c;
		tk[i*8+30] = ROR(tmp[5],14) & 0x30303030;
		tk[i*8+30] |= ROR(tmp[5],6) & 0x0c0c0c0c;
		tk[i*8+31] = ROR(tmp[4],14) & 0x30303030;
		tk[i*8+31] |= ROR(tmp[4],6) & 0x0c0c0c0c;
		tk[i*8+32] = ROR(tmp[6],16) & 0xf0f0f0f0; //mask to extract rows 1&2 only
		tk[i*8+33] = ROR(tmp[5],16) & 0xf0f0f0f0; //rotation to match fixslicing
		tk[i*8+34] = ROR(tmp[0],16) & 0xf0f0f0f0;
		tk[i*8+35] = ROR(tmp[1],16) & 0xf0f0f0f0;
		tk[i*8+36] = ROR(tmp[3],16) & 0xf0f0f0f0;
		tk[i*8+37] = ROR(tmp[7],16) & 0xf0f0f0f0;
		tk[i*8+38] = ROR(tmp[4],16) & 0xf0f0f0f0;
		tk[i*8+39] = ROR(tmp[2],16) & 0xf0f0f0f0;
		memcpy(tmp, tk+i*8+40, 32);
		XOR_BLOCK(tmp, tk1);
		if (test)
			permute_tk_6(tmp); // applies P^6
		else
			permute_tk_14(tmp); // applies P^14
		tk[i*8+40] = ROR(tmp[4],10) & 0xc3c3c3c3; //mask to extract rows 1&2 only
		tk[i*8+41] = ROR(tmp[7],10) & 0xc3c3c3c3; //rotation to match fixslicing
		tk[i*8+42] = ROR(tmp[6],10) & 0xc3c3c3c3;
		tk[i*8+43] = ROR(tmp[5],10) & 0xc3c3c3c3;
		tk[i*8+44] = ROR(tmp[1],10) & 0xc3c3c3c3;
		tk[i*8+45] = ROR(tmp[2],10) & 0xc3c3c3c3;
		tk[i*8+46] = ROR(tmp[3],10) & 0xc3c3c3c3;
		tk[i*8+47] = ROR(tmp[0],10) & 0xc3c3c3c3;
		tk[i*8+48] = ROR(tmp[3],12) & 0x03030303; //mask to extract rows 1&2 only
		tk[i*8+48] |= ROR(tmp[3],28) & 0x0c0c0c0c; //rotation to match fixslicing
		tk[i*8+49] = ROR(tmp[2],12) & 0x03030303;
		tk[i*8+49] |= ROR(tmp[2],28) & 0x0c0c0c0c;
		tk[i*8+50] = ROR(tmp[4],12) & 0x03030303;
		tk[i*8+50] |= ROR(tmp[4],28) & 0x0c0c0c0c;
		tk[i*8+51] = ROR(tmp[7],12) & 0x03030303;
		tk[i*8+51] |= ROR(tmp[7],28) & 0x0c0c0c0c;
		tk[i*8+52] = ROR(tmp[5],12) & 0x03030303;
		tk[i*8+52] |= ROR(tmp[5],28) & 0x0c0c0c0c;
		tk[i*8+53] = ROR(tmp[0],12) & 0x03030303;
		tk[i*8+53] |= ROR(tmp[0],28) & 0x0c0c0c0c;
		tk[i*8+54] = ROR(tmp[1],12) & 0x03030303;
		tk[i*8+54] |= ROR(tmp[1],28) & 0x0c0c0c0c;
		tk[i*8+55] = ROR(tmp[6],12) & 0x03030303;
		tk[i*8+55] |= ROR(tmp[6],28) & 0x0c0c0c0c;
		memcpy(tmp, tk+i*8+56, 32);
		XOR_BLOCK(tmp, tk1);
		if (test)
			permute_tk_8(tmp); // applies P^8 (P^16 = identity, so no call otherwise)
		tk[i*8+56] = ROR(tmp[1],30) & 0x30303030; //mask to extract rows 1&2 only
		tk[i*8+56] |= ROR(tmp[1],22) & 0x0c0c0c0c; //rotation to match fixslicing
		tk[i*8+57] = ROR(tmp[0],30) & 0x30303030;
		tk[i*8+57] |= ROR(tmp[0],22) & 0x0c0c0c0c;
		tk[i*8+58] = ROR(tmp[3],30) & 0x30303030;
		tk[i*8+58] |= ROR(tmp[3],22) & 0x0c0c0c0c;
		tk[i*8+59] = ROR(tmp[2],30) & 0x30303030;
		tk[i*8+59] |= ROR(tmp[2],22) & 0x0c0c0c0c;
		tk[i*8+60] = ROR(tmp[7],30) & 0x30303030;
		tk[i*8+60] |= ROR(tmp[7],22) & 0x0c0c0c0c;
		tk[i*8+61] = ROR(tmp[6],30) & 0x30303030;
		tk[i*8+61] |= ROR(tmp[6],22) & 0x0c0c0c0c;
		tk[i*8+62] = ROR(tmp[5],30) & 0x30303030;
		tk[i*8+62] |= ROR(tmp[5],22) & 0x0c0c0c0c;
		tk[i*8+63] = ROR(tmp[4],30) & 0x30303030;
		tk[i*8+63] |= ROR(tmp[4],22) & 0x0c0c0c0c;
		if (i+8 < rounds) { //only if next loop iteration
			tk[i*8+64] = tmp[6] & 0xf0f0f0f0; //mask to extract rows 1&2 only
			tk[i*8+65] = tmp[5] & 0xf0f0f0f0;
			tk[i*8+66] = tmp[0] & 0xf0f0f0f0;
			tk[i*8+67] = tmp[1] & 0xf0f0f0f0;
			tk[i*8+68] = tmp[3] & 0xf0f0f0f0;
			tk[i*8+69] = tmp[7] & 0xf0f0f0f0;
			tk[i*8+70] = tmp[4] & 0xf0f0f0f0;
			tk[i*8+71] = tmp[2] & 0xf0f0f0f0;
		}
	}
}
//Precompute LFSR2(TK2) ^ LFSR3(TK3) ^ rconst for all rounds.
void precompute_rtk2_3(u32* rtk, const u8* tk2, const u8 * tk3, int rounds) {
	memset(rtk, 0x00, 32*rounds);
	precompute_lfsr_tk2(rtk, tk2, tk2, rounds);	//rtk <- LFSR2(TK2)
	precompute_lfsr_tk3(rtk, tk3, tk3, rounds);	//rtk ^= LFSR3(TK3)
	//rtk+16 points into the zeroed buffer, so TK1 = 0 for this pass
	permute_tk(rtk, (u8*)(rtk+16), (u8*)(rtk+16), rounds);
	for(int k = 0; k < 8*rounds; k++)	//finally add the round constants
		rtk[k] ^= rconst_32_bs[k];
}
//Precompute the TK1 contribution to the round tweakeys.
//Only 16 rounds are computed since RTK1 is periodic with period 16.
void precompute_rtk1(u32* rtk1, const u8* tk1, const u8* tk1_bis) {
	const int rounds = 16;
	memset(rtk1, 0x00, 32*rounds);
	permute_tk(rtk1, tk1, tk1_bis, rounds);
}
#ifndef TK_SCHEDULE_BS_H_
#define TK_SCHEDULE_BS_H_
typedef unsigned char u8;
typedef unsigned int u32;
//Precomputed round tweakeys in bitsliced form: 8 u32 words per round.
typedef struct {
	u32 rtk1[8*16]; //TK1 contribution; computed for 16 rounds (periodic)
	u32 rtk2_3[8*40]; //LFSR2(TK2) ^ LFSR3(TK3) ^ rconst, 40 rounds
} tweakey;
//Bitslice two 16-byte blocks into 8 u32 words / invert that packing.
void packing(u32* out, const u8* block0, const u8* block1);
void unpacking(u8* out, u8* out_bis, u32 *in);
//Precompute LFSR2(TK2) ^ LFSR3(TK3) ^ rconst for 'rounds' rounds.
void precompute_rtk2_3(u32* rtk, const u8* tk2, const u8* tk3, int rounds);
//Precompute the (16-round periodic) TK1 contribution.
void precompute_rtk1(u32* rtk1, const u8* tk1, const u8* tk1_bis);
//Bitsliced TK2 LFSR: shift all 8 slices, new slice 7 = slice 0 ^ slice 2.
//Relies on a local 'u32 tmp' being in scope at the call site.
#define LFSR2(tk) ({ \
	tmp = (tk)[0] ^ (tk)[2]; \
	(tk)[0] = (tk)[1]; \
	(tk)[1] = (tk)[2]; \
	(tk)[2] = (tk)[3]; \
	(tk)[3] = (tk)[4]; \
	(tk)[4] = (tk)[5]; \
	(tk)[5] = (tk)[6]; \
	(tk)[6] = (tk)[7]; \
	(tk)[7] = tmp; \
})
//Bitsliced TK3 LFSR: shift all 8 slices the other way,
//new slice 0 = slice 7 ^ slice 1. Also relies on a local 'u32 tmp'.
#define LFSR3(tk) ({ \
	tmp = (tk)[7] ^ (tk)[1]; \
	(tk)[7] = (tk)[6]; \
	(tk)[6] = (tk)[5]; \
	(tk)[5] = (tk)[4]; \
	(tk)[4] = (tk)[3]; \
	(tk)[3] = (tk)[2]; \
	(tk)[2] = (tk)[1]; \
	(tk)[1] = (tk)[0]; \
	(tk)[0] = tmp; \
})
//x ^= y on 8-word (256-bit) bitsliced blocks.
#define XOR_BLOCK(x,y) ({ \
	(x)[0] ^= (y)[0]; \
	(x)[1] ^= (y)[1]; \
	(x)[2] ^= (y)[2]; \
	(x)[3] ^= (y)[3]; \
	(x)[4] ^= (y)[4]; \
	(x)[5] ^= (y)[5]; \
	(x)[6] ^= (y)[6]; \
	(x)[7] ^= (y)[7]; \
})
//Swap the bits of a and b selected by 'mask' at distance n.
//Relies on a local 'u32 tmp' being in scope at the call site.
#define SWAPMOVE(a, b, mask, n) ({ \
	tmp = (b ^ (a >> n)) & mask; \
	b ^= tmp; \
	a ^= (tmp << n); \
})
//Little-endian load of 4 bytes from y into the u32 pointed to by x.
#define LE_LOAD(x, y) \
	*(x) = (((u32)(y)[3] << 24) | \
		((u32)(y)[2] << 16) | \
		((u32)(y)[1] << 8) | \
		(y)[0]);
//Little-endian store of the u32 y into the 4 bytes pointed to by x.
#define LE_STORE(x, y) \
	(x)[0] = (y) & 0xff; \
	(x)[1] = ((y) >> 8) & 0xff; \
	(x)[2] = ((y) >> 16) & 0xff; \
	(x)[3] = (y) >> 24;
//32-bit rotate right; assumes 0 < y < 32 (y == 0 would shift by 32 -> UB).
#define ROR(x,y) (((x) >> (y)) | ((x) << (32 - (y))))
#endif // TK_SCHEDULE_BS_H_
\ No newline at end of file
//Parameter sizes (in bytes) exposed through the NIST LWC / SUPERCOP API.
#define CRYPTO_KEYBYTES 16 //128-bit key
#define CRYPTO_NSECBYTES 0 //no secret message number
#define CRYPTO_NPUBBYTES 16 //128-bit public nonce
#define CRYPTO_ABYTES 16 //128-bit authentication tag
#define CRYPTO_NOOVERLAP 1 //buffers may not partially overlap (SUPERCOP convention)
//Authenticated encryption: c = ciphertext || tag, *clen = mlen + CRYPTO_ABYTES.
int crypto_aead_encrypt(
	unsigned char *c, unsigned long long *clen,
	const unsigned char *m, unsigned long long mlen,
	const unsigned char *ad, unsigned long long adlen,
	const unsigned char *nsec,
	const unsigned char *npub,
	const unsigned char *k
);
//Verified decryption: returns nonzero on authentication failure.
int crypto_aead_decrypt(
	unsigned char *m, unsigned long long *mlen,
	unsigned char *nsec,
	const unsigned char *c, unsigned long long clen,
	const unsigned char *ad, unsigned long long adlen,
	const unsigned char *npub,
	const unsigned char *k
);
\ No newline at end of file
/******************************************************************************
* Constant-time implementation of SKINNY-AEAD-M1 (v1.1).
*
* Two blocks are treated in parallel with SKINNY-128-384 whenever possible.
*
* For more details, see the paper at: https://
*
* @author Alexandre Adomnicai, Nanyang Technological University,
* alexandre.adomnicai@ntu.edu.sg
*
* @date May 2020
******************************************************************************/
#include "skinny128.h"
#include "skinnyaead.h"
#include <string.h>
#include <stdio.h>
/******************************************************************************
 * x ^= y where x, y are 128-bit blocks (16-byte arrays).
 ******************************************************************************/
static void xor_block(u8 * x, const u8* y) {
	const u8 *src = y;
	u8 *dst = x;
	while (dst < x + BLOCKBYTES)
		*dst++ ^= *src++;
}
/******************************************************************************
 * Encryption and authentication using SKINNY-AEAD-M1.
 *
 * c/clen : output ciphertext followed by the 16-byte tag; *clen = mlen + 16
 * m/mlen : input plaintext
 * ad/adlen: associated data (authenticated, not encrypted)
 * nsec   : unused (CRYPTO_NSECBYTES == 0)
 * npub   : 16-byte nonce, k: 16-byte key
 * Returns 0.
 *
 * Fix: removed the unused local 'u8 feedback' (only needed in decryption).
 ******************************************************************************/
int crypto_aead_encrypt (unsigned char *c, unsigned long long *clen,
					const unsigned char *m, unsigned long long mlen,
					const unsigned char *ad, unsigned long long adlen,
					const unsigned char *nsec,
					const unsigned char *npub,
					const unsigned char *k) {
	u64 i, lfsr = 1;				// 64-bit LFSR acts as the block counter
	u32 rtk1[4*16];					// round tweakeys from TK1 (16-round period)
	u32 rtk2_3[4*SKINNY128_384_ROUNDS];	// round tweakeys from TK2/TK3
	u8 tmp[BLOCKBYTES], auth[BLOCKBYTES], sum[BLOCKBYTES];
	(void)nsec;
	// ----------------- Initialization -----------------
	*clen = mlen + TAGBYTES;
	tkschedule_lfsr(rtk2_3, npub, k, SKINNY128_384_ROUNDS);
	tkschedule_perm(rtk2_3);
	memset(tmp, 0x00, BLOCKBYTES);
	memset(auth, 0x00, BLOCKBYTES);
	memset(sum, 0x00, BLOCKBYTES);
	// ----------------- Initialization -----------------
	// ----------------- Process the plaintext -----------------
	while (mlen >= BLOCKBYTES) { // while entire blocks to process
		LE_STR_64(tmp, lfsr);
		tkschedule_perm_tk1(rtk1, tmp); // precompute RTK1 given the LFSR
		skinny128_384(c, rtk2_3, m, rtk1);
		xor_block(sum, m); // sum for tag computation
		mlen -= BLOCKBYTES;
		c += BLOCKBYTES;
		m += BLOCKBYTES;
		UPDATE_LFSR(lfsr); // update lfsr for next block
	}
	SET_DOMAIN(tmp, 0x04); // domain for tag computation
	if (mlen > 0) { // last block is partial
		LE_STR_64(tmp, lfsr); // lfsr for last block
		SET_DOMAIN(tmp, 0x01); // domain for padding
		for(i = 0; i < mlen; i++)
			sum[i] ^= m[i]; // sum for tag computation
		sum[i] ^= 0x80; // padding
		tkschedule_perm_tk1(rtk1, tmp);
		skinny128_384(auth, rtk2_3, auth, rtk1); // encrypt 'auth' = 0^16
		for(i = 0; i < mlen; i++)
			c[i] = auth[i] ^ m[i]; // encrypted padded block
		c += mlen;
		SET_DOMAIN(tmp, 0x05); // domain for tag computation
		UPDATE_LFSR(lfsr);
	}
	LE_STR_64(tmp, lfsr); // lfsr for tag computation
	tkschedule_perm_tk1(rtk1, tmp);
	skinny128_384(sum, rtk2_3, sum, rtk1); // compute the tag
	memcpy(c, sum, TAGBYTES); // c now points at the tag location
	// ----------------- Process the plaintext -----------------
	// ----------------- Process the associated data -----------------
	lfsr = 1; // restart the counter for the AD
	SET_DOMAIN(tmp, 0x02);
	memset(auth, 0x00, BLOCKBYTES);
	while (adlen >= BLOCKBYTES) {
		LE_STR_64(tmp, lfsr);
		tkschedule_perm_tk1(rtk1, tmp);
		skinny128_384(sum, rtk2_3, ad, rtk1); // use 'sum' as tmp array
		xor_block(auth, sum);
		adlen -= BLOCKBYTES;
		ad += BLOCKBYTES;
		UPDATE_LFSR(lfsr);
	}
	if (adlen > 0) { // partial AD block: pad with 10* and switch domain
		LE_STR_64(tmp, lfsr);
		SET_DOMAIN(tmp, 0x03); // domain for padding ad
		tkschedule_perm_tk1(rtk1, tmp);
		memset(tmp, 0x00, BLOCKBYTES); // padding
		memcpy(tmp, ad, adlen); // padding
		tmp[adlen] = 0x80; // padding
		skinny128_384(tmp, rtk2_3, tmp, rtk1);
		xor_block(auth, tmp);
	}
	xor_block(c, auth); // fold the AD authenticator into the tag
	// ----------------- Process the associated data -----------------
	return 0;
}
/******************************************************************************
 * Decryption and verification using SKINNY-AEAD-M1.
 *
 * m/mlen : output plaintext; *mlen = clen - 16
 * c/clen : input ciphertext followed by the 16-byte tag
 * ad/adlen: associated data; npub: 16-byte nonce; k: 16-byte key
 * Returns 0 on success, -1 on authentication failure (NIST LWC API).
 *
 * NOTE(review): the plaintext is written to 'm' before the tag is checked;
 * callers must discard it when the return value is nonzero.
 *
 * Fixes: header comment wrongly said "Encryption"; the function returned the
 * raw (arbitrary nonzero) byte-OR of the tag difference — it now maps any
 * mismatch to -1 as the API specifies, still in constant time.
 ******************************************************************************/
int crypto_aead_decrypt (unsigned char *m, unsigned long long *mlen,
					unsigned char *nsec,
					const unsigned char *c, unsigned long long clen,
					const unsigned char *ad, unsigned long long adlen,
					const unsigned char *npub,
					const unsigned char *k) {
	u64 i, lfsr = 1;				// 64-bit LFSR acts as the block counter
	u8 feedback;					// accumulates the tag difference
	u32 rtk1[4*16];
	u32 rtk2_3[4*SKINNY128_384_ROUNDS];
	u8 tmp[2*BLOCKBYTES], auth[BLOCKBYTES], sum[BLOCKBYTES];
	(void)nsec;
	if (clen < TAGBYTES)
		return -1;
	// ----------------- Initialization -----------------
	clen -= TAGBYTES;
	*mlen = clen;
	tkschedule_lfsr(rtk2_3, npub, k, SKINNY128_384_ROUNDS);
	tkschedule_perm(rtk2_3);
	memset(tmp, 0x00, 2*BLOCKBYTES);
	memset(auth, 0x00, BLOCKBYTES);
	memset(sum, 0x00, BLOCKBYTES);
	// ----------------- Initialization -----------------
	// ----------------- Process the ciphertext -----------------
	while (clen >= BLOCKBYTES) { // while entire blocks to process
		LE_STR_64(tmp, lfsr);
		tkschedule_perm_tk1(rtk1, tmp); // precompute RTK1 given the LFSR
		skinny128_384_inv(m, rtk2_3, c, rtk1);
		xor_block(sum, m); // sum for tag computation
		clen -= BLOCKBYTES;
		c += BLOCKBYTES;
		m += BLOCKBYTES;
		UPDATE_LFSR(lfsr); // update LFSR for the next block
	}
	SET_DOMAIN(tmp, 0x04); // domain for tag computation
	if (clen > 0) { // last block is partial
		LE_STR_64(tmp, lfsr); // lfsr for last block
		SET_DOMAIN(tmp, 0x01); // domain for padding
		tkschedule_perm_tk1(rtk1, tmp);
		skinny128_384(auth, rtk2_3, auth, rtk1); // encrypt 'auth' = 0^16
		for(i = 0; i < clen; i++) {
			m[i] = auth[i] ^ c[i]; // decrypted padded block
			sum[i] ^= m[i]; // sum for tag computation
		}
		sum[i] ^= 0x80; // padding
		c += clen;
		SET_DOMAIN(tmp, 0x05); // domain for tag computation
		UPDATE_LFSR(lfsr);
	}
	LE_STR_64(tmp, lfsr); // lfsr for tag computation
	tkschedule_perm_tk1(rtk1, tmp);
	skinny128_384(sum, rtk2_3, sum, rtk1); // compute the tag
	// ----------------- Process the ciphertext -----------------
	// ----------------- Process the associated data -----------------
	lfsr = 1; // restart the counter for the AD
	SET_DOMAIN(tmp, 0x02);
	memset(auth, 0x00, BLOCKBYTES);
	while (adlen >= BLOCKBYTES) {
		LE_STR_64(tmp, lfsr);
		tkschedule_perm_tk1(rtk1, tmp);
		skinny128_384(tmp + BLOCKBYTES, rtk2_3, ad, rtk1);
		xor_block(auth, tmp + BLOCKBYTES);
		adlen -= BLOCKBYTES;
		ad += BLOCKBYTES;
		UPDATE_LFSR(lfsr);
	}
	if (adlen > 0) { // partial AD block: pad with 10* and switch domain
		LE_STR_64(tmp, lfsr);
		SET_DOMAIN(tmp, 0x03); // domain for padding ad
		tkschedule_perm_tk1(rtk1, tmp);
		memset(tmp, 0x00, BLOCKBYTES); // padding
		memcpy(tmp, ad, adlen); // padding
		tmp[adlen] ^= 0x80; // padding
		skinny128_384(tmp, rtk2_3, tmp, rtk1);
		xor_block(auth, tmp);
	}
	xor_block(sum, auth); // fold the AD authenticator into the tag
	// ----------------- Process the associated data -----------------
	feedback = 0;
	for(i = 0; i < TAGBYTES; i++)
		feedback |= sum[i] ^ c[i]; // constant-time tag comparison
	return feedback ? -1 : 0; // -1 on authentication failure
}
\ No newline at end of file
#ifndef SKINNY128_H_
#define SKINNY128_H_
typedef unsigned char u8;
typedef unsigned int u32;
//Number of rounds for SKINNY-128-384
#define SKINNY128_384_ROUNDS 56
//Encrypt one 128-bit block (ptext -> ctext) with precomputed round tweakeys.
extern void skinny128_384(u8* ctext, const u32* rtk2_3, const u8* ptext, const u32* rtk1);
//Decrypt one 128-bit block (ctext -> ptext) with precomputed round tweakeys.
extern void skinny128_384_inv(u8* ptext, const u32* rtk2_3, const u8* ctext, const u32* rtk1);
//Compute LFSR2(TK2) ^ LFSR3(TK3) for all rounds (implemented in assembly).
extern void tkschedule_lfsr(u32* rtk2_3, const u8* tk2, const u8* tk3, const int rounds);
//Apply the tweakey permutation and add round constants to all round tweakeys.
extern void tkschedule_perm(u32* rtk2_3);
//Compute the (16-round periodic) round tweakeys derived from TK1.
extern void tkschedule_perm_tk1(u32* rtk1, const u8* tk1);
#endif // SKINNY128_H_
\ No newline at end of file
/*******************************************************************************
* ARM assembly implementation of fixsliced SKINNY-128-384.
*
* For more details, see the paper at: https://
*
* @author Alexandre Adomnicai, Nanyang Technological University,
* alexandre.adomnicai@ntu.edu.sg
*
* @date May 2020
*******************************************************************************/
.syntax unified
.thumb
/*******************************************************************************
* applies P^2 on the tweakey state in a bitsliced manner
* State: one bitsliced word per register in r6-r9; clobbers r1, r10-r12.
*******************************************************************************/
.align 2
p2:
	movw r1, #0xcc00
	movt r1, #0xcc00 //r1 <- 0xcc00cc00
	movw r10, #0xcc00
	movt r10, #0x0033 //r10<- 0xcc000033
	and r11, r1, r6, ror #14 // --- permute r6 twice
	bfi r11, r6, #16, #8
	and r12, r6, #0xcc000000
	orr r11, r11, r12, lsr #2
	and r12, r10, r6
	orr r11, r11, r12, lsr #8
	and r12, r6, #0x00cc0000
	orr r6, r11, r12, lsr #18 // permute r6 twice ---
	and r11, r1, r7, ror #14 // --- permute r7 twice
	bfi r11, r7, #16, #8
	and r12, r7, #0xcc000000
	orr r11, r11, r12, lsr #2
	and r12, r10, r7
	orr r11, r11, r12, lsr #8
	and r12, r7, #0x00cc0000
	orr r7, r11, r12, lsr #18 // permute r7 twice ---
	and r11, r1, r8, ror #14 // --- permute r8 twice
	bfi r11, r8, #16, #8
	and r12, r8, #0xcc000000
	orr r11, r11, r12, lsr #2
	and r12, r10, r8
	orr r11, r11, r12, lsr #8
	and r12, r8, #0x00cc0000
	orr r8, r11, r12, lsr #18 // permute r8 twice ---
	and r11, r1, r9, ror #14 // --- permute r9 twice
	bfi r11, r9, #16, #8
	and r12, r9, #0xcc000000
	orr r11, r11, r12, lsr #2
	and r12, r10, r9
	orr r11, r11, r12, lsr #8
	and r12, r9, #0x00cc0000
	orr r9, r11, r12, lsr #18 // permute r9 twice ---
	bx lr
/*******************************************************************************
* applies P^4 on the tweakey state in a bitsliced manner
* State in r6-r9; clobbers r1, r10-r12; uses [sp] as scratch to save r14.
*******************************************************************************/
.align 2
p4:
	str.w r14, [sp] //store r14 on the stack
	movw r14, #0x00cc
	movt r14, #0xcc00 //r14<- 0xcc0000cc
	movw r12, #0xcc00
	movt r12, #0x3300 //r12<- 0x3300cc00
	movw r11, #0x00cc
	movt r11, #0x00cc //r11<- 0x00cc00cc
	and r10, r14, r6, ror #22 // --- permute r6 4 times
	and r1, r12, r6, ror #16
	orr r10, r10, r1
	and r1, r6, r11
	orr r10, r10, r1, lsr #2
	movw r1, #0xcc33 //r1 <- 0x0000cc33
	and r6, r6, r1
	orr r6, r10, r6, ror #24 // permute r6 4 times ---
	and r10, r14, r7, ror #22 // --- permute r7 4 times
	and r1, r12, r7, ror #16
	orr r10, r10, r1
	and r1, r7, r11
	orr r10, r10, r1, lsr #2
	movw r1, #0xcc33 //r1 <- 0x0000cc33
	and r7, r7, r1
	orr r7, r10, r7, ror #24 // permute r7 4 times ---
	and r10, r14, r8, ror #22 // --- permute r8 4 times
	and r1, r12, r8, ror #16
	orr r10, r10, r1
	and r1, r8, r11
	orr r10, r10, r1, lsr #2
	movw r1, #0xcc33 //r1 <- 0x0000cc33
	and r8, r8, r1
	orr r8, r10, r8, ror #24 // permute r8 4 times ---
	and r10, r14, r9, ror #22 // --- permute r9 4 times
	ldr.w r14, [sp] //restore r14
	and r12, r12, r9, ror #16
	orr r10, r10, r12
	and r12, r9, r11
	orr r10, r10, r12, lsr #2
	movw r12, #0xcc33 //r12<- 0x0000cc33
	and r9, r9, r12
	orr r9, r10, r9, ror #24 // permute r9 4 times ---
	bx lr
/*******************************************************************************
* applies P^6 on the tweakey state in a bitsliced manner
* State in r6-r9; clobbers r1, r10-r12.
*******************************************************************************/
.align 2
p6:
	movw r1, #0x3333 //r1 <- 0x00003333
	movw r12, #0x00cc
	movt r12, #0x3300 //r12<- 0x330000cc
	and r10, r6, r1, ror #8 // --- permute r6 6 times
	and r11, r12, r6, ror #24
	orr r11, r11, r10, ror #6
	and r10, r1, r6, ror #10
	orr r11, r11, r10
	and r10, r6, #0x000000cc
	orr r11, r11, r10, lsl #14
	and r10, r6, #0x00003300
	orr r6, r11, r10, lsl #2 // permute r6 6 times ---
	and r10, r7, r1, ror #8 // --- permute r7 6 times
	and r11, r12, r7, ror #24
	orr r11, r11, r10, ror #6
	and r10, r1, r7, ror #10
	orr r11, r11, r10
	and r10, r7, #0x000000cc
	orr r11, r11, r10, lsl #14
	and r10, r7, #0x00003300
	orr r7, r11, r10, lsl #2 // permute r7 6 times ---
	and r10, r8, r1, ror #8 // --- permute r8 6 times
	and r11, r12, r8, ror #24
	orr r11, r11, r10, ror #6
	and r10, r1, r8, ror #10
	orr r11, r11, r10
	and r10, r8, #0x000000cc
	orr r11, r11, r10, lsl #14
	and r10, r8, #0x00003300
	orr r8, r11, r10, lsl #2 // permute r8 6 times ---
	and r10, r9, r1, ror #8 // --- permute r9 6 times
	and r11, r12, r9, ror #24
	orr r11, r11, r10, ror #6
	and r10, r1, r9, ror #10
	orr r11, r11, r10
	and r10, r9, #0x000000cc
	orr r11, r11, r10, lsl #14
	and r10, r9, #0x00003300
	orr r9, r11, r10, lsl #2 // permute r9 6 times ---
	bx lr
/*******************************************************************************
* applies P^8 on the tweakey state in a bitsliced manner
* State in r6-r9; clobbers r1, r10-r12.
*******************************************************************************/
.align 2
p8:
	movw r12, #0x3333 //r12<- 0x00003333
	movw r1, #0x0000
	movt r1, #0x33cc //r1 <- 0x33cc0000
	and r10, r6, r1 // --- permute r6 8 times
	and r11, r1, r6, ror #8
	orr r11, r11, r10, ror #24
	and r10, r6, r12, lsl #2
	orr r11, r11, r10, ror #26
	and r10, r6, r12, lsl #8
	orr r6, r11, r10, lsr #6 // permute r6 8 times ---
	and r10, r7, r1 // --- permute r7 8 times
	and r11, r1, r7, ror #8
	orr r11, r11, r10, ror #24
	and r10, r7, r12, lsl #2
	orr r11, r11, r10, ror #26
	and r10, r7, r12, lsl #8
	orr r7, r11, r10, lsr #6 // permute r7 8 times ---
	and r10, r8, r1 // --- permute r8 8 times
	and r11, r1, r8, ror #8
	orr r11, r11, r10, ror #24
	and r10, r8, r12, lsl #2
	orr r11, r11, r10, ror #26
	and r10, r8, r12, lsl #8
	orr r8, r11, r10, lsr #6 // permute r8 8 times ---
	and r10, r9, r1 // --- permute r9 8 times
	and r11, r1, r9, ror #8
	orr r11, r11, r10, ror #24
	and r10, r9, r12, lsl #2
	orr r11, r11, r10, ror #26
	and r10, r9, r12, lsl #8
	orr r9, r11, r10, lsr #6 // permute r9 8 times ---
	bx lr
/*******************************************************************************
* applies P^10 on the tweakey state in a bitsliced manner
* State in r6-r9; clobbers r1, r10-r12.
*******************************************************************************/
.align 2
p10:
	movw r12, #0x0033
	movt r12, #0x3300 //r12<- 0x33000033
	movw r1, #0xcc33 //r1 <- 0x0000cc33
	and r10, r6, r1, ror #8 // --- permute r6 10 times
	and r11, r12, r6, ror #26
	orr r11, r11, r10, ror #8
	and r10, r6, r12, ror #24
	orr r11, r11, r10, ror #22
	and r10, r6, #0x00330000
	orr r11, r11, r10, lsr #14
	and r10, r6, #0x0000cc00
	orr r6, r11, r10, lsr #2 // permute r6 10 times ---
	and r10, r7, r1, ror #8 // --- permute r7 10 times
	and r11, r12, r7, ror #26
	orr r11, r11, r10, ror #8
	and r10, r7, r12, ror #24
	orr r11, r11, r10, ror #22
	and r10, r7, #0x00330000
	orr r11, r11, r10, lsr #14
	and r10, r7, #0x0000cc00
	orr r7, r11, r10, lsr #2 // permute r7 10 times ---
	and r10, r8, r1, ror #8 // --- permute r8 10 times
	and r11, r12, r8, ror #26
	orr r11, r11, r10, ror #8
	and r10, r8, r12, ror #24
	orr r11, r11, r10, ror #22
	and r10, r8, #0x00330000
	orr r11, r11, r10, lsr #14
	and r10, r8, #0x0000cc00
	orr r8, r11, r10, lsr #2 // permute r8 10 times ---
	and r10, r9, r1, ror #8 // --- permute r9 10 times
	and r11, r12, r9, ror #26
	orr r11, r11, r10, ror #8
	and r10, r9, r12, ror #24
	orr r11, r11, r10, ror #22
	and r10, r9, #0x00330000
	orr r11, r11, r10, lsr #14
	and r10, r9, #0x0000cc00
	orr r9, r11, r10, lsr #2 // permute r9 10 times ---
	bx lr
/*******************************************************************************
* applies P^12 on the tweakey state in a bitsliced manner
* State in r6-r9; clobbers r1, r10-r12; uses [sp] as scratch to save r14.
*******************************************************************************/
.align 2
p12:
	str.w r14, [sp] //store r14 on the stack
	movw r14, #0xcc33 //r14<- 0x0000cc33
	movw r12, #0x00cc
	movt r12, #0x00cc //r12<- 0x00cc00cc
	movw r1, #0x3300
	movt r1, #0xcc00 //r1 <- 0xcc003300
	and r10, r14, r6, ror #8 // --- permute r6 12 times
	and r11, r12, r6, ror #30
	orr r11, r11, r10
	and r10, r1, r6, ror #16
	orr r11, r11, r10
	movw r10, #0xcccc //r10<- 0x0000cccc
	and r10, r6, r10, ror #8
	orr r6, r11, r10, ror #10 // permute r6 12 times ---
	and r10, r14, r7, ror #8 // --- permute r7 12 times
	and r11, r12, r7, ror #30
	orr r11, r11, r10
	and r10, r1, r7, ror #16
	orr r11, r11, r10
	movw r10, #0xcccc //r10<- 0x0000cccc
	and r10, r7, r10, ror #8
	orr r7, r11, r10, ror #10 // permute r7 12 times ---
	and r10, r14, r8, ror #8 // --- permute r8 12 times
	and r11, r12, r8, ror #30
	orr r11, r11, r10
	and r10, r1, r8, ror #16
	orr r11, r11, r10
	movw r10, #0xcccc //r10<- 0x0000cccc
	and r10, r8, r10, ror #8
	orr r8, r11, r10, ror #10 // permute r8 12 times ---
	and r10, r14, r9, ror #8 // --- permute r9 12 times
	and r11, r12, r9, ror #30
	orr r11, r11, r10
	and r10, r1, r9, ror #16
	ldr.w r14, [sp] //restore r14
	orr r11, r11, r10
	movw r10, #0xcccc //r10<- 0x0000cccc
	and r10, r9, r10, ror #8
	orr r9, r11, r10, ror #10 // permute r9 12 times ---
	bx lr
/*******************************************************************************
* applies P^14 on the tweakey state in a bitsliced manner
* State in r6-r9; clobbers r1, r10-r12.
*******************************************************************************/
.align 2
p14:
	movw r1, #0xcc00
	movt r1, #0x0033 //r1 <- 0x0033cc00
	movw r12, #0xcc00
	movt r12, #0xcc00 //r12<- 0xcc00cc00 (0x33003300 pre-rotated by the ror #18)
	and r10, r1, r6, ror #24 // --- permute r6 14 times
	and r11, r6, #0x00000033
	orr r11, r10, r11, ror #14
	and r10, r6, #0x33000000
	orr r11, r11, r10, ror #30
	and r10, r6, #0x00ff0000
	orr r11, r11, r10, ror #16
	and r10, r6, r12
	orr r6, r11, r10, ror #18 // permute r6 14 times ---
	and r10, r1, r7, ror #24 // --- permute r7 14 times
	and r11, r7, #0x00000033
	orr r11, r10, r11, ror #14
	and r10, r7, #0x33000000
	orr r11, r11, r10, ror #30
	and r10, r7, #0x00ff0000
	orr r11, r11, r10, ror #16
	and r10, r7, r12
	orr r7, r11, r10, ror #18 // permute r7 14 times ---
	and r10, r1, r8, ror #24 // --- permute r8 14 times
	and r11, r8, #0x00000033
	orr r11, r10, r11, ror #14
	and r10, r8, #0x33000000
	orr r11, r11, r10, ror #30
	and r10, r8, #0x00ff0000
	orr r11, r11, r10, ror #16
	and r10, r8, r12
	orr r8, r11, r10, ror #18 // permute r8 14 times ---
	and r10, r1, r9, ror #24 // --- permute r9 14 times
	and r11, r9, #0x00000033
	orr r11, r10, r11, ror #14
	and r10, r9, #0x33000000
	orr r11, r11, r10, ror #30
	and r10, r9, #0x00ff0000
	orr r11, r11, r10, ror #16
	and r10, r9, r12
	orr r9, r11, r10, ror #18 // permute r9 14 times ---
	bx lr
/*******************************************************************************
* packs two 128-bit blocks held in r2-r5 into the bitsliced representation
* Expects the masks r10 = 0x0a0a0a0a and r11 = 0x30303030 to be set by the
* caller; clobbers r12. Implemented as a sequence of SWAPMOVEs.
*******************************************************************************/
.align 2
packing:
	eor r12, r2, r2, lsr #3
	and r12, r12, r10
	eor r2, r2, r12
	eor r2, r2, r12, lsl #3 //SWAPMOVE(r2, r2, 0x0a0a0a0a, 3)
	eor r12, r3, r3, lsr #3
	and r12, r12, r10
	eor r3, r3, r12
	eor r3, r3, r12, lsl #3 //SWAPMOVE(r3, r3, 0x0a0a0a0a, 3)
	eor r12, r4, r4, lsr #3
	and r12, r12, r10
	eor r4, r4, r12
	eor r4, r4, r12, lsl #3 //SWAPMOVE(r4, r4, 0x0a0a0a0a, 3)
	eor r12, r5, r5, lsr #3
	and r12, r12, r10
	eor r5, r5, r12
	eor r5, r5, r12, lsl #3 //SWAPMOVE(r5, r5, 0x0a0a0a0a, 3)
	eor r12, r2, r4, lsr #2
	and r12, r12, r11
	eor r2, r2, r12
	eor r4, r4, r12, lsl #2 //SWAPMOVE(r4, r2, 0x30303030, 2)
	eor r12, r2, r3, lsr #4
	and r12, r12, r11, lsr #2
	eor r2, r2, r12
	eor r3, r3, r12, lsl #4 //SWAPMOVE(r3, r2, 0x0c0c0c0c, 4)
	eor r12, r2, r5, lsr #6
	and r12, r12, r11, lsr #4
	eor r2, r2, r12
	eor r5, r5, r12, lsl #6 //SWAPMOVE(r5, r2, 0x03030303, 6)
	eor r12, r4, r3, lsr #2
	and r12, r12, r11, lsr #2
	eor r4, r4, r12
	eor r3, r3, r12, lsl #2 //SWAPMOVE(r3, r4, 0x0c0c0c0c, 2)
	eor r12, r4, r5, lsr #4
	and r12, r12, r11, lsr #4
	eor r4, r4, r12
	eor r5, r5, r12, lsl #4 //SWAPMOVE(r5, r4, 0x03030303, 4)
	eor r12, r3, r5, lsr #2
	and r12, r12, r11, lsr #4
	eor r3, r3, r12
	eor r5, r5, r12, lsl #2 //SWAPMOVE(r5, r3, 0x03030303, 2)
	bx lr
/*******************************************************************************
* unpacks the bitsliced state in r2-r5 back to byte order (inverse of packing,
* SWAPMOVEs applied in reverse). Builds the 0x0a0a0a0a mask in r6 itself;
* r7 is presumably expected to hold the 0x30303030 mask (mirrors r11 in
* packing) — caller not visible here, TODO confirm. Clobbers r10.
*******************************************************************************/
.align 2
unpacking:
	movw r6, #0x0a0a
	movt r6, #0x0a0a //r6 <- 0x0a0a0a0a
	eor r10, r3, r5, lsr #2
	and r10, r10, r7, lsr #4
	eor r3, r3, r10
	eor r5, r5, r10, lsl #2 //SWAPMOVE(r5, r3, 0x03030303, 2)
	eor r10, r4, r5, lsr #4
	and r10, r10, r7, lsr #4
	eor r4, r4, r10
	eor r5, r5, r10, lsl #4 //SWAPMOVE(r5, r4, 0x03030303, 4)
	eor r10, r4, r3, lsr #2
	and r10, r10, r7, lsr #2
	eor r4, r4, r10
	eor r3, r3, r10, lsl #2 //SWAPMOVE(r3, r4, 0x0c0c0c0c, 2)
	eor r10, r2, r5, lsr #6
	and r10, r10, r7, lsr #4
	eor r2, r2, r10
	eor r5, r5, r10, lsl #6 //SWAPMOVE(r5, r2, 0x03030303, 6)
	eor r10, r2, r3, lsr #4
	and r10, r10, r7, lsr #2
	eor r2, r2, r10
	eor r3, r3, r10, lsl #4 //SWAPMOVE(r3, r2, 0x0c0c0c0c, 4)
	eor r10, r2, r4, lsr #2
	and r10, r10, r7
	eor r2, r2, r10
	eor r4, r4, r10, lsl #2 //SWAPMOVE(r4, r2, 0x30303030, 2)
	eor r10, r5, r5, lsr #3
	and r10, r10, r6
	eor r5, r5, r10
	eor r5, r5, r10, lsl #3 //SWAPMOVE(r5, r5, 0x0a0a0a0a, 3)
	eor r10, r4, r4, lsr #3
	and r10, r10, r6
	eor r4, r4, r10
	eor r4, r4, r10, lsl #3 //SWAPMOVE(r4, r4, 0x0a0a0a0a, 3)
	eor r10, r3, r3, lsr #3
	and r10, r10, r6
	eor r3, r3, r10
	eor r3, r3, r10, lsl #3 //SWAPMOVE(r3, r3, 0x0a0a0a0a, 3)
	eor r10, r2, r2, lsr #3
	and r10, r10, r6
	eor r2, r2, r10
	eor r2, r2, r10, lsl #3 //SWAPMOVE(r2, r2, 0x0a0a0a0a, 3)
	bx lr
/******************************************************************************
* Compute LFSR2(TK2) ^ LFSR3(TK3) for all rounds.
* Performing both at the same time allows to save some memory accesses.
* Register use: r6-r9 hold bitsliced TK2, r2-r5 hold bitsliced TK3,
* r10 holds the 0xaaaaaaaa slice mask, r1 is the loop counter.
* Each loop iteration covers 8 rounds (4 LFSR steps per tweakey, 2 rounds
* per step); stores advance by 24 after every pair of words, leaving a
* 16-byte gap per round — presumably filled later by tkschedule_perm,
* TODO confirm the layout against that routine.
******************************************************************************/
@ void tkschedule_lfsr(u32* tk, const u8* tk2, const u8* tk3, const int rounds)
.global tkschedule_lfsr
.type tkschedule_lfsr,%function
.align 2
tkschedule_lfsr:
	push {r0-r12, r14}
	ldr.w r3, [r1, #8] //load tk2 (3rd word)
	ldr.w r4, [r1, #4] //load tk2 (2nd word)
	ldr.w r5, [r1, #12] //load tk2 (4th word)
	ldr.w r12, [r1] //load tk2 (1st word)
	mov r1, r2 //move tk3 address in r1
	mov r2, r12 //move 1st tk2 word in r2
	movw r10, #0x0a0a
	movt r10, #0x0a0a //r10<- 0x0a0a0a0a
	movw r11, #0x3030
	movt r11, #0x3030 //r11<- 0x30303030
	bl packing //pack tk2
	mov r6, r2 //move tk2 from r2-r5 to r6-r9
	mov r7, r3 //move tk2 from r2-r5 to r6-r9
	mov r8, r4 //move tk2 from r2-r5 to r6-r9
	mov r9, r5 //move tk2 from r2-r5 to r6-r9
	ldr.w r3, [r1, #8] //load tk3 (3rd word)
	ldr.w r4, [r1, #4] //load tk3 (2nd word)
	ldr.w r5, [r1, #12] //load tk3 (4th word)
	ldr.w r2, [r1] //load tk3 (1st word)
	bl packing //pack tk3
	eor r10, r10, r10, lsl #4 //r10<- 0xaaaaaaaa
	ldr.w r1, [sp, #12] //load loop counter in r1 (rounds, saved r3 at sp+12)
	eor r11, r2, r6 //tk2 ^ tk3 (1st word)
	eor r12, r3, r7 //tk2 ^ tk3 (2nd word)
	strd r11, r12, [r0], #8 //store in tk
	eor r11, r4, r8 //tk2 ^ tk3 (3rd word)
	eor r12, r5, r9 //tk2 ^ tk3 (4th word)
	strd r11, r12, [r0], #8 //store in tk
loop: //each iteration processes 8 rounds (4 LFSR steps of each tweakey)
	and r12, r8, r10 // --- apply LFSR2 to tk2
	eor r12, r12, r6
	and r14, r10, r12, lsl #1
	and r12, r12, r10
	orr r6, r14, r12, lsr #1 // apply LFSR2 to tk2 ---
	and r12, r3, r10 // --- apply LFSR3 to tk3
	eor r12, r5, r12, lsr #1
	and r14, r10, r12, lsl #1
	and r12, r12, r10
	orr r5, r14, r12, lsr #1 // apply LFSR3 to tk3 ---
	eor r11, r5, r7 //tk2 ^ tk3 (1st word)
	eor r12, r2, r8 //tk2 ^ tk3 (2nd word)
	strd r11, r12, [r0], #8 //store in tk
	eor r11, r3, r9 //tk2 ^ tk3 (3rd word)
	eor r12, r4, r6 //tk2 ^ tk3 (4th word)
	strd r11, r12, [r0], #24 //store in tk, skip the 16-byte gap
	and r12, r9, r10 // --- apply LFSR2 to tk2
	eor r12, r12, r7
	and r14, r10, r12, lsl #1
	and r12, r12, r10
	orr r7, r14, r12, lsr #1 // apply LFSR2 to tk2 ---
	and r12, r2, r10 // --- apply LFSR3 to tk3
	eor r12, r4, r12, lsr #1
	and r14, r10, r12, lsl #1
	and r12, r12, r10
	orr r4, r14, r12, lsr #1 // apply LFSR3 to tk3 ---
	eor r11, r4, r8 //tk2 ^ tk3 (1st word)
	eor r12, r5, r9 //tk2 ^ tk3 (2nd word)
	strd r11, r12, [r0], #8 //store in tk
	eor r11, r2, r6 //tk2 ^ tk3 (3rd word)
	eor r12, r3, r7 //tk2 ^ tk3 (4th word)
	strd r11, r12, [r0], #24 //store in tk, skip the 16-byte gap
	and r12, r6, r10 // --- apply LFSR2 to tk2
	eor r12, r12, r8
	and r14, r10, r12, lsl #1
	and r12, r12, r10
	orr r8, r14, r12, lsr #1 // apply LFSR2 to tk2 ---
	and r12, r5, r10 // --- apply LFSR3 to tk3
	eor r12, r3, r12, lsr #1
	and r14, r10, r12, lsl #1
	and r12, r12, r10
	orr r3, r14, r12, lsr #1 // apply LFSR3 to tk3 ---
	eor r11, r3, r9 //tk2 ^ tk3 (1st word)
	eor r12, r4, r6 //tk2 ^ tk3 (2nd word)
	strd r11, r12, [r0], #8 //store in tk
	eor r11, r5, r7 //tk2 ^ tk3 (3rd word)
	eor r12, r2, r8 //tk2 ^ tk3 (4th word)
	strd r11, r12, [r0], #24 //store in tk, skip the 16-byte gap
	and r12, r7, r10 // --- apply LFSR2 to tk2
	eor r12, r12, r9
	and r14, r10, r12, lsl #1
	and r12, r12, r10
	orr r9, r14, r12, lsr #1 // apply LFSR2 to tk2 ---
	and r12, r4, r10 // --- apply LFSR3 to tk3
	eor r12, r2, r12, lsr #1
	and r14, r10, r12, lsl #1
	and r12, r12, r10
	orr r2, r14, r12, lsr #1 // apply LFSR3 to tk3 ---
	eor r11, r2, r6 //tk2 ^ tk3 (1st word)
	eor r12, r3, r7 //tk2 ^ tk3 (2nd word)
	strd r11, r12, [r0], #8 //store in tk
	eor r11, r4, r8 //tk2 ^ tk3 (3rd word)
	eor r12, r5, r9 //tk2 ^ tk3 (4th word)
	strd r11, r12, [r0], #24 //store in tk, skip the 16-byte gap
	subs.w r1, r1, #8 //decrease loop counter by 8 (rounds per iteration)
	bne loop
	pop {r0-r12, r14}
	bx lr
/******************************************************************************
* Applies the permutation P and add the round constants to all round tweakeys.
******************************************************************************/
@ void tkschedule_perm(u32* tk)
.global tkschedule_perm
.type tkschedule_perm,%function
.align 2
tkschedule_perm:
push {r0-r12, lr}
sub.w sp, #4 //to store r14 in subroutines
ldm r0, {r6-r9} //load tk
movw r10, #0xf0f0
movt r10, #0xf0f0 //r10<- 0xf0f0f0f0
and r6, r6, r10 //tk &= 0xf0f0f0f0 (1st word)
and r7, r7, r10 //tk &= 0xf0f0f0f0 (2nd word)
and r8, r8, r10 //tk &= 0xf0f0f0f0 (3rd word)
and r9, r9, r10 //tk &= 0xf0f0f0f0 (4th word)
eor r8, r8, #0x00000004 //add rconst
eor r9, r9, #0x00000040 //add rconst
mvn r9, r9 //to remove a NOT in sbox calculations
strd r8, r9, [r0], #8 //store 1st half tk for 1st round
strd r6, r7, [r0], #8 //store 2nd half tk for 1st round
ldm r0, {r6-r9} //load tk
bl p2 //apply the permutation twice
movw r10, #0xc3c3
movt r10, #0xc3c3 //r10<- 0xc3c3c3c3
and r11, r10, r6, ror #26 //ror and mask to match fixslicing
and r12, r10, r7, ror #26 //ror and mask to match fixslicing
strd r11, r12, [r0], #8 //store 1st half tk for 2nd round
and r11, r10, r8, ror #26 //ror and mask to match fixslicing
and r12, r10, r9, ror #26 //ror and mask to match fixslicing
eor r11, r11, #0x10000000 //add rconst
eor r11, r11, #0x00000100 //add rconst
eor r12, r12, #0x00000100 //add rconst
mvn r12, r12 //to save a NOT in sbox calculations
strd r11, r12, [r0], #8 //store 2nd half tk for 2nd round
and r10, r10, r10, lsr #6 //r10<- 0x03030303
and r11, r10, r6, ror #28 //--- ror and masks to match fixslicing
and r6, r6, r10, lsl #6
orr r6, r11, r6, ror #12
and r11, r10, r7, ror #28
and r7, r7, r10, lsl #6
orr r7, r11, r7, ror #12
and r11, r10, r8, ror #28
and r8, r8, r10, lsl #6
orr r8, r11, r8, ror #12
and r11, r10, r9, ror #28
and r9, r9, r10, lsl #6
orr r9, r11, r9, ror #12 //ror and masks to match fixslicing ---
eor r7, r7, #0x04000000 //add rconst
eor r8, r8, #0x44000000 //add rconst
eor r9, r9, #0x04000000 //add rconst
mvn r9, r9 //to save a NOT in sbox calculations
strd r8, r9, [r0], #8 //store 1st half tk for 3rd round
strd r6, r7, [r0], #8 //store 2nd half tk for 3rd round
ldm r0, {r6-r9} //load tk
bl p4 //apply the permutation 4 times
movw r10, #0xf0f0
movt r10, #0xf0f0 //r10<- 0xf0f0f0f0
and r11, r10, r6, ror #16 //ror and mask to match fixslicing
and r12, r10, r7, ror #16 //ror and mask to match fixslicing
eor r11, r11, #0x00400000 //add rconst
eor r12, r12, #0x00400000 //add rconst
strd r11, r12, [r0, #24] //store 2nd half tk for 5th round
and r11, r10, r8, ror #16 //ror and mask to match fixslicing
and r12, r10, r9, ror #16 //ror and mask to match fixslicing
eor r11, r11, #0x00440000 //add rconst
eor r12, r12, #0x00500000 //add rconst
mvn r12, r12 //to save a NOT in sbox calculations
strd r11, r12, [r0, #16] //store 1st half tk for 5th round
and r10, r10, r10, lsr #2 //r10<- 0x30303030
and r11, r10, r6, ror #14 //--- ror and masks to match fixslicing
and r6, r6, r10, ror #4
orr r6, r11, r6, ror #6
and r11, r10, r7, ror #14
and r7, r7, r10, ror #4
orr r7, r11, r7, ror #6
and r11, r10, r8, ror #14
and r8, r8, r10, ror #4
orr r8, r11, r8, ror #6
and r11, r10, r9, ror #14
and r9, r9, r10, ror #4
orr r9, r11, r9, ror #6 //ror and masks to match fixslicing ---
eor r6, r6, #0x00100000 //add rconst
eor r7, r7, #0x00100000 //add rconst
eor r8, r8, #0x00100000 //add rconst
eor r8, r8, #0x00000001 //add rconst
eor r9, r9, #0x00100000 //add rconst
mvn r9, r9 //to save a NOT in sbox calculations
strd r6, r7, [r0], #8 //store 1st half tk for 4th round
strd r8, r9, [r0], #24 //store 2nd half tk for 4th round
ldm r0, {r6-r9} //load tk
bl p6 //apply the permutation 6 times
movw r10, #0xc3c3
movt r10, #0xc3c3 //r10<- 0xc3c3c3c3
and r11, r10, r6, ror #10 //ror and mask to match fixslicing
and r12, r10, r7, ror #10 //ror and mask to match fixslicing
eor r11, r11, #0x01000000 //add rconst
eor r12, r12, #0x01000000 //add rconst
strd r11, r12, [r0], #8 //store 1st half tk for 6th round
and r11, r10, r8, ror #10 //ror and mask to match fixslicing
and r12, r10, r9, ror #10 //ror and mask to match fixslicing
eor r11, r11, #0x01400000 //add rconst
eor r11, r11, #0x00001000 //add rconst
eor r12, r12, #0x00400000 //add rconst
mvn r12, r12 //to save a NOT in sbox calculations
strd r11, r12, [r0], #8 //store 2nd half tk for 6th round
and r10, r10, r10, lsr #6 //r10<- 0x03030303
and r11, r10, r6, ror #12 //--- ror and masks to match fixslicing
and r6, r6, r10, lsl #6
orr r6, r11, r6, ror #28
and r11, r10, r7, ror #12
and r7, r7, r10, lsl #6
orr r7, r11, r7, ror #28
and r11, r10, r8, ror #12
and r8, r8, r10, lsl #6
orr r8, r11, r8, ror #28
and r11, r10, r9, ror #12
and r9, r9, r10, lsl #6
orr r9, r11, r9, ror #28 //ror and masks to match fixslicing ---
eor r6, r6, #0x00000400 //add rconst
eor r7, r7, #0x00000400 //add rconst
eor r8, r8, #0x01000000 //add rconst
eor r8, r8, #0x00004000 //add rconst
eor r9, r9, #0x01000000 //add rconst
eor r9, r9, #0x00000400 //add rconst
mvn r9, r9 //to save a NOT in sbox calculations
strd r8, r9, [r0], #8 //store 1st half tk for 7th round
strd r6, r7, [r0], #8 //store 2nd half tk for 7th round
ldm r0, {r6-r9} //load tk
bl p8 //apply the permutation 8 times
movw r10, #0xf0f0
movt r10, #0xf0f0 //r10<- 0xf0f0f0f0
and r11, r10, r6 //ror and mask to match fixslicing
and r12, r10, r7 //ror and mask to match fixslicing
eor r12, r12, #0x00000040 //add rconst
strd r11, r12, [r0, #24] //store 2nd half tk for 9th round
and r11, r10, r8 //ror and mask to match fixslicing
and r12, r10, r9 //ror and mask to match fixslicing
eor r11, r11, #0x00000054 //add rconst
eor r12, r12, #0x00000050 //add rconst
mvn r12, r12 //to save a NOT in sbox calculations
strd r11, r12, [r0, #16] //store 1st half tk for 9th round
and r10, r10, r10, lsr #2 //r10<- 0x30303030
and r11, r10, r6, ror #30 //--- ror and masks to match fixslicing
and r6, r6, r10, ror #4
orr r6, r11, r6, ror #22
and r11, r10, r7, ror #30
and r7, r7, r10, ror #4
orr r7, r11, r7, ror #22
and r11, r10, r8, ror #30
and r8, r8, r10, ror #4
orr r8, r11, r8, ror #22
and r11, r10, r9, ror #30
and r9, r9, r10, ror #4
orr r9, r11, r9, ror #22 //ror and masks to match fixslicing ---
eor r6 ,r6, #0x00000010
eor r8, r8, #0x00010000
eor r8, r8, #0x00000410
eor r9, r9, #0x00000410
mvn r9, r9 //to save a NOT in sbox calculations
strd r6, r7, [r0], #8 //store 1st half tk for 8th round
strd r8, r9, [r0], #24 //store 2nd half tk for 8th round
ldm r0, {r6-r9} //load tk
bl p10 //apply the permutation 10 times
movw r10, #0xc3c3
movt r10, #0xc3c3 //r10<- 0xc3c3c3c3
and r11, r10, r6, ror #26 //ror and mask to match fixslicing
and r12, r10, r7, ror #26 //ror and mask to match fixslicing
eor r11, r11, #0x00000100 //add rconst
eor r12, r12, #0x00000100 //add rconst
strd r11, r12, [r0], #8 //store 1st half tk for 10th round
and r11, r10, r8, ror #26 //ror and mask to match fixslicing
and r12, r10, r9, ror #26 //ror and mask to match fixslicing
eor r11, r11, #0x10000000 //add rconst
eor r11, r11, #0x00000140 //add rconst
eor r12, r12, #0x00000100 //add rconst
mvn r12, r12 //to save a NOT in sbox calculations
strd r11, r12, [r0], #8 //store 2nd half tk for 10th round
and r10, r10, r10, lsr #6 //r10<- 0x03030303
and r11, r10, r6, ror #28 //--- ror and masks to match fixslicing
and r6, r6, r10, lsl #6
orr r6, r11, r6, ror #12
and r11, r10, r7, ror #28
and r7, r7, r10, lsl #6
orr r7, r11, r7, ror #12
and r11, r10, r8, ror #28
and r8, r8, r10, lsl #6
orr r8, r11, r8, ror #12
and r11, r10, r9, ror #28
and r9, r9, r10, lsl #6
orr r9, r11, r9, ror #12 //ror and masks to match fixslicing ---
eor r6, r6, #0x04000000 //add rconst
eor r7, r7, #0x04000000 //add rconst
eor r8, r8, #0x44000000 //add rconst
eor r9, r9, #0x00000100 //add rconst
mvn r9, r9 //to save a NOT in sbox calculations
strd r8, r9, [r0], #8 //store 1st half tk for 11th round
strd r6, r7, [r0], #8 //store 2nd half tk for 11th round
ldm r0, {r6-r9} //load tk
bl p12 //apply the permutation 12 times
movw r10, #0xf0f0
movt r10, #0xf0f0 //r10<- 0xf0f0f0f0
and r11, r10, r6, ror #16 //ror and mask to match fixslicing
and r12, r10, r7, ror #16 //ror and mask to match fixslicing
eor r11, r11, #0x00400000 //add rconst
strd r11, r12, [r0, #24] //store 2nd half tk for 13th round
and r11, r10, r8, ror #16 //ror and mask to match fixslicing
and r12, r10, r9, ror #16 //ror and mask to match fixslicing
eor r11, r11, #0x00140000 //add rconst
eor r12, r12, #0x00500000 //add rconst
mvn r12, r12 //to save a NOT in sbox calculations
strd r11, r12, [r0, #16] //store 1st half tk for 13th round
and r10, r10, r10, lsr #2 //r10<- 0x30303030
and r11, r10, r6, ror #14 //--- ror and masks to match fixslicing
and r6, r6, r10, ror #4
orr r6, r11, r6, ror #6
and r11, r10, r7, ror #14
and r7, r7, r10, ror #4
orr r7, r11, r7, ror #6
and r11, r10, r8, ror #14
and r8, r8, r10, ror #4
orr r8, r11, r8, ror #6
and r11, r10, r9, ror #14
and r9, r9, r10, ror #4
orr r9, r11, r9, ror #6 //ror and masks to match fixslicing ---
eor r6, r6, #0x00100000 //add rconst
eor r7, r7, #0x00100000 //add rconst
eor r8, r8, #0x04000000 //add rconst
eor r8, r8, #0x00000001 //add rconst
eor r9, r9, #0x04000000 //add rconst
mvn r9, r9 //to save a NOT in sbox calculations
strd r6, r7, [r0], #8 //store 1st half tk for 12th round
strd r8, r9, [r0], #24 //store 2nd half tk for 12th round
ldm r0, {r6-r9} //load tk
bl p14 //apply the permutation 14 times
movw r10, #0xc3c3
movt r10, #0xc3c3 //r10<- 0xc3c3c3c3
and r11, r10, r6, ror #10 //ror and mask to match fixslicing
and r12, r10, r7, ror #10 //ror and mask to match fixslicing
strd r11, r12, [r0], #8 //store 1st half tk for 14th round
and r11, r10, r8, ror #10 //ror and mask to match fixslicing
and r12, r10, r9, ror #10 //ror and mask to match fixslicing
eor r11, r11, #0x01400000 //add rconst
eor r11, r11, #0x00001000 //add rconst
eor r12, r12, #0x01400000 //add rconst
mvn r12, r12 //to save a NOT in sbox calculations
strd r11, r12, [r0], #8 //store 2nd half tk for 14th round
and r10, r10, r10, lsr #6 //r10<- 0x03030303
and r11, r10, r6, ror #12 //--- ror and masks to match fixslicing
and r6, r6, r10, lsl #6
orr r6, r11, r6, ror #28
and r11, r10, r7, ror #12
and r7, r7, r10, lsl #6
orr r7, r11, r7, ror #28
and r11, r10, r8, ror #12
and r8, r8, r10, lsl #6
orr r8, r11, r8, ror #28
and r11, r10, r9, ror #12
and r9, r9, r10, lsl #6
orr r9, r11, r9, ror #28 //ror and masks to match fixslicing ---
eor r7, r7, #0x00000400 //add rconst
eor r8, r8, #0x01000000 //add rconst
eor r8, r8, #0x00004400 //add rconst
eor r9, r9, #0x00000400 //add rconst
mvn r9, r9 //to save a NOT in sbox calculations
strd r8, r9, [r0], #8 //store 1st half tk for 15th round
strd r6, r7, [r0], #8 //store 2nd half tk for 15th round
ldm r0, {r6-r9} //load tk
movw r10, #0xf0f0
movt r10, #0xf0f0 //r10<- 0xf0f0f0f0
and r11, r10, r6 //ror and mask to match fixslicing
and r12, r10, r7 //ror and mask to match fixslicing
eor r11, r11, #0x00000040 //add rconst
eor r12, r12, #0x00000040 //add rconst
strd r11, r12, [r0, #24] //store 2nd half tk for 17th round
and r11, r10, r8 //ror and mask to match fixslicing
and r12, r10, r9 //ror and mask to match fixslicing
eor r11, r11, #0x00000004 //add rconst
eor r12, r12, #0x00000050 //add rconst
mvn r12, r12 //to save a NOT in sbox calculations
strd r11, r12, [r0, #16] //store 1st half tk for 17th round
and r10, r10, r10, lsr #2 //r10<- 0x30303030
and r11, r10, r6, ror #30 //--- ror and masks to match fixslicing
and r6, r6, r10, ror #4
orr r6, r11, r6, ror #22
and r11, r10, r7, ror #30
and r7, r7, r10, ror #4
orr r7, r11, r7, ror #22
and r11, r10, r8, ror #30
and r8, r8, r10, ror #4
orr r8, r11, r8, ror #22
and r11, r10, r9, ror #30
and r9, r9, r10, ror #4
orr r9, r11, r9, ror #22 //ror and masks to match fixslicing ---
eor r6 ,r6, #0x00000010
eor r7 ,r7, #0x00000010
eor r8, r8, #0x00000010
eor r8, r8, #0x00010000
mvn r9, r9 //to save a NOT in sbox calculations
strd r6, r7, [r0], #8 //store 1st half tk for 16th round
strd r8, r9, [r0], #24 //store 2nd half tk for 16th round
ldm r0, {r6-r9} //load tk
bl p2 //apply the permutation twice
movw r10, #0xc3c3
movt r10, #0xc3c3 //r10<- 0xc3c3c3c3
and r11, r10, r6, ror #26 //ror and mask to match fixslicing
and r12, r10, r7, ror #26 //ror and mask to match fixslicing
eor r11, r11, #0x00000100 //add rconst
strd r11, r12, [r0], #8 //store 1st half tk for 18th round
and r11, r10, r8, ror #26 //ror and mask to match fixslicing
and r12, r10, r9, ror #26 //ror and mask to match fixslicing
eor r11, r11, #0x10000000 //add rconst
eor r11, r11, #0x00000140 //add rconst
eor r12, r12, #0x00000040 //add rconst
mvn r12, r12 //to save a NOT in sbox calculations
strd r11, r12, [r0], #8 //store 2nd half tk for 18th round
and r10, r10, r10, lsr #6 //r10<- 0x03030303
and r11, r10, r6, ror #28 //--- ror and masks to match fixslicing
and r6, r6, r10, lsl #6
orr r6, r11, r6, ror #12
and r11, r10, r7, ror #28
and r7, r7, r10, lsl #6
orr r7, r11, r7, ror #12
and r11, r10, r8, ror #28
and r8, r8, r10, lsl #6
orr r8, r11, r8, ror #12
and r11, r10, r9, ror #28
and r9, r9, r10, lsl #6
orr r9, r11, r9, ror #12 //ror and masks to match fixslicing ---
eor r7, r7, #0x04000000 //add rconst
eor r8, r8, #0x40000000 //add rconst
eor r8, r8, #0x00000100 //add rconst
eor r9, r9, #0x04000000 //add rconst
eor r9, r9, #0x00000100 //add rconst
mvn r9, r9 //to save a NOT in sbox calculations
strd r8, r9, [r0], #8 //store 1st half tk for 19th round
strd r6, r7, [r0], #8 //store 2nd half tk for 19th round
ldm r0, {r6-r9} //load tk
bl p4 //apply the permutation 4 times
movw r10, #0xf0f0
movt r10, #0xf0f0 //r10<- 0xf0f0f0f0
and r11, r10, r6, ror #16 //ror and mask to match fixslicing
and r12, r10, r7, ror #16 //ror and mask to match fixslicing
eor r12, r12, #0x00400000 //add rconst
strd r11, r12, [r0, #24] //store 2nd half tk for 21th round
and r11, r10, r8, ror #16 //ror and mask to match fixslicing
and r12, r10, r9, ror #16 //ror and mask to match fixslicing
eor r11, r11, #0x00440000 //add rconst
eor r12, r12, #0x00100000 //add rconst
mvn r12, r12 //to save a NOT in sbox calculations
strd r11, r12, [r0, #16] //store 1st half tk for 21th round
and r10, r10, r10, lsr #2 //r10<- 0x30303030
and r11, r10, r6, ror #14 //--- ror and masks to match fixslicing
and r6, r6, r10, ror #4
orr r6, r11, r6, ror #6
and r11, r10, r7, ror #14
and r7, r7, r10, ror #4
orr r7, r11, r7, ror #6
and r11, r10, r8, ror #14
and r8, r8, r10, ror #4
orr r8, r11, r8, ror #6
and r11, r10, r9, ror #14
and r9, r9, r10, ror #4
orr r9, r11, r9, ror #6 //ror and masks to match fixslicing ---
eor r6, r6, #0x00100000 //add rconst
eor r8, r8, #0x04100000 //add rconst
eor r8, r8, #0x00000001 //add rconst
eor r9, r9, #0x00100000 //add rconst
mvn r9, r9 //to save a NOT in sbox calculations
strd r6, r7, [r0], #8 //store 1st half tk for 20th round
strd r8, r9, [r0], #24 //store 2nd half tk for 20th round
ldm r0, {r6-r9} //load tk
bl p6 //apply the permutation 6 times
movw r10, #0xc3c3
movt r10, #0xc3c3 //r10<- 0xc3c3c3c3
and r11, r10, r6, ror #10 //ror and mask to match fixslicing
and r12, r10, r7, ror #10 //ror and mask to match fixslicing
eor r11, r11, #0x01000000 //add rconst
eor r12, r12, #0x01000000 //add rconst
strd r11, r12, [r0], #8 //store 1st half tk for 22th round
and r11, r10, r8, ror #10 //ror and mask to match fixslicing
and r12, r10, r9, ror #10 //ror and mask to match fixslicing
eor r11, r11, #0x00400000 //add rconst
eor r11, r11, #0x00001000 //add rconst
mvn r12, r12 //to save a NOT in sbox calculations
strd r11, r12, [r0], #8 //store 2nd half tk for 22th round
and r10, r10, r10, lsr #6 //r10<- 0x03030303
and r11, r10, r6, ror #12 //--- ror and masks to match fixslicing
and r6, r6, r10, lsl #6
orr r6, r11, r6, ror #28
and r11, r10, r7, ror #12
and r7, r7, r10, lsl #6
orr r7, r11, r7, ror #28
and r11, r10, r8, ror #12
and r8, r8, r10, lsl #6
orr r8, r11, r8, ror #28
and r11, r10, r9, ror #12
and r9, r9, r10, lsl #6
orr r9, r11, r9, ror #28 //ror and masks to match fixslicing ---
eor r6, r6, #0x00000400 //add rconst
eor r8, r8, #0x00004000 //add rconst
eor r9, r9, #0x01000000 //add rconst
mvn r9, r9 //to save a NOT in sbox calculations
strd r8, r9, [r0], #8 //store 1st half tk for 23th round
strd r6, r7, [r0], #8 //store 2nd half tk for 23th round
ldm r0, {r6-r9} //load tk
bl p8 //apply the permutation 8 times
movw r10, #0xf0f0
movt r10, #0xf0f0 //r10<- 0xf0f0f0f0
and r11, r10, r6 //ror and mask to match fixslicing
and r12, r10, r7 //ror and mask to match fixslicing
strd r11, r12, [r0, #24] //store 2nd half tk for 25th round
and r11, r10, r8 //ror and mask to match fixslicing
and r12, r10, r9 //ror and mask to match fixslicing
eor r11, r11, #0x00000014 //add rconst
eor r12, r12, #0x00000040 //add rconst
mvn r12, r12 //to save a NOT in sbox calculations
strd r11, r12, [r0, #16] //store 1st half tk for 25th round
and r10, r10, r10, lsr #2 //r10<- 0x30303030
and r11, r10, r6, ror #30 //--- ror and masks to match fixslicing
and r6, r6, r10, ror #4
orr r6, r11, r6, ror #22
and r11, r10, r7, ror #30
and r7, r7, r10, ror #4
orr r7, r11, r7, ror #22
and r11, r10, r8, ror #30
and r8, r8, r10, ror #4
orr r8, r11, r8, ror #22
and r11, r10, r9, ror #30
and r9, r9, r10, ror #4
orr r9, r11, r9, ror #22 //ror and masks to match fixslicing ---
eor r8, r8, #0x00010400
eor r9, r9, #0x00000400
mvn r9, r9 //to save a NOT in sbox calculations
strd r6, r7, [r0], #8 //store 1st half tk for 24th round
strd r8, r9, [r0], #24 //store 2nd half tk for 24th round
ldm r0, {r6-r9} //load tk
bl p10 //apply the permutation 10 times
movw r10, #0xc3c3
movt r10, #0xc3c3 //r10<- 0xc3c3c3c3
and r11, r10, r6, ror #26 //ror and mask to match fixslicing
and r12, r10, r7, ror #26 //ror and mask to match fixslicing
strd r11, r12, [r0], #8 //store 1st half tk for 26th round
and r11, r10, r8, ror #26 //ror and mask to match fixslicing
and r12, r10, r9, ror #26 //ror and mask to match fixslicing
eor r11, r11, #0x10000000 //add rconst
eor r11, r11, #0x00000100 //add rconst
mvn r12, r12 //to save a NOT in sbox calculations
strd r11, r12, [r0], #8 //store 2nd half tk for 26th round
and r10, r10, r10, lsr #6 //r10<- 0x03030303
and r11, r10, r6, ror #28 //--- ror and masks to match fixslicing
and r6, r6, r10, lsl #6
orr r6, r11, r6, ror #12
and r11, r10, r7, ror #28
and r7, r7, r10, lsl #6
orr r7, r11, r7, ror #12
and r11, r10, r8, ror #28
and r8, r8, r10, lsl #6
orr r8, r11, r8, ror #12
and r11, r10, r9, ror #28
and r9, r9, r10, lsl #6
orr r9, r11, r9, ror #12 //ror and masks to match fixslicing ---
eor r7, r7, #0x04000000 //add rconst
eor r8, r8, #0x40000000 //add rconst
eor r9, r9, #0x04000000 //add rconst
mvn r9, r9 //to save a NOT in sbox calculations
strd r8, r9, [r0], #8 //store 1st half tk for 27th round
strd r6, r7, [r0], #8 //store 2nd half tk for 27th round
ldm r0, {r6-r9} //load tk
bl p12 //apply the permutation 12 times
movw r10, #0xf0f0
movt r10, #0xf0f0 //r10<- 0xf0f0f0f0
and r11, r10, r6, ror #16 //ror and mask to match fixslicing
and r12, r10, r7, ror #16 //ror and mask to match fixslicing
eor r12, r12, #0x00400000 //add rconst
strd r11, r12, [r0, #24] //store 2nd half tk for 29th round
and r11, r10, r8, ror #16 //ror and mask to match fixslicing
and r12, r10, r9, ror #16 //ror and mask to match fixslicing
eor r11, r11, #0x00440000 //add rconst
eor r12, r12, #0x00500000 //add rconst
mvn r12, r12 //to save a NOT in sbox calculations
strd r11, r12, [r0, #16] //store 1st half tk for 29th round
and r10, r10, r10, lsr #2 //r10<- 0x30303030
and r11, r10, r6, ror #14 //--- ror and masks to match fixslicing
and r6, r6, r10, ror #4
orr r6, r11, r6, ror #6
and r11, r10, r7, ror #14
and r7, r7, r10, ror #4
orr r7, r11, r7, ror #6
and r11, r10, r8, ror #14
and r8, r8, r10, ror #4
orr r8, r11, r8, ror #6
and r11, r10, r9, ror #14
and r9, r9, r10, ror #4
orr r9, r11, r9, ror #6 //ror and masks to match fixslicing ---
eor r6, r6, #0x00100000 //add rconst
eor r8, r8, #0x00100000 //add rconst
eor r8, r8, #0x00000001 //add rconst
eor r9, r9, #0x00100000 //add rconst
mvn r9, r9 //to save a NOT in sbox calculations
strd r6, r7, [r0], #8 //store 1st half tk for 28th round
strd r8, r9, [r0], #24 //store 2nd half tk for 28th round
ldm r0, {r6-r9} //load tk
bl p14 //apply the permutation 14 times
movw r10, #0xc3c3
movt r10, #0xc3c3 //r10<- 0xc3c3c3c3
and r11, r10, r6, ror #10 //ror and mask to match fixslicing
and r12, r10, r7, ror #10 //ror and mask to match fixslicing
eor r11, r11, #0x01000000 //add rconst
eor r12, r12, #0x01000000 //add rconst
strd r11, r12, [r0], #8 //store 1st half tk for 30th round
and r11, r10, r8, ror #10 //ror and mask to match fixslicing
and r12, r10, r9, ror #10 //ror and mask to match fixslicing
eor r11, r11, #0x01400000 //add rconst
eor r11, r11, #0x00001000 //add rconst
mvn r12, r12 //to save a NOT in sbox calculations
strd r11, r12, [r0], #8 //store 2nd half tk for 30th round
and r10, r10, r10, lsr #6 //r10<- 0x03030303
and r11, r10, r6, ror #12 //--- ror and masks to match fixslicing
and r6, r6, r10, lsl #6
orr r6, r11, r6, ror #28
and r11, r10, r7, ror #12
and r7, r7, r10, lsl #6
orr r7, r11, r7, ror #28
and r11, r10, r8, ror #12
and r8, r8, r10, lsl #6
orr r8, r11, r8, ror #28
and r11, r10, r9, ror #12
and r9, r9, r10, lsl #6
orr r9, r11, r9, ror #28 //ror and masks to match fixslicing ---
eor r6, r6, #0x00000400 //add rconst
eor r7, r7, #0x00000400 //add rconst
eor r8, r8, #0x00004000 //add rconst
eor r9, r9, #0x01000000 //add rconst
mvn r9, r9 //to save a NOT in sbox calculations
strd r8, r9, [r0], #8 //store 1st half tk for 31th round
strd r6, r7, [r0], #8 //store 2nd half tk for 31th round
ldm r0, {r6-r9} //load tk
movw r10, #0xf0f0
movt r10, #0xf0f0 //r10<- 0xf0f0f0f0
and r11, r10, r6 //ror and mask to match fixslicing
and r12, r10, r7 //ror and mask to match fixslicing
strd r11, r12, [r0, #24] //store 2nd half tk for 33th round
and r11, r10, r8 //ror and mask to match fixslicing
and r12, r10, r9 //ror and mask to match fixslicing
eor r11, r11, #0x00000014 //add rconst
eor r12, r12, #0x00000050 //add rconst
mvn r12, r12 //to save a NOT in sbox calculations
strd r11, r12, [r0, #16] //store 1st half tk for 33th round
and r10, r10, r10, lsr #2 //r10<- 0x30303030
and r11, r10, r6, ror #30 //--- ror and masks to match fixslicing
and r6, r6, r10, ror #4
orr r6, r11, r6, ror #22
and r11, r10, r7, ror #30
and r7, r7, r10, ror #4
orr r7, r11, r7, ror #22
and r11, r10, r8, ror #30
and r8, r8, r10, ror #4
orr r8, r11, r8, ror #22
and r11, r10, r9, ror #30
and r9, r9, r10, ror #4
orr r9, r11, r9, ror #22 //ror and masks to match fixslicing ---
eor r6 ,r6, #0x00000010
eor r8, r8, #0x00010400
eor r9, r9, #0x00000400
mvn r9, r9 //to save a NOT in sbox calculations
strd r6, r7, [r0], #8 //store 1st half tk for 32th round
strd r8, r9, [r0], #24 //store 2nd half tk for 32th round
ldm r0, {r6-r9} //load tk
bl p2 //apply the permutation twice
movw r10, #0xc3c3
movt r10, #0xc3c3 //r10<- 0xc3c3c3c3
and r11, r10, r6, ror #26 //ror and mask to match fixslicing
and r12, r10, r7, ror #26 //ror and mask to match fixslicing
strd r11, r12, [r0], #8 //store 1st half tk for 34th round
and r11, r10, r8, ror #26 //ror and mask to match fixslicing
and r12, r10, r9, ror #26 //ror and mask to match fixslicing
eor r11, r11, #0x10000000 //add rconst
eor r11, r11, #0x00000140 //add rconst
eor r12, r12, #0x00000100 //add rconst
mvn r12, r12 //to save a NOT in sbox calculations
strd r11, r12, [r0], #8 //store 2nd half tk for 34th round
and r10, r10, r10, lsr #6 //r10<- 0x03030303
and r11, r10, r6, ror #28 //--- ror and masks to match fixslicing
and r6, r6, r10, lsl #6
orr r6, r11, r6, ror #12
and r11, r10, r7, ror #28
and r7, r7, r10, lsl #6
orr r7, r11, r7, ror #12
and r11, r10, r8, ror #28
and r8, r8, r10, lsl #6
orr r8, r11, r8, ror #12
and r11, r10, r9, ror #28
and r9, r9, r10, lsl #6
orr r9, r11, r9, ror #12 //ror and masks to match fixslicing ---
eor r7, r7, #0x04000000 //add rconst
eor r8, r8, #0x44000000 //add rconst
mvn r9, r9 //to save a NOT in sbox calculations
strd r8, r9, [r0], #8 //store 1st half tk for 35th round
strd r6, r7, [r0], #8 //store 2nd half tk for 35th round
ldm r0, {r6-r9} //load tk
bl p4 //apply the permutation 4 times
movw r10, #0xf0f0
movt r10, #0xf0f0 //r10<- 0xf0f0f0f0
and r11, r10, r6, ror #16 //ror and mask to match fixslicing
and r12, r10, r7, ror #16 //ror and mask to match fixslicing
eor r11, r11, #0x00400000 //add rconst
strd r11, r12, [r0, #24] //store 2nd half tk for 37th round
and r11, r10, r8, ror #16 //ror and mask to match fixslicing
and r12, r10, r9, ror #16 //ror and mask to match fixslicing
eor r11, r11, #0x00440000 //add rconst
eor r12, r12, #0x00500000 //add rconst
mvn r12, r12 //to save a NOT in sbox calculations
strd r11, r12, [r0, #16] //store 1st half tk for 37th round
and r10, r10, r10, lsr #2 //r10<- 0x30303030
and r11, r10, r6, ror #14 //--- ror and masks to match fixslicing
and r6, r6, r10, ror #4
orr r6, r11, r6, ror #6
and r11, r10, r7, ror #14
and r7, r7, r10, ror #4
orr r7, r11, r7, ror #6
and r11, r10, r8, ror #14
and r8, r8, r10, ror #4
orr r8, r11, r8, ror #6
and r11, r10, r9, ror #14
and r9, r9, r10, ror #4
orr r9, r11, r9, ror #6 //ror and masks to match fixslicing ---
eor r6, r6, #0x00100000 //add rconst
eor r7, r7, #0x00100000 //add rconst
eor r8, r8, #0x00000001 //add rconst
eor r9, r9, #0x00100000 //add rconst
mvn r9, r9 //to save a NOT in sbox calculations
strd r6, r7, [r0], #8 //store 1st half tk for 36th round
strd r8, r9, [r0], #24 //store 2nd half tk for 36th round
ldm r0, {r6-r9} //load tk
bl p6 //apply the permutation 6 times
movw r10, #0xc3c3
movt r10, #0xc3c3 //r10<- 0xc3c3c3c3
and r11, r10, r6, ror #10 //ror and mask to match fixslicing
and r12, r10, r7, ror #10 //ror and mask to match fixslicing
eor r12, r12, #0x01000000 //add rconst
strd r11, r12, [r0], #8 //store 1st half tk for 38th round
and r11, r10, r8, ror #10 //ror and mask to match fixslicing
and r12, r10, r9, ror #10 //ror and mask to match fixslicing
eor r11, r11, #0x01400000 //add rconst
eor r11, r11, #0x00001000 //add rconst
eor r12, r12, #0x00400000 //add rconst
mvn r12, r12 //to save a NOT in sbox calculations
strd r11, r12, [r0], #8 //store 2nd half tk for 38th round
and r10, r10, r10, lsr #6 //r10<- 0x03030303
and r11, r10, r6, ror #12 //--- ror and masks to match fixslicing
and r6, r6, r10, lsl #6
orr r6, r11, r6, ror #28
and r11, r10, r7, ror #12
and r7, r7, r10, lsl #6
orr r7, r11, r7, ror #28
and r11, r10, r8, ror #12
and r8, r8, r10, lsl #6
orr r8, r11, r8, ror #28
and r11, r10, r9, ror #12
and r9, r9, r10, lsl #6
orr r9, r11, r9, ror #28 //ror and masks to match fixslicing ---
eor r6, r6, #0x00000400 //add rconst
eor r7, r7, #0x00000400 //add rconst
eor r8, r8, #0x01000000
eor r8, r8, #0x00004000 //add rconst
eor r9, r9, #0x00000400 //add rconst
mvn r9, r9 //to save a NOT in sbox calculations
strd r8, r9, [r0], #8 //store 1st half tk for 39th round
strd r6, r7, [r0], #8 //store 2nd half tk for 39th round
ldm r0, {r6-r9} //load tk
bl p8 //apply the permutation 8 times
movw r10, #0xf0f0
movt r10, #0xf0f0 //r10<- 0xf0f0f0f0
and r11, r10, r6 //ror and mask to match fixslicing
and r12, r10, r7 //ror and mask to match fixslicing
eor r12, r12, #0x00000040 //add rconst
strd r11, r12, [r0, #24] //store 2nd half tk for 41th round
and r11, r10, r8 //ror and mask to match fixslicing
and r12, r10, r9 //ror and mask to match fixslicing
eor r11, r11, #0x00000014 //add rconst
eor r12, r12, #0x00000010 //add rconst
mvn r12, r12 //to save a NOT in sbox calculations
strd r11, r12, [r0, #16] //store 1st half tk for 41th round
and r10, r10, r10, lsr #2 //r10<- 0x30303030
and r11, r10, r6, ror #30 //--- ror and masks to match fixslicing
and r6, r6, r10, ror #4
orr r6, r11, r6, ror #22
and r11, r10, r7, ror #30
and r7, r7, r10, ror #4
orr r7, r11, r7, ror #22
and r11, r10, r8, ror #30
and r8, r8, r10, ror #4
orr r8, r11, r8, ror #22
and r11, r10, r9, ror #30
and r9, r9, r10, ror #4
orr r9, r11, r9, ror #22 //ror and masks to match fixslicing ---
eor r6, r6, #0x00000010
eor r8, r8, #0x00010000
eor r8, r8, #0x00000010
eor r9, r9, #0x00000400
mvn r9, r9 //to save a NOT in sbox calculations
strd r6, r7, [r0], #8 //store 1st half tk for 40th round
strd r8, r9, [r0], #24 //store 2nd half tk for 40th round
ldm r0, {r6-r9} //load tk
bl p10 //apply the permutation 10 times
movw r10, #0xc3c3
movt r10, #0xc3c3 //r10<- 0xc3c3c3c3
and r11, r10, r6, ror #26 //ror and mask to match fixslicing
and r12, r10, r7, ror #26 //ror and mask to match fixslicing
eor r11, r11, #0x00000100 //add rconst
strd r11, r12, [r0], #8 //store 1st half tk for 42th round
and r11, r10, r8, ror #26 //ror and mask to match fixslicing
and r12, r10, r9, ror #26 //ror and mask to match fixslicing
eor r11, r11, #0x10000000 //add rconst
eor r11, r11, #0x00000040 //add rconst
eor r12, r12, #0x00000100 //add rconst
mvn r12, r12 //to save a NOT in sbox calculations
strd r11, r12, [r0], #8 //store 2nd half tk for 42th round
and r10, r10, r10, lsr #6 //r10<- 0x03030303
and r11, r10, r6, ror #28 //--- ror and masks to match fixslicing
and r6, r6, r10, lsl #6
orr r6, r11, r6, ror #12
and r11, r10, r7, ror #28
and r7, r7, r10, lsl #6
orr r7, r11, r7, ror #12
and r11, r10, r8, ror #28
and r8, r8, r10, lsl #6
orr r8, r11, r8, ror #12
and r11, r10, r9, ror #28
and r9, r9, r10, lsl #6
orr r9, r11, r9, ror #12 //ror and masks to match fixslicing ---
eor r8, r8, #0x44000000 //add rconst
eor r9, r9, #0x00000100 //add rconst
mvn r9, r9 //to save a NOT in sbox calculations
strd r8, r9, [r0], #8 //store 1st half tk for 43th round
strd r6, r7, [r0], #8 //store 2nd half tk for 43th round
ldm r0, {r6-r9} //load tk
bl p12 //apply the permutation 12 times
movw r10, #0xf0f0
movt r10, #0xf0f0 //r10<- 0xf0f0f0f0
and r11, r10, r6, ror #16 //ror and mask to match fixslicing
and r12, r10, r7, ror #16 //ror and mask to match fixslicing
eor r11, r11, #0x00400000 //add rconst
strd r11, r12, [r0, #24] //store 2nd half tk for 45th round
and r11, r10, r8, ror #16 //ror and mask to match fixslicing
and r12, r10, r9, ror #16 //ror and mask to match fixslicing
eor r11, r11, #0x00040000 //add rconst
mvn r12, r12 //to save a NOT in sbox calculations
strd r11, r12, [r0, #16] //store 1st half tk for 45th round
and r10, r10, r10, lsr #2 //r10<- 0x30303030
and r11, r10, r6, ror #14 //--- ror and masks to match fixslicing
and r6, r6, r10, ror #4
orr r6, r11, r6, ror #6
and r11, r10, r7, ror #14
and r7, r7, r10, ror #4
orr r7, r11, r7, ror #6
and r11, r10, r8, ror #14
and r8, r8, r10, ror #4
orr r8, r11, r8, ror #6
and r11, r10, r9, ror #14
and r9, r9, r10, ror #4
orr r9, r11, r9, ror #6 //ror and masks to match fixslicing ---
eor r7, r7, #0x00100000 //add rconst
eor r8, r8, #0x04000000 //add rconst
eor r8, r8, #0x00000001 //add rconst
mvn r9, r9 //to save a NOT in sbox calculations
strd r6, r7, [r0], #8 //store 1st half tk for 44th round
strd r8, r9, [r0], #24 //store 2nd half tk for 44th round
ldm r0, {r6-r9} //load tk
bl p14 //apply the permutation 14 times
movw r10, #0xc3c3
movt r10, #0xc3c3 //r10<- 0xc3c3c3c3
and r11, r10, r6, ror #10 //ror and mask to match fixslicing
and r12, r10, r7, ror #10 //ror and mask to match fixslicing
strd r11, r12, [r0], #8 //store 1st half tk for 46th round
and r11, r10, r8, ror #10 //ror and mask to match fixslicing
and r12, r10, r9, ror #10 //ror and mask to match fixslicing
eor r11, r11, #0x00001000 //add rconst
eor r12, r12, #0x01400000 //add rconst
mvn r12, r12 //to save a NOT in sbox calculations
strd r11, r12, [r0], #8 //store 2nd half tk for 46th round
and r10, r10, r10, lsr #6 //r10<- 0x03030303
and r11, r10, r6, ror #12 //--- ror and masks to match fixslicing
and r6, r6, r10, lsl #6
orr r6, r11, r6, ror #28
and r11, r10, r7, ror #12
and r7, r7, r10, lsl #6
orr r7, r11, r7, ror #28
and r11, r10, r8, ror #12
and r8, r8, r10, lsl #6
orr r8, r11, r8, ror #28
and r11, r10, r9, ror #12
and r9, r9, r10, lsl #6
orr r9, r11, r9, ror #28 //ror and masks to match fixslicing ---
eor r8, r8, #0x01000000 //add rconst
eor r8, r8, #0x00004400 //add rconst
mvn r9, r9 //to save a NOT in sbox calculations
strd r8, r9, [r0], #8 //store 1st half tk for 47th round
strd r6, r7, [r0], #8 //store 2nd half tk for 47th round
ldm r0, {r6-r9} //load tk
movw r10, #0xf0f0
movt r10, #0xf0f0 //r10<- 0xf0f0f0f0
and r11, r10, r6 //ror and mask to match fixslicing
and r12, r10, r7 //ror and mask to match fixslicing
eor r11, r11, #0x00000040 //add rconst
strd r11, r12, [r0, #24] //store 2nd half tk for 49th round
and r11, r10, r8 //ror and mask to match fixslicing
and r12, r10, r9 //ror and mask to match fixslicing
eor r11, r11, #0x00000004 //add rconst
eor r12, r12, #0x00000040 //add rconst
mvn r12, r12 //to save a NOT in sbox calculations
strd r11, r12, [r0, #16] //store 1st half tk for 49th round
and r10, r10, r10, lsr #2 //r10<- 0x30303030
and r11, r10, r6, ror #30 //--- ror and masks to match fixslicing
and r6, r6, r10, ror #4
orr r6, r11, r6, ror #22
and r11, r10, r7, ror #30
and r7, r7, r10, ror #4
orr r7, r11, r7, ror #22
and r11, r10, r8, ror #30
and r8, r8, r10, ror #4
orr r8, r11, r8, ror #22
and r11, r10, r9, ror #30
and r9, r9, r10, ror #4
orr r9, r11, r9, ror #22 //ror and masks to match fixslicing ---
eor r7 ,r7, #0x00000010
eor r8, r8, #0x00010000
mvn r9, r9 //to save a NOT in sbox calculations
strd r6, r7, [r0], #8 //store 1st half tk for 48th round
strd r8, r9, [r0], #24 //store 2nd half tk for 48th round
ldm r0, {r6-r9} //load tk
bl p2 //apply the permutation twice
movw r10, #0xc3c3
movt r10, #0xc3c3 //r10<- 0xc3c3c3c3
and r11, r10, r6, ror #26 //ror and mask to match fixslicing
and r12, r10, r7, ror #26 //ror and mask to match fixslicing
strd r11, r12, [r0], #8 //store 1st half tk for 50th round
and r11, r10, r8, ror #26 //ror and mask to match fixslicing
and r12, r10, r9, ror #26 //ror and mask to match fixslicing
eor r11, r11, #0x10000000 //add rconst
eor r11, r11, #0x00000100 //add rconst
eor r12, r12, #0x00000140 //add rconst
mvn r12, r12 //to save a NOT in sbox calculations
strd r11, r12, [r0], #8 //store 2nd half tk for 50th round
and r10, r10, r10, lsr #6 //r10<- 0x03030303
and r11, r10, r6, ror #28 //--- ror and masks to match fixslicing
and r6, r6, r10, lsl #6
orr r6, r11, r6, ror #12
and r11, r10, r7, ror #28
and r7, r7, r10, lsl #6
orr r7, r11, r7, ror #12
and r11, r10, r8, ror #28
and r8, r8, r10, lsl #6
orr r8, r11, r8, ror #12
and r11, r10, r9, ror #28
and r9, r9, r10, lsl #6
orr r9, r11, r9, ror #12 //ror and masks to match fixslicing ---
eor r7, r7, #0x04000000 //add rconst
eor r8, r8, #0x44000000 //add rconst
eor r8, r8, #0x00000100 //add rconst
mvn r9, r9 //to save a NOT in sbox calculations
strd r8, r9, [r0], #8 //store 1st half tk for 51th round
strd r6, r7, [r0], #8 //store 2nd half tk for 51th round
ldm r0, {r6-r9} //load tk
bl p4 //apply the permutation 4 times
movw r10, #0xf0f0
movt r10, #0xf0f0 //r10<- 0xf0f0f0f0
and r11, r10, r6, ror #16 //ror and mask to match fixslicing
and r12, r10, r7, ror #16 //ror and mask to match fixslicing
eor r11, r11, #0x00400000 //add rconst
strd r11, r12, [r0, #24] //store 2nd half tk for 53th round
and r11, r10, r8, ror #16 //ror and mask to match fixslicing
and r12, r10, r9, ror #16 //ror and mask to match fixslicing
eor r11, r11, #0x00040000 //add rconst
eor r12, r12, #0x00500000 //add rconst
mvn r12, r12 //to save a NOT in sbox calculations
strd r11, r12, [r0, #16] //store 1st half tk for 53th round
and r10, r10, r10, lsr #2 //r10<- 0x30303030
and r11, r10, r6, ror #14 //--- ror and masks to match fixslicing
and r6, r6, r10, ror #4
orr r6, r11, r6, ror #6
and r11, r10, r7, ror #14
and r7, r7, r10, ror #4
orr r7, r11, r7, ror #6
and r11, r10, r8, ror #14
and r8, r8, r10, ror #4
orr r8, r11, r8, ror #6
and r11, r10, r9, ror #14
and r9, r9, r10, ror #4
orr r9, r11, r9, ror #6 //ror and masks to match fixslicing ---
eor r6, r6, #0x00100000 //add rconst
eor r7, r7, #0x00100000 //add rconst
eor r8, r8, #0x00000001 //add rconst
mvn r9, r9 //to save a NOT in sbox calculations
strd r6, r7, [r0], #8 //store 1st half tk for 52th round
strd r8, r9, [r0], #24 //store 2nd half tk for 52th round
ldm r0, {r6-r9} //load tk
bl p6 //apply the permutation 6 times
movw r10, #0xc3c3
movt r10, #0xc3c3 //r10<- 0xc3c3c3c3
and r11, r10, r6, ror #10 //ror and mask to match fixslicing
and r12, r10, r7, ror #10 //ror and mask to match fixslicing
strd r11, r12, [r0], #8 //store 1st half tk for 54th round
and r11, r10, r8, ror #10 //ror and mask to match fixslicing
and r12, r10, r9, ror #10 //ror and mask to match fixslicing
eor r11, r11, #0x01400000 //add rconst
eor r11, r11, #0x00001000 //add rconst
eor r12, r12, #0x00400000 //add rconst
mvn r12, r12 //to save a NOT in sbox calculations
strd r11, r12, [r0], #8 //store 2nd half tk for 54th round
and r10, r10, r10, lsr #6 //r10<- 0x03030303
and r11, r10, r6, ror #12 //--- ror and masks to match fixslicing
and r6, r6, r10, lsl #6
orr r6, r11, r6, ror #28
and r11, r10, r7, ror #12
and r7, r7, r10, lsl #6
orr r7, r11, r7, ror #28
and r11, r10, r8, ror #12
and r8, r8, r10, lsl #6
orr r8, r11, r8, ror #28
and r11, r10, r9, ror #12
and r9, r9, r10, lsl #6
orr r9, r11, r9, ror #28 //ror and masks to match fixslicing ---
eor r7, r7, #0x00000400 //add rconst
eor r8, r8, #0x01000000
eor r8, r8, #0x00004000 //add rconst
eor r9, r9, #0x00000400 //add rconst
mvn r9, r9 //to save a NOT in sbox calculations
strd r8, r9, [r0], #8 //store 1st half tk for 55th round
strd r6, r7, [r0], #8 //store 2nd half tk for 55th round
ldm r0, {r6-r9} //load tk
bl p8 //apply the permutation 8 times
movw r10, #0x3030
movt r10, #0x3030 //r10<- 0x30303030
and r11, r10, r6, ror #30 //--- ror and masks to match fixslicing
and r6, r6, r10, ror #4
orr r6, r11, r6, ror #22
and r11, r10, r7, ror #30
and r7, r7, r10, ror #4
orr r7, r11, r7, ror #22
and r11, r10, r8, ror #30
and r8, r8, r10, ror #4
orr r8, r11, r8, ror #22
and r11, r10, r9, ror #30
and r9, r9, r10, ror #4
orr r9, r11, r9, ror #22 //ror and masks to match fixslicing ---
eor r6, r6, #0x00000010
eor r8, r8, #0x00010000
eor r8, r8, #0x00000010
mvn r9, r9 //to save a NOT in sbox calculations
strd r6, r7, [r0], #8 //store 1st half tk for 56th round
strd r8, r9, [r0], #24 //store 2nd half tk for 56th round
add.w sp, #4
pop {r0-r12, lr}
bx lr
/******************************************************************************
* Applies the permutations P^2, ..., P^14 for rounds 0 to 16. Since P^16=Id, we
* don't need more calculations as no LFSR is applied to TK1.
*
* Register contract:
*   r0: pointer to the output round-tweakey array derived from TK1 (fixsliced)
*   r1: pointer to the 16-byte TK1
* Calls the 'packing' and 'p2' subroutines defined elsewhere in this file.
* All registers are saved/restored via the push/pop pair.
******************************************************************************/
@ void tkschedule_perm_tk1(u32* tk, const u8* key)
.global tkschedule_perm_tk1
.type tkschedule_perm_tk1,%function
.align 2
tkschedule_perm_tk1:
push {r0-r12, lr}
ldr.w r3, [r1, #8] //load tk1 (3rd word)
ldr.w r4, [r1, #4] //load tk1 (2nd word)
ldr.w r5, [r1, #12] //load tk1 (4th word)
ldr.w r2, [r1] //load tk1 (1st word)
movw r10, #0x0a0a
movt r10, #0x0a0a //r10<- 0x0a0a0a0a (packing mask)
movw r11, #0x3030
movt r11, #0x3030 //r11<- 0x30303030 (packing mask)
bl packing //pack tk1
mov r6, r2 //move tk1 from r2-r5 to r6-r9
mov r7, r3 //move tk1 from r2-r5 to r6-r9
mov r8, r4 //move tk1 from r2-r5 to r6-r9
mov r9, r5 //move tk1 from r2-r5 to r6-r9
movw r2, #0xf0f0
movt r2, #0xf0f0 //r2<- 0xf0f0f0f0
and r11, r8, r2 //tk &= 0xf0f0f0f0 (3rd word)
and r12, r9, r2 //tk &= 0xf0f0f0f0 (4th word)
strd r11, r12, [r0], #8 //store 1st half tk for 1st round
and r11, r6, r2 //tk &= 0xf0f0f0f0 (1st word)
and r12, r7, r2 //tk &= 0xf0f0f0f0 (2nd word)
strd r11, r12, [r0], #8 //store 2nd half tk for 1st round
movw r3, #0x3030
movt r3, #0x3030 //r3 <- 0x30303030
and r11, r3, r6, ror #30 //--- ror and masks to match fixslicing
and r12, r6, r3, ror #4
orr r12, r11, r12, ror #22
str.w r12, [r0, #224]
and r11, r3, r7, ror #30
and r12, r7, r3, ror #4
orr r12, r11, r12, ror #22
str.w r12, [r0, #228]
and r11, r3, r8, ror #30
and r12, r8, r3, ror #4
orr r12, r11, r12, ror #22
str.w r12, [r0, #232]
and r11, r3, r9, ror #30
and r12, r9, r3, ror #4
orr r12, r11, r12, ror #22 //ror and masks to match fixslicing ---
str.w r12, [r0, #236]
bl p2 //apply the permutation twice (P^2 applied in total)
movw r3, #0xc3c3
movt r3, #0xc3c3 //r3 <- 0xc3c3c3c3
and r11, r3, r6, ror #26 //ror and mask to match fixslicing
and r12, r3, r7, ror #26 //ror and mask to match fixslicing
strd r11, r12, [r0], #8 //store 1st half tk for 2nd round
and r11, r3, r8, ror #26 //ror and mask to match fixslicing
and r12, r3, r9, ror #26 //ror and mask to match fixslicing
strd r11, r12, [r0], #8 //store 2nd half tk for 2nd round
and r3, r3, r3, lsr #6 //r3<- 0x03030303
and r11, r3, r6, ror #28 //--- ror and masks to match fixslicing
and r12, r6, r3, lsl #6
orr r12, r11, r12, ror #12
str.w r12, [r0, #8]
and r11, r3, r7, ror #28
and r12, r7, r3, lsl #6
orr r12, r11, r12, ror #12
str.w r12, [r0, #12]
and r11, r3, r9, ror #28
and r12, r9, r3, lsl #6
orr r12, r11, r12, ror #12
str.w r12, [r0, #4]
and r11, r3, r8, ror #28
and r12, r8, r3, lsl #6
orr r12, r11, r12, ror #12
str.w r12, [r0], #16 //ror and masks to match fixslicing ---
bl p2 //apply the permutation twice (P^4 applied in total)
lsl r3, r3, #4 //r3 <- 0x30303030
and r11, r3, r6, ror #14 //--- ror and masks to match fixslicing
and r12, r6, r3, ror #4
orr r12, r11, r12, ror #6
str.w r12, [r0], #4
and r11, r3, r7, ror #14
and r12, r7, r3, ror #4
orr r12, r11, r12, ror #6
str.w r12, [r0], #4
and r11, r3, r8, ror #14
and r12, r8, r3, ror #4
orr r12, r11, r12, ror #6
str.w r12, [r0], #4
and r11, r3, r9, ror #14
and r12, r9, r3, ror #4
orr r12, r11, r12, ror #6 //ror and masks to match fixslicing ---
str.w r12, [r0], #4
and r11, r2, r6, ror #16 //ror and mask to match fixslicing
and r12, r2, r7, ror #16 //ror and mask to match fixslicing
strd r11, r12, [r0, #8] //store 2nd half tk for 5th round
and r11, r2, r8, ror #16 //ror and mask to match fixslicing
and r12, r2, r9, ror #16 //ror and mask to match fixslicing
strd r11, r12, [r0], #16 //store 1st half tk for 5th round
bl p2 //apply the permutation twice (P^6 applied in total)
movw r3, #0xc3c3
movt r3, #0xc3c3 //r3<- 0xc3c3c3c3
and r11, r3, r6, ror #10 //ror and mask to match fixslicing
and r12, r3, r7, ror #10 //ror and mask to match fixslicing
strd r11, r12, [r0], #8 //store 1st half tk for 6th round
and r11, r3, r8, ror #10 //ror and mask to match fixslicing
and r12, r3, r9, ror #10 //ror and mask to match fixslicing
strd r11, r12, [r0], #8 //store 2nd half tk for 6th round
and r3, r3, r3, lsr #6 //r3<- 0x03030303
and r11, r3, r6, ror #12 //--- ror and masks to match fixslicing
and r12, r6, r3, lsl #6
orr r12, r11, r12, ror #28
str.w r12, [r0, #8]
and r11, r3, r7, ror #12
and r12, r7, r3, lsl #6
orr r12, r11, r12, ror #28
str.w r12, [r0, #12]
and r11, r3, r9, ror #12
and r12, r9, r3, lsl #6
orr r12, r11, r12, ror #28
str.w r12, [r0, #4]
and r11, r3, r8, ror #12
and r12, r8, r3, lsl #6
orr r12, r11, r12, ror #28
str.w r12, [r0], #16 //ror and masks to match fixslicing ---
bl p2 //apply the permutation twice (P^8 applied in total)
lsl r3, r3, #4 //r3 <- 0x30303030
and r11, r3, r6, ror #30 //--- ror and masks to match fixslicing
and r12, r6, r3, ror #4
orr r12, r11, r12, ror #22
str.w r12, [r0], #4
and r11, r3, r7, ror #30
and r12, r7, r3, ror #4
orr r12, r11, r12, ror #22
str.w r12, [r0], #4
and r11, r3, r8, ror #30
and r12, r8, r3, ror #4
orr r12, r11, r12, ror #22
str.w r12, [r0], #4
and r11, r3, r9, ror #30
and r12, r9, r3, ror #4
orr r12, r11, r12, ror #22 //ror and masks to match fixslicing ---
str.w r12, [r0], #4
and r11, r2, r6 //ror and mask to match fixslicing
and r12, r2, r7 //ror and mask to match fixslicing
strd r11, r12, [r0, #8] //store 2nd half tk for 9th round
and r11, r2, r8 //ror and mask to match fixslicing
and r12, r2, r9 //ror and mask to match fixslicing
strd r11, r12, [r0], #16 //store 1st half tk for 9th round
bl p2 //apply the permutation twice (P^10 applied in total)
movw r3, #0xc3c3
movt r3, #0xc3c3 //r3 <- 0xc3c3c3c3
and r11, r3, r6, ror #26 //ror and mask to match fixslicing
and r12, r3, r7, ror #26 //ror and mask to match fixslicing
strd r11, r12, [r0], #8 //store 1st half tk for 10th round
and r11, r3, r8, ror #26 //ror and mask to match fixslicing
and r12, r3, r9, ror #26 //ror and mask to match fixslicing
strd r11, r12, [r0], #8 //store 2nd half tk for 10th round
and r3, r3, r3, lsr #6 //r3 <- 0x03030303
and r11, r3, r6, ror #28 //--- ror and masks to match fixslicing
and r12, r6, r3, lsl #6
orr r12, r11, r12, ror #12
str.w r12, [r0, #8]
and r11, r3, r7, ror #28
and r12, r7, r3, lsl #6
orr r12, r11, r12, ror #12
str.w r12, [r0, #12]
and r11, r3, r9, ror #28
and r12, r9, r3, lsl #6
orr r12, r11, r12, ror #12
str.w r12, [r0, #4]
and r11, r3, r8, ror #28
and r12, r8, r3, lsl #6
orr r12, r11, r12, ror #12
str.w r12, [r0], #16 //ror and masks to match fixslicing ---
bl p2 //apply the permutation twice (P^12 applied in total)
lsl r3, r3, #4 //r3 <- 0x30303030
and r11, r3, r6, ror #14 //--- ror and masks to match fixslicing
and r12, r6, r3, ror #4
orr r12, r11, r12, ror #6
str.w r12, [r0], #4
and r11, r3, r7, ror #14
and r12, r7, r3, ror #4
orr r12, r11, r12, ror #6
str.w r12, [r0], #4
and r11, r3, r8, ror #14
and r12, r8, r3, ror #4
orr r12, r11, r12, ror #6
str.w r12, [r0], #4
and r11, r3, r9, ror #14
and r12, r9, r3, ror #4
orr r12, r11, r12, ror #6 //ror and masks to match fixslicing ---
str.w r12, [r0], #4
and r11, r2, r6, ror #16 //ror and mask to match fixslicing
and r12, r2, r7, ror #16 //ror and mask to match fixslicing
strd r11, r12, [r0, #8] //store 2nd half tk for 13th round
and r11, r2, r8, ror #16 //ror and mask to match fixslicing
and r12, r2, r9, ror #16 //ror and mask to match fixslicing
strd r11, r12, [r0], #16 //store 1st half tk for 13th round
bl p2 //apply the permutation twice (P^14 applied in total)
movw r3, #0xc3c3
movt r3, #0xc3c3 //r3 <- 0xc3c3c3c3
and r11, r3, r6, ror #10 //ror and mask to match fixslicing
and r12, r3, r7, ror #10 //ror and mask to match fixslicing
strd r11, r12, [r0], #8 //store 1st half tk for 14th round
and r11, r3, r8, ror #10 //ror and mask to match fixslicing
and r12, r3, r9, ror #10 //ror and mask to match fixslicing
strd r11, r12, [r0], #8 //store 2nd half tk for 14th round
and r3, r3, r3, lsr #6 //r3 <- 0x03030303
and r11, r3, r6, ror #12 //--- ror and masks to match fixslicing
and r12, r6, r3, lsl #6
orr r12, r11, r12, ror #28
str.w r12, [r0, #8]
and r11, r3, r7, ror #12
and r12, r7, r3, lsl #6
orr r12, r11, r12, ror #28
str.w r12, [r0, #12]
and r11, r3, r9, ror #12
and r12, r9, r3, lsl #6
orr r12, r11, r12, ror #28
str.w r12, [r0, #4]
and r11, r3, r8, ror #12
and r12, r8, r3, lsl #6
orr r12, r11, r12, ror #28
str.w r12, [r0], #16 //ror and masks to match fixslicing ---
pop {r0-r12, lr}
bx lr
/******************************************************************************
* Quadruple round of fixsliced SKINNY-128.
*
* Processes 4 cipher rounds on the packed state. Register contract:
*   r2-r5: packed 128-bit state (in/out)
*   r0:    pointer to rtk1 words (advanced by post-indexed ldmia)
*   r1:    pointer to rtk2_3 + round-constant words (advanced likewise)
*   r6:    0x55555555 (SWAPMOVE mask)   r7: 0x30303030 (mixcolumns mask)
* Clobbers r8-r11. Each round = S-box layer (SWAPMOVE-based), key/rconst
* addition, then one of the 4 fixsliced mixcolumns variants.
******************************************************************************/
.align 2
quadruple_round:
//--- round 1: S-box layer ---
orr r8, r2, r3
eor r5, r5, r8
mvn r5, r5
eor r8, r3, r4, lsr #1
and r8, r8, r6
eor r3, r3, r8
eor r4, r4, r8, lsl #1 //SWAPMOVE(r4, r3, 0x55555555, 1);
eor r8, r4, r5, lsr #1
and r8, r8, r6
eor r4, r4, r8
eor r5, r5, r8, lsl #1 //SWAPMOVE(r5, r4, 0x55555555, 1);
orr r8, r4, r5
eor r3, r3, r8
mvn r3, r3
eor r8, r2, r3, lsr #1
and r8, r8, r6
eor r2, r2, r8
eor r3, r3, r8, lsl #1 //SWAPMOVE(r3, r2, 0x55555555, 1);
eor r8, r5, r2, lsr #1
and r8, r8, r6
eor r5, r5, r8
eor r2, r2, r8, lsl #1 //SWAPMOVE(r2, r5, 0x55555555, 1);
orr r8, r2, r3
eor r5, r5, r8
mvn r5, r5
eor r8, r3, r4, lsr #1
and r8, r8, r6
eor r3, r3, r8
eor r4, r4, r8, lsl #1 //SWAPMOVE(r4, r3, 0x55555555, 1);
eor r8, r4, r5, lsr #1
and r8, r8, r6
eor r4, r4, r8
eor r5, r5, r8, lsl #1 //SWAPMOVE(r5, r4, 0x55555555, 1);
orr r8, r4, r5
eor r3, r3, r8
eor r8, r2, r5
and r8, r8, r6
eor r2, r2, r8
eor r5, r5, r8 //SWAPMOVE(r5, r2, 0x55555555, 0);
ldmia.w r1!, {r8-r11} //load rkeys in r8,...,r11
eor r2, r2, r8 //add rtk_2_3 + rconst
eor r3, r3, r9 //add rtk_2_3 + rconst
eor r4, r4, r10 //add rtk_2_3 + rconst
eor r5, r5, r11 //add rtk_2_3 + rconst
ldmia.w r0!,{r8-r11}
eor r2, r2, r8 //add rtk_1
eor r3, r3, r9 //add rtk_1
eor r4, r4, r10 //add rtk_1
eor r5, r5, r11 //add rtk_1
and r8, r7, r2, ror #30 // --- mixcolumns 0 ---
eor r2, r2, r8, ror #24
and r8, r7, r2, ror #18
eor r2, r2, r8, ror #2
and r8, r7, r2, ror #6
eor r2, r2, r8, ror #4
and r8, r7, r3, ror #30
eor r3, r3, r8, ror #24
and r8, r7, r3, ror #18
eor r3, r3, r8, ror #2
and r8, r7, r3, ror #6
eor r3, r3, r8, ror #4
and r8, r7, r4, ror #30
eor r4, r4, r8, ror #24
and r8, r7, r4, ror #18
eor r4, r4, r8, ror #2
and r8, r7, r4, ror #6
eor r4, r4, r8, ror #4
and r8, r7, r5, ror #30
eor r5, r5, r8, ror #24
and r8, r7, r5, ror #18
eor r5, r5, r8, ror #2
and r8, r7, r5, ror #6
eor r5, r5, r8, ror #4
//--- round 2: S-box layer ---
orr r8, r4, r5
eor r3, r3, r8
mvn r3, r3
eor r8, r2, r3, lsr #1
and r8, r8, r6
eor r2, r2, r8
eor r3, r3, r8, lsl #1 //SWAPMOVE(r3, r2, 0x55555555, 1);
eor r8, r5, r2, lsr #1
and r8, r8, r6
eor r5, r5, r8
eor r2, r2, r8, lsl #1 //SWAPMOVE(r2, r5, 0x55555555, 1);
orr r8, r2, r3
eor r5, r5, r8
mvn r5, r5
eor r8, r3, r4, lsr #1
and r8, r8, r6
eor r3, r3, r8
eor r4, r4, r8, lsl #1 //SWAPMOVE(r4, r3, 0x55555555, 1);
eor r8, r4, r5, lsr #1
and r8, r8, r6
eor r4, r4, r8
eor r5, r5, r8, lsl #1 //SWAPMOVE(r5, r4, 0x55555555, 1);
orr r8, r4, r5
eor r3, r3, r8
mvn r3, r3
eor r8, r2, r3, lsr #1
and r8, r8, r6
eor r2, r2, r8
eor r3, r3, r8, lsl #1 //SWAPMOVE(r3, r2, 0x55555555, 1);
eor r8, r5, r2, lsr #1
and r8, r8, r6
eor r5, r5, r8
eor r2, r2, r8, lsl #1 //SWAPMOVE(r2, r5, 0x55555555, 1);
orr r8, r2, r3
eor r5, r5, r8
eor r8, r3, r4
and r8, r8, r6
eor r3, r3, r8
eor r4, r4, r8 //SWAPMOVE(r4, r3, 0x55555555, 0);
ldmia.w r1!, {r8-r11} //load rkeys in r8,...,r11
eor r2, r2, r8 //add rkey + rconst
eor r3, r3, r9 //add rkey + rconst
eor r4, r4, r10 //add rkey + rconst
eor r5, r5, r11 //add rkey + rconst
ldmia.w r0!,{r8-r11}
eor r2, r2, r8 //add rtk_1
eor r3, r3, r9 //add rtk_1
eor r4, r4, r10 //add rtk_1
eor r5, r5, r11 //add rtk_1
and r8, r7, r2, ror #16 // --- mixcolumns 1 ---
eor r2, r2, r8, ror #30
and r8, r7, r2, ror #28
eor r2, r2, r8
and r8, r7, r2, ror #16
eor r2, r2, r8, ror #2
and r8, r7, r3, ror #16
eor r3, r3, r8, ror #30
and r8, r7, r3, ror #28
eor r3, r3, r8
and r8, r7, r3, ror #16
eor r3, r3, r8, ror #2
and r8, r7, r4, ror #16
eor r4, r4, r8, ror #30
and r8, r7, r4, ror #28
eor r4, r4, r8
and r8, r7, r4, ror #16
eor r4, r4, r8, ror #2
and r8, r7, r5, ror #16
eor r5, r5, r8, ror #30
and r8, r7, r5, ror #28
eor r5, r5, r8
and r8, r7, r5, ror #16
eor r5, r5, r8, ror #2
//--- round 3: S-box layer ---
orr r8, r2, r3
eor r5, r5, r8
mvn r5, r5
eor r8, r3, r4, lsr #1
and r8, r8, r6
eor r3, r3, r8
eor r4, r4, r8, lsl #1 //SWAPMOVE(r4, r3, 0x55555555, 1);
eor r8, r4, r5, lsr #1
and r8, r8, r6
eor r4, r4, r8
eor r5, r5, r8, lsl #1 //SWAPMOVE(r5, r4, 0x55555555, 1);
orr r8, r4, r5
eor r3, r3, r8
mvn r3, r3
eor r8, r2, r3, lsr #1
and r8, r8, r6
eor r2, r2, r8
eor r3, r3, r8, lsl #1 //SWAPMOVE(r3, r2, 0x55555555, 1);
eor r8, r5, r2, lsr #1
and r8, r8, r6
eor r5, r5, r8
eor r2, r2, r8, lsl #1 //SWAPMOVE(r2, r5, 0x55555555, 1);
orr r8, r2, r3
eor r5, r5, r8
mvn r5, r5
eor r8, r3, r4, lsr #1
and r8, r8, r6
eor r3, r3, r8
eor r4, r4, r8, lsl #1 //SWAPMOVE(r4, r3, 0x55555555, 1);
eor r8, r4, r5, lsr #1
and r8, r8, r6
eor r4, r4, r8
eor r5, r5, r8, lsl #1 //SWAPMOVE(r5, r4, 0x55555555, 1);
orr r8, r4, r5
eor r3, r3, r8
eor r8, r2, r5
and r8, r8, r6
eor r2, r2, r8
eor r5, r5, r8 //SWAPMOVE(r5, r2, 0x55555555, 0);
ldmia.w r1!, {r8-r11} //load rkeys in r8,...,r11
eor r2, r2, r8 //add rtk_2_3 + rconst
eor r3, r3, r9 //add rtk_2_3 + rconst
eor r4, r4, r10 //add rtk_2_3 + rconst
eor r5, r5, r11 //add rtk_2_3 + rconst
ldmia.w r0!,{r8-r11}
eor r2, r2, r8 //add rtk_1
eor r3, r3, r9 //add rtk_1
eor r4, r4, r10 //add rtk_1
eor r5, r5, r11 //add rtk_1
and r8, r7, r2, ror #10 // --- mixcolumns 2 ---
eor r2, r2, r8, ror #4
and r8, r7, r2, ror #6
eor r2, r2, r8, ror #6
and r8, r7, r2, ror #26
eor r2, r2, r8
and r8, r7, r3, ror #10
eor r3, r3, r8, ror #4
and r8, r7, r3, ror #6
eor r3, r3, r8, ror #6
and r8, r7, r3, ror #26
eor r3, r3, r8
and r8, r7, r4, ror #10
eor r4, r4, r8, ror #4
and r8, r7, r4, ror #6
eor r4, r4, r8, ror #6
and r8, r7, r4, ror #26
eor r4, r4, r8
and r8, r7, r5, ror #10
eor r5, r5, r8, ror #4
and r8, r7, r5, ror #6
eor r5, r5, r8, ror #6
and r8, r7, r5, ror #26
eor r5, r5, r8
//--- round 4: S-box layer ---
orr r8, r4, r5
eor r3, r3, r8
mvn r3, r3
eor r8, r2, r3, lsr #1
and r8, r8, r6
eor r2, r2, r8
eor r3, r3, r8, lsl #1 //SWAPMOVE(r3, r2, 0x55555555, 1);
eor r8, r5, r2, lsr #1
and r8, r8, r6
eor r5, r5, r8
eor r2, r2, r8, lsl #1 //SWAPMOVE(r2, r5, 0x55555555, 1);
orr r8, r2, r3
eor r5, r5, r8
mvn r5, r5
eor r8, r3, r4, lsr #1
and r8, r8, r6
eor r3, r3, r8
eor r4, r4, r8, lsl #1 //SWAPMOVE(r4, r3, 0x55555555, 1);
eor r8, r4, r5, lsr #1
and r8, r8, r6
eor r4, r4, r8
eor r5, r5, r8, lsl #1 //SWAPMOVE(r5, r4, 0x55555555, 1);
orr r8, r4, r5
eor r3, r3, r8
mvn r3, r3
eor r8, r2, r3, lsr #1
and r8, r8, r6
eor r2, r2, r8
eor r3, r3, r8, lsl #1 //SWAPMOVE(r3, r2, 0x55555555, 1);
eor r8, r5, r2, lsr #1
and r8, r8, r6
eor r5, r5, r8
eor r2, r2, r8, lsl #1 //SWAPMOVE(r2, r5, 0x55555555, 1);
orr r8, r2, r3
eor r5, r5, r8
eor r8, r3, r4
and r8, r8, r6
eor r3, r3, r8
eor r4, r4, r8 //SWAPMOVE(r4, r3, 0x55555555, 0);
ldmia.w r1!, {r8-r11} //load rkeys in r8,...,r11
eor r2, r2, r8 //add rkey + rconst
eor r3, r3, r9 //add rkey + rconst
eor r4, r4, r10 //add rkey + rconst
eor r5, r5, r11 //add rkey + rconst
ldmia.w r0!,{r8-r11}
eor r2, r2, r8 //add rtk_1
eor r3, r3, r9 //add rtk_1
eor r4, r4, r10 //add rtk_1
eor r5, r5, r11 //add rtk_1
and r8, r7, r2, ror #4 // --- mixcolumns 3 ---
eor r2, r2, r8, ror #26
and r8, r7, r2
eor r2, r2, r8, ror #4
and r8, r7, r2, ror #4
eor r2, r2, r8, ror #22
and r8, r7, r3, ror #4
eor r3, r3, r8, ror #26
and r8, r7, r3
eor r3, r3, r8, ror #4
and r8, r7, r3, ror #4
eor r3, r3, r8, ror #22
and r8, r7, r4, ror #4
eor r4, r4, r8, ror #26
and r8, r7, r4
eor r4, r4, r8, ror #4
and r8, r7, r4, ror #4
eor r4, r4, r8, ror #22
and r8, r7, r5, ror #4
eor r5, r5, r8, ror #26
and r8, r7, r5
eor r5, r5, r8, ror #4
and r8, r7, r5, ror #4
eor r5, r5, r8, ror #22
bx lr
/******************************************************************************
* Inverse quadruple round of fixsliced SKINNY-128.
*
* Undoes 4 cipher rounds on the packed state. Register contract mirrors
* quadruple_round, but the key pointers walk backwards:
*   r2-r5: packed 128-bit state (in/out)
*   r0:    pointer to rtk1 words (decremented via post-indexed ldrd, #-8)
*   r1:    pointer to rtk2_3 + round-constant words (decremented likewise)
*   r6:    0x55555555 (SWAPMOVE mask)   r7: 0x30303030 (mixcolumns mask)
* Clobbers r8-r11. Each round = inverse mixcolumns, key/rconst addition,
* then the inverse S-box layer (SWAPMOVE sequences in reverse order).
******************************************************************************/
.align 2
inv_quadruple_round:
and r8, r7, r2, ror #4 // --- mixcolumns 3 (inverse) ---
eor r2, r2, r8, ror #22
and r8, r7, r2
eor r2, r2, r8, ror #4
and r8, r7, r2, ror #4
eor r2, r2, r8, ror #26
and r8, r7, r3, ror #4
eor r3, r3, r8, ror #22
and r8, r7, r3
eor r3, r3, r8, ror #4
and r8, r7, r3, ror #4
eor r3, r3, r8, ror #26
and r8, r7, r4, ror #4
eor r4, r4, r8, ror #22
and r8, r7, r4
eor r4, r4, r8, ror #4
and r8, r7, r4, ror #4
eor r4, r4, r8, ror #26
and r8, r7, r5, ror #4
eor r5, r5, r8, ror #22
and r8, r7, r5
eor r5, r5, r8, ror #4
and r8, r7, r5, ror #4
eor r5, r5, r8, ror #26
ldrd r10, r11, [r1], #-8 //load rtk2_3 words, walking backwards
ldrd r8, r9, [r1], #-8
eor r2, r2, r8 //add rkey + rconst
eor r3, r3, r9 //add rkey + rconst
eor r4, r4, r10 //add rkey + rconst
eor r5, r5, r11 //add rkey + rconst
ldrd r10, r11, [r0], #-8 //load rtk1 words, walking backwards
ldrd r8, r9, [r0], #-8
eor r2, r2, r8 //add rtk1
eor r3, r3, r9 //add rtk1
eor r4, r4, r10 //add rtk1
eor r5, r5, r11 //add rtk1
eor r8, r3, r4
and r8, r8, r6
eor r3, r3, r8
eor r4, r4, r8 //SWAPMOVE(r4, r3, 0x55555555, 0);
orr r8, r2, r3
eor r5, r5, r8
eor r8, r5, r2, lsr #1
and r8, r8, r6
eor r5, r5, r8
eor r2, r2, r8, lsl #1 //SWAPMOVE(r2, r5, 0x55555555, 1);
eor r8, r2, r3, lsr #1
and r8, r8, r6
eor r2, r2, r8
eor r3, r3, r8, lsl #1 //SWAPMOVE(r3, r2, 0x55555555, 1);
orr r8, r4, r5
eor r3, r3, r8
mvn r3, r3
eor r8, r4, r5, lsr #1
and r8, r8, r6
eor r4, r4, r8
eor r5, r5, r8, lsl #1 //SWAPMOVE(r5, r4, 0x55555555, 1);
eor r8, r3, r4, lsr #1
and r8, r8, r6
eor r3, r3, r8
eor r4, r4, r8, lsl #1 //SWAPMOVE(r4, r3, 0x55555555, 1);
orr r8, r2, r3
eor r5, r5, r8
mvn r5, r5
eor r8, r5, r2, lsr #1
and r8, r8, r6
eor r5, r5, r8
eor r2, r2, r8, lsl #1 //SWAPMOVE(r2, r5, 0x55555555, 1);
eor r8, r2, r3, lsr #1
and r8, r8, r6
eor r2, r2, r8
eor r3, r3, r8, lsl #1 //SWAPMOVE(r3, r2, 0x55555555, 1);
orr r8, r4, r5
eor r3, r3, r8
mvn r3, r3
and r8, r7, r2, ror #26 // --- mixcolumns 2 (inverse) ---
eor r2, r2, r8
and r8, r7, r2, ror #6
eor r2, r2, r8, ror #6
and r8, r7, r2, ror #10
eor r2, r2, r8, ror #4
and r8, r7, r3, ror #26
eor r3, r3, r8
and r8, r7, r3, ror #6
eor r3, r3, r8, ror #6
and r8, r7, r3, ror #10
eor r3, r3, r8, ror #4
and r8, r7, r4, ror #26
eor r4, r4, r8
and r8, r7, r4, ror #6
eor r4, r4, r8, ror #6
and r8, r7, r4, ror #10
eor r4, r4, r8, ror #4
and r8, r7, r5, ror #26
eor r5, r5, r8
and r8, r7, r5, ror #6
eor r5, r5, r8, ror #6
and r8, r7, r5, ror #10
eor r5, r5, r8, ror #4
ldrd r10, r11, [r1], #-8 //load rtk2_3 words, walking backwards
ldrd r8, r9, [r1], #-8
eor r2, r2, r8 //add rk2_3 + rconst
eor r3, r3, r9 //add rk2_3 + rconst
eor r4, r4, r10 //add rk2_3 + rconst
eor r5, r5, r11 //add rk2_3 + rconst
ldrd r10, r11, [r0], #-8 //load rtk1 words, walking backwards
ldrd r8, r9, [r0], #-8
eor r2, r2, r8 //add rtk1
eor r3, r3, r9 //add rtk1
eor r4, r4, r10 //add rtk1
eor r5, r5, r11 //add rtk1
eor r8, r2, r5
and r8, r8, r6
eor r2, r2, r8
eor r5, r5, r8 //SWAPMOVE(r5, r2, 0x55555555, 0);
orr r8, r4, r5
eor r3, r3, r8
eor r8, r4, r5, lsr #1
and r8, r8, r6
eor r4, r4, r8
eor r5, r5, r8, lsl #1 //SWAPMOVE(r5, r4, 0x55555555, 1);
eor r8, r3, r4, lsr #1
and r8, r8, r6
eor r3, r3, r8
eor r4, r4, r8, lsl #1 //SWAPMOVE(r4, r3, 0x55555555, 1);
orr r8, r2, r3
eor r5, r5, r8
mvn r5, r5
eor r8, r5, r2, lsr #1
and r8, r8, r6
eor r5, r5, r8
eor r2, r2, r8, lsl #1 //SWAPMOVE(r2, r5, 0x55555555, 1);
eor r8, r2, r3, lsr #1
and r8, r8, r6
eor r2, r2, r8
eor r3, r3, r8, lsl #1 //SWAPMOVE(r3, r2, 0x55555555, 1);
orr r8, r4, r5
eor r3, r3, r8
mvn r3, r3
eor r8, r4, r5, lsr #1
and r8, r8, r6
eor r4, r4, r8
eor r5, r5, r8, lsl #1 //SWAPMOVE(r5, r4, 0x55555555, 1);
eor r8, r3, r4, lsr #1
and r8, r8, r6
eor r3, r3, r8
eor r4, r4, r8, lsl #1 //SWAPMOVE(r4, r3, 0x55555555, 1);
orr r8, r2, r3
eor r5, r5, r8
mvn r5, r5
and r8, r7, r2, ror #16 // --- mixcolumns 1 (inverse) ---
eor r2, r2, r8, ror #2
and r8, r7, r2, ror #28
eor r2, r2, r8
and r8, r7, r2, ror #16
eor r2, r2, r8, ror #30
and r8, r7, r3, ror #16
eor r3, r3, r8, ror #2
and r8, r7, r3, ror #28
eor r3, r3, r8
and r8, r7, r3, ror #16
eor r3, r3, r8, ror #30
and r8, r7, r4, ror #16
eor r4, r4, r8, ror #2
and r8, r7, r4, ror #28
eor r4, r4, r8
and r8, r7, r4, ror #16
eor r4, r4, r8, ror #30
and r8, r7, r5, ror #16
eor r5, r5, r8, ror #2
and r8, r7, r5, ror #28
eor r5, r5, r8
and r8, r7, r5, ror #16
eor r5, r5, r8, ror #30
ldrd r10, r11, [r1], #-8 //load rtk2_3 words, walking backwards
ldrd r8, r9, [r1], #-8
eor r2, r2, r8 //add rkey + rconst
eor r3, r3, r9 //add rkey + rconst
eor r4, r4, r10 //add rkey + rconst
eor r5, r5, r11 //add rkey + rconst
ldrd r10, r11, [r0], #-8 //load rtk1 words, walking backwards
ldrd r8, r9, [r0], #-8
eor r2, r2, r8 //add rtk1
eor r3, r3, r9 //add rtk1
eor r4, r4, r10 //add rtk1
eor r5, r5, r11 //add rtk1
eor r8, r3, r4
and r8, r8, r6
eor r3, r3, r8
eor r4, r4, r8 //SWAPMOVE(r4, r3, 0x55555555, 0);
orr r8, r2, r3
eor r5, r5, r8
eor r8, r5, r2, lsr #1
and r8, r8, r6
eor r5, r5, r8
eor r2, r2, r8, lsl #1 //SWAPMOVE(r2, r5, 0x55555555, 1);
eor r8, r2, r3, lsr #1
and r8, r8, r6
eor r2, r2, r8
eor r3, r3, r8, lsl #1 //SWAPMOVE(r3, r2, 0x55555555, 1);
orr r8, r4, r5
eor r3, r3, r8
mvn r3, r3
eor r8, r4, r5, lsr #1
and r8, r8, r6
eor r4, r4, r8
eor r5, r5, r8, lsl #1 //SWAPMOVE(r5, r4, 0x55555555, 1);
eor r8, r3, r4, lsr #1
and r8, r8, r6
eor r3, r3, r8
eor r4, r4, r8, lsl #1 //SWAPMOVE(r4, r3, 0x55555555, 1);
orr r8, r2, r3
eor r5, r5, r8
mvn r5, r5
eor r8, r5, r2, lsr #1
and r8, r8, r6
eor r5, r5, r8
eor r2, r2, r8, lsl #1 //SWAPMOVE(r2, r5, 0x55555555, 1);
eor r8, r2, r3, lsr #1
and r8, r8, r6
eor r2, r2, r8
eor r3, r3, r8, lsl #1 //SWAPMOVE(r3, r2, 0x55555555, 1);
orr r8, r4, r5
eor r3, r3, r8
mvn r3, r3
and r8, r7, r2, ror #6 // --- mixcolumns 0 (inverse) ---
eor r2, r2, r8, ror #4
and r8, r7, r2, ror #18
eor r2, r2, r8, ror #2
and r8, r7, r2, ror #30
eor r2, r2, r8, ror #24
and r8, r7, r3, ror #6
eor r3, r3, r8, ror #4
and r8, r7, r3, ror #18
eor r3, r3, r8, ror #2
and r8, r7, r3, ror #30
eor r3, r3, r8, ror #24
and r8, r7, r4, ror #6
eor r4, r4, r8, ror #4
and r8, r7, r4, ror #18
eor r4, r4, r8, ror #2
and r8, r7, r4, ror #30
eor r4, r4, r8, ror #24
and r8, r7, r5, ror #6
eor r5, r5, r8, ror #4
and r8, r7, r5, ror #18
eor r5, r5, r8, ror #2
and r8, r7, r5, ror #30
eor r5, r5, r8, ror #24
ldrd r10, r11, [r1], #-8 //load rtk2_3 words, walking backwards
ldrd r8, r9, [r1], #-8
eor r2, r2, r8 //add rkey + rconst
eor r3, r3, r9 //add rkey + rconst
eor r4, r4, r10 //add rkey + rconst
eor r5, r5, r11 //add rkey + rconst
ldrd r10, r11, [r0], #-8 //load rtk1 words, walking backwards
ldrd r8, r9, [r0], #-8
eor r2, r2, r8 //add rtk1
eor r3, r3, r9 //add rtk1
eor r4, r4, r10 //add rtk1
eor r5, r5, r11 //add rtk1
eor r8, r2, r5
and r8, r8, r6
eor r2, r2, r8
eor r5, r5, r8 //SWAPMOVE(r5, r2, 0x55555555, 0);
orr r8, r4, r5
eor r3, r3, r8
eor r8, r4, r5, lsr #1
and r8, r8, r6
eor r4, r4, r8
eor r5, r5, r8, lsl #1 //SWAPMOVE(r5, r4, 0x55555555, 1);
eor r8, r3, r4, lsr #1
and r8, r8, r6
eor r3, r3, r8
eor r4, r4, r8, lsl #1 //SWAPMOVE(r4, r3, 0x55555555, 1);
orr r8, r2, r3
eor r5, r5, r8
mvn r5, r5
eor r8, r5, r2, lsr #1
and r8, r8, r6
eor r5, r5, r8
eor r2, r2, r8, lsl #1 //SWAPMOVE(r2, r5, 0x55555555, 1);
eor r8, r2, r3, lsr #1
and r8, r8, r6
eor r2, r2, r8
eor r3, r3, r8, lsl #1 //SWAPMOVE(r3, r2, 0x55555555, 1);
orr r8, r4, r5
eor r3, r3, r8
mvn r3, r3
eor r8, r4, r5, lsr #1
and r8, r8, r6
eor r4, r4, r8
eor r5, r5, r8, lsl #1 //SWAPMOVE(r5, r4, 0x55555555, 1);
eor r8, r3, r4, lsr #1
and r8, r8, r6
eor r3, r3, r8
eor r4, r4, r8, lsl #1 //SWAPMOVE(r4, r3, 0x55555555, 1);
orr r8, r2, r3
eor r5, r5, r8
mvn r5, r5
bx lr
/******************************************************************************
* Encrypt a single block using fixsliced SKINNY-128-384.
* (The original header said SKINNY-128-128; the code below runs 14 quadruple
* rounds = 56 rounds with a 384-bit tweakey schedule, i.e. the -384 variant.)
*
* r0: ctext (output), r1: precomputed rtk2_3 schedule, r2: ptext,
* r3: precomputed rtk1 schedule (repeats every 16 rounds).
******************************************************************************/
@ void skinny128_384(u8* ctext, const u32* tk, const u8* ptext, const u32* rtk1)
.global skinny128_384
.type skinny128_384,%function
.align 2
skinny128_384:
push {r0-r12, r14} //r0 (ctext ptr) is recovered from the stack at the end
mov.w r0, r3 //r0 now walks the rtk1 schedule
ldr.w r3, [r2, #8] //load ptext (3rd word)
ldr.w r4, [r2, #4] //load ptext (2nd word)
ldr.w r5, [r2, #12] //load ptext (4th word)
ldr.w r2, [r2] //load ptext (1st word)
movw r10, #0x0a0a
movt r10, #0x0a0a //r10 <- 0x0a0a0a0a (packing mask)
movw r11, #0x3030
movt r11, #0x3030 //r11 <- 0x30303030 (packing mask)
bl packing //pack the plaintext into fixsliced representation
mov r7, r11 //r7 <- 0x30303030 (mixcolumns mask for quadruple_round)
movw r6, #0x5555
movt r6, #0x5555 //r6 <- 0x55555555 (SWAPMOVE mask)
bl quadruple_round //rounds 1-4
bl quadruple_round //rounds 5-8
bl quadruple_round //rounds 9-12
bl quadruple_round //rounds 13-16
sub.w r0, #256 // rtk1 repeats every 16 rounds
bl quadruple_round //rounds 17-20
bl quadruple_round //rounds 21-24
bl quadruple_round //rounds 25-28
bl quadruple_round //rounds 29-32
sub.w r0, #256 // rtk1 repeats every 16 rounds
bl quadruple_round //rounds 33-36
bl quadruple_round //rounds 37-40
bl quadruple_round //rounds 41-44
bl quadruple_round //rounds 45-48
sub.w r0, #256 // rtk1 repeats every 16 rounds
bl quadruple_round //rounds 49-52
bl quadruple_round //rounds 53-56
bl unpacking //unpack the state back to byte order
ldr.w r0, [sp], #4 //restore the ctext pointer pushed at entry
strd r2, r4, [r0] //store ciphertext words 0-1
strd r3, r5, [r0, #8] //store ciphertext words 2-3
pop {r1-r12,r14} //r0 was already popped above
bx lr
/******************************************************************************
* Decrypt a single block using fixsliced SKINNY-128-384.
* (The original header said SKINNY-128-128; the code below runs 14 inverse
* quadruple rounds = 56 rounds with a 384-bit tweakey schedule.)
*
* NOTE(review): the prototype comment reuses the encrypt parameter names;
* here r2 is the input (ciphertext) block and r0 the output (plaintext) —
* confirm against the C callers.
******************************************************************************/
@ void skinny128_384_inv(u8* ctext, const u32* tk, const u8* ptext, const u32* rtk1)
.global skinny128_384_inv
.type skinny128_384_inv,%function
.align 2
skinny128_384_inv:
push {r0-r12, r14} //r0 (output ptr) is recovered from the stack at the end
mov.w r0, r3 //r0 now walks the rtk1 schedule
ldr.w r3, [r2, #8] //load input block (3rd word)
ldr.w r4, [r2, #4] //load input block (2nd word)
ldr.w r5, [r2, #12] //load input block (4th word)
ldr.w r2, [r2] //load input block (1st word)
movw r10, #0x0a0a
movt r10, #0x0a0a //r10 <- 0x0a0a0a0a (packing mask)
movw r11, #0x3030
movt r11, #0x3030 //r11 <- 0x30303030 (packing mask)
bl packing //pack the input into fixsliced representation
mov r7, r11 //r7 <- 0x30303030 (mixcolumns mask)
movw r6, #0x5555
movt r6, #0x5555 //r6 <- 0x55555555 (SWAPMOVE mask)
add.w r0, #120 // points to the right rtk1 (last-round words; TODO confirm offset derivation)
add.w r1, #888 // points to the last rtk2_3 + rconst words (TODO confirm offset derivation)
bl inv_quadruple_round //rounds 56-53
bl inv_quadruple_round //rounds 52-49
add.w r0, #256 // rtk1 repeats every 16 rounds
bl inv_quadruple_round //rounds 48-45
bl inv_quadruple_round //rounds 44-41
bl inv_quadruple_round //rounds 40-37
bl inv_quadruple_round //rounds 36-33
add.w r0, #256 // rtk1 repeats every 16 rounds
bl inv_quadruple_round //rounds 32-29
bl inv_quadruple_round //rounds 28-25
bl inv_quadruple_round //rounds 24-21
bl inv_quadruple_round //rounds 20-17
add.w r0, #256 // rtk1 repeats every 16 rounds
bl inv_quadruple_round //rounds 16-13
bl inv_quadruple_round //rounds 12-9
bl inv_quadruple_round //rounds 8-5
bl inv_quadruple_round //rounds 4-1
bl unpacking //unpack the state back to byte order
ldr.w r0, [sp], #4 //restore the output pointer pushed at entry
strd r2, r4, [r0] //store output words 0-1
strd r3, r5, [r0, #8] //store output words 2-3
pop {r1-r12,r14} //r0 was already popped above
bx lr
#ifndef SKINNYAEADM1_H_
#define SKINNYAEADM1_H_
#include "skinny128.h"
//Fixed-width aliases used throughout the SKINNY-AEAD-M1 implementation
typedef unsigned char u8;
typedef unsigned int u32;
typedef unsigned long long u64;
//Parameter sizes in bytes
#define TAGBYTES 16
#define KEYBYTES 16
#define BLOCKBYTES 16
//Write the domain-separation byte into the last byte of a 16-byte block
#define SET_DOMAIN(ptr, domain) ((ptr)[15] = (domain))
//Clock the 64-bit LFSR once: shift left; if the MSB was set, xor in 0x1B.
//NOTE(review): relies on a variable named 'feedback' declared at the call
//site (GCC statement expression) — confirm every caller declares it.
#define UPDATE_LFSR(lfsr) ({ \
    feedback = ((lfsr) & (1ULL << 63)) ? 0x1B : 0x00; \
    (lfsr) = ((lfsr) << 1) ^ feedback; \
})
//Store the 64-bit value x into ptr[0..7] in little-endian byte order
#define LE_STR_64(ptr, x) ({ \
    (ptr)[0] = (u8)(x); \
    (ptr)[1] = (u8)((x) >> 8); \
    (ptr)[2] = (u8)((x) >> 16); \
    (ptr)[3] = (u8)((x) >> 24); \
    (ptr)[4] = (u8)((x) >> 32); \
    (ptr)[5] = (u8)((x) >> 40); \
    (ptr)[6] = (u8)((x) >> 48); \
    (ptr)[7] = (u8)((x) >> 56); \
})
//x ^= y with x, y 128-bit blocks
//NOTE(review): casts to u32* — assumes 4-byte-aligned operands and relies on
//type punning; verify callers pass suitably aligned buffers.
#define XOR_BLOCK(x,y) ({ \
    ((u32*)(x))[0] ^= ((u32*)(y))[0]; \
    ((u32*)(x))[1] ^= ((u32*)(y))[1]; \
    ((u32*)(x))[2] ^= ((u32*)(y))[2]; \
    ((u32*)(x))[3] ^= ((u32*)(y))[3]; \
})
#endif // SKINNYAEADM1_H_
\ No newline at end of file
#define CRYPTO_KEYBYTES 16 // 128-bit key
#define CRYPTO_NSECBYTES 0 // no secret message number
#define CRYPTO_NPUBBYTES 16 // 128-bit public nonce
#define CRYPTO_ABYTES 16 // 128-bit authentication tag
#define CRYPTO_NOOVERLAP 1
//API required by the NIST for the LWC competition
// Encrypts m[0..mlen) under key k and nonce npub, authenticating ad[0..adlen).
// Writes ciphertext || tag to c and sets *clen = mlen + CRYPTO_ABYTES.
// Returns 0 on success.
int crypto_aead_encrypt(unsigned char *c, unsigned long long *clen,
    const unsigned char *m, unsigned long long mlen,
    const unsigned char *ad, unsigned long long adlen,
    const unsigned char *nsec, const unsigned char *npub,
    const unsigned char *k);
//API required by the NIST for the LWC competition
// Decrypts c[0..clen) (ciphertext || tag) and verifies the tag.
// On success writes the plaintext to m, sets *outputmlen and returns 0;
// returns non-zero on authentication failure.
int crypto_aead_decrypt(unsigned char *m, unsigned long long *outputmlen,
    unsigned char *nsec,
    const unsigned char *c, unsigned long long clen,
    const unsigned char *ad, unsigned long long adlen,
    const unsigned char *npub, const unsigned char *k);
/******************************************************************************
* Constant-time implementation of SKINNY-AEAD-M1 (v1.1).
*
* Two blocks are treated in parallel with SKINNY-128-384 whenever possible.
*
* For more details, see the paper at: https://
*
* @author Alexandre Adomnicai, Nanyang Technological University,
* alexandre.adomnicai@ntu.edu.sg
*
* @date May 2020
******************************************************************************/
#include "skinny128.h"
#include "skinnyaead.h"
#include <string.h>
#include <stdio.h>
/******************************************************************************
* x ^= y where x, y are 128-bit blocks (16 bytes array).
******************************************************************************/
static void xor_block(u8 * x, const u8* y) {
    // In-place XOR of one 16-byte block: x ^= y, byte by byte.
    for (int idx = BLOCKBYTES; idx-- > 0; )
        x[idx] ^= y[idx];
}
/******************************************************************************
* Process the associated data. Common to SKINNY-AEAD-M1 encrypt and decrypt
* functions.
******************************************************************************/
// Process the associated data of SKINNY-AEAD-M1, two blocks at a time with
// the double-block SKINNY-128-384 primitive. The AD authentication value is
// accumulated into `auth`. Domain bytes: 0x02 = full AD block, 0x03 = final
// partial (padded) AD block.
// When mlen == 0 the message path has not produced the tag yet, so the tag
// encryption (tweakey `tag`, output through `c`) is fused with the last AD
// block here to keep the two SKINNY instances running in parallel.
// Parameters:
//   auth    - out: 16-byte AD authentication accumulator (zeroed here)
//   c       - in/out: tag buffer used only on the mlen == 0 fused path
//   tag     - in: tweakey block for the tag computation (mlen == 0 path)
//   rtk1    - scratch: per-call round tweakey (TK1) storage
//   rtk2_3  - in: precomputed TK2/TK3 round tweakeys
//   mlen    - message length, only tested against 0 (tag-ready flag)
//   ad, adlen - associated data and its length in bytes
static void skinny_aead_m1_auth(u8* auth, u8* c, u8* tag, u32* rtk1,
        u32* rtk2_3, u64 mlen, const u8* ad, u64 adlen) {
    u64 lfsr = 1; // 64-bit block counter, clocked by UPDATE_LFSR
    u8 feedback; // required by the UPDATE_LFSR macro
    u8 tmp[2*BLOCKBYTES]; // two tweakey/scratch blocks
    memset(tmp, 0x00, 2*BLOCKBYTES);
    memset(auth, 0x00, BLOCKBYTES);
    SET_DOMAIN(tmp, 0x02); // domain for full AD blocks
    while (adlen >= 2*BLOCKBYTES) { // 2 full AD blocks in parallel
        LE_STR_64(tmp, lfsr);
        UPDATE_LFSR(lfsr);
        LE_STR_64(tmp + BLOCKBYTES, lfsr);
        SET_DOMAIN(tmp + BLOCKBYTES, 0x02);
        tkschedule_perm_tk1(rtk1, tmp, tmp+BLOCKBYTES);
        skinny128_384(tmp, tmp+BLOCKBYTES, ad, ad+BLOCKBYTES, rtk1, rtk2_3);
        xor_block(auth, tmp);
        xor_block(auth, tmp + BLOCKBYTES);
        adlen -= 2*BLOCKBYTES;
        ad += 2*BLOCKBYTES;
        UPDATE_LFSR(lfsr);
    }
    if (adlen > BLOCKBYTES) { // pad and process 2 blocs
        LE_STR_64(tmp, lfsr);
        UPDATE_LFSR(lfsr);
        LE_STR_64(tmp + BLOCKBYTES, lfsr);
        SET_DOMAIN(tmp + BLOCKBYTES, 0x03); // domain for padding ad
        tkschedule_perm_tk1(rtk1, tmp, tmp + BLOCKBYTES);
        adlen -= BLOCKBYTES;
        memset(tmp, 0x00, BLOCKBYTES); // tmp reused as the padded AD block
        memcpy(tmp, ad + BLOCKBYTES, adlen);
        tmp[adlen] ^= 0x80; // 10* padding
        skinny128_384(tmp + BLOCKBYTES, tmp, ad, tmp, rtk1, rtk2_3);
        xor_block(auth, tmp);
        xor_block(auth, tmp + BLOCKBYTES);
    } else if (adlen == BLOCKBYTES) { // exactly one full AD block left
        LE_STR_64(tmp, lfsr);
        if (mlen == 0) { // if tag has *NOT* been calculated yet
            tkschedule_perm_tk1(rtk1, tmp, tag); // fuse AD block with tag encryption
            skinny128_384(auth, c, ad, c, rtk1, rtk2_3);
        } else { // if tag has been calculated yet
            tkschedule_perm_tk1(rtk1, tmp, tmp); // process last ad block
            skinny128_384(auth, auth, ad, ad, rtk1, rtk2_3);
        }
    } else if (adlen > 0) { // final partial AD block
        LE_STR_64(tmp, lfsr);
        SET_DOMAIN(tmp, 0x03); // domain for padding ad
        memset(tmp + BLOCKBYTES, 0x00, BLOCKBYTES); // padding
        memcpy(tmp + BLOCKBYTES, ad, adlen); // padding
        tmp[BLOCKBYTES + adlen] ^= 0x80; // padding
        if (mlen == 0) { // if tag has *NOT* been calculated yet
            tkschedule_perm_tk1(rtk1, tmp, tag); // compute the tag
            skinny128_384(auth, c, tmp + BLOCKBYTES, c, rtk1, rtk2_3);
        } else { // if tag has been calculated yet
            tkschedule_perm_tk1(rtk1, tmp, tmp); // process last ad block
            skinny128_384(auth, auth, tmp + BLOCKBYTES, tmp + BLOCKBYTES, rtk1, rtk2_3);
        }
    }
}
/******************************************************************************
* Encryption and authentication using SKINNY-AEAD-M1
******************************************************************************/
// SKINNY-AEAD-M1 encryption (NIST LWC API). Processes the message two
// blocks at a time with the double-block SKINNY-128-384 primitive, then
// authenticates the associated data and appends the 16-byte tag.
// Domain bytes: 0x00 full msg block (default), 0x01 padded msg block,
// 0x04/0x05 tag computation (full/partial last block).
// Returns 0; *clen is set to mlen + TAGBYTES.
int crypto_aead_encrypt (unsigned char *c, unsigned long long *clen,
        const unsigned char *m, unsigned long long mlen,
        const unsigned char *ad, unsigned long long adlen,
        const unsigned char *nsec,
        const unsigned char *npub,
        const unsigned char *k) {
    u8 feedback; // required by the UPDATE_LFSR macro
    u64 i,lfsr = 1; // 64-bit block counter
    u32 rtk1[8*16]; // round tweakeys derived from TK1 (counter/domain)
    u32 rtk2_3[8*SKINNY128_384_ROUNDS]; // round tweakeys from TK2 (nonce) and TK3 (key)
    u8 tmp[2*BLOCKBYTES], tag[BLOCKBYTES], auth[BLOCKBYTES], sum[BLOCKBYTES];
    (void)nsec; // no secret message number in this scheme
    // ----------------- Initialization -----------------
    *clen = mlen + TAGBYTES;
    tkschedule_lfsr_2(rtk2_3, npub, npub, SKINNY128_384_ROUNDS);
    tkschedule_lfsr_3(rtk2_3, k, k, SKINNY128_384_ROUNDS);
    tkschedule_perm(rtk2_3);
    memset(tmp, 0x00, 2*BLOCKBYTES);
    memset(tag, 0x00, BLOCKBYTES);
    memset(auth, 0x00, BLOCKBYTES);
    memset(sum, 0x00, BLOCKBYTES); // checksum of plaintext blocks
    // ----------------- Initialization -----------------
    // ----------------- Process the plaintext -----------------
    while (mlen >= 2*BLOCKBYTES) { // process 2 blocks in //
        LE_STR_64(tmp, lfsr);
        UPDATE_LFSR(lfsr);
        LE_STR_64(tmp + BLOCKBYTES, lfsr);
        tkschedule_perm_tk1(rtk1, tmp, tmp + BLOCKBYTES);
        skinny128_384(c, c + BLOCKBYTES, m, m + BLOCKBYTES, rtk1, rtk2_3);
        xor_block(sum, m); // sum for tag computation
        xor_block(sum, m + BLOCKBYTES); // sum for tag computation
        mlen -= 2*BLOCKBYTES;
        c += 2*BLOCKBYTES;
        m += 2*BLOCKBYTES;
        UPDATE_LFSR(lfsr);
    }
    SET_DOMAIN(tag, 0x04); // domain for tag computation
    if (mlen > BLOCKBYTES) { // pad and process 2 blocs in //
        LE_STR_64(tmp, lfsr);
        UPDATE_LFSR(lfsr);
        LE_STR_64(tmp + BLOCKBYTES, lfsr);
        SET_DOMAIN(tmp + BLOCKBYTES, 0x01); // domain for padding m
        tkschedule_perm_tk1(rtk1, tmp, tmp + BLOCKBYTES);
        // second instance encrypts `auth` (zeros) to get the keystream
        // for the partial block
        skinny128_384(c, auth, m, auth, rtk1, rtk2_3);
        xor_block(sum, m);
        for(i = 0; i < mlen - BLOCKBYTES; i++) {
            c[BLOCKBYTES + i] = auth[i] ^ m[BLOCKBYTES + i];
            sum[i] ^= m[BLOCKBYTES + i];
        }
        sum[i] ^= 0x80; // padding
        SET_DOMAIN(tag, 0x05); // domain for tag computation
        m += mlen;
        c += mlen;
        mlen = 0;
        UPDATE_LFSR(lfsr);
    } else if (mlen == BLOCKBYTES) { // last block is full
        LE_STR_64(tmp, lfsr);
        UPDATE_LFSR(lfsr);
        LE_STR_64(tmp + BLOCKBYTES, lfsr);
        SET_DOMAIN(tmp + BLOCKBYTES, 0x04); // domain for tag computation
        xor_block(sum, m); // sum for tag computation
        tkschedule_perm_tk1(rtk1, tmp, tmp + BLOCKBYTES);
        // encrypt last block and the tag in parallel (mlen left non-zero
        // so the tag path below and in the AD routine is skipped)
        skinny128_384(c, sum, m, sum, rtk1, rtk2_3);
        c += BLOCKBYTES;
    } else if (mlen > 0) { // last block is partial
        LE_STR_64(tmp, lfsr);
        SET_DOMAIN(tmp, 0x01); // domain for padding
        UPDATE_LFSR(lfsr);
        LE_STR_64(tmp + BLOCKBYTES, lfsr);
        SET_DOMAIN(tmp + BLOCKBYTES, 0x05); // domain for tag computation
        for(i = 0; i < mlen; i++) // sum for tag computation
            sum[i] ^= m[i]; // sum for tag computation
        sum[i] ^= 0x80; // padding
        tkschedule_perm_tk1(rtk1, tmp, tmp + BLOCKBYTES);
        // keystream generation and tag encryption in parallel
        skinny128_384(auth, sum, auth, sum, rtk1, rtk2_3);
        for(i = 0; i < mlen; i++)
            c[i] = auth[i] ^ m[i]; // encrypted padded block
        c += mlen;
    }
    if (mlen == 0) { // if tag has *NOT* been calculated yet
        LE_STR_64(tag, lfsr); // lfsr for tag computation
        // if the AD ends on an even/partial-double boundary the tag cannot
        // be fused with the last AD block, so compute it here
        if((adlen % 32) == 0 || (adlen % 32) > BLOCKBYTES) {
            tkschedule_perm_tk1(rtk1, tag, tag);
            skinny128_384(sum, sum, sum, sum, rtk1, rtk2_3); // compute the tag
        }
    }
    // ----------------- Process the plaintext -----------------
    // ----------------- Process the associated data -----------------
    skinny_aead_m1_auth(auth, sum, tag, rtk1, rtk2_3, mlen, ad, adlen);
    xor_block(sum, auth);
    memcpy(c, sum, TAGBYTES); // append the 16-byte tag
    // ----------------- Process the associated data -----------------
    return 0;
}
/******************************************************************************
* Decryption and authentication using SKINNY-AEAD-M1
******************************************************************************/
// SKINNY-AEAD-M1 decryption (NIST LWC API). Mirrors crypto_aead_encrypt:
// decrypts two blocks at a time, recomputes the checksum/tag, authenticates
// the AD, then verifies the tag in constant time.
// Returns 0 on success and -1 on failure (tag mismatch or clen < TAGBYTES),
// per the NIST API convention. (Previously the raw XOR accumulator - an
// arbitrary non-zero byte - was returned on failure.)
// NOTE(review): the candidate plaintext is written to m before the tag is
// verified; callers must discard m when the return value is non-zero.
int crypto_aead_decrypt (unsigned char *m, unsigned long long *mlen,
        unsigned char *nsec,
        const unsigned char *c, unsigned long long clen,
        const unsigned char *ad, unsigned long long adlen,
        const unsigned char *npub,
        const unsigned char *k) {
    u8 feedback; // required by the UPDATE_LFSR macro
    u64 i,lfsr = 1; // 64-bit block counter
    u32 rtk1[8*16]; // round tweakeys derived from TK1 (counter/domain)
    u32 rtk2_3[8*SKINNY128_384_ROUNDS]; // round tweakeys from TK2 (nonce) and TK3 (key)
    u8 tmp[2*BLOCKBYTES];
    u8 sum[BLOCKBYTES], tag[BLOCKBYTES], auth[BLOCKBYTES];
    (void)nsec; // no secret message number in this scheme
    if (clen < TAGBYTES)
        return -1;
    // ----------------- Initialization -----------------
    clen -= TAGBYTES; // clen now holds the ciphertext-only length
    *mlen = clen;
    tkschedule_lfsr_2(rtk2_3, npub, npub, SKINNY128_384_ROUNDS);
    tkschedule_lfsr_3(rtk2_3, k, k, SKINNY128_384_ROUNDS);
    tkschedule_perm(rtk2_3);
    memset(tmp, 0x00, 2*BLOCKBYTES);
    memset(tag, 0x00, BLOCKBYTES);
    memset(auth, 0x00, BLOCKBYTES);
    memset(sum, 0x00, BLOCKBYTES); // checksum of recovered plaintext blocks
    // ----------------- Initialization -----------------
    // ----------------- Process the ciphertext -----------------
    while (clen >= 2*BLOCKBYTES) { // process 2 blocks in //
        LE_STR_64(tmp, lfsr);
        UPDATE_LFSR(lfsr);
        LE_STR_64(tmp + BLOCKBYTES, lfsr);
        tkschedule_perm_tk1(rtk1, tmp, tmp + BLOCKBYTES);
        skinny128_384_inv(m, m + BLOCKBYTES, c, c + BLOCKBYTES, rtk1, rtk2_3);
        xor_block(sum, m); // sum for tag computation
        xor_block(sum, m + BLOCKBYTES); // sum for tag computation
        clen -= 2*BLOCKBYTES;
        c += 2*BLOCKBYTES;
        m += 2*BLOCKBYTES;
        UPDATE_LFSR(lfsr);
    }
    SET_DOMAIN(tag, 0x04); // domain for tag computation
    if (clen > BLOCKBYTES) { // one full block + one partial block left
        LE_STR_64(tmp, lfsr);
        tkschedule_perm_tk1(rtk1, tmp, tmp);
        skinny128_384_inv(m, m, c, c, rtk1, rtk2_3); // decrypt the full block
        xor_block(sum, m);
        UPDATE_LFSR(lfsr);
        LE_STR_64(tmp, lfsr);
        SET_DOMAIN(tmp, 0x01); // domain for padding m
        tkschedule_perm_tk1(rtk1, tmp, tmp);
        skinny128_384(auth, auth, auth, auth, rtk1, rtk2_3); // keystream block
        for(i = 0; i < clen - BLOCKBYTES; i++) {
            m[BLOCKBYTES + i] = auth[i] ^ c[BLOCKBYTES + i];
            sum[i] ^= m[BLOCKBYTES + i];
        }
        sum[i] ^= 0x80; // padding
        SET_DOMAIN(tag, 0x05); // domain for tag computation
        c += clen;
        clen = 0;
        UPDATE_LFSR(lfsr);
    } else if (clen == BLOCKBYTES) { // last block is full
        LE_STR_64(tmp, lfsr);
        tkschedule_perm_tk1(rtk1, tmp, tmp);
        skinny128_384_inv(m, m, c, c, rtk1, rtk2_3);
        xor_block(sum, m); // sum for tag computation
        SET_DOMAIN(tag, 0x04); // domain for tag computation
        UPDATE_LFSR(lfsr);
        c += BLOCKBYTES;
        clen = 0;
    } else if (clen > 0) { // last block is partial
        LE_STR_64(tmp, lfsr);
        SET_DOMAIN(tmp, 0x01); // domain for padding
        tkschedule_perm_tk1(rtk1, tmp, tmp);
        skinny128_384(auth, auth, auth, auth, rtk1, rtk2_3); // keystream block
        for(i = 0; i < clen; i++) {
            m[i] = auth[i] ^ c[i]; // decrypted padded block
            sum[i] ^= m[i]; // sum for tag computation
        }
        sum[i] ^= 0x80; // padding
        SET_DOMAIN(tag, 0x05); // domain for tag computation
        UPDATE_LFSR(lfsr);
        c += clen;
        clen = 0;
    }
    if (clen == 0) { // if tag has *NOT* been calculated yet
        LE_STR_64(tag, lfsr);
        // if the AD ends on an even/partial-double boundary the tag cannot
        // be fused with the last AD block, so compute it here
        if((adlen % 32) == 0 || (adlen % 32) > BLOCKBYTES) {
            tkschedule_perm_tk1(rtk1, tag, tag); //if AD can be processed in //
            skinny128_384(sum, sum, sum, sum, rtk1, rtk2_3); // compute the tag
        }
    }
    // ----------------- Process the associated data -----------------
    skinny_aead_m1_auth(auth, sum, tag, rtk1, rtk2_3, clen, ad, adlen);
    xor_block(sum, auth);
    feedback = 0;
    for(i = 0; i < TAGBYTES; i++)
        feedback |= sum[i] ^ c[i]; // constant-time tag verification
    return (feedback != 0) ? -1 : 0; // normalize to the 0 / -1 API convention
    // ----------------- Process the associated data -----------------
}
\ No newline at end of file
#ifndef SKINNY128_H_
#define SKINNY128_H_
typedef unsigned char u8;
typedef unsigned int u32;
#define SKINNY128_384_ROUNDS 56
// Double-block fixsliced SKINNY-128-384: encrypts/decrypts two independent
// 16-byte blocks at once (assembly implementations).
extern void skinny128_384(u8* ctext, u8* ctext_bis, const u8* ptext, const u8* ptext_bis, const u32* rtk1, const u32* rtk2_3);
extern void skinny128_384_inv(u8* ptext, u8* ptext_bis, const u8* ctext, const u8* ctext_bis, const u32* rtk1, const u32* rtk2_3);
// Tweakey schedule: apply the TK2/TK3 LFSRs and the tweakey permutation
// for `rounds` rounds, producing interleaved round tweakeys for two blocks.
extern void tkschedule_lfsr_2(u32* rtk, const u8* tk2, const u8* tk2_bis, const int rounds);
// NOTE(review): pack_tk1 is declared but its use is not visible here - confirm.
extern void pack_tk1(u32* rtk, const u8* tk2, const u8* tk2_bis, const int rounds);
extern void tkschedule_lfsr_3(u32* rtk, const u8* tk3, const u8* tk3_bis, const int rounds);
extern void tkschedule_perm(u32* rtk);
// TK1 schedule for one pair of 16-byte tweakey blocks (16 rounds).
extern void tkschedule_perm_tk1(u32* rtk1, const u8* tk1, const u8* tk1_bis);
#endif // SKINNY128_H_
\ No newline at end of file
This source diff could not be displayed because it is too large. You can view the blob instead.
#ifndef SKINNYAEADM1_H_
#define SKINNYAEADM1_H_
#include "skinny128.h"
typedef unsigned char u8;
typedef unsigned int u32;
typedef unsigned long long u64;
#define TAGBYTES 16 // 128-bit authentication tag
#define KEYBYTES 16 // 128-bit key
#define BLOCKBYTES 16 // 128-bit cipher block
// Write the SKINNY-AEAD domain-separation byte into the last byte of a
// 16-byte tweakey block.
#define SET_DOMAIN(ptr, domain) ((ptr)[15] = (domain))
// Clock the 64-bit block-counter LFSR once (feedback constant 0x1B).
// NOTE: expects a local variable `u8 feedback;` to be declared by the caller.
#define UPDATE_LFSR(lfsr) ({ \
    feedback = ((lfsr) & (1ULL << 63)) ? 0x1B : 0x00; \
    (lfsr) = ((lfsr) << 1) ^ feedback; \
})
// Store the 64-bit value x into ptr[0..7] in little-endian byte order.
#define LE_STR_64(ptr, x) ({ \
    (ptr)[0] = (u8)(x); \
    (ptr)[1] = (u8)((x) >> 8); \
    (ptr)[2] = (u8)((x) >> 16); \
    (ptr)[3] = (u8)((x) >> 24); \
    (ptr)[4] = (u8)((x) >> 32); \
    (ptr)[5] = (u8)((x) >> 40); \
    (ptr)[6] = (u8)((x) >> 48); \
    (ptr)[7] = (u8)((x) >> 56); \
})
//x ^= y with x, y 128-bit blocks
// NOTE(review): XORs through u32 casts - assumes x and y are 4-byte
// aligned; confirm on targets with strict alignment.
#define XOR_BLOCK(x,y) ({ \
    ((u32*)(x))[0] ^= ((u32*)(y))[0]; \
    ((u32*)(x))[1] ^= ((u32*)(y))[1]; \
    ((u32*)(x))[2] ^= ((u32*)(y))[2]; \
    ((u32*)(x))[3] ^= ((u32*)(y))[3]; \
})
#endif // SKINNYAEADM1_H_
\ No newline at end of file
#define CRYPTO_KEYBYTES 16 // 128-bit key
#define CRYPTO_NSECBYTES 0 // no secret message number
#define CRYPTO_NPUBBYTES 16 // 128-bit public nonce
#define CRYPTO_ABYTES 16 // 128-bit authentication tag
#define CRYPTO_NOOVERLAP 1
// NIST LWC API: encrypt m under (k, npub), authenticate ad, write
// ciphertext || tag to c with *clen = mlen + CRYPTO_ABYTES. Returns 0.
int crypto_aead_encrypt(
    unsigned char *c, unsigned long long *clen,
    const unsigned char *m, unsigned long long mlen,
    const unsigned char *ad, unsigned long long adlen,
    const unsigned char *nsec,
    const unsigned char *npub,
    const unsigned char *k
);
// NIST LWC API: decrypt and verify c (ciphertext || tag); on success the
// plaintext is in m and *mlen is set. Returns non-zero on failure.
int crypto_aead_decrypt(
    unsigned char *m, unsigned long long *mlen,
    unsigned char *nsec,
    const unsigned char *c, unsigned long long clen,
    const unsigned char *ad, unsigned long long adlen,
    const unsigned char *npub,
    const unsigned char *k
);
/******************************************************************************
* Constant-time implementation of SKINNY-AEAD-M1 (v1.1).
*
* Two blocks are treated in parallel with SKINNY-128-384 whenever possible.
*
* For more details, see the paper at: https://
*
* @author Alexandre Adomnicai, Nanyang Technological University,
* alexandre.adomnicai@ntu.edu.sg
*
* @date May 2020
******************************************************************************/
#include "skinny128.h"
#include "skinnyaead.h"
#include <string.h>
#include <stdio.h>
/******************************************************************************
* x ^= y where x, y are 128-bit blocks (16 bytes array).
******************************************************************************/
static void xor_block(u8 * x, const u8* y) {
    // In-place XOR of one 16-byte block: x ^= y, byte by byte.
    for (int pos = BLOCKBYTES - 1; pos >= 0; pos--)
        x[pos] ^= y[pos];
}
/******************************************************************************
* Encryption and authentication using SKINNY-AEAD-M1
******************************************************************************/
// SKINNY-AEAD-M1 encryption (NIST LWC API), single-block C implementation.
// Encrypts the message one block at a time, accumulates the plaintext
// checksum in the tag slot (c + mlen), then computes the tag and XORs in
// the AD authentication value. Returns 0; *clen = mlen + TAGBYTES.
// Fix: removed leftover debug printf loops that dumped the entire round
// tweakey schedule (rtk1 and rtk2_3, i.e. key-dependent material) to stdout.
int crypto_aead_encrypt (unsigned char *c, unsigned long long *clen,
        const unsigned char *m, unsigned long long mlen,
        const unsigned char *ad, unsigned long long adlen,
        const unsigned char *nsec,
        const unsigned char *npub,
        const unsigned char *k) {
    u64 i,lfsr = 1; // 64-bit block counter
    u8 feedback; // required by the UPDATE_LFSR macro
    u32 rtk1[4*16]; // round tweakeys derived from TK1 (counter/domain)
    u32 rtk2_3[4*SKINNY128_384_ROUNDS]; // round tweakeys from TK2 (nonce) and TK3 (key)
    u8 tmp[2*BLOCKBYTES], auth[BLOCKBYTES];
    (void)nsec; // no secret message number in this scheme
    // ----------------- Initialization -----------------
    *clen = mlen + TAGBYTES;
    precompute_rtk2_3(rtk2_3, npub, k);
    memset(tmp, 0x00, 2*BLOCKBYTES);
    memset(auth, 0x00, BLOCKBYTES);
    memset(c + mlen, 0x00, BLOCKBYTES); // tag slot doubles as the checksum accumulator
    // ----------------- Initialization -----------------
    // ----------------- Process the plaintext -----------------
    while (mlen >= BLOCKBYTES) { // while entire blocks to process
        LE_STR_64(tmp, lfsr);
        precompute_rtk1(rtk1, tmp); // precompute RTK1 given the LFSR
        skinny128_384_encrypt(c, m, rtk1, rtk2_3);
        xor_block(c + mlen, m); // sum for tag computation
        mlen -= BLOCKBYTES;
        c += BLOCKBYTES;
        m += BLOCKBYTES;
        UPDATE_LFSR(lfsr); // update lfsr for next block
    }
    SET_DOMAIN(tmp, 0x04); // domain for tag computation
    if (mlen > 0) { // last block is partial
        LE_STR_64(tmp, lfsr); // lfsr for last block
        SET_DOMAIN(tmp, 0x01); // domain for padding
        for(i = 0; i < mlen; i++)
            c[mlen + i] ^= m[i]; // sum for tag computation
        c[mlen + i] ^= 0x80; // padding
        precompute_rtk1(rtk1, tmp);
        skinny128_384_encrypt(auth, auth, rtk1, rtk2_3); // keystream block
        for(i = 0; i < mlen; i++)
            c[i] = auth[i] ^ m[i]; // encrypted padded block
        c += mlen;
        SET_DOMAIN(tmp, 0x05); // domain for tag computation
        UPDATE_LFSR(lfsr);
    }
    LE_STR_64(tmp, lfsr); // lfsr for tag computation
    precompute_rtk1(rtk1, tmp);
    skinny128_384_encrypt(c, c, rtk1, rtk2_3); // compute the tag
    // ----------------- Process the plaintext -----------------
    // ----------------- Process the associated data -----------------
    lfsr = 1; // AD uses its own block counter
    SET_DOMAIN(tmp, 0x02); // domain for full AD blocks
    memset(auth, 0x00, BLOCKBYTES);
    while (adlen >= BLOCKBYTES) {
        LE_STR_64(tmp, lfsr);
        precompute_rtk1(rtk1, tmp);
        skinny128_384_encrypt(tmp + BLOCKBYTES, ad, rtk1, rtk2_3);
        xor_block(auth, tmp + BLOCKBYTES);
        adlen -= BLOCKBYTES;
        ad += BLOCKBYTES;
        UPDATE_LFSR(lfsr);
    }
    if (adlen > 0) { // final partial AD block
        LE_STR_64(tmp, lfsr);
        SET_DOMAIN(tmp, 0x03); // domain for padding ad
        precompute_rtk1(rtk1, tmp);
        memset(tmp, 0x00, BLOCKBYTES); // padding
        memcpy(tmp, ad, adlen); // padding
        tmp[adlen] ^= 0x80; // padding
        skinny128_384_encrypt(tmp, tmp, rtk1, rtk2_3);
        xor_block(auth, tmp);
    }
    xor_block(c, auth); // XOR for tag computation
    // ----------------- Process the associated data -----------------
    return 0;
}
/******************************************************************************
* Encryption and authentication using SKINNY-AEAD-M1
******************************************************************************/
// SKINNY-AEAD-M1 decryption (NIST LWC API), single-block C implementation.
// Mirrors crypto_aead_encrypt: decrypts block by block, recomputes the
// checksum/tag, authenticates the AD, then verifies the tag in constant
// time. Returns 0 on success and -1 on failure, per the NIST API
// convention (previously the raw non-zero XOR accumulator was returned).
// NOTE(review): the candidate plaintext is written to m before the tag is
// verified; callers must discard m when the return value is non-zero.
int crypto_aead_decrypt (unsigned char *m, unsigned long long *mlen,
        unsigned char *nsec,
        const unsigned char *c, unsigned long long clen,
        const unsigned char *ad, unsigned long long adlen,
        const unsigned char *npub,
        const unsigned char *k) {
    u64 i,lfsr = 1; // 64-bit block counter
    u8 feedback; // required by the UPDATE_LFSR macro
    u32 rtk1[4*16]; // round tweakeys derived from TK1 (counter/domain)
    u32 rtk2_3[4*SKINNY128_384_ROUNDS]; // round tweakeys from TK2 (nonce) and TK3 (key)
    u8 tmp[2*BLOCKBYTES], auth[BLOCKBYTES], sum[BLOCKBYTES];
    (void)nsec; // no secret message number in this scheme
    if (clen < TAGBYTES)
        return -1;
    // ----------------- Initialization -----------------
    clen -= TAGBYTES; // clen now holds the ciphertext-only length
    *mlen = clen;
    precompute_rtk2_3(rtk2_3, npub, k);
    memset(tmp, 0x00, 2*BLOCKBYTES);
    memset(auth, 0x00, BLOCKBYTES);
    memset(sum, 0x00, BLOCKBYTES); // checksum of recovered plaintext blocks
    // ----------------- Initialization -----------------
    // ----------------- Process the ciphertext -----------------
    while (clen >= BLOCKBYTES) { // while entire blocks to process
        LE_STR_64(tmp, lfsr);
        precompute_rtk1(rtk1, tmp); // precompute RTK1 given the LFSR
        skinny128_384_decrypt(m, c, rtk1, rtk2_3);
        xor_block(sum, m); // sum for tag computation
        clen -= BLOCKBYTES;
        c += BLOCKBYTES;
        m += BLOCKBYTES;
        UPDATE_LFSR(lfsr); // update LFSR for the next block
    }
    SET_DOMAIN(tmp, 0x04); // domain for tag computation
    if (clen > 0) { // last block is partial
        LE_STR_64(tmp, lfsr); // lfsr for last block
        SET_DOMAIN(tmp, 0x01); // domain for padding
        precompute_rtk1(rtk1, tmp);
        skinny128_384_encrypt(auth, auth, rtk1, rtk2_3); // keystream block
        for(i = 0; i < clen; i++) {
            m[i] = auth[i] ^ c[i]; // decrypted padded block
            sum[i] ^= m[i]; // sum for tag computation
        }
        sum[i] ^= 0x80; // padding
        c += clen;
        SET_DOMAIN(tmp, 0x05); // domain for tag computation
        UPDATE_LFSR(lfsr);
    }
    LE_STR_64(tmp, lfsr); // lfsr for tag computation
    precompute_rtk1(rtk1, tmp);
    skinny128_384_encrypt(sum, sum, rtk1, rtk2_3); // compute the tag
    // ----------------- Process the ciphertext -----------------
    // ----------------- Process the associated data -----------------
    lfsr = 1; // AD uses its own block counter
    SET_DOMAIN(tmp, 0x02); // domain for full AD blocks
    memset(auth, 0x00, BLOCKBYTES);
    while (adlen >= BLOCKBYTES) {
        LE_STR_64(tmp, lfsr);
        precompute_rtk1(rtk1, tmp);
        skinny128_384_encrypt(tmp + BLOCKBYTES, ad, rtk1, rtk2_3);
        xor_block(auth, tmp + BLOCKBYTES);
        adlen -= BLOCKBYTES;
        ad += BLOCKBYTES;
        UPDATE_LFSR(lfsr);
    }
    if (adlen > 0) { // final partial AD block
        LE_STR_64(tmp, lfsr);
        SET_DOMAIN(tmp, 0x03); // domain for padding ad
        precompute_rtk1(rtk1, tmp);
        memset(tmp, 0x00, BLOCKBYTES); // padding
        memcpy(tmp, ad, adlen); // padding
        tmp[adlen] ^= 0x80; // padding
        skinny128_384_encrypt(tmp, tmp, rtk1, rtk2_3);
        xor_block(auth, tmp);
    }
    xor_block(sum, auth); // XOR for tag computation
    feedback = 0;
    for(i = 0; i < TAGBYTES; i++)
        feedback |= sum[i] ^ c[i]; // constant-time tag verification
    return (feedback != 0) ? -1 : 0; // normalize to the 0 / -1 API convention
    // ----------------- Process the associated data -----------------
}
\ No newline at end of file
/******************************************************************************
* Constant-time implementation of the SKINNY tweakable block ciphers.
*
* This implementation doesn't compute the ShiftRows operation. Some masks and
* shifts are applied during the MixColumns operation so that the proper bits
* are XORed together. Moreover, the row permutation within the MixColumns
* is omitted, as well as the bit permutation at the end of the Sbox. The rows
* are synchronized with the classical representation after only 4 rounds. Therefore, this
* implementation relies on a "QUADRUPLE_ROUND" routine.
*
* The Sbox computation takes advantage of some symmetry in the 8-bit Sbox to
* turn it into a 4-bit S-box computation. Although the last bit permutation
* within the Sbox is not computed, the bit ordering is synchronized with the
* classical representation after 2 calls.
*
* @author Alexandre Adomnicai, Nanyang Technological University,
* alexandre.adomnicai@ntu.edu.sg
*
* @date May 2020
******************************************************************************/
#include <stdio.h>
#include <string.h>
#include "skinny128.h"
#include "tk_schedule.h"
/******************************************************************************
* The MixColumns computation for rounds i such that (i % 4) == 0
******************************************************************************/
void mixcolumns_0(u32* state) {
    // Bitsliced MixColumns for rounds with (round % 4) == 0; applied to
    // each of the 4 state slices in turn.
    for (int row = 0; row < 4; row++) {
        u32 t = ROR(state[row], 24) & 0x0c0c0c0c;
        state[row] ^= ROR(t, 30);
        t = ROR(state[row], 16) & 0xc0c0c0c0;
        state[row] ^= ROR(t, 4);
        t = ROR(state[row], 8) & 0x0c0c0c0c;
        state[row] ^= ROR(t, 2);
    }
}
/******************************************************************************
* The MixColumns computation for rounds i such that (i % 4) == 1
******************************************************************************/
void mixcolumns_1(u32* state) {
    // Bitsliced MixColumns for rounds with (round % 4) == 1.
    for (int row = 0; row < 4; row++) {
        u32 t = ROR(state[row], 16) & 0x30303030;
        state[row] ^= ROR(t, 30);
        t = state[row] & 0x03030303;
        state[row] ^= ROR(t, 28);
        t = ROR(state[row], 16) & 0x30303030;
        state[row] ^= ROR(t, 2);
    }
}
/******************************************************************************
* The MixColumns computation for rounds i such that (i % 4) == 2
******************************************************************************/
void mixcolumns_2(u32* state) {
    // Bitsliced MixColumns for rounds with (round % 4) == 2.
    for (int row = 0; row < 4; row++) {
        u32 t = ROR(state[row], 8) & 0xc0c0c0c0;
        state[row] ^= ROR(t, 6);
        t = ROR(state[row], 16) & 0x0c0c0c0c;
        state[row] ^= ROR(t, 28);
        t = ROR(state[row], 24) & 0xc0c0c0c0;
        state[row] ^= ROR(t, 2);
    }
}
/******************************************************************************
* The MixColumns computation for rounds i such that (i % 4) == 3
******************************************************************************/
void mixcolumns_3(u32* state) {
    // Bitsliced MixColumns for rounds with (round % 4) == 3.
    for (int row = 0; row < 4; row++) {
        u32 t = state[row] & 0x03030303;
        state[row] ^= ROR(t, 30);
        t = state[row] & 0x30303030;
        state[row] ^= ROR(t, 4);
        t = state[row] & 0x03030303;
        state[row] ^= ROR(t, 26);
    }
}
/******************************************************************************
* The inverse MixColumns operation for rounds i such that (i % 4) == 0
******************************************************************************/
void inv_mixcolumns_0(u32* state) {
    // Inverse of mixcolumns_0: same three XOR steps applied in reverse order.
    for (int row = 0; row < 4; row++) {
        u32 t = ROR(state[row], 8) & 0x0c0c0c0c;
        state[row] ^= ROR(t, 2);
        t = ROR(state[row], 16) & 0xc0c0c0c0;
        state[row] ^= ROR(t, 4);
        t = ROR(state[row], 24) & 0x0c0c0c0c;
        state[row] ^= ROR(t, 30);
    }
}
/******************************************************************************
* The inverse MixColumns operation for rounds i such that (i % 4) == 1
******************************************************************************/
void inv_mixcolumns_1(u32* state) {
    // Inverse of mixcolumns_1: same three XOR steps applied in reverse order.
    for (int row = 0; row < 4; row++) {
        u32 t = ROR(state[row], 16) & 0x30303030;
        state[row] ^= ROR(t, 2);
        t = state[row] & 0x03030303;
        state[row] ^= ROR(t, 28);
        t = ROR(state[row], 16) & 0x30303030;
        state[row] ^= ROR(t, 30);
    }
}
/******************************************************************************
* The inverse MixColumns operation for rounds i such that (i % 4) == 2
******************************************************************************/
void inv_mixcolumns_2(u32* state) {
    // Inverse of mixcolumns_2: same three XOR steps applied in reverse order.
    for (int row = 0; row < 4; row++) {
        u32 t = ROR(state[row], 24) & 0xc0c0c0c0;
        state[row] ^= ROR(t, 2);
        t = ROR(state[row], 16) & 0x0c0c0c0c;
        state[row] ^= ROR(t, 28);
        t = ROR(state[row], 8) & 0xc0c0c0c0;
        state[row] ^= ROR(t, 6);
    }
}
/******************************************************************************
* The inverse MixColumns operation for rounds i such that (i % 4) == 3
******************************************************************************/
void inv_mixcolumns_3(u32* state) {
    // Inverse of mixcolumns_3: same three XOR steps applied in reverse order.
    for (int row = 0; row < 4; row++) {
        u32 t = state[row] & 0x03030303;
        state[row] ^= ROR(t, 26);
        t = state[row] & 0x30303030;
        state[row] ^= ROR(t, 4);
        t = state[row] & 0x03030303;
        state[row] ^= ROR(t, 30);
    }
}
/******************************************************************************
* Encryption of a single block without any operation mode using SKINNY-128-384.
* RTK1 and RTK2_3 are given separately to take advantage of the fact that
* TK2 and TK3 remains the same through the entire data encryption/decryption.
******************************************************************************/
/*
 * Encrypt one 16-byte block with fixsliced SKINNY-128-384 (no mode of
 * operation). rtk1 (TK1 round keys, 16 rounds long) and rtk2_3 (TK2/TK3
 * round keys, 56 rounds long) are kept separate because TK2/TK3 stay
 * fixed across a whole encryption/decryption session.
 */
void skinny128_384_encrypt(u8* ctext, const u8* ptext, const u32* rtk1,
        const u32* rtk2_3) {
    u32 tmp; // scratch word required by SWAPMOVE inside the round macro
    u32 state[4]; // 128-bit bitsliced state
    packing(state, ptext); // from byte to bitsliced representation
    // 56 rounds = 14 quadruple rounds; the rtk1 offset cycles through
    // 0,16,32,48 (TK1 schedule repeats every 16 rounds) while rtk2_3
    // advances by 16 words per quadruple round.
    for (int r = 0; r < 14; r++)
        QUADRUPLE_ROUND(state, rtk1 + 16*(r & 3), rtk2_3 + 16*r);
    unpacking(ctext, state); // from bitsliced to byte representation
}
/******************************************************************************
* Decryption of a single block without any operation mode using SKINNY-128-384.
* RTK1 and RTK2_3 are given separately to take advantage of the fact that
* TK2 and TK3 remains the same through the entire data encryption/decryption.
******************************************************************************/
/*
 * Decrypt one 16-byte block with fixsliced SKINNY-128-384 (no mode of
 * operation): the 14 inverse quadruple rounds of skinny128_384_encrypt,
 * applied with the same round-tweakey offsets in reverse order.
 */
void skinny128_384_decrypt(u8* ctext, const u8* ptext, const u32* rtk1,
        const u32* rtk2_3) {
    u32 tmp; // scratch word required by SWAPMOVE inside the round macro
    u32 state[4]; // 128-bit bitsliced state
    packing(state, ptext); // from byte to bitsliced representation
    // Same offset pattern as encryption (rtk1 cycles 0,16,32,48; rtk2_3
    // advances 16 words per quadruple round), walked backwards.
    for (int r = 13; r >= 0; r--)
        INV_QUADRUPLE_ROUND(state, rtk1 + 16*(r & 3), rtk2_3 + 16*r);
    unpacking(ctext, state); // from bitsliced to byte representation
}
\ No newline at end of file
#ifndef SKINNY128_H_
#define SKINNY128_H_
#include "tk_schedule.h"
// Fixsliced single-block SKINNY-128-384 (C implementation). rtk1 holds the
// TK1 round keys (16 rounds, reused cyclically); rtk2_3 holds the combined
// TK2/TK3 round keys for all 56 rounds.
void skinny128_384_encrypt(u8* ctext, const u8* ptext, const u32* rtk1, const u32* rtk2_3);
void skinny128_384_decrypt(u8* ctext, const u8* ptext, const u32* rtk1, const u32* rtk2_3);
#define SKINNY128_384_ROUNDS 56
// Four consecutive SKINNY-128-384 rounds on the bitsliced state. Each round
// is: S-box layer (the NOR/XOR + SWAPMOVE sequence), round-tweakey addition
// (rtk1 words 4r..4r+3 and rtk2_3 words 4r..4r+3, round constants folded in),
// then the MixColumns variant matching the round index mod 4. ShiftRows and
// the final S-box bit permutation are omitted; the representation
// re-synchronizes with the classical one every 4 rounds (see file banner).
// Requires a local `u32 tmp;` in the calling scope for SWAPMOVE.
#define QUADRUPLE_ROUND(state, rtk1, rtk2_3) ({ \
    state[3] ^= ~(state[0] | state[1]); \
    SWAPMOVE(state[2], state[1], 0x55555555, 1); \
    SWAPMOVE(state[3], state[2], 0x55555555, 1); \
    state[1] ^= ~(state[2] | state[3]); \
    SWAPMOVE(state[1], state[0], 0x55555555, 1); \
    SWAPMOVE(state[0], state[3], 0x55555555, 1); \
    state[3] ^= ~(state[0] | state[1]); \
    SWAPMOVE(state[2], state[1], 0x55555555, 1); \
    SWAPMOVE(state[3], state[2], 0x55555555, 1); \
    state[1] ^= (state[2] | state[3]); \
    SWAPMOVE(state[3], state[0], 0x55555555, 0); \
    state[0] ^= (rtk1)[0]; \
    state[1] ^= (rtk1)[1]; \
    state[2] ^= (rtk1)[2]; \
    state[3] ^= (rtk1)[3]; \
    state[0] ^= (rtk2_3)[0]; \
    state[1] ^= (rtk2_3)[1]; \
    state[2] ^= (rtk2_3)[2]; \
    state[3] ^= (rtk2_3)[3]; \
    mixcolumns_0(state); \
    state[1] ^= ~(state[2] | state[3]); \
    SWAPMOVE(state[1], state[0], 0x55555555, 1); \
    SWAPMOVE(state[0], state[3], 0x55555555, 1); \
    state[3] ^= ~(state[0] | state[1]); \
    SWAPMOVE(state[2], state[1], 0x55555555, 1); \
    SWAPMOVE(state[3], state[2], 0x55555555, 1); \
    state[1] ^= ~(state[2] | state[3]); \
    SWAPMOVE(state[1], state[0], 0x55555555, 1); \
    SWAPMOVE(state[0], state[3], 0x55555555, 1); \
    state[3] ^= (state[0] | state[1]); \
    SWAPMOVE(state[1], state[2], 0x55555555, 0); \
    state[0] ^= (rtk1)[4]; \
    state[1] ^= (rtk1)[5]; \
    state[2] ^= (rtk1)[6]; \
    state[3] ^= (rtk1)[7]; \
    state[0] ^= (rtk2_3)[4]; \
    state[1] ^= (rtk2_3)[5]; \
    state[2] ^= (rtk2_3)[6]; \
    state[3] ^= (rtk2_3)[7]; \
    mixcolumns_1(state); \
    state[3] ^= ~(state[0] | state[1]); \
    SWAPMOVE(state[2], state[1], 0x55555555, 1); \
    SWAPMOVE(state[3], state[2], 0x55555555, 1); \
    state[1] ^= ~(state[2] | state[3]); \
    SWAPMOVE(state[1], state[0], 0x55555555, 1); \
    SWAPMOVE(state[0], state[3], 0x55555555, 1); \
    state[3] ^= ~(state[0] | state[1]); \
    SWAPMOVE(state[2], state[1], 0x55555555, 1); \
    SWAPMOVE(state[3], state[2], 0x55555555, 1); \
    state[1] ^= (state[2] | state[3]); \
    SWAPMOVE(state[3], state[0], 0x55555555, 0); \
    state[0] ^= (rtk1)[8]; \
    state[1] ^= (rtk1)[9]; \
    state[2] ^= (rtk1)[10]; \
    state[3] ^= (rtk1)[11]; \
    state[0] ^= (rtk2_3)[8]; \
    state[1] ^= (rtk2_3)[9]; \
    state[2] ^= (rtk2_3)[10]; \
    state[3] ^= (rtk2_3)[11]; \
    mixcolumns_2(state); \
    state[1] ^= ~(state[2] | state[3]); \
    SWAPMOVE(state[1], state[0], 0x55555555, 1); \
    SWAPMOVE(state[0], state[3], 0x55555555, 1); \
    state[3] ^= ~(state[0] | state[1]); \
    SWAPMOVE(state[2], state[1], 0x55555555, 1); \
    SWAPMOVE(state[3], state[2], 0x55555555, 1); \
    state[1] ^= ~(state[2] | state[3]); \
    SWAPMOVE(state[1], state[0], 0x55555555, 1); \
    SWAPMOVE(state[0], state[3], 0x55555555, 1); \
    state[3] ^= (state[0] | state[1]); \
    SWAPMOVE(state[1], state[2], 0x55555555, 0); \
    state[0] ^= (rtk1)[12]; \
    state[1] ^= (rtk1)[13]; \
    state[2] ^= (rtk1)[14]; \
    state[3] ^= (rtk1)[15]; \
    state[0] ^= (rtk2_3)[12]; \
    state[1] ^= (rtk2_3)[13]; \
    state[2] ^= (rtk2_3)[14]; \
    state[3] ^= (rtk2_3)[15]; \
    mixcolumns_3(state); \
})
/*
 * Inverse of QUADRUPLE_ROUND: undoes 4 fixsliced SKINNY-128 rounds.
 * Each of the 4 segments below is the exact mirror of one forward round:
 * inverse MixColumns, round-tweakey re-addition (rtk1 and rtk2_3 words
 * 12..15 down to 0..3), then the inverse bitsliced S-box (the SWAPMOVE /
 * OR-XOR sequence in reverse order). The statement order is part of the
 * S-box definition — do not reorder.
 */
#define INV_QUADRUPLE_ROUND(state, rtk1, rtk2_3) ({ \
    inv_mixcolumns_3(state); /* undo round 4i+3 */ \
    state[0] ^= (rtk1)[12]; \
    state[1] ^= (rtk1)[13]; \
    state[2] ^= (rtk1)[14]; \
    state[3] ^= (rtk1)[15]; \
    state[0] ^= (rtk2_3)[12]; \
    state[1] ^= (rtk2_3)[13]; \
    state[2] ^= (rtk2_3)[14]; \
    state[3] ^= (rtk2_3)[15]; \
    SWAPMOVE(state[1], state[2], 0x55555555, 0); \
    state[3] ^= (state[0] | state[1]); \
    SWAPMOVE(state[0], state[3], 0x55555555, 1); \
    SWAPMOVE(state[1], state[0], 0x55555555, 1); \
    state[1] ^= ~(state[2] | state[3]); \
    SWAPMOVE(state[3], state[2], 0x55555555, 1); \
    SWAPMOVE(state[2], state[1], 0x55555555, 1); \
    state[3] ^= ~(state[0] | state[1]); \
    SWAPMOVE(state[0], state[3], 0x55555555, 1); \
    SWAPMOVE(state[1], state[0], 0x55555555, 1); \
    state[1] ^= ~(state[2] | state[3]); \
    inv_mixcolumns_2(state); /* undo round 4i+2 */ \
    state[0] ^= (rtk1)[8]; \
    state[1] ^= (rtk1)[9]; \
    state[2] ^= (rtk1)[10]; \
    state[3] ^= (rtk1)[11]; \
    state[0] ^= (rtk2_3)[8]; \
    state[1] ^= (rtk2_3)[9]; \
    state[2] ^= (rtk2_3)[10]; \
    state[3] ^= (rtk2_3)[11]; \
    SWAPMOVE(state[3], state[0], 0x55555555, 0); \
    state[1] ^= (state[2] | state[3]); \
    SWAPMOVE(state[3], state[2], 0x55555555, 1); \
    SWAPMOVE(state[2], state[1], 0x55555555, 1); \
    state[3] ^= ~(state[0] | state[1]); \
    SWAPMOVE(state[0], state[3], 0x55555555, 1); \
    SWAPMOVE(state[1], state[0], 0x55555555, 1); \
    state[1] ^= ~(state[2] | state[3]); \
    SWAPMOVE(state[3], state[2], 0x55555555, 1); \
    SWAPMOVE(state[2], state[1], 0x55555555, 1); \
    state[3] ^= ~(state[0] | state[1]); \
    inv_mixcolumns_1(state); /* undo round 4i+1 */ \
    state[0] ^= (rtk1)[4]; \
    state[1] ^= (rtk1)[5]; \
    state[2] ^= (rtk1)[6]; \
    state[3] ^= (rtk1)[7]; \
    state[0] ^= (rtk2_3)[4]; \
    state[1] ^= (rtk2_3)[5]; \
    state[2] ^= (rtk2_3)[6]; \
    state[3] ^= (rtk2_3)[7]; \
    SWAPMOVE(state[1], state[2], 0x55555555, 0); \
    state[3] ^= (state[0] | state[1]); \
    SWAPMOVE(state[0], state[3], 0x55555555, 1); \
    SWAPMOVE(state[1], state[0], 0x55555555, 1); \
    state[1] ^= ~(state[2] | state[3]); \
    SWAPMOVE(state[3], state[2], 0x55555555, 1); \
    SWAPMOVE(state[2], state[1], 0x55555555, 1); \
    state[3] ^= ~(state[0] | state[1]); \
    SWAPMOVE(state[0], state[3], 0x55555555, 1); \
    SWAPMOVE(state[1], state[0], 0x55555555, 1); \
    state[1] ^= ~(state[2] | state[3]); \
    inv_mixcolumns_0(state); /* undo round 4i */ \
    state[0] ^= (rtk1)[0]; \
    state[1] ^= (rtk1)[1]; \
    state[2] ^= (rtk1)[2]; \
    state[3] ^= (rtk1)[3]; \
    state[0] ^= (rtk2_3)[0]; \
    state[1] ^= (rtk2_3)[1]; \
    state[2] ^= (rtk2_3)[2]; \
    state[3] ^= (rtk2_3)[3]; \
    SWAPMOVE(state[3], state[0], 0x55555555, 0); \
    state[1] ^= (state[2] | state[3]); \
    SWAPMOVE(state[3], state[2], 0x55555555, 1); \
    SWAPMOVE(state[2], state[1], 0x55555555, 1); \
    state[3] ^= ~(state[0] | state[1]); \
    SWAPMOVE(state[0], state[3], 0x55555555, 1); \
    SWAPMOVE(state[1], state[0], 0x55555555, 1); \
    state[1] ^= ~(state[2] | state[3]); \
    SWAPMOVE(state[3], state[2], 0x55555555, 1); \
    SWAPMOVE(state[2], state[1], 0x55555555, 1); \
    state[3] ^= ~(state[0] | state[1]); \
})
#endif // SKINNY128_H_
\ No newline at end of file
#ifndef SKINNYAEADM1_H_
#define SKINNYAEADM1_H_
#include "skinny128.h"
typedef unsigned char u8;
typedef unsigned int u32;
typedef unsigned long long u64;
#define TAGBYTES 16 // 128-bit authentication tag
#define KEYBYTES 16 // 128-bit key
#define BLOCKBYTES 16 // 128-bit cipher block
/* Write the domain-separation byte into the last byte of a 16-byte tweak. */
#define SET_DOMAIN(ptr, domain) ((ptr)[15] = (domain))
/* One step of the 64-bit Galois LFSR used as block counter: shift left and
 * XOR in 0x1B when the MSB was set.
 * NOTE(review): relies on a variable named 'feedback' declared at the call
 * site — every caller must declare it before using this macro. */
#define UPDATE_LFSR(lfsr) ({ \
    feedback = ((lfsr) & (1ULL << 63)) ? 0x1B : 0x00; \
    (lfsr) = ((lfsr) << 1) ^ feedback; \
})
/* Store the 64-bit word x little-endian into ptr[0..7]. */
#define LE_STR_64(ptr, x) ({ \
    (ptr)[0] = (u8)(x); \
    (ptr)[1] = (u8)((x) >> 8); \
    (ptr)[2] = (u8)((x) >> 16); \
    (ptr)[3] = (u8)((x) >> 24); \
    (ptr)[4] = (u8)((x) >> 32); \
    (ptr)[5] = (u8)((x) >> 40); \
    (ptr)[6] = (u8)((x) >> 48); \
    (ptr)[7] = (u8)((x) >> 56); \
})
#endif // SKINNYAEADM1_H_
\ No newline at end of file
/******************************************************************************
* Implementation of the SKINNY tweakey schedule to match fixslicing.
*
* @author Alexandre Adomnicai, Nanyang Technological University,
* alexandre.adomnicai@ntu.edu.sg
*
* @date May 2020
******************************************************************************/
#include <stdio.h>
#include <string.h> //for memcpy/memset
#include "tk_schedule.h"
#include "skinny128.h"
typedef unsigned char u8;
typedef unsigned int u32;
/******************************************************************************
* The round constants according to the new representation.
******************************************************************************/
/* 224 = 4 * 56 words: one 4-word bitsliced round constant per round of
 * SKINNY-128-384 (see precompute_rtk2_3, which XORs rconst_32_bs[i*4+j]
 * into the round tweakey for i in [0, SKINNY128_384_ROUNDS)). */
u32 rconst_32_bs[224] = {
    0x00000004, 0xffffffbf, 0x00000000, 0x00000000, 0x00000000, 0x00000000,
    0x10000100, 0xfffffeff, 0x44000000, 0xfbffffff, 0x00000000, 0x04000000,
    0x00100000, 0x00100000, 0x00100001, 0xffefffff, 0x00440000, 0xffafffff,
    0x00400000, 0x00400000, 0x01000000, 0x01000000, 0x01401000, 0xffbfffff,
    0x01004000, 0xfefffbff, 0x00000400, 0x00000400, 0x00000010, 0x00000000,
    0x00010410, 0xfffffbef, 0x00000054, 0xffffffaf, 0x00000000, 0x00000040,
    0x00000100, 0x00000100, 0x10000140, 0xfffffeff, 0x44000000, 0xfffffeff,
    0x04000000, 0x04000000, 0x00100000, 0x00100000, 0x04000001, 0xfbffffff,
    0x00140000, 0xffafffff, 0x00400000, 0x00000000, 0x00000000, 0x00000000,
    0x01401000, 0xfebfffff, 0x01004400, 0xfffffbff, 0x00000000, 0x00000400,
    0x00000010, 0x00000010, 0x00010010, 0xffffffff, 0x00000004, 0xffffffaf,
    0x00000040, 0x00000040, 0x00000100, 0x00000000, 0x10000140, 0xffffffbf,
    0x40000100, 0xfbfffeff, 0x00000000, 0x04000000, 0x00100000, 0x00000000,
    0x04100001, 0xffefffff, 0x00440000, 0xffefffff, 0x00000000, 0x00400000,
    0x01000000, 0x01000000, 0x00401000, 0xffffffff, 0x00004000, 0xfeffffff,
    0x00000400, 0x00000000, 0x00000000, 0x00000000, 0x00010400, 0xfffffbff,
    0x00000014, 0xffffffbf, 0x00000000, 0x00000000, 0x00000000, 0x00000000,
    0x10000100, 0xffffffff, 0x40000000, 0xfbffffff, 0x00000000, 0x04000000,
    0x00100000, 0x00000000, 0x00100001, 0xffefffff, 0x00440000, 0xffafffff,
    0x00000000, 0x00400000, 0x01000000, 0x01000000, 0x01401000, 0xffffffff,
    0x00004000, 0xfeffffff, 0x00000400, 0x00000400, 0x00000010, 0x00000000,
    0x00010400, 0xfffffbff, 0x00000014, 0xffffffaf, 0x00000000, 0x00000000,
    0x00000000, 0x00000000, 0x10000140, 0xfffffeff, 0x44000000, 0xffffffff,
    0x00000000, 0x04000000, 0x00100000, 0x00100000, 0x00000001, 0xffefffff,
    0x00440000, 0xffafffff, 0x00400000, 0x00000000, 0x00000000, 0x01000000,
    0x01401000, 0xffbfffff, 0x01004000, 0xfffffbff, 0x00000400, 0x00000400,
    0x00000010, 0x00000000, 0x00010010, 0xfffffbff, 0x00000014, 0xffffffef,
    0x00000000, 0x00000040, 0x00000100, 0x00000000, 0x10000040, 0xfffffeff,
    0x44000000, 0xfffffeff, 0x00000000, 0x00000000, 0x00000000, 0x00100000,
    0x04000001, 0xffffffff, 0x00040000, 0xffffffff, 0x00400000, 0x00000000,
    0x00000000, 0x00000000, 0x00001000, 0xfebfffff, 0x01004400, 0xffffffff,
    0x00000000, 0x00000000, 0x00000000, 0x00000010, 0x00010000, 0xffffffff,
    0x00000004, 0xffffffbf, 0x00000040, 0x00000000, 0x00000000, 0x00000000,
    0x10000100, 0xfffffebf, 0x44000100, 0xffffffff, 0x00000000, 0x04000000,
    0x00100000, 0x00100000, 0x00000001, 0xffffffff, 0x00040000, 0xffafffff,
    0x00400000, 0x00000000, 0x00000000, 0x00000000, 0x01401000, 0xffbfffff,
    0x01004000, 0xfffffbff, 0x00000000, 0x00000400, 0x00000010, 0x00000000,
    0x00010010, 0xffffffff
};
/******************************************************************************
* Pack the input into the bitsliced representation
* 24 28 56 60 88 92 120 124 | ... | 0 4 32 36 64 68 96 100
* 25 29 57 61 89 93 121 125 | ... | 1 5 33 37 65 69 97 101
* 26 30 58 62 90 94 122 126 | ... | 2 6 34 38 66 70 98 102
* 27 31 59 63 91 95 123 127 | ... | 3 7 35 39 67 71 99 103
******************************************************************************/
void packing(u32* out, const u8* in) {
    u32 tmp; // scratch word required by the SWAPMOVE macro
    /* Load the 16 input bytes as 4 little-endian words; note the
     * interleaved byte offsets (0, 8, 4, 12). */
    LE_LOAD(out, in);
    LE_LOAD(out + 1, in + 8);
    LE_LOAD(out + 2, in + 4);
    LE_LOAD(out + 3, in + 12);
    /* Bit-permutation network producing the bitsliced layout documented
     * in the banner above. The SWAPMOVE sequence is order-critical. */
    SWAPMOVE(out[0], out[0], 0x0a0a0a0a, 3);
    SWAPMOVE(out[1], out[1], 0x0a0a0a0a, 3);
    SWAPMOVE(out[2], out[2], 0x0a0a0a0a, 3);
    SWAPMOVE(out[3], out[3], 0x0a0a0a0a, 3);
    SWAPMOVE(out[2], out[0], 0x30303030, 2);
    SWAPMOVE(out[1], out[0], 0x0c0c0c0c, 4);
    SWAPMOVE(out[3], out[0], 0x03030303, 6);
    SWAPMOVE(out[1], out[2], 0x0c0c0c0c, 2);
    SWAPMOVE(out[3], out[2], 0x03030303, 4);
    SWAPMOVE(out[3], out[1], 0x03030303, 2);
}
/******************************************************************************
* Unpack the input to a byte-wise representation
******************************************************************************/
void unpacking(u8* out, u32 *in) {
    u32 tmp; // scratch word required by the SWAPMOVE macro
    /* Exact inverse of packing(): the same SWAPMOVE steps (each is an
     * involution) applied in reverse order, then the interleaved
     * little-endian byte stores (offsets 0, 8, 4, 12). */
    SWAPMOVE(in[3], in[1], 0x03030303, 2);
    SWAPMOVE(in[3], in[2], 0x03030303, 4);
    SWAPMOVE(in[1], in[2], 0x0c0c0c0c, 2);
    SWAPMOVE(in[3], in[0], 0x03030303, 6);
    SWAPMOVE(in[1], in[0], 0x0c0c0c0c, 4);
    SWAPMOVE(in[2], in[0], 0x30303030, 2);
    SWAPMOVE(in[0], in[0], 0x0a0a0a0a, 3);
    SWAPMOVE(in[1], in[1], 0x0a0a0a0a, 3);
    SWAPMOVE(in[2], in[2], 0x0a0a0a0a, 3);
    SWAPMOVE(in[3], in[3], 0x0a0a0a0a, 3);
    LE_STORE(out, in[0]);
    LE_STORE(out + 8, in[1]);
    LE_STORE(out + 4, in[2]);
    LE_STORE(out + 12, in[3]);
}
/******************************************************************************
* 0 4 1 5
* 1 5 ---> 2 6
* 2 6 3 7
* 3 7 4 0
******************************************************************************/
void lfsr2_bs(u32* tk) {
    /* Compute the feedback word, then swap each even/odd bit pair in it
     * (0xaaaaaaaa masks the odd bit positions). */
    u32 fb = tk[0] ^ (tk[2] & 0xaaaaaaaa);
    fb = ((fb & 0xaaaaaaaa) >> 1) | ((fb << 1) & 0xaaaaaaaa);
    /* Rotate the four slice words up by one (see diagram above) and
     * append the feedback word. */
    memmove(tk, tk + 1, 3 * sizeof(u32));
    tk[3] = fb;
}
/******************************************************************************
* 0 4 7 3
* 1 5 ---> 0 4
* 2 6 1 5
* 3 7 2 6
******************************************************************************/
void lfsr3_bs(u32* tk) {
    /* Compute the feedback word, then swap each even/odd bit pair in it
     * (0xaaaaaaaa masks the odd bit positions). */
    u32 fb = tk[3] ^ ((tk[1] & 0xaaaaaaaa) >> 1);
    fb = ((fb & 0xaaaaaaaa) >> 1) | ((fb << 1) & 0xaaaaaaaa);
    /* Rotate the four slice words down by one (see diagram above) and
     * prepend the feedback word; memmove handles the overlap. */
    memmove(tk + 1, tk, 3 * sizeof(u32));
    tk[0] = fb;
}
/******************************************************************************
* Apply the permutation in a bitsliced manner, twice
******************************************************************************/
void permute_tk_2(u32* tk) {
    /* For each slice word, gather the bit groups of P^2 with explicit
     * right-rotations ((x >> r) | (x << (32 - r))) and masked shifts. */
    for (int i = 0; i < 4; i++) {
        const u32 x = tk[i];
        u32 y = ((x >> 14) | (x << 18)) & 0xcc00cc00;
        y |= (x & 0x000000ff) << 16;
        y |= (x & 0xcc000000) >> 2;
        y |= (x & 0x0033cc00) >> 8;
        y |= (x & 0x00cc0000) >> 18;
        tk[i] = y;
    }
}
/******************************************************************************
* Apply the permutation in a bitsliced manner, 4 times
******************************************************************************/
void permute_tk_4(u32* tk) {
    /* Bit gathering for P^4; rotations written out explicitly. */
    for (int i = 0; i < 4; i++) {
        const u32 x = tk[i];
        u32 y = ((x >> 22) | (x << 10)) & 0xcc0000cc;
        y |= ((x >> 16) | (x << 16)) & 0x3300cc00;
        y |= ((x >> 24) | (x << 8)) & 0x00cc3300;
        y |= (x & 0x00cc00cc) >> 2;
        tk[i] = y;
    }
}
/******************************************************************************
* Apply the permutation in a bitsliced manner, 6 times
******************************************************************************/
void permute_tk_6(u32* tk) {
    /* Bit gathering for P^6; rotations written out explicitly. */
    for (int i = 0; i < 4; i++) {
        const u32 x = tk[i];
        u32 y = ((x >> 6) | (x << 26)) & 0xcccc0000;
        y |= ((x >> 24) | (x << 8)) & 0x330000cc;
        y |= ((x >> 10) | (x << 22)) & 0x3333;
        y |= (x & 0xcc) << 14;
        y |= (x & 0x3300) << 2;
        tk[i] = y;
    }
}
/******************************************************************************
* Apply the permutation in a bitsliced manner, 8 times
******************************************************************************/
void permute_tk_8(u32* tk) {
    /* Bit gathering for P^8; rotations written out explicitly. */
    for (int i = 0; i < 4; i++) {
        const u32 x = tk[i];
        u32 y = ((x >> 24) | (x << 8)) & 0xcc000033;
        y |= ((x >> 8) | (x << 24)) & 0x33cc0000;
        y |= ((x >> 26) | (x << 6)) & 0x00333300;
        y |= (x & 0x00333300) >> 6;
        tk[i] = y;
    }
}
/******************************************************************************
* Apply the permutation in a bitsliced manner, 10 times
******************************************************************************/
void permute_tk_10(u32* tk) {
    /* Bit gathering for P^10; rotations written out explicitly. */
    for (int i = 0; i < 4; i++) {
        const u32 x = tk[i];
        u32 y = ((x >> 8) | (x << 24)) & 0xcc330000;
        y |= ((x >> 26) | (x << 6)) & 0x33000033;
        y |= ((x >> 22) | (x << 10)) & 0x00cccc00;
        y |= (x & 0x00330000) >> 14;
        y |= (x & 0xcc00) >> 2;
        tk[i] = y;
    }
}
/******************************************************************************
* Apply the permutation in a bitsliced manner, 12 times
******************************************************************************/
void permute_tk_12(u32* tk) {
    /* Bit gathering for P^12; rotations written out explicitly. */
    for (int i = 0; i < 4; i++) {
        const u32 x = tk[i];
        u32 y = ((x >> 8) | (x << 24)) & 0xcc33;
        y |= ((x >> 30) | (x << 2)) & 0x00cc00cc;
        y |= ((x >> 10) | (x << 22)) & 0x33330000;
        y |= ((x >> 16) | (x << 16)) & 0xcc003300;
        tk[i] = y;
    }
}
/******************************************************************************
* Apply the permutation in a bitsliced manner, 14 times
******************************************************************************/
void permute_tk_14(u32* tk) {
    /* Bit gathering for P^14; rotations written out explicitly. */
    for (int i = 0; i < 4; i++) {
        const u32 x = tk[i];
        u32 y = ((x >> 24) | (x << 8)) & 0x0033cc00;
        y |= ((x >> 14) | (x << 18)) & 0x00cc0000;
        y |= ((x >> 30) | (x << 2)) & 0xcc000000;
        y |= ((x >> 16) | (x << 16)) & 0x000000ff;
        y |= ((x >> 18) | (x << 14)) & 0x33003300;
        tk[i] = y;
    }
}
/******************************************************************************
* Precompute all LFSRs on TK2
******************************************************************************/
void precompute_lfsr_tk2(u32* tk, const u8* key, const int rounds) {
    u32 state[4];
    /* Pack TK2 into the bitsliced representation and store the initial
     * value in slot 0. */
    packing(state, key);
    memcpy(tk, state, 16);
    /* One LFSR2 step per pair of rounds; note the destination slots are
     * 4*r+4 with r stepping by 2, so only every other 4-word slot is
     * written — the in-between slots keep their previous contents. */
    for (int r = 0; r < rounds; r += 2) {
        lfsr2_bs(state);
        memcpy(&tk[4*r + 4], state, 16);
    }
}
/******************************************************************************
* Precompute all LFSRs on TK3
******************************************************************************/
void precompute_lfsr_tk3(u32* tk, const u8* key, const int rounds) {
    u32 state[4];
    /* Pack TK3 and XOR it on top of the TK2 values already in tk. */
    packing(state, key);
    for (int j = 0; j < 4; j++)
        tk[j] ^= state[j];
    /* One LFSR3 step per pair of rounds; same every-other-slot layout
     * (offsets 4*r+4, r stepping by 2) as precompute_lfsr_tk2. */
    for (int r = 0; r < rounds; r += 2) {
        lfsr3_bs(state);
        for (int j = 0; j < 4; j++)
            tk[4*r + 4 + j] ^= state[j];
    }
}
/******************************************************************************
* XOR TK with TK1 before applying the permutations.
* The key is then rearranged to match the barrel shiftrows representation.
******************************************************************************/
/* XOR the packed TK1 (from 'key') into the precomputed tweakey material in
 * 'tk' and rearrange everything into the barrel-shiftrows representation.
 * Processes 8 rounds per loop iteration; the power of the tweakey
 * permutation P alternates between P^2..P^8 (first half of each 16-round
 * period) and P^10..P^14/identity (second half). Index offsets and masks
 * are layout-critical — do not touch without the fixslicing paper at hand. */
void permute_tk(u32* tk, const u8* key, const int rounds) {
    u32 test; // 1 while in the first half of a 16-round period
    u32 tk1[4], tmp[4];
    packing(tk1, key);
    memcpy(tmp, tk, 16);
    tmp[0] ^= tk1[0];
    tmp[1] ^= tk1[1];
    tmp[2] ^= tk1[2];
    tmp[3] ^= tk1[3];
    for(int i = 0 ; i < rounds; i += 8) {
        test = (i % 16 < 8) ? 1 : 0; //to apply the right power of P
        /* Round i: high nibbles only, rows rotated (2,3,0,1). */
        tk[i*4] = tmp[2] & 0xf0f0f0f0;
        tk[i*4+1] = tmp[3] & 0xf0f0f0f0;
        tk[i*4+2] = tmp[0] & 0xf0f0f0f0;
        tk[i*4+3] = tmp[1] & 0xf0f0f0f0;
        memcpy(tmp, tk+i*4+4, 16);
        XOR_BLOCKS(tmp, tk1);
        if (test)
            permute_tk_2(tmp); // applies P^2
        else
            permute_tk_10(tmp); // applies P^10
        /* Rounds i+1 and i+2 share this permuted value. */
        tk[i*4+4] = ROR(tmp[0],26) & 0xc3c3c3c3;
        tk[i*4+5] = ROR(tmp[1],26) & 0xc3c3c3c3;
        tk[i*4+6] = ROR(tmp[2],26) & 0xc3c3c3c3;
        tk[i*4+7] = ROR(tmp[3],26) & 0xc3c3c3c3;
        tk[i*4+8] = ROR(tmp[2],28) & 0x03030303;
        tk[i*4+8] |= ROR(tmp[2],12) & 0x0c0c0c0c;
        tk[i*4+9] = ROR(tmp[3],28) & 0x03030303;
        tk[i*4+9] |= ROR(tmp[3],12) & 0x0c0c0c0c;
        tk[i*4+10] = ROR(tmp[0],28) & 0x03030303;
        tk[i*4+10] |= ROR(tmp[0],12) & 0x0c0c0c0c;
        tk[i*4+11] = ROR(tmp[1],28) & 0x03030303;
        tk[i*4+11] |= ROR(tmp[1],12) & 0x0c0c0c0c;
        memcpy(tmp, tk+i*4+12, 16);
        XOR_BLOCKS(tmp, tk1);
        if (test)
            permute_tk_4(tmp); // applies P^4
        else
            permute_tk_12(tmp); // applies P^12
        /* Rounds i+3 and i+4. */
        for(int j = 0; j < 4; j++) {
            tk[i*4+12+j] = ROR(tmp[j],14) & 0x30303030;
            tk[i*4+12+j] |= ROR(tmp[j],6) & 0x0c0c0c0c;
        }
        tk[i*4+16] = ROR(tmp[2], 16) & 0xf0f0f0f0;
        tk[i*4+17] = ROR(tmp[3], 16) & 0xf0f0f0f0;
        tk[i*4+18] = ROR(tmp[0], 16) & 0xf0f0f0f0;
        tk[i*4+19] = ROR(tmp[1], 16) & 0xf0f0f0f0;
        memcpy(tmp, tk+i*4+20, 16);
        XOR_BLOCKS(tmp, tk1);
        if (test)
            permute_tk_6(tmp); // applies P^6
        else
            permute_tk_14(tmp); // applies P^14
        /* Rounds i+5 and i+6. */
        tk[i*4+20] = ROR(tmp[0], 10) & 0xc3c3c3c3;
        tk[i*4+21] = ROR(tmp[1], 10) & 0xc3c3c3c3;
        tk[i*4+22] = ROR(tmp[2], 10) & 0xc3c3c3c3;
        tk[i*4+23] = ROR(tmp[3], 10) & 0xc3c3c3c3;
        tk[i*4+24] = ROR(tmp[2],12) & 0x03030303;
        tk[i*4+24] |= ROR(tmp[2],28) & 0x0c0c0c0c;
        tk[i*4+25] = ROR(tmp[3],12) & 0x03030303;
        tk[i*4+25] |= ROR(tmp[3],28) & 0x0c0c0c0c;
        tk[i*4+26] = ROR(tmp[0],12) & 0x03030303;
        tk[i*4+26] |= ROR(tmp[0],28) & 0x0c0c0c0c;
        tk[i*4+27] = ROR(tmp[1],12) & 0x03030303;
        tk[i*4+27] |= ROR(tmp[1],28) & 0x0c0c0c0c;
        memcpy(tmp, tk+i*4+28, 16);
        XOR_BLOCKS(tmp, tk1);
        if (test)
            permute_tk_8(tmp); // applies P^8
        /* Round i+7 (and the start of the next 8-round group). */
        for(int j = 0; j < 4; j++) {
            tk[i*4+28+j] = ROR(tmp[j],30) & 0x30303030;
            tk[i*4+28+j] |= ROR(tmp[j],22) & 0x0c0c0c0c;
        }
        if (test && (i+8 < rounds)) { //only if next loop iteration
            tk[i*4+32] = tmp[2] & 0xf0f0f0f0;
            tk[i*4+33] = tmp[3] & 0xf0f0f0f0;
            tk[i*4+34] = tmp[0] & 0xf0f0f0f0;
            tk[i*4+35] = tmp[1] & 0xf0f0f0f0;
        }
    }
}
/******************************************************************************
* Precompute LFSR2(TK2) ^ LFSR3(TK3) ^ rconst.
******************************************************************************/
/* Build the full per-round tweakey material LFSR2(TK2) ^ LFSR3(TK3),
 * rearranged into the barrel-shiftrows layout, with the round constants
 * folded in. */
void precompute_rtk2_3(u32* rtk, const u8* tk2, const u8 * tk3) {
    memset(rtk, 0x00, 16*SKINNY128_384_ROUNDS);
    precompute_lfsr_tk2(rtk, tk2, SKINNY128_384_ROUNDS);
    precompute_lfsr_tk3(rtk, tk3, SKINNY128_384_ROUNDS);
    /* permute_tk() expects a 16-byte TK1 as its 2nd argument; here TK1
     * must be zero. The LFSR precompute above only fills every other
     * 4-word slot (offsets 4*i+4 with i stepping by 2), so rtk+8 — an
     * unwritten slot — still holds the 16 zero bytes from the memset,
     * which is what the original "rtk+8 is NULL" comment means. */
    permute_tk(rtk, (u8*)(rtk+8), SKINNY128_384_ROUNDS); // rtk+8 is NULL
    for(int i = 0; i < SKINNY128_384_ROUNDS; i++) { // add rconsts
        for(int j = 0; j < 4; j++)
            rtk[i*4+j] ^= rconst_32_bs[i*4+j];
    }
}
/******************************************************************************
* Precompute RTK1.
******************************************************************************/
/* Build the 16 per-round TK1 words (TK1 repeats with period 16 since its
 * schedule applies no LFSR, only the permutation P). */
void precompute_rtk1(u32* rtk1, const u8* tk1) {
    memset(rtk1, 0x00, 16*16);
    permute_tk(rtk1, tk1, 16);
}
\ No newline at end of file
#ifndef TK_SCHEDULE_H_
#define TK_SCHEDULE_H_
typedef unsigned char u8;
typedef unsigned int u32;
void packing(u32* out, const u8* in);
void unpacking(u8* out, u32 *in);
void precompute_rtk2_3(u32* rtk, const u8* tk2, const u8* tk3);
void precompute_rtk1(u32* rtk1, const u8* tk1);
/* Rotate the 32-bit word x right by y bits (requires 0 < y < 32). */
#define ROR(x,y) (((x) >> (y)) | ((x) << (32 - (y))))
/* x ^= y on 128-bit blocks represented as 4-word u32 arrays. */
#define XOR_BLOCKS(x,y) ({ \
    (x)[0] ^= (y)[0]; \
    (x)[1] ^= (y)[1]; \
    (x)[2] ^= (y)[2]; \
    (x)[3] ^= (y)[3]; \
})
/* Swap the bit groups selected by mask between b and (a >> n); the classic
 * bitslicing primitive. Requires a u32 variable named 'tmp' in scope at the
 * call site. Arguments are fully parenthesized so expression arguments
 * (e.g. shifted masks) cannot change precedence. */
#define SWAPMOVE(a, b, mask, n) ({ \
    tmp = ((b) ^ ((a) >> (n))) & (mask); \
    (b) ^= tmp; \
    (a) ^= (tmp << (n)); \
})
/* Little-endian load of 4 bytes at y into the u32 pointed to by x.
 * Wrapped in do/while(0) so it behaves as a single statement. */
#define LE_LOAD(x, y) do { \
    *(x) = (((u32)(y)[3] << 24) | \
            ((u32)(y)[2] << 16) | \
            ((u32)(y)[1] << 8) | \
            (y)[0]); \
} while (0)
/* Little-endian store of the u32 y into the 4 bytes at x. Wrapped in
 * do/while(0): the previous bare multi-statement form silently broke
 * inside an unbraced if/else. */
#define LE_STORE(x, y) do { \
    (x)[0] = (y) & 0xff; \
    (x)[1] = ((y) >> 8) & 0xff; \
    (x)[2] = ((y) >> 16) & 0xff; \
    (x)[3] = (y) >> 24; \
} while (0)
#endif // TK_SCHEDULE_H_
\ No newline at end of file
/* Parameter sizes (in bytes) advertised to the NIST LWC test harness. */
#define CRYPTO_KEYBYTES 16   // 128-bit key
#define CRYPTO_NSECBYTES 0   // no secret message number
#define CRYPTO_NPUBBYTES 16  // 128-bit public nonce
#define CRYPTO_ABYTES 16     // 128-bit authentication tag
#define CRYPTO_NOOVERLAP 1   // in/out buffers must not partially overlap
/* AEAD encryption (NIST LWC API): writes ciphertext || tag into c and
 * sets *clen = mlen + CRYPTO_ABYTES. Returns 0 on success. */
int crypto_aead_encrypt(
    unsigned char *c, unsigned long long *clen,
    const unsigned char *m, unsigned long long mlen,
    const unsigned char *ad, unsigned long long adlen,
    const unsigned char *nsec,
    const unsigned char *npub,
    const unsigned char *k
);
/* AEAD decryption + verification (NIST LWC API): on success writes the
 * plaintext into m, sets *mlen = clen - CRYPTO_ABYTES and returns 0;
 * returns non-zero when the tag does not verify. */
int crypto_aead_decrypt(
    unsigned char *m, unsigned long long *mlen,
    unsigned char *nsec,
    const unsigned char *c, unsigned long long clen,
    const unsigned char *ad, unsigned long long adlen,
    const unsigned char *npub,
    const unsigned char *k
);
\ No newline at end of file
/******************************************************************************
* Constant-time implementation of SKINNY-AEAD-M1 (v1.1).
*
* Two blocks are treated in parallel with SKINNY-128-384 whenever possible.
*
 * For more details, see the author's fixslicing paper (IACR ePrint archive).
*
* @author Alexandre Adomnicai, Nanyang Technological University,
* alexandre.adomnicai@ntu.edu.sg
*
* @date May 2020
******************************************************************************/
#include "skinny128.h"
#include "skinnyaead.h"
#include <string.h>
#include <stdio.h>
/******************************************************************************
* x ^= y where x, y are 128-bit blocks (16 bytes array).
******************************************************************************/
static void xor_block(u8 * x, const u8* y) {
    /* XOR the 16-byte block y into x, walking both pointers. */
    const u8 *stop = x + BLOCKBYTES;
    while (x != stop)
        *x++ ^= *y++;
}
/******************************************************************************
* Process the associated data. Common to SKINNY-AEAD-M1 encrypt and decrypt
* functions.
******************************************************************************/
/* Absorb the associated data into 'auth': each AD block is encrypted under
 * a tweak holding the 64-bit LFSR counter plus a domain byte (0x02 full AD
 * block, 0x03 padded AD block), and the results are XORed together.
 * When mlen == 0 the message side produced no tag yet, so the final AD
 * block is processed in parallel with the tag computation (using 'tag' as
 * the second tweak and writing through 'c'); otherwise 'c' and 'tag' are
 * not touched here.
 * NOTE(review): this translation unit calls 3-argument precompute_rtk1 and
 * 5-argument skinny128_384_encrypt (two-blocks-in-parallel variants); the
 * tk_schedule.h appearing elsewhere in this dump declares a 2-argument
 * precompute_rtk1 — confirm the matching headers are paired at build time. */
static void skinny_aead_m1_auth(u8* auth, u8* c, u8* tag, tweakey* tk,
                    u64 mlen, const u8* ad, u64 adlen) {
    u64 lfsr = 1;
    u8 feedback; // required by UPDATE_LFSR
    u8 tmp[2*BLOCKBYTES];
    memset(tmp, 0x00, 2*BLOCKBYTES);
    memset(auth, 0x00, BLOCKBYTES);
    SET_DOMAIN(tmp, 0x02);
    while (adlen >= 2*BLOCKBYTES) { // process 2 full AD blocks in parallel
        LE_STR_64(tmp, lfsr);
        UPDATE_LFSR(lfsr);
        LE_STR_64(tmp + BLOCKBYTES, lfsr);
        SET_DOMAIN(tmp + BLOCKBYTES, 0x02);
        precompute_rtk1(tk->rtk1, tmp, tmp+BLOCKBYTES);
        skinny128_384_encrypt(tmp, tmp+BLOCKBYTES, ad, ad+BLOCKBYTES, *tk);
        xor_block(auth, tmp);
        xor_block(auth, tmp + BLOCKBYTES);
        adlen -= 2*BLOCKBYTES;
        ad += 2*BLOCKBYTES;
        UPDATE_LFSR(lfsr);
    }
    if (adlen > BLOCKBYTES) { // pad and process 2 blocs in //
        LE_STR_64(tmp, lfsr);
        UPDATE_LFSR(lfsr);
        LE_STR_64(tmp + BLOCKBYTES, lfsr);
        SET_DOMAIN(tmp + BLOCKBYTES, 0x03); // domain for padding ad
        precompute_rtk1(tk->rtk1, tmp, tmp + BLOCKBYTES);
        adlen -= BLOCKBYTES;
        memset(tmp, 0x00, BLOCKBYTES);
        memcpy(tmp, ad + BLOCKBYTES, adlen);
        tmp[adlen] ^= 0x80; // 10* padding
        skinny128_384_encrypt(tmp + BLOCKBYTES, tmp, ad, tmp, *tk);
        xor_block(auth, tmp);
        xor_block(auth, tmp + BLOCKBYTES);
    } else if (adlen == BLOCKBYTES) { // last AD block is full
        LE_STR_64(tmp, lfsr);
        if (mlen == 0) { // if tag has *NOT* been calculated yet
            precompute_rtk1(tk->rtk1, tmp, tag); // compute the tag
            skinny128_384_encrypt(auth, c, ad, c, *tk);
        } else { // if tag has been calculated already
            precompute_rtk1(tk->rtk1, tmp, tmp); // process last ad block
            skinny128_384_encrypt(auth, auth, ad, ad, *tk);
        }
    } else if (adlen > 0) { // last AD block is partial
        LE_STR_64(tmp, lfsr);
        SET_DOMAIN(tmp, 0x03); // domain for padding ad
        memset(tmp + BLOCKBYTES, 0x00, BLOCKBYTES); // padding
        memcpy(tmp + BLOCKBYTES, ad, adlen); // padding
        tmp[BLOCKBYTES + adlen] ^= 0x80; // 10* padding
        if (mlen == 0) { // if tag has *NOT* been calculated yet
            precompute_rtk1(tk->rtk1, tmp, tag); // compute the tag
            skinny128_384_encrypt(auth, c, tmp + BLOCKBYTES, c, *tk);
        } else { // if tag has been calculated already
            precompute_rtk1(tk->rtk1, tmp, tmp); // process last ad block
            skinny128_384_encrypt(auth, auth, tmp + BLOCKBYTES, tmp + BLOCKBYTES, *tk);
        }
    }
}
/******************************************************************************
* Encryption and authentication using SKINNY-AEAD-M1
******************************************************************************/
/* SKINNY-AEAD-M1 encryption (NIST LWC API). Writes ciphertext followed by
 * the 16-byte tag into c and sets *clen = mlen + TAGBYTES. The tag is the
 * XOR-sum of all plaintext blocks (10*-padded at the end), encrypted under
 * a dedicated domain (0x04 full last block / 0x05 padded last block), then
 * XORed with the AD authentication value. nsec is unused. Returns 0. */
int crypto_aead_encrypt (unsigned char *c, unsigned long long *clen,
                    const unsigned char *m, unsigned long long mlen,
                    const unsigned char *ad, unsigned long long adlen,
                    const unsigned char *nsec,
                    const unsigned char *npub,
                    const unsigned char *k) {
    u64 i,lfsr = 1; // 64-bit block counter (Galois LFSR)
    u8 feedback;    // required by UPDATE_LFSR
    tweakey tk;
    u8 tmp[2*BLOCKBYTES], tag[BLOCKBYTES], auth[BLOCKBYTES];
    (void)nsec;
    // ----------------- Initialization -----------------
    *clen = mlen + TAGBYTES;
    precompute_rtk2_3(tk.rtk2_3, npub, k, SKINNY128_384_ROUNDS);
    memset(tmp, 0x00, 2*BLOCKBYTES);
    memset(tag, 0x00, BLOCKBYTES);
    memset(auth, 0x00, BLOCKBYTES);
    memset(c + mlen, 0x00, BLOCKBYTES); // tag accumulator lives at c + mlen
    // ----------------- Initialization -----------------
    // ----------------- Process the plaintext -----------------
    while (mlen >= 2*BLOCKBYTES) { // process 2 blocks in //
        LE_STR_64(tmp, lfsr); // lfsr for 1st block
        UPDATE_LFSR(lfsr);
        LE_STR_64(tmp + BLOCKBYTES, lfsr); // lfsr for 2nd block
        precompute_rtk1(tk.rtk1, tmp, tmp + BLOCKBYTES);
        skinny128_384_encrypt(c, c + BLOCKBYTES, m, m + BLOCKBYTES, tk);
        xor_block(c + mlen, m); // sum for tag computation
        xor_block(c + mlen, m + BLOCKBYTES); // sum for tag computation
        mlen -= 2*BLOCKBYTES;
        c += 2*BLOCKBYTES;
        m += 2*BLOCKBYTES;
        UPDATE_LFSR(lfsr);
    }
    SET_DOMAIN(tag, 0x04); // domain for tag computation
    if (mlen > BLOCKBYTES) { // pad and process 2 blocs in //
        LE_STR_64(tmp, lfsr); // lfsr for 1st block
        UPDATE_LFSR(lfsr);
        LE_STR_64(tmp + BLOCKBYTES, lfsr); // lfsr for 2nd block
        SET_DOMAIN(tmp + BLOCKBYTES, 0x01); // domain for padding m
        precompute_rtk1(tk.rtk1, tmp, tmp + BLOCKBYTES);
        // 2nd call encrypts 'auth' as a keystream block for the partial m
        skinny128_384_encrypt(c, auth, m, auth, tk);
        xor_block(c + mlen, m);
        for(i = 0; i < mlen - BLOCKBYTES; i++) {
            c[BLOCKBYTES + i] = auth[i] ^ m[BLOCKBYTES + i];
            c[mlen + i] ^= m[BLOCKBYTES + i];
        }
        c[mlen + i] ^= 0x80; // 10* padding of the tag sum
        SET_DOMAIN(tag, 0x05); // domain for tag computation
        m += mlen;
        c += mlen;
        mlen = 0;
        UPDATE_LFSR(lfsr);
    } else if (mlen == BLOCKBYTES) { // last block is full
        LE_STR_64(tmp, lfsr); // lfsr for last full block
        UPDATE_LFSR(lfsr);
        LE_STR_64(tmp + BLOCKBYTES, lfsr); // lfsr for tag computation
        SET_DOMAIN(tmp + BLOCKBYTES, 0x04); // domain for tag computation
        xor_block(c + mlen, m); // sum for tag computation
        precompute_rtk1(tk.rtk1, tmp, tmp + BLOCKBYTES);
        // encrypts the last block and the tag sum in parallel
        skinny128_384_encrypt(c, c + mlen, m, c + mlen, tk);
        c += BLOCKBYTES;
    } else if (mlen > 0) { // last block is partial
        LE_STR_64(tmp, lfsr); // lfsr for last block
        SET_DOMAIN(tmp, 0x01); // domain for padding
        UPDATE_LFSR(lfsr);
        LE_STR_64(tmp + BLOCKBYTES, lfsr); // lfsr for tag computation
        SET_DOMAIN(tmp + BLOCKBYTES, 0x05); // domain for tag computation
        for(i = 0; i < mlen; i++) // sum for tag computation
            c[mlen + i] ^= m[i]; // sum for tag computation
        c[mlen + i] ^= 0x80; // 10* padding of the tag sum
        precompute_rtk1(tk.rtk1, tmp, tmp + BLOCKBYTES);
        // keystream for the partial block + tag encryption in parallel
        skinny128_384_encrypt(auth, c + mlen, auth, c + mlen, tk);
        for(i = 0; i < mlen; i++)
            c[i] = auth[i] ^ m[i]; // encrypted padded block
        c += mlen;
    }
    if (mlen == 0) { // if tag has *NOT* been calculated yet
        LE_STR_64(tag, lfsr); // lfsr for tag computation
        if((adlen % 32) == 0 || (adlen % 32) > BLOCKBYTES) { //if all AD can be processed in //
            precompute_rtk1(tk.rtk1, tag, tag);
            skinny128_384_encrypt(c, c, c, c, tk); // compute the tag
        }
        // otherwise skinny_aead_m1_auth pairs the tag with the last AD block
    }
    // ----------------- Process the plaintext -----------------
    // ----------------- Process the associated data -----------------
    skinny_aead_m1_auth(auth, c, tag, &tk, mlen, ad, adlen);
    xor_block(c, auth); // final tag = E(tag sum) ^ auth
    // ----------------- Process the associated data -----------------
    return 0;
}
/******************************************************************************
* Decryption and authentication using SKINNY-AEAD-M1
******************************************************************************/
/* SKINNY-AEAD-M1 decryption + tag verification (NIST LWC API).
 * c holds ciphertext || 16-byte tag; on success the plaintext is written
 * to m and *mlen = clen - TAGBYTES. Returns 0 when the tag verifies and
 * -1 otherwise. Fix: the previous version returned the raw byte-OR of the
 * tag difference (any value in 1..255 on failure), so callers comparing
 * the result against -1 would treat a forgery as success; the comparison
 * is still accumulated branch-free over all TAGBYTES bytes.
 * NOTE(review): the plaintext is written into m before the tag check, so
 * callers must discard m whenever the return value is non-zero. */
int crypto_aead_decrypt (unsigned char *m, unsigned long long *mlen,
                    unsigned char *nsec,
                    const unsigned char *c, unsigned long long clen,
                    const unsigned char *ad, unsigned long long adlen,
                    const unsigned char *npub,
                    const unsigned char *k) {
    u64 i,lfsr = 1; // 64-bit block counter (Galois LFSR)
    u8 feedback;    // required by UPDATE_LFSR; reused as tag accumulator
    tweakey tk;
    u8 tmp[2*BLOCKBYTES];
    u8 sum[BLOCKBYTES], tag[BLOCKBYTES], auth[BLOCKBYTES];
    (void)nsec;
    if (clen < TAGBYTES) // ciphertext must at least contain the tag
        return -1;
    // ----------------- Initialization -----------------
    clen -= TAGBYTES;
    *mlen = clen;
    precompute_rtk2_3(tk.rtk2_3, npub, k, SKINNY128_384_ROUNDS);
    memset(tmp, 0x00, 2*BLOCKBYTES);
    memset(tag, 0x00, BLOCKBYTES);
    memset(auth, 0x00, BLOCKBYTES);
    memset(sum, 0x00, BLOCKBYTES); // XOR-sum of plaintext blocks
    // ----------------- Initialization -----------------
    // ----------------- Process the ciphertext -----------------
    while (clen >= 2*BLOCKBYTES) { // process 2 blocks in //
        LE_STR_64(tmp, lfsr); // lfsr for 1st block
        UPDATE_LFSR(lfsr);
        LE_STR_64(tmp + BLOCKBYTES, lfsr); // lfsr for 2nd block
        precompute_rtk1(tk.rtk1, tmp, tmp + BLOCKBYTES);
        skinny128_384_decrypt(m, m + BLOCKBYTES, c, c + BLOCKBYTES, tk);
        xor_block(sum, m); // sum for tag computation
        xor_block(sum, m + BLOCKBYTES); // sum for tag computation
        clen -= 2*BLOCKBYTES;
        c += 2*BLOCKBYTES;
        m += 2*BLOCKBYTES;
        UPDATE_LFSR(lfsr);
    }
    SET_DOMAIN(tag, 0x04); // domain for tag computation
    if (clen > BLOCKBYTES) { // pad and process 2 blocs in //
        LE_STR_64(tmp, lfsr); // lfsr for 1st block
        precompute_rtk1(tk.rtk1, tmp, tmp);
        skinny128_384_decrypt(m, m, c, c, tk);
        xor_block(sum, m);
        UPDATE_LFSR(lfsr);
        LE_STR_64(tmp, lfsr); // lfsr for 2nd block
        SET_DOMAIN(tmp, 0x01); // domain for padding m
        precompute_rtk1(tk.rtk1, tmp, tmp);
        // keystream block to undo the partial-block encryption
        skinny128_384_encrypt(auth, auth, auth, auth, tk);
        for(i = 0; i < clen - BLOCKBYTES; i++) {
            m[BLOCKBYTES + i] = auth[i] ^ c[BLOCKBYTES + i];
            sum[i] ^= m[BLOCKBYTES + i];
        }
        sum[i] ^= 0x80; // 10* padding of the tag sum
        SET_DOMAIN(tag, 0x05); // domain for tag computation
        m += clen;
        c += clen;
        clen = 0;
        UPDATE_LFSR(lfsr);
    } else if (clen == BLOCKBYTES) { // last block is full
        LE_STR_64(tmp, lfsr); // lfsr for last full block
        precompute_rtk1(tk.rtk1, tmp, tmp);
        skinny128_384_decrypt(m, m, c, c, tk);
        xor_block(sum, m); // sum for tag computation
        SET_DOMAIN(tag, 0x04); // domain for tag computation
        UPDATE_LFSR(lfsr);
        c += BLOCKBYTES;
        clen = 0;
    } else if (clen > 0) { // last block is partial
        LE_STR_64(tmp, lfsr); // lfsr for last block
        SET_DOMAIN(tmp, 0x01); // domain for padding
        precompute_rtk1(tk.rtk1, tmp, tmp);
        // keystream block to undo the partial-block encryption
        skinny128_384_encrypt(auth, auth, auth, auth, tk);
        for(i = 0; i < clen; i++) {
            m[i] = auth[i] ^ c[i]; // decrypted padded block
            sum[i] ^= m[i]; // sum for tag computation
        }
        sum[i] ^= 0x80; // 10* padding of the tag sum
        SET_DOMAIN(tag, 0x05); // domain for tag computation
        UPDATE_LFSR(lfsr);
        m += clen;
        c += clen;
        clen = 0;
    }
    if (clen == 0) { // if tag has *NOT* been calculated yet
        LE_STR_64(tag, lfsr); // lfsr for tag computation
        if((adlen % 32) == 0 || (adlen % 32) > BLOCKBYTES) {
            precompute_rtk1(tk.rtk1, tag, tag); //if AD can be processed in //
            skinny128_384_encrypt(sum, sum, sum, sum, tk); // compute the tag
        }
    }
    // ----------------- Process the associated data -----------------
    skinny_aead_m1_auth(auth, sum, tag, &tk, clen, ad, adlen);
    xor_block(sum, auth); // recomputed tag
    feedback = 0;
    for(i = 0; i < TAGBYTES; i++)
        feedback |= sum[i] ^ c[i]; // constant-time tag comparison
    return feedback ? -1 : 0; // canonical NIST LWC return codes
    // ----------------- Process the associated data -----------------
}
\ No newline at end of file
/******************************************************************************
* Fixsliced implementation of SKINNY-128-384.
* Two blocks are processed in parallel.
*
* This implementation doesn't compute the ShiftRows operation. Some masks and
* shifts are applied during the MixColumns operation so that the proper bits
* are XORed together. Moreover, the row permutation within the MixColumns
* is omitted, as well as the bit permutation at the end of the Sbox. The rows
 * are synchronized with the classical representation after only 4 rounds.
 * However, the Sbox
* permutation requires 8 rounds for a synchronization. To limit the impact
* on code size, we compute the permutation every 4 rounds. Therefore, this
* implementation relies on a "QUADRUPLE_ROUND" routine.
*
 * For more details, see the author's fixslicing paper (IACR ePrint archive).
*
* @author Alexandre Adomnicai, Nanyang Technological University,
* alexandre.adomnicai@ntu.edu.sg
*
* @date May 2020
******************************************************************************/
#include <stdio.h>
#include <string.h>
#include "skinny128.h"
#include "tk_schedule.h"
/****************************************************************************
* The MixColumns operation for rounds i such that (i % 4) == 0.
****************************************************************************/
void mixcolumns_0(u32* state) {
    /* Three masked-rotate XOR steps per slice word; rotations are written
     * out explicitly as (x >> r) | (x << (32 - r)). */
    for (int i = 0; i < 8; i++) {
        u32 s = state[i];
        u32 t = ((s >> 24) | (s << 8)) & 0x0c0c0c0c;
        s ^= (t >> 30) | (t << 2);
        t = ((s >> 16) | (s << 16)) & 0xc0c0c0c0;
        s ^= (t >> 4) | (t << 28);
        t = ((s >> 8) | (s << 24)) & 0x0c0c0c0c;
        s ^= (t >> 2) | (t << 30);
        state[i] = s;
    }
}
/****************************************************************************
* The MixColumns operation for rounds i such that (i % 4) == 1.
****************************************************************************/
void mixcolumns_1(u32* state) {
    /* Variant of the masked-rotate MixColumns for round offset 1. */
    for (int i = 0; i < 8; i++) {
        u32 s = state[i];
        u32 t = ((s >> 16) | (s << 16)) & 0x30303030;
        s ^= (t >> 30) | (t << 2);
        t = s & 0x03030303;
        s ^= (t >> 28) | (t << 4);
        t = ((s >> 16) | (s << 16)) & 0x30303030;
        s ^= (t >> 2) | (t << 30);
        state[i] = s;
    }
}
/****************************************************************************
* The MixColumns operation for rounds i such that (i % 4) == 2.
****************************************************************************/
void mixcolumns_2(u32* state) {
	u32 t;
	for(int i = 0; i < 8; i++) {
		u32 w = state[i];
		t = ROR(w,8) & 0xc0c0c0c0;
		w ^= ROR(t,6);
		t = ROR(w,16) & 0x0c0c0c0c;
		w ^= ROR(t,28);
		t = ROR(w,24) & 0xc0c0c0c0;
		w ^= ROR(t,2);
		state[i] = w;
	}
}
/****************************************************************************
* The MixColumns operation for rounds i such that (i % 4) == 3.
****************************************************************************/
void mixcolumns_3(u32* state) {
	u32 t;
	for(int i = 0; i < 8; i++) {
		u32 w = state[i];
		t = w & 0x03030303;
		w ^= ROR(t,30);
		t = w & 0x30303030;
		w ^= ROR(t,4);
		t = w & 0x03030303;
		w ^= ROR(t,26);
		state[i] = w;
	}
}
/****************************************************************************
* The inverse MixColumns operation for rounds i such that (i % 4) == 0.
****************************************************************************/
void inv_mixcolumns_0(u32* state) {
	u32 t;
	for(int i = 0; i < 8; i++) {
		u32 w = state[i];
		t = ROR(w,8) & 0x0c0c0c0c;
		w ^= ROR(t,2);
		t = ROR(w,16) & 0xc0c0c0c0;
		w ^= ROR(t,4);
		t = ROR(w,24) & 0x0c0c0c0c;
		w ^= ROR(t,30);
		state[i] = w;
	}
}
/****************************************************************************
* The inverse MixColumns operation for rounds i such that (i % 4) == 1.
****************************************************************************/
void inv_mixcolumns_1(u32* state) {
	u32 t;
	for(int i = 0; i < 8; i++) {
		u32 w = state[i];
		t = ROR(w,16) & 0x30303030;
		w ^= ROR(t,2);
		t = w & 0x03030303;
		w ^= ROR(t,28);
		t = ROR(w,16) & 0x30303030;
		w ^= ROR(t,30);
		state[i] = w;
	}
}
/****************************************************************************
* The inverse MixColumns operation for rounds i such that (i % 4) == 2.
****************************************************************************/
void inv_mixcolumns_2(u32* state) {
	u32 t;
	for(int i = 0; i < 8; i++) {
		u32 w = state[i];
		t = ROR(w,24) & 0xc0c0c0c0;
		w ^= ROR(t,2);
		t = ROR(w,16) & 0x0c0c0c0c;
		w ^= ROR(t,28);
		t = ROR(w,8) & 0xc0c0c0c0;
		w ^= ROR(t,6);
		state[i] = w;
	}
}
/****************************************************************************
* The inverse MixColumns operation for rounds i such that (i % 4) == 3.
****************************************************************************/
void inv_mixcolumns_3(u32* state) {
	u32 t;
	for(int i = 0; i < 8; i++) {
		u32 w = state[i];
		t = w & 0x03030303;
		w ^= ROR(t,26);
		t = w & 0x30303030;
		w ^= ROR(t,4);
		t = w & 0x03030303;
		w ^= ROR(t,30);
		state[i] = w;
	}
}
/****************************************************************************
* Adds the round tweakey to the state. The round constants are already
* folded into 'rtk2_3' by the tweakey schedule, so a single pass of XORs
* over the 8 bitsliced state words covers key material and constants.
****************************************************************************/
void add_tweakey(u32* state, const u32* rtk1, const u32* rtk2_3) {
	for(int i = 0; i < 8; i++)
		state[i] ^= rtk1[i] ^ rtk2_3[i];
}
/****************************************************************************
* Encryption of 2 blocks in parallel using SKINNY-128-384 (56 rounds).
* The round tweakeys 'rtk1' and 'rtk2_3' are kept separate to avoid
* unnecessary recomputations of the entire tk schedule during
* SKINNY-AEAD-M1.
****************************************************************************/
void skinny128_384_encrypt(u8* ctext, u8* ctext_bis, const u8* ptext,
			const u8* ptext_bis, const tweakey tk) {
	u32 state[8];
	packing(state, ptext, ptext_bis);
	// 56 rounds = 14 quadruple rounds. rtk2_3 advances linearly (8 words
	// per round); rtk1 is only expanded for 16 rounds, so its offset
	// cycles through 0, 32, 64, 96.
	for(int i = 0; i < 14; i++)
		QUADRUPLE_ROUND(state, tk.rtk1 + 32*(i & 3), tk.rtk2_3 + 32*i);
	unpacking(ctext, ctext_bis, state);
}
/****************************************************************************
* Decryption of 2 blocks in parallel using SKINNY-128-384 (56 rounds).
* The round tweakeys 'rtk1' and 'rtk2_3' are kept separate to avoid
* unnecessary recomputations of the entire tk schedule during
* SKINNY-AEAD-M1.
****************************************************************************/
void skinny128_384_decrypt(u8* ptext, u8* ptext_bis, const u8* ctext,
			const u8* ctext_bis, const tweakey tk) {
	u32 state[8];
	packing(state, ctext, ctext_bis);
	// Exact mirror of the encryption loop: the 14 quadruple rounds are
	// undone from the last one (i = 13) back to the first (i = 0).
	for(int i = 13; i >= 0; i--)
		INV_QUADRUPLE_ROUND(state, tk.rtk1 + 32*(i & 3), tk.rtk2_3 + 32*i);
	unpacking(ptext, ptext_bis, state);
}
\ No newline at end of file
#ifndef SKINNY128_H_
#define SKINNY128_H_
#include "tk_schedule.h"
//Encrypts 2 blocks in parallel: (ptext, ptext_bis) -> (ctext, ctext_bis).
void skinny128_384_encrypt(u8* ctext, u8* ctext_bis, const u8* ptext,
			const u8* ptext_bis, const tweakey tk);
//Decrypts 2 blocks in parallel: (ctext, ctext_bis) -> (ptext, ptext_bis).
//Parameter names now match the definition: the first two (output)
//parameters are the recovered plaintexts, the ciphertexts are inputs.
void skinny128_384_decrypt(u8* ptext, u8* ptext_bis, const u8* ctext,
			const u8* ctext_bis, const tweakey tk);
#define SKINNY128_128_ROUNDS 40
#define SKINNY128_256_ROUNDS 48
#define SKINNY128_384_ROUNDS 56
#define ROR(x,y) (((x) >> (y)) | ((x) << (32 - (y))))
/*
 * Four consecutive SKINNY-128 rounds on the bitsliced state.
 * Each round consists of: the S-box layer as a fixed sequence of
 * AND/OR/XOR sheet operations, AddRoundTweakey (round constants are
 * already folded into rtk2_3 by the tweakey schedule), and the fixsliced
 * MixColumns variant matching the round index mod 4. The statement order
 * within each S-box group is significant and must not be changed.
 * The 12 trailing XOR-swaps exchange word pairs (0,1), (2,3), (4,7) and
 * (5,6); per the file header, this realigns the bitsliced S-box
 * representation with the classical one every 4 rounds.
 */
#define QUADRUPLE_ROUND(state, rtk1, rtk2_3) ({ \
	state[3] ^= (state[0] | state[1]); \
	state[7] ^= (state[4] | state[5]); \
	state[1] ^= (state[6] | state[5]); \
	state[2] ^= (state[3] & state[7]); \
	state[6] ^= (~state[7] | state[4]); \
	state[0] ^= (state[2] | ~state[1]); \
	state[4] ^= (~state[3] | state[2]); \
	state[5] ^= (state[6] & state[0]); \
	add_tweakey(state, rtk1, rtk2_3); \
	mixcolumns_0(state); \
	state[4] ^= (state[2] | state[3]); \
	state[5] ^= (state[6] | state[1]); \
	state[3] ^= (state[0] | state[1]); \
	state[7] ^= (state[4] & state[5]); \
	state[0] ^= (~state[5] | state[6]); \
	state[2] ^= (state[7] | ~state[3]); \
	state[6] ^= (~state[4] | state[7]); \
	state[1] ^= (state[0] & state[2]); \
	add_tweakey(state, rtk1+8, rtk2_3+8); \
	mixcolumns_1(state); \
	state[6] ^= (state[7] | state[4]); \
	state[1] ^= (state[0] | state[3]); \
	state[4] ^= (state[2] | state[3]); \
	state[5] ^= (state[6] & state[1]); \
	state[2] ^= (~state[1] | state[0]); \
	state[7] ^= (state[5] | ~state[4]); \
	state[0] ^= (~state[6] | state[5]); \
	state[3] ^= (state[2] & state[7]); \
	add_tweakey(state, rtk1+16, rtk2_3+16); \
	mixcolumns_2(state); \
	state[0] ^= (state[5] | state[6]); \
	state[3] ^= (state[2] | state[4]); \
	state[6] ^= (state[7] | state[4]); \
	state[1] ^= (state[0] & state[3]); \
	state[7] ^= (~state[3] | state[2]); \
	state[5] ^= (state[1] | ~state[6]); \
	state[2] ^= (~state[0] | state[1]); \
	state[4] ^= (state[7] & state[5]); \
	add_tweakey(state, rtk1+24, rtk2_3+24); \
	mixcolumns_3(state); \
	state[0] ^= state[1]; \
	state[1] ^= state[0]; \
	state[0] ^= state[1]; \
	state[2] ^= state[3]; \
	state[3] ^= state[2]; \
	state[2] ^= state[3]; \
	state[4] ^= state[7]; \
	state[7] ^= state[4]; \
	state[4] ^= state[7]; \
	state[5] ^= state[6]; \
	state[6] ^= state[5]; \
	state[5] ^= state[6]; \
})
/*
 * Inverse of QUADRUPLE_ROUND: the word-pair swaps are undone first, then
 * each of the 4 rounds is inverted as inverse MixColumns, AddRoundTweakey
 * (XOR is self-inverse) and the inverse S-box layer, with every statement
 * in exactly the reverse order of the forward macro.
 */
#define INV_QUADRUPLE_ROUND(state, rtk1, rtk2_3) ({ \
	state[0] ^= state[1]; \
	state[1] ^= state[0]; \
	state[0] ^= state[1]; \
	state[2] ^= state[3]; \
	state[3] ^= state[2]; \
	state[2] ^= state[3]; \
	state[4] ^= state[7]; \
	state[7] ^= state[4]; \
	state[4] ^= state[7]; \
	state[5] ^= state[6]; \
	state[6] ^= state[5]; \
	state[5] ^= state[6]; \
	inv_mixcolumns_3(state); \
	add_tweakey(state, rtk1+24, rtk2_3+24); \
	state[4] ^= (state[7] & state[5]); \
	state[2] ^= (~state[0] | state[1]); \
	state[5] ^= (state[1] | ~state[6]); \
	state[7] ^= (~state[3] | state[2]); \
	state[1] ^= (state[0] & state[3]); \
	state[6] ^= (state[7] | state[4]); \
	state[3] ^= (state[2] | state[4]); \
	state[0] ^= (state[5] | state[6]); \
	inv_mixcolumns_2(state); \
	add_tweakey(state, rtk1+16, rtk2_3+16); \
	state[3] ^= (state[2] & state[7]); \
	state[0] ^= (~state[6] | state[5]); \
	state[7] ^= (state[5] | ~state[4]); \
	state[2] ^= (~state[1] | state[0]); \
	state[5] ^= (state[6] & state[1]); \
	state[4] ^= (state[2] | state[3]); \
	state[1] ^= (state[0] | state[3]); \
	state[6] ^= (state[7] | state[4]); \
	inv_mixcolumns_1(state); \
	add_tweakey(state, rtk1+8, rtk2_3+8); \
	state[1] ^= (state[0] & state[2]); \
	state[6] ^= (~state[4] | state[7]); \
	state[2] ^= (state[7] | ~state[3]); \
	state[0] ^= (~state[5] | state[6]); \
	state[7] ^= (state[4] & state[5]); \
	state[3] ^= (state[0] | state[1]); \
	state[5] ^= (state[6] | state[1]); \
	state[4] ^= (state[2] | state[3]); \
	inv_mixcolumns_0(state); \
	add_tweakey(state, rtk1, rtk2_3); \
	state[5] ^= (state[6] & state[0]); \
	state[4] ^= (~state[3] | state[2]); \
	state[0] ^= (state[2] | ~state[1]); \
	state[6] ^= (~state[7] | state[4]); \
	state[2] ^= (state[3] & state[7]); \
	state[1] ^= (state[6] | state[5]); \
	state[7] ^= (state[4] | state[5]); \
	state[3] ^= (state[0] | state[1]); \
})
#endif // SKINNY128_H_
\ No newline at end of file
#ifndef SKINNYAEADM1_H_
#define SKINNYAEADM1_H_
#include "skinny128.h"
typedef unsigned char u8;
typedef unsigned int u32;
typedef unsigned long long u64;
#define TAGBYTES 16
#define KEYBYTES 16
#define BLOCKBYTES 16
// Writes the domain-separation value into the last byte of block 'ptr'.
#define SET_DOMAIN(ptr, domain) ((ptr)[15] = (domain))
// Advances the 64-bit LFSR block counter by one step (feedback constant
// 0x1B).
// NOTE(review): relies on a variable 'feedback' declared in the calling
// scope -- callers must provide it.
#define UPDATE_LFSR(lfsr) ({ \
	feedback = ((lfsr) & (1ULL << 63)) ? 0x1B : 0x00; \
	(lfsr) = ((lfsr) << 1) ^ feedback; \
})
// Serializes the 64-bit value 'x' into 8 bytes at 'ptr', little-endian.
#define LE_STR_64(ptr, x) ({ \
	(ptr)[0] = (u8)(x); \
	(ptr)[1] = (u8)((x) >> 8); \
	(ptr)[2] = (u8)((x) >> 16); \
	(ptr)[3] = (u8)((x) >> 24); \
	(ptr)[4] = (u8)((x) >> 32); \
	(ptr)[5] = (u8)((x) >> 40); \
	(ptr)[6] = (u8)((x) >> 48); \
	(ptr)[7] = (u8)((x) >> 56); \
})
#endif // SKINNYAEADM1_H_
\ No newline at end of file
/*******************************************************************************
* Implementation of the tweakey schedule according to the fixsliced
* representation.
*
* For more details, see the paper at: https://
*
* @author Alexandre Adomnicai, Nanyang Technological University,
* alexandre.adomnicai@ntu.edu.sg
*
* @date May 2020
*******************************************************************************/
#include <stdio.h>
#include <string.h>
#include "tk_schedule.h"
typedef unsigned char u8;
typedef unsigned int u32;
/****************************************************************************
* The round constants according to the fixsliced representation:
* 448 words = 56 rounds x 8 bitsliced state words, XORed into the round
* tweakey material by precompute_rtk2_3.
* NOTE(review): this table is read-only; consider declaring it 'const'
* (check for extern declarations in other translation units first).
****************************************************************************/
u32 rconst_32_bs[448] = {
	0xfffffff3, 0xffffffff, 0x00000000, 0xffffffff,
	0xffffffff, 0x000000c0, 0xffffffff, 0xffffffff,
	0xffffffff, 0x00000300, 0xcffffcff, 0xffffffff,
	0xffffffff, 0xffffffff, 0xffffffff, 0x00000000,
	0xffffffff, 0xffffffff, 0xffffffff, 0x0c000000,
	0xf3ffffff, 0x00000000, 0xffffffff, 0x33ffffff,
	0xffffffff, 0x00000000, 0xffffffff, 0xffffffff,
	0x00300000, 0xffcffffc, 0xffcfffff, 0xffcfffff,
	0xff33ffff, 0xff3fffff, 0x00000000, 0xffffffff,
	0xffffffff, 0x00f00000, 0xff3fffff, 0xffffffff,
	0xfcffffff, 0x00c00000, 0xfc3fcfff, 0xfcffffff,
	0xffffffff, 0xffffffff, 0xffffffff, 0x00000000,
	0xffffffff, 0xffffffff, 0xfffff3ff, 0x03000c00,
	0xfffff3ff, 0x00000000, 0xffffffff, 0xfcff3fff,
	0xffffffff, 0x00000000, 0xffffffff, 0xffffffff,
	0x00000c30, 0xfffcf3cf, 0xffffffff, 0xffffffcf,
	0xffffff03, 0xffffff3f, 0x00000000, 0xffffffff,
	0xffffffff, 0x000000f0, 0xffffffff, 0xffffffff,
	0xfffffcff, 0x00000300, 0xcffffc3f, 0xfffffcff,
	0xffffffff, 0xffffffff, 0xffffffff, 0x00000000,
	0xffffffff, 0xffffffff, 0xf3ffffff, 0x00000300,
	0xf3ffffff, 0x00000000, 0xffffffff, 0x33ffffff,
	0xffffffff, 0x00000000, 0xffffffff, 0xffffffff,
	0x0c000000, 0xf3fffffc, 0xffcfffff, 0xffcfffff,
	0xffc3ffff, 0xffffffff, 0x00000000, 0xffffffff,
	0xffffffff, 0x00f00000, 0xff3fffff, 0xffffffff,
	0xffffffff, 0x03c00000, 0xfc3fcfff, 0xffffffff,
	0xffffffff, 0xffffffff, 0xffffffff, 0x00000000,
	0xffffffff, 0xffffffff, 0xffffffff, 0x00000c00,
	0xfffff3ff, 0x00000000, 0xffffffff, 0xfcff33ff,
	0xffffffff, 0x00000000, 0xffffffff, 0xffffffff,
	0x00000000, 0xfffcffcf, 0xffffffcf, 0xffffffcf,
	0xfffffff3, 0xffffff3f, 0x00000000, 0xffffffff,
	0xffffffff, 0x000000f0, 0xffffff3f, 0xffffffff,
	0xfffffcff, 0x000000c0, 0xcffffc3f, 0xffffffff,
	0xffffffff, 0xffffffff, 0xffffffff, 0x00000000,
	0xffffffff, 0xffffffff, 0xffffffff, 0x0c000300,
	0xf3ffffff, 0x00000000, 0xffffffff, 0x3ffffcff,
	0xffffffff, 0x00000000, 0xffffffff, 0xffffffff,
	0x00300000, 0xf3cffffc, 0xffffffff, 0xffcfffff,
	0xff33ffff, 0xff3fffff, 0x00000000, 0xffffffff,
	0xffffffff, 0x00300000, 0xffffffff, 0xffffffff,
	0xfcffffff, 0x00000000, 0xff3fcfff, 0xfcffffff,
	0xffffffff, 0xffffffff, 0xffffffff, 0x00000000,
	0xffffffff, 0xffffffff, 0xfffff3ff, 0x03000000,
	0xffffffff, 0x00000000, 0xffffffff, 0xffff3fff,
	0xffffffff, 0x00000000, 0xffffffff, 0xffffffff,
	0x00000c00, 0xfffcf3ff, 0xffffffff, 0xffffffff,
	0xffffffc3, 0xffffffff, 0x00000000, 0xffffffff,
	0xffffffff, 0x000000c0, 0xffffffff, 0xffffffff,
	0xffffffff, 0x00000000, 0xcffffcff, 0xffffffff,
	0xffffffff, 0xffffffff, 0xffffffff, 0x00000000,
	0xffffffff, 0xffffffff, 0xffffffff, 0x0c000000,
	0xf3ffffff, 0x00000000, 0xffffffff, 0x3fffffff,
	0xffffffff, 0x00000000, 0xffffffff, 0xffffffff,
	0x00300000, 0xffcffffc, 0xffffffff, 0xffcfffff,
	0xff33ffff, 0xff3fffff, 0x00000000, 0xffffffff,
	0xffffffff, 0x00f00000, 0xffffffff, 0xffffffff,
	0xfcffffff, 0x00000000, 0xfc3fcfff, 0xfcffffff,
	0xffffffff, 0xffffffff, 0xffffffff, 0x00000000,
	0xffffffff, 0xffffffff, 0xfffff3ff, 0x03000000,
	0xfffff3ff, 0x00000000, 0xffffffff, 0xffff3fff,
	0xffffffff, 0x00000000, 0xffffffff, 0xffffffff,
	0x00000c00, 0xfffcf3ff, 0xffffffff, 0xffffffcf,
	0xffffffc3, 0xffffffff, 0x00000000, 0xffffffff,
	0xffffffff, 0x000000f0, 0xffffffff, 0xffffffff,
	0xffffffff, 0x00000300, 0xcffffc3f, 0xffffffff,
	0xffffffff, 0xffffffff, 0xffffffff, 0x00000000,
	0xffffffff, 0xffffffff, 0xffffffff, 0x00000000,
	0xf3ffffff, 0x00000000, 0xffffffff, 0x33ffffff,
	0xffffffff, 0x00000000, 0xffffffff, 0xffffffff,
	0x00300000, 0xfffffffc, 0xffcfffff, 0xffcfffff,
	0xff33ffff, 0xffffffff, 0x00000000, 0xffffffff,
	0xffffffff, 0x00f00000, 0xff3fffff, 0xffffffff,
	0xffffffff, 0x00c00000, 0xfc3fcfff, 0xfcffffff,
	0xffffffff, 0xffffffff, 0xffffffff, 0x00000000,
	0xffffffff, 0xffffffff, 0xfffff3ff, 0x00000c00,
	0xfffff3ff, 0x00000000, 0xffffffff, 0xfcff3fff,
	0xffffffff, 0x00000000, 0xffffffff, 0xffffffff,
	0x00000c00, 0xfffcffcf, 0xffffffff, 0xffffffcf,
	0xffffffc3, 0xffffff3f, 0x00000000, 0xffffffff,
	0xffffffff, 0x00000030, 0xffffffff, 0xffffffff,
	0xfffffcff, 0x00000300, 0xcfffff3f, 0xffffffff,
	0xffffffff, 0xffffffff, 0xffffffff, 0x00000000,
	0xffffffff, 0xffffffff, 0xffffffff, 0x00000300,
	0xffffffff, 0x00000000, 0xffffffff, 0x33ffffff,
	0xffffffff, 0x00000000, 0xffffffff, 0xffffffff,
	0x00000000, 0xf3fffffc, 0xffcfffff, 0xffffffff,
	0xfff3ffff, 0xffffffff, 0x00000000, 0xffffffff,
	0xffffffff, 0x00000000, 0xff3fffff, 0xffffffff,
	0xffffffff, 0x03c00000, 0xffffcfff, 0xffffffff,
	0xffffffff, 0xffffffff, 0xffffffff, 0x00000000,
	0xffffffff, 0xffffffff, 0xffffffff, 0x00000000,
	0xffffffff, 0x00000000, 0xffffffff, 0xfcff33ff,
	0xffffffff, 0x00000000, 0xffffffff, 0xffffffff,
	0x00000000, 0xfffcffff, 0xffffffcf, 0xffffffff,
	0xfffffff3, 0xffffffff, 0x00000000, 0xffffffff,
	0xffffffff, 0x000000c0, 0xffffff3f, 0xffffffff,
	0xffffffff, 0x000003c0, 0xcffffcff, 0xffffffff,
	0xffffffff, 0xffffffff, 0xffffffff, 0x00000000,
	0xffffffff, 0xffffffff, 0xffffffff, 0x00000000,
	0xf3ffffff, 0x00000000, 0xffffffff, 0x33fffcff,
	0xffffffff, 0x00000000, 0xffffffff, 0xffffffff,
	0x00000000, 0xfffffffc, 0xffcfffff, 0xffcfffff,
	0xfff3ffff, 0xffffffff, 0x00000000, 0xffffffff,
	0xffffffff, 0x00f00000, 0xff3fffff, 0xffffffff,
	0xffffffff, 0x00c00000, 0xfc3fcfff, 0xffffffff,
	0xffffffff, 0xffffffff, 0xffffffff, 0x00000000,
	0xffffffff, 0xffffffff, 0xffffffff, 0x00000c00,
	0xfffff3ff, 0x00000000, 0xffffffff, 0xfcff3fff,
	0xffffffff, 0x00000000, 0xffffffff, 0xffffffff,
	0x00000000, 0xfffcffcf, 0xffffffff, 0xffffffcf
};
/****************************************************************************
* Packs 2 input blocks B, B' into the state using a bitsliced representation.
* Once the packing process is complete, the 256-bit state consists of 8
* 32-bit words and the input blocks bit positioning is as follows:
*
* 24 24' 56 56' 88 88' 120 120' | ... | 0 0' 32 32' 64 64' 96 96'
* 25 25' 57 57' 89 89' 121 121' | ... | 1 1' 33 33' 65 65' 97 97'
* 26 26' 58 58' 90 90' 122 122' | ... | 2 2' 34 34' 66 66' 98 98'
* 27 27' 59 59' 91 91' 123 123' | ... | 3 3' 35 35' 67 67' 99 99'
* 28 28' 60 60' 92 92' 124 124' | ... | 4 4' 36 36' 68 68' 100 100'
* 29 29' 61 61' 93 93' 125 125' | ... | 5 5' 37 37' 69 69' 101 101'
* 30 30' 62 62' 94 94' 126 126' | ... | 6 6' 38 38' 70 70' 102 102'
* 31 31' 63 63' 95 95' 127 127' | ... | 7 7' 39 39' 71 71' 103 103'
****************************************************************************/
void packing(u32* out, const u8* block0, const u8* block1) {
	u32 tmp; // scratch word required by the SWAPMOVE macro
	// Load both blocks, interleaving their little-endian 32-bit words
	for(int i = 0; i < 4; i++) {
		LE_LOAD(out + 2*i, block0 + 4*i);
		LE_LOAD(out + 2*i + 1, block1 + 4*i);
	}
	// Interleave the bits of the two blocks within each word pair
	for(int i = 0; i < 8; i += 2)
		SWAPMOVE(out[i+1], out[i], 0x55555555, 1);
	// Redistribute 2-bit groups across the words; the order of these
	// SWAPMOVE steps is significant
	SWAPMOVE(out[2], out[0], 0x30303030, 2);
	SWAPMOVE(out[4], out[0], 0x0c0c0c0c, 4);
	SWAPMOVE(out[6], out[0], 0x03030303, 6);
	SWAPMOVE(out[3], out[1], 0x30303030, 2);
	SWAPMOVE(out[5], out[1], 0x0c0c0c0c, 4);
	SWAPMOVE(out[7], out[1], 0x03030303, 6);
	SWAPMOVE(out[4], out[2], 0x0c0c0c0c, 2);
	SWAPMOVE(out[6], out[2], 0x03030303, 4);
	SWAPMOVE(out[5], out[3], 0x0c0c0c0c, 2);
	SWAPMOVE(out[7], out[3], 0x03030303, 4);
	SWAPMOVE(out[6], out[4], 0x03030303, 2);
	SWAPMOVE(out[7], out[5], 0x03030303, 2);
}
/****************************************************************************
* Unpacks the 256-bit bitsliced state into two 16-byte output blocks,
* inverting the bit positioning applied by 'packing'. The bytes within
* each output block are in natural order 0..15.
****************************************************************************/
void unpacking(u8* out, u8* out_bis, u32 *in) {
	u32 tmp; // scratch word required by the SWAPMOVE macro
	// Undo the 2-bit group redistribution (exact reverse of 'packing')
	SWAPMOVE(in[6], in[4], 0x03030303, 2);
	SWAPMOVE(in[7], in[5], 0x03030303, 2);
	SWAPMOVE(in[5], in[3], 0x0c0c0c0c, 2);
	SWAPMOVE(in[7], in[3], 0x03030303, 4);
	SWAPMOVE(in[4], in[2], 0x0c0c0c0c, 2);
	SWAPMOVE(in[6], in[2], 0x03030303, 4);
	SWAPMOVE(in[7], in[1], 0x03030303, 6);
	SWAPMOVE(in[5], in[1], 0x0c0c0c0c, 4);
	SWAPMOVE(in[3], in[1], 0x30303030, 2);
	SWAPMOVE(in[6], in[0], 0x03030303, 6);
	SWAPMOVE(in[4], in[0], 0x0c0c0c0c, 4);
	SWAPMOVE(in[2], in[0], 0x30303030, 2);
	// Undo the bit interleaving of the two blocks
	for(int i = 0; i < 8; i += 2)
		SWAPMOVE(in[i+1], in[i], 0x55555555, 1);
	// Store the de-interleaved words back as little-endian bytes
	for(int i = 0; i < 4; i++) {
		LE_STORE(out + 4*i, in[2*i]);
		LE_STORE(out_bis + 4*i, in[2*i+1]);
	}
}
//Applies the tweakey permutation P twice, in a bitsliced manner
void permute_tk_2(u32* tk) {
	for(int i = 0; i < 8; i++) {
		u32 t = tk[i];
		u32 w = ROR(t,14) & 0xcc00cc00;
		w |= (t & 0x000000ff) << 16;
		w |= (t & 0xcc000000) >> 2;
		w |= (t & 0x0033cc00) >> 8;
		w |= (t & 0x00cc0000) >> 18;
		tk[i] = w;
	}
}
//Applies the tweakey permutation P four times, in a bitsliced manner
void permute_tk_4(u32* tk) {
	for(int i = 0; i < 8; i++) {
		u32 t = tk[i];
		u32 w = ROR(t,22) & 0xcc0000cc;
		w |= ROR(t,16) & 0x3300cc00;
		w |= ROR(t,24) & 0x00cc3300;
		w |= (t & 0x00cc00cc) >> 2;
		tk[i] = w;
	}
}
//Applies the tweakey permutation P six times, in a bitsliced manner
void permute_tk_6(u32* tk) {
	for(int i = 0; i < 8; i++) {
		u32 t = tk[i];
		u32 w = ROR(t,6) & 0xcccc0000;
		w |= ROR(t,24) & 0x330000cc;
		w |= ROR(t,10) & 0x3333;
		w |= (t & 0xcc) << 14;
		w |= (t & 0x3300) << 2;
		tk[i] = w;
	}
}
//Applies the tweakey permutation P eight times, in a bitsliced manner
void permute_tk_8(u32* tk) {
	for(int i = 0; i < 8; i++) {
		u32 t = tk[i];
		u32 w = ROR(t,24) & 0xcc000033;
		w |= ROR(t,8) & 0x33cc0000;
		w |= ROR(t,26) & 0x00333300;
		w |= (t & 0x00333300) >> 6;
		tk[i] = w;
	}
}
//Applies the tweakey permutation P ten times, in a bitsliced manner
void permute_tk_10(u32* tk) {
	for(int i = 0; i < 8; i++) {
		u32 t = tk[i];
		u32 w = ROR(t,8) & 0xcc330000;
		w |= ROR(t,26) & 0x33000033;
		w |= ROR(t,22) & 0x00cccc00;
		w |= (t & 0x00330000) >> 14;
		w |= (t & 0xcc00) >> 2;
		tk[i] = w;
	}
}
//Applies the tweakey permutation P twelve times, in a bitsliced manner
void permute_tk_12(u32* tk) {
	for(int i = 0; i < 8; i++) {
		u32 t = tk[i];
		u32 w = ROR(t,8) & 0xcc33;
		w |= ROR(t,30) & 0x00cc00cc;
		w |= ROR(t,10) & 0x33330000;
		w |= ROR(t,16) & 0xcc003300;
		tk[i] = w;
	}
}
//Applies the tweakey permutation P fourteen times, in a bitsliced manner
void permute_tk_14(u32* tk) {
	for(int i = 0; i < 8; i++) {
		u32 t = tk[i];
		u32 w = ROR(t,24) & 0x0033cc00;
		w |= ROR(t,14) & 0x00cc0000;
		w |= ROR(t,30) & 0xcc000000;
		w |= ROR(t,16) & 0x000000ff;
		w |= ROR(t,18) & 0x33003300;
		tk[i] = w;
	}
}
/****************************************************************************
* Precomputes the bitsliced LFSR2 stream for TK2 into 'tk'. The LFSR is
* stepped once every 2 rounds; the round slots left untouched here are
* filled in later by permute_tk.
****************************************************************************/
void precompute_lfsr_tk2(u32* tk, const u8* tk2_0,
		const u8* tk2_1, const int rounds) {
	u32 tmp; // scratch word required by the LFSR2 macro
	u32 state[8];
	packing(state, tk2_0, tk2_1);
	for(int j = 0; j < 8; j++)
		tk[j] = state[j];
	for(int i = 0; i < rounds; i += 2) {
		LFSR2(state);
		for(int j = 0; j < 8; j++)
			tk[i*8 + 8 + j] = state[j];
	}
}
/****************************************************************************
* Precomputes the bitsliced LFSR3 stream for TK3 and XORs it into 'tk'
* (on top of the TK2 material written by precompute_lfsr_tk2). The LFSR
* is stepped once every 2 rounds, mirroring precompute_lfsr_tk2.
****************************************************************************/
void precompute_lfsr_tk3(u32* tk, const u8* tk3_0,
		const u8* tk3_1, const int rounds) {
	u32 tmp; // scratch word required by the LFSR3 macro
	u32 state[8];
	packing(state, tk3_0, tk3_1);
	for(int j = 0; j < 8; j++)
		tk[j] ^= state[j];
	for(int i = 0; i < rounds; i += 2) {
		LFSR3(state);
		for(int j = 0; j < 8; j++)
			tk[i*8 + 8 + j] ^= state[j];
	}
}
/****************************************************************************
* XORs the packed TK1 into the precomputed tweakey material in 'tk' and
* rearranges everything to match the fixsliced representation.
*
* The loop processes 8 rounds per iteration. Each pair of rounds is read,
* XORed with TK1, permuted by the appropriate power of P (P^2/P^4/P^6/P^8
* during the first half of each 16-round period, P^10/P^12/P^14 during the
* second half -- TK1 itself has period 16), and finally masked/rotated so
* that only rows 1&2 remain, in the bit positions the fixsliced round
* functions expect.
****************************************************************************/
void permute_tk(u32* tk, const u8* tk1_0, const u8* tk1_1, const int rounds) {
	u32 test;	// selects which power of P to apply in this iteration
	u32 tk1[8], tmp[8];
	packing(tk1, tk1_0, tk1_1);
	// Round 0: no permutation yet, just mask rows 1&2
	memcpy(tmp, tk, 32);
	XOR_BLOCK(tmp, tk1);
	tk[0] = tmp[6] & 0xf0f0f0f0; //mask to extract rows 1&2 only
	tk[1] = tmp[5] & 0xf0f0f0f0;
	tk[2] = tmp[0] & 0xf0f0f0f0;
	tk[3] = tmp[1] & 0xf0f0f0f0;
	tk[4] = tmp[3] & 0xf0f0f0f0;
	tk[5] = tmp[7] & 0xf0f0f0f0;
	tk[6] = tmp[4] & 0xf0f0f0f0;
	tk[7] = tmp[2] & 0xf0f0f0f0;
	for(int i = 0 ; i < rounds; i+=8) {
		test = (i % 16 < 8) ? 1 : 0; //to apply the right power of P
		// Rounds i+1 and i+2 (P^2 or P^10)
		memcpy(tmp, tk+i*8+8, 32);
		XOR_BLOCK(tmp, tk1);
		if (test)
			permute_tk_2(tmp); // applies P^2
		else
			permute_tk_10(tmp); // applies P^10
		tk[i*8+8] = ROR(tmp[4],26) & 0xc3c3c3c3; //mask to extract rows 1&2 only
		tk[i*8+9] = ROR(tmp[7],26) & 0xc3c3c3c3; //rotation to match fixslicing
		tk[i*8+10] = ROR(tmp[6],26) & 0xc3c3c3c3;
		tk[i*8+11] = ROR(tmp[5],26) & 0xc3c3c3c3;
		tk[i*8+12] = ROR(tmp[1],26) & 0xc3c3c3c3;
		tk[i*8+13] = ROR(tmp[2],26) & 0xc3c3c3c3;
		tk[i*8+14] = ROR(tmp[3],26) & 0xc3c3c3c3;
		tk[i*8+15] = ROR(tmp[0],26) & 0xc3c3c3c3;
		tk[i*8+16] = ROR(tmp[3],28) & 0x03030303; //mask to extract rows 1&2 only
		tk[i*8+16] |= ROR(tmp[3],12) & 0x0c0c0c0c; //rotation to match fixslicing
		tk[i*8+17] = ROR(tmp[2],28) & 0x03030303;
		tk[i*8+17] |= ROR(tmp[2],12) & 0x0c0c0c0c;
		tk[i*8+18] = ROR(tmp[4],28) & 0x03030303;
		tk[i*8+18] |= ROR(tmp[4],12) & 0x0c0c0c0c;
		tk[i*8+19] = ROR(tmp[7],28) & 0x03030303;
		tk[i*8+19] |= ROR(tmp[7],12) & 0x0c0c0c0c;
		tk[i*8+20] = ROR(tmp[5],28) & 0x03030303;
		tk[i*8+20] |= ROR(tmp[5],12) & 0x0c0c0c0c;
		tk[i*8+21] = ROR(tmp[0],28) & 0x03030303;
		tk[i*8+21] |= ROR(tmp[0],12) & 0x0c0c0c0c;
		tk[i*8+22] = ROR(tmp[1],28) & 0x03030303;
		tk[i*8+22] |= ROR(tmp[1],12) & 0x0c0c0c0c;
		tk[i*8+23] = ROR(tmp[6],28) & 0x03030303;
		tk[i*8+23] |= ROR(tmp[6],12) & 0x0c0c0c0c;
		// Rounds i+3 and i+4 (P^4 or P^12)
		memcpy(tmp, tk+i*8+24, 32);
		XOR_BLOCK(tmp, tk1);
		if (test)
			permute_tk_4(tmp); // applies P^4
		else
			permute_tk_12(tmp); // applies P^12
		tk[i*8+24] = ROR(tmp[1],14) & 0x30303030; //mask to extract rows 1&2 only
		tk[i*8+24] |= ROR(tmp[1],6) & 0x0c0c0c0c; //rotation to match fixslicing
		tk[i*8+25] = ROR(tmp[0],14) & 0x30303030;
		tk[i*8+25] |= ROR(tmp[0],6) & 0x0c0c0c0c;
		tk[i*8+26] = ROR(tmp[3],14) & 0x30303030;
		tk[i*8+26] |= ROR(tmp[3],6) & 0x0c0c0c0c;
		tk[i*8+27] = ROR(tmp[2],14) & 0x30303030;
		tk[i*8+27] |= ROR(tmp[2],6) & 0x0c0c0c0c;
		tk[i*8+28] = ROR(tmp[7],14) & 0x30303030;
		tk[i*8+28] |= ROR(tmp[7],6) & 0x0c0c0c0c;
		tk[i*8+29] = ROR(tmp[6],14) & 0x30303030;
		tk[i*8+29] |= ROR(tmp[6],6) & 0x0c0c0c0c;
		tk[i*8+30] = ROR(tmp[5],14) & 0x30303030;
		tk[i*8+30] |= ROR(tmp[5],6) & 0x0c0c0c0c;
		tk[i*8+31] = ROR(tmp[4],14) & 0x30303030;
		tk[i*8+31] |= ROR(tmp[4],6) & 0x0c0c0c0c;
		tk[i*8+32] = ROR(tmp[6],16) & 0xf0f0f0f0; //mask to extract rows 1&2 only
		tk[i*8+33] = ROR(tmp[5],16) & 0xf0f0f0f0; //rotation to match fixslicing
		tk[i*8+34] = ROR(tmp[0],16) & 0xf0f0f0f0;
		tk[i*8+35] = ROR(tmp[1],16) & 0xf0f0f0f0;
		tk[i*8+36] = ROR(tmp[3],16) & 0xf0f0f0f0;
		tk[i*8+37] = ROR(tmp[7],16) & 0xf0f0f0f0;
		tk[i*8+38] = ROR(tmp[4],16) & 0xf0f0f0f0;
		tk[i*8+39] = ROR(tmp[2],16) & 0xf0f0f0f0;
		// Rounds i+5 and i+6 (P^6 or P^14)
		memcpy(tmp, tk+i*8+40, 32);
		XOR_BLOCK(tmp, tk1);
		if (test)
			permute_tk_6(tmp); // applies P^6
		else
			permute_tk_14(tmp); // applies P^14
		tk[i*8+40] = ROR(tmp[4],10) & 0xc3c3c3c3; //mask to extract rows 1&2 only
		tk[i*8+41] = ROR(tmp[7],10) & 0xc3c3c3c3; //rotation to match fixslicing
		tk[i*8+42] = ROR(tmp[6],10) & 0xc3c3c3c3;
		tk[i*8+43] = ROR(tmp[5],10) & 0xc3c3c3c3;
		tk[i*8+44] = ROR(tmp[1],10) & 0xc3c3c3c3;
		tk[i*8+45] = ROR(tmp[2],10) & 0xc3c3c3c3;
		tk[i*8+46] = ROR(tmp[3],10) & 0xc3c3c3c3;
		tk[i*8+47] = ROR(tmp[0],10) & 0xc3c3c3c3;
		tk[i*8+48] = ROR(tmp[3],12) & 0x03030303; //mask to extract rows 1&2 only
		tk[i*8+48] |= ROR(tmp[3],28) & 0x0c0c0c0c; //rotation to match fixslicing
		tk[i*8+49] = ROR(tmp[2],12) & 0x03030303;
		tk[i*8+49] |= ROR(tmp[2],28) & 0x0c0c0c0c;
		tk[i*8+50] = ROR(tmp[4],12) & 0x03030303;
		tk[i*8+50] |= ROR(tmp[4],28) & 0x0c0c0c0c;
		tk[i*8+51] = ROR(tmp[7],12) & 0x03030303;
		tk[i*8+51] |= ROR(tmp[7],28) & 0x0c0c0c0c;
		tk[i*8+52] = ROR(tmp[5],12) & 0x03030303;
		tk[i*8+52] |= ROR(tmp[5],28) & 0x0c0c0c0c;
		tk[i*8+53] = ROR(tmp[0],12) & 0x03030303;
		tk[i*8+53] |= ROR(tmp[0],28) & 0x0c0c0c0c;
		tk[i*8+54] = ROR(tmp[1],12) & 0x03030303;
		tk[i*8+54] |= ROR(tmp[1],28) & 0x0c0c0c0c;
		tk[i*8+55] = ROR(tmp[6],12) & 0x03030303;
		tk[i*8+55] |= ROR(tmp[6],28) & 0x0c0c0c0c;
		// Rounds i+7 and i+8 (P^8; P^16 is the identity, so nothing to do
		// in the second half of the period)
		memcpy(tmp, tk+i*8+56, 32);
		XOR_BLOCK(tmp, tk1);
		if (test)
			permute_tk_8(tmp); // applies P^8
		tk[i*8+56] = ROR(tmp[1],30) & 0x30303030; //mask to extract rows 1&2 only
		tk[i*8+56] |= ROR(tmp[1],22) & 0x0c0c0c0c; //rotation to match fixslicing
		tk[i*8+57] = ROR(tmp[0],30) & 0x30303030;
		tk[i*8+57] |= ROR(tmp[0],22) & 0x0c0c0c0c;
		tk[i*8+58] = ROR(tmp[3],30) & 0x30303030;
		tk[i*8+58] |= ROR(tmp[3],22) & 0x0c0c0c0c;
		tk[i*8+59] = ROR(tmp[2],30) & 0x30303030;
		tk[i*8+59] |= ROR(tmp[2],22) & 0x0c0c0c0c;
		tk[i*8+60] = ROR(tmp[7],30) & 0x30303030;
		tk[i*8+60] |= ROR(tmp[7],22) & 0x0c0c0c0c;
		tk[i*8+61] = ROR(tmp[6],30) & 0x30303030;
		tk[i*8+61] |= ROR(tmp[6],22) & 0x0c0c0c0c;
		tk[i*8+62] = ROR(tmp[5],30) & 0x30303030;
		tk[i*8+62] |= ROR(tmp[5],22) & 0x0c0c0c0c;
		tk[i*8+63] = ROR(tmp[4],30) & 0x30303030;
		tk[i*8+63] |= ROR(tmp[4],22) & 0x0c0c0c0c;
		//if (test && (i+8 < rounds)) { //only if next loop iteration
		if (i+8 < rounds) { //only if next loop iteration
			tk[i*8+64] = tmp[6] & 0xf0f0f0f0; //mask to extract rows 1&2 only
			tk[i*8+65] = tmp[5] & 0xf0f0f0f0;
			tk[i*8+66] = tmp[0] & 0xf0f0f0f0;
			tk[i*8+67] = tmp[1] & 0xf0f0f0f0;
			tk[i*8+68] = tmp[3] & 0xf0f0f0f0;
			tk[i*8+69] = tmp[7] & 0xf0f0f0f0;
			tk[i*8+70] = tmp[4] & 0xf0f0f0f0;
			tk[i*8+71] = tmp[2] & 0xf0f0f0f0;
		}
	}
}
//Precompute LFSR2(TK2) ^ LFSR3(TK3) ^ rconst for 'rounds' rounds, in the
//fixsliced representation.
void precompute_rtk2_3(u32* rtk, const u8* tk2, const u8 * tk3, int rounds) {
	memset(rtk, 0x00, 32*rounds);
	precompute_lfsr_tk2(rtk, tk2, tk2, rounds);
	precompute_lfsr_tk3(rtk, tk3, tk3, rounds);
	// rtk+16 (the round-2 slot) is still all-zero here: the LFSR
	// precomputations above only fill rounds 0, 1, 3, 5, ..., and
	// permute_tk derives the remaining slots itself. Passing it as the
	// TK1 input therefore XORs in zeros, i.e. no TK1 contribution.
	permute_tk(rtk, (u8*)(rtk+16), (u8*)(rtk+16), rounds); // rtk+16 is NULL
	for(int i = 0; i < rounds; i++) { // add rconsts
		for(int j = 0; j < 8; j++)
			rtk[i*8+j] ^= rconst_32_bs[i*8+j];
	}
}
//Precompute the round tweakeys for TK1. Only 16 rounds are expanded since
//the TK1 schedule repeats with period 16 (see the rtk1[8*16] field).
void precompute_rtk1(u32* rtk1, const u8* tk1, const u8* tk1_bis) {
	const int rounds = 16;
	memset(rtk1, 0x00, rounds * 32);
	permute_tk(rtk1, tk1, tk1_bis, rounds);
}
#ifndef TK_SCHEDULE_BS_H_
#define TK_SCHEDULE_BS_H_
typedef unsigned char u8;
typedef unsigned int u32;
/* Precomputed round tweakeys in the fixsliced representation: rtk1 covers
 * 16 rounds, rtk2_3 covers 56 rounds (8 bitsliced words per round). */
typedef struct {
	u32 rtk1[8*16];
	u32 rtk2_3[8*56];
} tweakey;
void packing(u32* out, const u8* block0, const u8* block1);
void unpacking(u8* out, u8* out_bis, u32 *in);
void precompute_rtk2_3(u32* rtk, const u8* tk2, const u8* tk3, int rounds);
void precompute_rtk1(u32* rtk1, const u8* tk1, const u8* tk1_bis);
/* Bitsliced LFSR for TK2. NOTE: requires a u32 'tmp' in the calling scope. */
#define LFSR2(tk) ({ \
	tmp = (tk)[0] ^ (tk)[2]; \
	(tk)[0] = (tk)[1]; \
	(tk)[1] = (tk)[2]; \
	(tk)[2] = (tk)[3]; \
	(tk)[3] = (tk)[4]; \
	(tk)[4] = (tk)[5]; \
	(tk)[5] = (tk)[6]; \
	(tk)[6] = (tk)[7]; \
	(tk)[7] = tmp; \
})
/* Bitsliced LFSR for TK3. NOTE: requires a u32 'tmp' in the calling scope. */
#define LFSR3(tk) ({ \
	tmp = (tk)[7] ^ (tk)[1]; \
	(tk)[7] = (tk)[6]; \
	(tk)[6] = (tk)[5]; \
	(tk)[5] = (tk)[4]; \
	(tk)[4] = (tk)[3]; \
	(tk)[3] = (tk)[2]; \
	(tk)[2] = (tk)[1]; \
	(tk)[1] = (tk)[0]; \
	(tk)[0] = tmp; \
})
/* XORs the 8-word block 'y' into 'x'. */
#define XOR_BLOCK(x,y) ({ \
	(x)[0] ^= (y)[0]; \
	(x)[1] ^= (y)[1]; \
	(x)[2] ^= (y)[2]; \
	(x)[3] ^= (y)[3]; \
	(x)[4] ^= (y)[4]; \
	(x)[5] ^= (y)[5]; \
	(x)[6] ^= (y)[6]; \
	(x)[7] ^= (y)[7]; \
})
/* Swaps the bits of 'b' selected by 'mask' with the bits of 'a' selected by
 * 'mask << n'. NOTE: requires a u32 'tmp' in the calling scope; arguments
 * are evaluated more than once. */
#define SWAPMOVE(a, b, mask, n) ({ \
	tmp = (b ^ (a >> n)) & mask; \
	b ^= tmp; \
	a ^= (tmp << n); \
})
/* Loads 4 little-endian bytes from 'y' into the u32 pointed to by 'x'.
 * Wrapped as a single expression statement so it is safe in unbraced
 * if/else bodies (CERT PRE10-C). */
#define LE_LOAD(x, y) \
	((void)(*(x) = (((u32)(y)[3] << 24) | \
		((u32)(y)[2] << 16) | \
		((u32)(y)[1] << 8) | \
		(y)[0])))
/* Stores the u32 'y' as 4 little-endian bytes at 'x'. Wrapped in
 * do/while(0) so the multi-statement body acts as one statement. */
#define LE_STORE(x, y) do { \
	(x)[0] = (y) & 0xff; \
	(x)[1] = ((y) >> 8) & 0xff; \
	(x)[2] = ((y) >> 16) & 0xff; \
	(x)[3] = (y) >> 24; \
} while(0)
/* Rotates the 32-bit word 'x' right by 'y' positions (0 < y < 32). */
#define ROR(x,y) (((x) >> (y)) | ((x) << (32 - (y))))
#endif // TK_SCHEDULE_BS_H_
\ No newline at end of file
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment