Merge branch 'email-submissions'

121de979 · Enrico Pozzobon · 20e3a3f4 · b9419420 · 121de979 · 121de979
Commit 121de979 authored Jun 01, 2020 by Enrico Pozzobon
47 changed files
--- a/drygascon/Implementations/crypto_aead/drygascon128/add_arm_cortex-m/aead-common.c
+++ b/drygascon/Implementations/crypto_aead/drygascon128/add_arm_cortex-m/aead-common.c
+/*
+ * Copyright (C) 2020 Southern Storm Software, Pty Ltd.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ */
+
+#include "aead-common.h"
+
+int aead_check_tag
+    (unsigned char *plaintext, unsigned long long plaintext_len,
+     const unsigned char *tag1, const unsigned char *tag2,
+     unsigned size)
+{
+    /* Compare the "size" bytes at tag1 and tag2: every byte pair is XORed
+     * and the differences are ORed together, so the loop always walks the
+     * whole tag and never exits early on a mismatch.  A size of zero
+     * compares nothing and therefore counts as a match.
+     * Afterwards "accum" is turned into a mask:
+     * -1 if the tags match, or 0 if they don't match */
+    int accum = 0;
+    while (size > 0) {
+        accum |= (*tag1++ ^ *tag2++); /* stays 0 only while all pairs are equal */
+        --size;
+    }
+    accum = (accum - 1) >> 8; /* accum was 0..255: 0 becomes -1 (all ones),
+                               * 1..255 become 0.  Right-shifting a negative
+                               * int is implementation-defined in C; this
+                               * relies on an arithmetic (sign-filling) shift. */
+
+    /* Destroy the plaintext if the tag match failed: ANDing with -1 keeps
+     * each byte, ANDing with 0 clears it.  The plaintext pointer is never
+     * dereferenced when plaintext_len is 0, so it may be null then. */
+    while (plaintext_len > 0) {
+        *plaintext++ &= accum;
+        --plaintext_len;
+    }
+
+    /* If "accum" is 0 (mismatch), return -1, otherwise (match) return 0 */
+    return ~accum;
+}
+
+int aead_check_tag_precheck
+    (unsigned char *plaintext, unsigned long long plaintext_len,
+     const unsigned char *tag1, const unsigned char *tag2,
+     unsigned size, int precheck)
+{
+    /* Compare the "size" bytes at tag1 and tag2 by XORing every byte pair
+     * and ORing the differences together; the loop always walks the whole
+     * tag.  The result is then combined with "precheck" so that "accum"
+     * ends up as -1 only if the tags match AND precheck is -1,
+     * or 0 if they don't match (or precheck is 0) */
+    int accum = 0;
+    while (size > 0) {
+        accum |= (*tag1++ ^ *tag2++); /* stays 0 only while all pairs are equal */
+        --size;
+    }
+    accum = ((accum - 1) >> 8) & precheck; /* 0 -> -1, 1..255 -> 0, then masked
+                                            * by the caller's earlier verdict.
+                                            * Relies on an arithmetic right
+                                            * shift of a negative int, which is
+                                            * implementation-defined in C. */
+
+    /* Destroy the plaintext if the tag match failed: ANDing with -1 keeps
+     * each byte, ANDing with 0 clears it.  The plaintext pointer is never
+     * dereferenced when plaintext_len is 0. */
+    while (plaintext_len > 0) {
+        *plaintext++ &= accum;
+        --plaintext_len;
+    }
+
+    /* If "accum" is 0 (failure), return -1, otherwise (success) return 0 */
+    return ~accum;
+}
--- a/drygascon/Implementations/crypto_aead/drygascon128/add_arm_cortex-m/aead-common.h
+++ b/drygascon/Implementations/crypto_aead/drygascon128/add_arm_cortex-m/aead-common.h
+/*
+ * Copyright (C) 2020 Southern Storm Software, Pty Ltd.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ */
+
+#ifndef LWCRYPTO_AEAD_COMMON_H
+#define LWCRYPTO_AEAD_COMMON_H
+
+#include <stddef.h>
+
+/**
+ * \file aead-common.h
+ * \brief Definitions that are common across AEAD schemes.
+ *
+ * AEAD stands for "Authenticated Encryption with Associated Data".
+ * It is a standard API pattern for securely encrypting and
+ * authenticating packets of data.
+ */
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/**
+ * \brief Encrypts and authenticates a packet with an AEAD scheme.
+ *
+ * \param c Buffer to receive the output.
+ * \param clen On exit, set to the length of the output which includes
+ * the ciphertext and the authentication tag.
+ * \param m Buffer that contains the plaintext message to encrypt.
+ * \param mlen Length of the plaintext message in bytes.
+ * \param ad Buffer that contains associated data to authenticate
+ * along with the packet but which does not need to be encrypted.
+ * \param adlen Length of the associated data in bytes.
+ * \param nsec Secret nonce - normally not used by AEAD schemes.
+ * \param npub Points to the public nonce for the packet.
+ * \param k Points to the key to use to encrypt the packet.
+ *
+ * \return 0 on success, or a negative value if there was an error in
+ * the parameters.
+ */
+typedef int (*aead_cipher_encrypt_t)
+    (unsigned char *c, unsigned long long *clen,
+     const unsigned char *m, unsigned long long mlen,
+     const unsigned char *ad, unsigned long long adlen,
+     const unsigned char *nsec,
+     const unsigned char *npub,
+     const unsigned char *k);
+
+/**
+ * \brief Decrypts and authenticates a packet with an AEAD scheme.
+ *
+ * \param m Buffer to receive the plaintext message on output.
+ * \param mlen Receives the length of the plaintext message on output.
+ * \param nsec Secret nonce - normally not used by AEAD schemes.
+ * \param c Buffer that contains the ciphertext and authentication
+ * tag to decrypt.
+ * \param clen Length of the input data in bytes, which includes the
+ * ciphertext and the authentication tag.
+ * \param ad Buffer that contains associated data to authenticate
+ * along with the packet but which does not need to be encrypted.
+ * \param adlen Length of the associated data in bytes.
+ * \param npub Points to the public nonce for the packet.
+ * \param k Points to the key to use to decrypt the packet.
+ *
+ * \return 0 on success, -1 if the authentication tag was incorrect,
+ * or some other negative number if there was an error in the parameters.
+ */
+typedef int (*aead_cipher_decrypt_t)
+    (unsigned char *m, unsigned long long *mlen,
+     unsigned char *nsec,
+     const unsigned char *c, unsigned long long clen,
+     const unsigned char *ad, unsigned long long adlen,
+     const unsigned char *npub,
+     const unsigned char *k);
+
+/**
+ * \brief Hashes a block of input data.
+ *
+ * \param out Buffer to receive the hash output.
+ * \param in Points to the input data to be hashed.
+ * \param inlen Length of the input data in bytes.
+ *
+ * \return Returns zero on success or -1 if there was an error in the
+ * parameters.
+ */
+typedef int (*aead_hash_t)
+    (unsigned char *out, const unsigned char *in, unsigned long long inlen);
+
+/**
+ * \brief Initializes the state for a hashing operation.
+ *
+ * \param state Hash state to be initialized.
+ */
+typedef void (*aead_hash_init_t)(void *state);
+
+/**
+ * \brief Updates a hash state with more input data.
+ *
+ * \param state Hash state to be updated.
+ * \param in Points to the input data to be incorporated into the state.
+ * \param inlen Length of the input data to be incorporated into the state.
+ */
+typedef void (*aead_hash_update_t)
+    (void *state, const unsigned char *in, unsigned long long inlen);
+
+/**
+ * \brief Returns the final hash value from a hashing operation.
+ *
+ * \param state Hash state to be finalized.
+ * \param out Points to the output buffer to receive the hash value.
+ */
+typedef void (*aead_hash_finalize_t)(void *state, unsigned char *out);
+
+/**
+ * \brief Absorbs more input data into an XOF state.
+ *
+ * \param state XOF state to be updated.
+ * \param in Points to the input data to be absorbed into the state.
+ * \param inlen Length of the input data to be absorbed into the state.
+ *
+ * \sa ascon_xof_init(), ascon_xof_squeeze()
+ */
+typedef void (*aead_xof_absorb_t)
+    (void *state, const unsigned char *in, unsigned long long inlen);
+
+/**
+ * \brief Squeezes output data from an XOF state.
+ *
+ * \param state XOF state to squeeze the output data from.
+ * \param out Points to the output buffer to receive the squeezed data.
+ * \param outlen Number of bytes of data to squeeze out of the state.
+ */
+typedef void (*aead_xof_squeeze_t)
+    (void *state, unsigned char *out, unsigned long long outlen);
+
+/**
+ * \brief No special AEAD features.
+ */
+#define AEAD_FLAG_NONE          0x0000
+
+/**
+ * \brief The natural byte order of the AEAD cipher is little-endian.
+ *
+ * If this flag is not present, then the natural byte order of the
+ * AEAD cipher should be assumed to be big-endian.
+ *
+ * The natural byte order may be useful when formatting packet sequence
+ * numbers as nonces.  The application needs to know whether the sequence
+ * number should be packed into the leading or trailing bytes of the nonce.
+ */
+#define AEAD_FLAG_LITTLE_ENDIAN 0x0001
+
+/**
+ * \brief Meta-information about an AEAD cipher.
+ */
+typedef struct
+{
+    const char *name;               /**< Name of the cipher */
+    unsigned key_len;               /**< Length of the key in bytes */
+    unsigned nonce_len;             /**< Length of the nonce in bytes */
+    unsigned tag_len;               /**< Length of the tag in bytes */
+    unsigned flags;                 /**< Flags for extra features */
+    aead_cipher_encrypt_t encrypt;  /**< AEAD encryption function */
+    aead_cipher_decrypt_t decrypt;  /**< AEAD decryption function */
+    unsigned char *expected;        /**< AEAD encryption benchmark expected result */
+} aead_cipher_t;
+
+/**
+ * \brief Meta-information about a hash algorithm that is related to an AEAD.
+ *
+ * Regular hash algorithms should provide the "hash", "init", "update",
+ * and "finalize" functions.  Extensible Output Functions (XOF's) should
+ * provide the "hash", "init", "absorb", and "squeeze" functions.
+ */
+typedef struct
+{
+    const char *name;           /**< Name of the hash algorithm */
+    size_t state_size;          /**< Size of the incremental state structure */
+    unsigned hash_len;          /**< Length of the hash in bytes */
+    unsigned flags;             /**< Flags for extra features */
+    aead_hash_t hash;           /**< All in one hashing function */
+    aead_hash_init_t init;      /**< Incremental hash/XOF init function */
+    aead_hash_update_t update;  /**< Incremental hash update function */
+    aead_hash_finalize_t finalize; /**< Incremental hash finalize function */
+    aead_xof_absorb_t absorb;   /**< Incremental XOF absorb function */
+    aead_xof_squeeze_t squeeze; /**< Incremental XOF squeeze function */
+
+} aead_hash_algorithm_t;
+
+/**
+ * \brief Check an authentication tag in constant time.
+ *
+ * \param plaintext Points to the plaintext data.
+ * \param plaintext_len Length of the plaintext in bytes.
+ * \param tag1 First tag to compare.
+ * \param tag2 Second tag to compare.
+ * \param tag_len Length of the tags in bytes.
+ *
+ * \return Returns -1 if the tag check failed or 0 if the check succeeded.
+ *
+ * If the tag check fails, then the \a plaintext will also be zeroed to
+ * prevent it from being used accidentally by the application when the
+ * ciphertext was invalid.
+ */
+int aead_check_tag
+    (unsigned char *plaintext, unsigned long long plaintext_len,
+     const unsigned char *tag1, const unsigned char *tag2,
+     unsigned tag_len);
+
+/**
+ * \brief Check an authentication tag in constant time with a previous check.
+ *
+ * \param plaintext Points to the plaintext data.
+ * \param plaintext_len Length of the plaintext in bytes.
+ * \param tag1 First tag to compare.
+ * \param tag2 Second tag to compare.
+ * \param tag_len Length of the tags in bytes.
+ * \param precheck Set to -1 if previous check succeeded or 0 if it failed.
+ *
+ * \return Returns -1 if the tag check failed or 0 if the check succeeded.
+ *
+ * If the tag check fails, then the \a plaintext will also be zeroed to
+ * prevent it from being used accidentally by the application when the
+ * ciphertext was invalid.
+ *
+ * This version can be used to incorporate other information about the
+ * correctness of the plaintext into the final result.
+ */
+int aead_check_tag_precheck
+    (unsigned char *plaintext, unsigned long long plaintext_len,
+     const unsigned char *tag1, const unsigned char *tag2,
+     unsigned tag_len, int precheck);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
--- a/drygascon/Implementations/crypto_aead/drygascon128/add_arm_cortex-m/api.h
+++ b/drygascon/Implementations/crypto_aead/drygascon128/add_arm_cortex-m/api.h
+#define CRYPTO_KEYBYTES 16
+#define CRYPTO_NSECBYTES 0
+#define CRYPTO_NPUBBYTES 16
+#define CRYPTO_ABYTES 16
+#define CRYPTO_NOOVERLAP 1
--- a/drygascon/Implementations/crypto_aead/drygascon128/add_arm_cortex-m/drygascon.c
+++ b/drygascon/Implementations/crypto_aead/drygascon128/add_arm_cortex-m/drygascon.c
+/*
+ * Copyright (C) 2020 Southern Storm Software, Pty Ltd.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ */
+
+#include "drygascon.h"
+#include "internal-drysponge.h"
+#include <string.h>
+
+uint8_t drygascon128k32_expected[DRYGASCON128_TAG_SIZE]={0x66,0x5A,0xDE,0x6C,0x0F,0xBD,0x48,0x8C,0x5E,0xA4,0x77,0x5D,0xD6,0x24,0xDA,0xD7};
+
+uint8_t drygascon128k56_expected[DRYGASCON128_TAG_SIZE]={0x7B,0x8B,0x9D,0x58,0xA7,0xF7,0x5F,0x1E,0x56,0x99,0x46,0xD6,0x24,0xC4,0xF7,0x68};
+
+uint8_t drygascon128k16_expected[DRYGASCON128_TAG_SIZE]={0x14,0xA5,0x21,0x17,0xFF,0x52,0x4F,0x7C,0xCB,0xB3,0xEB,0xE4,0x05,0xEF,0x18,0xA4};
+
+/**
+ * \brief Meta-information block for the DryGASCON128k32 cipher variant.
+ *
+ * The initializer is positional and follows the field order of
+ * aead_cipher_t: name, key length, nonce length, tag length, flags,
+ * encrypt function, decrypt function, expected benchmark result.
+ */
+aead_cipher_t const drygascon128k32_cipher = {
+    "DryGASCON128k32",
+    DRYGASCON128_FASTKEY_SIZE,
+    DRYGASCON128_NONCE_SIZE,
+    DRYGASCON128_TAG_SIZE,
+    AEAD_FLAG_LITTLE_ENDIAN,
+    drygascon128k32_aead_encrypt,
+    drygascon128k32_aead_decrypt,
+    drygascon128k32_expected
+};
+
+/**
+ * \brief Meta-information block for the default DryGASCON128 cipher.
+ *
+ * Every field, including the reported name "DryGASCON128k32", is the same
+ * as in drygascon128k32_cipher, so this descriptor selects the k32 variant.
+ *
+ * The initializer is positional and follows the field order of
+ * aead_cipher_t: name, key length, nonce length, tag length, flags,
+ * encrypt function, decrypt function, expected benchmark result.
+ */
+aead_cipher_t const drygascon128_cipher = {
+    "DryGASCON128k32",
+    DRYGASCON128_FASTKEY_SIZE,
+    DRYGASCON128_NONCE_SIZE,
+    DRYGASCON128_TAG_SIZE,
+    AEAD_FLAG_LITTLE_ENDIAN,
+    drygascon128k32_aead_encrypt,
+    drygascon128k32_aead_decrypt,
+    drygascon128k32_expected
+};
+
+/**
+ * \brief Meta-information block for the DryGASCON128k56 cipher variant.
+ *
+ * The initializer is positional and follows the field order of
+ * aead_cipher_t: name, key length, nonce length, tag length, flags,
+ * encrypt function, decrypt function, expected benchmark result.
+ */
+aead_cipher_t const drygascon128k56_cipher = {
+    "DryGASCON128k56",
+    DRYGASCON128_SAFEKEY_SIZE,
+    DRYGASCON128_NONCE_SIZE,
+    DRYGASCON128_TAG_SIZE,
+    AEAD_FLAG_LITTLE_ENDIAN,
+    drygascon128k56_aead_encrypt,
+    drygascon128k56_aead_decrypt,
+    drygascon128k56_expected
+};
+
+/**
+ * \brief Meta-information block for the DryGASCON128k16 cipher variant.
+ *
+ * The initializer is positional and follows the field order of
+ * aead_cipher_t: name, key length, nonce length, tag length, flags,
+ * encrypt function, decrypt function, expected benchmark result.
+ */
+aead_cipher_t const drygascon128k16_cipher = {
+    "DryGASCON128k16",
+    DRYGASCON128_MINKEY_SIZE,
+    DRYGASCON128_NONCE_SIZE,
+    DRYGASCON128_TAG_SIZE,
+    AEAD_FLAG_LITTLE_ENDIAN,
+    drygascon128k16_aead_encrypt,
+    drygascon128k16_aead_decrypt,
+    drygascon128k16_expected
+};
+
+/**
+ * \brief Meta-information block for the DryGASCON256 cipher.
+ *
+ * The initializer is positional and follows the field order of
+ * aead_cipher_t: name, key length, nonce length, tag length, flags,
+ * encrypt function, decrypt function, expected benchmark result.
+ * No expected benchmark result is supplied for this cipher, so that
+ * field is an explicit null pointer.
+ */
+aead_cipher_t const drygascon256_cipher = {
+    "DryGASCON256",
+    DRYGASCON256_KEY_SIZE,
+    DRYGASCON256_NONCE_SIZE,
+    DRYGASCON256_TAG_SIZE,
+    AEAD_FLAG_LITTLE_ENDIAN,
+    drygascon256_aead_encrypt,
+    drygascon256_aead_decrypt,
+    (unsigned char *)0
+};
+
+aead_hash_algorithm_t const drygascon128_hash_algorithm = {
+    "DryGASCON128-HASH",        /* name */
+    sizeof(int),                /* state_size; NOTE(review): looks like a
+                                 * placeholder because no incremental
+                                 * functions are supplied below - confirm */
+    DRYGASCON128_HASH_SIZE,     /* hash_len */
+    AEAD_FLAG_LITTLE_ENDIAN,    /* flags */
+    drygascon128_hash,          /* hash: all-in-one hashing function */
+    (aead_hash_init_t)0,        /* init: not provided */
+    (aead_hash_update_t)0,      /* update: not provided */
+    (aead_hash_finalize_t)0,    /* finalize: not provided */
+    (aead_xof_absorb_t)0,       /* absorb: not provided */
+    (aead_xof_squeeze_t)0       /* squeeze: not provided */
+};
+
+aead_hash_algorithm_t const drygascon256_hash_algorithm = {
+    "DryGASCON256-HASH",        /* name */
+    sizeof(int),                /* state_size; NOTE(review): looks like a
+                                 * placeholder because no incremental
+                                 * functions are supplied below - confirm */
+    DRYGASCON256_HASH_SIZE,     /* hash_len */
+    AEAD_FLAG_LITTLE_ENDIAN,    /* flags */
+    drygascon256_hash,          /* hash: all-in-one hashing function */
+    (aead_hash_init_t)0,        /* init: not provided */
+    (aead_hash_update_t)0,      /* update: not provided */
+    (aead_hash_finalize_t)0,    /* finalize: not provided */
+    (aead_xof_absorb_t)0,       /* absorb: not provided */
+    (aead_xof_squeeze_t)0       /* squeeze: not provided */
+};
+
+/**
+ * \brief Processes associated data for DryGASCON128.
+ *
+ * \param state DrySPONGE128 sponge state.
+ * \param ad Points to the associated data.
+ * \param adlen Length of the associated data.  The AEAD entry points in
+ * this file only call this function when the length is non-zero, but
+ * drygascon128_hash() passes its input length through unchecked, so a
+ * zero length can reach the final-block step from there.
+ * \param finalize Non-zero to finalize packet processing because
+ * the message is zero-length.
+ *
+ * Whole blocks of DRYSPONGE128_RATE bytes are fed to drygascon128_f_wrap()
+ * while more than one block remains.  The remaining bytes (up to one full
+ * block) are fed last, after state->domain has been overwritten with the
+ * associated-data domain plus the optional FINAL and PADDED bits.
+ */
+static void drygascon128_process_ad
+    (drysponge128_state_t *state, const unsigned char *ad,
+     unsigned long long adlen, int finalize)
+{
+    /* Process all blocks except the last one; the strict ">" keeps a
+     * trailing full-size block for the final step below */
+    while (adlen > DRYSPONGE128_RATE) {
+        drygascon128_f_wrap(state, ad, DRYSPONGE128_RATE);
+        //drysponge128_g_core(state);  NOTE(review): disabled call; presumably drygascon128_f_wrap() now covers this step - confirm and delete
+        ad += DRYSPONGE128_RATE;
+        adlen -= DRYSPONGE128_RATE;
+    }
+
+    /* Process the last block with domain separation and padding; PADDED
+     * is only set when the last block is shorter than the rate */
+    state->domain = DRYDOMAIN128_ASSOC_DATA;
+    if (finalize)
+        state->domain |= DRYDOMAIN128_FINAL;
+    if (adlen < DRYSPONGE128_RATE)
+        state->domain |= DRYDOMAIN128_PADDED;
+    drygascon128_f_wrap(state, ad, (unsigned)adlen);
+    //drysponge128_g(state);  NOTE(review): disabled call; presumably drygascon128_f_wrap() now covers this step - confirm and delete
+}
+
+/**
+ * \brief Processes associated data for DryGASCON256.
+ *
+ * \param state DrySPONGE256 sponge state.
+ * \param ad Points to the associated data.
+ * \param adlen Length of the associated data.  The AEAD entry points in
+ * this file only call this function when the length is non-zero, but
+ * drygascon256_hash() passes its input length through unchecked, so a
+ * zero length can reach the final-block step from there.
+ * \param finalize Non-zero to finalize packet processing because
+ * the message is zero-length.
+ *
+ * Whole blocks of DRYSPONGE256_RATE bytes are absorbed, each followed by
+ * drysponge256_g_core(), while more than one block remains.  The remaining
+ * bytes (up to one full block) are absorbed last, after state->domain has
+ * been overwritten with the associated-data domain plus the optional FINAL
+ * and PADDED bits, and are followed by drysponge256_g().
+ */
+static void drygascon256_process_ad
+    (drysponge256_state_t *state, const unsigned char *ad,
+     unsigned long long adlen, int finalize)
+{
+    /* Process all blocks except the last one; the strict ">" keeps a
+     * trailing full-size block for the final step below */
+    while (adlen > DRYSPONGE256_RATE) {
+        drysponge256_f_absorb(state, ad, DRYSPONGE256_RATE);
+        drysponge256_g_core(state);
+        ad += DRYSPONGE256_RATE;
+        adlen -= DRYSPONGE256_RATE;
+    }
+
+    /* Process the last block with domain separation and padding; PADDED
+     * is only set when the last block is shorter than the rate */
+    state->domain = DRYDOMAIN256_ASSOC_DATA;
+    if (finalize)
+        state->domain |= DRYDOMAIN256_FINAL;
+    if (adlen < DRYSPONGE256_RATE)
+        state->domain |= DRYDOMAIN256_PADDED;
+    drysponge256_f_absorb(state, ad, (unsigned)adlen);
+    drysponge256_g(state);
+}
+
+int drygascon128_aead_encrypt_core
+    (unsigned char *c, unsigned long long *clen,
+     const unsigned char *m, unsigned long long mlen,
+     const unsigned char *ad, unsigned long long adlen,
+	 unsigned int keysize,
+     const unsigned char *npub,
+     const unsigned char *k)
+{
+    drysponge128_state_t state;
+    unsigned temp;
+
+    /* Check we are safe: drysponge128_safe_alignement() is applied to the
+     * local state and a zero result aborts with -1 before *clen or c is
+     * written.  NOTE(review): presumably an alignment requirement of the
+     * assembly-backed routines - confirm against internal-drysponge.h */
+	if(!drysponge128_safe_alignement(&state)){
+		return -1;
+	}
+
+    /* Set the length of the returned ciphertext (message plus tag) */
+    *clen = mlen + DRYGASCON128_TAG_SIZE;
+
+    /* Initialize the sponge state with the key and nonce; the last
+     * argument is non-zero when there is neither associated data nor
+     * message to process */
+    drysponge128_setup(&state, k, keysize, npub, adlen == 0 && mlen == 0);
+
+    /* Process the associated data; it is flagged as final when no
+     * message follows */
+    if (adlen > 0)
+        drygascon128_process_ad(&state, ad, adlen, mlen == 0);
+
+    /* Encrypt the plaintext to produce the ciphertext */
+    if (mlen > 0) {
+        /* Process all blocks except the last one.  Each ciphertext block
+         * is produced from m and state.r.B first, then the plaintext block
+         * is fed to drygascon128_f_wrap().
+         * NOTE(review): because m is read again after c is written, c
+         * presumably must not alias m (api.h defines CRYPTO_NOOVERLAP) */
+        while (mlen > DRYSPONGE128_RATE) {
+            lw_xor_block_2_src(c, m, state.r.B, DRYSPONGE128_RATE);
+            drygascon128_f_wrap(&state, m, DRYSPONGE128_RATE);
+            c += DRYSPONGE128_RATE;
+            m += DRYSPONGE128_RATE;
+            mlen -= DRYSPONGE128_RATE;
+        }
+
+        /* Process the last block with domain separation and padding;
+         * 1..DRYSPONGE128_RATE bytes remain at this point */
+        state.domain = DRYDOMAIN128_MESSAGE | DRYDOMAIN128_FINAL;
+        if (mlen < DRYSPONGE128_RATE)
+            state.domain |= DRYDOMAIN128_PADDED;
+        temp = (unsigned)mlen;
+        lw_xor_block_2_src(c, m, state.r.B, temp);
+        drygascon128_f_wrap(&state, m, temp);
+        c += temp;
+    }
+
+    /* Generate the authentication tag: the current contents of state.r.B
+     * are appended directly after the ciphertext */
+    memcpy(c, state.r.B, DRYGASCON128_TAG_SIZE);
+    return 0;
+}
+
+int drygascon128_aead_decrypt_core
+    (unsigned char *m, unsigned long long *mlen,
+     unsigned int keysize,
+     const unsigned char *c, unsigned long long clen,
+     const unsigned char *ad, unsigned long long adlen,
+     const unsigned char *npub,
+     const unsigned char *k)
+{
+    drysponge128_state_t state;
+    unsigned char *mtemp = m;
+    unsigned temp;
+
+    /* Check we are safe: drysponge128_safe_alignement() is applied to the
+     * local state and a zero result aborts with -1 before *mlen or m is
+     * written.  NOTE(review): presumably an alignment requirement of the
+     * assembly-backed routines - confirm against internal-drysponge.h */
+    if(!drysponge128_safe_alignement(&state)){
+		return -1;
+	}
+
+    /* Validate the ciphertext length and set the return "mlen" value.
+     * *mlen is written before the tag is verified, so it is set even when
+     * authentication fails later on */
+    if (clen < DRYGASCON128_TAG_SIZE)
+        return -1;
+    *mlen = clen - DRYGASCON128_TAG_SIZE;
+
+    /* Initialize the sponge state with the key and nonce; from here on
+     * "clen" counts ciphertext bytes only, without the tag */
+    clen -= DRYGASCON128_TAG_SIZE;
+    drysponge128_setup(&state, k, keysize, npub, adlen == 0 && clen == 0);
+
+    /* Process the associated data; it is flagged as final when no
+     * ciphertext follows */
+    if (adlen > 0)
+        drygascon128_process_ad(&state, ad, adlen, clen == 0);
+
+    /* Decrypt the ciphertext to produce the plaintext */
+    if (clen > 0) {
+        /* Process all blocks except the last one.  Each plaintext block is
+         * produced from c and state.r.B first, then the recovered plaintext
+         * is fed to drygascon128_f_wrap() */
+        while (clen > DRYSPONGE128_RATE) {
+            lw_xor_block_2_src(m, c, state.r.B, DRYSPONGE128_RATE);
+            drygascon128_f_wrap(&state, m, DRYSPONGE128_RATE);
+            //drysponge128_g(&state);  NOTE(review): disabled call; presumably drygascon128_f_wrap() now covers this step - confirm and delete
+            c += DRYSPONGE128_RATE;
+            m += DRYSPONGE128_RATE;
+            clen -= DRYSPONGE128_RATE;
+        }
+
+        /* Process the last block with domain separation and padding;
+         * 1..DRYSPONGE128_RATE bytes remain at this point */
+        state.domain = DRYDOMAIN128_MESSAGE | DRYDOMAIN128_FINAL;
+        if (clen < DRYSPONGE128_RATE)
+            state.domain |= DRYDOMAIN128_PADDED;
+        temp = (unsigned)clen;
+        lw_xor_block_2_src(m, c, state.r.B, temp);
+        drygascon128_f_wrap(&state, m, temp);
+        //drysponge128_g(&state);  NOTE(review): disabled call; presumably drygascon128_f_wrap() now covers this step - confirm and delete
+        c += temp;
+    }
+
+    /* Check the authentication tag: "c" now points at the received tag,
+     * which is compared with state.r.B.  aead_check_tag() clears the whole
+     * plaintext buffer (from its saved start "mtemp") and returns -1 on a
+     * mismatch, or returns 0 on a match */
+    return aead_check_tag(mtemp, *mlen, state.r.B, c, DRYGASCON128_TAG_SIZE);
+}
+
+/**
+ * \brief Encrypts and authenticates a packet with DryGASCON128k16.
+ *
+ * Thin wrapper around drygascon128_aead_encrypt_core() that supplies a
+ * key size argument of 16.
+ *
+ * \param c Buffer to receive the ciphertext followed by the tag.
+ * \param clen On exit, set to the length of the output.
+ * \param m Buffer that contains the plaintext message to encrypt.
+ * \param mlen Length of the plaintext message in bytes.
+ * \param ad Buffer that contains the associated data.
+ * \param adlen Length of the associated data in bytes.
+ * \param nsec Secret nonce - not used.
+ * \param npub Points to the public nonce for the packet.
+ * \param k Points to the key to use to encrypt the packet.
+ *
+ * \return The result of drygascon128_aead_encrypt_core(): 0 on success
+ * or -1 if its safety check on the sponge state fails.
+ */
+int drygascon128k16_aead_encrypt
+    (unsigned char *c, unsigned long long *clen,
+     const unsigned char *m, unsigned long long mlen,
+     const unsigned char *ad, unsigned long long adlen,
+     const unsigned char *nsec,
+     const unsigned char *npub,
+     const unsigned char *k)
+{
+    (void)nsec; /* part of the common AEAD signature only */
+    return drygascon128_aead_encrypt_core
+        (c, clen, m, mlen, ad, adlen, 16, npub, k);
+}
+
+/**
+ * \brief Encrypts and authenticates a packet with DryGASCON128k32.
+ *
+ * Thin wrapper around drygascon128_aead_encrypt_core() that supplies a
+ * key size argument of 32.
+ *
+ * \param c Buffer to receive the ciphertext followed by the tag.
+ * \param clen On exit, set to the length of the output.
+ * \param m Buffer that contains the plaintext message to encrypt.
+ * \param mlen Length of the plaintext message in bytes.
+ * \param ad Buffer that contains the associated data.
+ * \param adlen Length of the associated data in bytes.
+ * \param nsec Secret nonce - not used.
+ * \param npub Points to the public nonce for the packet.
+ * \param k Points to the key to use to encrypt the packet.
+ *
+ * \return The result of drygascon128_aead_encrypt_core(): 0 on success
+ * or -1 if its safety check on the sponge state fails.
+ */
+int drygascon128k32_aead_encrypt
+    (unsigned char *c, unsigned long long *clen,
+     const unsigned char *m, unsigned long long mlen,
+     const unsigned char *ad, unsigned long long adlen,
+     const unsigned char *nsec,
+     const unsigned char *npub,
+     const unsigned char *k)
+{
+    (void)nsec; /* part of the common AEAD signature only */
+    return drygascon128_aead_encrypt_core
+        (c, clen, m, mlen, ad, adlen, 32, npub, k);
+}
+
+/**
+ * \brief Encrypts and authenticates a packet with DryGASCON128k56.
+ *
+ * Thin wrapper around drygascon128_aead_encrypt_core() that supplies a
+ * key size argument of 56.
+ *
+ * \param c Buffer to receive the ciphertext followed by the tag.
+ * \param clen On exit, set to the length of the output.
+ * \param m Buffer that contains the plaintext message to encrypt.
+ * \param mlen Length of the plaintext message in bytes.
+ * \param ad Buffer that contains the associated data.
+ * \param adlen Length of the associated data in bytes.
+ * \param nsec Secret nonce - not used.
+ * \param npub Points to the public nonce for the packet.
+ * \param k Points to the key to use to encrypt the packet.
+ *
+ * \return The result of drygascon128_aead_encrypt_core(): 0 on success
+ * or -1 if its safety check on the sponge state fails.
+ */
+int drygascon128k56_aead_encrypt
+    (unsigned char *c, unsigned long long *clen,
+     const unsigned char *m, unsigned long long mlen,
+     const unsigned char *ad, unsigned long long adlen,
+     const unsigned char *nsec,
+     const unsigned char *npub,
+     const unsigned char *k)
+{
+    (void)nsec; /* part of the common AEAD signature only */
+    return drygascon128_aead_encrypt_core
+        (c, clen, m, mlen, ad, adlen, 56, npub, k);
+}
+
+
+/**
+ * \brief Decrypts and authenticates a packet with DryGASCON128k16.
+ *
+ * Thin wrapper around drygascon128_aead_decrypt_core() that supplies a
+ * key size argument of 16.
+ *
+ * \param m Buffer to receive the plaintext message on output.
+ * \param mlen Receives the length of the plaintext message on output.
+ * \param nsec Secret nonce - not used.
+ * \param c Buffer that contains the ciphertext and authentication tag.
+ * \param clen Length of the input data in bytes, including the tag.
+ * \param ad Buffer that contains the associated data.
+ * \param adlen Length of the associated data in bytes.
+ * \param npub Points to the public nonce for the packet.
+ * \param k Points to the key to use to decrypt the packet.
+ *
+ * \return The result of drygascon128_aead_decrypt_core(): 0 on success,
+ * or -1 if the tag is incorrect, the input is shorter than a tag, or the
+ * safety check on the sponge state fails.
+ */
+int drygascon128k16_aead_decrypt
+    (unsigned char *m, unsigned long long *mlen,
+     unsigned char *nsec,
+     const unsigned char *c, unsigned long long clen,
+     const unsigned char *ad, unsigned long long adlen,
+     const unsigned char *npub,
+     const unsigned char *k)
+{
+    (void)nsec; /* part of the common AEAD signature only */
+    return drygascon128_aead_decrypt_core
+        (m, mlen, 16, c, clen, ad, adlen, npub, k);
+}
+
+/**
+ * \brief Decrypts and authenticates a packet with DryGASCON128k32.
+ *
+ * Thin wrapper around drygascon128_aead_decrypt_core() that supplies a
+ * key size argument of 32.
+ *
+ * \param m Buffer to receive the plaintext message on output.
+ * \param mlen Receives the length of the plaintext message on output.
+ * \param nsec Secret nonce - not used.
+ * \param c Buffer that contains the ciphertext and authentication tag.
+ * \param clen Length of the input data in bytes, including the tag.
+ * \param ad Buffer that contains the associated data.
+ * \param adlen Length of the associated data in bytes.
+ * \param npub Points to the public nonce for the packet.
+ * \param k Points to the key to use to decrypt the packet.
+ *
+ * \return The result of drygascon128_aead_decrypt_core(): 0 on success,
+ * or -1 if the tag is incorrect, the input is shorter than a tag, or the
+ * safety check on the sponge state fails.
+ */
+int drygascon128k32_aead_decrypt
+    (unsigned char *m, unsigned long long *mlen,
+     unsigned char *nsec,
+     const unsigned char *c, unsigned long long clen,
+     const unsigned char *ad, unsigned long long adlen,
+     const unsigned char *npub,
+     const unsigned char *k)
+{
+    (void)nsec; /* part of the common AEAD signature only */
+    return drygascon128_aead_decrypt_core
+        (m, mlen, 32, c, clen, ad, adlen, npub, k);
+}
+
+/**
+ * \brief Decrypts and authenticates a packet with DryGASCON128k56.
+ *
+ * Thin wrapper around drygascon128_aead_decrypt_core() that supplies a
+ * key size argument of 56.
+ *
+ * \param m Buffer to receive the plaintext message on output.
+ * \param mlen Receives the length of the plaintext message on output.
+ * \param nsec Secret nonce - not used.
+ * \param c Buffer that contains the ciphertext and authentication tag.
+ * \param clen Length of the input data in bytes, including the tag.
+ * \param ad Buffer that contains the associated data.
+ * \param adlen Length of the associated data in bytes.
+ * \param npub Points to the public nonce for the packet.
+ * \param k Points to the key to use to decrypt the packet.
+ *
+ * \return The result of drygascon128_aead_decrypt_core(): 0 on success,
+ * or -1 if the tag is incorrect, the input is shorter than a tag, or the
+ * safety check on the sponge state fails.
+ */
+int drygascon128k56_aead_decrypt
+    (unsigned char *m, unsigned long long *mlen,
+     unsigned char *nsec,
+     const unsigned char *c, unsigned long long clen,
+     const unsigned char *ad, unsigned long long adlen,
+     const unsigned char *npub,
+     const unsigned char *k)
+{
+    (void)nsec; /* part of the common AEAD signature only */
+    return drygascon128_aead_decrypt_core
+        (m, mlen, 56, c, clen, ad, adlen, npub, k);
+}
+
+int drygascon256_aead_encrypt
+    (unsigned char *c, unsigned long long *clen,
+     const unsigned char *m, unsigned long long mlen,
+     const unsigned char *ad, unsigned long long adlen,
+     const unsigned char *nsec,
+     const unsigned char *npub,
+     const unsigned char *k)
+{
+    drysponge256_state_t state;
+    unsigned temp;
+    (void)nsec; /* part of the common AEAD signature only */
+
+    /* Set the length of the returned ciphertext (message plus tag) */
+    *clen = mlen + DRYGASCON256_TAG_SIZE;
+
+    /* Initialize the sponge state with the key and nonce; the last
+     * argument is non-zero when there is neither associated data nor
+     * message to process */
+    drysponge256_setup(&state, k, npub, adlen == 0 && mlen == 0);
+
+    /* Process the associated data; it is flagged as final when no
+     * message follows */
+    if (adlen > 0)
+        drygascon256_process_ad(&state, ad, adlen, mlen == 0);
+
+    /* Encrypt the plaintext to produce the ciphertext */
+    if (mlen > 0) {
+        /* Process all blocks except the last one.  The plaintext block is
+         * absorbed first, then the ciphertext block is produced from m and
+         * state.r.B, then drysponge256_g() is run */
+        while (mlen > DRYSPONGE256_RATE) {
+            drysponge256_f_absorb(&state, m, DRYSPONGE256_RATE);
+            lw_xor_block_2_src(c, m, state.r.B, DRYSPONGE256_RATE);
+            drysponge256_g(&state);
+            c += DRYSPONGE256_RATE;
+            m += DRYSPONGE256_RATE;
+            mlen -= DRYSPONGE256_RATE;
+        }
+
+        /* Process the last block with domain separation and padding;
+         * 1..DRYSPONGE256_RATE bytes remain at this point */
+        state.domain = DRYDOMAIN256_MESSAGE | DRYDOMAIN256_FINAL;
+        if (mlen < DRYSPONGE256_RATE)
+            state.domain |= DRYDOMAIN256_PADDED;
+        temp = (unsigned)mlen;
+        drysponge256_f_absorb(&state, m, temp);
+        lw_xor_block_2_src(c, m, state.r.B, temp);
+        drysponge256_g(&state);
+        c += temp;
+    }
+
+    /* Generate the authentication tag in two 16-byte halves (32 bytes in
+     * total), with one extra drysponge256_g() call between them */
+    memcpy(c, state.r.B, 16);
+    drysponge256_g(&state);
+    memcpy(c + 16, state.r.B, 16);
+    return 0;
+}
+
+int drygascon256_aead_decrypt
+    (unsigned char *m, unsigned long long *mlen,
+     unsigned char *nsec,
+     const unsigned char *c, unsigned long long clen,
+     const unsigned char *ad, unsigned long long adlen,
+     const unsigned char *npub,
+     const unsigned char *k)
+{
+    drysponge256_state_t state;
+    unsigned char *mtemp = m;
+    unsigned temp;
+    int result;
+    (void)nsec; /* part of the common AEAD signature only */
+
+    /* Validate the ciphertext length and set the return "mlen" value.
+     * *mlen is written before the tag is verified, so it is set even when
+     * authentication fails later on */
+    if (clen < DRYGASCON256_TAG_SIZE)
+        return -1;
+    *mlen = clen - DRYGASCON256_TAG_SIZE;
+
+    /* Initialize the sponge state with the key and nonce; from here on
+     * "clen" counts ciphertext bytes only, without the tag */
+    clen -= DRYGASCON256_TAG_SIZE;
+    drysponge256_setup(&state, k, npub, adlen == 0 && clen == 0);
+
+    /* Process the associated data; it is flagged as final when no
+     * ciphertext follows */
+    if (adlen > 0)
+        drygascon256_process_ad(&state, ad, adlen, clen == 0);
+
+    /* Decrypt the ciphertext to produce the plaintext */
+    if (clen > 0) {
+        /* Process all blocks except the last one.  Each plaintext block is
+         * produced from c and state.r.B first, then the recovered plaintext
+         * is absorbed and drysponge256_g() is run */
+        while (clen > DRYSPONGE256_RATE) {
+            lw_xor_block_2_src(m, c, state.r.B, DRYSPONGE256_RATE);
+            drysponge256_f_absorb(&state, m, DRYSPONGE256_RATE);
+            drysponge256_g(&state);
+            c += DRYSPONGE256_RATE;
+            m += DRYSPONGE256_RATE;
+            clen -= DRYSPONGE256_RATE;
+        }
+
+        /* Process the last block with domain separation and padding;
+         * 1..DRYSPONGE256_RATE bytes remain at this point */
+        state.domain = DRYDOMAIN256_MESSAGE | DRYDOMAIN256_FINAL;
+        if (clen < DRYSPONGE256_RATE)
+            state.domain |= DRYDOMAIN256_PADDED;
+        temp = (unsigned)clen;
+        lw_xor_block_2_src(m, c, state.r.B, temp);
+        drysponge256_f_absorb(&state, m, temp);
+        drysponge256_g(&state);
+        c += temp;
+    }
+
+    /* Check the authentication tag which is split into two pieces.
+     * The first 16 bytes are compared with a null plaintext pointer and a
+     * zero length, so nothing is wiped yet; "result" is 0 on a match and
+     * -1 on a mismatch.  After one more drysponge256_g() call the second
+     * 16 bytes are compared, with ~result (-1 for a match, 0 for a
+     * mismatch) passed as the precheck, so a failure in either half wipes
+     * the plaintext and returns -1 */
+    result = aead_check_tag(0, 0, state.r.B, c, 16);
+    drysponge256_g(&state);
+    return aead_check_tag_precheck
+        (mtemp, *mlen, state.r.B, c + 16, 16, ~result);
+}
+
+/**
+ * \brief Precomputed initialization vector for DryGASCON128-HASH.
+ *
+ * This is the CST_H value from the DryGASCON specification after it
+ * has been processed by the key setup function for DrySPONGE128.
+ */
+static unsigned char const drygascon128_hash_init[] = {
+    /* c */
+    0x24, 0x3f, 0x6a, 0x88, 0x85, 0xa3, 0x08, 0xd3,
+    0x13, 0x19, 0x8a, 0x2e, 0x03, 0x70, 0x73, 0x44,
+    0x24, 0x3f, 0x6a, 0x88, 0x85, 0xa3, 0x08, 0xd3,
+    0x13, 0x19, 0x8a, 0x2e, 0x03, 0x70, 0x73, 0x44,
+    0x24, 0x3f, 0x6a, 0x88, 0x85, 0xa3, 0x08, 0xd3,
+    /* x */
+    0xa4, 0x09, 0x38, 0x22, 0x29, 0x9f, 0x31, 0xd0,
+    0x08, 0x2e, 0xfa, 0x98, 0xec, 0x4e, 0x6c, 0x89
+};
+
+int drygascon128_hash
+    (unsigned char *out, const unsigned char *in, unsigned long long inlen)
+{
+    drysponge128_state_t state; /* local sponge state; its c and x parts are
+                                 * loaded from the precomputed table instead
+                                 * of running a key setup */
+    memcpy(state.c.B, drygascon128_hash_init, sizeof(state.c.B)); /* leading table bytes, marked "c" */
+    memcpy(state.x.B, drygascon128_hash_init + sizeof(state.c.B),
+           sizeof(state.x.B)); /* table bytes that follow, marked "x" */
+    state.domain = 0;
+    state.rounds = DRYSPONGE128_ROUNDS;
+    drygascon128_process_ad(&state, in, inlen, 1); /* the whole input is fed through the
+                                                    * associated-data path with finalize
+                                                    * set; inlen is not checked for zero */
+    memcpy(out, state.r.B, 16); /* output bytes 0..15 */
+    drysponge128_g(&state);
+    memcpy(out + 16, state.r.B, 16); /* output bytes 16..31: 32 bytes written in total */
+    return 0; /* no failure path */
+}
+
+/**
+ * \brief Precomputed initialization vector for DryGASCON256-HASH.
+ *
+ * This is the CST_H value from the DryGASCON specification after it
+ * has been processed by the key setup function for DrySPONGE256.
+ */
+static unsigned char const drygascon256_hash_init[] = {
+    /* c */
+    0x24, 0x3f, 0x6a, 0x88, 0x85, 0xa3, 0x08, 0xd3,
+    0x13, 0x19, 0x8a, 0x2e, 0x03, 0x70, 0x73, 0x44,
+    0xa4, 0x09, 0x38, 0x22, 0x29, 0x9f, 0x31, 0xd0,
+    0x08, 0x2e, 0xfa, 0x98, 0xec, 0x4e, 0x6c, 0x89,
+    0x24, 0x3f, 0x6a, 0x88, 0x85, 0xa3, 0x08, 0xd3,
+    0x13, 0x19, 0x8a, 0x2e, 0x03, 0x70, 0x73, 0x44,
+    0xa4, 0x09, 0x38, 0x22, 0x29, 0x9f, 0x31, 0xd0,
+    0x08, 0x2e, 0xfa, 0x98, 0xec, 0x4e, 0x6c, 0x89,
+    0x24, 0x3f, 0x6a, 0x88, 0x85, 0xa3, 0x08, 0xd3,
+    /* x */
+    0x45, 0x28, 0x21, 0xe6, 0x38, 0xd0, 0x13, 0x77,
+    0xbe, 0x54, 0x66, 0xcf, 0x34, 0xe9, 0x0c, 0x6c
+};
+
+int drygascon256_hash
+    (unsigned char *out, const unsigned char *in, unsigned long long inlen)
+{
+    drysponge256_state_t state; /* local sponge state; its c and x parts are
+                                 * loaded from the precomputed table instead
+                                 * of running a key setup */
+    memcpy(state.c.B, drygascon256_hash_init, sizeof(state.c.B)); /* leading table bytes, marked "c" */
+    memcpy(state.x.B, drygascon256_hash_init + sizeof(state.c.B),
+           sizeof(state.x.B)); /* table bytes that follow, marked "x" */
+    state.domain = 0;
+    state.rounds = DRYSPONGE256_ROUNDS;
+    drygascon256_process_ad(&state, in, inlen, 1); /* the whole input is fed through the
+                                                    * associated-data path with finalize
+                                                    * set; inlen is not checked for zero */
+    memcpy(out, state.r.B, 16); /* output bytes 0..15 */
+    drysponge256_g(&state);
+    memcpy(out + 16, state.r.B, 16); /* output bytes 16..31 */
+    drysponge256_g(&state);
+    memcpy(out + 32, state.r.B, 16); /* output bytes 32..47 */
+    drysponge256_g(&state);
+    memcpy(out + 48, state.r.B, 16); /* output bytes 48..63: 64 bytes written in total */
+    return 0; /* no failure path */
+}
--- a/drygascon/Implementations/crypto_aead/drygascon128/add_arm_cortex-m/drygascon.h
+++ b/drygascon/Implementations/crypto_aead/drygascon128/add_arm_cortex-m/drygascon.h
+/*
+ * Copyright (C) 2020 Southern Storm Software, Pty Ltd.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ */
+
+#ifndef LWCRYPTO_DRYGASCON_H
+#define LWCRYPTO_DRYGASCON_H
+
+#include "aead-common.h"
+
+/**
+ * \file drygascon.h
+ * \brief DryGASCON authenticated encryption algorithm.
+ *
+ * DryGASCON is a family of authenticated encryption algorithms based
+ * around a generalised version of the ASCON permutation.  DryGASCON
+ * is designed to provide some protection against power analysis.
+ *
+ * There are four algorithms in the DryGASCON family:
+ *
+ * \li DryGASCON128 is an authenticated encryption algorithm with a
+ * 128-bit key, a 128-bit nonce, and a 128-bit authentication tag.
+ * \li DryGASCON256 is an authenticated encryption algorithm with a
+ * 256-bit key, a 128-bit nonce, and a 128-256 bit authentication tag.
+ * \li DryGASCON128-HASH is a hash algorithm with a 256-bit output.
+ * \li DryGASCON256-HASH is a hash algorithm with a 512-bit output.
+ *
+ * DryGASCON128 and DryGASCON128-HASH are the primary members of the family.
+ *
+ * References: https://github.com/sebastien-riou/DryGASCON
+ */
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/**
+ * \brief Minimum Size of the key for DryGASCON128.
+ */
+#define DRYGASCON128_MINKEY_SIZE 16
+
+/**
+ * \brief Fast Size of the key for DryGASCON128.
+ */
+#define DRYGASCON128_FASTKEY_SIZE 32
+
+/**
+ * \brief Safe (and fast) Size of the key for DryGASCON128.
+ * Safe here means the size of the key helps prevent SPA during key loading
+ */
+#define DRYGASCON128_SAFEKEY_SIZE 56
+
+/**
+ * \brief Size of the key for DryGASCON128 (default to "fast" size).
+ */
+#define DRYGASCON128_KEY_SIZE DRYGASCON128_FASTKEY_SIZE
+
+/**
+ * \brief Size of the authentication tag for DryGASCON128.
+ */
+#define DRYGASCON128_TAG_SIZE 16
+
+/**
+ * \brief Size of the nonce for DryGASCON128.
+ */
+#define DRYGASCON128_NONCE_SIZE 16
+
+/**
+ * \brief Size of the hash output for DryGASCON128-HASH.
+ */
+#define DRYGASCON128_HASH_SIZE 32
+
+/**
+ * \brief Size of the key for DryGASCON256.
+ */
+#define DRYGASCON256_KEY_SIZE 32
+
+/**
+ * \brief Size of the authentication tag for DryGASCON256.
+ */
+#define DRYGASCON256_TAG_SIZE 32
+
+/**
+ * \brief Size of the nonce for DryGASCON256.
+ */
+#define DRYGASCON256_NONCE_SIZE 16
+
+/**
+ * \brief Size of the hash output for DryGASCON256-HASH.
+ */
+#define DRYGASCON256_HASH_SIZE 64
+
+/**
+ * \brief Meta-information block for the DryGASCON128 cipher with 32 bytes key.
+ */
+extern aead_cipher_t const drygascon128k32_cipher;
+
+/**
+ * \brief Meta-information block for the DryGASCON128 cipher with 56 bytes key.
+ */
+extern aead_cipher_t const drygascon128k56_cipher;
+
+/**
+ * \brief Meta-information block for the DryGASCON128 cipher with 16 bytes key.
+ */
+extern aead_cipher_t const drygascon128k16_cipher;
+
+/**
+ * \brief Meta-information block for the DryGASCON128 cipher (default to 32 bytes key).
+ */
+extern aead_cipher_t const drygascon128_cipher;
+
+/**
+ * \brief Meta-information block for the DryGASCON256 cipher.
+ */
+extern aead_cipher_t const drygascon256_cipher;
+
+/**
+ * \brief Meta-information block for DryGASCON128-HASH.
+ */
+extern aead_hash_algorithm_t const drygascon128_hash_algorithm;
+
+/**
+ * \brief Meta-information block for DryGASCON256-HASH.
+ */
+extern aead_hash_algorithm_t const drygascon256_hash_algorithm;
+
+/**
+ * \brief Encrypts and authenticates a packet with DryGASCON128 with 32 bytes key.
+ *
+ *	Use this key size if SPA attacks are not a concern in your use case.
+ *
+ * \param c Buffer to receive the output.
+ * \param clen On exit, set to the length of the output which includes
+ * the ciphertext and the 16 byte authentication tag.
+ * \param m Buffer that contains the plaintext message to encrypt.
+ * \param mlen Length of the plaintext message in bytes.
+ * \param ad Buffer that contains associated data to authenticate
+ * along with the packet but which does not need to be encrypted.
+ * \param adlen Length of the associated data in bytes.
+ * \param nsec Secret nonce - not used by this algorithm.
+ * \param npub Points to the public nonce for the packet which must
+ * be 16 bytes in length.
+ * \param k Points to the 32 bytes of the key to use to encrypt the packet.
+ *
+ * Note that the function blocks if the last 16 bytes of the key are "invalid".
+ * Those bytes are valid only if their four 32 bit words are all different
+ * from each other.
+ *
+ * \return 0 on success, or a negative value if there was an error in
+ * the parameters.
+ *
+ * \sa drygascon128k32_aead_decrypt()
+ */
+int drygascon128k32_aead_encrypt
+    (unsigned char *c, unsigned long long *clen,
+     const unsigned char *m, unsigned long long mlen,
+     const unsigned char *ad, unsigned long long adlen,
+     const unsigned char *nsec,
+     const unsigned char *npub,
+     const unsigned char *k);
+
+/**
+ * \brief Decrypts and authenticates a packet with DryGASCON128 with 32 bytes key.
+ *
+ *	Use this key size if SPA attacks are not a concern in your use case.
+ *
+ * \param m Buffer to receive the plaintext message on output.
+ * \param mlen Receives the length of the plaintext message on output.
+ * \param nsec Secret nonce - not used by this algorithm.
+ * \param c Buffer that contains the ciphertext and authentication
+ * tag to decrypt.
+ * \param clen Length of the input data in bytes, which includes the
+ * ciphertext and the 16 byte authentication tag.
+ * \param ad Buffer that contains associated data to authenticate
+ * along with the packet but which does not need to be encrypted.
+ * \param adlen Length of the associated data in bytes.
+ * \param npub Points to the public nonce for the packet which must
+ * be 16 bytes in length.
+ * \param k Points to the 32 bytes of the key to use to decrypt the packet.
+ *
+ * Note that the function blocks if the last 16 bytes of the key are "invalid".
+ * Those bytes are valid only if their four 32 bit words are all different
+ * from each other.
+ *
+ * \return 0 on success, -1 if the authentication tag was incorrect,
+ * or some other negative number if there was an error in the parameters.
+ *
+ * \sa drygascon128k32_aead_encrypt()
+ */
+int drygascon128k32_aead_decrypt
+    (unsigned char *m, unsigned long long *mlen,
+     unsigned char *nsec,
+     const unsigned char *c, unsigned long long clen,
+     const unsigned char *ad, unsigned long long adlen,
+     const unsigned char *npub,
+     const unsigned char *k);
+
+/**
+ * \brief Encrypts and authenticates a packet with DryGASCON128 with 56 bytes key.
+ *
+ *	Use this key size if you want to prevent SPA attacks
+ *
+ * \param c Buffer to receive the output.
+ * \param clen On exit, set to the length of the output which includes
+ * the ciphertext and the 16 byte authentication tag.
+ * \param m Buffer that contains the plaintext message to encrypt.
+ * \param mlen Length of the plaintext message in bytes.
+ * \param ad Buffer that contains associated data to authenticate
+ * along with the packet but which does not need to be encrypted.
+ * \param adlen Length of the associated data in bytes.
+ * \param nsec Secret nonce - not used by this algorithm.
+ * \param npub Points to the public nonce for the packet which must
+ * be 16 bytes in length.
+ * \param k Points to the 56 bytes of the key to use to encrypt the packet.
+ *
+ * Note that the function blocks if the last 16 bytes of the key are "invalid".
+ * Those bytes are valid only if their four 32 bit words are all different
+ * from each other.
+ *
+ * \return 0 on success, or a negative value if there was an error in
+ * the parameters.
+ *
+ * \sa drygascon128k56_aead_decrypt()
+ */
+int drygascon128k56_aead_encrypt
+    (unsigned char *c, unsigned long long *clen,
+     const unsigned char *m, unsigned long long mlen,
+     const unsigned char *ad, unsigned long long adlen,
+     const unsigned char *nsec,
+     const unsigned char *npub,
+     const unsigned char *k);
+
+/**
+ * \brief Decrypts and authenticates a packet with DryGASCON128 with 56 bytes key.
+ *
+ *	Use this key size if you want to prevent SPA attacks
+ *
+ * \param m Buffer to receive the plaintext message on output.
+ * \param mlen Receives the length of the plaintext message on output.
+ * \param nsec Secret nonce - not used by this algorithm.
+ * \param c Buffer that contains the ciphertext and authentication
+ * tag to decrypt.
+ * \param clen Length of the input data in bytes, which includes the
+ * ciphertext and the 16 byte authentication tag.
+ * \param ad Buffer that contains associated data to authenticate
+ * along with the packet but which does not need to be encrypted.
+ * \param adlen Length of the associated data in bytes.
+ * \param npub Points to the public nonce for the packet which must
+ * be 16 bytes in length.
+ * \param k Points to the 56 bytes of the key to use to decrypt the packet.
+ *
+ * Note that the function blocks if the last 16 bytes of the key are "invalid".
+ * Those bytes are valid only if their four 32 bit words are all different
+ * from each other.
+ *
+ * \return 0 on success, -1 if the authentication tag was incorrect,
+ * or some other negative number if there was an error in the parameters.
+ *
+ * \sa drygascon128k56_aead_encrypt()
+ */
+int drygascon128k56_aead_decrypt
+    (unsigned char *m, unsigned long long *mlen,
+     unsigned char *nsec,
+     const unsigned char *c, unsigned long long clen,
+     const unsigned char *ad, unsigned long long adlen,
+     const unsigned char *npub,
+     const unsigned char *k);
+
+/**
+ * \brief Encrypts and authenticates a packet with DryGASCON128 with 16 bytes key.
+ *
+ *	Use this key size only if you really cannot use the 32 bytes key.
+ *
+ * \param c Buffer to receive the output.
+ * \param clen On exit, set to the length of the output which includes
+ * the ciphertext and the 16 byte authentication tag.
+ * \param m Buffer that contains the plaintext message to encrypt.
+ * \param mlen Length of the plaintext message in bytes.
+ * \param ad Buffer that contains associated data to authenticate
+ * along with the packet but which does not need to be encrypted.
+ * \param adlen Length of the associated data in bytes.
+ * \param nsec Secret nonce - not used by this algorithm.
+ * \param npub Points to the public nonce for the packet which must
+ * be 16 bytes in length.
+ * \param k Points to the 16 bytes of the key to use to encrypt the packet.
+ *
+ * \return 0 on success, or a negative value if there was an error in
+ * the parameters.
+ *
+ * \sa drygascon128k16_aead_decrypt()
+ */
+int drygascon128k16_aead_encrypt
+    (unsigned char *c, unsigned long long *clen,
+     const unsigned char *m, unsigned long long mlen,
+     const unsigned char *ad, unsigned long long adlen,
+     const unsigned char *nsec,
+     const unsigned char *npub,
+     const unsigned char *k);
+
+/**
+ * \brief Decrypts and authenticates a packet with DryGASCON128 with 16 bytes key.
+ *
+ *	Use this key size only if you really cannot use the 32 bytes key.
+ *
+ * \param m Buffer to receive the plaintext message on output.
+ * \param mlen Receives the length of the plaintext message on output.
+ * \param nsec Secret nonce - not used by this algorithm.
+ * \param c Buffer that contains the ciphertext and authentication
+ * tag to decrypt.
+ * \param clen Length of the input data in bytes, which includes the
+ * ciphertext and the 16 byte authentication tag.
+ * \param ad Buffer that contains associated data to authenticate
+ * along with the packet but which does not need to be encrypted.
+ * \param adlen Length of the associated data in bytes.
+ * \param npub Points to the public nonce for the packet which must
+ * be 16 bytes in length.
+ * \param k Points to the 16 bytes of the key to use to decrypt the packet.
+ *
+ * \return 0 on success, -1 if the authentication tag was incorrect,
+ * or some other negative number if there was an error in the parameters.
+ *
+ * \sa drygascon128k16_aead_encrypt()
+ */
+int drygascon128k16_aead_decrypt
+    (unsigned char *m, unsigned long long *mlen,
+     unsigned char *nsec,
+     const unsigned char *c, unsigned long long clen,
+     const unsigned char *ad, unsigned long long adlen,
+     const unsigned char *npub,
+     const unsigned char *k);
+
+/**
+ * \brief Encrypts and authenticates a packet with DryGASCON256.
+ *
+ * \param c Buffer to receive the output.
+ * \param clen On exit, set to the length of the output which includes
+ * the ciphertext and the DRYGASCON256_TAG_SIZE (32) byte authentication tag.
+ * \param m Buffer that contains the plaintext message to encrypt.
+ * \param mlen Length of the plaintext message in bytes.
+ * \param ad Buffer that contains associated data to authenticate
+ * along with the packet but which does not need to be encrypted.
+ * \param adlen Length of the associated data in bytes.
+ * \param nsec Secret nonce - not used by this algorithm.
+ * \param npub Points to the public nonce for the packet which must
+ * be 16 bytes in length.
+ * \param k Points to the DRYGASCON256_KEY_SIZE (32) bytes of the key to use
+ * to encrypt the packet.
+ *
+ * \return 0 on success, or a negative value if there was an error in
+ * the parameters.
+ *
+ * \sa drygascon256_aead_decrypt()
+ */
+int drygascon256_aead_encrypt
+    (unsigned char *c, unsigned long long *clen,
+     const unsigned char *m, unsigned long long mlen,
+     const unsigned char *ad, unsigned long long adlen,
+     const unsigned char *nsec,
+     const unsigned char *npub,
+     const unsigned char *k);
+
+/**
+ * \brief Decrypts and authenticates a packet with DryGASCON256.
+ *
+ * \param m Buffer to receive the plaintext message on output.
+ * \param mlen Receives the length of the plaintext message on output.
+ * \param nsec Secret nonce - not used by this algorithm.
+ * \param c Buffer that contains the ciphertext and authentication
+ * tag to decrypt.
+ * \param clen Length of the input data in bytes, which includes the
+ * ciphertext and the DRYGASCON256_TAG_SIZE (32) byte authentication tag.
+ * \param ad Buffer that contains associated data to authenticate
+ * along with the packet but which does not need to be encrypted.
+ * \param adlen Length of the associated data in bytes.
+ * \param npub Points to the public nonce for the packet which must
+ * be 16 bytes in length.
+ * \param k Points to the DRYGASCON256_KEY_SIZE (32) bytes of the key to use
+ * to decrypt the packet.
+ *
+ * \return 0 on success, -1 if the authentication tag was incorrect,
+ * or some other negative number if there was an error in the parameters.
+ *
+ * \sa drygascon256_aead_encrypt()
+ */
+int drygascon256_aead_decrypt
+    (unsigned char *m, unsigned long long *mlen,
+     unsigned char *nsec,
+     const unsigned char *c, unsigned long long clen,
+     const unsigned char *ad, unsigned long long adlen,
+     const unsigned char *npub,
+     const unsigned char *k);
+
+/**
+ * \brief Hashes a block of input data with DRYGASCON128.
+ *
+ * \param out Buffer to receive the hash output which must be at least
+ * DRYGASCON128_HASH_SIZE bytes in length.
+ * \param in Points to the input data to be hashed.
+ * \param inlen Length of the input data in bytes.
+ *
+ * \return Returns zero on success or -1 if there was an error in the
+ * parameters.
+ */
+int drygascon128_hash
+    (unsigned char *out, const unsigned char *in, unsigned long long inlen);
+
+/**
+ * \brief Hashes a block of input data with DRYGASCON256.
+ *
+ * \param out Buffer to receive the hash output which must be at least
+ * DRYGASCON256_HASH_SIZE bytes in length.
+ * \param in Points to the input data to be hashed.
+ * \param inlen Length of the input data in bytes.
+ *
+ * \return Returns zero on success or -1 if there was an error in the
+ * parameters.
+ */
+int drygascon256_hash
+    (unsigned char *out, const unsigned char *in, unsigned long long inlen);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
--- a/drygascon/Implementations/crypto_aead/drygascon128/add_arm_cortex-m/drygascon128_arm-v6m.S
+++ b/drygascon/Implementations/crypto_aead/drygascon128/add_arm_cortex-m/drygascon128_arm-v6m.S
+/**
+DryGascon128 'v6m implementation'
+Sebastien Riou, May 27th 2020
+
+Implementation optimized for ARM-Cortex-M0 (Size and Speed)
+*/
+
+#if defined(__DRYGASCON_ARM_SELECTOR_H__)
+.cpu cortex-m0
+.syntax unified
+.code	16
+.thumb_func
+
+.align	1
+.global	drygascon128_g_v6m
+.global	drygascon128_f_v6m
+
+    // Byte offsets into the state structure whose address is passed in r0.
+    // c0..c4 are 8 bytes each starting at offset 0, r is 16 bytes starting
+    // at offset 48 and x starts at offset 64.
+    .equ C0, 0
+    .equ C1, C0+8
+    .equ C2, C0+16
+    .equ C3, C0+24
+    .equ C4, C0+32
+    .equ R0, 48
+    .equ R1, R0+8
+    .equ X0, 64
+    .equ X1, X0+8
+
+    // First 32 bit word (L) of each 8 byte entry
+    .equ X0L, X0
+    .equ X1L, X1
+    .equ C0L, C0
+    .equ C1L, C1
+    .equ C2L, C2
+    .equ C3L, C3
+    .equ C4L, C4
+    .equ R0L, R0
+    .equ R1L, R1
+
+    // Second 32 bit word (H) of each 8 byte entry
+    .equ X0H, X0+4
+    .equ X1H, X1+4
+    .equ C0H, C0+4
+    .equ C1H, C1+4
+    .equ C2H, C2+4
+    .equ C3H, C3+4
+    .equ C4H, C4+4
+    .equ R0H, R0+4
+    .equ R1H, R1+4
+
+    // r viewed as four consecutive 32 bit words
+    .equ R32_0, R0L
+    .equ R32_1, R0H
+    .equ R32_2, R1L
+    .equ R32_3, R1H
+
+
+// drygascon128_g_v6m(state, rounds)
+//
+// Clears the four 32 bit words of r, then runs the main loop below once per
+// round. Each iteration XORs one byte of round_cst into the low word of c2,
+// applies the substitution layer to the low and then the high words of
+// c0..c4, applies the linear diffusion layer, and XORs the new c0..c3 words
+// into r.
+// drygascon128_f_v6m branches into drygascon128_g_v6m_main_loop, so the
+// stack frame and register usage at that label are shared by both functions.
+.type	drygascon128_g_v6m, %function
+drygascon128_g_v6m:
+	//r0: state: c,r,x
+	//r1: rounds
+	push	{r4, r5, r6, r7, lr}
+	//stack vars (after the push of r0,r5,r6 below):
+    // 8 round (current round index, counts down to 0)
+	// 4 base address used for the round constant lookup
+	// 0 state address
+
+    //r=0
+    movs    r5,#0
+    str     r5,[r0,#R32_0]
+    str     r5,[r0,#R32_1]
+    str     r5,[r0,#R32_2]
+    str     r5,[r0,#R32_3]
+
+    //round=r6=rounds-1;
+    subs    r6,r1,#1
+    //base = round_cst+12-rounds
+    adr		r5, round_cst
+    adds    r5,r5,#12
+    subs    r5,r5,r1
+
+    push	{r0,r5,r6}
+
+	ldr		r4,[r0,#C4L]
+	ldr		r3,[r0,#C3L]
+	ldr		r2,[r0,#C2L]
+	ldr		r1,[r0,#C1L]
+	ldr		r0,[r0,#C0L]
+
+    //loop entry
+	//assume r1>0 at entry
+drygascon128_g_v6m_main_loop:
+    //r0~r4: lower half of each words of the state
+    //r5: base for round constants
+    //r6: round, counting from rounds-1 to 0
+
+    //r6 = ((0xf - r6) << 4) | r6;
+    //done as a byte lookup: base[round], so the first iteration reads the
+    //last entry of round_cst
+    ldrb 	r6,[r5,r6]
+        // addition of round constant
+    //r2 ^= r6;
+    eors	r2,r2,r6
+
+    // substitution layer, lower half
+	eors	r0,r0,r4
+    eors	r4,r4,r3
+    eors	r2,r2,r1
+
+	mvns	r5,r0
+    mvns	r6,r3
+    mvns	r7,r4
+	ands	r5,r5,r1
+    ands	r6,r6,r4
+    eors	r4,r4,r5
+
+    ands	r7,r7,r0
+    mvns	r5,r2
+    ands	r5,r5,r3
+    eors	r3,r3,r7
+
+    mvns	r7,r1
+    ands	r7,r7,r2
+    eors	r2,r2,r6
+
+    eors	r3,r3,r2
+    mvns	r2,r2
+
+    eors	r0,r0,r7
+    eors	r1,r1,r5
+	eors	r1,r1,r0
+    eors	r0,r0,r4
+
+    // spill the lower halves to the state and load the upper halves,
+    // since r0~r7 cannot hold both at once
+    ldr		r7,[sp,#0]
+    str		r4,[r7,#C4L]
+    str		r3,[r7,#C3L]
+    str		r2,[r7,#C2L]
+    str		r1,[r7,#C1L]
+    str		r0,[r7,#C0L]
+
+    ldr		r4,[r7,#C4H]
+    ldr		r3,[r7,#C3H]
+    ldr		r2,[r7,#C2H]
+    ldr		r1,[r7,#C1H]
+    ldr		r0,[r7,#C0H]
+
+    // substitution layer, upper half
+	eors	r0,r0,r4
+    eors	r4,r4,r3
+    eors	r2,r2,r1
+
+	mvns	r5,r0
+    mvns	r6,r3
+    mvns	r7,r4
+	ands	r5,r5,r1
+    ands	r6,r6,r4
+    eors	r4,r4,r5
+
+    ands	r7,r7,r0
+    mvns	r5,r2
+    ands	r5,r5,r3
+    eors	r3,r3,r7
+
+    mvns	r7,r1
+    ands	r7,r7,r2
+    eors	r2,r2,r6
+
+    eors	r3,r3,r2
+    mvns	r2,r2
+
+    eors	r0,r0,r7
+    eors	r1,r1,r5
+	eors	r1,r1,r0
+    eors	r0,r0,r4
+
+    // linear diffusion layer
+    // r0~r4 now hold the upper halves, the lower halves are read back
+    // from the state
+    ldr		r7,[sp,#0]
+
+    //c4 ^= gascon_rotr64_interleaved(c4, 40) ^ gascon_rotr64_interleaved(c4, 7);
+    //c4 high part
+    //r7 is used as scratch for the rotation amounts here, so the state
+    //address is reloaded from the stack before each store
+    movs    r6,r4
+    movs    r5,#(20)
+    rors    r4,r4,r5
+    eors    r6,r6,r4
+    ldr     r5,[r7,#C4L]
+    movs    r7,#(4)
+    rors    r5,r5,r7
+    eors    r6,r6,r5
+    ldr		r7,[sp,#0]
+    str     r6,[r7,#C4H]
+    //c4 low part
+    movs    r7,#(32-4)
+    rors    r5,r5,r7
+    movs    r6,r5
+    movs    r7,#((32-20+3)%32)
+    rors    r4,r4,r7
+    eors    r4,r4,r6
+    movs    r7,#(20)
+    rors    r5,r5,r7
+    eors    r4,r4,r5
+    ldr		r7,[sp,#0]
+    str     r4,[r7,#C4L]
+
+    //from here on r4 is free to be used as scratch (c4 is already stored)
+
+    //c0 ^= gascon_rotr64_interleaved(c0, 28) ^ gascon_rotr64_interleaved(c0, 19);
+    //c0 high part
+    movs    r6,r0
+    movs    r5,#(14)
+    rors    r0,r0,r5
+    eors    r6,r6,r0
+    ldr     r5,[r7,#C0L]
+    movs    r4,#(10)
+    rors    r5,r5,r4
+    eors    r6,r6,r5
+    str     r6,[r7,#C0H]
+    //accumulate the new c0 high part into r word 1
+    ldr    r4,[r7,#R32_1]
+    eors    r4,r4,r6
+    str     r4,[r7,#R32_1]
+    //c0 low part
+    movs    r4,#(32-10)
+    rors    r5,r5,r4
+    movs    r6,r5
+    movs    r4,#((32-14+9)%32)
+    rors    r0,r0,r4
+    eors    r0,r0,r6
+    movs    r4,#(14)
+    rors    r5,r5,r4
+    eors    r0,r0,r5
+    //accumulate the new c0 low part into r word 0
+    ldr    r4,[r7,#R32_0]
+    eors    r4,r4,r0
+    str    r4,[r7,#R32_0]
+
+    //c1 ^= gascon_rotr64_interleaved(c1, 38) ^ gascon_rotr64_interleaved(c1, 61);
+    //c1 high part
+    movs    r6,r1
+    movs    r5,#(19)
+    rors    r1,r1,r5
+    eors    r6,r6,r1
+    ldr     r5,[r7,#C1L]
+    movs    r4,#(31)
+    rors    r5,r5,r4
+    eors    r6,r6,r5
+    str     r6,[r7,#C1H]
+    //accumulate the new c1 high part into r word 3
+    ldr    r4,[r7,#R32_3]
+    eors    r4,r4,r6
+    str     r4,[r7,#R32_3]
+    //c1 low part
+    movs    r4,#(32-31)
+    rors    r5,r5,r4
+    movs    r6,r5
+    movs    r4,#((32-19+30)%32)
+    rors    r1,r1,r4
+    eors    r1,r1,r6
+    movs    r4,#(19)
+    rors    r5,r5,r4
+    eors    r1,r1,r5
+    //accumulate the new c1 low part into r word 2
+    ldr    r4,[r7,#R32_2]
+    eors    r4,r4,r1
+    str    r4,[r7,#R32_2]
+
+    //c2 ^= gascon_rotr64_interleaved(c2, 6) ^ gascon_rotr64_interleaved(c2, 1);
+    //c2 high part
+    movs    r6,r2
+    movs    r5,#(3)
+    rors    r2,r2,r5
+    eors    r6,r6,r2
+    ldr     r5,[r7,#C2L]
+    movs    r4,#(1)
+    rors    r5,r5,r4
+    eors    r6,r6,r5
+    str     r6,[r7,#C2H]
+    //accumulate the new c2 high part into r word 0
+    ldr    r4,[r7,#R32_0]
+    eors    r4,r4,r6
+    str     r4,[r7,#R32_0]
+    //c2 low part
+    movs    r4,#(32-1)
+    rors    r5,r5,r4
+    movs    r6,r5
+    movs    r4,#((32-3+0)%32)
+    rors    r2,r2,r4
+    eors    r2,r2,r6
+    movs    r4,#(3)
+    rors    r5,r5,r4
+    eors    r2,r2,r5
+    //accumulate the new c2 low part into r word 3
+    ldr    r4,[r7,#R32_3]
+    eors    r4,r4,r2
+    str    r4,[r7,#R32_3]
+
+    //c3 ^= gascon_rotr64_interleaved(c3, 10) ^ gascon_rotr64_interleaved(c3, 17);
+    //c3 high part
+    movs    r6,r3
+    movs    r5,#(5)
+    rors    r3,r3,r5
+    eors    r6,r6,r3
+    ldr     r5,[r7,#C3L]
+    movs    r4,#(9)
+    rors    r5,r5,r4
+    eors    r6,r6,r5
+    str     r6,[r7,#C3H]
+    //accumulate the new c3 high part into r word 2
+    ldr    r4,[r7,#R32_2]
+    eors    r4,r4,r6
+    str     r4,[r7,#R32_2]
+    //c3 low part
+    movs    r4,#(32-9)
+    rors    r5,r5,r4
+    movs    r6,r5
+    movs    r4,#((32-5+8)%32)
+    rors    r3,r3,r4
+    eors    r3,r3,r6
+    movs    r4,#(5)
+    rors    r5,r5,r4
+    eors    r3,r3,r5
+    //accumulate the new c3 low part into r word 1
+    ldr    r4,[r7,#R32_1]
+    eors    r4,r4,r3
+    str    r4,[r7,#R32_1]
+
+    //restore the loop invariants: r4 = c4 low (it was used as scratch),
+    //r5 = round constant base
+    ldr     r4,[r7,#C4L]
+    ldr     r5,[sp,#4]
+
+    //decrement the round counter, leave once it goes negative
+    ldr		r6,[sp,#8]
+    subs    r6,#1
+    bmi     drygascon128_g_v6m_exit
+
+    str     r6,[sp,#8]
+	b    	drygascon128_g_v6m_main_loop
+drygascon128_g_v6m_exit:
+
+    //c4 low and all the high parts were stored inside the loop,
+    //only the low parts of c0..c3 are still in registers
+    str		r3,[r7,#C3L]
+	str		r2,[r7,#C2L]
+	str		r1,[r7,#C1L]
+	str		r0,[r7,#C0L]
+
+	//drop the three stack vars and return
+	add		sp,sp,#12
+	pop 	{r4, r5, r6, r7, pc}
+.size	drygascon128_g_v6m, .-drygascon128_g_v6m
+
+.align 2
+// drygascon128_f_v6m(state, input, ds, rounds)
+//
+// Clears r, splits the 16 input bytes and ds into fourteen 10 bit chunks
+// stored as halfwords in a stack buffer, then for each chunk (buffer offset
+// 26 first, offset 0 last) XORs words selected from x into the low words of
+// c0..c4. A core round with round constant 0xf0 is run between consecutive
+// chunks. After the last chunk the code rebuilds the stack frame expected by
+// drygascon128_g_v6m_main_loop and branches there, so the function returns
+// through the exit path of drygascon128_g_v6m.
+.type	drygascon128_f_v6m, %function
+drygascon128_f_v6m:
+    //r0:state c r x
+    //r1:input -> shall be 32 bit aligned
+    //r2:ds
+    //r3:rounds
+    push	{r4, r5, r6, r7, lr}
+
+    //stack frame:
+    //0 ~ 28-1: buf (fourteen halfwords, one 10 bit chunk each)
+    //28 :pointer on c
+    //32 : rounds for g (replaced by the round constant base at exit)
+    //36 :mix round / g round (starts at 26 = buf offset of the first chunk)
+
+    movs    r4,#26
+    push    {r0,r3,r4}
+    sub     sp,sp,#28
+
+    //load 10 bit mask in r4 = 0x3FF
+    movs    r4,#0xFF
+    lsls    r4,r4,#2
+    adds    r4,r4,#3
+
+    movs    r7,#0
+    //r=0
+    str     r7,[r0,#R32_0]
+    str     r7,[r0,#R32_1]
+    str     r7,[r0,#R32_2]
+    str     r7,[r0,#R32_3]
+
+    //r7 = sp
+    add     r7,r7,sp
+
+    //input word 0: three full chunks, 2 bits left over
+    ldr     r3,[r1]
+    movs    r5,r4
+    ands    r5,r5,r3
+    strh    r5,[r7,#0+26]
+
+    lsrs    r3,r3,#10
+    movs    r5,r4
+    ands    r5,r5,r3
+    strh    r5,[r7,#0+24]
+
+    lsrs    r3,r3,#10
+    movs    r5,r4
+    ands    r5,r5,r3
+    strh    r5,[r7,#0+22]
+
+    //chunk made of the 2 left over bits and the low 8 bits of word 1
+    lsrs    r5,r3,#10
+    ldr     r3,[r1,#4]
+    lsls    r6,r3,#2
+    lsrs    r3,r3,#8
+    orrs    r6,r6,r5
+    movs    r5,r4
+    ands    r5,r5,r6
+    strh    r5,[r7,#0+20]
+
+    movs    r5,r4
+    ands    r5,r5,r3
+    strh    r5,[r7,#0+18]
+
+    lsrs    r3,r3,#10
+    movs    r5,r4
+    ands    r5,r5,r3
+    strh    r5,[r7,#0+16]
+
+    //chunk made of the 4 left over bits and the low 6 bits of word 2
+    lsrs    r5,r3,#10
+    ldr     r3,[r1,#8]
+    lsls    r6,r3,#4
+    lsrs    r3,r3,#6
+    orrs    r6,r6,r5
+    movs    r5,r4
+    ands    r5,r5,r6
+    strh    r5,[r7,#0+14]
+
+    movs    r5,r4
+    ands    r5,r5,r3
+    strh    r5,[r7,#0+12]
+
+    lsrs    r3,r3,#10
+    movs    r5,r4
+    ands    r5,r5,r3
+    strh    r5,[r7,#0+10]
+
+    //chunk made of the 6 left over bits and the low 4 bits of word 3
+    lsrs    r5,r3,#10
+    ldr     r3,[r1,#12]
+    lsls    r6,r3,#6
+    lsrs    r3,r3,#4
+    orrs    r6,r6,r5
+    movs    r5,r4
+    ands    r5,r5,r6
+    strh    r5,[r7,#0+8]
+
+    movs    r5,r4
+    ands    r5,r5,r3
+    strh    r5,[r7,#0+6]
+
+    lsrs    r3,r3,#10
+    movs    r5,r4
+    ands    r5,r5,r3
+    strh    r5,[r7,#0+4]
+
+    //chunk made of the 8 left over bits and the low 2 bits of ds
+    lsrs    r5,r3,#10
+    lsls    r6,r2,#8
+    lsrs    r3,r2,#2
+    orrs    r6,r6,r5
+    movs    r5,r4
+    ands    r5,r5,r6
+    strh    r5,[r7,#0+2]
+
+    //last chunk: the next 10 bits of ds
+    movs    r5,r4
+    ands    r5,r5,r3
+    strh    r5,[r7,#0+0]
+
+    movs    r7,#26
+
+drygascon128_f_v6m_mix128_main_loop:
+    //r7: buf offset of the chunk to mix
+    //r6 = chunk
+    movs    r6,#0
+    add     r6,r6,sp
+    ldrh 	r6,[r6,r7]
+
+    //r7 = state address, r5 = address of x, r4 = mask selecting a word
+    //offset (0, 4, 8 or 12) inside x
+    ldr     r5,[sp,#28]
+    movs    r7,r5
+    adds    r5,r5,#X0
+    movs    r4,#0xc
+
+    //c0 low ^= x word selected by chunk bits 0-1
+    lsls    r0,r6,#2
+    ands    r0,r0,r4
+    ldr     r1,[r5,r0]
+    ldr     r0,[r7,#0*8]
+    eors    r0,r0,r1
+
+    //c1 low ^= x word selected by chunk bits 2-3
+    lsrs    r1,r6,#0
+    ands    r1,r1,r4
+    ldr     r2,[r5,r1]
+    ldr     r1,[r7,#1*8]
+    eors    r1,r1,r2
+
+    //c2 low ^= x word selected by chunk bits 4-5
+    lsrs    r2,r6,#2
+    ands    r2,r2,r4
+    ldr     r3,[r5,r2]
+    ldr     r2,[r7,#2*8]
+    eors    r2,r2,r3
+
+    //c3 low ^= x word selected by chunk bits 6-7
+    lsrs    r3,r6,#4
+    ands    r3,r3,r4
+    ldr     r4,[r5,r3]
+    ldr     r3,[r7,#3*8]
+    eors    r3,r3,r4
+
+    //c4 low ^= x word selected by chunk bits 8-9
+    lsrs    r4,r6,#6+2
+    lsls    r4,r4,#2
+    ldr     r6,[r5,r4]
+    ldr     r4,[r7,#4*8]
+    eors    r4,r4,r6
+
+    //step to the next chunk offset; once it goes negative the last chunk
+    //has been mixed and no core round follows it
+    ldr		r6,[sp,#36]
+    subs    r6,#2
+    bpl     drygascon128_f_v6m_mix128_coreround
+    b		drygascon128_f_v6m_mix128_exit
+drygascon128_f_v6m_mix128_coreround:
+    str     r6,[sp,#36]
+
+    movs    r6,#0xf0
+        // addition of round constant
+    //r2 ^= r6;
+    eors	r2,r2,r6
+
+    // substitution layer, lower half
+	eors	r0,r0,r4
+    eors	r4,r4,r3
+    eors	r2,r2,r1
+
+	mvns	r5,r0
+    mvns	r6,r3
+    mvns	r7,r4
+	ands	r5,r5,r1
+    ands	r6,r6,r4
+    eors	r4,r4,r5
+
+    ands	r7,r7,r0
+    mvns	r5,r2
+    ands	r5,r5,r3
+    eors	r3,r3,r7
+
+    mvns	r7,r1
+    ands	r7,r7,r2
+    eors	r2,r2,r6
+
+    eors	r3,r3,r2
+    mvns	r2,r2
+
+    eors	r0,r0,r7
+    eors	r1,r1,r5
+	eors	r1,r1,r0
+    eors	r0,r0,r4
+
+    // spill the lower halves to the state and load the upper halves
+    ldr		r7,[sp,#28]
+    str		r4,[r7,#C4L]
+    str		r3,[r7,#C3L]
+    str		r2,[r7,#C2L]
+    str		r1,[r7,#C1L]
+    str		r0,[r7,#C0L]
+
+    ldr		r4,[r7,#C4H]
+    ldr		r3,[r7,#C3H]
+    ldr		r2,[r7,#C2H]
+    ldr		r1,[r7,#C1H]
+    ldr		r0,[r7,#C0H]
+
+    // substitution layer, upper half
+	eors	r0,r0,r4
+    eors	r4,r4,r3
+    eors	r2,r2,r1
+
+	mvns	r5,r0
+    mvns	r6,r3
+    mvns	r7,r4
+	ands	r5,r5,r1
+    ands	r6,r6,r4
+    eors	r4,r4,r5
+
+    ands	r7,r7,r0
+    mvns	r5,r2
+    ands	r5,r5,r3
+    eors	r3,r3,r7
+
+    mvns	r7,r1
+    ands	r7,r7,r2
+    eors	r2,r2,r6
+
+    eors	r3,r3,r2
+    mvns	r2,r2
+
+    eors	r0,r0,r7
+    eors	r1,r1,r5
+	eors	r1,r1,r0
+    eors	r0,r0,r4
+
+    // linear diffusion layer
+    // same sequence as in drygascon128_g_v6m, but nothing is accumulated
+    // into r here
+    ldr		r7,[sp,#28]
+
+    //c4 ^= gascon_rotr64_interleaved(c4, 40) ^ gascon_rotr64_interleaved(c4, 7);
+    //c4 high part
+    movs    r6,r4
+    movs    r5,#(20)
+    rors    r4,r4,r5
+    eors    r6,r6,r4
+    ldr     r5,[r7,#C4L]
+    movs    r7,#(4)
+    rors    r5,r5,r7
+    eors    r6,r6,r5
+    ldr		r7,[sp,#28]
+    str     r6,[r7,#C4H]
+    //c4 low part
+    movs    r7,#(32-4)
+    rors    r5,r5,r7
+    movs    r6,r5
+    movs    r7,#((32-20+3)%32)
+    rors    r4,r4,r7
+    eors    r4,r4,r6
+    movs    r7,#(20)
+    rors    r5,r5,r7
+    eors    r4,r4,r5
+    ldr		r7,[sp,#28]
+    str     r4,[r7,#C4L]
+
+    //c0 ^= gascon_rotr64_interleaved(c0, 28) ^ gascon_rotr64_interleaved(c0, 19);
+    //c0 high part
+    movs    r6,r0
+    movs    r5,#(14)
+    rors    r0,r0,r5
+    eors    r6,r6,r0
+    ldr     r5,[r7,#C0L]
+    movs    r4,#(10)
+    rors    r5,r5,r4
+    eors    r6,r6,r5
+    str     r6,[r7,#C0H]
+    //c0 low part
+    movs    r4,#(32-10)
+    rors    r5,r5,r4
+    movs    r6,r5
+    movs    r4,#((32-14+9)%32)
+    rors    r0,r0,r4
+    eors    r0,r0,r6
+    movs    r4,#(14)
+    rors    r5,r5,r4
+    eors    r0,r0,r5
+
+    //c1 ^= gascon_rotr64_interleaved(c1, 38) ^ gascon_rotr64_interleaved(c1, 61);
+    //c1 high part
+    movs    r6,r1
+    movs    r5,#(19)
+    rors    r1,r1,r5
+    eors    r6,r6,r1
+    ldr     r5,[r7,#C1L]
+    movs    r4,#(31)
+    rors    r5,r5,r4
+    eors    r6,r6,r5
+    str     r6,[r7,#C1H]
+    //c1 low part
+    movs    r4,#(32-31)
+    rors    r5,r5,r4
+    movs    r6,r5
+    movs    r4,#((32-19+30)%32)
+    rors    r1,r1,r4
+    eors    r1,r1,r6
+    movs    r4,#(19)
+    rors    r5,r5,r4
+    eors    r1,r1,r5
+
+    //c2 ^= gascon_rotr64_interleaved(c2, 6) ^ gascon_rotr64_interleaved(c2, 1);
+    //c2 high part
+    movs    r6,r2
+    movs    r5,#(3)
+    rors    r2,r2,r5
+    eors    r6,r6,r2
+    ldr     r5,[r7,#C2L]
+    movs    r4,#(1)
+    rors    r5,r5,r4
+    eors    r6,r6,r5
+    str     r6,[r7,#C2H]
+    //c2 low part
+    movs    r4,#(32-1)
+    rors    r5,r5,r4
+    movs    r6,r5
+    movs    r4,#((32-3+0)%32)
+    rors    r2,r2,r4
+    eors    r2,r2,r6
+    movs    r4,#(3)
+    rors    r5,r5,r4
+    eors    r2,r2,r5
+
+    //c3 ^= gascon_rotr64_interleaved(c3, 10) ^ gascon_rotr64_interleaved(c3, 17);
+    //c3 high part
+    movs    r6,r3
+    movs    r5,#(5)
+    rors    r3,r3,r5
+    eors    r6,r6,r3
+    ldr     r5,[r7,#C3L]
+    movs    r4,#(9)
+    rors    r5,r5,r4
+    eors    r6,r6,r5
+    str     r6,[r7,#C3H]
+    //c3 low part
+    movs    r4,#(32-9)
+    rors    r5,r5,r4
+    movs    r6,r5
+    movs    r4,#((32-5+8)%32)
+    rors    r3,r3,r4
+    eors    r3,r3,r6
+    movs    r4,#(5)
+    rors    r5,r5,r4
+    eors    r3,r3,r5
+
+    //the next mix step reloads the low parts from the state, so write
+    //them back (c4 low was stored above)
+    str		r3,[r7,#C3L]
+    str		r2,[r7,#C2L]
+    str		r1,[r7,#C1L]
+    str		r0,[r7,#C0L]
+
+    //r7 = buf offset of the next chunk
+    ldr		r7,[sp,#36]
+
+    b    	drygascon128_f_v6m_mix128_main_loop
+drygascon128_f_v6m_mix128_exit:
+    //r0~r4 hold the mixed low parts of c0..c4, as expected at
+    //drygascon128_g_v6m_main_loop
+    ldr     r7,[sp,#32]
+    //round=r6=rounds-1;
+    subs    r6,r7,#1
+    //base = round_cst+12-rounds
+    adr		r5, round_cst
+    adds    r5,r5,#12
+    subs    r5,r5,r7
+
+    //drop buf so that the remaining three words form the stack vars of
+    //drygascon128_g_v6m: 0 state address, 4 base, 8 round
+    add		sp,sp,#28
+    str     r5,[sp,#4]
+    str     r6,[sp,#8]
+
+    //push    {r0,r1,r2,r3}
+    //ldr     r0,[sp,#16]
+    //bl      print_state
+    //pop     {r0,r1,r2,r3}
+
+    b       drygascon128_g_v6m_main_loop
+
+// Round constant table shared by drygascon128_g_v6m and drygascon128_f_v6m.
+// Entry i holds ((0xf - n) << 4) | n with n = 11 - i.
+.align 2
+round_cst:
+.byte 0x4b
+.byte 0x5a
+.byte 0x69
+.byte 0x78
+.byte 0x87
+.byte 0x96
+.byte 0xa5
+.byte 0xb4
+.byte 0xc3
+.byte 0xd2
+.byte 0xe1
+.byte 0xf0
+.align 2
+
+.size	drygascon128_f_v6m, .-drygascon128_f_v6m
+
+
+#endif
--- a/drygascon/Implementations/crypto_aead/drygascon128/add_arm_cortex-m/drygascon128_arm-v7m.S
+++ b/drygascon/Implementations/crypto_aead/drygascon128/add_arm_cortex-m/drygascon128_arm-v7m.S
+/**
+DryGascon128 'v7m implementation'
+Sebastien Riou, May 27th 2020
+
+Implementation optimized for ARM-Cortex-M7/M4/M3 (Size and Speed)
+Safe against timing attack on X look up operations under
+the following conditions: (safe if at least one line is true)
+- System without cache
+- State stored in non cacheable memory (like DTCM)
+- Cache lines are 16 bytes or larger AND X is 16 bytes aligned
+
+
+Notes:
+- Arm Cortex-M7 Processor Technical Reference Manual Revision r1p2 states
+  that data cache line size is 32 bytes.
+- Microchip app note TB3186 shows that Microchip use 16 bytes cache lines.
+- ST does not give a general statement about cache lines for its products based
+on M3 and M4. That said STM32F411xC/E datasheet (RM0383
+Reference manual) shows data cache lines of 16 bytes.
+- In the unlikely case in which none of the conditions can be met,
+the 'v7m_fpu_x' implementation can be used to prevent this attack.
+
+Note that implementation 'v7m_fpu' is faster (but requires FPU).
+*/
+#if defined(__DRYGASCON_ARM_SELECTOR_H__)
+.cpu cortex-m3
+.syntax unified
+.code	16
+.thumb_func
+
+.align	1
+.global	drygascon128_g_v7m
+.global	drygascon128_f_v7m
+.global	drygascon128_g0_v7m
+
+    //Byte offsets into the state structure whose address is passed in r0.
+    //C0..C4: five 64-bit words of c, 8 bytes apart, starting at offset 0
+    .equ C0, 0
+    .equ C1, C0+8
+    .equ C2, C0+16
+    .equ C3, C0+24
+    .equ C4, C0+32
+    //R0,R1: two 64-bit words of r (offsets 48 and 56)
+    .equ R0, 48
+    .equ R1, R0+8
+    //X0,X1: two 64-bit words of x (offsets 64 and 72)
+    .equ X0, 64
+    .equ X1, X0+8
+
+    //xxL: offset of the 32-bit half stored at the lower address
+    .equ X0L, X0
+    .equ X1L, X1
+    .equ C0L, C0
+    .equ C1L, C1
+    .equ C2L, C2
+    .equ C3L, C3
+    .equ C4L, C4
+    .equ R0L, R0
+    .equ R1L, R1
+
+    //xxH: offset of the 32-bit half stored at the higher address (+4)
+    .equ X0H, X0+4
+    .equ X1H, X1+4
+    .equ C0H, C0+4
+    .equ C1H, C1+4
+    .equ C2H, C2+4
+    .equ C3H, C3+4
+    .equ C4H, C4+4
+    .equ R0H, R0+4
+    .equ R1H, R1+4
+
+    //R32_0..R32_3: r addressed as four consecutive 32-bit words
+    .equ R32_0, R0L
+    .equ R32_1, R0H
+    .equ R32_2, R1L
+    .equ R32_3, R1H
+
+
+
+.type	drygascon128_g_v7m, %function
+drygascon128_g_v7m:
+    //Entry point of g: clears r, then runs the round loop below, which
+    //XORs words of c into r after every round.
+    //r0: state: c,r,x
+    //r1: rounds
+    push {r4, r5, r6, r7, r8, r9, r10, r11, r12, lr}
+    //stack vars (offsets from sp after the second push below):
+    // 8 round (counts down from rounds-1)
+    // 4 base address for round constant lookups
+    // 0 state address
+
+    //r=0
+    movs    r10,#0
+    str     r10,[r0,#R32_0]
+    str     r10,[r0,#R32_1]
+    str     r10,[r0,#R32_2]
+    str     r10,[r0,#R32_3]
+
+    //round=r11=rounds-1;
+    subs    r11,r1,#1
+    //base = round_cst+12-rounds
+    adr     r10, round_cst
+    adds    r10,r10,#12
+    subs    r10,r10,r1
+
+    //sp+0=state, sp+4=base, sp+8=round
+    push	{r0,r10,r11}
+
+    //Load C: r0=C0L r1=C0H r2=C1L r3=C1H r4=C2L r5=C2H r6=C3L r7=C3H r8=C4L r9=C4H
+    //r14 keeps pointing on C for the whole loop
+    adds    r14,r0,#C0
+    LDMIA.W r14,{r0,r1,r2,r3,r4,r5,r6,r7,r8,r9}
+
+    //loop entry
+    //assume r11>0 at entry
+drygascon128_g_v7m_main_loop:
+    //One round per iteration, followed by accumulation of c0..c3 into r.
+    //Also entered from drygascon128_f_v7m_mix128_exit.
+    //Expected on entry:
+    //r0~r9: c
+    //r10: base for round constants
+    //r11: round, counting from rounds-1 to 0
+    //r14: pointer on C
+    //[sp,#4]: base for round constants, [sp,#8]: round
+
+    //The table lookup gives ((0xf - i) << 4) | i where i is the number of
+    //rounds already executed: base+r11 starts at round_cst+11 (0xf0) and
+    //moves down by one byte per round.
+    ldrb    r11,[r10,r11]
+    //state:
+    //r0 to r9: c
+    //r11: constant to add as round constant
+    //r14: pointer on C
+
+    // addition of round constant
+    //C2L ^= round constant;
+    eors    r4,r4,r11
+
+    // substitution layer, lower half
+    eors    r0,r0,r8
+    eors    r8,r8,r6
+    eors    r4,r4,r2
+    mvns    r10,r0
+    mvns    r11,r6
+    mvns    r12,r8
+    ands    r10,r10,r2
+    ands    r11,r11,r8
+    eors    r8,r8,r10
+    ands    r12,r12,r0
+    mvns    r10,r4
+    ands    r10,r10,r6
+    eors    r6,r6,r12
+    mvns    r12,r2
+    ands    r12,r12,r4
+    eors    r4,r4,r11
+    eors    r6,r6,r4
+    mvns    r4,r4
+    eors    r0,r0,r12
+    eors    r2,r2,r10
+    eors    r2,r2,r0
+    eors    r0,r0,r8
+
+    // substitution layer, upper half
+    eors    r1,r1,r9
+    eors    r9,r9,r7
+    eors    r5,r5,r3
+    mvns    r10,r1
+    mvns    r11,r7
+    mvns    r12,r9
+    ands    r10,r10,r3
+    ands    r11,r11,r9
+    eors    r9,r9,r10
+    ands    r12,r12,r1
+    mvns    r10,r5
+    ands    r10,r10,r7
+    eors    r7,r7,r12
+    mvns    r12,r3
+    ands    r12,r12,r5
+    eors    r5,r5,r11
+    eors    r7,r7,r5
+    mvns    r5,r5
+    eors    r1,r1,r12
+    eors    r3,r3,r10
+    eors    r3,r3,r1
+    eors    r1,r1,r9
+
+
+    // linear diffusion layer
+    // after each of c0..c3 is updated, its two halves are XORed into r in memory
+
+    //c4 ^= gascon_rotr64_interleaved(c4, 40) ^ gascon_rotr64_interleaved(c4, 7);
+    //c4 high part
+    rors    r11,r9,#(20)
+    eors    r9,r11,r9
+    rors    r10,r8,#(4)
+    eors    r9,r10,r9
+    //c4 low part
+    rors    r11,r11,#((32-20+3)%32)
+    eors    r11,r11,r8
+    rors    r10,r8,#(20)
+    eors    r8,r10,r11
+
+    //c0 ^= gascon_rotr64_interleaved(c0, 28) ^ gascon_rotr64_interleaved(c0, 19);
+    //c0 high part
+    rors    r11,r1,#(14)
+    eors    r1,r11,r1
+    rors    r10,r0,#(10)
+    eors    r1,r10,r1
+    //R32_1 ^= c0 high
+    ldr     r12,[r14,#R32_1-C0]
+    eors    r12,r12,r1
+    str     r12,[r14,#R32_1-C0]
+    //c0 low part
+    rors    r11,r11,#((32-14+9)%32)
+    eors    r11,r11,r0
+    rors    r10,r0,#(14)
+    eors    r0,r10,r11
+    //R32_0 ^= c0 low
+    ldr     r12,[r14,#R32_0-C0]
+    eors    r12,r12,r0
+    str     r12,[r14,#R32_0-C0]
+
+    //c1 ^= gascon_rotr64_interleaved(c1, 38) ^ gascon_rotr64_interleaved(c1, 61);
+    //c1 high part
+    rors    r11,r3,#(19)
+    eors    r3,r11,r3
+    rors    r10,r2,#(31)
+    eors    r3,r10,r3
+    //R32_3 ^= c1 high
+    ldr     r12,[r14,#R32_3-C0]
+    eors    r12,r12,r3
+    str     r12,[r14,#R32_3-C0]
+    //c1 low part
+    rors    r11,r11,#((32-19+30)%32)
+    eors    r11,r11,r2
+    rors    r10,r2,#(19)
+    eors    r2,r10,r11
+    //R32_2 ^= c1 low
+    ldr     r12,[r14,#R32_2-C0]
+    eors    r12,r12,r2
+    str     r12,[r14,#R32_2-C0]
+
+    //c2 ^= gascon_rotr64_interleaved(c2, 6) ^ gascon_rotr64_interleaved(c2, 1);
+    //c2 high part
+    rors    r11,r5,#(3)
+    eors    r5,r11,r5
+    rors    r10,r4,#(1)
+    eors    r5,r10,r5
+    //R32_0 ^= c2 high
+    ldr     r12,[r14,#R32_0-C0]
+    eors    r12,r12,r5
+    str     r12,[r14,#R32_0-C0]
+    //c2 low part
+    rors    r11,r11,#((32-3+0)%32)
+    eors    r11,r11,r4
+    rors    r10,r4,#(3)
+    eors    r4,r10,r11
+    //R32_3 ^= c2 low
+    ldr     r12,[r14,#R32_3-C0]
+    eors    r12,r12,r4
+    str     r12,[r14,#R32_3-C0]
+
+    //c3 ^= gascon_rotr64_interleaved(c3, 10) ^ gascon_rotr64_interleaved(c3, 17);
+    //c3 high part
+    rors    r11,r7,#(5)
+    eors    r7,r11,r7
+    rors    r10,r6,#(9)
+    eors    r7,r10,r7
+    //R32_2 ^= c3 high
+    ldr     r12,[r14,#R32_2-C0]
+    eors    r12,r12,r7
+    str     r12,[r14,#R32_2-C0]
+    //c3 low part
+    rors    r11,r11,#((32-5+8)%32)
+    eors    r11,r11,r6
+    rors    r10,r6,#(5)
+    eors    r6,r10,r11
+    //R32_1 ^= c3 low
+    ldr     r12,[r14,#R32_1-C0]
+    eors    r12,r12,r6
+    str     r12,[r14,#R32_1-C0]
+
+    //state:
+    //r0 to r9: c
+    //r10,r11,r12 destroyed
+
+    //reload the round constant base
+    ldr     r10,[sp,#4]
+
+    //decrement the round counter, leave the loop once it goes negative
+    ldr     r11,[sp,#8]
+    subs    r11,#1
+    bmi     drygascon128_g_v7m_exit
+
+    str     r11,[sp,#8]
+    b       drygascon128_g_v7m_main_loop
+drygascon128_g_v7m_exit:
+    //Common epilogue of g and f: write c back and return to the caller.
+    //update C (r14 still points on C; r was updated in memory inside the loop)
+    STMIA.W r14,{r0,r1,r2,r3,r4,r5,r6,r7,r8,r9}
+
+    //drop the three stack variables (state, base, round)
+    add    sp,sp,#12
+    pop    {r4, r5, r6, r7, r8, r9, r10, r11, r12, pc}
+.size	drygascon128_g_v7m, .-drygascon128_g_v7m
+
+.align 2
+.type	drygascon128_f_v7m, %function
+drygascon128_f_v7m:
+    //Entry point of f: mixes the input and DS into c 10 bits at a time,
+    //then continues in the round loop of drygascon128_g_v7m.
+    //r0:state
+    //r1:input
+    //r2:ds
+    //r3:rounds
+    push	{r4, r5, r6, r7, r8, r9, r10, r11, r12, lr}
+
+    //stack frame (offsets from sp after the two pushes below):
+    //0: pointer on input
+    //4: DS value
+    //8 :pointer on state
+    //12 : rounds for g
+    //16 :mix round / g round
+
+    movs    r10,#0 //init of input bit counter
+    push    {r0,r3,r10} //same 3-word layout as the stack vars of drygascon128_g_v7m
+    push    {r1,r2}
+    //r=0 (r10 is still 0 here)
+    str     r10,[r0,#R32_0]
+    str     r10,[r0,#R32_1]
+    str     r10,[r0,#R32_2]
+    str     r10,[r0,#R32_3]
+
+    //Load C: r0=C0L r1=C0H r2=C1L r3=C1H r4=C2L r5=C2H r6=C3L r7=C3H r8=C4L r9=C4H
+    adds	r11,r0,#C0
+    LDMIA.W r11,{r0,r1,r2,r3,r4,r5,r6,r7,r8,r9}
+
+
+drygascon128_f_v7m_mix128_main_loop:
+    //Fetch the next 10 bits of input into the low bits of r14, then use them
+    //in .core as five 2-bit indexes selecting 32-bit words of x.
+    //r10 is input bit counter (0,10,20,...)
+    ldr     r11,[sp,#0] //r11 is pointer on input
+
+    //r10 r12 shift
+    // 0   0   0
+    // 10  1   2
+    // 20  2   4
+    // 30  3   6
+    // 40  5   0
+    // 50  6   2
+    // 60  7   4
+    // 70  8   6
+    // 80  10  0
+    // 90  11  2
+    // 100 12  4
+    // 110 13  6
+    // 120 15  0
+    // 130 16  2 --> we do that operation for 2 last bits in a special last loop
+
+    cmp     r10,#120
+    bne     drygascon128_f_v7m_mix128_main_loop.regular
+
+    //we execute this only during the penultimate operation
+    //r14 = input byte 15 with the 2 lsb of DS placed in bits 8..9
+    ldrb    r14,[r11,#15]
+    ldr     r10,[sp,#4]
+    lsl     r10,r10,#8
+    eors    r14,r14,r10
+    b       drygascon128_f_v7m_mix128_main_loop.core
+
+drygascon128_f_v7m_mix128_main_loop.regular:
+    //r12 is base byte: byte offset to read from input buffer
+    lsr     r12,r10,#3  //divide by 8 to get base byte
+    //r10 becomes shift (bit counter modulo 8)
+    lsl     r14,r12,#3
+    sub     r10,r10,r14
+
+    //NOTE(review): for bit counter 110 this 32-bit load covers input bytes
+    //13..16. Only bits from bytes 13 and 14 are used afterwards, but if the
+    //input buffer is exactly 16 bytes long the load touches one byte past
+    //its end - confirm this is acceptable for all callers.
+    ldr     r14,[r11,r12] //relies on unaligned access support of ldr
+    lsr     r14,r14,r10
+
+drygascon128_f_v7m_mix128_main_loop.core:
+    //r14 bits 0..9: five 2-bit indexes
+    //r10 = address of x inside the state
+    ldr     r10,[sp,#8]
+    adds    r10,r10,#X0
+    
+    //C0L ^= x[bits 1:0]
+    lsls    r11,r14,#2
+    ands    r11,r11,#0xc
+    ldr     r11,[r10,r11]
+    eors    r0,r0,r11
+    
+    //C1L ^= x[bits 3:2]
+    lsrs    r11,r14,#0
+    ands    r11,r11,#0xc
+    ldr     r11,[r10,r11]
+    eors    r2,r2,r11
+    
+    //C2L ^= x[bits 5:4]
+    lsrs    r11,r14,#2
+    ands    r11,r11,#0xc
+    ldr     r11,[r10,r11]
+    eors    r4,r4,r11
+    
+    //C3L ^= x[bits 7:6]
+    lsrs    r11,r14,#4
+    ands    r11,r11,#0xc
+    ldr     r11,[r10,r11]
+    eors    r6,r6,r11
+    
+    //C4L ^= x[bits 9:8]
+    lsrs    r11,r14,#6
+    ands    r11,r11,#0xc
+    ldr     r11,[r10,r11]
+    eors    r8,r8,r11
+
+    //advance the bit counter; after the 14th mix (counter reaches 140)
+    //leave without running a core round
+    ldr		r10,[sp,#16]
+    adds    r10,#10
+    cmp     r10,#140
+    beq     drygascon128_f_v7m_mix128_exit
+drygascon128_f_v7m_mix128_coreround:
+    //One round with the fixed constant 0xf0, run between two mix steps.
+    //r is not touched here.
+    //r10: updated input bit counter, saved for the next iteration
+    str     r10,[sp,#16]
+
+    movs    r11,#0xf0
+
+    //state:
+    //r0 to r9: c
+    //r11: constant to add as round constant
+
+    // addition of round constant
+    //C2L ^= round constant;
+    eors    r4,r4,r11
+
+    // substitution layer, lower half
+    eors    r0,r0,r8
+    eors    r8,r8,r6
+    eors    r4,r4,r2
+    mvns    r10,r0
+    mvns    r11,r6
+    mvns    r12,r8
+    ands    r10,r10,r2
+    ands    r11,r11,r8
+    eors    r8,r8,r10
+    ands    r12,r12,r0
+    mvns    r10,r4
+    ands    r10,r10,r6
+    eors    r6,r6,r12
+    mvns    r12,r2
+    ands    r12,r12,r4
+    eors    r4,r4,r11
+    eors    r6,r6,r4
+    mvns    r4,r4
+    eors    r0,r0,r12
+    eors    r2,r2,r10
+    eors    r2,r2,r0
+    eors    r0,r0,r8
+
+    // substitution layer, upper half
+    eors    r1,r1,r9
+    eors    r9,r9,r7
+    eors    r5,r5,r3
+    mvns    r10,r1
+    mvns    r11,r7
+    mvns    r12,r9
+    ands    r10,r10,r3
+    ands    r11,r11,r9
+    eors    r9,r9,r10
+    ands    r12,r12,r1
+    mvns    r10,r5
+    ands    r10,r10,r7
+    eors    r7,r7,r12
+    mvns    r12,r3
+    ands    r12,r12,r5
+    eors    r5,r5,r11
+    eors    r7,r7,r5
+    mvns    r5,r5
+    eors    r1,r1,r12
+    eors    r3,r3,r10
+    eors    r3,r3,r1
+    eors    r1,r1,r9
+
+
+    // linear diffusion layer
+
+    //c4 ^= gascon_rotr64_interleaved(c4, 40) ^ gascon_rotr64_interleaved(c4, 7);
+    //c4 high part
+    rors    r11,r9,#(20)
+    eors    r9,r11,r9
+    rors    r10,r8,#(4)
+    eors    r9,r10,r9
+    //c4 low part
+    rors    r11,r11,#((32-20+3)%32)
+    eors    r11,r11,r8
+    rors    r10,r8,#(20)
+    eors    r8,r10,r11
+
+    //c0 ^= gascon_rotr64_interleaved(c0, 28) ^ gascon_rotr64_interleaved(c0, 19);
+    //c0 high part
+    rors    r11,r1,#(14)
+    eors    r1,r11,r1
+    rors    r10,r0,#(10)
+    eors    r1,r10,r1
+    //c0 low part
+    rors    r11,r11,#((32-14+9)%32)
+    eors    r11,r11,r0
+    rors    r10,r0,#(14)
+    eors    r0,r10,r11
+
+    //c1 ^= gascon_rotr64_interleaved(c1, 38) ^ gascon_rotr64_interleaved(c1, 61);
+    //c1 high part
+    rors    r11,r3,#(19)
+    eors    r3,r11,r3
+    rors    r10,r2,#(31)
+    eors    r3,r10,r3
+    //c1 low part
+    rors    r11,r11,#((32-19+30)%32)
+    eors    r11,r11,r2
+    rors    r10,r2,#(19)
+    eors    r2,r10,r11
+
+    //c2 ^= gascon_rotr64_interleaved(c2, 6) ^ gascon_rotr64_interleaved(c2, 1);
+    //c2 high part
+    rors    r11,r5,#(3)
+    eors    r5,r11,r5
+    rors    r10,r4,#(1)
+    eors    r5,r10,r5
+    //c2 low part
+    rors    r11,r11,#((32-3+0)%32)
+    eors    r11,r11,r4
+    rors    r10,r4,#(3)
+    eors    r4,r10,r11
+
+    //c3 ^= gascon_rotr64_interleaved(c3, 10) ^ gascon_rotr64_interleaved(c3, 17);
+    //c3 high part
+    rors    r11,r7,#(5)
+    eors    r7,r11,r7
+    rors    r10,r6,#(9)
+    eors    r7,r10,r7
+    //c3 low part
+    rors    r11,r11,#((32-5+8)%32)
+    eors    r11,r11,r6
+    rors    r10,r6,#(5)
+    eors    r6,r10,r11
+
+    //state:
+    //r0 to r9: c
+    //r10,r11,r12 destroyed
+
+    //reload the bit counter; 130 means only the last mix step remains
+    ldr		r10,[sp,#16]
+    cmp     r10,#130
+    bne     drygascon128_f_v7m_mix128_main_loop
+    //prepare the last loop: r14 = DS >> 2 (the 2 lsb were consumed at counter 120)
+    ldr     r14,[sp,4]
+    lsr     r14,r14,#2
+    b       drygascon128_f_v7m_mix128_main_loop.core
+
+drygascon128_f_v7m_mix128_exit:
+    //Mixing done: turn the f stack frame into the 3-word frame used by
+    //drygascon128_g_v7m and continue in its round loop.
+    ldr     r14,[sp,#12]
+    //round=r11=rounds-1;
+    subs    r11,r14,#1
+    //base = round_cst+12-rounds
+    adr		r10, round_cst
+    adds    r10,r10,#12
+    subs    r10,r10,r14
+
+    //these two slots become sp+4 (base) and sp+8 (round) after the add below
+    str     r10,[sp,#12]
+    str     r11,[sp,#16]
+
+    //r14 = pointer on state, used as pointer on C by the g loop (C0 is offset 0)
+    ldr		r14,[sp,#8]
+    //drop input pointer and DS, sp+0 is now the state address
+    add		sp,sp,#8
+    b       drygascon128_g_v7m_main_loop
+
+.align 2
+//Round constant table, read with ldrb as base[round] where
+//base = round_cst+12-rounds and round counts down from rounds-1 to 0.
+//Entry k holds ((k+4) << 4) | (11-k), so the last entry (0xf0) is used by
+//the first executed round, then 0xe1, 0xd2, ...
+round_cst:
+.byte 0x4b
+.byte 0x5a
+.byte 0x69
+.byte 0x78
+.byte 0x87
+.byte 0x96
+.byte 0xa5
+.byte 0xb4
+.byte 0xc3
+.byte 0xd2
+.byte 0xe1
+.byte 0xf0
+.align 2
+
+.size	drygascon128_f_v7m, .-drygascon128_f_v7m
+
+.type	drygascon128_g0_v7m, %function
+drygascon128_g0_v7m:
+    //perform a single round (round constant 0xf0) without accumulate:
+    //only c is read and written back, r is not accessed
+    //r0: state
+    push {r4, r5, r6, r7, r8, r9, r10, r11, r12, lr}
+
+    //Load C: r0=C0L r1=C0H r2=C1L r3=C1H r4=C2L r5=C2H r6=C3L r7=C3H r8=C4L r9=C4H
+    //r14 keeps pointing on C until the final store
+    adds    r14,r0,#C0
+    LDMIA.W r14,{r0,r1,r2,r3,r4,r5,r6,r7,r8,r9}
+
+    //r0~r9: c
+
+    //r11 = ((0xf - 0) << 4) | 0;
+    movs    r11,#0xf0
+    //state:
+    //r0 to r9: c
+    //r11: constant to add as round constant
+
+    // addition of round constant
+    //C2L ^= round constant;
+    eors    r4,r4,r11
+
+    // substitution layer, lower half
+    eors    r0,r0,r8
+    eors    r8,r8,r6
+    eors    r4,r4,r2
+    mvns    r10,r0
+    mvns    r11,r6
+    mvns    r12,r8
+    ands    r10,r10,r2
+    ands    r11,r11,r8
+    eors    r8,r8,r10
+    ands    r12,r12,r0
+    mvns    r10,r4
+    ands    r10,r10,r6
+    eors    r6,r6,r12
+    mvns    r12,r2
+    ands    r12,r12,r4
+    eors    r4,r4,r11
+    eors    r6,r6,r4
+    mvns    r4,r4
+    eors    r0,r0,r12
+    eors    r2,r2,r10
+    eors    r2,r2,r0
+    eors    r0,r0,r8
+
+    // substitution layer, upper half
+    eors    r1,r1,r9
+    eors    r9,r9,r7
+    eors    r5,r5,r3
+    mvns    r10,r1
+    mvns    r11,r7
+    mvns    r12,r9
+    ands    r10,r10,r3
+    ands    r11,r11,r9
+    eors    r9,r9,r10
+    ands    r12,r12,r1
+    mvns    r10,r5
+    ands    r10,r10,r7
+    eors    r7,r7,r12
+    mvns    r12,r3
+    ands    r12,r12,r5
+    eors    r5,r5,r11
+    eors    r7,r7,r5
+    mvns    r5,r5
+    eors    r1,r1,r12
+    eors    r3,r3,r10
+    eors    r3,r3,r1
+    eors    r1,r1,r9
+
+
+    // linear diffusion layer
+
+    //c4 ^= gascon_rotr64_interleaved(c4, 40) ^ gascon_rotr64_interleaved(c4, 7);
+    //c4 high part
+    rors    r11,r9,#(20)
+    eors    r9,r11,r9
+    rors    r10,r8,#(4)
+    eors    r9,r10,r9
+    //c4 low part
+    rors    r11,r11,#((32-20+3)%32)
+    eors    r11,r11,r8
+    rors    r10,r8,#(20)
+    eors    r8,r10,r11
+
+    //c0 ^= gascon_rotr64_interleaved(c0, 28) ^ gascon_rotr64_interleaved(c0, 19);
+    //c0 high part
+    rors    r11,r1,#(14)
+    eors    r1,r11,r1
+    rors    r10,r0,#(10)
+    eors    r1,r10,r1
+    //c0 low part
+    rors    r11,r11,#((32-14+9)%32)
+    eors    r11,r11,r0
+    rors    r10,r0,#(14)
+    eors    r0,r10,r11
+
+    //c1 ^= gascon_rotr64_interleaved(c1, 38) ^ gascon_rotr64_interleaved(c1, 61);
+    //c1 high part
+    rors    r11,r3,#(19)
+    eors    r3,r11,r3
+    rors    r10,r2,#(31)
+    eors    r3,r10,r3
+    //c1 low part
+    rors    r11,r11,#((32-19+30)%32)
+    eors    r11,r11,r2
+    rors    r10,r2,#(19)
+    eors    r2,r10,r11
+
+    //c2 ^= gascon_rotr64_interleaved(c2, 6) ^ gascon_rotr64_interleaved(c2, 1);
+    //c2 high part
+    rors    r11,r5,#(3)
+    eors    r5,r11,r5
+    rors    r10,r4,#(1)
+    eors    r5,r10,r5
+    //c2 low part
+    rors    r11,r11,#((32-3+0)%32)
+    eors    r11,r11,r4
+    rors    r10,r4,#(3)
+    eors    r4,r10,r11
+
+    //c3 ^= gascon_rotr64_interleaved(c3, 10) ^ gascon_rotr64_interleaved(c3, 17);
+    //c3 high part
+    rors    r11,r7,#(5)
+    eors    r7,r11,r7
+    rors    r10,r6,#(9)
+    eors    r7,r10,r7
+    //c3 low part
+    rors    r11,r11,#((32-5+8)%32)
+    eors    r11,r11,r6
+    rors    r10,r6,#(5)
+    eors    r6,r10,r11
+
+    //state:
+    //r0 to r9: c
+    //r10,r11,r12 destroyed
+
+    //update C
+    STMIA.W r14,{r0,r1,r2,r3,r4,r5,r6,r7,r8,r9}
+
+    pop    {r4, r5, r6, r7, r8, r9, r10, r11, r12, pc}
+.size	drygascon128_g0_v7m, .-drygascon128_g0_v7m
+#endif
--- a/drygascon/Implementations/crypto_aead/drygascon128/add_arm_cortex-m/drygascon128_arm-v7m_fpu.S
+++ b/drygascon/Implementations/crypto_aead/drygascon128/add_arm_cortex-m/drygascon128_arm-v7m_fpu.S
+/**
+DryGascon128 'v7m_fpu implementation'
+Sebastien Riou, May 27th 2020
+
+Implementation optimized for ARM-Cortex-M7/M4/M3 (Size and Speed)
+Safe against timing attack on X look up operations under
+the following conditions: (safe if at least one line is true)
+- System without cache
+- State stored in non cacheable memory (like DTCM)
+- Cache lines are 16 bytes or larger AND X is 16 bytes aligned
+
+
+Notes:
+- Arm Cortex-M7 Processor Technical Reference Manual Revision r1p2 states
+  that data cache line size is 32 bytes.
+- Microchip app note TB3186 shows that Microchip use 16 bytes cache lines.
+- ST does not give a general statement about cache lines for its products based
+on M3 and M4. That said STM32F411xC/E datasheet (RM0383
+Reference manual) shows data cache lines of 16 bytes.
+- In the unlikely case in which none of the conditions can be met,
+the 'v7m_fpu_x' implementation can be used to prevent this attack.
+*/
+#if defined(__DRYGASCON_ARM_SELECTOR_H__)
+.cpu cortex-m3
+.syntax unified
+.code	16
+.thumb_func
+
+.align	1
+.global	drygascon128_g_v7m_fpu
+.global	drygascon128_f_v7m_fpu
+.global	drygascon128_g0_v7m_fpu
+
+    //Byte offsets into the state structure whose address is passed in r0.
+    //C0..C4: five 64-bit words of c, 8 bytes apart, starting at offset 0
+    .equ C0, 0
+    .equ C1, C0+8
+    .equ C2, C0+16
+    .equ C3, C0+24
+    .equ C4, C0+32
+    //R0,R1: two 64-bit words of r (offsets 48 and 56)
+    .equ R0, 48
+    .equ R1, R0+8
+    //X0,X1: two 64-bit words of x (offsets 64 and 72)
+    .equ X0, 64
+    .equ X1, X0+8
+
+    //xxL: offset of the 32-bit half stored at the lower address
+    .equ X0L, X0
+    .equ X1L, X1
+    .equ C0L, C0
+    .equ C1L, C1
+    .equ C2L, C2
+    .equ C3L, C3
+    .equ C4L, C4
+    .equ R0L, R0
+    .equ R1L, R1
+
+    //xxH: offset of the 32-bit half stored at the higher address (+4)
+    .equ X0H, X0+4
+    .equ X1H, X1+4
+    .equ C0H, C0+4
+    .equ C1H, C1+4
+    .equ C2H, C2+4
+    .equ C3H, C3+4
+    .equ C4H, C4+4
+    .equ R0H, R0+4
+    .equ R1H, R1+4
+
+    //R32_0..R32_3: r addressed as four consecutive 32-bit words
+    .equ R32_0, R0L
+    .equ R32_1, R0H
+    .equ R32_2, R1L
+    .equ R32_3, R1H
+
+
+
+.type	drygascon128_g_v7m_fpu, %function
+drygascon128_g_v7m_fpu:
+    //Entry point of g (FPU variant): clears r, which is kept in S10..S13
+    //during the round loop, then runs the round loop below.
+    //r0: state: c,r,x
+    //r1: rounds
+    push {r4, r5, r6, r7, r8, r9, r10, r11, r12, lr}
+    //stack vars (offsets from sp after the second push below):
+    // 8 round (counts down from rounds-1)
+    // 4 base address for round constant lookups
+    // 0 state address
+
+    //r=0
+    //The registers are cleared through a core register: a floating point
+    //subtraction Sx-Sx does not give 0 when Sx holds an infinity or NaN bit
+    //pattern, which can happen since S10..S13 hold arbitrary 32-bit words.
+    movs    r10,#0
+    vmov    S10,r10
+    vmov    S11,r10
+    vmov    S12,r10
+    vmov    S13,r10
+
+    //round=r11=rounds-1;
+    subs    r11,r1,#1
+    //base = round_cst+12-rounds
+    adr     r10, round_cst
+    adds    r10,r10,#12
+    subs    r10,r10,r1
+
+    //sp+0=state, sp+4=base, sp+8=round
+    push	{r0,r10,r11}
+
+    //Load C: r0=C0L r1=C0H r2=C1L r3=C1H r4=C2L r5=C2H r6=C3L r7=C3H r8=C4L r9=C4H
+    adds    r14,r0,#C0
+    LDMIA.W r14,{r0,r1,r2,r3,r4,r5,r6,r7,r8,r9}
+
+    //loop entry
+    //assume r11>0 at entry
+drygascon128_g_v7m_fpu_main_loop:
+    //One round per iteration, followed by accumulation of c0..c3 into r,
+    //which is held in S10..S13. Also entered from
+    //drygascon128_f_v7m_fpu_mix128_exit.
+    //Expected on entry:
+    //r0~r9: c
+    //r10: base for round constants
+    //r11: round, counting from rounds-1 to 0
+    //[sp,#0]: state address, [sp,#4]: base, [sp,#8]: round
+
+    //The table lookup gives ((0xf - i) << 4) | i where i is the number of
+    //rounds already executed: base+r11 starts at round_cst+11 (0xf0) and
+    //moves down by one byte per round.
+    ldrb    r11,[r10,r11]
+    //state:
+    //r0 to r9: c
+    //r11: constant to add as round constant
+    //FPU:
+    //S10 to S13: r (R32_0 to R32_3)
+
+    // addition of round constant
+    //C2L ^= round constant;
+    eors    r4,r4,r11
+
+    // substitution layer, lower half
+    eors    r0,r0,r8
+    eors    r8,r8,r6
+    eors    r4,r4,r2
+    mvns    r10,r0
+    mvns    r11,r6
+    mvns    r12,r8
+    ands    r10,r10,r2
+    ands    r11,r11,r8
+    eors    r8,r8,r10
+    ands    r12,r12,r0
+    mvns    r10,r4
+    ands    r10,r10,r6
+    eors    r6,r6,r12
+    mvns    r12,r2
+    ands    r12,r12,r4
+    eors    r4,r4,r11
+    eors    r6,r6,r4
+    mvns    r4,r4
+    eors    r0,r0,r12
+    eors    r2,r2,r10
+    eors    r2,r2,r0
+    eors    r0,r0,r8
+
+    // substitution layer, upper half
+    eors    r1,r1,r9
+    eors    r9,r9,r7
+    eors    r5,r5,r3
+    mvns    r10,r1
+    mvns    r11,r7
+    mvns    r12,r9
+    ands    r10,r10,r3
+    ands    r11,r11,r9
+    eors    r9,r9,r10
+    ands    r12,r12,r1
+    mvns    r10,r5
+    ands    r10,r10,r7
+    eors    r7,r7,r12
+    mvns    r12,r3
+    ands    r12,r12,r5
+    eors    r5,r5,r11
+    eors    r7,r7,r5
+    mvns    r5,r5
+    eors    r1,r1,r12
+    eors    r3,r3,r10
+    eors    r3,r3,r1
+    eors    r1,r1,r9
+
+
+    // linear diffusion layer
+    // c0,c2,c1,c3 are processed in that order so that the two XORs going
+    // to the same word of r are adjacent and need one vmov in and one out
+
+    //c4 ^= gascon_rotr64_interleaved(c4, 40) ^ gascon_rotr64_interleaved(c4, 7);
+    //c4 high part
+    rors    r11,r9,#(20)
+    eors    r9,r11,r9
+    rors    r10,r8,#(4)
+    eors    r9,r10,r9
+    //c4 low part
+    rors    r11,r11,#((32-20+3)%32)
+    eors    r11,r11,r8
+    rors    r10,r8,#(20)
+    eors    r8,r10,r11
+
+    //r14 = R32_1, kept in r14 until the end of the diffusion layer
+    vmov r14,S11
+    //c0 ^= gascon_rotr64_interleaved(c0, 28) ^ gascon_rotr64_interleaved(c0, 19);
+    //c0 high part
+    rors    r11,r1,#(14)
+    eors    r1,r11,r1
+    rors    r10,r0,#(10)
+    eors    r1,r10,r1
+    //r14 is R32_1
+    eors    r14,r14,r1
+    vmov r12,S10
+    //c0 low part
+    rors    r11,r11,#((32-14+9)%32)
+    eors    r11,r11,r0
+    rors    r10,r0,#(14)
+    eors    r0,r10,r11
+    //r12 is R32_0
+    eors    r12,r12,r0
+    //c2 ^= gascon_rotr64_interleaved(c2, 6) ^ gascon_rotr64_interleaved(c2, 1);
+    //c2 high part
+    rors    r11,r5,#(3)
+    eors    r5,r11,r5
+    rors    r10,r4,#(1)
+    eors    r5,r10,r5
+    //r12 is R32_0
+    eors    r12,r12,r5
+    vmov S10,r12
+    vmov r12,S13
+    //c2 low part
+    rors    r11,r11,#((32-3+0)%32)
+    eors    r11,r11,r4
+    rors    r10,r4,#(3)
+    eors    r4,r10,r11
+    //r12 is R32_3
+    eors    r12,r12,r4
+    //c1 ^= gascon_rotr64_interleaved(c1, 38) ^ gascon_rotr64_interleaved(c1, 61);
+    //c1 high part
+    rors    r11,r3,#(19)
+    eors    r3,r11,r3
+    rors    r10,r2,#(31)
+    eors    r3,r10,r3
+    //r12 is R32_3
+    eors    r12,r12,r3
+    vmov S13,r12
+    vmov r12,S12
+    //c1 low part
+    rors    r11,r11,#((32-19+30)%32)
+    eors    r11,r11,r2
+    rors    r10,r2,#(19)
+    eors    r2,r10,r11
+    //r12 is R32_2
+    eors    r12,r12,r2
+    //c3 ^= gascon_rotr64_interleaved(c3, 10) ^ gascon_rotr64_interleaved(c3, 17);
+    //c3 high part
+    rors    r11,r7,#(5)
+    eors    r7,r11,r7
+    rors    r10,r6,#(9)
+    eors    r7,r10,r7
+    //r12 is R32_2
+    eors    r12,r12,r7
+    vmov S12,r12
+    //c3 low part
+    rors    r11,r11,#((32-5+8)%32)
+    eors    r11,r11,r6
+    rors    r10,r6,#(5)
+    eors    r6,r10,r11
+    //r14 is R32_1
+    eors    r14,r14,r6
+    vmov S11,r14
+
+    //state:
+    //r0 to r9: c
+    //r10,r11,r12,r14 destroyed
+
+    //reload the round constant base
+    ldr     r10,[sp,#4]
+
+    //decrement the round counter, leave the loop once it goes negative
+    ldr     r11,[sp,#8]
+    subs    r11,#1
+    bmi     drygascon128_g_v7m_fpu_exit
+
+    str     r11,[sp,#8]
+    b       drygascon128_g_v7m_fpu_main_loop
+drygascon128_g_v7m_fpu_exit:
+    //Common epilogue of g and f: write c and r back to the state and return.
+    //update C (the state address is reloaded from the stack since r14 was
+    //used as a temporary inside the loop)
+    ldr     r14,[sp,#0]
+    STMIA.W r14,{r0,r1,r2,r3,r4,r5,r6,r7,r8,r9}
+
+    //update R: store S10..S13 at offset R0 of the state
+    ldr        r11,[sp,#0]
+    adds       r11,r11,#R0
+    VSTMIA.F32 r11, {S10,S11,S12,S13}
+
+    //drop the three stack variables (state, base, round)
+    add    sp,sp,#12
+    pop    {r4, r5, r6, r7, r8, r9, r10, r11, r12, pc}
+.size	drygascon128_g_v7m_fpu, .-drygascon128_g_v7m_fpu
+
+.align 2
+.type	drygascon128_f_v7m_fpu, %function
+drygascon128_f_v7m_fpu:
+    //Entry point of f (FPU variant): mixes the input and DS into c 10 bits
+    //at a time, then continues in the round loop of drygascon128_g_v7m_fpu.
+    //r0:state
+    //r1:input
+    //r2:ds
+    //r3:rounds
+    push	{r4, r5, r6, r7, r8, r9, r10, r11, r12, lr}
+
+    //stack frame (offsets from sp after the two pushes below):
+    //0: pointer on input
+    //4: DS value
+    //8 :pointer on state
+    //12 : rounds for g
+    //16 :mix round / g round
+
+    movs    r10,#0 //init of input bit counter
+    push    {r0,r3,r10} //same 3-word layout as the stack vars of drygascon128_g_v7m_fpu
+    push    {r1,r2}
+    //r=0 (r10 is still 0 here)
+    //The registers are cleared through a core register: a floating point
+    //subtraction Sx-Sx does not give 0 when Sx holds an infinity or NaN bit
+    //pattern, which can happen since S10..S13 hold arbitrary 32-bit words.
+    vmov    S10,r10
+    vmov    S11,r10
+    vmov    S12,r10
+    vmov    S13,r10
+
+    //Load C: r0=C0L r1=C0H r2=C1L r3=C1H r4=C2L r5=C2H r6=C3L r7=C3H r8=C4L r9=C4H
+    adds	r11,r0,#C0
+    LDMIA.W r11,{r0,r1,r2,r3,r4,r5,r6,r7,r8,r9}
+
+
+drygascon128_f_v7m_fpu_mix128_main_loop:
+    //Fetch the next 10 bits of input into the low bits of r14, then use them
+    //in .core as five 2-bit indexes selecting 32-bit words of x.
+    //r10 is input bit counter (0,10,20,...)
+    ldr     r11,[sp,#0] //r11 is pointer on input
+
+    //r10 r12 shift
+    // 0   0   0
+    // 10  1   2
+    // 20  2   4
+    // 30  3   6
+    // 40  5   0
+    // 50  6   2
+    // 60  7   4
+    // 70  8   6
+    // 80  10  0
+    // 90  11  2
+    // 100 12  4
+    // 110 13  6
+    // 120 15  0
+    // 130 16  2 --> we do that operation for 2 last bits in a special last loop
+
+    cmp     r10,#120
+    bne     drygascon128_f_v7m_fpu_mix128_main_loop.regular
+
+    //we execute this only during the penultimate operation
+    //r14 = input byte 15 with the 2 lsb of DS placed in bits 8..9
+    ldrb    r14,[r11,#15]
+    ldr     r10,[sp,#4]
+    lsl     r10,r10,#8
+    eors    r14,r14,r10
+    b       drygascon128_f_v7m_fpu_mix128_main_loop.core
+
+drygascon128_f_v7m_fpu_mix128_main_loop.regular:
+    //r12 is base byte: byte offset to read from input buffer
+    lsr     r12,r10,#3  //divide by 8 to get base byte
+    //r10 becomes shift (bit counter modulo 8)
+    lsl     r14,r12,#3
+    sub     r10,r10,r14
+
+    //NOTE(review): for bit counter 110 this 32-bit load covers input bytes
+    //13..16. Only bits from bytes 13 and 14 are used afterwards, but if the
+    //input buffer is exactly 16 bytes long the load touches one byte past
+    //its end - confirm this is acceptable for all callers.
+    ldr     r14,[r11,r12] //relies on unaligned access support of ldr
+    lsr     r14,r14,r10
+
+drygascon128_f_v7m_fpu_mix128_main_loop.core:
+    //r14 bits 0..9: five 2-bit indexes
+    //r10 = address of x inside the state
+    ldr     r10,[sp,#8]
+    adds    r10,r10,#X0
+    
+    //C0L ^= x[bits 1:0]
+    lsls    r11,r14,#2
+    ands    r11,r11,#0xc
+    ldr     r11,[r10,r11]
+    eors    r0,r0,r11
+    
+    //C1L ^= x[bits 3:2]
+    lsrs    r11,r14,#0
+    ands    r11,r11,#0xc
+    ldr     r11,[r10,r11]
+    eors    r2,r2,r11
+    
+    //C2L ^= x[bits 5:4]
+    lsrs    r11,r14,#2
+    ands    r11,r11,#0xc
+    ldr     r11,[r10,r11]
+    eors    r4,r4,r11
+    
+    //C3L ^= x[bits 7:6]
+    lsrs    r11,r14,#4
+    ands    r11,r11,#0xc
+    ldr     r11,[r10,r11]
+    eors    r6,r6,r11
+    
+    //C4L ^= x[bits 9:8]
+    lsrs    r11,r14,#6
+    ands    r11,r11,#0xc
+    ldr     r11,[r10,r11]
+    eors    r8,r8,r11
+
+    //advance the bit counter; after the 14th mix (counter reaches 140)
+    //leave without running a core round
+    ldr		r10,[sp,#16]
+    adds    r10,#10
+    cmp     r10,#140
+    beq     drygascon128_f_v7m_fpu_mix128_exit
+drygascon128_f_v7m_fpu_mix128_coreround:
+    //One round with the fixed constant 0xf0, run between two mix steps.
+    //r (S10..S13) is not touched here.
+    //r10: updated input bit counter, saved for the next iteration
+    str     r10,[sp,#16]
+
+    movs    r11,#0xf0
+
+    //state:
+    //r0 to r9: c
+    //r11: constant to add as round constant
+
+    // addition of round constant
+    //C2L ^= round constant;
+    eors    r4,r4,r11
+
+    // substitution layer, lower half
+    eors    r0,r0,r8
+    eors    r8,r8,r6
+    eors    r4,r4,r2
+    mvns    r10,r0
+    mvns    r11,r6
+    mvns    r12,r8
+    ands    r10,r10,r2
+    ands    r11,r11,r8
+    eors    r8,r8,r10
+    ands    r12,r12,r0
+    mvns    r10,r4
+    ands    r10,r10,r6
+    eors    r6,r6,r12
+    mvns    r12,r2
+    ands    r12,r12,r4
+    eors    r4,r4,r11
+    eors    r6,r6,r4
+    mvns    r4,r4
+    eors    r0,r0,r12
+    eors    r2,r2,r10
+    eors    r2,r2,r0
+    eors    r0,r0,r8
+
+    // substitution layer, upper half
+    eors    r1,r1,r9
+    eors    r9,r9,r7
+    eors    r5,r5,r3
+    mvns    r10,r1
+    mvns    r11,r7
+    mvns    r12,r9
+    ands    r10,r10,r3
+    ands    r11,r11,r9
+    eors    r9,r9,r10
+    ands    r12,r12,r1
+    mvns    r10,r5
+    ands    r10,r10,r7
+    eors    r7,r7,r12
+    mvns    r12,r3
+    ands    r12,r12,r5
+    eors    r5,r5,r11
+    eors    r7,r7,r5
+    mvns    r5,r5
+    eors    r1,r1,r12
+    eors    r3,r3,r10
+    eors    r3,r3,r1
+    eors    r1,r1,r9
+
+
+    // linear diffusion layer
+
+    //c4 ^= gascon_rotr64_interleaved(c4, 40) ^ gascon_rotr64_interleaved(c4, 7);
+    //c4 high part
+    rors    r11,r9,#(20)
+    eors    r9,r11,r9
+    rors    r10,r8,#(4)
+    eors    r9,r10,r9
+    //c4 low part
+    rors    r11,r11,#((32-20+3)%32)
+    eors    r11,r11,r8
+    rors    r10,r8,#(20)
+    eors    r8,r10,r11
+
+    //c0 ^= gascon_rotr64_interleaved(c0, 28) ^ gascon_rotr64_interleaved(c0, 19);
+    //c0 high part
+    rors    r11,r1,#(14)
+    eors    r1,r11,r1
+    rors    r10,r0,#(10)
+    eors    r1,r10,r1
+    //c0 low part
+    rors    r11,r11,#((32-14+9)%32)
+    eors    r11,r11,r0
+    rors    r10,r0,#(14)
+    eors    r0,r10,r11
+
+    //c1 ^= gascon_rotr64_interleaved(c1, 38) ^ gascon_rotr64_interleaved(c1, 61);
+    //c1 high part
+    rors    r11,r3,#(19)
+    eors    r3,r11,r3
+    rors    r10,r2,#(31)
+    eors    r3,r10,r3
+    //c1 low part
+    rors    r11,r11,#((32-19+30)%32)
+    eors    r11,r11,r2
+    rors    r10,r2,#(19)
+    eors    r2,r10,r11
+
+    //c2 ^= gascon_rotr64_interleaved(c2, 6) ^ gascon_rotr64_interleaved(c2, 1);
+    //c2 high part
+    rors    r11,r5,#(3)
+    eors    r5,r11,r5
+    rors    r10,r4,#(1)
+    eors    r5,r10,r5
+    //c2 low part
+    rors    r11,r11,#((32-3+0)%32)
+    eors    r11,r11,r4
+    rors    r10,r4,#(3)
+    eors    r4,r10,r11
+
+    //c3 ^= gascon_rotr64_interleaved(c3, 10) ^ gascon_rotr64_interleaved(c3, 17);
+    //c3 high part
+    rors    r11,r7,#(5)
+    eors    r7,r11,r7
+    rors    r10,r6,#(9)
+    eors    r7,r10,r7
+    //c3 low part
+    rors    r11,r11,#((32-5+8)%32)
+    eors    r11,r11,r6
+    rors    r10,r6,#(5)
+    eors    r6,r10,r11
+
+    //state:
+    //r0 to r9: c
+    //r10,r11,r12 destroyed
+
+    //reload the bit counter; 130 means only the last mix step remains
+    ldr		r10,[sp,#16]
+    cmp     r10,#130
+    bne     drygascon128_f_v7m_fpu_mix128_main_loop
+    //prepare the last loop: r14 = DS >> 2 (the 2 lsb were consumed at counter 120)
+    ldr     r14,[sp,4]
+    lsr     r14,r14,#2
+    b       drygascon128_f_v7m_fpu_mix128_main_loop.core
+
+drygascon128_f_v7m_fpu_mix128_exit:
+    //Mixing done: turn the f stack frame into the 3-word frame used by
+    //drygascon128_g_v7m_fpu and continue in its round loop.
+    ldr     r14,[sp,#12]
+    //round=r11=rounds-1;
+    subs    r11,r14,#1
+    //base = round_cst+12-rounds
+    adr		r10, round_cst
+    adds    r10,r10,#12
+    subs    r10,r10,r14
+
+    //these two slots become sp+4 (base) and sp+8 (round) after the add below
+    str     r10,[sp,#12]
+    str     r11,[sp,#16]
+
+    //drop input pointer and DS, sp+0 is now the state address
+    add		sp,sp,#8
+    b       drygascon128_g_v7m_fpu_main_loop
+
+.align 2
+//Round constant table, read with ldrb as base[round] where
+//base = round_cst+12-rounds and round counts down from rounds-1 to 0.
+//Entry k holds ((k+4) << 4) | (11-k), so the last entry (0xf0) is used by
+//the first executed round, then 0xe1, 0xd2, ...
+round_cst:
+.byte 0x4b
+.byte 0x5a
+.byte 0x69
+.byte 0x78
+.byte 0x87
+.byte 0x96
+.byte 0xa5
+.byte 0xb4
+.byte 0xc3
+.byte 0xd2
+.byte 0xe1
+.byte 0xf0
+.align 2
+
+.size	drygascon128_f_v7m_fpu, .-drygascon128_f_v7m_fpu
+
+.type	drygascon128_g0_v7m_fpu, %function
+drygascon128_g0_v7m_fpu:
+    //perform a single round (round constant 0xf0) without accumulate:
+    //only c is read and written back, r is not accessed and no FPU
+    //register is used
+    //r0: state
+    push {r4, r5, r6, r7, r8, r9, r10, r11, r12, lr}
+
+    //Load C: r0=C0L r1=C0H r2=C1L r3=C1H r4=C2L r5=C2H r6=C3L r7=C3H r8=C4L r9=C4H
+    //r14 keeps pointing on C until the final store
+    adds    r14,r0,#C0
+    LDMIA.W r14,{r0,r1,r2,r3,r4,r5,r6,r7,r8,r9}
+
+    //r0~r9: c
+
+    //r11 = ((0xf - 0) << 4) | 0;
+    movs    r11,#0xf0
+    //state:
+    //r0 to r9: c
+    //r11: constant to add as round constant
+
+    // addition of round constant
+    //C2L ^= round constant;
+    eors    r4,r4,r11
+
+    // substitution layer, lower half
+    eors    r0,r0,r8
+    eors    r8,r8,r6
+    eors    r4,r4,r2
+    mvns    r10,r0
+    mvns    r11,r6
+    mvns    r12,r8
+    ands    r10,r10,r2
+    ands    r11,r11,r8
+    eors    r8,r8,r10
+    ands    r12,r12,r0
+    mvns    r10,r4
+    ands    r10,r10,r6
+    eors    r6,r6,r12
+    mvns    r12,r2
+    ands    r12,r12,r4
+    eors    r4,r4,r11
+    eors    r6,r6,r4
+    mvns    r4,r4
+    eors    r0,r0,r12
+    eors    r2,r2,r10
+    eors    r2,r2,r0
+    eors    r0,r0,r8
+
+    // substitution layer, upper half
+    eors    r1,r1,r9
+    eors    r9,r9,r7
+    eors    r5,r5,r3
+    mvns    r10,r1
+    mvns    r11,r7
+    mvns    r12,r9
+    ands    r10,r10,r3
+    ands    r11,r11,r9
+    eors    r9,r9,r10
+    ands    r12,r12,r1
+    mvns    r10,r5
+    ands    r10,r10,r7
+    eors    r7,r7,r12
+    mvns    r12,r3
+    ands    r12,r12,r5
+    eors    r5,r5,r11
+    eors    r7,r7,r5
+    mvns    r5,r5
+    eors    r1,r1,r12
+    eors    r3,r3,r10
+    eors    r3,r3,r1
+    eors    r1,r1,r9
+
+
+    // linear diffusion layer
+
+    //c4 ^= gascon_rotr64_interleaved(c4, 40) ^ gascon_rotr64_interleaved(c4, 7);
+    //c4 high part
+    rors    r11,r9,#(20)
+    eors    r9,r11,r9
+    rors    r10,r8,#(4)
+    eors    r9,r10,r9
+    //c4 low part
+    rors    r11,r11,#((32-20+3)%32)
+    eors    r11,r11,r8
+    rors    r10,r8,#(20)
+    eors    r8,r10,r11
+
+    //c0 ^= gascon_rotr64_interleaved(c0, 28) ^ gascon_rotr64_interleaved(c0, 19);
+    //c0 high part
+    rors    r11,r1,#(14)
+    eors    r1,r11,r1
+    rors    r10,r0,#(10)
+    eors    r1,r10,r1
+    //c0 low part
+    rors    r11,r11,#((32-14+9)%32)
+    eors    r11,r11,r0
+    rors    r10,r0,#(14)
+    eors    r0,r10,r11
+
+    //c1 ^= gascon_rotr64_interleaved(c1, 38) ^ gascon_rotr64_interleaved(c1, 61);
+    //c1 high part
+    rors    r11,r3,#(19)
+    eors    r3,r11,r3
+    rors    r10,r2,#(31)
+    eors    r3,r10,r3
+    //c1 low part
+    rors    r11,r11,#((32-19+30)%32)
+    eors    r11,r11,r2
+    rors    r10,r2,#(19)
+    eors    r2,r10,r11
+
+    //c2 ^= gascon_rotr64_interleaved(c2, 6) ^ gascon_rotr64_interleaved(c2, 1);
+    //c2 high part
+    rors    r11,r5,#(3)
+    eors    r5,r11,r5
+    rors    r10,r4,#(1)
+    eors    r5,r10,r5
+    //c2 low part
+    rors    r11,r11,#((32-3+0)%32)
+    eors    r11,r11,r4
+    rors    r10,r4,#(3)
+    eors    r4,r10,r11
+
+    //c3 ^= gascon_rotr64_interleaved(c3, 10) ^ gascon_rotr64_interleaved(c3, 17);
+    //c3 high part
+    rors    r11,r7,#(5)
+    eors    r7,r11,r7
+    rors    r10,r6,#(9)
+    eors    r7,r10,r7
+    //c3 low part
+    rors    r11,r11,#((32-5+8)%32)
+    eors    r11,r11,r6
+    rors    r10,r6,#(5)
+    eors    r6,r10,r11
+
+    //state:
+    //r0 to r9: c
+    //r10,r11,r12 destroyed
+
+    //update C
+    STMIA.W r14,{r0,r1,r2,r3,r4,r5,r6,r7,r8,r9}
+
+    pop    {r4, r5, r6, r7, r8, r9, r10, r11, r12, pc}
+.size	drygascon128_g0_v7m_fpu, .-drygascon128_g0_v7m_fpu
+#endif
--- a/drygascon/Implementations/crypto_aead/drygascon128/add_arm_cortex-m/drygascon128_arm-v7m_fpu_x.S
+++ b/drygascon/Implementations/crypto_aead/drygascon128/add_arm_cortex-m/drygascon128_arm-v7m_fpu_x.S
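+/**
+DryGascon128 'v7m_fpu_x' implementation
+Sebastien Riou, May 27th 2020
+
+Implementation optimized for ARM-Cortex-M7/M4/M3 (Size and Speed)
+Includes protection against timing attacks on X lookup operations
+NOTE(review): this file is assembled with ".cpu cortex-m7" and relies on FPU
+instructions (e.g. VSEL, VLDMIA); confirm which of the cores listed above can run it.
+
+Note that implementation 'v7m_fpu' is faster and safe on all Cortex-M7 as of May 2020.
+*/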
+#if defined(__DRYGASCON_ARM_SELECTOR_H__)
+.cpu cortex-m7
+.syntax unified
+.code	16
+.thumb_func
+
+.align	1
+.global	drygascon128_g_v7m_fpu_x
+.global	drygascon128_f_v7m_fpu_x
+.global	drygascon128_g0_v7m_fpu_x
+
+    .equ C0, 0 //byte offsets from the state pointer passed in r0; C0..C4 are five words 8 bytes apart
+    .equ C1, C0+8
+    .equ C2, C0+16
+    .equ C3, C0+24
+    .equ C4, C0+32
+    .equ R0, 48 //R: two 8-byte words at offsets 48 and 56
+    .equ R1, R0+8
+    .equ X0, 64 //X: two 8-byte words at offsets 64 and 72
+    .equ X1, X0+8
+
+    .equ X0L, X0 //"L" names: offset of the first 32-bit word of each 8-byte word
+    .equ X1L, X1
+    .equ C0L, C0
+    .equ C1L, C1
+    .equ C2L, C2
+    .equ C3L, C3
+    .equ C4L, C4
+    .equ R0L, R0
+    .equ R1L, R1
+
+    .equ X0H, X0+4 //"H" names: offset of the second 32-bit word (base + 4)
+    .equ X1H, X1+4
+    .equ C0H, C0+4
+    .equ C1H, C1+4
+    .equ C2H, C2+4
+    .equ C3H, C3+4
+    .equ C4H, C4+4
+    .equ R0H, R0+4
+    .equ R1H, R1+4
+
+    .equ R32_0, R0L //R viewed as four consecutive 32-bit words R32_0..R32_3
+    .equ R32_1, R0H
+    .equ R32_2, R1L
+    .equ R32_3, R1H
+
+
+
+.type	drygascon128_g_v7m_fpu_x, %function
+drygascon128_g_v7m_fpu_x:
+    //Run `rounds` core rounds on C, XOR-accumulating R after every round.
+    //r0: state: c,r,x
+    //r1: rounds (round constants are read from round_cst+12-rounds, a 12-byte table)
+    //On exit C is stored back to state+C0 and R to state+R0.
+    //R lives in S10..S13 for the whole loop.
+    //drygascon128_f_v7m_fpu_x also branches into the main loop below and
+    //returns through the exit sequence of this function.
+    push {r4, r5, r6, r7, r8, r9, r10, r11, r12, lr}
+    //stack vars (after the second push below):
+    // 8 round
+    // 4 base address for round constant lookups
+    // 0 state address
+
+    //r=0
+    //Zero R through a core register. "VSUB.F32 Sn,Sn,Sn" is not a reliable
+    //clear: if Sn holds a NaN or infinity bit pattern the result is NaN, and
+    //S10..S13 contain arbitrary leftover data at this point.
+    //r2 is a caller-saved scratch register and is reloaded by the LDMIA below.
+    movs    r2,#0
+    vmov    S10,r2
+    vmov    S11,r2
+    vmov    S12,r2
+    vmov    S13,r2
+
+    //round=r11=rounds-1;
+    subs    r11,r1,#1
+    //base = round_cst+12-rounds
+    adr     r10, round_cst
+    adds    r10,r10,#12
+    subs    r10,r10,r1
+
+    push	{r0,r10,r11}
+
+    //Load C
+    adds    r14,r0,#C0
+    LDMIA.W r14,{r0,r1,r2,r3,r4,r5,r6,r7,r8,r9}
+
+    //loop entry
+    //r11 = rounds-1 at entry
+drygascon128_g_v7m_fpu_x_main_loop:
+    //r0~r9: c, as pairs: (r0,r1)=c0 low/high, (r2,r3)=c1, (r4,r5)=c2, (r6,r7)=c3, (r8,r9)=c4
+    //r10: base for round constants
+    //r11: round, counting from rounds-1 to 0
+
+    //r11 = ((0xf - round) << 4) | round, read from the table;
+    ldrb    r11,[r10,r11]
+    //state:
+    //r0 to r9: c
+    //r11: constant to add as round constant
+    //FPU:
+    //s10 to s13: r
+
+    // addition of round constant
+    //C2L ^= round constant;
+    eors    r4,r4,r11
+
+    // substitution layer, lower half
+    eors    r0,r0,r8
+    eors    r8,r8,r6
+    eors    r4,r4,r2
+    mvns    r10,r0
+    mvns    r11,r6
+    mvns    r12,r8
+    ands    r10,r10,r2
+    ands    r11,r11,r8
+    eors    r8,r8,r10
+    ands    r12,r12,r0
+    mvns    r10,r4
+    ands    r10,r10,r6
+    eors    r6,r6,r12
+    mvns    r12,r2
+    ands    r12,r12,r4
+    eors    r4,r4,r11
+    eors    r6,r6,r4
+    mvns    r4,r4
+    eors    r0,r0,r12
+    eors    r2,r2,r10
+    eors    r2,r2,r0
+    eors    r0,r0,r8
+
+    // substitution layer, upper half
+    eors    r1,r1,r9
+    eors    r9,r9,r7
+    eors    r5,r5,r3
+    mvns    r10,r1
+    mvns    r11,r7
+    mvns    r12,r9
+    ands    r10,r10,r3
+    ands    r11,r11,r9
+    eors    r9,r9,r10
+    ands    r12,r12,r1
+    mvns    r10,r5
+    ands    r10,r10,r7
+    eors    r7,r7,r12
+    mvns    r12,r3
+    ands    r12,r12,r5
+    eors    r5,r5,r11
+    eors    r7,r7,r5
+    mvns    r5,r5
+    eors    r1,r1,r12
+    eors    r3,r3,r10
+    eors    r3,r3,r1
+    eors    r1,r1,r9
+
+
+    // linear diffusion layer, interleaved with the accumulation of R:
+    //   R32_0 ^= C0L ^ C2H, R32_1 ^= C0H ^ C3L,
+    //   R32_2 ^= C1L ^ C3H, R32_3 ^= C1H ^ C2L
+
+    //c4 ^= gascon_rotr64_interleaved(c4, 40) ^ gascon_rotr64_interleaved(c4, 7);
+    //c4 high part
+    rors    r11,r9,#(20)
+    eors    r9,r11,r9
+    rors    r10,r8,#(4)
+    eors    r9,r10,r9
+    //c4 low part
+    rors    r11,r11,#((32-20+3)%32)
+    eors    r11,r11,r8
+    rors    r10,r8,#(20)
+    eors    r8,r10,r11
+
+    vmov r14,S11
+    //c0 ^= gascon_rotr64_interleaved(c0, 28) ^ gascon_rotr64_interleaved(c0, 19);
+    //c0 high part
+    rors    r11,r1,#(14)
+    eors    r1,r11,r1
+    rors    r10,r0,#(10)
+    eors    r1,r10,r1
+    //r14 is R32_1
+    eors    r14,r14,r1
+    vmov r12,S10
+    //c0 low part
+    rors    r11,r11,#((32-14+9)%32)
+    eors    r11,r11,r0
+    rors    r10,r0,#(14)
+    eors    r0,r10,r11
+    //r12 is R32_0
+    eors    r12,r12,r0
+    //c2 ^= gascon_rotr64_interleaved(c2, 6) ^ gascon_rotr64_interleaved(c2, 1);
+    //c2 high part
+    rors    r11,r5,#(3)
+    eors    r5,r11,r5
+    rors    r10,r4,#(1)
+    eors    r5,r10,r5
+    //r12 is R32_0
+    eors    r12,r12,r5
+    vmov S10,r12
+    vmov r12,S13
+    //c2 low part
+    rors    r11,r11,#((32-3+0)%32)
+    eors    r11,r11,r4
+    rors    r10,r4,#(3)
+    eors    r4,r10,r11
+    //r12 is R32_3
+    eors    r12,r12,r4
+    //c1 ^= gascon_rotr64_interleaved(c1, 38) ^ gascon_rotr64_interleaved(c1, 61);
+    //c1 high part
+    rors    r11,r3,#(19)
+    eors    r3,r11,r3
+    rors    r10,r2,#(31)
+    eors    r3,r10,r3
+    //r12 is R32_3
+    eors    r12,r12,r3
+    vmov S13,r12
+    vmov r12,S12
+    //c1 low part
+    rors    r11,r11,#((32-19+30)%32)
+    eors    r11,r11,r2
+    rors    r10,r2,#(19)
+    eors    r2,r10,r11
+    //r12 is R32_2
+    eors    r12,r12,r2
+    //c3 ^= gascon_rotr64_interleaved(c3, 10) ^ gascon_rotr64_interleaved(c3, 17);
+    //c3 high part
+    rors    r11,r7,#(5)
+    eors    r7,r11,r7
+    rors    r10,r6,#(9)
+    eors    r7,r10,r7
+    //r12 is R32_2
+    eors    r12,r12,r7
+    vmov S12,r12
+    //c3 low part
+    rors    r11,r11,#((32-5+8)%32)
+    eors    r11,r11,r6
+    rors    r10,r6,#(5)
+    eors    r6,r10,r11
+    //r14 is R32_1
+    eors    r14,r14,r6
+    vmov S11,r14
+
+    //state:
+    //r0 to r9: c
+    //r10,r11,r12,r14 destroyed
+
+    //reload the table base and the round counter from the stack
+    ldr     r10,[sp,#4]
+
+    ldr     r11,[sp,#8]
+    subs    r11,#1
+    bmi     drygascon128_g_v7m_fpu_x_exit
+
+    str     r11,[sp,#8]
+    b       drygascon128_g_v7m_fpu_x_main_loop
+drygascon128_g_v7m_fpu_x_exit:
+    //update C
+    ldr     r14,[sp,#0]
+    STMIA.W r14,{r0,r1,r2,r3,r4,r5,r6,r7,r8,r9}
+
+    //update R
+    ldr        r11,[sp,#0]
+    adds       r11,r11,#R0
+    VSTMIA.F32 r11, {S10,S11,S12,S13}
+
+    //drop the three stack variables, then restore registers and return
+    add    sp,sp,#12
+    pop    {r4, r5, r6, r7, r8, r9, r10, r11, r12, pc}
+.size	drygascon128_g_v7m_fpu_x, .-drygascon128_g_v7m_fpu_x
+
+.align 2
+.type	drygascon128_f_v7m_fpu_x, %function
+drygascon128_f_v7m_fpu_x:
+    //Mix a 16-byte input block and the domain separator into C, 10 bits at a
+    //time with one core round between chunks, then continue with the G rounds
+    //by branching into drygascon128_g_v7m_fpu_x_main_loop (which also returns).
+    //r0:state
+    //r1:input
+    //r2:ds
+    //r3:rounds
+    push	{r4, r5, r6, r7, r8, r9, r10, r11, r12, lr}
+
+    //stack frame:
+    //0: pointer on input
+    //4: DS value
+    //8 :pointer on state
+    //12 : rounds for g
+    //16 :mix round / g round
+
+    movs    r10,#0 //init of input bit counter
+    push    {r0,r3,r10} //same layout as the stack frame of drygascon128_g_v7m_fpu_x
+    push    {r1,r2}
+    //r=0
+    //Zero R through a core register (r10 is 0 here). "VSUB.F32 Sn,Sn,Sn" is
+    //not a reliable clear: if Sn holds a NaN or infinity bit pattern the
+    //result is NaN, and S10..S13 contain arbitrary leftover data at this point.
+    vmov    S10,r10
+    vmov    S11,r10
+    vmov    S12,r10
+    vmov    S13,r10
+
+    //Load C
+    adds	r11,r0,#C0
+    LDMIA.W r11,{r0,r1,r2,r3,r4,r5,r6,r7,r8,r9}
+
+    //Load X (four 32-bit words kept in s0..s3 for the selections below)
+    adds       r11,#X0
+    VLDMIA.F32 r11, {s0,s1,s2,s3}
+
+drygascon128_f_v7m_fpu_x_mix128_main_loop:
+    //r10 is input bit counter
+    ldr     r11,[sp,#0] //r11 is pointer on input
+
+    //r10 r12 shift
+    // 0   0   0
+    // 10  1   2
+    // 20  2   4
+    // 30  3   6
+    // 40  5   0
+    // 50  6   2
+    // 60  7   4
+    // 70  8   6
+    // 80  10  0
+    // 90  11  2
+    // 100 12  4
+    // 110 13  6
+    // 120 15  0
+    // 130 16  2 --> we do that operation for 2 last bits in a special last loop
+
+    cmp     r10,#120
+    bne     drygascon128_f_v7m_fpu_x_mix128_main_loop.regular
+
+    //we execute this only during the penultimate operation
+    //byte 15 provides bits 0..7, the 2 lsb of DS end up in bits 8..9 of r14
+    ldrb    r14,[r11,#15]
+    ldr     r10,[sp,#4]
+    lsl     r10,r10,#8
+    eors    r14,r14,r10
+    b       drygascon128_f_v7m_fpu_x_mix128_main_loop.core
+
+drygascon128_f_v7m_fpu_x_mix128_main_loop.regular:
+    //r12 is base byte: byte offset to read from input buffer
+    lsr     r12,r10,#3  //divide by 8 to get base byte
+    //r10 becomes shift
+    lsl     r14,r12,#3
+    sub     r10,r10,r14
+
+    //The shift is at most 6 and only 10 bits are consumed, so a halfword is
+    //enough. A word load at base byte 13 would read one byte past the
+    //16-byte input. Like the word load it replaces, this relies on unaligned
+    //access support.
+    ldrh    r14,[r11,r12]
+    lsr     r14,r14,r10
+
+drygascon128_f_v7m_fpu_x_mix128_main_loop.core:
+    //Each pair of bits of r14 selects one of the four X words without a
+    //data-dependent memory access: the upper bit of the pair picks D0 (s0,s1)
+    //or D1 (s2,s3) into D2 (s4,s5), the lower bit then picks s4 or s5 into s6.
+
+    //bits 1:0 -> C0L
+    tst     r14,#2
+    VSELEQ.F64 D2, D0, D1
+    tst     r14,#1
+    VSELEQ.F32 S6, S4, S5
+    VMOV    r11,S6
+    eors    r0,r0,r11
+
+    //bits 3:2 -> C1L
+    tst     r14,#8
+    VSELEQ.F64 D2, D0, D1
+    tst     r14,#4
+    VSELEQ.F32 S6, S4, S5
+    VMOV    r11,S6
+    eors    r2,r2,r11
+
+    //bits 5:4 -> C2L
+    tst     r14,#32
+    VSELEQ.F64 D2, D0, D1
+    tst     r14,#16
+    VSELEQ.F32 S6, S4, S5
+    VMOV    r11,S6
+    eors    r4,r4,r11
+
+    //bits 7:6 -> C3L
+    tst     r14,#128
+    VSELEQ.F64 D2, D0, D1
+    tst     r14,#64
+    VSELEQ.F32 S6, S4, S5
+    VMOV    r11,S6
+    eors    r6,r6,r11
+
+    //bits 9:8 -> C4L
+    tst     r14,#512
+    VSELEQ.F64 D2, D0, D1
+    tst     r14,#256
+    VSELEQ.F32 S6, S4, S5
+    VMOV    r11,S6
+    eors    r8,r8,r11
+
+    //advance the bit counter; after the 14th chunk (140 bits) go to the G rounds
+    ldr		r10,[sp,#16]
+    adds    r10,#10
+    cmp     r10,#140
+    beq     drygascon128_f_v7m_fpu_x_mix128_exit
+drygascon128_f_v7m_fpu_x_mix128_coreround:
+    str     r10,[sp,#16]
+
+    movs    r11,#0xf0
+
+    //state:
+    //r0 to r9: c
+    //r11: constant to add as round constant
+
+    // addition of round constant
+    //C2L ^= round constant;
+    eors    r4,r4,r11
+
+    // substitution layer, lower half
+    eors    r0,r0,r8
+    eors    r8,r8,r6
+    eors    r4,r4,r2
+    mvns    r10,r0
+    mvns    r11,r6
+    mvns    r12,r8
+    ands    r10,r10,r2
+    ands    r11,r11,r8
+    eors    r8,r8,r10
+    ands    r12,r12,r0
+    mvns    r10,r4
+    ands    r10,r10,r6
+    eors    r6,r6,r12
+    mvns    r12,r2
+    ands    r12,r12,r4
+    eors    r4,r4,r11
+    eors    r6,r6,r4
+    mvns    r4,r4
+    eors    r0,r0,r12
+    eors    r2,r2,r10
+    eors    r2,r2,r0
+    eors    r0,r0,r8
+
+    // substitution layer, upper half
+    eors    r1,r1,r9
+    eors    r9,r9,r7
+    eors    r5,r5,r3
+    mvns    r10,r1
+    mvns    r11,r7
+    mvns    r12,r9
+    ands    r10,r10,r3
+    ands    r11,r11,r9
+    eors    r9,r9,r10
+    ands    r12,r12,r1
+    mvns    r10,r5
+    ands    r10,r10,r7
+    eors    r7,r7,r12
+    mvns    r12,r3
+    ands    r12,r12,r5
+    eors    r5,r5,r11
+    eors    r7,r7,r5
+    mvns    r5,r5
+    eors    r1,r1,r12
+    eors    r3,r3,r10
+    eors    r3,r3,r1
+    eors    r1,r1,r9
+
+
+    // linear diffusion layer
+
+    //c4 ^= gascon_rotr64_interleaved(c4, 40) ^ gascon_rotr64_interleaved(c4, 7);
+    //c4 high part
+    rors    r11,r9,#(20)
+    eors    r9,r11,r9
+    rors    r10,r8,#(4)
+    eors    r9,r10,r9
+    //c4 low part
+    rors    r11,r11,#((32-20+3)%32)
+    eors    r11,r11,r8
+    rors    r10,r8,#(20)
+    eors    r8,r10,r11
+
+    //c0 ^= gascon_rotr64_interleaved(c0, 28) ^ gascon_rotr64_interleaved(c0, 19);
+    //c0 high part
+    rors    r11,r1,#(14)
+    eors    r1,r11,r1
+    rors    r10,r0,#(10)
+    eors    r1,r10,r1
+    //c0 low part
+    rors    r11,r11,#((32-14+9)%32)
+    eors    r11,r11,r0
+    rors    r10,r0,#(14)
+    eors    r0,r10,r11
+
+    //c1 ^= gascon_rotr64_interleaved(c1, 38) ^ gascon_rotr64_interleaved(c1, 61);
+    //c1 high part
+    rors    r11,r3,#(19)
+    eors    r3,r11,r3
+    rors    r10,r2,#(31)
+    eors    r3,r10,r3
+    //c1 low part
+    rors    r11,r11,#((32-19+30)%32)
+    eors    r11,r11,r2
+    rors    r10,r2,#(19)
+    eors    r2,r10,r11
+
+    //c2 ^= gascon_rotr64_interleaved(c2, 6) ^ gascon_rotr64_interleaved(c2, 1);
+    //c2 high part
+    rors    r11,r5,#(3)
+    eors    r5,r11,r5
+    rors    r10,r4,#(1)
+    eors    r5,r10,r5
+    //c2 low part
+    rors    r11,r11,#((32-3+0)%32)
+    eors    r11,r11,r4
+    rors    r10,r4,#(3)
+    eors    r4,r10,r11
+
+    //c3 ^= gascon_rotr64_interleaved(c3, 10) ^ gascon_rotr64_interleaved(c3, 17);
+    //c3 high part
+    rors    r11,r7,#(5)
+    eors    r7,r11,r7
+    rors    r10,r6,#(9)
+    eors    r7,r10,r7
+    //c3 low part
+    rors    r11,r11,#((32-5+8)%32)
+    eors    r11,r11,r6
+    rors    r10,r6,#(5)
+    eors    r6,r10,r11
+
+    //state:
+    //r0 to r9: c
+    //r10,r11,r12 destroyed
+
+    //reload the bit counter; 130 means only the last chunk is left
+    ldr		r10,[sp,#16]
+    cmp     r10,#130
+    bne     drygascon128_f_v7m_fpu_x_mix128_main_loop
+    //prepare the last loop: load DS 2 msb
+    ldr     r14,[sp,#4]
+    lsr     r14,r14,#2
+    b       drygascon128_f_v7m_fpu_x_mix128_main_loop.core
+
+drygascon128_f_v7m_fpu_x_mix128_exit:
+    ldr     r14,[sp,#12]
+    //round=r11=rounds-1;
+    subs    r11,r14,#1
+    //base = round_cst+12-rounds
+    adr		r10, round_cst
+    adds    r10,r10,#12
+    subs    r10,r10,r14
+
+    //overwrite the two upper slots with base and round, then drop the
+    //input/DS slots so the frame is {state, base, round} as the G loop expects
+    str     r10,[sp,#12]
+    str     r11,[sp,#16]
+
+    add		sp,sp,#8
+    b       drygascon128_g_v7m_fpu_x_main_loop
+
+//round constants ((0xf - round) << 4) | round, stored for round 11 down to round 0
+.align 2
+round_cst:
+.byte 0x4b
+.byte 0x5a
+.byte 0x69
+.byte 0x78
+.byte 0x87
+.byte 0x96
+.byte 0xa5
+.byte 0xb4
+.byte 0xc3
+.byte 0xd2
+.byte 0xe1
+.byte 0xf0
+.align 2
+
+.size	drygascon128_f_v7m_fpu_x, .-drygascon128_f_v7m_fpu_x
+
+.type	drygascon128_g0_v7m_fpu_x, %function
+drygascon128_g0_v7m_fpu_x:
+    //perform a single core round (round constant 0xf0) on C, without accumulating anything into R
+    //r0: state; the ten 32-bit words of C are loaded from state+C0 and stored back to the same place
+    push {r4, r5, r6, r7, r8, r9, r10, r11, r12, lr}
+
+    //Load C
+    adds    r14,r0,#C0 //r14 keeps the address of C until the final STMIA
+    LDMIA.W r14,{r0,r1,r2,r3,r4,r5,r6,r7,r8,r9}
+
+    //r0~r9: c, as pairs: (r0,r1)=c0 low/high, (r2,r3)=c1, (r4,r5)=c2, (r6,r7)=c3, (r8,r9)=c4
+
+    //r11 = ((0xf - 0) << 4) | 0;
+    movs    r11,#0xf0
+    //state:
+    //r0 to r9: c
+    //r11: constant to add as round constant
+
+    // addition of round constant
+    //C2L ^= round constant;
+    eors    r4,r4,r11
+
+    // substitution layer, lower half (even registers; r10,r11,r12 are temporaries)
+    eors    r0,r0,r8 //c0 ^= c4
+    eors    r8,r8,r6 //c4 ^= c3
+    eors    r4,r4,r2 //c2 ^= c1
+    mvns    r10,r0
+    mvns    r11,r6
+    mvns    r12,r8
+    ands    r10,r10,r2
+    ands    r11,r11,r8
+    eors    r8,r8,r10
+    ands    r12,r12,r0
+    mvns    r10,r4
+    ands    r10,r10,r6
+    eors    r6,r6,r12
+    mvns    r12,r2
+    ands    r12,r12,r4
+    eors    r4,r4,r11
+    eors    r6,r6,r4
+    mvns    r4,r4 //c2 = ~c2
+    eors    r0,r0,r12
+    eors    r2,r2,r10
+    eors    r2,r2,r0
+    eors    r0,r0,r8
+
+    // substitution layer, upper half (same sequence on the odd registers)
+    eors    r1,r1,r9
+    eors    r9,r9,r7
+    eors    r5,r5,r3
+    mvns    r10,r1
+    mvns    r11,r7
+    mvns    r12,r9
+    ands    r10,r10,r3
+    ands    r11,r11,r9
+    eors    r9,r9,r10
+    ands    r12,r12,r1
+    mvns    r10,r5
+    ands    r10,r10,r7
+    eors    r7,r7,r12
+    mvns    r12,r3
+    ands    r12,r12,r5
+    eors    r5,r5,r11
+    eors    r7,r7,r5
+    mvns    r5,r5
+    eors    r1,r1,r12
+    eors    r3,r3,r10
+    eors    r3,r3,r1
+    eors    r1,r1,r9
+
+
+    // linear diffusion layer (each 64-bit rotation is done as two 32-bit rotations on the word pair)
+
+    //c4 ^= gascon_rotr64_interleaved(c4, 40) ^ gascon_rotr64_interleaved(c4, 7);
+    //c4 high part
+    rors    r11,r9,#(20)
+    eors    r9,r11,r9
+    rors    r10,r8,#(4)
+    eors    r9,r10,r9
+    //c4 low part
+    rors    r11,r11,#((32-20+3)%32) //r11 already holds the old high word rotated by 20; net rotation is 3
+    eors    r11,r11,r8
+    rors    r10,r8,#(20)
+    eors    r8,r10,r11
+
+    //c0 ^= gascon_rotr64_interleaved(c0, 28) ^ gascon_rotr64_interleaved(c0, 19);
+    //c0 high part
+    rors    r11,r1,#(14)
+    eors    r1,r11,r1
+    rors    r10,r0,#(10)
+    eors    r1,r10,r1
+    //c0 low part
+    rors    r11,r11,#((32-14+9)%32)
+    eors    r11,r11,r0
+    rors    r10,r0,#(14)
+    eors    r0,r10,r11
+
+    //c1 ^= gascon_rotr64_interleaved(c1, 38) ^ gascon_rotr64_interleaved(c1, 61);
+    //c1 high part
+    rors    r11,r3,#(19)
+    eors    r3,r11,r3
+    rors    r10,r2,#(31)
+    eors    r3,r10,r3
+    //c1 low part
+    rors    r11,r11,#((32-19+30)%32)
+    eors    r11,r11,r2
+    rors    r10,r2,#(19)
+    eors    r2,r10,r11
+
+    //c2 ^= gascon_rotr64_interleaved(c2, 6) ^ gascon_rotr64_interleaved(c2, 1);
+    //c2 high part
+    rors    r11,r5,#(3)
+    eors    r5,r11,r5
+    rors    r10,r4,#(1)
+    eors    r5,r10,r5
+    //c2 low part
+    rors    r11,r11,#((32-3+0)%32)
+    eors    r11,r11,r4
+    rors    r10,r4,#(3)
+    eors    r4,r10,r11
+
+    //c3 ^= gascon_rotr64_interleaved(c3, 10) ^ gascon_rotr64_interleaved(c3, 17);
+    //c3 high part
+    rors    r11,r7,#(5)
+    eors    r7,r11,r7
+    rors    r10,r6,#(9)
+    eors    r7,r10,r7
+    //c3 low part
+    rors    r11,r11,#((32-5+8)%32)
+    eors    r11,r11,r6
+    rors    r10,r6,#(5)
+    eors    r6,r10,r11
+
+    //state:
+    //r0 to r9: c
+    //r10,r11,r12 were used as scratch; their entry values are restored by the pop below
+
+    //update C
+    STMIA.W r14,{r0,r1,r2,r3,r4,r5,r6,r7,r8,r9}
+
+    pop    {r4, r5, r6, r7, r8, r9, r10, r11, r12, pc} //restores callee registers and returns (lr popped into pc)
+.size	drygascon128_g0_v7m_fpu_x, .-drygascon128_g0_v7m_fpu_x
+#endif
--- a/drygascon/Implementations/crypto_aead/drygascon128/add_arm_cortex-m/drygascon128_arm_selector.h
+++ b/drygascon/Implementations/crypto_aead/drygascon128/add_arm_cortex-m/drygascon128_arm_selector.h
+#ifndef __DRYGASCON_ARM_SELECTOR_H__
+#define __DRYGASCON_ARM_SELECTOR_H__
+/*
+ * Optional file to select the best implementation for each chip.
+ *
+ * Step 1 maps a chip macro to architecture/feature selector macros.
+ * Step 2 maps the selector macros to the names of the assembly routines
+ * (DRYGASCON_G_OPT, DRYGASCON_F_OPT and, where available, DRYGASCON_G0_OPT).
+ * When no chip macro matches, none of the DRYGASCON_*_OPT names is defined.
+ */
+
+/* Step 1: chips using the v7m routines with FPU */
+#if defined(STM32H743xx) || defined(STM32F746xx)
+    #define __DRYGASCON_ARM_SELECTOR_V7M__
+    #define __DRYGASCON_ARM_SELECTOR_FPU__
+#endif
+
+/* Step 1: chips using the v7m routines without FPU */
+#if defined(STM32F103xx) || defined(__SAM3X8E__)
+    #define __DRYGASCON_ARM_SELECTOR_V7M__
+#endif
+
+/* Step 1: chips using the v6m routines */
+#if defined(STM32L011xx)
+    #define __DRYGASCON_ARM_SELECTOR_V6M__
+#endif
+
+//TODO: add more chips here
+
+/* Step 2: v7m, with or without FPU */
+#if defined(__DRYGASCON_ARM_SELECTOR_V7M__) && defined(__DRYGASCON_ARM_SELECTOR_FPU__)
+    #define DRYGASCON_G_OPT   drygascon128_g_v7m_fpu
+    #define DRYGASCON_F_OPT   drygascon128_f_v7m_fpu
+    #define DRYGASCON_G0_OPT drygascon128_g0_v7m_fpu
+#elif defined(__DRYGASCON_ARM_SELECTOR_V7M__)
+    #define DRYGASCON_G_OPT   drygascon128_g_v7m
+    #define DRYGASCON_F_OPT   drygascon128_f_v7m
+    #define DRYGASCON_G0_OPT drygascon128_g0_v7m
+#endif
+
+/* Step 2: v6m (no G0 routine selected; input must be 32-bit aligned) */
+#if defined(__DRYGASCON_ARM_SELECTOR_V6M__)
+    #define DRYGASCON_G_OPT   drygascon128_g_v6m
+    #define DRYGASCON_F_OPT   drygascon128_f_v6m
+    //#define DRYGASCON_G0_OPT drygascon128_g0_v6m
+    #define DRYGASCON_ALIGN_INPUT_32
+#endif
+
+#endif
--- a/drygascon/Implementations/crypto_aead/drygascon128/add_arm_cortex-m/encrypt.c
+++ b/drygascon/Implementations/crypto_aead/drygascon128/add_arm_cortex-m/encrypt.c
+#include "drygascon.h"
+
+/*
+ * AEAD encryption entry point: forwards every argument unchanged to
+ * drygascon128_aead_encrypt() and returns its result.
+ */
+int crypto_aead_encrypt
+    (unsigned char *c, unsigned long long *clen,
+     const unsigned char *m, unsigned long long mlen,
+     const unsigned char *ad, unsigned long long adlen,
+     const unsigned char *nsec,
+     const unsigned char *npub,
+     const unsigned char *k)
+{
+    int status;
+
+    status = drygascon128_aead_encrypt(c, clen, m, mlen, ad, adlen,
+                                       nsec, npub, k);
+    return status;
+}
+
+/*
+ * AEAD decryption entry point: forwards every argument unchanged to
+ * drygascon128_aead_decrypt() and returns its result.
+ */
+int crypto_aead_decrypt
+    (unsigned char *m, unsigned long long *mlen,
+     unsigned char *nsec,
+     const unsigned char *c, unsigned long long clen,
+     const unsigned char *ad, unsigned long long adlen,
+     const unsigned char *npub,
+     const unsigned char *k)
+{
+    int status;
+
+    status = drygascon128_aead_decrypt(m, mlen, nsec, c, clen, ad, adlen,
+                                       npub, k);
+    return status;
+}
--- a/drygascon/Implementations/crypto_aead/drygascon128/add_arm_cortex-m/implementors
+++ b/drygascon/Implementations/crypto_aead/drygascon128/add_arm_cortex-m/implementors
+Rhys Weatherley
+Sebastien Riou
--- a/drygascon/Implementations/crypto_aead/drygascon128/add_arm_cortex-m/internal-drysponge.c
+++ b/drygascon/Implementations/crypto_aead/drygascon128/add_arm_cortex-m/internal-drysponge.c
+/*
+ * Copyright (C) 2020 Southern Storm Software, Pty Ltd.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ */
+
+#include "internal-drysponge.h"
+#include <string.h>
+
+#if !defined(__AVR__)
+
+/* Right rotations in bit-interleaved format */
+/* Rotates both 32-bit halves of the 64-bit value x right by "bits" and
+ * recombines them in the same positions.  The argument x is evaluated
+ * exactly once, so expressions with side effects are safe. */
+#define intRightRotateEven(x,bits) \
+    (__extension__ ({ \
+        uint64_t _x = (x); \
+        uint32_t _x0 = (uint32_t)_x; \
+        uint32_t _x1 = (uint32_t)(_x >> 32); \
+        _x0 = rightRotate(_x0, (bits)); \
+        _x1 = rightRotate(_x1, (bits)); \
+        _x0 | (((uint64_t)_x1) << 32); \
+    }))
+/* Rotates the low 32-bit half of x right by (bits + 1) % 32 and the high
+ * half by "bits", then swaps the two halves in the result.  The argument x
+ * is evaluated exactly once, so expressions with side effects are safe. */
+#define intRightRotateOdd(x,bits) \
+    (__extension__ ({ \
+        uint64_t _x = (x); \
+        uint32_t _x0 = (uint32_t)_x; \
+        uint32_t _x1 = (uint32_t)(_x >> 32); \
+        _x0 = rightRotate(_x0, ((bits) + 1) % 32); \
+        _x1 = rightRotate(_x1, (bits)); \
+        _x1 | (((uint64_t)_x0) << 32); \
+    }))
+/* Rotates the low 32-bit half of x right by one bit (rightRotate1), leaves
+ * the high half as is, and swaps the two halves in the result.  The
+ * argument x is evaluated exactly once. */
+#define intRightRotate1_64(x) \
+    (__extension__ ({ \
+        uint64_t _x = (x); \
+        uint32_t _x0 = (uint32_t)_x; \
+        uint32_t _x1 = (uint32_t)(_x >> 32); \
+        _x0 = rightRotate1(_x0); \
+        _x1 | (((uint64_t)_x0) << 32); \
+    }))
+#define intRightRotate2_64(x)  (intRightRotateEven((x), 1)) /* even n: intRightRotateEven with n / 2 */
+#define intRightRotate3_64(x)  (intRightRotateOdd((x), 1)) /* odd n: intRightRotateOdd with (n - 1) / 2 */
+#define intRightRotate4_64(x)  (intRightRotateEven((x), 2))
+#define intRightRotate5_64(x)  (intRightRotateOdd((x), 2))
+#define intRightRotate6_64(x)  (intRightRotateEven((x), 3))
+#define intRightRotate7_64(x)  (intRightRotateOdd((x), 3))
+#define intRightRotate8_64(x)  (intRightRotateEven((x), 4))
+#define intRightRotate9_64(x)  (intRightRotateOdd((x), 4))
+#define intRightRotate10_64(x) (intRightRotateEven((x), 5))
+#define intRightRotate11_64(x) (intRightRotateOdd((x), 5))
+#define intRightRotate12_64(x) (intRightRotateEven((x), 6))
+#define intRightRotate13_64(x) (intRightRotateOdd((x), 6))
+#define intRightRotate14_64(x) (intRightRotateEven((x), 7))
+#define intRightRotate15_64(x) (intRightRotateOdd((x), 7))
+#define intRightRotate16_64(x) (intRightRotateEven((x), 8))
+#define intRightRotate17_64(x) (intRightRotateOdd((x), 8))
+#define intRightRotate18_64(x) (intRightRotateEven((x), 9))
+#define intRightRotate19_64(x) (intRightRotateOdd((x), 9))
+#define intRightRotate20_64(x) (intRightRotateEven((x), 10))
+#define intRightRotate21_64(x) (intRightRotateOdd((x), 10))
+#define intRightRotate22_64(x) (intRightRotateEven((x), 11))
+#define intRightRotate23_64(x) (intRightRotateOdd((x), 11))
+#define intRightRotate24_64(x) (intRightRotateEven((x), 12))
+#define intRightRotate25_64(x) (intRightRotateOdd((x), 12))
+#define intRightRotate26_64(x) (intRightRotateEven((x), 13))
+#define intRightRotate27_64(x) (intRightRotateOdd((x), 13))
+#define intRightRotate28_64(x) (intRightRotateEven((x), 14))
+#define intRightRotate29_64(x) (intRightRotateOdd((x), 14))
+#define intRightRotate30_64(x) (intRightRotateEven((x), 15))
+#define intRightRotate31_64(x) (intRightRotateOdd((x), 15))
+#define intRightRotate32_64(x) (intRightRotateEven((x), 16))
+#define intRightRotate33_64(x) (intRightRotateOdd((x), 16))
+#define intRightRotate34_64(x) (intRightRotateEven((x), 17))
+#define intRightRotate35_64(x) (intRightRotateOdd((x), 17))
+#define intRightRotate36_64(x) (intRightRotateEven((x), 18))
+#define intRightRotate37_64(x) (intRightRotateOdd((x), 18))
+#define intRightRotate38_64(x) (intRightRotateEven((x), 19))
+#define intRightRotate39_64(x) (intRightRotateOdd((x), 19))
+#define intRightRotate40_64(x) (intRightRotateEven((x), 20))
+#define intRightRotate41_64(x) (intRightRotateOdd((x), 20))
+#define intRightRotate42_64(x) (intRightRotateEven((x), 21))
+#define intRightRotate43_64(x) (intRightRotateOdd((x), 21))
+#define intRightRotate44_64(x) (intRightRotateEven((x), 22))
+#define intRightRotate45_64(x) (intRightRotateOdd((x), 22))
+#define intRightRotate46_64(x) (intRightRotateEven((x), 23))
+#define intRightRotate47_64(x) (intRightRotateOdd((x), 23))
+#define intRightRotate48_64(x) (intRightRotateEven((x), 24))
+#define intRightRotate49_64(x) (intRightRotateOdd((x), 24))
+#define intRightRotate50_64(x) (intRightRotateEven((x), 25))
+#define intRightRotate51_64(x) (intRightRotateOdd((x), 25))
+#define intRightRotate52_64(x) (intRightRotateEven((x), 26))
+#define intRightRotate53_64(x) (intRightRotateOdd((x), 26))
+#define intRightRotate54_64(x) (intRightRotateEven((x), 27))
+#define intRightRotate55_64(x) (intRightRotateOdd((x), 27))
+#define intRightRotate56_64(x) (intRightRotateEven((x), 28))
+#define intRightRotate57_64(x) (intRightRotateOdd((x), 28))
+#define intRightRotate58_64(x) (intRightRotateEven((x), 29))
+#define intRightRotate59_64(x) (intRightRotateOdd((x), 29))
+#define intRightRotate60_64(x) (intRightRotateEven((x), 30))
+#define intRightRotate61_64(x) (intRightRotateOdd((x), 30))
+#define intRightRotate62_64(x) (intRightRotateEven((x), 31))
+#define intRightRotate63_64(x) (intRightRotateOdd((x), 31))
+
+#ifdef DRYGASCON_G0_OPT
+/* Routine selected through DRYGASCON_G0_OPT; defined outside this file. */
+void DRYGASCON_G0_OPT(drysponge128_state_t *state);
+
+/* gascon128_g0: hands the whole state to the selected DRYGASCON_G0_OPT routine. */
+static void gascon128_g0(drysponge128_state_t *state)
+{
+    DRYGASCON_G0_OPT(state);
+}
+#else
+/*
+ * Applies one GASCON-128 core round to the five 64-bit words of "state":
+ * round constant addition on word 2, the substitution layer, then the
+ * linear diffusion layer.  "round" is the round number used to build the
+ * round constant.
+ */
+void gascon128_core_round(gascon128_state_t *state, uint8_t round)
+{
+    uint64_t w0, w1, w2, w3, w4;    /* working copy of the state words */
+    uint64_t n0, n1, n2, n3, n4;    /* (~w[i]) & w[i+1] terms */
+
+    /* Fetch the state words */
+#if defined(LW_UTIL_LITTLE_ENDIAN)
+    w0 = state->S[0];
+    w1 = state->S[1];
+    w2 = state->S[2];
+    w3 = state->S[3];
+    w4 = state->S[4];
+#else
+    w0 = le_load_word64(state->B);
+    w1 = le_load_word64(state->B + 8);
+    w2 = le_load_word64(state->B + 16);
+    w3 = le_load_word64(state->B + 24);
+    w4 = le_load_word64(state->B + 32);
+#endif
+
+    /* Round constant goes into the middle word */
+    w2 ^= ((0x0F - round) << 4) | round;
+
+    /* Substitution layer: pre-mix */
+    w0 ^= w4;
+    w2 ^= w1;
+    w4 ^= w3;
+
+    /* Substitution layer: non-linear terms, all taken before any update */
+    n0 = (~w0) & w1;
+    n1 = (~w1) & w2;
+    n2 = (~w2) & w3;
+    n3 = (~w3) & w4;
+    n4 = (~w4) & w0;
+    w0 ^= n1;
+    w1 ^= n2;
+    w2 ^= n3;
+    w3 ^= n4;
+    w4 ^= n0;
+
+    /* Substitution layer: post-mix */
+    w1 ^= w0;
+    w3 ^= w2;
+    w0 ^= w4;
+    w2 = ~w2;
+
+    /* Linear diffusion layer */
+    w0 ^= intRightRotate19_64(w0) ^ intRightRotate28_64(w0);
+    w1 ^= intRightRotate61_64(w1) ^ intRightRotate38_64(w1);
+    w2 ^= intRightRotate1_64(w2)  ^ intRightRotate6_64(w2);
+    w3 ^= intRightRotate10_64(w3) ^ intRightRotate17_64(w3);
+    w4 ^= intRightRotate7_64(w4)  ^ intRightRotate40_64(w4);
+
+    /* Store the updated words */
+#if defined(LW_UTIL_LITTLE_ENDIAN)
+    state->S[0] = w0;
+    state->S[1] = w1;
+    state->S[2] = w2;
+    state->S[3] = w3;
+    state->S[4] = w4;
+#else
+    le_store_word64(state->B,      w0);
+    le_store_word64(state->B +  8, w1);
+    le_store_word64(state->B + 16, w2);
+    le_store_word64(state->B + 24, w3);
+    le_store_word64(state->B + 32, w4);
+#endif
+}
+
+/* gascon128_g0: one core round, round number 0, applied to the "c" part of the state. */
+static void gascon128_g0(drysponge128_state_t *state)
+{
+    gascon128_core_round(&state->c, 0);
+}
+#endif
+
+/*
+ * Applies one GASCON-256 core round to the nine 64-bit words of "state":
+ * round constant addition on word 4, the substitution layer, then the
+ * linear diffusion layer.  "round" is the round number used to build the
+ * round constant.
+ */
+void gascon256_core_round(gascon256_state_t *state, uint8_t round)
+{
+    uint64_t w0, w1, w2, w3, w4, w5, w6, w7, w8;    /* working copy */
+    uint64_t n0, n1, n2, n3, n4, n5, n6, n7, n8;    /* (~w[i]) & w[i+1] */
+
+    /* Fetch the state words */
+#if defined(LW_UTIL_LITTLE_ENDIAN)
+    w0 = state->S[0];
+    w1 = state->S[1];
+    w2 = state->S[2];
+    w3 = state->S[3];
+    w4 = state->S[4];
+    w5 = state->S[5];
+    w6 = state->S[6];
+    w7 = state->S[7];
+    w8 = state->S[8];
+#else
+    w0 = le_load_word64(state->B);
+    w1 = le_load_word64(state->B + 8);
+    w2 = le_load_word64(state->B + 16);
+    w3 = le_load_word64(state->B + 24);
+    w4 = le_load_word64(state->B + 32);
+    w5 = le_load_word64(state->B + 40);
+    w6 = le_load_word64(state->B + 48);
+    w7 = le_load_word64(state->B + 56);
+    w8 = le_load_word64(state->B + 64);
+#endif
+
+    /* Round constant goes into the middle word */
+    w4 ^= ((0x0F - round) << 4) | round;
+
+    /* Substitution layer: pre-mix */
+    w0 ^= w8;
+    w2 ^= w1;
+    w4 ^= w3;
+    w6 ^= w5;
+    w8 ^= w7;
+
+    /* Substitution layer: non-linear terms, all taken before any update */
+    n0 = (~w0) & w1;
+    n1 = (~w1) & w2;
+    n2 = (~w2) & w3;
+    n3 = (~w3) & w4;
+    n4 = (~w4) & w5;
+    n5 = (~w5) & w6;
+    n6 = (~w6) & w7;
+    n7 = (~w7) & w8;
+    n8 = (~w8) & w0;
+    w0 ^= n1;
+    w1 ^= n2;
+    w2 ^= n3;
+    w3 ^= n4;
+    w4 ^= n5;
+    w5 ^= n6;
+    w6 ^= n7;
+    w7 ^= n8;
+    w8 ^= n0;
+
+    /* Substitution layer: post-mix */
+    w1 ^= w0;
+    w3 ^= w2;
+    w5 ^= w4;
+    w7 ^= w6;
+    w0 ^= w8;
+    w4 = ~w4;
+
+    /* Linear diffusion layer */
+    w0 ^= intRightRotate19_64(w0) ^ intRightRotate28_64(w0);
+    w1 ^= intRightRotate61_64(w1) ^ intRightRotate38_64(w1);
+    w2 ^= intRightRotate1_64(w2)  ^ intRightRotate6_64(w2);
+    w3 ^= intRightRotate10_64(w3) ^ intRightRotate17_64(w3);
+    w4 ^= intRightRotate7_64(w4)  ^ intRightRotate40_64(w4);
+    w5 ^= intRightRotate31_64(w5) ^ intRightRotate26_64(w5);
+    w6 ^= intRightRotate53_64(w6) ^ intRightRotate58_64(w6);
+    w7 ^= intRightRotate9_64(w7)  ^ intRightRotate46_64(w7);
+    w8 ^= intRightRotate43_64(w8) ^ intRightRotate50_64(w8);
+
+    /* Store the updated words */
+#if defined(LW_UTIL_LITTLE_ENDIAN)
+    state->S[0] = w0;
+    state->S[1] = w1;
+    state->S[2] = w2;
+    state->S[3] = w3;
+    state->S[4] = w4;
+    state->S[5] = w5;
+    state->S[6] = w6;
+    state->S[7] = w7;
+    state->S[8] = w8;
+#else
+    le_store_word64(state->B,      w0);
+    le_store_word64(state->B +  8, w1);
+    le_store_word64(state->B + 16, w2);
+    le_store_word64(state->B + 24, w3);
+    le_store_word64(state->B + 32, w4);
+    le_store_word64(state->B + 40, w5);
+    le_store_word64(state->B + 48, w6);
+    le_store_word64(state->B + 56, w7);
+    le_store_word64(state->B + 64, w8);
+#endif
+}
+
+#ifdef DRYGASCON_G_OPT
+void DRYGASCON_G_OPT(uint64_t* state, uint32_t rounds);
+/* G function of DrySPONGE128, delegated to the routine named by the
+ * DRYGASCON_G_OPT macro.  The state is handed over as a raw uint64_t
+ * pointer; per the original note, that routine is expected to use the
+ * state only to access its c, r and x members. */
+static void drysponge128_g_impl(drysponge128_state_t *state,unsigned int rounds)
+{
+    DRYGASCON_G_OPT((uint64_t*)state,rounds);
+}
+#else
+
+/* Portable G function of DrySPONGE128.  Only the c and r members of the
+ * state are read or written here. */
+static void drysponge128_g_impl(drysponge128_state_t *state,unsigned int rounds)
+{
+    unsigned pass;
+    unsigned word;
+
+    /* The first round always runs and overwrites the rate buffer.
+     * Output word i is capacity word i XOR'ed with capacity word
+     * 4 + ((i + 1) & 3), i.e. the first 16 bytes of the capacity XOR'ed
+     * with the next 16 bytes rotated by 4 bytes */
+    gascon128_core_round(&(state->c), 0);
+    for (word = 0; word < 4; ++word) {
+        state->r.W[word] =
+            state->c.W[word] ^ state->c.W[4 + ((word + 1) & 3)];
+    }
+
+    /* Every further round accumulates into the rate buffer instead */
+    pass = 1;
+    while (pass < rounds) {
+        gascon128_core_round(&(state->c), pass);
+        for (word = 0; word < 4; ++word) {
+            state->r.W[word] ^=
+                state->c.W[word] ^ state->c.W[4 + ((word + 1) & 3)];
+        }
+        ++pass;
+    }
+}
+#endif
+/* Public G function for DrySPONGE128: runs drysponge128_g_impl() with the
+ * round count currently stored in the state. */
+void drysponge128_g(drysponge128_state_t *state)
+{
+    const unsigned int round_count = state->rounds;
+
+    drysponge128_g_impl(state, round_count);
+}
+
+/* G function of DrySPONGE256: runs the core rounds (the first one
+ * unconditionally, then up to state->rounds in total) and leaves the
+ * squeezed 16-byte block in state->r. */
+void drysponge256_g(drysponge256_state_t *state)
+{
+    unsigned pass;
+    unsigned word;
+
+    /* After each round the four 16-byte quarters of the first 64 bytes
+     * of the capacity are XOR'ed together, the second quarter rotated
+     * by 4 bytes, the third by 8 and the fourth by 12.  The first
+     * round overwrites the rate buffer with that value. */
+    gascon256_core_round(&(state->c), 0);
+    for (word = 0; word < 4; ++word) {
+        state->r.W[word] = state->c.W[word] ^
+                           state->c.W[4  + ((word + 1) & 3)] ^
+                           state->c.W[8  + ((word + 2) & 3)] ^
+                           state->c.W[12 + ((word + 3) & 3)];
+    }
+
+    /* The remaining rounds accumulate into the rate buffer */
+    for (pass = 1; pass < state->rounds; ++pass) {
+        gascon256_core_round(&(state->c), pass);
+        for (word = 0; word < 4; ++word) {
+            state->r.W[word] ^= state->c.W[word] ^
+                                state->c.W[4  + ((word + 1) & 3)] ^
+                                state->c.W[8  + ((word + 2) & 3)] ^
+                                state->c.W[12 + ((word + 3) & 3)];
+        }
+    }
+}
+
+#endif /* !__AVR__ */
+
+#ifndef DRYGASCON_G_OPT
+/* Runs state->rounds GASCON-128 core rounds over the capacity without
+ * producing any output in state->r. */
+void drysponge128_g_core(drysponge128_state_t *state)
+{
+    unsigned pass = 0;
+
+    while (pass < state->rounds) {
+        gascon128_core_round(&(state->c), pass);
+        ++pass;
+    }
+}
+#endif
+
+/* Runs state->rounds GASCON-256 core rounds over the capacity without
+ * producing any output in state->r. */
+void drysponge256_g_core(drysponge256_state_t *state)
+{
+    unsigned pass = 0;
+
+    while (pass < state->rounds) {
+        gascon256_core_round(&(state->c), pass);
+        ++pass;
+    }
+}
+
+/**
+ * \fn uint32_t drysponge_select_x(const uint32_t x[4], uint8_t index)
+ * \brief Selects an element of x in constant time.
+ *
+ * \param x Points to the four elements of x.
+ * \param index Index of which element to extract between 0 and 3.
+ *
+ * \return The selected element of x.
+ */
+#if defined(__HAS_CACHE__)
+STATIC_INLINE uint32_t drysponge_select_x(const uint32_t x[4], uint8_t index)
+{
+    /* The index is data-dependent, so x[index] is never fetched directly.
+     * All four words are read and the unwanted ones are masked away,
+     * which should avoid cache timing issues.
+     *
+     * For index values 0..3, (0x04 - (index ^ k)) >> 2 is 1 only when
+     * index == k, so after negation each mask is all-ones for the
+     * selected word and all-zeroes for the other three.
+     *
+     * A power analysis side channel is still possible: combining a word
+     * with an all-ones mask may consume different power than combining
+     * it with an all-zeroes mask, which in theory could allow the
+     * plaintext input to the cipher to be read from the CPU's power
+     * consumption.
+     *
+     * The DryGASCON specification acknowledges the possibility of
+     * plaintext recovery in section 7.4.  As software mitigation it
+     * suggests randomizing the indexes into c and x and the order in
+     * which words are processed.  None of that is done here yet.
+     * Patches welcome to fix this.
+     */
+    uint32_t pick0 = -((uint32_t)((0x04 - index) >> 2));
+    uint32_t pick1 = -((uint32_t)((0x04 - (index ^ 0x01)) >> 2));
+    uint32_t pick2 = -((uint32_t)((0x04 - (index ^ 0x02)) >> 2));
+    uint32_t pick3 = -((uint32_t)((0x04 - (index ^ 0x03)) >> 2));
+
+    return (x[0] & pick0) ^ (x[1] & pick1) ^ (x[2] & pick2) ^ (x[3] & pick3);
+}
+#else
+/* Targets built without __HAS_CACHE__ (AVR, for example) are more or less
+ * immune to cache timing issues because they don't have anything like an
+ * L1 or L2 cache.  Select the word directly */
+#define drysponge_select_x(x, index) ((x)[(index)])
+#endif
+
+#ifndef DRYGASCON_F_OPT
+/**
+ * \brief XORs words of "x" into the even-numbered words of the
+ * DrySPONGE128 capacity.
+ *
+ * \param state DrySPONGE128 state.
+ * \param data Selector value; only the bottom 10 bits are consumed,
+ * as five 2-bit indexes into "x".
+ */
+static void drysponge128_mix_phase_round
+    (drysponge128_state_t *state, uint32_t data)
+{
+    unsigned slot;
+
+    /* Index number "slot" sits at bit 2 * slot of the data and is
+     * mixed into capacity word 2 * slot */
+    for (slot = 0; slot < 5; ++slot) {
+        state->c.W[2 * slot] ^=
+            drysponge_select_x(state->x.W, (data >> (2 * slot)) & 0x03);
+    }
+}
+#endif
+
+/**
+ * \brief XORs words of "x" into the even-numbered words of the
+ * DrySPONGE256 capacity.
+ *
+ * \param state DrySPONGE256 state.
+ * \param data Selector value; only the bottom 18 bits are consumed,
+ * as nine 2-bit indexes into "x".
+ */
+static void drysponge256_mix_phase_round
+    (drysponge256_state_t *state, uint32_t data)
+{
+    unsigned slot;
+
+    /* Index number "slot" sits at bit 2 * slot of the data and is
+     * mixed into capacity word 2 * slot */
+    for (slot = 0; slot < 9; ++slot) {
+        state->c.W[2 * slot] ^=
+            drysponge_select_x(state->x.W, (data >> (2 * slot)) & 0x03);
+    }
+}
+
+#ifndef DRYGASCON_F_OPT
+/**
+ * \brief Mixes an input block into a DrySPONGE128 state.
+ *
+ * \param state The DrySPONGE128 state.
+ * \param data Full rate block containing the input data.
+ * \param ds Domain separator value that is mixed in together with the
+ * last byte of the block and in the final group.
+ */
+static void drysponge128_mix_phase
+    (drysponge128_state_t *state, const unsigned char data[DRYSPONGE128_RATE],unsigned int ds)
+{
+    unsigned offset;
+
+    /* Bytes 0..14 are consumed as three runs of five bytes.  Each run
+     * supplies four 10-bit groups, and every group is followed by one
+     * core round with round constant 0 */
+    for (offset = 0; offset < 15; offset += 5) {
+        const unsigned char *run = data + offset;
+
+        drysponge128_mix_phase_round
+            (state, run[0] | (((uint32_t)(run[1])) << 8));
+        gascon128_core_round(&(state->c), 0);
+        drysponge128_mix_phase_round
+            (state, (run[1] >> 2) | (((uint32_t)(run[2])) << 6));
+        gascon128_core_round(&(state->c), 0);
+        drysponge128_mix_phase_round
+            (state, (run[2] >> 4) | (((uint32_t)(run[3])) << 4));
+        gascon128_core_round(&(state->c), 0);
+        drysponge128_mix_phase_round
+            (state, (run[3] >> 6) | (((uint32_t)(run[4])) << 2));
+        gascon128_core_round(&(state->c), 0);
+    }
+
+    /* Byte 15 is XOR'ed with the domain separator for the next group;
+     * the separator shifted right by 10 bits forms the last group,
+     * which is not followed by a core round */
+    drysponge128_mix_phase_round(state, data[15] ^ ds);
+    gascon128_core_round(&(state->c), 0);
+    drysponge128_mix_phase_round(state, ds >> 10);
+}
+#endif
+
+/**
+ * \brief Mixes an input block into a DrySPONGE256 state.
+ *
+ * \param state The DrySPONGE256 state.  Its domain field is mixed in with
+ * the final group and then reset to 0.
+ * \param data Full rate block containing the input data.
+ */
+static void drysponge256_mix_phase
+    (drysponge256_state_t *state, const unsigned char data[DRYSPONGE256_RATE])
+{
+    unsigned offset;
+
+    /* 18-bit groups realign on every 9th byte (data[0] and data[9]).
+     * The run starting at data[0] holds four groups; the run starting
+     * at data[9] only has room for three before the block ends.
+     * Every group handled here is followed by one core round. */
+    for (offset = 0; offset < 18; offset += 9) {
+        const unsigned char *run = data + offset;
+
+        drysponge256_mix_phase_round
+            (state, run[0] | (((uint32_t)(run[1])) << 8) |
+                    (((uint32_t)(run[2])) << 16));
+        gascon256_core_round(&(state->c), 0);
+        drysponge256_mix_phase_round
+            (state, (run[2] >> 2) | (((uint32_t)(run[3])) << 6) |
+                    (((uint32_t)(run[4])) << 14));
+        gascon256_core_round(&(state->c), 0);
+        drysponge256_mix_phase_round
+            (state, (run[4] >> 4) | (((uint32_t)(run[5])) << 4) |
+                    (((uint32_t)(run[6])) << 12));
+        gascon256_core_round(&(state->c), 0);
+        if (offset == 0) {
+            drysponge256_mix_phase_round
+                (state, (run[6] >> 6) | (((uint32_t)(run[7])) << 2) |
+                        (((uint32_t)(run[8])) << 10));
+            gascon256_core_round(&(state->c), 0);
+        }
+    }
+
+    /* The top two bits of the last byte are XOR'ed with the domain for
+     * the final group, which is not followed by a core round */
+    drysponge256_mix_phase_round
+        (state, (data[15] >> 6) ^ state->domain);
+
+    /* The next block starts from the default domain separator again */
+    state->domain = 0;
+}
+
+#ifdef DRYGASCON_F_OPT
+/* F function of DrySPONGE128, delegated to the routine named by the
+ * DRYGASCON_F_OPT macro; all four arguments are forwarded unchanged. */
+void DRYGASCON_F_OPT(drysponge128_state_t *state, const unsigned char *input,unsigned int ds, unsigned int rounds);
+static void drygascon128_f_impl(drysponge128_state_t *state, const unsigned char *input,unsigned int ds, unsigned int rounds){
+    DRYGASCON_F_OPT(state, input, ds, rounds);
+}
+#else
+/* Portable F function of DrySPONGE128: mixes one full rate block and the
+ * domain separator ds into the state, then runs the G function for the
+ * given number of rounds.
+ *
+ * Kept file-local, exactly like the DRYGASCON_F_OPT variant of this
+ * function: it has no prototype in the header, and other files are
+ * expected to go through drygascon128_f_wrap(). */
+static void drygascon128_f_impl(drysponge128_state_t *state, const unsigned char *input,unsigned int ds, unsigned int rounds){
+    drysponge128_mix_phase(state, input ,ds);
+    drysponge128_g_impl(state,rounds);
+}
+#endif
+/* Runs the DrySPONGE128 F function over one input block of at most
+ * DRYSPONGE128_RATE bytes, using the domain and round count stored in
+ * the state.  A shorter block is padded with a 0x01 byte followed by
+ * zeroes.  The domain is reset to 0 afterwards. */
+void drygascon128_f_wrap(drysponge128_state_t *state, const unsigned char *input, unsigned len){
+    /* Rate-block union used as bounce buffer, to enforce alignment
+     * (if needed by f_impl) */
+    drysponge128_rate_t block;
+    const unsigned char *src = input;
+
+    if (len >= DRYSPONGE128_RATE) {
+#ifdef DRYGASCON_ALIGN_INPUT_32
+        /* Full blocks are copied into the aligned buffer as well */
+        memcpy(block.B, input, DRYSPONGE128_RATE);
+        src = block.B;
+#endif
+    } else {
+        /* Short block: the data, a single 0x01 byte, then zero fill */
+        memcpy(block.B, input, len);
+        block.B[len] = 0x01;
+        memset(block.B + len + 1, 0, DRYSPONGE128_RATE - len - 1);
+        src = block.B;
+    }
+
+    drygascon128_f_impl(state, src, state->domain, state->rounds);
+
+    /* The next block starts from the default domain separator again */
+    state->domain = 0;
+}
+
+/* Absorption half of the DrySPONGE256 F function: mixes one block of at
+ * most DRYSPONGE256_RATE bytes into the state.  A shorter block is
+ * padded with a 0x01 byte followed by zeroes before it is mixed in. */
+void drysponge256_f_absorb
+    (drysponge256_state_t *state, const unsigned char *input, unsigned len)
+{
+    unsigned char padded[DRYSPONGE256_RATE];
+    const unsigned char *block = input;
+
+    if (len < DRYSPONGE256_RATE) {
+        memcpy(padded, input, len);
+        padded[len] = 0x01;
+        memset(padded + len + 1, 0, DRYSPONGE256_RATE - len - 1);
+        block = padded;
+    }
+    drysponge256_mix_phase(state, block);
+}
+
+/**
+ * \brief Checks whether any two of the four words of an "x" value match.
+ *
+ * \param x Points to the "x" buffer to check.
+ *
+ * \return Non-zero if at least one pair of words is equal, zero if all
+ * four words are distinct.
+ *
+ * All six pairs are always examined and no branch depends on the words,
+ * in an attempt to keep the check constant time and avoid giving away
+ * information about the value of the key.
+ */
+static int drysponge_x_words_are_same(const uint32_t x[4])
+{
+    int any_equal = 0;
+    unsigned first = 0;
+
+    while (first < 3) {
+        unsigned second = first + 1;
+        while (second < 4) {
+            uint32_t diff = x[first] ^ x[second];
+            /* 2^32 - diff only keeps bit 32 set when diff is zero */
+            uint64_t borrow = 0x100000000ULL - diff;
+            any_equal |= (int)(borrow >> 32);
+            ++second;
+        }
+        ++first;
+    }
+    return any_equal;
+}
+
+
+/* Returns non-zero when the "x" member of the state starts on a
+ * 16-byte boundary, zero otherwise. */
+int drysponge128_safe_alignement(const drysponge128_state_t*state){
+	const uintptr_t x_address = (uintptr_t)&(state->x);
+
+	return (x_address & 0xF) == 0;
+}
+
+/**
+ * Initializes a DrySPONGE128 state from a key and absorbs the nonce.
+ *
+ * state        The DrySPONGE128 state to set up.
+ * key          Points to keysize bytes of key material.
+ * keysize      Selects how the key is expanded:
+ *              - DRYGASCON128_SAFEKEY_SIZE: the key supplies the whole
+ *                capacity c followed by "x".
+ *              - DRYGASCON128_FASTKEY_SIZE: the first 16 bytes are tiled
+ *                over c and the next 16 bytes become "x".
+ *              - DRYGASCON128_MINKEY_SIZE: the first 16 bytes are tiled
+ *                over c and "x" is derived with gascon128_g0().
+ *              For any other value c is still tiled from the first 16 key
+ *              bytes, but "x" is left exactly as it was on entry.
+ * nonce        Points to the 16 bytes of the nonce.
+ * final_block  Non-zero if no more blocks follow the key setup.
+ *
+ * When "x" comes straight from the key and two of its words are equal,
+ * the key is invalid and this function deliberately never returns.
+ */
+void drysponge128_setup
+    (drysponge128_state_t *state, const unsigned char *key, unsigned int keysize,
+     const unsigned char *nonce, int final_block)
+{
+    if (DRYGASCON128_SAFEKEY_SIZE == keysize) {
+        /* Fill C and X directly with the key */
+        memcpy(state->c.B, key, sizeof(state->c));
+        memcpy(state->x.B, key + sizeof(state->c), sizeof(state->x));
+        if (drysponge_x_words_are_same(state->x.W)) {
+            /* Block here if the key is not valid.  The loop condition is
+             * left out on purpose: a loop with a constant controlling
+             * expression may not be assumed to terminate, so the compiler
+             * has to keep it, unlike "while (check());" with no side
+             * effects in the body */
+            for (;;) {
+            }
+        }
+    } else {
+        /* Fill the GASCON-128 state with repeated copies of the key */
+        memcpy(state->c.B, key, 16);
+        memcpy(state->c.B + 16, key, 16);
+        memcpy(state->c.B + 32, key, 8);
+
+        if (DRYGASCON128_FASTKEY_SIZE == keysize) {
+            /* Fill X with the 16 last bytes of the key */
+            memcpy(state->x.B, key + 16, sizeof(state->x));
+            if (drysponge_x_words_are_same(state->x.W)) {
+                /* Block here if the key is not valid (see above) */
+                for (;;) {
+                }
+            }
+        } else if (DRYGASCON128_MINKEY_SIZE == keysize) {
+            /* Generate the "x" value for the state.  All four words of "x"
+             * must be unique because they will be used in
+             * drysponge_select_x() as stand-ins for the bit pairs 00, 01,
+             * 10, and 11.
+             *
+             * Run the core block operation over and over until "x" is
+             * unique.  Technically the runtime here is key-dependent and
+             * not constant.  If the input key is randomized, this should
+             * only take 1 round on average so it is "almost constant time".
+             */
+            do {
+                gascon128_g0(state);
+            } while (drysponge_x_words_are_same(state->c.W));
+            memcpy(state->x.W, state->c.W, sizeof(state->x));
+
+            /* Replace the generated "x" value in the state with the key prefix */
+            memcpy(state->c.W, key, sizeof(state->x));
+        }
+    }
+
+    /* Absorb the nonce into the state with an increased number of rounds */
+    state->rounds = DRYSPONGE128_INIT_ROUNDS;
+    state->domain = DRYDOMAIN128_NONCE;
+    if (final_block)
+        state->domain |= DRYDOMAIN128_FINAL;
+    drygascon128_f_wrap(state, nonce, 16);
+
+    /* Set up the normal number of rounds for future operations */
+    state->rounds = DRYSPONGE128_ROUNDS;
+}
+
+/* Initializes a DrySPONGE256 state from a 32-byte key and absorbs the
+ * 16-byte nonce.  final_block is non-zero when no more blocks follow the
+ * key setup. */
+void drysponge256_setup
+    (drysponge256_state_t *state, const unsigned char *key,
+     const unsigned char *nonce, int final_block)
+{
+    /* Tile the key over the 72 bytes of the GASCON-256 state: two full
+     * copies followed by the first 8 bytes once more */
+    memcpy(state->c.B, key, 32);
+    memcpy(state->c.B + 32, key, 32);
+    memcpy(state->c.B + 64, key, 8);
+
+    /* Derive "x": run core rounds until the first four words of the
+     * state are pairwise distinct, then take those words */
+    for (;;) {
+        gascon256_core_round(&(state->c), 0);
+        if (!drysponge_x_words_are_same(state->c.W))
+            break;
+    }
+    memcpy(state->x.W, state->c.W, sizeof(state->x));
+
+    /* Put the key prefix back where "x" was generated */
+    memcpy(state->c.W, key, sizeof(state->x));
+
+    /* The nonce is absorbed with the larger initialization round count */
+    state->rounds = DRYSPONGE256_INIT_ROUNDS;
+    state->domain = final_block ? (DRYDOMAIN256_NONCE | DRYDOMAIN256_FINAL)
+                                : DRYDOMAIN256_NONCE;
+    drysponge256_f_absorb(state, nonce, 16);
+    drysponge256_g(state);
+
+    /* Everything after setup uses the normal round count */
+    state->rounds = DRYSPONGE256_ROUNDS;
+}
--- a/drygascon/Implementations/crypto_aead/drygascon128/add_arm_cortex-m/internal-drysponge.h
+++ b/drygascon/Implementations/crypto_aead/drygascon128/add_arm_cortex-m/internal-drysponge.h
+/*
+ * Copyright (C) 2020 Southern Storm Software, Pty Ltd.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ */
+
+#ifndef LW_INTERNAL_DRYSPONGE_H
+#define LW_INTERNAL_DRYSPONGE_H
+
+#include "drygascon.h"
+#include "drygascon128_arm_selector.h"
+
+#include "internal-util.h"
+
+/**
+ * \file internal-drysponge.h
+ * \brief Internal implementation of DrySPONGE for the DryGASCON cipher.
+ *
+ * References: https://github.com/sebastien-riou/DryGASCON
+ */
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/**
+ * \brief Size of the GASCON-128 permutation state in bytes.
+ */
+#define GASCON128_STATE_SIZE 40
+
+/**
+ * \brief Size of the GASCON-256 permutation state in bytes.
+ */
+#define GASCON256_STATE_SIZE 72
+
+/**
+ * \brief Rate of absorption and squeezing for DrySPONGE128.
+ */
+#define DRYSPONGE128_RATE 16
+
+/**
+ * \brief Rate of absorption and squeezing for DrySPONGE256.
+ */
+#define DRYSPONGE256_RATE 16
+
+/**
+ * \brief Size of the "x" value for DrySPONGE128.
+ */
+#define DRYSPONGE128_XSIZE 16
+
+/**
+ * \brief Size of the "x" value for DrySPONGE256.
+ */
+#define DRYSPONGE256_XSIZE 16
+
+/**
+ * \brief Normal number of rounds for DrySPONGE128 when absorbing
+ * and squeezing data.
+ */
+#define DRYSPONGE128_ROUNDS 7
+
+/**
+ * \brief Number of rounds for DrySPONGE128 during initialization.
+ */
+#define DRYSPONGE128_INIT_ROUNDS 11
+
+/**
+ * \brief Normal number of rounds for DrySPONGE256 when absorbing
+ * and squeezing data.
+ */
+#define DRYSPONGE256_ROUNDS 8
+
+/**
+ * \brief Number of rounds for DrySPONGE256 during initialization.
+ */
+#define DRYSPONGE256_INIT_ROUNDS 12
+
+#ifdef DRYGASCON_F_OPT
+
+    /**
+     * \brief DrySPONGE128 domain bit for a padded block.
+     */
+    #define DRYDOMAIN128_PADDED (1 << 0)
+
+    /**
+     * \brief DrySPONGE128 domain bit for a final block.
+     */
+    #define DRYDOMAIN128_FINAL (1 << 1)
+
+    /**
+     * \brief DrySPONGE128 domain value for processing the nonce.
+     */
+    #define DRYDOMAIN128_NONCE (1 << 2)
+
+    /**
+     * \brief DrySPONGE128 domain value for processing the associated data.
+     */
+    #define DRYDOMAIN128_ASSOC_DATA (2 << 2)
+
+    /**
+     * \brief DrySPONGE128 domain value for processing the message.
+     */
+    #define DRYDOMAIN128_MESSAGE (3 << 2)
+
+#else
+
+    /**
+     * \brief DrySPONGE128 domain bit for a padded block.
+     */
+    #define DRYDOMAIN128_PADDED (1 << 8)
+
+    /**
+     * \brief DrySPONGE128 domain bit for a final block.
+     */
+    #define DRYDOMAIN128_FINAL (1 << 9)
+
+    /**
+     * \brief DrySPONGE128 domain value for processing the nonce.
+     */
+    #define DRYDOMAIN128_NONCE (1 << 10)
+
+    /**
+     * \brief DrySPONGE128 domain value for processing the associated data.
+     */
+    #define DRYDOMAIN128_ASSOC_DATA (2 << 10)
+
+
+    /**
+     * \brief DrySPONGE128 domain value for processing the message.
+     */
+    #define DRYDOMAIN128_MESSAGE (3 << 10)
+
+#endif
+
+
+/**
+ * \brief DrySPONGE256 domain bit for a padded block.
+ */
+#define DRYDOMAIN256_PADDED (1 << 2)
+
+/**
+ * \brief DrySPONGE256 domain bit for a final block.
+ */
+#define DRYDOMAIN256_FINAL (1 << 3)
+
+/**
+ * \brief DrySPONGE256 domain value for processing the nonce.
+ */
+#define DRYDOMAIN256_NONCE (1 << 4)
+
+/**
+ * \brief DrySPONGE256 domain value for processing the associated data.
+ */
+#define DRYDOMAIN256_ASSOC_DATA (2 << 4)
+
+/**
+ * \brief DrySPONGE256 domain value for processing the message.
+ */
+#define DRYDOMAIN256_MESSAGE (3 << 4)
+
+/**
+ * \brief Internal state of the GASCON-128 permutation.
+ *
+ * The three members are overlapping views of the same
+ * GASCON128_STATE_SIZE bytes.
+ */
+typedef union
+{
+    uint64_t S[GASCON128_STATE_SIZE / 8];   /**< 64-bit words of the state */
+    uint32_t W[GASCON128_STATE_SIZE / 4];   /**< 32-bit words of the state */
+    uint8_t B[GASCON128_STATE_SIZE];        /**< Bytes of the state */
+
+} gascon128_state_t;
+
+/**
+ * \brief Internal state of the GASCON-256 permutation.
+ *
+ * The three members are overlapping views of the same
+ * GASCON256_STATE_SIZE bytes.
+ */
+typedef union
+{
+    uint64_t S[GASCON256_STATE_SIZE / 8];   /**< 64-bit words of the state */
+    uint32_t W[GASCON256_STATE_SIZE / 4];   /**< 32-bit words of the state */
+    uint8_t B[GASCON256_STATE_SIZE];        /**< Bytes of the state */
+
+} gascon256_state_t;
+
+/**
+ * \brief Structure of a rate block for DrySPONGE128.
+ *
+ * The three members are overlapping views of the same
+ * DRYSPONGE128_RATE bytes.
+ */
+typedef union
+{
+    uint64_t S[DRYSPONGE128_RATE / 8];      /**< 64-bit words of the rate */
+    uint32_t W[DRYSPONGE128_RATE / 4];      /**< 32-bit words of the rate */
+    uint8_t B[DRYSPONGE128_RATE];           /**< Bytes of the rate */
+
+} drysponge128_rate_t;
+
+/**
+ * \brief Structure of a rate block for DrySPONGE256.
+ *
+ * The three members are overlapping views of the same
+ * DRYSPONGE256_RATE bytes.
+ */
+typedef union
+{
+    uint64_t S[DRYSPONGE256_RATE / 8];  /**< 64-bit words of the rate */
+    uint32_t W[DRYSPONGE256_RATE / 4];  /**< 32-bit words of the rate */
+    uint8_t B[DRYSPONGE256_RATE];       /**< Bytes of the rate */
+
+} drysponge256_rate_t;
+
+/**
+ * \brief Structure of the "x" value for DrySPONGE128.
+ *
+ * The three members are overlapping views of the same
+ * DRYSPONGE128_XSIZE bytes.  The type is declared with 16-byte alignment.
+ */
+typedef union
+{
+    uint64_t S[DRYSPONGE128_XSIZE / 8]; /**< 64-bit words of the "x" value */
+    uint32_t W[DRYSPONGE128_XSIZE / 4]; /**< 32-bit words of the "x" value */
+    uint8_t B[DRYSPONGE128_XSIZE];      /**< Bytes of the "x" value */
+
+} __attribute__((aligned(16))) drysponge128_x_t;
+
+/**
+ * \brief Structure of the "x" value for DrySPONGE256.
+ *
+ * The three members are overlapping views of the same
+ * DRYSPONGE256_XSIZE bytes.
+ */
+typedef union
+{
+    uint64_t S[DRYSPONGE256_XSIZE / 8]; /**< 64-bit words of the "x" value */
+    uint32_t W[DRYSPONGE256_XSIZE / 4]; /**< 32-bit words of the "x" value */
+    uint8_t B[DRYSPONGE256_XSIZE];      /**< Bytes of the "x" value */
+
+} drysponge256_x_t;
+
+/**
+ * \brief Structure of the rolling DrySPONGE128 state.
+ *
+ * The structure is declared with 16-byte alignment, and unlike
+ * drysponge256_state_t it places domain and rounds between c and r.
+ *
+ * NOTE(review): the DRYGASCON_G_OPT and DRYGASCON_F_OPT routines receive
+ * this structure through a raw pointer, so the member order is presumably
+ * part of their contract -- confirm before reordering any member.
+ */
+typedef struct
+{
+	  gascon128_state_t c;        /**< GASCON-128 state for the capacity */
+    uint32_t domain;            /**< Domain value to mix on next F call */
+    uint32_t rounds;            /**< Number of rounds for next G call */
+    drysponge128_rate_t r;      /**< Buffer for a rate block of data */
+    drysponge128_x_t x;         /**< "x" value for the sponge */
+} __attribute__((aligned(16))) drysponge128_state_t;
+
+/**
+ * \brief Structure of the rolling DrySPONGE256 state.
+ *
+ * Here domain and rounds follow c, r and x, whereas
+ * drysponge128_state_t places them between c and r.
+ */
+typedef struct
+{
+    gascon256_state_t c;        /**< GASCON-256 state for the capacity */
+    drysponge256_rate_t r;      /**< Buffer for a rate block of data */
+    drysponge256_x_t x;         /**< "x" value for the sponge */
+    uint32_t domain;            /**< Domain value to mix on next F call */
+    uint32_t rounds;            /**< Number of rounds for next G call */
+
+} drysponge256_state_t;
+
+/**
+ * \brief Permutes the GASCON-128 state using one iteration of CoreRound.
+ *
+ * \param state The GASCON-128 state to be permuted.
+ * \param round The round number.
+ *
+ * The input and output \a state will be in little-endian byte order.
+ */
+void gascon128_core_round(gascon128_state_t *state, uint8_t round);
+
+/**
+ * \brief Permutes the GASCON-256 state using one iteration of CoreRound.
+ *
+ * \param state The GASCON-256 state to be permuted.
+ * \param round The round number.
+ *
+ * The input and output \a state will be in little-endian byte order.
+ */
+void gascon256_core_round(gascon256_state_t *state, uint8_t round);
+
+/**
+ * \brief Performs the DrySPONGE128 G function which runs the core
+ * rounds and squeezes data out of the GASCON-128 state.
+ *
+ * \param state The DrySPONGE128 state.
+ *
+ * The data that is squeezed out will be in state->r on exit.
+ */
+void drysponge128_g(drysponge128_state_t *state);
+
+/**
+ * \brief Performs the DrySPONGE256 G function which runs the core
+ * rounds and squeezes data out of the GASCON-256 state.
+ *
+ * \param state The DrySPONGE256 state.
+ *
+ * The data that is squeezed out will be in state->r on exit.
+ */
+void drysponge256_g(drysponge256_state_t *state);
+
+/**
+ * \brief Performs the DrySPONGE128 G function which runs the core
+ * rounds but does not squeeze out any output.
+ *
+ * \param state The DrySPONGE128 state.
+ */
+void drysponge128_g_core(drysponge128_state_t *state);
+
+/**
+ * \brief Performs the DrySPONGE256 G function which runs the core
+ * rounds but does not squeeze out any output.
+ *
+ * \param state The DrySPONGE256 state.
+ */
+void drysponge256_g_core(drysponge256_state_t *state);
+
+/**
+ * \brief Performs the absorption phase of the DrySPONGE256 F function.
+ *
+ * \param state The DrySPONGE256 state.
+ * \param input The block of input data to incorporate into the state.
+ * \param len The length of the input block, which must be less than
+ * or equal to DRYSPONGE256_RATE.  Smaller input blocks will be padded.
+ *
+ * This function must be followed by a call to drysponge256_g() or
+ * drysponge256_g_core() to perform the full F operation.
+ */
+void drysponge256_f_absorb
+    (drysponge256_state_t *state, const unsigned char *input, unsigned len);
+
+void drygascon128_f_wrap(drysponge128_state_t *state, const unsigned char *input, unsigned len);
+
+/**
+ * \brief Determine if the state alignment is safe against timing attacks.
+ *
+ * \param state Points to the state to check.
+ *
+ * \return Non-zero if the alignment is safe.
+ *
+ * We expect this to be completely optimized out by the compiler if the
+ * alignment is enforced at build time.
+ */
+int drysponge128_safe_alignement(const drysponge128_state_t*state);
+
+/**
+ * \brief Set up a DrySPONGE128 state to begin encryption or decryption.
+ *
+ * \param state The DrySPONGE128 state.
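+ * \param key Points to the \a keysize bytes of the key.
+ * \param keysize Length of the key in bytes; the setup code recognises
+ * DRYGASCON128_MINKEY_SIZE, DRYGASCON128_FASTKEY_SIZE and
+ * DRYGASCON128_SAFEKEY_SIZE.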
+ * \param nonce Points to the 16 bytes of the nonce.
+ * \param final_block Non-zero if after key setup there will be no more blocks.
+ */
+void drysponge128_setup
+    (drysponge128_state_t *state, const unsigned char *key, unsigned int keysize,
+     const unsigned char *nonce, int final_block);
+
+/**
+ * \brief Set up a DrySPONGE256 state to begin encryption or decryption.
+ *
+ * \param state The DrySPONGE256 state.
+ * \param key Points to the 32 bytes of the key.
+ * \param nonce Points to the 16 bytes of the nonce.
+ * \param final_block Non-zero if after key setup there will be no more blocks.
+ */
+void drysponge256_setup
+    (drysponge256_state_t *state, const unsigned char *key,
+     const unsigned char *nonce, int final_block);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
--- a/drygascon/Implementations/crypto_aead/drygascon128/add_arm_cortex-m/internal-util.h
+++ b/drygascon/Implementations/crypto_aead/drygascon128/add_arm_cortex-m/internal-util.h
+/*
+ * Copyright (C) 2020 Southern Storm Software, Pty Ltd.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ */
+
+#ifndef LW_INTERNAL_UTIL_H
+#define LW_INTERNAL_UTIL_H
+
+#include <stdint.h>
+
+/* Figure out how to inline functions using this C compiler */
+#if defined(__STDC__) && __STDC_VERSION__ >= 199901L
+#define STATIC_INLINE static inline
+#elif defined(__GNUC__) || defined(__clang__)
+#define STATIC_INLINE static __inline__
+#else
+#define STATIC_INLINE static
+#endif
+
+/* Try to figure out whether the CPU is little-endian or big-endian.
+ * May need to modify this to include new compiler-specific defines.
+ * Alternatively, define __LITTLE_ENDIAN__ or __BIG_ENDIAN__ in your
+ * compiler flags when you compile this library */
+#if defined(__x86_64) || defined(__x86_64__) || \
+    defined(__i386) || defined(__i386__) || \
+    defined(__AVR__) || defined(__arm) || defined(__arm__) || \
+    defined(_M_AMD64) || defined(_M_X64) || defined(_M_IX86) || \
+    defined(_M_IA64) || defined(_M_ARM) || defined(_M_ARM_FP) || \
+    (defined(__BYTE_ORDER__) && __BYTE_ORDER__ == 1234) || \
+    defined(__LITTLE_ENDIAN__)
+#define LW_UTIL_LITTLE_ENDIAN 1
+#elif (defined(__BYTE_ORDER__) && __BYTE_ORDER__ == 4321) || \
+    defined(__BIG_ENDIAN__)
+/* Big endian */
+#else
+#error "Cannot determine the endianess of this platform"
+#endif
+
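+/* Helper macros to load and store values while converting endian-ness.
+ * Each macro expands its "ptr" argument several times, so only pass
+ * expressions without side effects. */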
+
+/* Assemble a 32-bit value from four bytes stored most-significant first.
+ * "ptr" is indexed four times, so it must not have side effects. */
+#define be_load_word32(ptr) \
+    (((uint32_t)((ptr)[3])) | \
+     (((uint32_t)((ptr)[2])) << 8) | \
+     (((uint32_t)((ptr)[1])) << 16) | \
+     (((uint32_t)((ptr)[0])) << 24))
+
+/* Write a 32-bit value as four bytes, most-significant byte first.
+ * "x" is evaluated once; "ptr" is indexed four times. */
+#define be_store_word32(ptr, x) \
+    do { \
+        uint32_t _x = (x); \
+        (ptr)[0] = (uint8_t)((_x >> 24) & 0xFFU); \
+        (ptr)[1] = (uint8_t)((_x >> 16) & 0xFFU); \
+        (ptr)[2] = (uint8_t)((_x >> 8) & 0xFFU); \
+        (ptr)[3] = (uint8_t)(_x & 0xFFU); \
+    } while (0)
+
+/* Assemble a 32-bit value from four bytes stored least-significant first.
+ * "ptr" is indexed four times, so it must not have side effects. */
+#define le_load_word32(ptr) \
+    (((uint32_t)((ptr)[0])) | \
+     (((uint32_t)((ptr)[1])) << 8) | \
+     (((uint32_t)((ptr)[2])) << 16) | \
+     (((uint32_t)((ptr)[3])) << 24))
+
+/* Write a 32-bit value as four bytes, least-significant byte first.
+ * "x" is evaluated once; "ptr" is indexed four times. */
+#define le_store_word32(ptr, x) \
+    do { \
+        uint32_t _x = (x); \
+        (ptr)[0] = (uint8_t)(_x & 0xFFU); \
+        (ptr)[1] = (uint8_t)((_x >> 8) & 0xFFU); \
+        (ptr)[2] = (uint8_t)((_x >> 16) & 0xFFU); \
+        (ptr)[3] = (uint8_t)((_x >> 24) & 0xFFU); \
+    } while (0)
+
+/* Assemble a 64-bit value from eight bytes stored most-significant first.
+ * "ptr" is indexed eight times, so it must not have side effects. */
+#define be_load_word64(ptr) \
+    (((uint64_t)((ptr)[7])) | \
+     (((uint64_t)((ptr)[6])) << 8) | \
+     (((uint64_t)((ptr)[5])) << 16) | \
+     (((uint64_t)((ptr)[4])) << 24) | \
+     (((uint64_t)((ptr)[3])) << 32) | \
+     (((uint64_t)((ptr)[2])) << 40) | \
+     (((uint64_t)((ptr)[1])) << 48) | \
+     (((uint64_t)((ptr)[0])) << 56))
+
+/* Write a 64-bit value as eight bytes, most-significant byte first.
+ * "x" is evaluated once; "ptr" is indexed eight times. */
+#define be_store_word64(ptr, x) \
+    do { \
+        uint64_t _x = (x); \
+        (ptr)[0] = (uint8_t)((_x >> 56) & 0xFFU); \
+        (ptr)[1] = (uint8_t)((_x >> 48) & 0xFFU); \
+        (ptr)[2] = (uint8_t)((_x >> 40) & 0xFFU); \
+        (ptr)[3] = (uint8_t)((_x >> 32) & 0xFFU); \
+        (ptr)[4] = (uint8_t)((_x >> 24) & 0xFFU); \
+        (ptr)[5] = (uint8_t)((_x >> 16) & 0xFFU); \
+        (ptr)[6] = (uint8_t)((_x >> 8) & 0xFFU); \
+        (ptr)[7] = (uint8_t)(_x & 0xFFU); \
+    } while (0)
+
+/* Assemble a 64-bit value from eight bytes stored least-significant first.
+ * "ptr" is indexed eight times, so it must not have side effects. */
+#define le_load_word64(ptr) \
+    (((uint64_t)((ptr)[0])) | \
+     (((uint64_t)((ptr)[1])) << 8) | \
+     (((uint64_t)((ptr)[2])) << 16) | \
+     (((uint64_t)((ptr)[3])) << 24) | \
+     (((uint64_t)((ptr)[4])) << 32) | \
+     (((uint64_t)((ptr)[5])) << 40) | \
+     (((uint64_t)((ptr)[6])) << 48) | \
+     (((uint64_t)((ptr)[7])) << 56))
+
+/* Write a 64-bit value as eight bytes, least-significant byte first.
+ * "x" is evaluated once; "ptr" is indexed eight times. */
+#define le_store_word64(ptr, x) \
+    do { \
+        uint64_t _x = (x); \
+        (ptr)[0] = (uint8_t)(_x & 0xFFU); \
+        (ptr)[1] = (uint8_t)((_x >> 8) & 0xFFU); \
+        (ptr)[2] = (uint8_t)((_x >> 16) & 0xFFU); \
+        (ptr)[3] = (uint8_t)((_x >> 24) & 0xFFU); \
+        (ptr)[4] = (uint8_t)((_x >> 32) & 0xFFU); \
+        (ptr)[5] = (uint8_t)((_x >> 40) & 0xFFU); \
+        (ptr)[6] = (uint8_t)((_x >> 48) & 0xFFU); \
+        (ptr)[7] = (uint8_t)((_x >> 56) & 0xFFU); \
+    } while (0)
+
+/* Assemble a 16-bit value from two bytes stored most-significant first.
+ * "ptr" is indexed twice, so it must not have side effects. */
+#define be_load_word16(ptr) \
+    (((uint16_t)((ptr)[1])) | \
+     (((uint16_t)((ptr)[0])) << 8))
+
+/* Write a 16-bit value as two bytes, most-significant byte first.
+ * "x" is evaluated once; "ptr" is indexed twice. */
+#define be_store_word16(ptr, x) \
+    do { \
+        uint16_t _x = (x); \
+        (ptr)[0] = (uint8_t)((_x >> 8) & 0xFFU); \
+        (ptr)[1] = (uint8_t)(_x & 0xFFU); \
+    } while (0)
+
+/* Assemble a 16-bit value from two bytes stored least-significant first.
+ * "ptr" is indexed twice, so it must not have side effects. */
+#define le_load_word16(ptr) \
+    (((uint16_t)((ptr)[0])) | \
+     (((uint16_t)((ptr)[1])) << 8))
+
+/* Write a 16-bit value as two bytes, least-significant byte first.
+ * "x" is evaluated once; "ptr" is indexed twice. */
+#define le_store_word16(ptr, x) \
+    do { \
+        uint16_t _x = (x); \
+        (ptr)[0] = (uint8_t)(_x & 0xFFU); \
+        (ptr)[1] = (uint8_t)((_x >> 8) & 0xFFU); \
+    } while (0)
+
+/* In-place XOR: dest[i] ^= src[i] for the first "len" bytes.
+ * Every argument is evaluated exactly once. */
+#define lw_xor_block(dest, src, len) \
+    do { \
+        unsigned char *_dest = (dest); \
+        const unsigned char *_src = (src); \
+        unsigned _len = (len); \
+        for (; _len > 0; --_len) { \
+            *_dest ^= *_src; \
+            ++_dest; \
+            ++_src; \
+        } \
+    } while (0)
+
+/* Three-operand XOR: dest[i] = src1[i] ^ src2[i] for the first "len" bytes.
+ * Every argument is evaluated exactly once. */
+#define lw_xor_block_2_src(dest, src1, src2, len) \
+    do { \
+        unsigned char *_dest = (dest); \
+        const unsigned char *_src1 = (src1); \
+        const unsigned char *_src2 = (src2); \
+        unsigned _len = (len); \
+        for (; _len > 0; --_len) { \
+            *_dest = *_src1 ^ *_src2; \
+            ++_dest; \
+            ++_src1; \
+            ++_src2; \
+        } \
+    } while (0)
+
+/* In-place XOR with a mirrored result: dest[i] ^= src[i], and the updated
+ * byte is also written to dest2[i], for the first "len" bytes.
+ * Every argument is evaluated exactly once. */
+#define lw_xor_block_2_dest(dest2, dest, src, len) \
+    do { \
+        unsigned char *_dest2 = (dest2); \
+        unsigned char *_dest = (dest); \
+        const unsigned char *_src = (src); \
+        unsigned _len = (len); \
+        for (; _len > 0; --_len) { \
+            *_dest ^= *_src; \
+            *_dest2 = *_dest; \
+            ++_dest2; \
+            ++_dest; \
+            ++_src; \
+        } \
+    } while (0)
+
+/* XOR two byte buffers into a destination while, at the same time,
+ * copying the contents of src2 to dest2: for the first "len" bytes,
+ * dest2[i] = src2[i] and dest[i] = src1[i] ^ src2[i].  The src2 byte is
+ * read before either destination is written. */
+#define lw_xor_block_copy_src(dest2, dest, src1, src2, len) \
+    do { \
+        unsigned char *_dest2 = (dest2); \
+        unsigned char *_dest = (dest); \
+        const unsigned char *_src1 = (src1); \
+        const unsigned char *_src2 = (src2); \
+        unsigned _len = (len); \
+        for (; _len > 0; --_len) { \
+            unsigned char _temp = *_src2; \
+            ++_src2; \
+            *_dest2 = _temp; \
+            ++_dest2; \
+            *_dest = *_src1 ^ _temp; \
+            ++_dest; \
+            ++_src1; \
+        } \
+    } while (0)
+
+/* XOR-and-replace: for the first "len" bytes, dest2[i] receives
+ * dest[i] ^ src[i] and dest[i] is then overwritten with src[i].
+ * The src byte is read before either destination is written. */
+#define lw_xor_block_swap(dest2, dest, src, len) \
+    do { \
+        unsigned char *_dest2 = (dest2); \
+        unsigned char *_dest = (dest); \
+        const unsigned char *_src = (src); \
+        unsigned _len = (len); \
+        for (; _len > 0; --_len) { \
+            unsigned char _temp = *_src; \
+            ++_src; \
+            *_dest2 = *_dest ^ _temp; \
+            ++_dest2; \
+            *_dest = _temp; \
+            ++_dest; \
+        } \
+    } while (0)
+
+/* AVR rotates most cheaply by a single bit or by whole bytes, so on that
+ * target the fixed-count 32-bit rotations are assembled from those
+ * primitives.  LW_CRYPTO_ROTATE32_COMPOSED selects that strategy: it is
+ * 1 when compiling for AVR and 0 on every other target. */
+
+#ifdef __AVR__
+#  define LW_CRYPTO_ROTATE32_COMPOSED 1
+#else
+#  define LW_CRYPTO_ROTATE32_COMPOSED 0
+#endif
+
+/* Rotation macros for 32-bit arguments */
+
+/* Generic left rotate of a 32-bit value by "bits" positions.
+ * "a" is evaluated once (converted to uint32_t); "bits" is evaluated twice.
+ * Both shift counts are reduced modulo 32 so that a count of 0 (or 32)
+ * returns the value unchanged instead of shifting by the full width,
+ * which is undefined behaviour in C.  Counts 1..31 are unaffected.
+ * Uses the GNU statement-expression extension. */
+#define leftRotate(a, bits) \
+    (__extension__ ({ \
+        uint32_t _temp = (a); \
+        (_temp << ((bits) & 31)) | (_temp >> ((32 - (bits)) & 31)); \
+    }))
+
+/* Generic right rotate of a 32-bit value by "bits" positions.
+ * Same evaluation and count-masking rules as leftRotate(). */
+#define rightRotate(a, bits) \
+    (__extension__ ({ \
+        uint32_t _temp = (a); \
+        (_temp >> ((bits) & 31)) | (_temp << ((32 - (bits)) & 31)); \
+    }))
+
+#if !LW_CRYPTO_ROTATE32_COMPOSED
+
+/* Left rotate by a specific number of bits.  These macros may be replaced
+ * with more efficient ones on platforms that lack a barrel shifter.
+ *
+ * This set is selected when LW_CRYPTO_ROTATE32_COMPOSED is 0.  Each
+ * leftRotateN(a), N = 1..31, simply forwards to the generic leftRotate()
+ * with the literal count N. */
+#define leftRotate1(a)  (leftRotate((a), 1))
+#define leftRotate2(a)  (leftRotate((a), 2))
+#define leftRotate3(a)  (leftRotate((a), 3))
+#define leftRotate4(a)  (leftRotate((a), 4))
+#define leftRotate5(a)  (leftRotate((a), 5))
+#define leftRotate6(a)  (leftRotate((a), 6))
+#define leftRotate7(a)  (leftRotate((a), 7))
+#define leftRotate8(a)  (leftRotate((a), 8))
+#define leftRotate9(a)  (leftRotate((a), 9))
+#define leftRotate10(a) (leftRotate((a), 10))
+#define leftRotate11(a) (leftRotate((a), 11))
+#define leftRotate12(a) (leftRotate((a), 12))
+#define leftRotate13(a) (leftRotate((a), 13))
+#define leftRotate14(a) (leftRotate((a), 14))
+#define leftRotate15(a) (leftRotate((a), 15))
+#define leftRotate16(a) (leftRotate((a), 16))
+#define leftRotate17(a) (leftRotate((a), 17))
+#define leftRotate18(a) (leftRotate((a), 18))
+#define leftRotate19(a) (leftRotate((a), 19))
+#define leftRotate20(a) (leftRotate((a), 20))
+#define leftRotate21(a) (leftRotate((a), 21))
+#define leftRotate22(a) (leftRotate((a), 22))
+#define leftRotate23(a) (leftRotate((a), 23))
+#define leftRotate24(a) (leftRotate((a), 24))
+#define leftRotate25(a) (leftRotate((a), 25))
+#define leftRotate26(a) (leftRotate((a), 26))
+#define leftRotate27(a) (leftRotate((a), 27))
+#define leftRotate28(a) (leftRotate((a), 28))
+#define leftRotate29(a) (leftRotate((a), 29))
+#define leftRotate30(a) (leftRotate((a), 30))
+#define leftRotate31(a) (leftRotate((a), 31))
+
+/* Right rotate by a specific number of bits.  These macros may be replaced
+ * with more efficient ones on platforms that lack a barrel shifter.
+ *
+ * Each rightRotateN(a), N = 1..31, forwards to the generic rightRotate()
+ * with the literal count N. */
+#define rightRotate1(a)  (rightRotate((a), 1))
+#define rightRotate2(a)  (rightRotate((a), 2))
+#define rightRotate3(a)  (rightRotate((a), 3))
+#define rightRotate4(a)  (rightRotate((a), 4))
+#define rightRotate5(a)  (rightRotate((a), 5))
+#define rightRotate6(a)  (rightRotate((a), 6))
+#define rightRotate7(a)  (rightRotate((a), 7))
+#define rightRotate8(a)  (rightRotate((a), 8))
+#define rightRotate9(a)  (rightRotate((a), 9))
+#define rightRotate10(a) (rightRotate((a), 10))
+#define rightRotate11(a) (rightRotate((a), 11))
+#define rightRotate12(a) (rightRotate((a), 12))
+#define rightRotate13(a) (rightRotate((a), 13))
+#define rightRotate14(a) (rightRotate((a), 14))
+#define rightRotate15(a) (rightRotate((a), 15))
+#define rightRotate16(a) (rightRotate((a), 16))
+#define rightRotate17(a) (rightRotate((a), 17))
+#define rightRotate18(a) (rightRotate((a), 18))
+#define rightRotate19(a) (rightRotate((a), 19))
+#define rightRotate20(a) (rightRotate((a), 20))
+#define rightRotate21(a) (rightRotate((a), 21))
+#define rightRotate22(a) (rightRotate((a), 22))
+#define rightRotate23(a) (rightRotate((a), 23))
+#define rightRotate24(a) (rightRotate((a), 24))
+#define rightRotate25(a) (rightRotate((a), 25))
+#define rightRotate26(a) (rightRotate((a), 26))
+#define rightRotate27(a) (rightRotate((a), 27))
+#define rightRotate28(a) (rightRotate((a), 28))
+#define rightRotate29(a) (rightRotate((a), 29))
+#define rightRotate30(a) (rightRotate((a), 30))
+#define rightRotate31(a) (rightRotate((a), 31))
+
+#else /* LW_CRYPTO_ROTATE32_COMPOSED */
+
+/* Composed rotation macros where 1 and 8 are fast, but others are slow.
+ *
+ * This set is selected when LW_CRYPTO_ROTATE32_COMPOSED is non-zero.
+ * Every count is expressed as at most one rotation by a multiple of 8
+ * (8, 16 or 24) followed by up to four single-bit rotations in whichever
+ * direction reaches the target count.  Only the generic leftRotate() and
+ * rightRotate() macros are used as building blocks. */
+
+/* Left rotate by 1 */
+#define leftRotate1(a)  (leftRotate((a), 1))
+
+/* Left rotate by 2 */
+#define leftRotate2(a)  (leftRotate(leftRotate((a), 1), 1))
+
+/* Left rotate by 3 */
+#define leftRotate3(a)  (leftRotate(leftRotate(leftRotate((a), 1), 1), 1))
+
+/* Left rotate by 4 */
+#define leftRotate4(a)  (leftRotate(leftRotate(leftRotate(leftRotate((a), 1), 1), 1), 1))
+
+/* Left rotate by 5: Rotate left by 8, then right by 3 */
+#define leftRotate5(a)  (rightRotate(rightRotate(rightRotate(leftRotate((a), 8), 1), 1), 1))
+
+/* Left rotate by 6: Rotate left by 8, then right by 2 */
+#define leftRotate6(a)  (rightRotate(rightRotate(leftRotate((a), 8), 1), 1))
+
+/* Left rotate by 7: Rotate left by 8, then right by 1 */
+#define leftRotate7(a)  (rightRotate(leftRotate((a), 8), 1))
+
+/* Left rotate by 8 */
+#define leftRotate8(a)  (leftRotate((a), 8))
+
+/* Left rotate by 9: Rotate left by 8, then left by 1 */
+#define leftRotate9(a)  (leftRotate(leftRotate((a), 8), 1))
+
+/* Left rotate by 10: Rotate left by 8, then left by 2 */
+#define leftRotate10(a) (leftRotate(leftRotate(leftRotate((a), 8), 1), 1))
+
+/* Left rotate by 11: Rotate left by 8, then left by 3 */
+#define leftRotate11(a) (leftRotate(leftRotate(leftRotate(leftRotate((a), 8), 1), 1), 1))
+
+/* Left rotate by 12: Rotate left by 16, then right by 4 */
+#define leftRotate12(a) (rightRotate(rightRotate(rightRotate(rightRotate(leftRotate((a), 16), 1), 1), 1), 1))
+
+/* Left rotate by 13: Rotate left by 16, then right by 3 */
+#define leftRotate13(a) (rightRotate(rightRotate(rightRotate(leftRotate((a), 16), 1), 1), 1))
+
+/* Left rotate by 14: Rotate left by 16, then right by 2 */
+#define leftRotate14(a) (rightRotate(rightRotate(leftRotate((a), 16), 1), 1))
+
+/* Left rotate by 15: Rotate left by 16, then right by 1 */
+#define leftRotate15(a) (rightRotate(leftRotate((a), 16), 1))
+
+/* Left rotate by 16 */
+#define leftRotate16(a) (leftRotate((a), 16))
+
+/* Left rotate by 17: Rotate left by 16, then left by 1 */
+#define leftRotate17(a) (leftRotate(leftRotate((a), 16), 1))
+
+/* Left rotate by 18: Rotate left by 16, then left by 2 */
+#define leftRotate18(a) (leftRotate(leftRotate(leftRotate((a), 16), 1), 1))
+
+/* Left rotate by 19: Rotate left by 16, then left by 3 */
+#define leftRotate19(a) (leftRotate(leftRotate(leftRotate(leftRotate((a), 16), 1), 1), 1))
+
+/* Left rotate by 20: Rotate left by 16, then left by 4 */
+#define leftRotate20(a) (leftRotate(leftRotate(leftRotate(leftRotate(leftRotate((a), 16), 1), 1), 1), 1))
+
+/* Left rotate by 21: Rotate left by 24, then right by 3 */
+#define leftRotate21(a) (rightRotate(rightRotate(rightRotate(leftRotate((a), 24), 1), 1), 1))
+
+/* Left rotate by 22: Rotate left by 24, then right by 2 */
+#define leftRotate22(a) (rightRotate(rightRotate(leftRotate((a), 24), 1), 1))
+
+/* Left rotate by 23: Rotate left by 24, then right by 1 */
+#define leftRotate23(a) (rightRotate(leftRotate((a), 24), 1))
+
+/* Left rotate by 24 */
+#define leftRotate24(a) (leftRotate((a), 24))
+
+/* Left rotate by 25: Rotate left by 24, then left by 1 */
+#define leftRotate25(a) (leftRotate(leftRotate((a), 24), 1))
+
+/* Left rotate by 26: Rotate left by 24, then left by 2 */
+#define leftRotate26(a) (leftRotate(leftRotate(leftRotate((a), 24), 1), 1))
+
+/* Left rotate by 27: Rotate left by 24, then left by 3 */
+#define leftRotate27(a) (leftRotate(leftRotate(leftRotate(leftRotate((a), 24), 1), 1), 1))
+
+/* Left rotate by 28: Rotate right by 4 */
+#define leftRotate28(a) (rightRotate(rightRotate(rightRotate(rightRotate((a), 1), 1), 1), 1))
+
+/* Left rotate by 29: Rotate right by 3 */
+#define leftRotate29(a) (rightRotate(rightRotate(rightRotate((a), 1), 1), 1))
+
+/* Left rotate by 30: Rotate right by 2 */
+#define leftRotate30(a) (rightRotate(rightRotate((a), 1), 1))
+
+/* Left rotate by 31: Rotate right by 1 */
+#define leftRotate31(a) (rightRotate((a), 1))
+
+/* Define the 32-bit right rotations in terms of left rotations:
+ * rotating right by N is the same as rotating left by 32 - N, so each
+ * rightRotateN reuses the composed leftRotate(32 - N) above. */
+#define rightRotate1(a)  (leftRotate31((a)))
+#define rightRotate2(a)  (leftRotate30((a)))
+#define rightRotate3(a)  (leftRotate29((a)))
+#define rightRotate4(a)  (leftRotate28((a)))
+#define rightRotate5(a)  (leftRotate27((a)))
+#define rightRotate6(a)  (leftRotate26((a)))
+#define rightRotate7(a)  (leftRotate25((a)))
+#define rightRotate8(a)  (leftRotate24((a)))
+#define rightRotate9(a)  (leftRotate23((a)))
+#define rightRotate10(a) (leftRotate22((a)))
+#define rightRotate11(a) (leftRotate21((a)))
+#define rightRotate12(a) (leftRotate20((a)))
+#define rightRotate13(a) (leftRotate19((a)))
+#define rightRotate14(a) (leftRotate18((a)))
+#define rightRotate15(a) (leftRotate17((a)))
+#define rightRotate16(a) (leftRotate16((a)))
+#define rightRotate17(a) (leftRotate15((a)))
+#define rightRotate18(a) (leftRotate14((a)))
+#define rightRotate19(a) (leftRotate13((a)))
+#define rightRotate20(a) (leftRotate12((a)))
+#define rightRotate21(a) (leftRotate11((a)))
+#define rightRotate22(a) (leftRotate10((a)))
+#define rightRotate23(a) (leftRotate9((a)))
+#define rightRotate24(a) (leftRotate8((a)))
+#define rightRotate25(a) (leftRotate7((a)))
+#define rightRotate26(a) (leftRotate6((a)))
+#define rightRotate27(a) (leftRotate5((a)))
+#define rightRotate28(a) (leftRotate4((a)))
+#define rightRotate29(a) (leftRotate3((a)))
+#define rightRotate30(a) (leftRotate2((a)))
+#define rightRotate31(a) (leftRotate1((a)))
+
+#endif /* LW_CRYPTO_ROTATE32_COMPOSED */
+
+/* Rotation macros for 64-bit arguments */
+
+/* Generic left rotate of a 64-bit value by "bits" positions.
+ * "a" is evaluated once (converted to uint64_t); "bits" is evaluated twice.
+ * Both shift counts are reduced modulo 64 so that a count of 0 (or 64)
+ * returns the value unchanged instead of shifting by the full width,
+ * which is undefined behaviour in C.  Counts 1..63 are unaffected.
+ * Uses the GNU statement-expression extension. */
+#define leftRotate_64(a, bits) \
+    (__extension__ ({ \
+        uint64_t _temp = (a); \
+        (_temp << ((bits) & 63)) | (_temp >> ((64 - (bits)) & 63)); \
+    }))
+
+/* Generic right rotate of a 64-bit value by "bits" positions.
+ * Same evaluation and count-masking rules as leftRotate_64(). */
+#define rightRotate_64(a, bits) \
+    (__extension__ ({ \
+        uint64_t _temp = (a); \
+        (_temp >> ((bits) & 63)) | (_temp << ((64 - (bits)) & 63)); \
+    }))
+
+/* Left rotate by a specific number of bits.  These macros may be replaced
+ * with more efficient ones on platforms that lack a barrel shifter.
+ *
+ * Each leftRotateN_64(a), N = 1..63, forwards to the generic
+ * leftRotate_64() with the literal count N.  Unlike the 32-bit set there
+ * is no composed variant of these macros in this header. */
+#define leftRotate1_64(a)  (leftRotate_64((a), 1))
+#define leftRotate2_64(a)  (leftRotate_64((a), 2))
+#define leftRotate3_64(a)  (leftRotate_64((a), 3))
+#define leftRotate4_64(a)  (leftRotate_64((a), 4))
+#define leftRotate5_64(a)  (leftRotate_64((a), 5))
+#define leftRotate6_64(a)  (leftRotate_64((a), 6))
+#define leftRotate7_64(a)  (leftRotate_64((a), 7))
+#define leftRotate8_64(a)  (leftRotate_64((a), 8))
+#define leftRotate9_64(a)  (leftRotate_64((a), 9))
+#define leftRotate10_64(a) (leftRotate_64((a), 10))
+#define leftRotate11_64(a) (leftRotate_64((a), 11))
+#define leftRotate12_64(a) (leftRotate_64((a), 12))
+#define leftRotate13_64(a) (leftRotate_64((a), 13))
+#define leftRotate14_64(a) (leftRotate_64((a), 14))
+#define leftRotate15_64(a) (leftRotate_64((a), 15))
+#define leftRotate16_64(a) (leftRotate_64((a), 16))
+#define leftRotate17_64(a) (leftRotate_64((a), 17))
+#define leftRotate18_64(a) (leftRotate_64((a), 18))
+#define leftRotate19_64(a) (leftRotate_64((a), 19))
+#define leftRotate20_64(a) (leftRotate_64((a), 20))
+#define leftRotate21_64(a) (leftRotate_64((a), 21))
+#define leftRotate22_64(a) (leftRotate_64((a), 22))
+#define leftRotate23_64(a) (leftRotate_64((a), 23))
+#define leftRotate24_64(a) (leftRotate_64((a), 24))
+#define leftRotate25_64(a) (leftRotate_64((a), 25))
+#define leftRotate26_64(a) (leftRotate_64((a), 26))
+#define leftRotate27_64(a) (leftRotate_64((a), 27))
+#define leftRotate28_64(a) (leftRotate_64((a), 28))
+#define leftRotate29_64(a) (leftRotate_64((a), 29))
+#define leftRotate30_64(a) (leftRotate_64((a), 30))
+#define leftRotate31_64(a) (leftRotate_64((a), 31))
+#define leftRotate32_64(a) (leftRotate_64((a), 32))
+#define leftRotate33_64(a) (leftRotate_64((a), 33))
+#define leftRotate34_64(a) (leftRotate_64((a), 34))
+#define leftRotate35_64(a) (leftRotate_64((a), 35))
+#define leftRotate36_64(a) (leftRotate_64((a), 36))
+#define leftRotate37_64(a) (leftRotate_64((a), 37))
+#define leftRotate38_64(a) (leftRotate_64((a), 38))
+#define leftRotate39_64(a) (leftRotate_64((a), 39))
+#define leftRotate40_64(a) (leftRotate_64((a), 40))
+#define leftRotate41_64(a) (leftRotate_64((a), 41))
+#define leftRotate42_64(a) (leftRotate_64((a), 42))
+#define leftRotate43_64(a) (leftRotate_64((a), 43))
+#define leftRotate44_64(a) (leftRotate_64((a), 44))
+#define leftRotate45_64(a) (leftRotate_64((a), 45))
+#define leftRotate46_64(a) (leftRotate_64((a), 46))
+#define leftRotate47_64(a) (leftRotate_64((a), 47))
+#define leftRotate48_64(a) (leftRotate_64((a), 48))
+#define leftRotate49_64(a) (leftRotate_64((a), 49))
+#define leftRotate50_64(a) (leftRotate_64((a), 50))
+#define leftRotate51_64(a) (leftRotate_64((a), 51))
+#define leftRotate52_64(a) (leftRotate_64((a), 52))
+#define leftRotate53_64(a) (leftRotate_64((a), 53))
+#define leftRotate54_64(a) (leftRotate_64((a), 54))
+#define leftRotate55_64(a) (leftRotate_64((a), 55))
+#define leftRotate56_64(a) (leftRotate_64((a), 56))
+#define leftRotate57_64(a) (leftRotate_64((a), 57))
+#define leftRotate58_64(a) (leftRotate_64((a), 58))
+#define leftRotate59_64(a) (leftRotate_64((a), 59))
+#define leftRotate60_64(a) (leftRotate_64((a), 60))
+#define leftRotate61_64(a) (leftRotate_64((a), 61))
+#define leftRotate62_64(a) (leftRotate_64((a), 62))
+#define leftRotate63_64(a) (leftRotate_64((a), 63))
+
+/* Right rotate by a specific number of bits.  These macros may be replaced
+ * with more efficient ones on platforms that lack a barrel shifter.
+ *
+ * Each rightRotateN_64(a), N = 1..63, forwards to the generic
+ * rightRotate_64() with the literal count N. */
+#define rightRotate1_64(a)  (rightRotate_64((a), 1))
+#define rightRotate2_64(a)  (rightRotate_64((a), 2))
+#define rightRotate3_64(a)  (rightRotate_64((a), 3))
+#define rightRotate4_64(a)  (rightRotate_64((a), 4))
+#define rightRotate5_64(a)  (rightRotate_64((a), 5))
+#define rightRotate6_64(a)  (rightRotate_64((a), 6))
+#define rightRotate7_64(a)  (rightRotate_64((a), 7))
+#define rightRotate8_64(a)  (rightRotate_64((a), 8))
+#define rightRotate9_64(a)  (rightRotate_64((a), 9))
+#define rightRotate10_64(a) (rightRotate_64((a), 10))
+#define rightRotate11_64(a) (rightRotate_64((a), 11))
+#define rightRotate12_64(a) (rightRotate_64((a), 12))
+#define rightRotate13_64(a) (rightRotate_64((a), 13))
+#define rightRotate14_64(a) (rightRotate_64((a), 14))
+#define rightRotate15_64(a) (rightRotate_64((a), 15))
+#define rightRotate16_64(a) (rightRotate_64((a), 16))
+#define rightRotate17_64(a) (rightRotate_64((a), 17))
+#define rightRotate18_64(a) (rightRotate_64((a), 18))
+#define rightRotate19_64(a) (rightRotate_64((a), 19))
+#define rightRotate20_64(a) (rightRotate_64((a), 20))
+#define rightRotate21_64(a) (rightRotate_64((a), 21))
+#define rightRotate22_64(a) (rightRotate_64((a), 22))
+#define rightRotate23_64(a) (rightRotate_64((a), 23))
+#define rightRotate24_64(a) (rightRotate_64((a), 24))
+#define rightRotate25_64(a) (rightRotate_64((a), 25))
+#define rightRotate26_64(a) (rightRotate_64((a), 26))
+#define rightRotate27_64(a) (rightRotate_64((a), 27))
+#define rightRotate28_64(a) (rightRotate_64((a), 28))
+#define rightRotate29_64(a) (rightRotate_64((a), 29))
+#define rightRotate30_64(a) (rightRotate_64((a), 30))
+#define rightRotate31_64(a) (rightRotate_64((a), 31))
+#define rightRotate32_64(a) (rightRotate_64((a), 32))
+#define rightRotate33_64(a) (rightRotate_64((a), 33))
+#define rightRotate34_64(a) (rightRotate_64((a), 34))
+#define rightRotate35_64(a) (rightRotate_64((a), 35))
+#define rightRotate36_64(a) (rightRotate_64((a), 36))
+#define rightRotate37_64(a) (rightRotate_64((a), 37))
+#define rightRotate38_64(a) (rightRotate_64((a), 38))
+#define rightRotate39_64(a) (rightRotate_64((a), 39))
+#define rightRotate40_64(a) (rightRotate_64((a), 40))
+#define rightRotate41_64(a) (rightRotate_64((a), 41))
+#define rightRotate42_64(a) (rightRotate_64((a), 42))
+#define rightRotate43_64(a) (rightRotate_64((a), 43))
+#define rightRotate44_64(a) (rightRotate_64((a), 44))
+#define rightRotate45_64(a) (rightRotate_64((a), 45))
+#define rightRotate46_64(a) (rightRotate_64((a), 46))
+#define rightRotate47_64(a) (rightRotate_64((a), 47))
+#define rightRotate48_64(a) (rightRotate_64((a), 48))
+#define rightRotate49_64(a) (rightRotate_64((a), 49))
+#define rightRotate50_64(a) (rightRotate_64((a), 50))
+#define rightRotate51_64(a) (rightRotate_64((a), 51))
+#define rightRotate52_64(a) (rightRotate_64((a), 52))
+#define rightRotate53_64(a) (rightRotate_64((a), 53))
+#define rightRotate54_64(a) (rightRotate_64((a), 54))
+#define rightRotate55_64(a) (rightRotate_64((a), 55))
+#define rightRotate56_64(a) (rightRotate_64((a), 56))
+#define rightRotate57_64(a) (rightRotate_64((a), 57))
+#define rightRotate58_64(a) (rightRotate_64((a), 58))
+#define rightRotate59_64(a) (rightRotate_64((a), 59))
+#define rightRotate60_64(a) (rightRotate_64((a), 60))
+#define rightRotate61_64(a) (rightRotate_64((a), 61))
+#define rightRotate62_64(a) (rightRotate_64((a), 62))
+#define rightRotate63_64(a) (rightRotate_64((a), 63))
+
+/* Rotate a 16-bit value left by a number of bits.
+ * "a" is evaluated once (converted to uint16_t); "bits" is evaluated twice.
+ * A uint16_t operand is promoted to int before shifting, so the bits
+ * shifted out of the low 16 would otherwise remain in the result; the
+ * result is therefore truncated back to uint16_t.  Shift counts are
+ * reduced modulo 16 so that a count of 0 returns the value unchanged.
+ * Uses the GNU statement-expression extension. */
+#define leftRotate_16(a, bits) \
+    (__extension__ ({ \
+        uint16_t _temp = (a); \
+        (uint16_t)((_temp << ((bits) & 15)) | \
+                   (_temp >> ((16 - (bits)) & 15))); \
+    }))
+
+/* Rotate a 16-bit value right by a number of bits.
+ * Same evaluation, truncation and count-masking rules as leftRotate_16(). */
+#define rightRotate_16(a, bits) \
+    (__extension__ ({ \
+        uint16_t _temp = (a); \
+        (uint16_t)((_temp >> ((bits) & 15)) | \
+                   (_temp << ((16 - (bits)) & 15))); \
+    }))
+
+/* Left rotate by a specific number of bits.  These macros may be replaced
+ * with more efficient ones on platforms that lack a barrel shifter.
+ * Each leftRotateN_16(a), N = 1..15, forwards to leftRotate_16(). */
+#define leftRotate1_16(a)  (leftRotate_16((a), 1))
+#define leftRotate2_16(a)  (leftRotate_16((a), 2))
+#define leftRotate3_16(a)  (leftRotate_16((a), 3))
+#define leftRotate4_16(a)  (leftRotate_16((a), 4))
+#define leftRotate5_16(a)  (leftRotate_16((a), 5))
+#define leftRotate6_16(a)  (leftRotate_16((a), 6))
+#define leftRotate7_16(a)  (leftRotate_16((a), 7))
+#define leftRotate8_16(a)  (leftRotate_16((a), 8))
+#define leftRotate9_16(a)  (leftRotate_16((a), 9))
+#define leftRotate10_16(a) (leftRotate_16((a), 10))
+#define leftRotate11_16(a) (leftRotate_16((a), 11))
+#define leftRotate12_16(a) (leftRotate_16((a), 12))
+#define leftRotate13_16(a) (leftRotate_16((a), 13))
+#define leftRotate14_16(a) (leftRotate_16((a), 14))
+#define leftRotate15_16(a) (leftRotate_16((a), 15))
+
+/* Right rotate by a specific number of bits.  These macros may be replaced
+ * with more efficient ones on platforms that lack a barrel shifter.
+ * Each rightRotateN_16(a), N = 1..15, forwards to rightRotate_16(). */
+#define rightRotate1_16(a)  (rightRotate_16((a), 1))
+#define rightRotate2_16(a)  (rightRotate_16((a), 2))
+#define rightRotate3_16(a)  (rightRotate_16((a), 3))
+#define rightRotate4_16(a)  (rightRotate_16((a), 4))
+#define rightRotate5_16(a)  (rightRotate_16((a), 5))
+#define rightRotate6_16(a)  (rightRotate_16((a), 6))
+#define rightRotate7_16(a)  (rightRotate_16((a), 7))
+#define rightRotate8_16(a)  (rightRotate_16((a), 8))
+#define rightRotate9_16(a)  (rightRotate_16((a), 9))
+#define rightRotate10_16(a) (rightRotate_16((a), 10))
+#define rightRotate11_16(a) (rightRotate_16((a), 11))
+#define rightRotate12_16(a) (rightRotate_16((a), 12))
+#define rightRotate13_16(a) (rightRotate_16((a), 13))
+#define rightRotate14_16(a) (rightRotate_16((a), 14))
+#define rightRotate15_16(a) (rightRotate_16((a), 15))
+
+/* Rotate an 8-bit value left by a number of bits.
+ * "a" is evaluated once (converted to uint8_t); "bits" is evaluated twice.
+ * A uint8_t operand is promoted to int before shifting, so the bits
+ * shifted out of the low 8 would otherwise remain in the result; the
+ * result is therefore truncated back to uint8_t.  Shift counts are
+ * reduced modulo 8 so that a count of 0 returns the value unchanged.
+ * Uses the GNU statement-expression extension. */
+#define leftRotate_8(a, bits) \
+    (__extension__ ({ \
+        uint8_t _temp = (a); \
+        (uint8_t)((_temp << ((bits) & 7)) | \
+                  (_temp >> ((8 - (bits)) & 7))); \
+    }))
+
+/* Rotate an 8-bit value right by a number of bits.
+ * Same evaluation, truncation and count-masking rules as leftRotate_8(). */
+#define rightRotate_8(a, bits) \
+    (__extension__ ({ \
+        uint8_t _temp = (a); \
+        (uint8_t)((_temp >> ((bits) & 7)) | \
+                  (_temp << ((8 - (bits)) & 7))); \
+    }))
+
+/* Left rotate by a specific number of bits.  These macros may be replaced
+ * with more efficient ones on platforms that lack a barrel shifter.
+ * Each leftRotateN_8(a), N = 1..7, forwards to leftRotate_8(). */
+#define leftRotate1_8(a)  (leftRotate_8((a), 1))
+#define leftRotate2_8(a)  (leftRotate_8((a), 2))
+#define leftRotate3_8(a)  (leftRotate_8((a), 3))
+#define leftRotate4_8(a)  (leftRotate_8((a), 4))
+#define leftRotate5_8(a)  (leftRotate_8((a), 5))
+#define leftRotate6_8(a)  (leftRotate_8((a), 6))
+#define leftRotate7_8(a)  (leftRotate_8((a), 7))
+
+/* Right rotate by a specific number of bits.  These macros may be replaced
+ * with more efficient ones on platforms that lack a barrel shifter.
+ * Each rightRotateN_8(a), N = 1..7, forwards to rightRotate_8(). */
+#define rightRotate1_8(a)  (rightRotate_8((a), 1))
+#define rightRotate2_8(a)  (rightRotate_8((a), 2))
+#define rightRotate3_8(a)  (rightRotate_8((a), 3))
+#define rightRotate4_8(a)  (rightRotate_8((a), 4))
+#define rightRotate5_8(a)  (rightRotate_8((a), 5))
+#define rightRotate6_8(a)  (rightRotate_8((a), 6))
+#define rightRotate7_8(a)  (rightRotate_8((a), 7))
+
+#endif
--- a/drygascon/Implementations/crypto_aead/drygascon128/add_arm_cortex-m/nistlwc
+++ b/drygascon/Implementations/crypto_aead/drygascon128/add_arm_cortex-m/nistlwc
--- a/drygascon/Implementations/crypto_aead/drygascon128/designers
+++ b/drygascon/Implementations/crypto_aead/drygascon128/designers
+Sebastien Riou
--- a/drygascon/Implementations/crypto_aead/drygascon128/ref/implementors
+++ b/drygascon/Implementations/crypto_aead/drygascon128/ref/implementors
+Sebastien Riou
--- a/drygascon/Implementations/crypto_aead/drygascon128/ref/nistlwc
+++ b/drygascon/Implementations/crypto_aead/drygascon128/ref/nistlwc
--- a/knot/Implementations/crypto_aead/knot128v1/armcortexm_1/api.h
+++ b/knot/Implementations/crypto_aead/knot128v1/armcortexm_1/api.h
+#ifndef KNOT_API_H
+#define KNOT_API_H
+/* Parameter set noted by the authors: k = n = tag = 128, b = 256, r = 64,
+ * c = 192.  The 16-byte key, nonce and tag sizes below correspond to the
+ * 128-bit figures; b, r and c are presumably the permutation state, rate
+ * and capacity in bits -- confirm against the KNOT specification. */
+/* Key length in bytes. */
+#define CRYPTO_KEYBYTES 16
+/* Secret-nonce length in bytes (none is used). */
+#define CRYPTO_NSECBYTES 0
+/* Public-nonce length in bytes. */
+#define CRYPTO_NPUBBYTES 16
+/* Number of bytes by which the ciphertext may exceed the plaintext
+ * (presumably the authentication tag -- matches tag = 128 above). */
+#define CRYPTO_ABYTES 16
+/* NOTE(review): presumably the SUPERCOP-style flag declaring that input
+ * and output buffers must not overlap -- confirm with the build harness. */
+#define CRYPTO_NOOVERLAP 1
+#endif
--- a/knot/Implementations/crypto_aead/knot128v1/armcortexm_1/crypto_aead.h
+++ b/knot/Implementations/crypto_aead/knot128v1/armcortexm_1/crypto_aead.h
+#ifndef KNOT_CRYPTO_AEAD_H
+#define KNOT_CRYPTO_AEAD_H
+/*
+ * Authenticated encryption entry point.
+ *
+ * Only the prototype is visible here, so the roles below are inferred
+ * from the parameter names and types -- confirm against encrypt.c:
+ *   c, clen   - ciphertext output buffer and (out) its length
+ *   m, mlen   - plaintext input and its length in bytes
+ *   ad, adlen - associated data input and its length in bytes
+ *   nsec      - secret nonce (CRYPTO_NSECBYTES is 0 in api.h)
+ *   npub      - public nonce
+ *   k         - key
+ * Returns an int status; presumably 0 on success -- TODO confirm.
+ */
+int crypto_aead_encrypt(
+	unsigned char *c, unsigned long long *clen,
+	const unsigned char *m, unsigned long long mlen,
+	const unsigned char *ad, unsigned long long adlen,
+	const unsigned char *nsec,
+	const unsigned char *npub,
+	const unsigned char *k);
+
+/*
+ * Authenticated decryption entry point.
+ *
+ * Roles inferred from the parameter names and types -- confirm against
+ * the implementation:
+ *   m, mlen   - plaintext output buffer and (out) its length
+ *   nsec      - secret nonce output (CRYPTO_NSECBYTES is 0 in api.h)
+ *   c, clen   - ciphertext input and its length in bytes
+ *   ad, adlen - associated data input and its length in bytes
+ *   npub      - public nonce
+ *   k         - key
+ * Returns an int status; presumably 0 when the ciphertext authenticates
+ * and non-zero otherwise -- TODO confirm.
+ */
+int crypto_aead_decrypt(
+	unsigned char *m, unsigned long long *mlen,
+	unsigned char *nsec,
+	const unsigned char *c, unsigned long long clen,
+	const unsigned char *ad, unsigned long long adlen,
+	const unsigned char *npub,
+	const unsigned char *k);
+#endif
--- a/knot/Implementations/crypto_aead/knot128v1/armcortexm_1/encrypt.c
+++ b/knot/Implementations/crypto_aead/knot128v1/armcortexm_1/encrypt.c
+#include <stdio.h>
+#include <stdint.h>
+#include <stdlib.h>
+#include <string.h>
+#include "crypto_aead.h"
+#include "api.h"
+
+/* Number of elements in a true array (not a pointer).  The argument is
+ * parenthesized before indexing so that any array-valued expression is
+ * measured correctly. */
+#define ARR_SIZE(a) (sizeof(a) / sizeof((a)[0]))
+
+#define KNOT_CIPHER 1
+#if defined(KNOT_CIPHER) && (KNOT_CIPHER == 1)
+/* Table of 63 byte-sized constants, each of which fits in 6 bits.
+ * NOTE(review): presumably the per-round constants handed to
+ * permutation256() through its "rc" argument -- confirm at the call sites. */
+unsigned char constant6[63] = {
+	/*  0 */ 0x01, 0x02, 0x04, 0x08, 0x10, 0x21, 0x03, 0x06,
+	/*  8 */ 0x0c, 0x18, 0x31, 0x22, 0x05, 0x0a, 0x14, 0x29,
+	/* 16 */ 0x13, 0x27, 0x0f, 0x1e, 0x3d, 0x3a, 0x34, 0x28,
+	/* 24 */ 0x11, 0x23, 0x07, 0x0e, 0x1c, 0x39, 0x32, 0x24,
+	/* 32 */ 0x09, 0x12, 0x25, 0x0b, 0x16, 0x2d, 0x1b, 0x37,
+	/* 40 */ 0x2e, 0x1d, 0x3b, 0x36, 0x2c, 0x19, 0x33, 0x26,
+	/* 48 */ 0x0d, 0x1a, 0x35, 0x2a, 0x15, 0x2b, 0x17, 0x2f,
+	/* 56 */ 0x1f, 0x3f, 0x3e, 0x3c, 0x38, 0x30, 0x20
+};
+
+/* State
+ * w4 w0
+ * w5 w1
+ * w6 w2
+ * w7 w3
+ *
+ * Sbox
+	t1  = ~a;
+	t2  = b  & t1;
+	t3  = c  ^ t2; 
+	h   = d  ^ t3; 
+	t5  = b  | c; 
+	t6  = d  ^ t1; 
+	g   = t5 ^ t6; 
+	t8  = b  ^ d; 
+	t9  = t3 & t6; 
+	e   = t8 ^ t9; 
+	t11 = g  & t8; 
+	f   = t3 ^ t11;
+ *
+ * Sbox after change
+	a  = ~a; 
+	s0  = b  & a;
+	s0  = c  ^ s0;
+	c  = b  | c; 
+	a  = d  ^ a; 
+	c   = c ^ a; 
+	s1  = b  ^ d; 
+	d   = d  ^ s0;
+	a  = s0 & a; 
+	a   = s1 ^ a; 
+	b = c  & s1; 
+	b   = s0 ^ b;
+ */
+/*
+ * Apply "rounds" rounds of the 256-bit permutation to the state at "in",
+ * updating it in place.
+ *
+ *   in     - 32-byte state, accessed as eight 32-bit words.  Byte offsets
+ *            0, 8, 16, 24 are held in w0..w3 and offsets 4, 12, 20, 28 in
+ *            w4..w7.
+ *   rounds - number of rounds.  The loop is tested at the bottom
+ *            (subs/bne), so the body always executes at least once and a
+ *            value of 0 is not handled; callers must pass rounds >= 1.
+ *   rc     - round constants; one byte is read and the pointer is advanced
+ *            by one on every round.
+ *
+ * Each round XORs the round-constant byte into w0, runs the Sbox
+ * sequence over the word groups (w0,w1,w2,w3) and (w4,w5,w6,w7), and then
+ * rotates the 64-bit pairs w5:w1, w6:w2 and w7:w3 (high:low) left by 1, 8
+ * and 25 bits respectively.
+ *
+ * NOTE(review): the asm statement refers to C variables by name and has no
+ * operand or clobber lists, which looks like the armcc-style inline
+ * assembler rather than GNU extended asm -- confirm the intended
+ * toolchain.  The label "enc_loop" is also a fixed name, which would
+ * presumably clash if this body were ever emitted twice -- verify.
+ */
+static void permutation256(unsigned char *in, int rounds, unsigned char *rc) {
+	uint32_t w0, w1, w2, w3, w4, w5, w6, w7;
+	uint32_t s0, s1, s2;
+	/* Masks for the bits carried between the two halves of a rotated row */
+	uint32_t one = 0x1;
+	uint32_t ff = 0xff;
+	__asm volatile(
+		/* Load the state; even word slots go to w0..w3, odd ones to w4..w7 */
+		"ldr     w0,     [in]          \n\t"
+		"ldr     w4,     [in, #4]      \n\t"
+		"ldr     w1,     [in, #8]      \n\t"
+		"ldr     w5,     [in, #12]     \n\t"
+		"ldr     w2,     [in, #16]     \n\t"
+		"ldr     w6,     [in, #20]     \n\t"
+		"ldr     w3,     [in, #24]     \n\t"
+		"ldr     w7,     [in, #28]     \n\t"
+		/* s2 = (0x1fff << 12) ^ 0xfff = 0x01ffffff, the 25-bit carry mask
+		 * used by the rotate-by-25 step */
+		"mov     s0,     0xfff         \n\t"
+		"mov     s2,     0x1fff        \n\t"
+		"lsl     s2,     s2, #12       \n\t"
+		"eors    s2,     s2, s0        \n\t"
+	"enc_loop:                       \n\t"
+    "/*add round const*/           \n\t"
+		"ldrb    s0,     [rc]          \n\t"
+	  "eors    w0,     w0, s0        \n\t"
+    "/*sbox first column*/         \n\t"
+		"mvns    w0,     w0            \n\t"
+		"ands    s0,     w1, w0        \n\t"
+		"eors    s0,     w2, s0        \n\t"
+		"orrs    w2,     w1, w2        \n\t"
+		"eors    w0,     w3, w0        \n\t"
+		"eors    w2,     w2, w0        \n\t"
+		"eors    s1,     w1, w3        \n\t"
+		"eors    w3,     w3, s0        \n\t"
+		"ands    w0,     s0, w0        \n\t"
+		"eors    w0,     s1, w0        \n\t"
+		"ands    w1,     w2, s1        \n\t"
+		"eors    w1,     s0, w1        \n\t"
+		"/*sbox second column*/        \n\t"
+		"mvns    w4,     w4            \n\t"
+		"ands    s0,     w5, w4        \n\t"
+		"eors    s0,     w6, s0        \n\t"
+		"orrs    w6,     w5, w6        \n\t"
+		"eors    w4,     w7, w4        \n\t"
+		"eors    w6,     w6, w4        \n\t"
+		"eors    s1,     w5, w7        \n\t"
+		"eors    w7,     w7, s0        \n\t"
+		"ands    w4,     s0, w4        \n\t"
+		"eors    w4,     s1, w4        \n\t"
+		"ands    w5,     w6, s1        \n\t"
+		"eors    w5,     s0, w5        \n\t"
+		/* Each rotation below saves the bits that leave the top of one
+		 * word (ror + mask), shifts both words left, and XORs the saved
+		 * bits into the bottom of the other word. */
+    "/*rotate shift left 1 bit*/   \n\t"
+		"ror     s0,     w1, #31       \n\t"
+		"ands    s0,     s0, one       \n\t"
+		"lsl     w1,     w1, #1        \n\t"
+		"ror     s1,     w5, #31       \n\t"
+		"ands    s1,     s1, one       \n\t"
+		"eors    w1,     w1, s1        \n\t"
+		"lsl     w5,     w5, #1        \n\t"
+		"eors    w5,     w5, s0        \n\t"
+    "/*rotate shift left 8 bits*/  \n\t"
+		"ror     s0,     w2, #24       \n\t"
+		"ands    s0,     s0, ff        \n\t"
+		"lsl     w2,     w2, #8        \n\t"
+		"ror     s1,     w6, #24       \n\t"
+		"ands    s1,     s1, ff        \n\t"
+		"eors    w2,     w2, s1        \n\t"
+		"lsl     w6,     w6, #8        \n\t"
+		"eors    w6,     w6, s0        \n\t"
+    "/*rotate shift left 25 bits*/ \n\t"
+		"ror     s0,     w3, #7        \n\t"
+		"ands    s0,     s0, s2        \n\t"
+		"lsl     w3,     w3, #25       \n\t"
+		"ror     s1,     w7, #7        \n\t"
+		"ands    s1,     s1, s2        \n\t"
+		"eors    w3,     w3, s1        \n\t"
+		"lsl     w7,     w7, #25       \n\t"
+		"eors    w7,     w7, s0        \n\t"
+		"/*loop control*/              \n\t"
+		/* Advance to the next round constant and count the round down */
+ 		"adds    rc,     rc, #1        \n\t"
+		"subs    rounds, rounds,  #1   \n\t"
+		"bne     enc_loop              \n\t"
+		/* Write the state back using the same slot layout as the loads */
+		"str     w0,     [in]         \n\t"
+		"str     w4,     [in, #4]     \n\t"
+		"str     w1,     [in, #8]     \n\t"
+		"str     w5,     [in, #12]    \n\t"
+		"str     w2,     [in, #16]    \n\t"
+		"str     w6,     [in, #20]    \n\t"
+		"str     w3,     [in, #24]    \n\t"
+		"str     w7,     [in, #28]    \n\t"
+	);
+}
+
+/*
+ * AEAD encryption over a 32-byte state with an 8-byte rate.
+ *
+ * c       receives mlen ciphertext bytes followed by CRYPTO_ABYTES tag bytes
+ * clen    receives mlen + CRYPTO_ABYTES
+ * m/mlen  plaintext and its length
+ * ad/adlen associated data and its length
+ * nsec    not used
+ * npub    CRYPTO_NPUBBYTES of nonce
+ * k       CRYPTO_KEYBYTES of key
+ *
+ * Returns 0 on success, -1 when a padding buffer cannot be allocated.
+ */
+int crypto_aead_encrypt(unsigned char *c, unsigned long long *clen,
+                        const unsigned char *m, unsigned long long mlen,
+                        const unsigned char *ad, unsigned long long adlen,
+                        const unsigned char *nsec, const unsigned char *npub,
+                        const unsigned char *k) {
+	unsigned char state[32];
+	unsigned int *state32 = (unsigned int *)state;
+	unsigned char *ad_padded = NULL;
+	unsigned char *msg_padded = NULL;
+	unsigned int *in32 = NULL;
+	unsigned int *out32 = NULL;
+	unsigned int ad_blocks = 0;
+	unsigned int msg_blocks = 0;
+	unsigned int tail_len = 0;
+	unsigned int tail_off = 0;
+	unsigned int blk;
+	unsigned int n;
+
+	/* Padded copy of the associated data: data, 0x01, zeros to a whole block. */
+	if (adlen != 0) {
+		ad_blocks = (adlen + 8) >> 3;
+		ad_padded = malloc(ad_blocks << 3);
+		if (ad_padded == NULL) {
+			return -1;
+		}
+		memset(ad_padded, 0, ad_blocks << 3);
+		memcpy(ad_padded, ad, adlen);
+		ad_padded[adlen] = 0x01;
+	}
+
+	/* Padded copy of the plaintext, built the same way. */
+	if (mlen != 0) {
+		msg_blocks = (mlen + 8) >> 3;
+		msg_padded = malloc(msg_blocks << 3);
+		if (msg_padded == NULL) {
+			free(ad_padded);
+			return -1;
+		}
+		memset(msg_padded, 0, msg_blocks << 3);
+		memcpy(msg_padded, m, mlen);
+		msg_padded[mlen] = 0x01;
+	}
+
+	/* Initial state is nonce || key, followed by 52 rounds. */
+	memcpy(state, npub, CRYPTO_NPUBBYTES);
+	memcpy(state + CRYPTO_NPUBBYTES, k, CRYPTO_KEYBYTES);
+	permutation256(state, 52, constant6);
+
+	/* Absorb every padded AD block (ad_blocks is 0 when there is no AD). */
+	in32 = (unsigned int *)ad_padded;
+	for (blk = 0; blk < ad_blocks; blk++) {
+		state32[0] ^= in32[0];
+		state32[1] ^= in32[1];
+		in32 += 2;
+		permutation256(state, 28, constant6);
+	}
+
+	/* Domain separation between associated data and message. */
+	state[31] ^= 0x80;
+
+	if (mlen != 0) {
+		/* All complete blocks: XOR into the rate, emit it, permute. */
+		in32 = (unsigned int *)msg_padded;
+		out32 = (unsigned int *)c;
+		for (blk = 0; blk < msg_blocks - 1; blk++) {
+			state32[0] ^= in32[0];
+			state32[1] ^= in32[1];
+			in32 += 2;
+			out32[0] = state32[0];
+			out32[1] = state32[1];
+			out32 += 2;
+			permutation256(state, 28, constant6);
+		}
+
+		/* Trailing partial block is handled bytewise, then the pad byte. */
+		tail_len = mlen % 8;
+		tail_off = (msg_blocks - 1) << 3;
+		for (n = 0; n < tail_len; n++) {
+			state[n] ^= msg_padded[tail_off + n];
+			c[tail_off + n] = state[n];
+		}
+		state[tail_len] ^= 0x01;
+	}
+
+	/* Finalization; the tag is the first CRYPTO_ABYTES bytes of the state. */
+	permutation256(state, 32, constant6);
+	memcpy(c + mlen, state, CRYPTO_ABYTES);
+	*clen = mlen + CRYPTO_ABYTES;
+
+	free(ad_padded);
+	free(msg_padded);
+	return 0;
+}
+
+/*
+ * AEAD decryption and tag verification over a 32-byte state with an 8-byte
+ * rate.
+ *
+ * m       receives clen - CRYPTO_ABYTES plaintext bytes; wiped to zero when
+ *         the tag does not verify
+ * mlen    receives the plaintext length on success, 0 on any failure
+ * nsec    not used
+ * c/clen  ciphertext followed by the CRYPTO_ABYTES tag
+ * ad/adlen associated data and its length
+ * npub    CRYPTO_NPUBBYTES of nonce
+ * k       CRYPTO_KEYBYTES of key
+ *
+ * Returns 0 when the tag verifies; -1 when clen is shorter than a tag, when
+ * the padding buffer cannot be allocated, or when the tag does not match.
+ */
+int crypto_aead_decrypt(unsigned char *m, unsigned long long *mlen,
+                        unsigned char *nsec,
+                        const unsigned char *c, unsigned long long clen,
+                        const unsigned char *ad, unsigned long long adlen,
+                        const unsigned char *npub, const unsigned char *k)
+{
+	unsigned int u = 0;
+	unsigned int v;
+	unsigned int v1;
+	unsigned int last_index;
+	unsigned int i;
+	unsigned int tag_diff = 0;
+	unsigned char *A = NULL;
+	unsigned char S[32];
+	unsigned int *A32 = NULL;
+	unsigned int *M32 = NULL;
+	unsigned int *S32 = NULL;
+	unsigned int *C32 = NULL;
+
+	(void)nsec;
+	*mlen = 0;
+	if (clen < CRYPTO_ABYTES) {
+		return -1;
+	}
+	/* Block count (including the final partial block); computed only after
+	 * the length check so the subtraction cannot wrap. */
+	v = ((clen - CRYPTO_ABYTES) >> 3) + 1;
+
+	/* pad associated data: data, 0x01, zeros up to a whole block */
+	if (adlen != 0) {
+		u = (adlen + 8) >> 3;
+		A = malloc(u << 3);
+		if (A == NULL) {
+			return -1;
+		}
+		memset(A, 0, u << 3);
+		memcpy(A, ad, adlen);
+		A[adlen] = 0x01;
+		A32 = (unsigned int *)A;
+	}
+
+	/* initialization: state = nonce || key */
+	memcpy(S, npub, CRYPTO_NPUBBYTES);
+	memcpy(S + CRYPTO_NPUBBYTES, k, CRYPTO_KEYBYTES);
+	permutation256(S, 52, constant6);
+	S32 = (unsigned int *)S;
+
+	/* absorb associated data (u is 0 when there is none) */
+	for (i = 0; i < u; i++) {
+		S32[0] ^= A32[0];
+		S32[1] ^= A32[1];
+		A32 = A32 + 2;
+		permutation256(S, 28, constant6);
+	}
+	S[31] ^= 0x80;
+
+	/* process ciphertext */
+	if (clen != CRYPTO_ABYTES) {
+		M32 = (unsigned int *)m;
+		C32 = (unsigned int *)c;
+		for (i = 0; i < v - 1; i++) {
+			/* read the ciphertext words once, before m is written */
+			unsigned int c0 = C32[0];
+			unsigned int c1 = C32[1];
+			M32[0] = S32[0] ^ c0;
+			M32[1] = S32[1] ^ c1;
+			S32[0] = c0;
+			S32[1] = c1;
+			M32 = M32 + 2;
+			C32 = C32 + 2;
+			permutation256(S, 28, constant6);
+		}
+		v1 = (clen - CRYPTO_ABYTES) % 8;
+		last_index = (v - 1) << 3;
+		for (i = 0; i < v1; i++) {
+			unsigned char cb = c[last_index + i];
+			m[last_index + i] = S[i] ^ cb;
+			S[i] = cb;
+		}
+		S[i] ^= 0x01;
+	}
+
+	/* finalization */
+	permutation256(S, 32, constant6);
+
+	/* Compare the whole tag without an early exit so the running time does
+	 * not depend on the position of the first mismatching byte. */
+	for (i = 0; i < CRYPTO_ABYTES; i++) {
+		tag_diff |= (unsigned int)(c[clen - CRYPTO_ABYTES + i] ^ S[i]);
+	}
+
+	/* The padded AD copy is released on every path, including tag failure. */
+	free(A);
+
+	if (tag_diff != 0) {
+		memset(m, 0, clen - CRYPTO_ABYTES);
+		return -1;
+	}
+	*mlen = clen - CRYPTO_ABYTES;
+	return 0;
+}
+#else
+/*
+ * Placeholder compiled in the #else branch of the conditional that selects
+ * the real implementation. It ignores every argument, writes nothing to c or
+ * *clen, and returns 0.
+ * NOTE(review): reporting success without producing any output looks
+ * unintended - confirm whether this variant should fail instead.
+ */
+int crypto_aead_encrypt(unsigned char *c, unsigned long long *clen,
+                        const unsigned char *m, unsigned long long mlen,
+												const unsigned char *ad, unsigned long long adlen,
+												const unsigned char *nsec, const unsigned char *npub,
+												const unsigned char *k) {
+	return 0;
+}
+/*
+ * Placeholder compiled in the #else branch of the conditional that selects
+ * the real implementation. It ignores every argument, writes nothing to m or
+ * *mlen, and returns 0.
+ * NOTE(review): returning 0 means "tag verified" to callers even though no
+ * check was made - confirm whether this variant should fail instead.
+ */
+int crypto_aead_decrypt(unsigned char *m, unsigned long long *mlen,
+                        unsigned char *nsec,
+                        const unsigned char *c, unsigned long long clen,
+                        const unsigned char *ad, unsigned long long adlen,
+                        const unsigned char *npub, const unsigned char *k) {
+	return 0;
+}
+#endif
+
--- a/knot/Implementations/crypto_aead/knot128v1/armcortexm_2/api.h
+++ b/knot/Implementations/crypto_aead/knot128v1/armcortexm_2/api.h
+// Sizes in bits: key = nonce = tag = 128, state b = 256, rate r = 64, capacity c = 192
+#define CRYPTO_KEYBYTES 16 //
+#define CRYPTO_NSECBYTES 0
+#define CRYPTO_NPUBBYTES 16
+#define CRYPTO_ABYTES 16
+#define CRYPTO_NOOVERLAP 1
+
+
--- a/knot/Implementations/crypto_aead/knot128v1/armcortexm_2/auxFormat.h
+++ b/knot/Implementations/crypto_aead/knot128v1/armcortexm_2/auxFormat.h
+#include <stdio.h>
+#include <stdint.h>
+#include <stdlib.h>
+#include <string.h>
+#include"crypto_aead.h"
+#include"api.h"
+#include  <string.h>
+/* Expands to its argument unchanged (no byte swapping is performed). */
+#define U32BIG(x) (x)
+
+
+/* Element count of a true array; gives a wrong result when applied to a pointer. */
+#define ARR_SIZE(a) (sizeof((a))/sizeof((a[0])))
+/* Rotate the 32-bit value x left by n bits. n must be in 1..31: n == 0 would
+ * shift right by 32, which is undefined for a 32-bit operand. */
+#define LOTR32(x,n) (((x)<<(n))|((x)>>(32-(n))))
+
+
+/*
+ * Bitwise S-box: reads the four input words a, b, c, d and writes the four
+ * output words e, f, g, h using only NOT/AND/OR/XOR, so every bit position is
+ * processed independently. a..d are not modified.
+ * Requires u32 temporaries t1, t2, t3, t5, t6, t8, t9, t11 in the caller's
+ * scope. The outputs must not alias the inputs, because inputs are still read
+ * after h and g have been written.
+ */
+#define sbox(a, b, c, d, e, f, g, h)                                                                            \
+{                                                                                                                             \
+	t1 = ~a; t2 = b & t1;t3 = c ^ t2; h = d ^ t3; t5 = b | c; t6 = d ^ t1; g = t5 ^ t6; t8 = b ^ d; t9 = t3 & t6; e = t8 ^ t9; t11 = g & t8; f = t3 ^ t11; \
+}
+
+typedef unsigned char u8;
+typedef unsigned int u32;
+typedef unsigned long long u64;
+
+/*
+ * Converts 8 input bytes at `in` (read as two 32-bit words through a u32
+ * cast) into the two-word packed form stored in out[0] and out[1].
+ * Each word goes through four masked swap steps (shift distances 1, 2, 4, 8);
+ * afterwards out[0] combines the upper 16 bits of both permuted words and
+ * out[1] combines the lower 16 bits.
+ * NOTE(review): the swap sequence looks like a bit de-interleave (even bits
+ * to the low half, odd bits to the high half) - confirm against the spec.
+ * Requires u32 temporaries t1, t2, t3, t5 in the caller's scope. `in` is
+ * accessed as u32, so it is presumably expected to be word-accessible.
+ */
+#define packFormat(out,in) {\
+t1 = U32BIG(((u32*)in)[0]);	\
+t2 = U32BIG(((u32*)in)[1]);	\
+t3 = (t1 ^ (t1 >> 1)) & 0x22222222, t1 ^= t3 ^ (t3 << 1);	\
+t3 = (t1 ^ (t1 >> 2)) & 0x0C0C0C0C, t1 ^= t3 ^ (t3 << 2);	\
+t3 = (t1 ^ (t1 >> 4)) & 0x00F000F0, t1 ^= t3 ^ (t3 << 4);	\
+t3 = (t1 ^ (t1 >> 8)) & 0x0000FF00, t1 ^= t3 ^ (t3 << 8);  	\
+t5 = (t2 ^ (t2 >> 1)) & 0x22222222, t2 ^= t5 ^ (t5 << 1);	\
+t5 = (t2 ^ (t2 >> 2)) & 0x0C0C0C0C, t2 ^= t5 ^ (t5 << 2);	\
+t5 = (t2 ^ (t2 >> 4)) & 0x00F000F0, t2 ^= t5 ^ (t5 << 4);	\
+t5 = (t2 ^ (t2 >> 8)) & 0x0000FF00, t2 ^= t5 ^ (t5 << 8);  	\
+out[0] = (t2 & 0xFFFF0000) | (t1 >> 16);                  	\
+out[1] = (t2 << 16) | (t1 & 0x0000FFFF);                	\
+}
+/*
+ * Converts the two packed words in[0], in[1] back into 8 plain bytes.
+ * The halves are regrouped into t1/t2 and the four masked swap steps of
+ * packFormat are applied in the opposite order (8, 4, 2, 1).
+ * The result is written with a single 64-bit store through a u64 cast of
+ * `out`, so `out` must be a location where such a store is valid.
+ * Requires u32 temporaries t1, t2, t3, t5 in the caller's scope.
+ */
+#define unpackFormat(out, in) {\
+		t2 = (in[0] & 0xFFFF0000) | (in[1] >> 16); \
+		t1 = (in[1] & 0x0000FFFF) | (in[0] << 16); \
+		t3 = (t1 ^ (t1 >> 8)) & 0x0000FF00, t1 ^= t3 ^ (t3 << 8); \
+		t3 = (t1 ^ (t1 >> 4)) & 0x00F000F0, t1 ^= t3 ^ (t3 << 4); \
+		t3 = (t1 ^ (t1 >> 2)) & 0x0C0C0C0C, t1 ^= t3 ^ (t3 << 2); \
+		t3 = (t1 ^ (t1 >> 1)) & 0x22222222, t1 ^= t3 ^ (t3 << 1); \
+		t5 = (t2 ^ (t2 >> 8)) & 0x0000FF00, t2 ^= t5 ^ (t5 << 8); \
+		t5 = (t2 ^ (t2 >> 4)) & 0x00F000F0, t2 ^= t5 ^ (t5 << 4); \
+		t5 = (t2 ^ (t2 >> 2)) & 0x0C0C0C0C, t2 ^= t5 ^ (t5 << 2); \
+		t5 = (t2 ^ (t2 >> 1)) & 0x22222222, t2 ^= t5 ^ (t5 << 1); \
+		*((u64*)out) = ((u64)t2 << 32 | t1); \
+}
+/*
+ * Reads one 32-bit word from `in` (through a u32 cast), applies the same four
+ * masked swap steps that packFormat applies to each word (shift distances
+ * 1, 2, 4, 8) and stores the permuted word in *out.
+ * Requires u32 temporaries t1 and t2 in the caller's scope.
+ * The first statement used to be "t1, t2 = ...": the leading "t1," was a
+ * no-effect comma operand that read t1 before it was ever assigned.
+ */
+#define getU32Format(out,  in) {\
+		t2 = U32BIG(((u32*)in)[0]);	\
+		t1 = (t2 ^ (t2 >> 1)) & 0x22222222, t2 ^= t1 ^ (t1 << 1);	\
+		t1 = (t2 ^ (t2 >> 2)) & 0x0C0C0C0C, t2 ^= t1 ^ (t1 << 2);	\
+		t1 = (t2 ^ (t2 >> 4)) & 0x00F000F0, t2 ^= t1 ^ (t1 << 4);	\
+		t1 = (t2 ^ (t2 >> 8)) & 0x0000FF00, t2 ^= t1 ^ (t1 << 8);	\
+		*out = t2;	\
+}
+/*
+ * One round on the packed 8-word state `s` (caller scope), using `s_temp`
+ * (caller scope, 8 words) as scratch:
+ *   1. XOR the high nibble of round constant constant6Format[lunNum] into
+ *      s[0] and its low nibble into s[1];
+ *   2. apply sbox to the even-indexed words and to the odd-indexed words;
+ *   3. write the result back, rotating/swapping word pairs: (s[2], s[3]) take
+ *      s_temp[3] and s_temp[2] rotated by 1, s[4]/s[5] are rotated by 4, and
+ *      (s[6], s[7]) take s_temp[7] rotated by 12 and s_temp[6] rotated by 13.
+ * Also requires the sbox temporaries (t1, t2, t3, t5, t6, t8, t9, t11).
+ */
+#define ROUND256( constant6Format,lunNum) {\
+	s[0] ^= constant6Format[lunNum]>> 4;\
+	s[1] ^= constant6Format[lunNum]& 0x0f;\
+	sbox(s[0], s[2], s[4], s[6], s_temp[0], s_temp[2], s_temp[4], s_temp[6]);\
+	sbox(s[1], s[3], s[5], s[7], s_temp[1], s_temp[3], s_temp[5], s_temp[7]);\
+	s[0] = s_temp[0];\
+	s[1] = s_temp[1];\
+	s[2] = s_temp[3];\
+	s[3] = LOTR32(s_temp[2], 1);\
+	s[4] = LOTR32(s_temp[4], 4);\
+	s[5] = LOTR32(s_temp[5], 4);\
+	s[6] = LOTR32(s_temp[7], 12);\
+	s[7] = LOTR32(s_temp[6], 13);\
+}
+void printfFormat(char name[], u32 * in);
+void printU8(char name[], u8 var[], long len, int offset);
+
--- a/knot/Implementations/crypto_aead/knot128v1/armcortexm_2/crypto_aead.h
+++ b/knot/Implementations/crypto_aead/knot128v1/armcortexm_2/crypto_aead.h
+
+int crypto_aead_encrypt(
+	unsigned char *c, unsigned long long *clen,
+	const unsigned char *m, unsigned long long mlen,
+	const unsigned char *ad, unsigned long long adlen,
+	const unsigned char *nsec,
+	const unsigned char *npub,
+	const unsigned char *k
+);
+
+int crypto_aead_decrypt(
+	unsigned char *m, unsigned long long *mlen,
+	unsigned char *nsec,
+	const unsigned char *c, unsigned long long clen,
+	const unsigned char *ad, unsigned long long adlen,
+	const unsigned char *npub,
+	const unsigned char *k
+);
--- a/knot/Implementations/crypto_aead/knot128v1/armcortexm_2/encrypt.c
+++ b/knot/Implementations/crypto_aead/knot128v1/armcortexm_2/encrypt.c
+
+#include"auxFormat.h"
+
+#define RATE (64 / 8)
+
+#define PR0_ROUNDS 52
+#define PR_ROUNDS 28
+#define PRF_ROUNDS 32
+/*
+ * Round constants for the packed-state permutation, one byte per round
+ * (constant6_aead_128v1). The round function splits each byte into its high
+ * and low nibble and XORs them into the first two state words.
+ */
+unsigned char  constant6Format[63] = {
+	0x01, 0x10, 0x02, 0x20, 0x04, 0x41, 0x11, 0x12,
+	0x22, 0x24, 0x45, 0x50, 0x03, 0x30, 0x06, 0x61,
+	0x15, 0x53, 0x33, 0x36, 0x67, 0x74, 0x46, 0x60,
+	0x05, 0x51, 0x13, 0x32, 0x26, 0x65, 0x54, 0x42,
+	0x21, 0x14, 0x43, 0x31, 0x16, 0x63, 0x35, 0x57,
+	0x72, 0x27, 0x75, 0x56, 0x62, 0x25, 0x55, 0x52,
+	0x23, 0x34, 0x47, 0x70, 0x07, 0x71, 0x17, 0x73,
+	0x37, 0x77, 0x76, 0x66, 0x64, 0x44, 0x40
+};
+
+
+
+
+/*
+ * Runs `rounds` rounds of the permutation in place on the packed 8-word state
+ * at `in`, reading one round-constant byte per round starting at `rc`.
+ *
+ * State words are loaded as in[0]->w0, in[1]->w4, in[2]->w1, in[3]->w5,
+ * in[4]->w2, in[5]->w6, in[6]->w3, in[7]->w7 and stored back the same way.
+ * Per round: the constant's high nibble is XORed into w0 and its low nibble
+ * into w4, the S-box is applied to (w0,w1,w2,w3) and to (w4,w5,w6,w7), then
+ * the word pairs are rotated/swapped:
+ *   w1/w5: swapped, with the old w1 rotated right by 31;
+ *   w2/w6: each rotated right by 28;
+ *   w3/w7: swapped, rotated right by 20 and 19 respectively.
+ * These match the LOTR32 amounts 1, 4, 12 and 13 of ROUND256; the bit counts
+ * quoted inside the assembler text (8 and 25) do not match the instruction
+ * operands.
+ *
+ * The loop tests its counter at the bottom, so `rounds` must be at least 1.
+ * NOTE(review): the assembler text names C variables directly and the
+ * statement has no operand lists - this presumably relies on a toolchain
+ * that resolves them; confirm. `i`, `one`, `ff` and `s2` are not referenced
+ * by the assembler text. `enc_loop` is a plain label, so the function must
+ * not be emitted more than once per translation unit.
+ */
+static void permutation256(unsigned int *in, int rounds, unsigned char *rc) {
+	uint32_t w0, w1, w2, w3, w4, w5, w6, w7;
+	uint32_t s0, s1, s2;
+	uint32_t one = 0x1;
+	uint32_t i=0;
+	uint32_t ff = 0xff;
+	__asm volatile(
+		"ldr     w0,     [in]          \n\t"
+		"ldr     w4,     [in, #4]      \n\t"
+		"ldr     w1,     [in, #8]      \n\t"
+		"ldr     w5,     [in, #12]     \n\t"
+		"ldr     w2,     [in, #16]     \n\t"
+		"ldr     w6,     [in, #20]     \n\t"
+		"ldr     w3,     [in, #24]     \n\t"
+		"ldr     w7,     [in, #28]     \n\t"	
+	"enc_loop:                       \n\t"
+    "/*add round const   s0 s1*/           \n\t"
+		"ldrb    s0,     [rc]          \n\t"	
+		"LSR     s1,     s0, #4       \n\t"	
+		"and    s0,     s0, 0xf        \n\t"
+	  "eors    w4,     w4, s0        \n\t"
+	  "eors    w0,     w0, s1        \n\t"
+    "/*sbox first column*/         \n\t"
+		"mvns    w0,     w0            \n\t"
+		"ands    s0,     w1, w0        \n\t"
+		"eors    s0,     w2, s0        \n\t"
+		"orrs    w2,     w1, w2        \n\t"
+		"eors    w0,     w3, w0        \n\t"
+		"eors    w2,     w2, w0        \n\t"
+		"eors    s1,     w1, w3        \n\t"
+		"eors    w3,     w3, s0        \n\t"
+		"ands    w0,     s0, w0        \n\t"
+		"eors    w0,     s1, w0        \n\t"
+		"ands    w1,     w2, s1        \n\t"
+		"eors    w1,     s0, w1        \n\t"
+		"/*sbox second column*/        \n\t"
+		"mvns    w4,     w4            \n\t"
+		"ands    s0,     w5, w4        \n\t"
+		"eors    s0,     w6, s0        \n\t"
+		"orrs    w6,     w5, w6        \n\t"
+		"eors    w4,     w7, w4        \n\t"
+		"eors    w6,     w6, w4        \n\t"
+		"eors    s1,     w5, w7        \n\t"
+		"eors    w7,     w7, s0        \n\t"
+		"ands    w4,     s0, w4        \n\t"
+		"eors    w4,     s1, w4        \n\t"
+		"ands    w5,     w6, s1        \n\t"
+		"eors    w5,     s0, w5        \n\t"
+    "/*rotate shift left 1 bit*/   \n\t"
+		"mov    s0,     w5       \n\t"
+		"ROR    w5,     w1, #31        \n\t"
+		"mov    w1,     s0       \n\t"
+    "/*rotate shift left 8 bits*/  \n\t"
+		"ROR    w2,     w2, #28        \n\t"
+		"ROR    w6,     w6, #28       \n\t"
+    "/*rotate shift left 25 bits*/ \n\t"
+		"mov    s0,     w3       \n\t"
+		"ROR    w3,     w7, #20       \n\t"
+		"ROR    w7,     s0, #19       \n\t"
+		"/*loop control*/              \n\t"
+ 		"adds    rc,     rc, #1        \n\t"
+		"subs    rounds, rounds,  #1   \n\t"
+		"bne     enc_loop              \n\t"
+		"str     w0,     [in]         \n\t"
+		"str     w4,     [in, #4]     \n\t"
+		"str     w1,     [in, #8]     \n\t"
+		"str     w5,     [in, #12]    \n\t"
+		"str     w2,     [in, #16]    \n\t"
+		"str     w6,     [in, #20]    \n\t"
+		"str     w3,     [in, #24]    \n\t"
+		"str     w7,     [in, #28]    \n\t"
+	);
+}
+
+
+/*
+ * AEAD encryption on the packed 8-word state with an 8-byte rate (RATE).
+ *
+ * c       receives mlen ciphertext bytes followed by CRYPTO_ABYTES tag bytes
+ * clen    receives mlen + CRYPTO_ABYTES
+ * m/mlen  plaintext and its length
+ * ad/adlen associated data and its length
+ * nsec    not used
+ * npub    16 bytes of nonce, k 16 bytes of key
+ *
+ * Always returns 0.
+ *
+ * Every write to c goes through tempData + memcpy. Previously full blocks
+ * were stored straight into c through the 64-bit store inside unpackFormat,
+ * which requires c to be suitably aligned; the tail and the tag already
+ * used the staged copy.
+ * NOTE(review): npub, k, ad and m are still read through u32 casts inside
+ * packFormat - confirm callers pass word-accessible buffers.
+ */
+int crypto_aead_encrypt(unsigned char *c, unsigned long long *clen,
+	const unsigned char *m, unsigned long long mlen,
+	const unsigned char *ad, unsigned long long adlen,
+	const unsigned char *nsec, const unsigned char *npub,
+	const unsigned char *k) {
+	u32 s[8] = { 0 };
+	u32 dataFormat[2] = { 0 };
+	u8 tempData[8];
+	u32 t1, t2, t3, t5;	/* scratch words required by packFormat/unpackFormat */
+
+	(void)nsec;
+	*clen = mlen + CRYPTO_ABYTES;
+
+	/* initialization: state = pack(nonce) || pack(key) */
+	packFormat(s, npub);
+	packFormat((s + 2), (npub + 8));
+	packFormat((s + 4), k);
+	packFormat((s + 6), (k + 8));
+	permutation256(s, PR0_ROUNDS, constant6Format);
+
+	/* process associated data */
+	if (adlen) {
+		while (adlen >= RATE) {
+			packFormat(dataFormat, ad);
+			s[0] ^= dataFormat[0];
+			s[1] ^= dataFormat[1];
+			permutation256(s, PR_ROUNDS, constant6Format);
+			adlen -= RATE;
+			ad += RATE;
+		}
+		/* last block: remaining bytes, 0x01, zero fill */
+		memset(tempData, 0, sizeof(tempData));
+		memcpy(tempData, ad, adlen * sizeof(unsigned char));
+		tempData[adlen] = 0x01;
+		packFormat(dataFormat, tempData);
+		s[0] ^= dataFormat[0];
+		s[1] ^= dataFormat[1];
+		permutation256(s, PR_ROUNDS, constant6Format);
+	}
+
+	/* domain separation between associated data and message */
+	s[6] ^= 0x80000000;
+
+	/* process plaintext */
+	if (mlen) {
+		while (mlen >= RATE) {
+			packFormat(dataFormat, m);
+			s[0] ^= dataFormat[0];
+			s[1] ^= dataFormat[1];
+			unpackFormat(tempData, s);
+			memcpy(c, tempData, RATE);
+			permutation256(s, PR_ROUNDS, constant6Format);
+			mlen -= RATE;
+			m += RATE;
+			c += RATE;
+		}
+		/* last block: remaining bytes, 0x01, zero fill; emit only mlen bytes */
+		memset(tempData, 0, sizeof(tempData));
+		memcpy(tempData, m, mlen * sizeof(unsigned char));
+		tempData[mlen] = 0x01;
+		packFormat(dataFormat, tempData);
+		s[0] ^= dataFormat[0];
+		s[1] ^= dataFormat[1];
+		unpackFormat(tempData, s);
+		memcpy(c, tempData, mlen * sizeof(unsigned char));
+		c += mlen;
+	}
+
+	/* finalization */
+	permutation256(s, PRF_ROUNDS, constant6Format);
+
+	/* tag = unpacked first four state words, appended after the ciphertext */
+	unpackFormat(tempData, s);
+	memcpy(c, tempData, sizeof(tempData));
+	unpackFormat(tempData, (s + 2));
+	memcpy(c + 8, tempData, sizeof(tempData));
+	return 0;
+}
+
+/*
+ * AEAD decryption and tag verification on the packed 8-word state with an
+ * 8-byte rate (RATE).
+ *
+ * m       receives clen - CRYPTO_ABYTES plaintext bytes; wiped to zero when
+ *         the tag does not verify
+ * mlen    receives the plaintext length on success, 0 on failure
+ * nsec    not used
+ * c/clen  ciphertext followed by the CRYPTO_ABYTES tag
+ * ad/adlen associated data and its length
+ * npub    16 bytes of nonce, k 16 bytes of key
+ *
+ * Returns 0 when the tag verifies, -1 when clen is shorter than a tag or the
+ * tag does not match.
+ *
+ * Changes from the previous version: *mlen is no longer written before the
+ * length check (it used to receive a wrapped value for short inputs), the
+ * tag length is taken from CRYPTO_ABYTES instead of CRYPTO_KEYBYTES,
+ * plaintext blocks are staged through tempData instead of a 64-bit store
+ * straight into m, the tag is compared without short-circuiting, and the
+ * plaintext is cleared when verification fails.
+ * NOTE(review): npub, k, ad and c are still read through u32 casts inside
+ * packFormat - confirm callers pass word-accessible buffers.
+ */
+int crypto_aead_decrypt(unsigned char *m, unsigned long long *mlen,
+	unsigned char *nsec, const unsigned char *c, unsigned long long clen,
+	const unsigned char *ad, unsigned long long adlen,
+	const unsigned char *npub, const unsigned char *k) {
+	unsigned int i;
+	u32 s[8] = { 0 };
+	u32 dataFormat[4] = { 0 };
+	u32 dataFormat_1[2] = { 0 };
+	u8 tempU8[32] = { 0 };
+	u8 tempData[8];
+	u32 t1, t2, t3, t5;	/* scratch words required by packFormat/unpackFormat */
+	u32 tagDiff;
+	unsigned char *plain = m;	/* start of the output, kept for the wipe */
+	unsigned long long plainLen;
+
+	(void)nsec;
+	*mlen = 0;
+	if (clen < CRYPTO_ABYTES)
+		return -1;
+	plainLen = clen - CRYPTO_ABYTES;
+
+	/* initialization: state = pack(nonce) || pack(key) */
+	packFormat(s, npub);
+	packFormat((s + 2), (npub + 8));
+	packFormat((s + 4), k);
+	packFormat((s + 6), (k + 8));
+	permutation256(s, PR0_ROUNDS, constant6Format);
+
+	/* process associated data */
+	if (adlen) {
+		while (adlen >= RATE) {
+			packFormat(dataFormat, ad);
+			s[0] ^= dataFormat[0];
+			s[1] ^= dataFormat[1];
+			permutation256(s, PR_ROUNDS, constant6Format);
+			adlen -= RATE;
+			ad += RATE;
+		}
+		/* last block: remaining bytes, 0x01, zero fill */
+		memset(tempData, 0, sizeof(tempData));
+		memcpy(tempData, ad, adlen * sizeof(unsigned char));
+		tempData[adlen] = 0x01;
+		packFormat(dataFormat, tempData);
+		s[0] ^= dataFormat[0];
+		s[1] ^= dataFormat[1];
+		permutation256(s, PR_ROUNDS, constant6Format);
+	}
+
+	/* domain separation between associated data and message */
+	s[6] ^= 0x80000000;
+
+	/* process ciphertext */
+	clen = plainLen;
+	if (clen) {
+		while (clen >= RATE) {
+			packFormat(dataFormat, c);
+			dataFormat_1[0] = s[0] ^ dataFormat[0];
+			dataFormat_1[1] = s[1] ^ dataFormat[1];
+			unpackFormat(tempData, dataFormat_1);
+			memcpy(m, tempData, RATE);
+			s[0] = dataFormat[0];
+			s[1] = dataFormat[1];
+			permutation256(s, PR_ROUNDS, constant6Format);
+			clen -= RATE;
+			m += RATE;
+			c += RATE;
+		}
+		/* last block: work on the unpacked rate bytes, then re-pack */
+		unpackFormat(tempU8, s);
+		for (i = 0; i < clen; ++i, ++m, ++c)
+		{
+			*m = tempU8[i] ^ *c;
+			tempU8[i] = *c;
+		}
+		tempU8[i] ^= 0x01;
+		packFormat(s, tempU8);
+	}
+
+	/* finalization */
+	permutation256(s, PRF_ROUNDS, constant6Format);
+
+	/* c now points at the received tag; compare it in packed form, always
+	 * looking at all four words */
+	packFormat(dataFormat, c);
+	packFormat((dataFormat + 2), (c + 8));
+	tagDiff = (dataFormat[0] ^ s[0]) | (dataFormat[1] ^ s[1])
+		| (dataFormat[2] ^ s[2]) | (dataFormat[3] ^ s[3]);
+	if (tagDiff != 0) {
+		/* do not hand unauthenticated plaintext back to the caller */
+		memset(plain, 0, plainLen);
+		return -1;
+	}
+	*mlen = plainLen;
+	return 0;
+}
--- a/knot/Implementations/crypto_aead/knot128v2/armcortexm_1/api.h
+++ b/knot/Implementations/crypto_aead/knot128v2/armcortexm_1/api.h
+#ifndef KNOT_API_H
+#define KNOT_API_H
+// Sizes in bits: key = nonce = tag = 128, state b = 384, rate r = 192, capacity c = 192
+#define CRYPTO_KEYBYTES 16
+#define CRYPTO_NSECBYTES 0
+#define CRYPTO_NPUBBYTES 16
+#define CRYPTO_ABYTES 16
+#define CRYPTO_NOOVERLAP 1
+#endif
--- a/knot/Implementations/crypto_aead/knot128v2/armcortexm_1/crypto_aead.h
+++ b/knot/Implementations/crypto_aead/knot128v2/armcortexm_1/crypto_aead.h
+#ifndef KNOT_CRYPTO_AEAD_H
+#define KNOT_CRYPTO_AEAD_H
+int crypto_aead_encrypt(
+	unsigned char *c, unsigned long long *clen,
+	const unsigned char *m, unsigned long long mlen,
+	const unsigned char *ad, unsigned long long adlen,
+	const unsigned char *nsec,
+	const unsigned char *npub,
+	const unsigned char *k);
+
+int crypto_aead_decrypt(
+	unsigned char *m, unsigned long long *mlen,
+	unsigned char *nsec,
+	const unsigned char *c, unsigned long long clen,
+	const unsigned char *ad, unsigned long long adlen,
+	const unsigned char *npub,
+	const unsigned char *k);
+#endif
--- a/knot/Implementations/crypto_aead/knot128v2/armcortexm_1/encrypt.c
+++ b/knot/Implementations/crypto_aead/knot128v2/armcortexm_1/encrypt.c
+#include <stdio.h>
+#include <stdint.h>
+#include <stdlib.h>
+#include <string.h>
+#include "crypto_aead.h"
+#include "api.h"
+
+#define ARR_SIZE(a) (sizeof((a))/sizeof((a[0])))
+
+#define KNOT_CIPHER 1
+#if defined(KNOT_CIPHER) && (KNOT_CIPHER == 1)
+/*
+ * Round-constant table for permutation384: one byte per round, read
+ * sequentially from the start of the table on every call.
+ */
+unsigned char constant7[127] = {
+		0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x41, 0x03, 0x06,
+		0x0c, 0x18, 0x30, 0x61, 0x42, 0x05, 0x0a, 0x14, 0x28, 0x51, 0x23, 0x47,
+		0x0f, 0x1e, 0x3c, 0x79, 0x72, 0x64, 0x48, 0x11, 0x22, 0x45, 0x0b, 0x16,
+		0x2c, 0x59, 0x33, 0x67, 0x4e, 0x1d, 0x3a, 0x75, 0x6a, 0x54, 0x29, 0x53,
+		0x27, 0x4f, 0x1f, 0x3e, 0x7d, 0x7a, 0x74, 0x68, 0x50, 0x21, 0x43, 0x07,
+		0x0e, 0x1c, 0x38, 0x71, 0x62, 0x44, 0x09, 0x12, 0x24, 0x49, 0x13, 0x26,
+		0x4d, 0x1b, 0x36, 0x6d, 0x5a, 0x35, 0x6b, 0x56, 0x2d, 0x5b, 0x37, 0x6f,
+		0x5e, 0x3d, 0x7b, 0x76, 0x6c, 0x58, 0x31, 0x63, 0x46, 0x0d, 0x1a, 0x34,
+		0x69, 0x52, 0x25, 0x4b, 0x17, 0x2e, 0x5d, 0x3b, 0x77, 0x6e, 0x5c, 0x39,
+		0x73, 0x66, 0x4c, 0x19, 0x32, 0x65, 0x4a, 0x15, 0x2a, 0x55, 0x2b, 0x57,
+		0x2f, 0x5f, 0x3f, 0x7f, 0x7e, 0x7c, 0x78, 0x70, 0x60, 0x40 };
+
+/* State
+ * w8  w4 w0
+ * w9  w5 w1
+ * w10 w6 w2
+ * w11 w7 w3
+ *
+ * Sbox
+	t1  = ~a;
+	t2  = b  & t1;
+	t3  = c  ^ t2; 
+	h   = d  ^ t3; 
+	t5  = b  | c; 
+	t6  = d  ^ t1; 
+	g   = t5 ^ t6; 
+	t8  = b  ^ d; 
+	t9  = t3 & t6; 
+	e   = t8 ^ t9; 
+	t11 = g  & t8; 
+	f   = t3 ^ t11;
+ *
+ * Sbox after change
+	a  = ~a; 
+	s0  = b  & a;
+	s0  = c  ^ s0;
+	c  = b  | c; 
+	a  = d  ^ a; 
+	c   = c ^ a; 
+	s1  = b  ^ d; 
+	d   = d  ^ s0;
+	a  = s0 & a; 
+	a   = s1 ^ a; 
+	b = c  & s1; 
+	b   = s0 ^ b;
+ */
+/*
+ * Runs `rounds` rounds of the permutation in place on the 48-byte state at
+ * `in`, reading one round-constant byte per round starting at `rc`.
+ *
+ * The twelve state words are loaded so that each row of the state occupies
+ * three registers: row 0 = (w0, w4, w8), row 1 = (w1, w5, w9),
+ * row 2 = (w2, w6, w10), row 3 = (w3, w7, w11); they are stored back to the
+ * same offsets at the end.
+ * Per round: the constant byte is XORed into w0, the S-box is applied to the
+ * three register columns, then row 1 is rotated by 1 bit and row 2 by 8 bits
+ * across its three words (shift left, with the bits shifted out of one word
+ * inserted at the bottom of the next). Row 3 is shifted by 23 bits in the
+ * same way and its three words are then cycled, which the assembler text
+ * labels a 55-bit rotation.
+ * `value` is built as (0x7ff << 12) ^ 0xfff = 0x7fffff, the mask for the
+ * 23 carried bits.
+ *
+ * The loop tests its counter at the bottom, so `rounds` must be at least 1.
+ * NOTE(review): the assembler text names C variables directly and the
+ * statement has no operand lists - this presumably relies on a toolchain
+ * that resolves them; confirm. `enc_loop` is a plain label, so the function
+ * must not be emitted more than once per translation unit.
+ */
+static void permutation384(unsigned char *in, int rounds, unsigned char *rc) {
+	uint32_t w0, w1, w2, w3, w4, w5, w6, w7, w8, w9, w10, w11;
+	uint32_t s0, s1, s2;
+	uint32_t one = 0x1;
+	uint32_t ff = 0xff;
+	uint32_t value;
+	__asm volatile(
+		"ldr     w0,     [in]          \n\t"
+		"ldr     w4,     [in, #4]      \n\t"
+		"ldr     w8,     [in, #8]      \n\t"
+		"ldr     w1,     [in, #12]     \n\t"
+		"ldr     w5,     [in, #16]     \n\t"
+		"ldr     w9,     [in, #20]     \n\t"
+		"ldr     w2,     [in, #24]     \n\t"
+		"ldr     w6,     [in, #28]     \n\t"
+		"ldr     w10,    [in, #32]     \n\t"
+		"ldr     w3,     [in, #36]     \n\t"
+		"ldr     w7,     [in, #40]     \n\t"
+		"ldr     w11,    [in, #44]     \n\t"
+		"mov     s0,     0xfff         \n\t"
+		"mov     value,  0x7ff         \n\t"
+		"lsl     value,  value, #12    \n\t"
+		"eors    value,  value, s0     \n\t"
+	"enc_loop:                       \n\t"
+    "/*add round const*/           \n\t"
+		"ldrb    s0,     [rc]          \n\t"
+	  "eors    w0,     w0, s0        \n\t"
+    "/*sbox first column*/         \n\t"
+		"mvns    w0,     w0            \n\t"
+		"ands    s0,     w1, w0        \n\t"
+		"eors    s0,     w2, s0        \n\t"
+		"orrs    w2,     w1, w2        \n\t"
+		"eors    w0,     w3, w0        \n\t"
+		"eors    w2,     w2, w0        \n\t"
+		"eors    s1,     w1, w3        \n\t"
+		"eors    w3,     w3, s0        \n\t"
+		"ands    w0,     s0, w0        \n\t"
+		"eors    w0,     s1, w0        \n\t"
+		"ands    w1,     w2, s1        \n\t"
+		"eors    w1,     s0, w1        \n\t"
+		"/*sbox second column*/        \n\t"
+		"mvns    w4,     w4            \n\t"
+		"ands    s0,     w5, w4        \n\t"
+		"eors    s0,     w6, s0        \n\t"
+		"orrs    w6,     w5, w6        \n\t"
+		"eors    w4,     w7, w4        \n\t"
+		"eors    w6,     w6, w4        \n\t"
+		"eors    s1,     w5, w7        \n\t"
+		"eors    w7,     w7, s0        \n\t"
+		"ands    w4,     s0, w4        \n\t"
+		"eors    w4,     s1, w4        \n\t"
+		"ands    w5,     w6, s1        \n\t"
+		"eors    w5,     s0, w5        \n\t"
+		"/*sbox third column*/         \n\t"
+		"mvns    w8,     w8            \n\t"
+		"ands    s0,     w9,  w8       \n\t"
+		"eors    s0,     w10, s0       \n\t"
+		"orrs    w10,    w9,  w10      \n\t"
+		"eors    w8,     w11, w8       \n\t"
+		"eors    w10,    w10, w8       \n\t"
+		"eors    s1,     w9,  w11      \n\t"
+		"eors    w11,    w11, s0       \n\t"
+		"ands    w8,     s0,  w8       \n\t"
+		"eors    w8,     s1,  w8       \n\t"
+		"ands    w9,     w10, s1       \n\t"
+		"eors    w9,     s0,  w9       \n\t"
+    "/*rotate shift left 1 bit*/   \n\t"
+		"ror     s0,     w1, #31       \n\t"
+		"ands    s0,     s0, one       \n\t"
+		"lsl     w1,     w1, #1        \n\t"
+		"ror     s1,     w9, #31       \n\t"
+		"ands    s1,     s1, one       \n\t"
+		"eors    w1,     w1, s1        \n\t"
+		"ror     s2,     w5, #31       \n\t"
+		"ands    s2,     s2, one       \n\t"
+		"lsl     w5,     w5, #1        \n\t"
+		"eors    w5,     w5, s0        \n\t"
+		"lsl     w9,     w9, #1        \n\t"
+		"eors    w9,     w9, s2        \n\t"
+    "/*rotate shift left 8 bits*/  \n\t"
+		"ror     s0,     w2,  #24      \n\t"
+		"ands    s0,     s0,  ff       \n\t"
+		"lsl     w2,     w2,  #8       \n\t"
+		"ror     s1,     w10, #24      \n\t"
+		"ands    s1,     s1,  ff       \n\t"
+		"eors    w2,     w2,  s1       \n\t"
+		"ror     s2,     w6,  #24      \n\t"
+		"ands    s2,     s2,  ff       \n\t"
+		"lsl     w6,     w6,  #8       \n\t"
+		"eors    w6,     w6,  s0       \n\t"
+		"lsl     w10,    w10, #8       \n\t"
+		"eors    w10,    w10, s2       \n\t"
+    "/*rotate shift left 55 bits*/ \n\t"
+		"ror     s0,     w11, #9       \n\t"
+		"ands    s0,     s0,  value    \n\t"
+		"lsl     w11,    w11, #23      \n\t"
+		"ror     s1,     w7,  #9       \n\t"
+		"ands    s1,     s1,  value    \n\t"
+		"eors    w11,    w11, s1       \n\t"
+		"ror     s2,     w3,  #9       \n\t"
+		"ands    s2,     s2,  value    \n\t"
+		"lsl     w3,     w3,  #23      \n\t"
+		"eors    w3,     w3,  s0       \n\t"
+		"lsl     w7,     w7,  #23      \n\t"
+		"eors    w7,     w7,  s2       \n\t"
+		"mov     s0,     w3            \n\t"
+		"mov     w3,     w11           \n\t"
+		"mov     w11,    w7            \n\t"
+		"mov     w7,     s0            \n\t"
+		"/*loop control*/              \n\t"
+ 		"adds    rc,     rc,  #1       \n\t"
+		"subs    rounds, rounds, #1    \n\t"
+		"bne     enc_loop              \n\t"
+		"str     w0,     [in]          \n\t"
+		"str     w4,     [in, #4]      \n\t"
+		"str     w8,     [in, #8]      \n\t"
+		"str     w1,     [in, #12]     \n\t"
+		"str     w5,     [in, #16]     \n\t"
+		"str     w9,     [in, #20]     \n\t"
+		"str     w2,     [in, #24]     \n\t"
+		"str     w6,     [in, #28]     \n\t"
+		"str     w10,    [in, #32]     \n\t"
+		"str     w3,     [in, #36]     \n\t"
+		"str     w7,     [in, #40]     \n\t"
+		"str     w11,    [in, #44]     \n\t"
+	);
+}
+
+/*
+ * AEAD encryption over a 48-byte state with a 24-byte rate.
+ *
+ * c       receives mlen ciphertext bytes followed by CRYPTO_ABYTES tag bytes
+ * clen    receives mlen + CRYPTO_ABYTES
+ * m/mlen  plaintext and its length
+ * ad/adlen associated data and its length
+ * nsec    not used
+ * npub    CRYPTO_NPUBBYTES of nonce
+ * k       CRYPTO_KEYBYTES of key
+ *
+ * Returns 0 on success, -1 when a padding buffer cannot be allocated.
+ */
+int crypto_aead_encrypt(unsigned char *c, unsigned long long *clen,
+                        const unsigned char *m, unsigned long long mlen,
+                        const unsigned char *ad, unsigned long long adlen,
+                        const unsigned char *nsec, const unsigned char *npub,
+                        const unsigned char *k) {
+	unsigned char state[48];
+	unsigned int *state32 = (unsigned int *)state;
+	unsigned char *ad_padded = NULL;
+	unsigned char *msg_padded = NULL;
+	unsigned int *in32 = NULL;
+	unsigned int *out32 = NULL;
+	unsigned int ad_blocks = 0;
+	unsigned int msg_blocks = 0;
+	unsigned int tail_len = 0;
+	unsigned int tail_off = 0;
+	unsigned int blk;
+	unsigned int w;
+	unsigned int n;
+
+	/* Padded copy of the associated data: data, 0x01, zeros to a whole block. */
+	if (adlen != 0) {
+		ad_blocks = adlen / 24 + 1;
+		ad_padded = malloc(ad_blocks * 24);
+		if (ad_padded == NULL) {
+			return -1;
+		}
+		memset(ad_padded, 0, ad_blocks * 24);
+		memcpy(ad_padded, ad, adlen);
+		ad_padded[adlen] = 0x01;
+	}
+
+	/* Padded copy of the plaintext, built the same way. */
+	if (mlen != 0) {
+		msg_blocks = mlen / 24 + 1;
+		msg_padded = malloc(msg_blocks * 24);
+		if (msg_padded == NULL) {
+			free(ad_padded);
+			return -1;
+		}
+		memset(msg_padded, 0, msg_blocks * 24);
+		memcpy(msg_padded, m, mlen);
+		msg_padded[mlen] = 0x01;
+	}
+
+	/* Initial state: nonce || key || zeros, top bit of the last byte set. */
+	memcpy(state, npub, CRYPTO_NPUBBYTES);
+	memcpy(state + CRYPTO_NPUBBYTES, k, CRYPTO_KEYBYTES);
+	memset(state + CRYPTO_NPUBBYTES + CRYPTO_KEYBYTES, 0, CRYPTO_KEYBYTES);
+	state[47] ^= 0x80;
+	permutation384(state, 76, constant7);
+
+	/* Absorb every padded AD block (ad_blocks is 0 when there is no AD). */
+	in32 = (unsigned int *)ad_padded;
+	for (blk = 0; blk < ad_blocks; blk++) {
+		for (w = 0; w < 6; w++) {
+			state32[w] ^= in32[w];
+		}
+		in32 += 6;
+		permutation384(state, 28, constant7);
+	}
+
+	/* Domain separation between associated data and message. */
+	state[47] ^= 0x80;
+
+	if (mlen != 0) {
+		/* All complete blocks: XOR into the rate, emit it, permute. */
+		in32 = (unsigned int *)msg_padded;
+		out32 = (unsigned int *)c;
+		for (blk = 0; blk < msg_blocks - 1; blk++) {
+			for (w = 0; w < 6; w++) {
+				state32[w] ^= in32[w];
+				out32[w] = state32[w];
+			}
+			in32 += 6;
+			out32 += 6;
+			permutation384(state, 28, constant7);
+		}
+
+		/* Trailing partial block is handled bytewise, then the pad byte. */
+		tail_len = mlen % 24;
+		tail_off = (msg_blocks - 1) * 24;
+		for (n = 0; n < tail_len; n++) {
+			state[n] ^= msg_padded[tail_off + n];
+			c[tail_off + n] = state[n];
+		}
+		state[tail_len] ^= 0x01;
+	}
+
+	/* Finalization; the tag is the first CRYPTO_ABYTES bytes of the state. */
+	permutation384(state, 32, constant7);
+	memcpy(c + mlen, state, CRYPTO_ABYTES);
+	*clen = mlen + CRYPTO_ABYTES;
+
+	free(ad_padded);
+	free(msg_padded);
+	return 0;
+}
+
+/*
+ * AEAD decryption and tag verification over a 48-byte state with a 24-byte
+ * rate.
+ *
+ * m       receives clen - CRYPTO_ABYTES plaintext bytes; wiped to zero when
+ *         the tag does not verify
+ * mlen    receives the plaintext length on success, 0 on any failure
+ * nsec    not used
+ * c/clen  ciphertext followed by the CRYPTO_ABYTES tag
+ * ad/adlen associated data and its length
+ * npub    CRYPTO_NPUBBYTES of nonce
+ * k       CRYPTO_KEYBYTES of key
+ *
+ * Returns 0 when the tag verifies; -1 when clen is shorter than a tag, when
+ * the padding buffer cannot be allocated, or when the tag does not match.
+ */
+int crypto_aead_decrypt(unsigned char *m, unsigned long long *mlen,
+                        unsigned char *nsec,
+                        const unsigned char *c, unsigned long long clen,
+                        const unsigned char *ad, unsigned long long adlen,
+                        const unsigned char *npub, const unsigned char *k)
+{
+	unsigned int u = 0;
+	unsigned int v = 0;
+	unsigned int v1 = 0;
+	unsigned int last_index = 0;
+	unsigned int i;
+	unsigned int w;
+	unsigned int tag_diff = 0;
+	unsigned char *A = NULL;
+	unsigned char S[48];
+	unsigned int *A32 = NULL;
+	unsigned int *M32 = NULL;
+	unsigned int *S32 = NULL;
+	unsigned int *C32 = NULL;
+
+	(void)nsec;
+	*mlen = 0;
+	if (clen < CRYPTO_ABYTES) {
+		return -1;
+	}
+
+	/* pad associated data: data, 0x01, zeros up to a whole block */
+	if (adlen != 0) {
+		u = adlen / 24 + 1;
+		A = malloc(u * 24);
+		if (A == NULL) {
+			return -1;
+		}
+		memset(A, 0, u * 24);
+		memcpy(A, ad, adlen);
+		A[adlen] = 0x01;
+		A32 = (unsigned int *)A;
+	}
+
+	/* initialization: nonce || key || zeros, top bit of the last byte set */
+	memcpy(S, npub, CRYPTO_NPUBBYTES);
+	memcpy(S + CRYPTO_NPUBBYTES, k, CRYPTO_KEYBYTES);
+	memset(S + CRYPTO_NPUBBYTES + CRYPTO_KEYBYTES, 0, CRYPTO_KEYBYTES);
+	S[47] ^= 0x80;
+	permutation384(S, 76, constant7);
+	S32 = (unsigned int *)S;
+
+	/* absorb associated data (u is 0 when there is none) */
+	for (i = 0; i < u; i++) {
+		for (w = 0; w < 6; w++) {
+			S32[w] ^= A32[w];
+		}
+		A32 = A32 + 6;
+		permutation384(S, 28, constant7);
+	}
+	S[47] ^= 0x80;
+
+	/* process ciphertext */
+	if (clen != CRYPTO_ABYTES) {
+		M32 = (unsigned int *)m;
+		C32 = (unsigned int *)c;
+		v = (clen - CRYPTO_ABYTES) / 24  + 1;
+		for (i = 0; i < v - 1; i++) {
+			for (w = 0; w < 6; w++) {
+				/* read the ciphertext word once, before m is written */
+				unsigned int cw = C32[w];
+				M32[w] = S32[w] ^ cw;
+				S32[w] = cw;
+			}
+			M32 = M32 + 6;
+			C32 = C32 + 6;
+			permutation384(S, 28, constant7);
+		}
+		v1 = (clen - CRYPTO_ABYTES) % 24;
+		last_index = (v - 1) * 24;
+		for (i = 0; i < v1; i++) {
+			unsigned char cb = c[last_index + i];
+			m[last_index + i] = S[i] ^ cb;
+			S[i] = cb;
+		}
+		S[i] ^= 0x01;
+	}
+
+	/* finalization */
+	permutation384(S, 32, constant7);
+
+	/* Compare the whole tag without an early exit so the running time does
+	 * not depend on the position of the first mismatching byte. */
+	for (i = 0; i < CRYPTO_ABYTES; i++) {
+		tag_diff |= (unsigned int)(c[clen - CRYPTO_ABYTES + i] ^ S[i]);
+	}
+
+	/* The padded AD copy is released on every path, including tag failure. */
+	free(A);
+
+	if (tag_diff != 0) {
+		memset(m, 0, clen - CRYPTO_ABYTES);
+		return -1;
+	}
+	*mlen = clen - CRYPTO_ABYTES;
+	return 0;
+}
+#else
+/*
+ * Placeholder compiled when KNOT_CIPHER is not defined as 1. It ignores every
+ * argument, writes nothing to c or *clen, and returns 0.
+ * NOTE(review): reporting success without producing any output looks
+ * unintended - confirm whether this variant should fail instead.
+ */
+int crypto_aead_encrypt(unsigned char *c, unsigned long long *clen,
+                        const unsigned char *m, unsigned long long mlen,
+												const unsigned char *ad, unsigned long long adlen,
+												const unsigned char *nsec, const unsigned char *npub,
+												const unsigned char *k) {
+	return 0;
+}
+/*
+ * Placeholder compiled when KNOT_CIPHER is not defined as 1. It ignores every
+ * argument, writes nothing to m or *mlen, and returns 0.
+ * NOTE(review): returning 0 means "tag verified" to callers even though no
+ * check was made - confirm whether this variant should fail instead.
+ */
+int crypto_aead_decrypt(unsigned char *m, unsigned long long *mlen,
+                        unsigned char *nsec,
+                        const unsigned char *c, unsigned long long clen,
+                        const unsigned char *ad, unsigned long long adlen,
+                        const unsigned char *npub, const unsigned char *k) {
+	return 0;
+}
+#endif
--- a/knot/Implementations/crypto_aead/knot128v2/armcortexm_2/api.h
+++ b/knot/Implementations/crypto_aead/knot128v2/armcortexm_2/api.h
+#ifndef KNOT_API_H
+#define KNOT_API_H
+/* Size parameters of this AEAD implementation, all in bytes. */
+/* Key length: encrypt.c copies 16 key bytes into the initial state. */
+#define CRYPTO_KEYBYTES 16
+/* No secret message number is used; the nsec argument is ignored. */
+#define CRYPTO_NSECBYTES 0
+/* Nonce length: encrypt.c consumes 12 + 4 bytes of npub. */
+#define CRYPTO_NPUBBYTES 16
+/* Tag length: the ciphertext is CRYPTO_ABYTES longer than the plaintext. */
+#define CRYPTO_ABYTES 16
+/* NOTE(review): presumably tells the test harness that input and output
+ * buffers must not overlap - confirm against the harness documentation. */
+#define CRYPTO_NOOVERLAP 1
+#endif /* KNOT_API_H */
+
+
--- a/knot/Implementations/crypto_aead/knot128v2/armcortexm_2/auxFormat.h
+++ b/knot/Implementations/crypto_aead/knot128v2/armcortexm_2/auxFormat.h
+//#include<malloc.h>
+#include"crypto_aead.h"
+#include"api.h"
+
+#include <stdio.h>
+#include <stdint.h>
+#include <stdlib.h>
+#include <string.h>
+// Identity wrapper: expands to its argument unchanged.
+// NOTE(review): presumably a hook for byte-order conversion on other targets - confirm.
+#define U32BIG(x) (x)
+
+// Short integer aliases used by the macros and functions of this implementation.
+typedef unsigned char u8;
+typedef unsigned int u32;
+typedef unsigned long long u64;
+
+// Element count of a true array (meaningless when applied to a pointer).
+#define ARR_SIZE(a) (sizeof((a))/sizeof((a[0])))
+// Rotate the 32-bit value x left by n bits. n must be in 1..31: with n == 0 the
+// right-hand shift would be by 32, which is undefined for a 32-bit operand.
+#define LOTR32(x,n) (((x)<<(n))|((x)>>(32-(n))))
+
+//////////////////puck begin
+// puckU32ToThree(x): gather every third bit of x into the top of the word.
+// Only the 11 bits selected by 0x92492492 (positions 1, 4, 7, ..., 31) survive;
+// the shift/mask cascade moves them, order preserved, into bits 21..31 and
+// leaves bits 0..20 zero. x is modified in place, so it must be a modifiable
+// u32 lvalue (it is evaluated several times).
+// Operation count: 5 ANDs, 4 left shifts, 4 ORs.
+//&:5   <<:4    |:4
+#define puckU32ToThree(x){\
+x &= 0x92492492;\
+x = (x | (x << 2)) & 0xc30c30c3;\
+x = (x | (x << 4)) & 0xf00f00f0;\
+x = (x | (x << 8)) & 0xff0000ff;\
+x = (x | (x << 16)) & 0xfff00000;\
+}
+// unpuckU32ToThree(x): reverse cascade of puckU32ToThree. Takes the bits held in
+// the top of x (mask 0xfff00000) and spreads them back out to the positions
+// selected by 0x92492492; every other bit of the result is zero. x is modified
+// in place.
+#define unpuckU32ToThree(x){\
+x &= 0xfff00000;\
+x = (x | (x >> 16)) & 0xff0000ff;\
+x = (x | (x >> 8)) & 0xf00f00f0;\
+x = (x | (x >> 4)) & 0xc30c30c3;\
+x = (x | (x >> 2)) & 0x92492492;\
+}
+// packU32FormatToThreePacket(out, in): split the 32-bit word read from `in`
+// (a byte pointer) into three bit-sliced words out[0..2]. The two top bits of
+// byte in[3] are saved in t2_64/t2_65 before the word is shifted left by 2 and
+// are re-inserted at bit 10 of out[1] and out[2]; out[0] receives 10 bits.
+// The expansion uses, and overwrites, these caller-provided locals:
+//   t2, t2_64, t2_65, u32 temp2[3].
+// Arguments are evaluated several times; pass parenthesised, side-effect-free
+// expressions.
+#define packU32FormatToThreePacket( out,  in) {\
+t2 = U32BIG(((u32*)in)[0]);	\
+t2_64 = (in[3] & 0x80) >> 7, t2_65 = (in[3] & 0x40) >> 6;	\
+t2 = t2 << 2;	\
+temp2[0] = t2; temp2[1] = t2 << 1; temp2[2] = t2 << 2;	\
+puckU32ToThree(temp2[0]);	\
+puckU32ToThree(temp2[1]);	\
+puckU32ToThree(temp2[2]);	\
+out[0] = (temp2[0] >> 22);	\
+out[1] = (((u32)t2_64) << 10) | (temp2[1] >> 22);	\
+out[2] =(((u32)t2_65) << 10) | (temp2[2] >> 22);	\
+}
+// packU96FormatToThreePacket(out, in): read 12 bytes from `in` as three 32-bit
+// words (in[8..11] -> t9, in[4..7] -> t1, in[0..3] -> t2) and write three
+// bit-sliced 32-bit words out[0..2]. Each output word combines the gathered
+// bits of t9 (high part), t1 (shifted right by 11) and t2 (shifted right by
+// 22); the bits that the pre-shifts of t1 and t2 would drop are carried in
+// t1_32, t2_64 and t2_65 and OR-ed back in explicitly.
+// `in` is read through a u32 pointer cast, so it must be readable as words.
+// The expansion uses, and overwrites, these caller-provided locals:
+//   t9, t1, t2, t1_32, t2_64, t2_65, u32 temp0[3], temp1[3], temp2[3].
+#define packU96FormatToThreePacket(out, in) {\
+t9 = U32BIG(((u32*)in)[2]);	\
+t1 = U32BIG(((u32*)in)[1]);	\
+t2 = U32BIG(((u32*)in)[0]);	\
+t1_32 = (in[7] & 0x80) >> 7, t2_64 = (in[3] & 0x80) >> 7, t2_65 = (in[3] & 0x40) >> 6;	\
+t1 = t1 << 1;	\
+t2 = t2 << 2;	\
+temp0[0] = t9; temp0[1] = t9 << 1; temp0[2] = t9 << 2;	\
+puckU32ToThree(temp0[0]);	\
+puckU32ToThree(temp0[1]);	\
+puckU32ToThree(temp0[2]);	\
+temp1[0] = t1; temp1[1] = t1 << 1; temp1[2] = t1 << 2;	\
+puckU32ToThree(temp1[0]);	\
+puckU32ToThree(temp1[1]);	\
+puckU32ToThree(temp1[2]);	\
+temp2[0] = t2; temp2[1] = t2 << 1; temp2[2] = t2 << 2;	\
+puckU32ToThree(temp2[0]);	\
+puckU32ToThree(temp2[1]);	\
+puckU32ToThree(temp2[2]);	\
+out[0] = (temp0[0]) | (temp1[0] >> 11) | (temp2[0] >> 22);	\
+out[1] = (temp0[1]) | (temp1[1] >> 11) | (((u32)t2_64) << 10) | (temp2[1] >> 22);	\
+out[2] = (temp0[2]) | (((u32)t1_32) << 21) | (temp1[2] >> 11) | (((u32)t2_65) << 10) | (temp2[2] >> 22);	\
+}
+	// unpackU32FormatToThreePacket(out, in): rebuild one 32-bit word from the
+	// three bit-sliced words in[0..2] and store it at `out` through a u32
+	// pointer cast (4 bytes written).
+	// Caller-provided locals used and overwritten: t2, t2_64, t2_65, u32 temp2[3].
+	// t2_64 and t2_65 receive values positioned at bits 31 and 30, so they must
+	// be 32 bits wide here.
+#define unpackU32FormatToThreePacket(out, in) {\
+temp2[0] = (in[0] & 0x000003ff) << 22;	\
+t2_64 = ((in[1] & 0x00000400) << 21);	\
+temp2[1] = (in[1] & 0x000003ff) << 22;	\
+t2_65 = ((in[2] & 0x00000400) << 20);	\
+temp2[2] = (in[2] & 0x000003ff) << 22;	\
+unpuckU32ToThree(temp2[0]);	\
+unpuckU32ToThree(temp2[1]);	\
+unpuckU32ToThree(temp2[2]);	\
+t2 = t2_65 | t2_64 | ((temp2[0] | temp2[1] >> 1 | temp2[2] >> 2) >> 2);	\
+*(u32*)(out) = U32BIG(t2);	\
+}
+// unpackU96FormatToThreePacket(out, in): rebuild 12 bytes from the three
+// bit-sliced words in[0..2] and store them at out[0..11] as three u32 writes
+// (t2 at out, t1 at out + 4, t9 at out + 8).
+// Caller-provided locals used and overwritten:
+//   u32 temp0[3], temp1[3], temp2[3]; u32 t1_32, t2_64, t2_65; t9, t1, t2.
+// `out` is written through u32 pointer casts; arguments are evaluated several
+// times, so pass parenthesised, side-effect-free expressions.
+#define unpackU96FormatToThreePacket( out, in) {\
+temp0[0] = in[0] & 0xffe00000;	\
+temp1[0] = (in[0] & 0x001ffc00) << 11;	\
+temp2[0] = (in[0] & 0x000003ff) << 22;	\
+temp0[1] = in[1] & 0xffe00000;	\
+temp1[1] = (in[1] & 0x001ff800) << 11;	\
+t2_64 = ((in[1] & 0x00000400) << 21);	\
+temp2[1] = (in[1] & 0x000003ff) << 22;	\
+temp0[2] = in[2] & 0xffc00000;	\
+t1_32 = ((in[2] & 0x00200000) << 10);	\
+temp1[2] = (in[2] & 0x001ff800) << 11;	\
+t2_65 = ((in[2] & 0x00000400) << 20);	\
+temp2[2] = (in[2] & 0x000003ff) << 22;	\
+unpuckU32ToThree(temp0[0]);	\
+unpuckU32ToThree(temp0[1]);	\
+unpuckU32ToThree(temp0[2]);	\
+t9 = temp0[0] | temp0[1] >> 1 | temp0[2] >> 2;	\
+unpuckU32ToThree(temp1[0]);	\
+unpuckU32ToThree(temp1[1]);	\
+unpuckU32ToThree(temp1[2]);	\
+t1 = t1_32 | ((temp1[0] | temp1[1] >> 1 | temp1[2] >> 2) >> 1);	\
+unpuckU32ToThree(temp2[0]);	\
+unpuckU32ToThree(temp2[1]);	\
+unpuckU32ToThree(temp2[2]);	\
+t2 = t2_65 | t2_64 | ((temp2[0] | temp2[1] >> 1 | temp2[2] >> 2) >> 2);	\
+*(u32*)(out) = U32BIG(t2);	\
+*(u32*)(out + 4) = U32BIG(t1);	\
+*(u32*)(out + 8) = U32BIG(t9);	\
+}
+
+// Element count of a true array. Identical to the definition earlier in this
+// header; repeating an identical macro definition is permitted.
+#define ARR_SIZE(a) (sizeof((a))/sizeof((a[0])))
+// sbox(a, b, c, d, e, f, g, h): bitwise S-box evaluated on whole words.
+// Inputs a, b, c, d; outputs e, f, g, h. Uses the caller-provided scratch
+// locals t1, t2, t3, t5, t6, t8, t9, t11. Outputs are assigned while inputs
+// are still being read, so an output must not alias an input.
+#define sbox(a, b, c, d, e, f, g, h)                                                                            \
+{                                                                                                                             \
+	t1 = ~a; t2 = b & t1;t3 = c ^ t2; h = d ^ t3; t5 = b | c; t6 = d ^ t1; g = t5 ^ t6; t8 = b ^ d; t9 = t3 & t6; e = t8 ^ t9; t11 = g & t8; f = t3 ^ t11; \
+}
+
+
+// Row-rotation helpers: inputs t0, t1, t2, outputs t3, t4, t5.
+// NOTE(review): the names suggest 96-bit left rotations by 1, 8 and 55 bits on
+// a row held as three bit-sliced words - confirm against the reference code.
+// In each helper the three words are permuted and rotated individually.
+// Rotation by 1: two words are copied, one is rotated left by 1.
+#define U96_BIT_LOTR32_1(t0,t1,t2,t3,t4,t5){\
+t3= t1;\
+t4 = t2;\
+t5 = LOTR32(t0, 1); \
+}
+// Rotation by 8: one word rotated left by 2, two words rotated left by 3.
+#define U96_BIT_LOTR32_8(t0,t1,t2,t3,t4,t5){\
+t3= LOTR32(t2, 2);\
+t4 =LOTR32(t0, 3);\
+t5 = LOTR32(t1, 3); \
+}
+// Rotation by 55 (55 = 3*18 + 1): two words rotated left by 18, one by 19.
+//55=3*18+1
+#define U96_BIT_LOTR32_55(t0,t1,t2,t3,t4,t5){\
+t3= LOTR32(t1, 18); \
+t4 = LOTR32(t2, 18);\
+t5 = LOTR32(t0, 19); \
+}
+/*
+s0  s1  s2
+s3  s4  s5
+s6  s7  s8
+s9 s10 s11
+*/
+
+// Debug-print helper declarations. Their definitions are not part of this
+// header; NOTE(review): presumably they print `name` followed by the data -
+// confirm against the defining source file.
+void printU32State(char name[], u32* var, long len);
+void printfU96Format(char name[], u32 * s);
+//////////////////puck end
+void printU8(char name[], u8 var[], int len, int offset);
+// Repeated declaration of printfU96Format; redundant but compatible with the one above.
+void printfU96Format(char name[], u32 * s);
--- a/knot/Implementations/crypto_aead/knot128v2/armcortexm_2/crypto_aead.h
+++ b/knot/Implementations/crypto_aead/knot128v2/armcortexm_2/crypto_aead.h
+int crypto_aead_encrypt(
+	unsigned char *c, unsigned long long *clen,
+	const unsigned char *m, unsigned long long mlen,
+	const unsigned char *ad, unsigned long long adlen,
+	const unsigned char *nsec,
+	const unsigned char *npub,
+	const unsigned char *k
+);
+
+int crypto_aead_decrypt(
+	unsigned char *m, unsigned long long *mlen,
+	unsigned char *nsec,
+	const unsigned char *c, unsigned long long clen,
+	const unsigned char *ad, unsigned long long adlen,
+	const unsigned char *npub,
+	const unsigned char *k
+);
--- a/knot/Implementations/crypto_aead/knot128v2/armcortexm_2/encrypt.c
+++ b/knot/Implementations/crypto_aead/knot128v2/armcortexm_2/encrypt.c
+
+#include"auxFormat.h"
+
+/* Bytes absorbed or produced per permutation call (192 bits = 24 bytes). */
+#define aead_RATE (192 / 8)
+/* Rounds of the permutation run once after loading nonce and key. */
+#define PR0_ROUNDS 76
+/* Rounds run after each associated-data or message block. */
+#define PR_ROUNDS 28
+/* Rounds run once before the tag is extracted. */
+#define PRF_ROUNDS 32
+
+/*
+ * Round-constant table for permutation384: one byte is consumed per round,
+ * starting at index 0 on every call. Each byte carries three fields
+ * (bits 7..6, bits 5..3 and bits 2..0) that the permutation XORs into three
+ * different state words. The largest round count used in this file is 76,
+ * well inside the 127 entries.
+ */
+unsigned char  constant7Format[127] = {
+	/*constant7Format[127]:*/
+	0x01,0x08,0x40,0x02,0x10,0x80,0x05,0x09,0x48,0x42,0x12,0x90,
+	0x85,0x0c,0x41,0x0a,0x50,0x82,0x15,0x89,0x4d,0x4b,0x5a,0xd2,
+	0x97,0x9c,0xc4,0x06,0x11,0x88,0x45,0x0b,0x58,0xc2,0x17,0x99,
+	0xcd,0x4e,0x53,0x9a,0xd5,0x8e,0x54,0x83,0x1d,0xc9,0x4f,0x5b,
+	0xda,0xd7,0x9e,0xd4,0x86,0x14,0x81,0x0d,0x49,0x4a,0x52,0x92,
+	0x95,0x8c,0x44,0x03,0x18,0xc0,0x07,0x19,0xc8,0x47,0x1b,0xd8,
+	0xc7,0x1e,0xd1,0x8f,0x5c,0xc3,0x1f,0xd9,0xcf,0x5e,0xd3,0x9f,
+	0xdc,0xc6,0x16,0x91,0x8d,0x4c,0x43,0x1a,0xd0,0x87,0x1c,0xc1,
+	0x0f,0x59,0xca,0x57,0x9b,0xdd,0xce,0x56,0x93,0x9d,0xcc,0x46,
+	0x13,0x98,0xc5,0x0e,0x51,0x8a,0x55,0x8b,0x5d,0xcb,0x5f,0xdb,
+	0xdf,0xde,0xd6,0x96,0x94,0x84,0x04, };
+/* State
+ * w8  w4 w0
+ * w9  w5 w1
+ * w10 w6 w2
+ * w11 w7 w3
+ */
+/*
+ * In-place 384-bit permutation on the 12-word state `in`.
+ *
+ * in     - 12 x 32-bit state words; loaded into w0..w11 in the order
+ *          w0,w4,w8, w1,w5,w9, w2,w6,w10, w3,w7,w11 and stored back the same way
+ * rounds - number of rounds; must be at least 1, because the counter is
+ *          decremented before it is tested for zero
+ * rc     - round-constant bytes, one read per round; must hold at least
+ *          `rounds` bytes
+ *
+ * Each round: split the constant byte into three fields and XOR them into
+ * w8, w4 and w0; apply the S-box to each of the three columns; rotate the
+ * three lower rows.
+ *
+ * NOTE(review): the asm block names C variables directly and has no
+ * operand/clobber lists, and `enc_loop` is a plain (non-local) label. This is
+ * presumably written for a toolchain that resolves C identifiers inside asm -
+ * confirm before building with a different compiler.
+ * NOTE(review): `i` is declared but not referenced by the visible code.
+ */
+ static void permutation384(unsigned int *in, int rounds, unsigned char *rc) {
+
+	uint32_t w0, w1, w2, w3, w4, w5, w6, w7, w8, w9, w10, w11;
+	uint32_t s0, s1, s2;
+	uint32_t i=0;
+	__asm volatile(
+		/* Load the state. */
+		"ldr     w0,     [in]          \n\t"
+		"ldr     w4,     [in, #4]      \n\t"
+		"ldr     w8,     [in, #8]      \n\t"
+		"ldr     w1,     [in, #12]     \n\t"
+		"ldr     w5,     [in, #16]     \n\t"
+		"ldr     w9,     [in, #20]     \n\t"
+		"ldr     w2,     [in, #24]     \n\t"
+		"ldr     w6,     [in, #28]     \n\t"
+		"ldr     w10,    [in, #32]     \n\t"
+		"ldr     w3,     [in, #36]     \n\t"
+		"ldr     w7,     [in, #40]     \n\t"
+		"ldr     w11,    [in, #44]     \n\t"	
+	"enc_loop:                       \n\t"
+    "/*add round const   s0 s1*/           \n\t"
+		/* s1 = bits 7..6, s2 = bits 5..3, s0 = bits 2..0 of the constant byte. */
+		"ldrb    s0,     [rc]          \n\t"	
+		"LSR     s1,     s0, #6       \n\t"	
+		"and    s1,     s1, 0x3        \n\t"
+		"LSR     s2,     s0, #3       \n\t"	
+		"and    s2,     s2, 0x7        \n\t"
+		"and    s0,     s0, 0x7        \n\t"
+	  "eors    w8,     w8, s0        \n\t"
+	  "eors    w4,     w4, s2        \n\t"
+	  "eors    w0,     w0, s1        \n\t"
+    "/*sbox first column*/         \n\t"
+		"mvns    w0,     w0            \n\t"
+		"ands    s0,     w1, w0        \n\t"
+		"eors    s0,     w2, s0        \n\t"
+		"orrs    w2,     w1, w2        \n\t"
+		"eors    w0,     w3, w0        \n\t"
+		"eors    w2,     w2, w0        \n\t"
+		"eors    s1,     w1, w3        \n\t"
+		"eors    w3,     w3, s0        \n\t"
+		"ands    w0,     s0, w0        \n\t"
+		"eors    w0,     s1, w0        \n\t"
+		"ands    w1,     w2, s1        \n\t"
+		"eors    w1,     s0, w1        \n\t"
+		"/*sbox second column*/        \n\t"
+		"mvns    w4,     w4            \n\t"
+		"ands    s0,     w5, w4        \n\t"
+		"eors    s0,     w6, s0        \n\t"
+		"orrs    w6,     w5, w6        \n\t"
+		"eors    w4,     w7, w4        \n\t"
+		"eors    w6,     w6, w4        \n\t"
+		"eors    s1,     w5, w7        \n\t"
+		"eors    w7,     w7, s0        \n\t"
+		"ands    w4,     s0, w4        \n\t"
+		"eors    w4,     s1, w4        \n\t"
+		"ands    w5,     w6, s1        \n\t"
+		"eors    w5,     s0, w5        \n\t"
+		"/*sbox third column*/         \n\t"
+		"mvns    w8,     w8            \n\t"
+		"ands    s0,     w9,  w8       \n\t"
+		"eors    s0,     w10, s0       \n\t"
+		"orrs    w10,    w9,  w10      \n\t"
+		"eors    w8,     w11, w8       \n\t"
+		"eors    w10,    w10, w8       \n\t"
+		"eors    s1,     w9,  w11      \n\t"
+		"eors    w11,    w11, s0       \n\t"
+		"ands    w8,     s0,  w8       \n\t"
+		"eors    w8,     s1,  w8       \n\t"
+		"ands    w9,     w10, s1       \n\t"
+		"eors    w9,     s0,  w9       \n\t"
+    "/*rotate shift left 1 bit  [w9 w5 w1-> (w1,1) w9 w5] */   \n\t"
+		/* Words move one slot; the wrapped word is rotated left by 1 (ROR #31). */
+		"mov    s0,     w1       \n\t"
+		"mov    w1,     w5       \n\t"
+		"mov    w5,     w9       \n\t"
+		"ROR    w9,     s0, #31        \n\t"
+    "/*rotate shift left 8 bits [w10 w6 w2-> w6,3)  (w2,3)  ( w10,2)]*/  \n\t"
+		/* ROR #29 is a left rotation by 3, ROR #30 a left rotation by 2. */
+		"mov    s0,     w10       \n\t"
+		"ROR    w10,    w6 , #29      \n\t"
+		"ROR    w6,     w2  , #29      \n\t"
+		"ROR    w2,     s0, #30        \n\t"
+    "/*rotate shift left 55 bit  [w11 w7 w3-> w3,13)  (w11,14)  ( w7,14)] */   \n\t"
+		/* ROR #14 is a left rotation by 18, ROR #13 a left rotation by 19. */
+		"mov    s0,     w3       \n\t"
+		"ROR    w3,     w7 , #14      \n\t"
+		"ROR    w7,     w11 , #14      \n\t"
+		"ROR    w11,     s0, #13        \n\t"
+		"/*loop control*/              \n\t"
+		/* Advance to the next constant byte and count the round down. */
+ 		"adds    rc,     rc,  #1       \n\t"
+		"subs    rounds, rounds, #1    \n\t"
+		"bne     enc_loop              \n\t"
+		/* Store the state back. */
+		"str     w0,     [in]          \n\t"
+		"str     w4,     [in, #4]      \n\t"
+		"str     w8,     [in, #8]      \n\t"
+		"str     w1,     [in, #12]     \n\t"
+		"str     w5,     [in, #16]     \n\t"
+		"str     w9,     [in, #20]     \n\t"
+		"str     w2,     [in, #24]     \n\t"
+		"str     w6,     [in, #28]     \n\t"
+		"str     w10,    [in, #32]     \n\t"
+		"str     w3,     [in, #36]     \n\t"
+		"str     w7,     [in, #40]     \n\t"
+		"str     w11,    [in, #44]     \n\t"
+	);
+}
+
+/*
+ * AEAD encryption on a bit-sliced 384-bit state.
+ *
+ * c     - output; receives mlen ciphertext bytes followed by a
+ *         CRYPTO_ABYTES-byte tag (written through 32-bit stores)
+ * clen  - receives mlen + CRYPTO_ABYTES
+ * m     - plaintext, mlen bytes
+ * ad    - associated data, adlen bytes
+ * nsec  - unused
+ * npub  - nonce, 16 bytes are read
+ * k     - key, 16 bytes are read
+ *
+ * Always returns 0.
+ */
+int crypto_aead_encrypt(unsigned char *c, unsigned long long *clen,
+	const unsigned char *m, unsigned long long mlen,
+	const unsigned char *ad, unsigned long long adlen,
+	const unsigned char *nsec, const unsigned char *npub,
+	const unsigned char *k) {
+	u32 s[12] = { 0 };
+	u8 tempData[24] = { 0 };
+	u32 dataFormat[6] = { 0 };
+	/* Scratch locals required by the pack/unpack macros of auxFormat.h. */
+	u32 t1, t2, t9;
+	u32 t1_32, t2_64, t2_65;
+	u32 temp0[3] = { 0 };
+	u32 temp1[3] = { 0 };
+	u32 temp2[3] = { 0 };
+
+	(void)nsec;
+
+	*clen = mlen + CRYPTO_ABYTES;
+
+	/* Initialization: state = pack(nonce || key), then the long permutation. */
+	packU96FormatToThreePacket(s, npub);
+	memcpy(tempData, npub+12, sizeof(unsigned char)*4);
+	memcpy(tempData+4, k, sizeof(unsigned char) * 16);
+	packU96FormatToThreePacket((s + 3), tempData);
+	packU96FormatToThreePacket((s + 6), (tempData+12));
+
+	s[9] = 0x80000000;
+	permutation384(s,PR0_ROUNDS,constant7Format);
+
+	/* Associated data: absorb full blocks, then one padded block. */
+	if (adlen) {
+		while (adlen >= aead_RATE) {
+			packU96FormatToThreePacket(dataFormat, ad);
+			s[0] ^= dataFormat[0];
+			s[1] ^= dataFormat[1];
+			s[2] ^= dataFormat[2];
+			packU96FormatToThreePacket((dataFormat+3), (ad+12));
+			s[3] ^= dataFormat[3];
+			s[4] ^= dataFormat[4];
+			s[5] ^= dataFormat[5];
+			permutation384(s,PR_ROUNDS,constant7Format);
+			adlen -= aead_RATE;
+			ad += aead_RATE;
+		}
+		/* Fewer than aead_RATE bytes remain: zero-fill and append 0x01. */
+		memset(tempData, 0, sizeof(tempData));
+		memcpy(tempData, ad, adlen * sizeof(unsigned char));
+		tempData[adlen] = 0x01;
+		packU96FormatToThreePacket(dataFormat, tempData);
+		s[0] ^= dataFormat[0];
+		s[1] ^= dataFormat[1];
+		s[2] ^= dataFormat[2];
+		packU96FormatToThreePacket((dataFormat + 3), (tempData + 12));
+		s[3] ^= dataFormat[3];
+		s[4] ^= dataFormat[4];
+		s[5] ^= dataFormat[5];
+		permutation384(s,PR_ROUNDS,constant7Format);
+	}
+	/* Flip one state bit between the associated-data and message phases. */
+	s[9] ^= 0x80000000;
+
+	/* Message: XOR each block into the state; the state words are the ciphertext. */
+	if (mlen) {
+		while (mlen >= aead_RATE) {
+			packU96FormatToThreePacket(dataFormat, m);
+			s[0] ^= dataFormat[0];
+			s[1] ^= dataFormat[1];
+			s[2] ^= dataFormat[2];
+			packU96FormatToThreePacket((dataFormat + 3), (m + 12));
+			s[3] ^= dataFormat[3];
+			s[4] ^= dataFormat[4];
+			s[5] ^= dataFormat[5];
+			unpackU96FormatToThreePacket(c, s);
+			unpackU96FormatToThreePacket((c+12), (s+3));
+			permutation384(s,PR_ROUNDS,constant7Format);
+			mlen -= aead_RATE;
+			m += aead_RATE;
+			c += aead_RATE;
+		}
+		/* Final padded block; only the first mlen ciphertext bytes are emitted. */
+		memset(tempData, 0, sizeof(tempData));
+		memcpy(tempData, m, mlen * sizeof(unsigned char));
+		tempData[mlen]= 0x01;
+		packU96FormatToThreePacket(dataFormat, tempData);
+		s[0] ^= dataFormat[0];
+		s[1] ^= dataFormat[1];
+		s[2] ^= dataFormat[2];
+		packU96FormatToThreePacket((dataFormat + 3), (tempData + 12));
+		s[3] ^= dataFormat[3];
+		s[4] ^= dataFormat[4];
+		s[5] ^= dataFormat[5];
+		unpackU96FormatToThreePacket(tempData, s);
+		unpackU96FormatToThreePacket((tempData+12), (s+3));
+		memcpy(c, tempData, mlen * sizeof(unsigned char));
+		c += mlen;
+	}
+
+	/* Finalization. */
+	permutation384(s,PRF_ROUNDS,constant7Format);
+
+	/* Tag: 12 bytes unpacked straight into c, 4 more via the scratch buffer. */
+	unpackU96FormatToThreePacket(c, s);
+	unpackU96FormatToThreePacket(tempData, (s + 3));
+	memcpy(c+12, tempData, sizeof(unsigned char) * 4);
+	return 0;
+}
+
+/*
+ * AEAD decryption and tag verification on a bit-sliced 384-bit state.
+ *
+ * m     - output; receives clen - CRYPTO_ABYTES plaintext bytes (written
+ *         through 32-bit stores for full blocks)
+ * mlen  - receives the plaintext length; set to 0 when the input is too short
+ * nsec  - unused
+ * c     - ciphertext followed by a CRYPTO_ABYTES-byte tag, clen bytes
+ * ad    - associated data, adlen bytes
+ * npub  - nonce, 16 bytes are read
+ * k     - key, 16 bytes are read
+ *
+ * Returns 0 when the tag verifies. Returns -1 when clen is shorter than a
+ * tag or the tag does not match; in the latter case the plaintext already
+ * written to m is cleared so unauthenticated data is not released.
+ */
+int crypto_aead_decrypt(unsigned char *m, unsigned long long *mlen,
+	unsigned char *nsec, const unsigned char *c, unsigned long long clen,
+	const unsigned char *ad, unsigned long long adlen,
+	const unsigned char *npub, const unsigned char *k) {
+
+	u8 i;
+	u8 diff = 0;
+	unsigned char *const plaintext = m;	/* m is advanced while decrypting */
+	unsigned long long plaintext_len;
+	u32 s[12] = { 0 };
+	u32 dataFormat[12] = { 0 };
+	u32 dataFormat_1[12] = { 0 };
+	u8 tempData[24] = { 0 };
+	u8 tempU8[24] = { 0 };
+	/* Scratch locals required by the pack/unpack macros of auxFormat.h. */
+	u32 t1, t2, t9;
+	u32 t1_32, t2_64, t2_65;
+	u32 temp0[3] = { 0 };
+	u32 temp1[3] = { 0 };
+	u32 temp2[3] = { 0 };
+
+	(void)nsec;
+
+	/* Check the length before subtracting, so the result cannot wrap. */
+	if (clen < CRYPTO_ABYTES) {
+		*mlen = 0;
+		return -1;
+	}
+	plaintext_len = clen - CRYPTO_ABYTES;
+	*mlen = plaintext_len;
+
+	/* Initialization: state = pack(nonce || key), then the long permutation. */
+	packU96FormatToThreePacket(s, npub);
+	memcpy(tempData, npub + 12, sizeof(unsigned char) * 4);
+	memcpy(tempData + 4, k, sizeof(unsigned char) * 16);
+	packU96FormatToThreePacket((s + 3), tempData);
+	packU96FormatToThreePacket((s + 6), (tempData + 12));
+
+	s[9] = 0x80000000;
+	permutation384(s,PR0_ROUNDS,constant7Format);
+
+	/* Associated data: absorb full blocks, then one padded block. */
+	if (adlen) {
+		while (adlen >= aead_RATE) {
+			packU96FormatToThreePacket(dataFormat, ad);
+			s[0] ^= dataFormat[0];
+			s[1] ^= dataFormat[1];
+			s[2] ^= dataFormat[2];
+			packU96FormatToThreePacket((dataFormat + 3), (ad + 12));
+			s[3] ^= dataFormat[3];
+			s[4] ^= dataFormat[4];
+			s[5] ^= dataFormat[5];
+			permutation384(s,PR_ROUNDS,constant7Format);
+			adlen -= aead_RATE;
+			ad += aead_RATE;
+		}
+		memset(tempData, 0, sizeof(tempData));
+		memcpy(tempData, ad, adlen * sizeof(unsigned char));
+		tempData[adlen] = 0x01;
+		packU96FormatToThreePacket(dataFormat, tempData);
+		s[0] ^= dataFormat[0];
+		s[1] ^= dataFormat[1];
+		s[2] ^= dataFormat[2];
+		packU96FormatToThreePacket((dataFormat + 3), (tempData + 12));
+		s[3] ^= dataFormat[3];
+		s[4] ^= dataFormat[4];
+		s[5] ^= dataFormat[5];
+		permutation384(s,PR_ROUNDS,constant7Format);
+	}
+	/* Flip one state bit between the associated-data and message phases. */
+	s[9] ^= 0x80000000;
+
+	clen = plaintext_len;
+	if (clen) {
+		/* Full blocks: plaintext = state ^ ciphertext, then state = ciphertext. */
+		while (clen >= aead_RATE) {
+			packU96FormatToThreePacket(dataFormat, c);
+			dataFormat_1[0] = s[0] ^ dataFormat[0];
+			dataFormat_1[1] = s[1] ^ dataFormat[1];
+			dataFormat_1[2] = s[2] ^ dataFormat[2];
+			packU96FormatToThreePacket((dataFormat+3), (c+12));
+			dataFormat_1[3] = s[3] ^ dataFormat[3];
+			dataFormat_1[4] = s[4] ^ dataFormat[4];
+			dataFormat_1[5] = s[5] ^ dataFormat[5];
+			unpackU96FormatToThreePacket(m, dataFormat_1);
+			unpackU96FormatToThreePacket((m + 12), (dataFormat_1 + 3));
+			s[0] = dataFormat[0];
+			s[1] = dataFormat[1];
+			s[2] = dataFormat[2];
+			s[3] = dataFormat[3];
+			s[4] = dataFormat[4];
+			s[5] = dataFormat[5];
+			permutation384(s,PR_ROUNDS,constant7Format);
+			clen -= aead_RATE;
+			m += aead_RATE;
+			c += aead_RATE;
+		}
+		/* Partial block: work on the byte view of the rate part of the state. */
+		unpackU96FormatToThreePacket(tempU8, s);
+		unpackU96FormatToThreePacket((tempU8+12), (s+3));
+		for (i = 0; i < clen; ++i, ++m, ++c)
+		{
+			*m = tempU8[i] ^ *c;
+			tempU8[i] = *c;
+		}
+		tempU8[i] ^= 0x01;
+		packU96FormatToThreePacket(s, tempU8);
+		packU96FormatToThreePacket((s + 3), (tempU8 + 12));
+	}
+
+	/* Finalization. */
+	permutation384(s,PRF_ROUNDS,constant7Format);
+
+	/* Recompute the tag; c now points at the received tag. */
+	unpackU96FormatToThreePacket(tempU8, s);
+	unpackU96FormatToThreePacket((tempU8+12), (s+3));
+
+	/* Accumulate all byte differences so the comparison does not exit early. */
+	for (i = 0; i < CRYPTO_ABYTES; ++i) {
+		diff |= (u8)(tempU8[i] ^ c[i]);
+	}
+	if (diff != 0) {
+		memset(plaintext, 0, (size_t)plaintext_len);
+		return -1;
+	}
+	return 0;
+}
--- a/knot/Implementations/crypto_aead/knot192/armcortexm_1/api.h
+++ b/knot/Implementations/crypto_aead/knot192/armcortexm_1/api.h
+#ifndef KNOT_API_H
+#define KNOT_API_H
+// Sizes below are in bytes: key = nonce = tag = 24 bytes (192 bits).
+// State b = 384 bits, rate r = 96 bits, capacity c = 288 bits.
+#define CRYPTO_KEYBYTES 24
+#define CRYPTO_NSECBYTES 0
+#define CRYPTO_NPUBBYTES 24
+#define CRYPTO_ABYTES 24
+#define CRYPTO_NOOVERLAP 1
+#endif
--- a/knot/Implementations/crypto_aead/knot192/armcortexm_1/crypto_aead.h
+++ b/knot/Implementations/crypto_aead/knot192/armcortexm_1/crypto_aead.h
+#ifndef KNOT_CRYPTO_AEAD_H
+#define KNOT_CRYPTO_AEAD_H
+/*
+ * Encrypt m (mlen bytes) with associated data ad (adlen bytes) under nonce
+ * npub and key k. The ciphertext followed by the tag is written to c and the
+ * total length to *clen. nsec is not used by the implementation in encrypt.c.
+ * Returns 0 on success, -1 if a temporary buffer cannot be allocated.
+ */
+int crypto_aead_encrypt(
+	unsigned char *c, unsigned long long *clen,
+	const unsigned char *m, unsigned long long mlen,
+	const unsigned char *ad, unsigned long long adlen,
+	const unsigned char *nsec,
+	const unsigned char *npub,
+	const unsigned char *k);
+
+/*
+ * Verify and decrypt c (clen bytes, tag included) with associated data ad
+ * under nonce npub and key k. The plaintext is written to m and its length to
+ * *mlen. nsec is not used by the implementation in encrypt.c.
+ * Returns 0 when the tag verifies, -1 otherwise.
+ */
+int crypto_aead_decrypt(
+	unsigned char *m, unsigned long long *mlen,
+	unsigned char *nsec,
+	const unsigned char *c, unsigned long long clen,
+	const unsigned char *ad, unsigned long long adlen,
+	const unsigned char *npub,
+	const unsigned char *k);
+#endif
--- a/knot/Implementations/crypto_aead/knot192/armcortexm_1/encrypt.c
+++ b/knot/Implementations/crypto_aead/knot192/armcortexm_1/encrypt.c
+#include <stdio.h>
+#include <stdint.h>
+#include <stdlib.h>
+#include <string.h>
+#include "crypto_aead.h"
+#include "api.h"
+
+/* Element count of a true array (meaningless when applied to a pointer). */
+#define ARR_SIZE(a) (sizeof((a))/sizeof((a[0])))
+
+/* Build switch: 1 compiles the real cipher below, any other value the stubs. */
+#define KNOT_CIPHER 1
+#if defined(KNOT_CIPHER) && (KNOT_CIPHER == 1)
+/*
+ * Round-constant table for permutation384: one byte is read per round,
+ * starting at index 0 on every call, and XORed into state word w0.
+ * Every entry is below 0x80. The largest round count used in this file is 76,
+ * well inside the 127 entries.
+ */
+unsigned char constant7[127] = {
+		0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x41, 0x03, 0x06,
+		0x0c, 0x18, 0x30, 0x61, 0x42, 0x05, 0x0a, 0x14, 0x28, 0x51, 0x23, 0x47,
+		0x0f, 0x1e, 0x3c, 0x79, 0x72, 0x64, 0x48, 0x11, 0x22, 0x45, 0x0b, 0x16,
+		0x2c, 0x59, 0x33, 0x67, 0x4e, 0x1d, 0x3a, 0x75, 0x6a, 0x54, 0x29, 0x53,
+		0x27, 0x4f, 0x1f, 0x3e, 0x7d, 0x7a, 0x74, 0x68, 0x50, 0x21, 0x43, 0x07,
+		0x0e, 0x1c, 0x38, 0x71, 0x62, 0x44, 0x09, 0x12, 0x24, 0x49, 0x13, 0x26,
+		0x4d, 0x1b, 0x36, 0x6d, 0x5a, 0x35, 0x6b, 0x56, 0x2d, 0x5b, 0x37, 0x6f,
+		0x5e, 0x3d, 0x7b, 0x76, 0x6c, 0x58, 0x31, 0x63, 0x46, 0x0d, 0x1a, 0x34,
+		0x69, 0x52, 0x25, 0x4b, 0x17, 0x2e, 0x5d, 0x3b, 0x77, 0x6e, 0x5c, 0x39,
+		0x73, 0x66, 0x4c, 0x19, 0x32, 0x65, 0x4a, 0x15, 0x2a, 0x55, 0x2b, 0x57,
+		0x2f, 0x5f, 0x3f, 0x7f, 0x7e, 0x7c, 0x78, 0x70, 0x60, 0x40 };
+
+/* State
+ * w8  w4 w0
+ * w9  w5 w1
+ * w10 w6 w2
+ * w11 w7 w3
+ *
+ * Sbox
+	t1  = ~a;
+	t2  = b  & t1;
+	t3  = c  ^ t2; 
+	h   = d  ^ t3; 
+	t5  = b  | c; 
+	t6  = d  ^ t1; 
+	g   = t5 ^ t6; 
+	t8  = b  ^ d; 
+	t9  = t3 & t6; 
+	e   = t8 ^ t9; 
+	t11 = g  & t8; 
+	f   = t3 ^ t11;
+ *
+ * Sbox after change
+	a  = ~a; 
+	s0  = b  & a;
+	s0  = c  ^ s0;
+	c  = b  | c; 
+	a  = d  ^ a; 
+	c   = c ^ a; 
+	s1  = b  ^ d; 
+	d   = d  ^ s0;
+	a  = s0 & a; 
+	a   = s1 ^ a; 
+	b = c  & s1; 
+	b   = s0 ^ b;
+ */
+/*
+ * In-place 384-bit permutation on the 48-byte state `in`.
+ *
+ * in     - state, accessed as 12 x 32-bit words with word loads/stores; loaded
+ *          into w0..w11 in the order w0,w4,w8, w1,w5,w9, w2,w6,w10, w3,w7,w11
+ * rounds - number of rounds; must be at least 1, because the counter is
+ *          decremented before it is tested for zero
+ * rc     - round-constant bytes, one read per round; must hold at least
+ *          `rounds` bytes
+ *
+ * Each round: XOR the constant byte into w0, apply the S-box to each of the
+ * three columns, then rotate the 96-bit rows (w9:w5:w1), (w10:w6:w2) and
+ * (w11:w7:w3) left by 1, 8 and 55 bits respectively. `one`, `ff` and `value`
+ * (0x7fffff, built at the top of the asm) are the masks for the 1-, 8- and
+ * 23-bit carries between words.
+ *
+ * NOTE(review): the asm block names C variables directly and has no
+ * operand/clobber lists, and `enc_loop` is a plain (non-local) label. This is
+ * presumably written for a toolchain that resolves C identifiers inside asm -
+ * confirm before building with a different compiler.
+ */
+static void permutation384(unsigned char *in, int rounds, unsigned char *rc) {
+	uint32_t w0, w1, w2, w3, w4, w5, w6, w7, w8, w9, w10, w11;
+	uint32_t s0, s1, s2;
+	uint32_t one = 0x1;
+	uint32_t ff = 0xff;
+	uint32_t value;
+	__asm volatile(
+		/* Load the state. */
+		"ldr     w0,     [in]          \n\t"
+		"ldr     w4,     [in, #4]      \n\t"
+		"ldr     w8,     [in, #8]      \n\t"
+		"ldr     w1,     [in, #12]     \n\t"
+		"ldr     w5,     [in, #16]     \n\t"
+		"ldr     w9,     [in, #20]     \n\t"
+		"ldr     w2,     [in, #24]     \n\t"
+		"ldr     w6,     [in, #28]     \n\t"
+		"ldr     w10,    [in, #32]     \n\t"
+		"ldr     w3,     [in, #36]     \n\t"
+		"ldr     w7,     [in, #40]     \n\t"
+		"ldr     w11,    [in, #44]     \n\t"
+		/* value = (0x7ff << 12) ^ 0xfff = 0x7fffff, a 23-bit mask. */
+		"mov     s0,     0xfff         \n\t"
+		"mov     value,  0x7ff         \n\t"
+		"lsl     value,  value, #12    \n\t"
+		"eors    value,  value, s0     \n\t"
+	"enc_loop:                       \n\t"
+    "/*add round const*/           \n\t"
+		"ldrb    s0,     [rc]          \n\t"
+	  "eors    w0,     w0, s0        \n\t"
+    "/*sbox first column*/         \n\t"
+		"mvns    w0,     w0            \n\t"
+		"ands    s0,     w1, w0        \n\t"
+		"eors    s0,     w2, s0        \n\t"
+		"orrs    w2,     w1, w2        \n\t"
+		"eors    w0,     w3, w0        \n\t"
+		"eors    w2,     w2, w0        \n\t"
+		"eors    s1,     w1, w3        \n\t"
+		"eors    w3,     w3, s0        \n\t"
+		"ands    w0,     s0, w0        \n\t"
+		"eors    w0,     s1, w0        \n\t"
+		"ands    w1,     w2, s1        \n\t"
+		"eors    w1,     s0, w1        \n\t"
+		"/*sbox second column*/        \n\t"
+		"mvns    w4,     w4            \n\t"
+		"ands    s0,     w5, w4        \n\t"
+		"eors    s0,     w6, s0        \n\t"
+		"orrs    w6,     w5, w6        \n\t"
+		"eors    w4,     w7, w4        \n\t"
+		"eors    w6,     w6, w4        \n\t"
+		"eors    s1,     w5, w7        \n\t"
+		"eors    w7,     w7, s0        \n\t"
+		"ands    w4,     s0, w4        \n\t"
+		"eors    w4,     s1, w4        \n\t"
+		"ands    w5,     w6, s1        \n\t"
+		"eors    w5,     s0, w5        \n\t"
+		"/*sbox third column*/         \n\t"
+		"mvns    w8,     w8            \n\t"
+		"ands    s0,     w9,  w8       \n\t"
+		"eors    s0,     w10, s0       \n\t"
+		"orrs    w10,    w9,  w10      \n\t"
+		"eors    w8,     w11, w8       \n\t"
+		"eors    w10,    w10, w8       \n\t"
+		"eors    s1,     w9,  w11      \n\t"
+		"eors    w11,    w11, s0       \n\t"
+		"ands    w8,     s0,  w8       \n\t"
+		"eors    w8,     s1,  w8       \n\t"
+		"ands    w9,     w10, s1       \n\t"
+		"eors    w9,     s0,  w9       \n\t"
+    "/*rotate shift left 1 bit*/   \n\t"
+		/* Each word shifts left by 1 and takes the top bit of its lower neighbour. */
+		"ror     s0,     w1, #31       \n\t"
+		"ands    s0,     s0, one       \n\t"
+		"lsl     w1,     w1, #1        \n\t"
+		"ror     s1,     w9, #31       \n\t"
+		"ands    s1,     s1, one       \n\t"
+		"eors    w1,     w1, s1        \n\t"
+		"ror     s2,     w5, #31       \n\t"
+		"ands    s2,     s2, one       \n\t"
+		"lsl     w5,     w5, #1        \n\t"
+		"eors    w5,     w5, s0        \n\t"
+		"lsl     w9,     w9, #1        \n\t"
+		"eors    w9,     w9, s2        \n\t"
+    "/*rotate shift left 8 bits*/  \n\t"
+		/* Same scheme with an 8-bit carry between words. */
+		"ror     s0,     w2,  #24      \n\t"
+		"ands    s0,     s0,  ff       \n\t"
+		"lsl     w2,     w2,  #8       \n\t"
+		"ror     s1,     w10, #24      \n\t"
+		"ands    s1,     s1,  ff       \n\t"
+		"eors    w2,     w2,  s1       \n\t"
+		"ror     s2,     w6,  #24      \n\t"
+		"ands    s2,     s2,  ff       \n\t"
+		"lsl     w6,     w6,  #8       \n\t"
+		"eors    w6,     w6,  s0       \n\t"
+		"lsl     w10,    w10, #8       \n\t"
+		"eors    w10,    w10, s2       \n\t"
+    "/*rotate shift left 55 bits*/ \n\t"
+		/* 55 = 32 + 23: shift by 23 with a 23-bit carry, then move words one slot. */
+		"ror     s0,     w11, #9       \n\t"
+		"ands    s0,     s0,  value    \n\t"
+		"lsl     w11,    w11, #23      \n\t"
+		"ror     s1,     w7,  #9       \n\t"
+		"ands    s1,     s1,  value    \n\t"
+		"eors    w11,    w11, s1       \n\t"
+		"ror     s2,     w3,  #9       \n\t"
+		"ands    s2,     s2,  value    \n\t"
+		"lsl     w3,     w3,  #23      \n\t"
+		"eors    w3,     w3,  s0       \n\t"
+		"lsl     w7,     w7,  #23      \n\t"
+		"eors    w7,     w7,  s2       \n\t"
+		"mov     s0,     w3            \n\t"
+		"mov     w3,     w11           \n\t"
+		"mov     w11,    w7            \n\t"
+		"mov     w7,     s0            \n\t"
+		"/*loop control*/              \n\t"
+		/* Advance to the next constant byte and count the round down. */
+ 		"adds    rc,     rc,  #1       \n\t"
+		"subs    rounds, rounds, #1    \n\t"
+		"bne     enc_loop              \n\t"
+		/* Store the state back. */
+		"str     w0,     [in]          \n\t"
+		"str     w4,     [in, #4]      \n\t"
+		"str     w8,     [in, #8]      \n\t"
+		"str     w1,     [in, #12]     \n\t"
+		"str     w5,     [in, #16]     \n\t"
+		"str     w9,     [in, #20]     \n\t"
+		"str     w2,     [in, #24]     \n\t"
+		"str     w6,     [in, #28]     \n\t"
+		"str     w10,    [in, #32]     \n\t"
+		"str     w3,     [in, #36]     \n\t"
+		"str     w7,     [in, #40]     \n\t"
+		"str     w11,    [in, #44]     \n\t"
+	);
+}
+
+/*
+ * AEAD encryption with a 12-byte rate on a 48-byte state.
+ *
+ * c     - output; receives mlen ciphertext bytes followed by a
+ *         CRYPTO_ABYTES-byte tag (full blocks are written as 32-bit words)
+ * clen  - receives mlen + CRYPTO_ABYTES on success
+ * m     - plaintext, mlen bytes
+ * ad    - associated data, adlen bytes
+ * nsec  - unused
+ * npub  - nonce, CRYPTO_NPUBBYTES bytes
+ * k     - key, CRYPTO_KEYBYTES bytes
+ *
+ * Returns 0 on success, -1 if a length cannot be represented in size_t or a
+ * padded copy of the input cannot be allocated.
+ */
+int crypto_aead_encrypt(unsigned char *c, unsigned long long *clen,
+                        const unsigned char *m, unsigned long long mlen,
+                        const unsigned char *ad, unsigned long long adlen,
+                        const unsigned char *nsec, const unsigned char *npub,
+                        const unsigned char *k) {
+	size_t ad_blocks = 0;	/* 12-byte blocks of padded associated data */
+	size_t msg_blocks = 0;	/* 12-byte blocks of padded plaintext */
+	size_t tail_len;
+	size_t tail_start;
+	size_t i;
+	unsigned char *A = NULL;
+	unsigned char *M = NULL;
+	unsigned char S[48];
+	unsigned int *A32 = NULL;
+	unsigned int *M32 = NULL;
+	unsigned int *S32 = (unsigned int *)S;
+	unsigned int *C32 = NULL;
+
+	(void)nsec;
+
+	/* Reject lengths whose padded size would not fit in size_t; otherwise the
+	 * block counts would be truncated and the copies below would overflow. */
+	if (adlen > SIZE_MAX - 12 || mlen > SIZE_MAX - 12) {
+		return -1;
+	}
+
+	// pad associated data: zero-filled copy with 0x01 after the last byte
+	if (adlen != 0) {
+		ad_blocks = (size_t)(adlen / 12) + 1;
+		A = calloc(ad_blocks, 12);
+		if (A == NULL) {
+			return -1;
+		}
+		memcpy(A, ad, (size_t)adlen);
+		A[adlen] = 0x01;
+		A32 = (unsigned int *)A;
+	}
+
+	// pad plaintext data the same way
+	if (mlen != 0) {
+		msg_blocks = (size_t)(mlen / 12) + 1;
+		M = calloc(msg_blocks, 12);
+		if (M == NULL) {
+			free(A);
+			return -1;
+		}
+		memcpy(M, m, (size_t)mlen);
+		M[mlen] = 0x01;
+		M32 = (unsigned int *)M;
+	}
+
+	// initialization: state = nonce || key
+	memcpy(S, npub, CRYPTO_NPUBBYTES);
+	memcpy(S + CRYPTO_NPUBBYTES, k, CRYPTO_KEYBYTES);
+	permutation384(S, 76, constant7);
+
+	// process associated data
+	for (i = 0; i < ad_blocks; i++) {
+		S32[0] ^= A32[0];
+		S32[1] ^= A32[1];
+		S32[2] ^= A32[2];
+		A32 = A32 + 3;
+		permutation384(S, 40, constant7);
+	}
+	// flip one state bit between the associated-data and message phases
+	S[47] ^= 0x80;
+
+	// encrypt plaintext data
+	if (mlen != 0) {
+		C32 = (unsigned int *)c;
+		// every block except the last (padded) one
+		for (i = 0; i + 1 < msg_blocks; i++) {
+			S32[0] ^= M32[0];
+			S32[1] ^= M32[1];
+			S32[2] ^= M32[2];
+			M32 = M32 + 3;
+			C32[0] = S32[0];
+			C32[1] = S32[1];
+			C32[2] = S32[2];
+			C32 = C32 + 3;
+			permutation384(S, 40, constant7);
+		}
+		// last block: only the real message bytes become ciphertext
+		tail_len = (size_t)(mlen % 12);
+		tail_start = (msg_blocks - 1) * 12;
+		for (i = 0; i < tail_len; i++) {
+			S[i] ^= M[tail_start + i];
+			c[tail_start + i] = S[i];
+		}
+		S[i] ^= 0x01;
+	}
+
+	// finalization
+	permutation384(S, 44, constant7);
+
+	// return tag
+	memcpy(c + mlen, S, CRYPTO_ABYTES);
+	*clen = mlen + CRYPTO_ABYTES;
+	free(A);
+	free(M);
+	return 0;
+}
+
+/*
+ * AEAD decryption and tag verification with a 12-byte rate on a 48-byte state.
+ *
+ * m     - output; receives clen - CRYPTO_ABYTES plaintext bytes (full blocks
+ *         are written as 32-bit words)
+ * mlen  - set to 0 on entry and to the plaintext length on success
+ * nsec  - unused
+ * c     - ciphertext followed by a CRYPTO_ABYTES-byte tag, clen bytes
+ * ad    - associated data, adlen bytes
+ * npub  - nonce, CRYPTO_NPUBBYTES bytes
+ * k     - key, CRYPTO_KEYBYTES bytes
+ *
+ * Returns 0 when the tag verifies. Returns -1 when clen is shorter than a
+ * tag, a length cannot be represented in size_t, the padded associated data
+ * cannot be allocated, or the tag does not match; on a tag mismatch the
+ * plaintext written to m is cleared.
+ */
+int crypto_aead_decrypt(unsigned char *m, unsigned long long *mlen,
+                        unsigned char *nsec,
+                        const unsigned char *c, unsigned long long clen,
+                        const unsigned char *ad, unsigned long long adlen,
+                        const unsigned char *npub, const unsigned char *k)
+{
+	size_t ad_blocks = 0;	/* 12-byte blocks of padded associated data */
+	size_t msg_blocks;
+	size_t msg_len;
+	size_t tail_len;
+	size_t tail_start;
+	size_t i;
+	unsigned char diff = 0;
+	unsigned char *A = NULL;
+	unsigned char S[48];
+	unsigned int *A32 = NULL;
+	unsigned int *M32 = (unsigned int *)m;
+	unsigned int *S32 = (unsigned int *)S;
+	const unsigned int *C32 = (const unsigned int *)c;
+
+	(void)nsec;
+
+	*mlen = 0;
+	if (clen < CRYPTO_ABYTES) {
+		return -1;
+	}
+	/* Reject lengths whose padded size would not fit in size_t; otherwise the
+	 * block counts would be truncated and the copies below would overflow. */
+	if (adlen > SIZE_MAX - 12 || clen - CRYPTO_ABYTES > SIZE_MAX - 12) {
+		return -1;
+	}
+	msg_len = (size_t)(clen - CRYPTO_ABYTES);
+
+	// pad associated data: zero-filled copy with 0x01 after the last byte
+	if (adlen != 0) {
+		ad_blocks = (size_t)(adlen / 12) + 1;
+		A = calloc(ad_blocks, 12);
+		if (A == NULL) {
+			return -1;
+		}
+		memcpy(A, ad, (size_t)adlen);
+		A[adlen] = 0x01;
+		A32 = (unsigned int *)A;
+	}
+
+	// initialization: state = nonce || key
+	memcpy(S, npub, CRYPTO_NPUBBYTES);
+	memcpy(S + CRYPTO_NPUBBYTES, k, CRYPTO_KEYBYTES);
+	permutation384(S, 76, constant7);
+
+	// process associated data
+	for (i = 0; i < ad_blocks; i++) {
+		S32[0] ^= A32[0];
+		S32[1] ^= A32[1];
+		S32[2] ^= A32[2];
+		A32 = A32 + 3;
+		permutation384(S, 40, constant7);
+	}
+	// the padded copy is no longer needed; releasing it here covers every exit
+	free(A);
+	// flip one state bit between the associated-data and message phases
+	S[47] ^= 0x80;
+
+	// decrypt ciphertext data
+	if (msg_len != 0) {
+		msg_blocks = msg_len / 12 + 1;
+		// full blocks: plaintext = state ^ ciphertext, then state = ciphertext
+		for (i = 0; i + 1 < msg_blocks; i++) {
+			M32[0] = S32[0] ^ C32[0];
+			M32[1] = S32[1] ^ C32[1];
+			M32[2] = S32[2] ^ C32[2];
+			S32[0] = C32[0];
+			S32[1] = C32[1];
+			S32[2] = C32[2];
+			M32 = M32 + 3;
+			C32 = C32 + 3;
+			permutation384(S, 40, constant7);
+		}
+		// trailing partial block, byte by byte
+		tail_len = msg_len % 12;
+		tail_start = (msg_blocks - 1) * 12;
+		for (i = 0; i < tail_len; i++) {
+			m[tail_start + i] = S[i] ^ c[tail_start + i];
+			S[i] = c[tail_start + i];
+		}
+		S[i] ^= 0x01;
+	}
+
+	// finalization
+	permutation384(S, 44, constant7);
+
+	// verify the tag; accumulate all differences so the loop does not exit early
+	for (i = 0; i < CRYPTO_ABYTES; i++) {
+		diff |= (unsigned char)(c[msg_len + i] ^ S[i]);
+	}
+	if (diff != 0) {
+		memset(m, 0, msg_len);
+		return -1;
+	}
+	*mlen = clen - CRYPTO_ABYTES;
+	return 0;
+}
+#else
+/*
+ * Placeholder compiled when KNOT_CIPHER is not defined as 1: performs no
+ * work, touches none of its arguments and always reports success (0).
+ */
+int crypto_aead_encrypt(
+	unsigned char *c, unsigned long long *clen,
+	const unsigned char *m, unsigned long long mlen,
+	const unsigned char *ad, unsigned long long adlen,
+	const unsigned char *nsec,
+	const unsigned char *npub,
+	const unsigned char *k)
+{
+	const int status = 0;
+
+	/* Every parameter is intentionally ignored. */
+	(void)c;
+	(void)clen;
+	(void)m;
+	(void)mlen;
+	(void)ad;
+	(void)adlen;
+	(void)nsec;
+	(void)npub;
+	(void)k;
+
+	return status;
+}
+/*
+ * Placeholder compiled when KNOT_CIPHER is not defined as 1: performs no
+ * work, leaves *mlen and every buffer untouched and always reports
+ * success (0).
+ */
+int crypto_aead_decrypt(
+	unsigned char *m, unsigned long long *mlen,
+	unsigned char *nsec,
+	const unsigned char *c, unsigned long long clen,
+	const unsigned char *ad, unsigned long long adlen,
+	const unsigned char *npub,
+	const unsigned char *k)
+{
+	const int status = 0;
+
+	/* Every parameter is intentionally ignored. */
+	(void)m;
+	(void)mlen;
+	(void)nsec;
+	(void)c;
+	(void)clen;
+	(void)ad;
+	(void)adlen;
+	(void)npub;
+	(void)k;
+
+	return status;
+}
+#endif
--- a/knot/Implementations/crypto_aead/knot192/armcortexm_2/api.h
+++ b/knot/Implementations/crypto_aead/knot192/armcortexm_2/api.h
+#ifndef KNOT_API_H
+#define KNOT_API_H
+/* Size parameters of this AEAD implementation.
+ * NOTE(review): presumably byte counts for key, secret message number,
+ * nonce and tag, as the names suggest - the code that consumes them is not
+ * part of this header. */
+#define CRYPTO_KEYBYTES 24
+#define CRYPTO_NSECBYTES 0
+#define CRYPTO_NPUBBYTES 24
+#define CRYPTO_ABYTES 24
+/* NOTE(review): presumably tells the test harness that input and output
+ * buffers must not overlap - confirm against the harness documentation. */
+#define CRYPTO_NOOVERLAP 1
+#endif /* KNOT_API_H */
+
--- a/knot/Implementations/crypto_aead/knot192/armcortexm_2/auxFormat.h
+++ b/knot/Implementations/crypto_aead/knot192/armcortexm_2/auxFormat.h
+//#include<malloc.h>
+#include"crypto_aead.h"
+#include"api.h"
+#include <stdio.h>
+#include <stdint.h>
+#include <stdlib.h>
+#include <string.h>
+// Identity wrappers: each expands to its argument unchanged.
+// NOTE(review): presumably hooks for byte-order conversion on other targets - confirm.
+#define U32BIG(x) (x)
+#define U16BIG(x) (x)
+
+// Short integer aliases used by the macros of this implementation.
+typedef unsigned char u8;
+typedef unsigned short u16;
+typedef unsigned int u32;
+typedef unsigned long long u64;
+
+// Element count of a true array (meaningless when applied to a pointer).
+#define ARR_SIZE(a) (sizeof((a))/sizeof((a[0])))
+// Rotate the 32-bit value x left by n bits. n must be in 1..31: with n == 0 the
+// right-hand shift would be by 32, which is undefined for a 32-bit operand.
+#define LOTR32(x,n) (((x)<<(n))|((x)>>(32-(n))))
+
+
+// Element count of a true array. Identical to the definition earlier in this
+// header; repeating an identical macro definition is permitted.
+#define ARR_SIZE(a) (sizeof((a))/sizeof((a[0])))
+// sbox(a, b, c, d, e, f, g, h): bitwise S-box evaluated on whole words.
+// Inputs a, b, c, d; outputs e, f, g, h. Uses the caller-provided scratch
+// locals t1, t2, t3, t5, t6, t8, t9, t11. Outputs are assigned while inputs
+// are still being read, so an output must not alias an input.
+#define sbox(a, b, c, d, e, f, g, h)                                                                            \
+{                                                                                                                             \
+	t1 = ~a; t2 = b & t1;t3 = c ^ t2; h = d ^ t3; t5 = b | c; t6 = d ^ t1; g = t5 ^ t6; t8 = b ^ d; t9 = t3 & t6; e = t8 ^ t9; t11 = g & t8; f = t3 ^ t11; \
+}
+
+//////////////////puck begin
+//&:5   <<:4    |:4
+/*
+ * In place on the 32-bit lvalue x: keep only the bits selected by 0x92492492
+ * (every third bit, starting at bit 1) and gather them with four
+ * shift-left/mask steps. The result lies under mask 0xfff00000; the low 20
+ * bits end up zero.
+ */
+#define puckU32ToThree(x){\
+x &= 0x92492492;\
+x = (x | (x << 2)) & 0xc30c30c3;\
+x = (x | (x << 4)) & 0xf00f00f0;\
+x = (x | (x << 8)) & 0xff0000ff;\
+x = (x | (x << 16)) & 0xfff00000;\
+}
+/*
+ * In place on the 32-bit lvalue x: mirrors puckU32ToThree's steps in reverse
+ * order with right shifts, starting from the bits under 0xfff00000 and ending
+ * under mask 0x92492492 (presumably the exact inverse -- confirm with a
+ * round-trip test).
+ */
+#define unpuckU32ToThree(x){\
+x &= 0xfff00000;\
+x = (x | (x >> 16)) & 0xff0000ff;\
+x = (x | (x >> 8)) & 0xf00f00f0;\
+x = (x | (x >> 4)) & 0xc30c30c3;\
+x = (x | (x >> 2)) & 0x92492492;\
+}
+//u32 t1 u32 t2 u8  t2_64 , t2_65 ;u32 temp1[3] = { 0 };u32 temp2[3] = { 0 };
+/*
+ * Converts the 6 bytes at `in` into three words out[0..2].
+ * Reads a u16 at in + 4 and a u32 at in through pointer casts, plus in[3] for
+ * two bits that are shifted out when t2 is shifted left by 2.
+ * Presumably out[j] collects every third bit of the input (the packed layout
+ * used by the round macros) -- confirm against the specification.
+ * The calling scope must declare t1, t2, t2_64, t2_65, temp1[3] and temp2[3].
+ * NOTE(review): the casts assume `in` may be read as u16/u32 at that address
+ * (alignment and aliasing) -- confirm for the target.
+ */
+#define packU48FormatToThreePacket(  out,  in) {\
+t1 = (u32)U16BIG(*(u16*)(in + 4));	\
+t2 = U32BIG(*(u32*)(in));	\
+t2_64 = (in[3] & 0x80) >> 7, t2_65 = (in[3] & 0x40) >> 6;	\
+t1 = t1 << 1;	\
+t2 = t2 << 2;	\
+temp1[0] = t1; temp1[1] = t1 << 1; temp1[2] = t1 << 2;	\
+puckU32ToThree(temp1[0]);	\
+puckU32ToThree(temp1[1]);	\
+puckU32ToThree(temp1[2]);	\
+temp2[0] = t2; temp2[1] = t2 << 1; temp2[2] = t2 << 2;	\
+puckU32ToThree(temp2[0]);	\
+puckU32ToThree(temp2[1]);	\
+puckU32ToThree(temp2[2]);	\
+out[0] = (temp1[0] >> 11) | (temp2[0] >> 22);	\
+out[1] = (temp1[1] >> 11) | (((u32)t2_64) << 10) | (temp2[1] >> 22);	\
+out[2] = (temp1[2] >> 11) | (((u32)t2_65) << 10) | (temp2[2] >> 22);	\
+}
+
+
+/*
+void packU96FormatToThreePacket(u32 * out, u8 * in) {
+u32 t0 = U32BIG(((u32*)in)[2]);
+u32 t1 = U32BIG(((u32*)in)[1]);
+u32 t2 = U32BIG(((u32*)in)[0]);
+u8 t1_32 = (in[7] & 0x80) >> 7, t2_64 = (in[3] & 0x80) >> 7, t2_65 = (in[3] & 0x40) >> 6;
+t1 = t1 << 1;
+t2 = t2 << 2;
+u32 temp0[3] = { 0 };
+temp0[0] = t0; temp0[1] = t0 << 1; temp0[2] = t0 << 2;
+puckU32ToThree(temp0[0]);
+puckU32ToThree(temp0[1]);
+puckU32ToThree(temp0[2]);
+u32 temp1[3] = { 0 };
+temp1[0] = t1; temp1[1] = t1 << 1; temp1[2] = t1 << 2;
+puckU32ToThree(temp1[0]);
+puckU32ToThree(temp1[1]);
+puckU32ToThree(temp1[2]);
+u32 temp2[3] = { 0 };
+temp2[0] = t2; temp2[1] = t2 << 1; temp2[2] = t2 << 2;
+puckU32ToThree(temp2[0]);
+puckU32ToThree(temp2[1]);
+puckU32ToThree(temp2[2]);
+out[0] = (temp0[0]) | (temp1[0] >> 11) | (temp2[0] >> 22);
+out[1] = (temp0[1]) | (temp1[1] >> 11) | (((u32)t2_64) << 10) | (temp2[1] >> 22);
+out[2] = (temp0[2]) | (((u32)t1_32) << 21) | (temp1[2] >> 11) | (((u32)t2_65) << 10) | (temp2[2] >> 22);
+}
+*/
+//t9  t1  t2  t1_32  t2_64  t2_65 temp0[3] temp1[3] temp2[3]
+/*
+ * Converts the 12 bytes at `in` into three words out[0..2].
+ * The input is read as three u32 values through a pointer cast (index 2 into
+ * t9, index 1 into t1, index 0 into t2); in[7] and in[3] supply the bits that
+ * are lost when t1 and t2 are pre-shifted left by 1 and 2.
+ * Presumably out[j] collects every third bit of the 96-bit input -- confirm
+ * against the specification.
+ * The calling scope must declare t9, t1, t2, t1_32, t2_64, t2_65, temp0[3],
+ * temp1[3] and temp2[3]. `in` is used both as in[...] and inside a cast, so
+ * pass an identifier or a parenthesised expression.
+ * NOTE(review): the u32 cast assumes `in` may be read as words at that
+ * address (alignment and aliasing) -- confirm for the target.
+ */
+#define packU96FormatToThreePacket(out, in) {\
+t9 = U32BIG(((u32*)in)[2]);	\
+t1 = U32BIG(((u32*)in)[1]);	\
+t2 = U32BIG(((u32*)in)[0]);	\
+t1_32 = (in[7] & 0x80) >> 7, t2_64 = (in[3] & 0x80) >> 7, t2_65 = (in[3] & 0x40) >> 6;	\
+t1 = t1 << 1;	\
+t2 = t2 << 2;	\
+temp0[0] = t9; temp0[1] = t9 << 1; temp0[2] = t9 << 2;	\
+puckU32ToThree(temp0[0]);	\
+puckU32ToThree(temp0[1]);	\
+puckU32ToThree(temp0[2]);	\
+temp1[0] = t1; temp1[1] = t1 << 1; temp1[2] = t1 << 2;	\
+puckU32ToThree(temp1[0]);	\
+puckU32ToThree(temp1[1]);	\
+puckU32ToThree(temp1[2]);	\
+temp2[0] = t2; temp2[1] = t2 << 1; temp2[2] = t2 << 2;	\
+puckU32ToThree(temp2[0]);	\
+puckU32ToThree(temp2[1]);	\
+puckU32ToThree(temp2[2]);	\
+out[0] = (temp0[0]) | (temp1[0] >> 11) | (temp2[0] >> 22);	\
+out[1] = (temp0[1]) | (temp1[1] >> 11) | (((u32)t2_64) << 10) | (temp2[1] >> 22);	\
+out[2] = (temp0[2]) | (((u32)t1_32) << 21) | (temp1[2] >> 11) | (((u32)t2_65) << 10) | (temp2[2] >> 22);	\
+}
+/*
+void unpackU96FormatToThreePacket(u8 * out, u32 * in) {
+u32 temp0[3] = { 0 };
+u32 temp1[3] = { 0 };
+u32 temp2[3] = { 0 };
+u32 t1_32, t2_64, t2_65;
+u32 t0, t1, t2;
+temp0[0] = in[0] & 0xffe00000;
+temp1[0] = (in[0] & 0x001ffc00) << 11;
+temp2[0] = (in[0] & 0x000003ff) << 22;
+temp0[1] = in[1] & 0xffe00000;
+temp1[1] = (in[1] & 0x001ff800) << 11;
+t2_64 = ((in[1] & 0x00000400) << 21);
+temp2[1] = (in[1] & 0x000003ff) << 22;
+temp0[2] = in[2] & 0xffc00000;
+t1_32 = ((in[2] & 0x00200000) << 10);
+temp1[2] = (in[2] & 0x001ff800) << 11;
+t2_65 = ((in[2] & 0x00000400) << 20);
+temp2[2] = (in[2] & 0x000003ff) << 22;
+unpuckU32ToThree(temp0[0]);
+unpuckU32ToThree(temp0[1]);
+unpuckU32ToThree(temp0[2]);
+t0 = temp0[0] | temp0[1] >> 1 | temp0[2] >> 2;
+unpuckU32ToThree(temp1[0]);
+unpuckU32ToThree(temp1[1]);
+unpuckU32ToThree(temp1[2]);
+t1 = t1_32 | ((temp1[0] | temp1[1] >> 1 | temp1[2] >> 2) >> 1);
+unpuckU32ToThree(temp2[0]);
+unpuckU32ToThree(temp2[1]);
+unpuckU32ToThree(temp2[2]);
+t2 = t2_65 | t2_64 | ((temp2[0] | temp2[1] >> 1 | temp2[2] >> 2) >> 2);
+*(u32*)(out) = U32BIG(t2);
+*(u32*)(out + 4) = U32BIG(t1);
+*(u32*)(out + 8) = U32BIG(t0);
+}
+*/
+//u32 temp0[3] = { 0 };u32 temp1[3] = { 0 };u32 temp2[3] = { 0 };u32 t1_32, t2_64, t2_65;t9,t1,t2,
+/*
+ * Converts three packed words in[0..2] into 12 bytes at `out`; the
+ * counterpart of packU96FormatToThreePacket.
+ * Each input word is split into bit fields by mask, every field is spread out
+ * with unpuckU32ToThree, and the pieces are recombined into t9, t1 and t2,
+ * which are stored as u32 at out + 8, out + 4 and out.
+ * `in` is only read, so `out` may be unrelated storage or a scratch buffer.
+ * The calling scope must declare t9, t1, t2, t1_32, t2_64, t2_65, temp0[3],
+ * temp1[3] and temp2[3].
+ * NOTE(review): the stores go through a u32 cast of `out`, which assumes word
+ * stores are acceptable at that address -- confirm for the target.
+ */
+#define unpackU96FormatToThreePacket( out, in) {\
+temp0[0] = in[0] & 0xffe00000;	\
+temp1[0] = (in[0] & 0x001ffc00) << 11;	\
+temp2[0] = (in[0] & 0x000003ff) << 22;	\
+temp0[1] = in[1] & 0xffe00000;	\
+temp1[1] = (in[1] & 0x001ff800) << 11;	\
+t2_64 = ((in[1] & 0x00000400) << 21);	\
+temp2[1] = (in[1] & 0x000003ff) << 22;	\
+temp0[2] = in[2] & 0xffc00000;	\
+t1_32 = ((in[2] & 0x00200000) << 10);	\
+temp1[2] = (in[2] & 0x001ff800) << 11;	\
+t2_65 = ((in[2] & 0x00000400) << 20);	\
+temp2[2] = (in[2] & 0x000003ff) << 22;	\
+unpuckU32ToThree(temp0[0]);	\
+unpuckU32ToThree(temp0[1]);	\
+unpuckU32ToThree(temp0[2]);	\
+t9 = temp0[0] | temp0[1] >> 1 | temp0[2] >> 2;	\
+unpuckU32ToThree(temp1[0]);	\
+unpuckU32ToThree(temp1[1]);	\
+unpuckU32ToThree(temp1[2]);	\
+t1 = t1_32 | ((temp1[0] | temp1[1] >> 1 | temp1[2] >> 2) >> 1);	\
+unpuckU32ToThree(temp2[0]);	\
+unpuckU32ToThree(temp2[1]);	\
+unpuckU32ToThree(temp2[2]);	\
+t2 = t2_65 | t2_64 | ((temp2[0] | temp2[1] >> 1 | temp2[2] >> 2) >> 2);	\
+*(u32*)(out) = U32BIG(t2);	\
+*(u32*)(out + 4) = U32BIG(t1);	\
+*(u32*)(out + 8) = U32BIG(t9);	\
+}
+
+/*
+ * Row rotations on the three-word packed form: inputs (t0, t1, t2), outputs
+ * (t3, t4, t5). Each macro permutes the three words and rotates them with
+ * LOTR32; the name gives the intended 96-bit left-rotation amount
+ * (presumably rotating by 3*q + r bits becomes word rotations by q or q + 1
+ * plus a word permutation, as the "55=3*18+1" note suggests).
+ * Outputs must be distinct from the inputs: they are assigned one at a time
+ * while later inputs are still to be read.
+ */
+/* Intended rotation by 1: (t3, t4, t5) = (t1, t2, t0 rotated left 1). */
+#define U96_BIT_LOTR32_1(t0,t1,t2,t3,t4,t5){\
+t3= t1;\
+t4 = t2;\
+t5 = LOTR32(t0, 1); \
+}
+/* Intended rotation by 8: (t3, t4, t5) = (t2 rotl 2, t0 rotl 3, t1 rotl 3). */
+#define U96_BIT_LOTR32_8(t0,t1,t2,t3,t4,t5){\
+t3= LOTR32(t2, 2);\
+t4 =LOTR32(t0, 3);\
+t5 = LOTR32(t1, 3); \
+}
+//55=3*18+1
+/* Intended rotation by 55: (t3, t4, t5) = (t1 rotl 18, t2 rotl 18, t0 rotl 19). */
+#define U96_BIT_LOTR32_55(t0,t1,t2,t3,t4,t5){\
+t3= LOTR32(t1, 18); \
+t4 = LOTR32(t2, 18);\
+t5 = LOTR32(t0, 19); \
+}
+/*
+s0  s1  s2
+s3  s4  s5
+s6  s7  s8
+s9 s10 s11
+*/
+/*
+ * One round on the packed state, C version. lunNum indexes constant7Format.
+ * Steps: XOR three bit fields of the round constant (bits 7..6, 5..3, 2..0)
+ * into s[0], s[1], s[2]; run sbox on the three columns with results going to
+ * s_temp; copy row 0 back unchanged; write rows 1..3 back through the 1-, 8-
+ * and 55-bit rotation macros.
+ * The calling scope must provide u32 s[12], u32 s_temp[12], constant7Format
+ * and the temporaries sbox assigns (t1, t2, t3, t5, t6, t8, t9, t11).
+ */
+#define ROUND384(lunNum) {\
+s[0] ^= (constant7Format[lunNum] >> 6) & 0x3;\
+s[1] ^= (constant7Format[lunNum] >> 3) & 0x7;\
+s[2] ^= constant7Format[lunNum] & 0x7;\
+sbox(s[0], s[3], s[6], s[9] , s_temp[0], s_temp[3], s_temp[6], s_temp[9]);\
+sbox(s[1], s[4], s[7], s[10], s_temp[1], s_temp[4], s_temp[7], s_temp[10]);\
+sbox(s[2], s[5], s[8], s[11], s_temp[2], s_temp[5], s_temp[8], s_temp[11]);\
+s[0] = s_temp[0], s[1] = s_temp[1], s[2] = s_temp[2];\
+U96_BIT_LOTR32_1(s_temp[3], s_temp [4], s_temp[ 5], s[3],  s[4], s[5]);\
+U96_BIT_LOTR32_8(s_temp[6], s_temp [7], s_temp[ 8], s[6],  s[7], s[8]);\
+U96_BIT_LOTR32_55(s_temp[9], s_temp[10], s_temp[11], s[9], s[10], s[11]);\
+}
+/*
+ * Tracing variant of ROUND384: performs the same state updates and also
+ * prints the round constant (printf) and the state after each of the three
+ * steps (printfU96Format).
+ * Requires the same names in scope as ROUND384, plus a definition of
+ * printfU96Format at link time.
+ * NOTE(review): constant7Format[lunNum] is printed with %08x; the argument is
+ * an unsigned char promoted to int -- confirm the format is as intended.
+ */
+#define ROUND384Full(lunNum) {\
+printf("  constant7Format[%d]=%08x\n", lunNum, constant7Format[lunNum]);\
+s[0] ^= (constant7Format[lunNum] >> 6) & 0x3;\
+s[1] ^= (constant7Format[lunNum] >> 3) & 0x7;\
+s[2] ^= constant7Format[lunNum] & 0x7;\
+	printfU96Format("addition of round constant output",s);\
+sbox(s[0], s[3], s[6], s[9] , s_temp[0], s_temp[3], s_temp[6], s_temp[9]);\
+sbox(s[1], s[4], s[7], s[10], s_temp[1], s_temp[4], s_temp[7], s_temp[10]);\
+sbox(s[2], s[5], s[8], s[11], s_temp[2], s_temp[5], s_temp[8], s_temp[11]);\
+	printfU96Format("substitution layer output",s_temp);\
+s[0] = s_temp[0], s[1] = s_temp[1], s[2] = s_temp[2];\
+U96_BIT_LOTR32_1(s_temp[3], s_temp [4], s_temp[ 5], s[3],  s[4], s[5]);\
+U96_BIT_LOTR32_8(s_temp[6], s_temp [7], s_temp[ 8], s[6],  s[7], s[8]);\
+U96_BIT_LOTR32_55(s_temp[9], s_temp[10], s_temp[11], s[9], s[10], s[11]);\
+printfU96Format("linear diffusion layer output", s);\
+}
+
+/*
+ * Debug/printing helpers. Only the declarations appear here; no definition is
+ * visible in this part of the source, so any translation unit that calls them
+ * (for example through ROUND384Full) needs one supplied at link time.
+ */
+void printBinarySimp(unsigned char * str, u8 *a, int len);
+//void packU96FormatToThreePacketFull(unsigned int * out, u8 * in);
+//void unpackU96FormatToThreePacketFull(u8 * out, unsigned int * in);
+//void packU96FormatToThreePacket(u32 * out, u8 * in);
+//void unpackU96FormatToThreePacket(u8 * out, u32 * in);
+void printU32State(char name[], unsigned int* var, long len);
+void printfU96Format(char name[], unsigned int * s);
+//////////////////puck end
+void printU8(char name[], u8 var[], int len, int offset);
+/* Second declaration of printfU96Format; u32 is unsigned int, so it matches
+ * the one above. */
+void printfU96Format(char name[], u32 * s);
+
+////////////constant begin//
+//unsigned char  constant7Format[127];
+void puckU8FormatToThreePacket(u8 in, u8 *out);
+//void test_puckU8FormatToThreePacket();
+////////////constant  end//
+
+
+/*
+ * Forward declaration of the assembly permutation defined in encrypt.c.
+ * NOTE(review): because it is `static` in a header, every file that includes
+ * this header declares its own internal-linkage function and must define it
+ * (or never use it) -- confirm encrypt.c is the only includer.
+ */
+static void permutation384(unsigned int *in, int rounds, unsigned char *rc);
--- a/knot/Implementations/crypto_aead/knot192/armcortexm_2/crypto_aead.h
+++ b/knot/Implementations/crypto_aead/knot192/armcortexm_2/crypto_aead.h
+
+/*
+ * Encrypt mlen bytes at m with associated data ad (adlen bytes), public nonce
+ * npub and key k. Writes the ciphertext followed by the tag to c and stores
+ * mlen + CRYPTO_ABYTES in *clen. nsec is not used by the implementation in
+ * encrypt.c. That implementation always returns 0.
+ */
+int crypto_aead_encrypt(
+	unsigned char *c, unsigned long long *clen,
+	const unsigned char *m, unsigned long long mlen,
+	const unsigned char *ad, unsigned long long adlen,
+	const unsigned char *nsec,
+	const unsigned char *npub,
+	const unsigned char *k
+);
+
+/*
+ * Decrypt clen bytes at c (ciphertext followed by a CRYPTO_ABYTES tag) with
+ * associated data ad, public nonce npub and key k. Writes the plaintext to m
+ * and its length to *mlen. nsec is not used by the implementation in
+ * encrypt.c. Returns 0 when the tag matches and -1 when clen is shorter than
+ * the tag or the tag does not match.
+ */
+int crypto_aead_decrypt(
+	unsigned char *m, unsigned long long *mlen,
+	unsigned char *nsec,
+	const unsigned char *c, unsigned long long clen,
+	const unsigned char *ad, unsigned long long adlen,
+	const unsigned char *npub,
+	const unsigned char *k
+);
--- a/knot/Implementations/crypto_aead/knot192/armcortexm_2/encrypt.c
+++ b/knot/Implementations/crypto_aead/knot192/armcortexm_2/encrypt.c
+
+#include"auxFormat.h"
+
+/* Bytes absorbed or produced per permutation call: 96 bits = 12 bytes. */
+#define aead_RATE (96 / 8)
+/* Round counts passed to permutation384 below: PR0_ROUNDS after loading
+ * nonce and key, PR_ROUNDS after each data block, PRF_ROUNDS before the tag
+ * is extracted. */
+#define PR0_ROUNDS 76
+#define PR_ROUNDS 40
+#define PRF_ROUNDS 44
+/*
+ * Round-constant table, one byte per round. permutation384 reads it
+ * sequentially from index 0 and splits each byte into three bit fields
+ * (bits 7..6, 5..3, 2..0) that are XORed into the first three state words.
+ * The largest round count used in this file is PR0_ROUNDS (76), which stays
+ * inside the 127 entries.
+ * NOTE(review): the table is never written in this file but is not declared
+ * const; permutation384 takes a non-const pointer.
+ */
+unsigned char  constant7Format[127] = {
+	/*constant7Format[127]:*/
+0x01,0x08,0x40,0x02,0x10,0x80,0x05,0x09,0x48,0x42,0x12,0x90,
+0x85,0x0c,0x41,0x0a,0x50,0x82,0x15,0x89,0x4d,0x4b,0x5a,0xd2,
+0x97,0x9c,0xc4,0x06,0x11,0x88,0x45,0x0b,0x58,0xc2,0x17,0x99,
+0xcd,0x4e,0x53,0x9a,0xd5,0x8e,0x54,0x83,0x1d,0xc9,0x4f,0x5b,
+0xda,0xd7,0x9e,0xd4,0x86,0x14,0x81,0x0d,0x49,0x4a,0x52,0x92,
+0x95,0x8c,0x44,0x03,0x18,0xc0,0x07,0x19,0xc8,0x47,0x1b,0xd8,
+0xc7,0x1e,0xd1,0x8f,0x5c,0xc3,0x1f,0xd9,0xcf,0x5e,0xd3,0x9f,
+0xdc,0xc6,0x16,0x91,0x8d,0x4c,0x43,0x1a,0xd0,0x87,0x1c,0xc1,
+0x0f,0x59,0xca,0x57,0x9b,0xdd,0xce,0x56,0x93,0x9d,0xcc,0x46,
+0x13,0x98,0xc5,0x0e,0x51,0x8a,0x55,0x8b,0x5d,0xcb,0x5f,0xdb,
+0xdf,0xde,0xd6,0x96,0x94,0x84,0x04, };
+
+/* State
+ * w8  w4 w0
+ * w9  w5 w1
+ * w10 w6 w2
+ * w11 w7 w3
+ */
+ /*
+  * Apply `rounds` rounds of the 384-bit permutation to the state at `in`, in
+  * place, using inline assembly.
+  *
+  *   in     - 12 consecutive 32-bit words; all are loaded before the loop and
+  *            stored back after it.
+  *   rounds - round count. The loop decrements it and repeats until it reaches
+  *            zero, so it must be at least 1 (0 would wrap the counter).
+  *   rc     - round-constant bytes; one byte is consumed per round, starting
+  *            at rc[0].
+  *
+  * Per round: the constant byte is split into bits 7..6, 5..3 and 2..0, which
+  * are XORed into w0, w4 and w8; the S-box is applied to each of the three
+  * columns; then the rows (w9 w5 w1), (w10 w6 w2) and (w11 w7 w3) are rotated
+  * by word moves and ROR, matching the 1-, 8- and 55-bit rotations named in
+  * the embedded comments.
+  *
+  * NOTE(review): the assembly text names C locals and parameters directly and
+  * the asm statement has no operand or clobber lists -- confirm the intended
+  * toolchain accepts this form.
+  * NOTE(review): enc_loop is a named label; if the compiler emits this body
+  * more than once (for example by inlining) the label would be duplicated --
+  * confirm.
+  * The local `i` is not referenced by the assembly text.
+  */
+ static void permutation384(unsigned int *in, int rounds, unsigned char *rc) {
+
+	uint32_t w0, w1, w2, w3, w4, w5, w6, w7, w8, w9, w10, w11;
+	uint32_t s0, s1, s2;
+	uint32_t i=0;
+	__asm volatile(
+		"ldr     w0,     [in]          \n\t"
+		"ldr     w4,     [in, #4]      \n\t"
+		"ldr     w8,     [in, #8]      \n\t"
+		"ldr     w1,     [in, #12]     \n\t"
+		"ldr     w5,     [in, #16]     \n\t"
+		"ldr     w9,     [in, #20]     \n\t"
+		"ldr     w2,     [in, #24]     \n\t"
+		"ldr     w6,     [in, #28]     \n\t"
+		"ldr     w10,    [in, #32]     \n\t"
+		"ldr     w3,     [in, #36]     \n\t"
+		"ldr     w7,     [in, #40]     \n\t"
+		"ldr     w11,    [in, #44]     \n\t"	
+	"enc_loop:                       \n\t"
+    "/*add round const   s0 s1*/           \n\t"
+		"ldrb    s0,     [rc]          \n\t"	
+		"LSR     s1,     s0, #6       \n\t"	
+		"and    s1,     s1, 0x3        \n\t"
+		"LSR     s2,     s0, #3       \n\t"	
+		"and    s2,     s2, 0x7        \n\t"
+		"and    s0,     s0, 0x7        \n\t"
+	  "eors    w8,     w8, s0        \n\t"
+	  "eors    w4,     w4, s2        \n\t"
+	  "eors    w0,     w0, s1        \n\t"
+    "/*sbox first column*/         \n\t"
+		"mvns    w0,     w0            \n\t"
+		"ands    s0,     w1, w0        \n\t"
+		"eors    s0,     w2, s0        \n\t"
+		"orrs    w2,     w1, w2        \n\t"
+		"eors    w0,     w3, w0        \n\t"
+		"eors    w2,     w2, w0        \n\t"
+		"eors    s1,     w1, w3        \n\t"
+		"eors    w3,     w3, s0        \n\t"
+		"ands    w0,     s0, w0        \n\t"
+		"eors    w0,     s1, w0        \n\t"
+		"ands    w1,     w2, s1        \n\t"
+		"eors    w1,     s0, w1        \n\t"
+		"/*sbox second column*/        \n\t"
+		"mvns    w4,     w4            \n\t"
+		"ands    s0,     w5, w4        \n\t"
+		"eors    s0,     w6, s0        \n\t"
+		"orrs    w6,     w5, w6        \n\t"
+		"eors    w4,     w7, w4        \n\t"
+		"eors    w6,     w6, w4        \n\t"
+		"eors    s1,     w5, w7        \n\t"
+		"eors    w7,     w7, s0        \n\t"
+		"ands    w4,     s0, w4        \n\t"
+		"eors    w4,     s1, w4        \n\t"
+		"ands    w5,     w6, s1        \n\t"
+		"eors    w5,     s0, w5        \n\t"
+		"/*sbox third column*/         \n\t"
+		"mvns    w8,     w8            \n\t"
+		"ands    s0,     w9,  w8       \n\t"
+		"eors    s0,     w10, s0       \n\t"
+		"orrs    w10,    w9,  w10      \n\t"
+		"eors    w8,     w11, w8       \n\t"
+		"eors    w10,    w10, w8       \n\t"
+		"eors    s1,     w9,  w11      \n\t"
+		"eors    w11,    w11, s0       \n\t"
+		"ands    w8,     s0,  w8       \n\t"
+		"eors    w8,     s1,  w8       \n\t"
+		"ands    w9,     w10, s1       \n\t"
+		"eors    w9,     s0,  w9       \n\t"
+    "/*rotate shift left 1 bit  [w9 w5 w1-> (w1,1) w9 w5] */   \n\t"
+		"mov    s0,     w1       \n\t"
+		"mov    w1,     w5       \n\t"
+		"mov    w5,     w9       \n\t"
+		"ROR    w9,     s0, #31        \n\t"
+    "/*rotate shift left 8 bits [w10 w6 w2-> w6,3)  (w2,3)  ( w10,2)]*/  \n\t"
+		"mov    s0,     w10       \n\t"
+		"ROR    w10,    w6 , #29      \n\t"
+		"ROR    w6,     w2  , #29      \n\t"
+		"ROR    w2,     s0, #30        \n\t"
+    "/*rotate shift left 55 bit  [w11 w7 w3-> w3,13)  (w11,14)  ( w7,14)] */   \n\t"
+		"mov    s0,     w3       \n\t"
+		"ROR    w3,     w7 , #14      \n\t"
+		"ROR    w7,     w11 , #14      \n\t"
+		"ROR    w11,     s0, #13        \n\t"
+		"/*loop control*/              \n\t"
+ 		"adds    rc,     rc,  #1       \n\t"
+		"subs    rounds, rounds, #1    \n\t"
+		"bne     enc_loop              \n\t"
+		"str     w0,     [in]          \n\t"
+		"str     w4,     [in, #4]      \n\t"
+		"str     w8,     [in, #8]      \n\t"
+		"str     w1,     [in, #12]     \n\t"
+		"str     w5,     [in, #16]     \n\t"
+		"str     w9,     [in, #20]     \n\t"
+		"str     w2,     [in, #24]     \n\t"
+		"str     w6,     [in, #28]     \n\t"
+		"str     w10,    [in, #32]     \n\t"
+		"str     w3,     [in, #36]     \n\t"
+		"str     w7,     [in, #40]     \n\t"
+		"str     w11,    [in, #44]     \n\t"
+	);
+}
+
+/*
+ * Authenticated encryption with a 12-byte rate on the packed 384-bit state.
+ *
+ *   c, clen  - output: ciphertext (mlen bytes) followed by a CRYPTO_ABYTES
+ *              tag; *clen receives mlen + CRYPTO_ABYTES.
+ *   m, mlen  - plaintext.
+ *   ad,adlen - associated data.
+ *   nsec     - not used.
+ *   npub, k  - nonce and key, each read as two 12-byte halves.
+ *
+ * Always returns 0.
+ */
+int crypto_aead_encrypt(unsigned char *c, unsigned long long *clen,
+	const unsigned char *m, unsigned long long mlen,
+	const unsigned char *ad, unsigned long long adlen,
+	const unsigned char *nsec, const unsigned char *npub,
+	const unsigned char *k) {
+
+	/* Scratch names referenced textually by the pack/unpack macros. */
+	u32 t1, t2, t9;
+	u32 t1_32, t2_64, t2_65;
+	u32 temp0[3] = { 0 };
+	u32 temp1[3] = { 0 };
+	u32 temp2[3] = { 0 };
+
+	u32 s[12] = { 0 };      /* state: four packed 96-bit rows */
+	u32 block[3] = { 0 };   /* one packed rate block */
+	u8 padded[12] = { 0 };  /* final, padded partial block */
+
+	*clen = mlen + CRYPTO_ABYTES;
+
+	/* Initialization: rows 0-1 take the nonce, rows 2-3 take the key. */
+	packU96FormatToThreePacket(s, npub);
+	packU96FormatToThreePacket((s + 3), (npub + 12));
+	packU96FormatToThreePacket((s + 6), k);
+	packU96FormatToThreePacket((s + 9), (k + 12));
+	permutation384(s, PR0_ROUNDS, constant7Format);
+
+	/* Associated data: full blocks, then one block padded with 0x01. */
+	if (adlen != 0) {
+		for (; adlen >= aead_RATE; adlen -= aead_RATE, ad += aead_RATE) {
+			packU96FormatToThreePacket(block, ad);
+			s[0] ^= block[0];
+			s[1] ^= block[1];
+			s[2] ^= block[2];
+			permutation384(s, PR_ROUNDS, constant7Format);
+		}
+		memset(padded, 0, sizeof(padded));
+		memcpy(padded, ad, adlen);
+		padded[adlen] = 0x01;
+		packU96FormatToThreePacket(block, padded);
+		s[0] ^= block[0];
+		s[1] ^= block[1];
+		s[2] ^= block[2];
+		permutation384(s, PR_ROUNDS, constant7Format);
+	}
+
+	/* Separates the associated-data phase from the message phase. */
+	s[9] ^= 0x80000000;
+
+	/* Message: the rate part of the state after absorbing is the ciphertext. */
+	if (mlen != 0) {
+		for (; mlen >= aead_RATE;
+		     mlen -= aead_RATE, m += aead_RATE, c += aead_RATE) {
+			packU96FormatToThreePacket(block, m);
+			s[0] ^= block[0];
+			s[1] ^= block[1];
+			s[2] ^= block[2];
+			unpackU96FormatToThreePacket(c, s);
+			permutation384(s, PR_ROUNDS, constant7Format);
+		}
+		/* Last block is padded; only the first mlen bytes are emitted and no
+		 * permutation follows before finalization. */
+		memset(padded, 0, sizeof(padded));
+		memcpy(padded, m, mlen);
+		padded[mlen] = 0x01;
+		packU96FormatToThreePacket(block, padded);
+		s[0] ^= block[0];
+		s[1] ^= block[1];
+		s[2] ^= block[2];
+		unpackU96FormatToThreePacket(padded, s);
+		memcpy(c, padded, mlen);
+		c += mlen;
+	}
+
+	/* Finalization: the tag is rows 0 and 1 of the state. */
+	permutation384(s, PRF_ROUNDS, constant7Format);
+	unpackU96FormatToThreePacket(c, s);
+	unpackU96FormatToThreePacket((c + 12), (s + 3));
+	return 0;
+}
+
+/*
+ * Authenticated decryption with a 12-byte rate on the packed 384-bit state.
+ *
+ *   m, mlen  - output plaintext; *mlen receives clen - CRYPTO_ABYTES, or 0
+ *              when clen is shorter than the tag.
+ *   nsec     - not used.
+ *   c, clen  - ciphertext followed by a CRYPTO_ABYTES tag.
+ *   ad,adlen - associated data.
+ *   npub, k  - nonce and key, each read as two 12-byte halves.
+ *
+ * Returns 0 when the tag matches. Returns -1 when clen is too short or the
+ * tag does not match; in the latter case the plaintext already written to m
+ * is zeroed so that unauthenticated data is not released.
+ */
+int crypto_aead_decrypt(unsigned char *m, unsigned long long *mlen,
+	unsigned char *nsec, const unsigned char *c, unsigned long long clen,
+	const unsigned char *ad, unsigned long long adlen,
+	const unsigned char *npub, const unsigned char *k) {
+	u8 i;
+	u32 s[12] = { 0 };
+	u32 dataFormat[6] = { 0 };
+	u32 dataFormat_1[3] = { 0 };
+	u8 tempData[12] = { 0 };
+	u8 tempU8[48] = { 0 };
+	/* Scratch names referenced textually by the pack/unpack macros. */
+	u32 t1, t2, t9;
+	u32 t1_32, t2_64, t2_65;
+	u32 temp0[3] = { 0 };
+	u32 temp1[3] = { 0 };
+	u32 temp2[3] = { 0 };
+	unsigned char *m_start = m;     /* m is advanced below; keep the start */
+	unsigned long long msg_len;
+	u32 tag_diff = 0;
+
+	(void)nsec;
+
+	/* Check the length before subtracting so *mlen never receives a
+	 * wrapped-around value. */
+	if (clen < CRYPTO_ABYTES) {
+		*mlen = 0;
+		return -1;
+	}
+	msg_len = clen - CRYPTO_ABYTES;
+	*mlen = msg_len;
+
+	// initialization
+	packU96FormatToThreePacket(s, npub);
+	packU96FormatToThreePacket((s + 3), (npub + 12));
+	packU96FormatToThreePacket((s + 6), k);
+	packU96FormatToThreePacket((s + 9), (k + 12));
+	permutation384(s, PR0_ROUNDS, constant7Format);
+
+	// process associated data: full blocks, then one block padded with 0x01
+	if (adlen) {
+		while (adlen >= aead_RATE) {
+			packU96FormatToThreePacket(dataFormat, ad);
+			s[0] ^= dataFormat[0];
+			s[1] ^= dataFormat[1];
+			s[2] ^= dataFormat[2];
+			permutation384(s, PR_ROUNDS, constant7Format);
+			adlen -= aead_RATE;
+			ad += aead_RATE;
+		}
+		memset(tempData, 0, sizeof(tempData));
+		memcpy(tempData, ad, adlen);
+		tempData[adlen] = 0x01;
+		packU96FormatToThreePacket(dataFormat, tempData);
+		s[0] ^= dataFormat[0];
+		s[1] ^= dataFormat[1];
+		s[2] ^= dataFormat[2];
+		permutation384(s, PR_ROUNDS, constant7Format);
+	}
+	s[9] ^= 0x80000000;
+
+	// process ciphertext
+	clen = msg_len;
+	if (clen) {
+		while (clen >= aead_RATE) {
+			packU96FormatToThreePacket(dataFormat, c);
+			dataFormat_1[0] = s[0] ^ dataFormat[0];
+			dataFormat_1[1] = s[1] ^ dataFormat[1];
+			dataFormat_1[2] = s[2] ^ dataFormat[2];
+			unpackU96FormatToThreePacket(m, dataFormat_1);
+			/* The rate part of the state becomes the ciphertext block. */
+			s[0] = dataFormat[0];
+			s[1] = dataFormat[1];
+			s[2] = dataFormat[2];
+			permutation384(s, PR_ROUNDS, constant7Format);
+			clen -= aead_RATE;
+			m += aead_RATE;
+			c += aead_RATE;
+		}
+		/* Partial block: work on the byte form of the rate, replace the first
+		 * clen bytes with ciphertext, add the 0x01 padding, repack. */
+		unpackU96FormatToThreePacket(tempU8, s);
+		for (i = 0; i < clen; ++i, ++m, ++c)
+		{
+			*m = tempU8[i] ^ *c;
+			tempU8[i] = *c;
+		}
+		tempU8[i] ^= 0x01;
+		packU96FormatToThreePacket(s, tempU8);
+	}
+
+	// finalization
+	permutation384(s, PRF_ROUNDS, constant7Format);
+
+	// verify tag: c now points at the received tag
+	packU96FormatToThreePacket(dataFormat, c);
+	packU96FormatToThreePacket((dataFormat + 3), (c + 12));
+	/* Accumulate all differences instead of stopping at the first mismatch,
+	 * so the comparison time does not depend on where the tags differ. */
+	for (i = 0; i < 6; ++i) {
+		tag_diff |= dataFormat[i] ^ s[i];
+	}
+	if (tag_diff != 0) {
+		memset(m_start, 0, (size_t)msg_len);
+		return -1;
+	}
+	return 0;
+}
--- a/knot/Implementations/crypto_aead/knot256/armcortexm_1/api.h
+++ b/knot/Implementations/crypto_aead/knot256/armcortexm_1/api.h
+#ifndef KNOT_API_H
+#define KNOT_API_H
+//k=n=tag=256  b=512 r=128 c=384
+/* Sizes below are in bytes: 256-bit key, nonce and tag (see the line above). */
+/* Key length; encrypt.c copies this many bytes of k into the state. */
+#define CRYPTO_KEYBYTES 32
+/* Secret message number length; nsec is never read or written by encrypt.c. */
+#define CRYPTO_NSECBYTES 0
+/* Public nonce length; encrypt.c copies this many bytes of npub into the state. */
+#define CRYPTO_NPUBBYTES 32
+/* Tag length; the ciphertext is this much longer than the plaintext. */
+#define CRYPTO_ABYTES 32
+/* NOTE(review): presumably tells the test harness that input and output
+ * buffers must not overlap -- confirm against the harness documentation. */
+#define CRYPTO_NOOVERLAP 1
+#endif
--- a/knot/Implementations/crypto_aead/knot256/armcortexm_1/crypto_aead.h
+++ b/knot/Implementations/crypto_aead/knot256/armcortexm_1/crypto_aead.h
+#ifndef KNOT_CRYPTO_AEAD_H
+#define KNOT_CRYPTO_AEAD_H
+/*
+ * Encrypt mlen bytes at m with associated data ad (adlen bytes), public nonce
+ * npub and key k. Writes the ciphertext followed by a CRYPTO_ABYTES tag to c
+ * and stores mlen + CRYPTO_ABYTES in *clen. nsec is not used by the
+ * implementation in encrypt.c. Returns 0 on success, -1 on failure.
+ */
+int crypto_aead_encrypt(
+	unsigned char *c, unsigned long long *clen,
+	const unsigned char *m, unsigned long long mlen,
+	const unsigned char *ad, unsigned long long adlen,
+	const unsigned char *nsec,
+	const unsigned char *npub,
+	const unsigned char *k);
+
+/*
+ * Decrypt clen bytes at c (ciphertext followed by a CRYPTO_ABYTES tag) with
+ * associated data ad, public nonce npub and key k. On success writes the
+ * plaintext to m, stores its length in *mlen and returns 0. Returns -1 when
+ * clen is shorter than the tag or the tag does not match. nsec is not used by
+ * the implementation in encrypt.c.
+ */
+int crypto_aead_decrypt(
+	unsigned char *m, unsigned long long *mlen,
+	unsigned char *nsec,
+	const unsigned char *c, unsigned long long clen,
+	const unsigned char *ad, unsigned long long adlen,
+	const unsigned char *npub,
+	const unsigned char *k);
+#endif
--- a/knot/Implementations/crypto_aead/knot256/armcortexm_1/encrypt.c
+++ b/knot/Implementations/crypto_aead/knot256/armcortexm_1/encrypt.c
+#include <stdio.h>
+#include <stdint.h>
+#include <stdlib.h>
+#include <string.h>
+#include "crypto_aead.h"
+#include "api.h"
+
+#define ARR_SIZE(a) (sizeof((a))/sizeof((a[0])))
+
+#define KNOT_CIPHER 1
+#if defined(KNOT_CIPHER) && (KNOT_CIPHER == 1)
+/*
+ * Round-constant table, one byte per round. permutation512 reads it
+ * sequentially from index 0 and XORs each byte into the first state word.
+ * The largest round count passed in this file is 100, which stays inside the
+ * 127 entries.
+ * NOTE(review): the table is never written in this file but is not declared
+ * const; permutation512 takes a non-const pointer.
+ */
+unsigned char constant7[127] = {
+		0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x41, 0x03, 0x06,
+		0x0c, 0x18, 0x30, 0x61, 0x42, 0x05, 0x0a, 0x14, 0x28, 0x51, 0x23, 0x47,
+		0x0f, 0x1e, 0x3c, 0x79, 0x72, 0x64, 0x48, 0x11, 0x22, 0x45, 0x0b, 0x16,
+		0x2c, 0x59, 0x33, 0x67, 0x4e, 0x1d, 0x3a, 0x75, 0x6a, 0x54, 0x29, 0x53,
+		0x27, 0x4f, 0x1f, 0x3e, 0x7d, 0x7a, 0x74, 0x68, 0x50, 0x21, 0x43, 0x07,
+		0x0e, 0x1c, 0x38, 0x71, 0x62, 0x44, 0x09, 0x12, 0x24, 0x49, 0x13, 0x26,
+		0x4d, 0x1b, 0x36, 0x6d, 0x5a, 0x35, 0x6b, 0x56, 0x2d, 0x5b, 0x37, 0x6f,
+		0x5e, 0x3d, 0x7b, 0x76, 0x6c, 0x58, 0x31, 0x63, 0x46, 0x0d, 0x1a, 0x34,
+		0x69, 0x52, 0x25, 0x4b, 0x17, 0x2e, 0x5d, 0x3b, 0x77, 0x6e, 0x5c, 0x39,
+		0x73, 0x66, 0x4c, 0x19, 0x32, 0x65, 0x4a, 0x15, 0x2a, 0x55, 0x2b, 0x57,
+		0x2f, 0x5f, 0x3f, 0x7f, 0x7e, 0x7c, 0x78, 0x70, 0x60, 0x40 };
+
+/*
+ * Assembly-text fragments for loading/storing a 64-bit value as two 32-bit
+ * words at offsets 0 and 4.
+ * NOTE(review): the macro parameters appear only inside string literals, and
+ * the preprocessor does not substitute parameters inside strings, so the
+ * expansion always contains the literal names x0, x1, in and out regardless
+ * of the arguments. Neither macro is used in the code that follows.
+ */
+#define load64(x1, x0, in)               \
+    "ldr     x0,     [in]          \n\t" \
+		"ldr     x1,     [in, #4]      \n\t"
+
+#define store64(x1, x0, out)             \
+		"str     x0,     [out]         \n\t" \
+		"str     x1,     [out, #4]     \n\t"
+
+/* State
+ * w12 w8  w4  w0
+ * w13 w9  w5  w1
+ * w14 w10 w6  w2
+ * w15 w11 w7  w3
+ *
+ * Sbox
+	t1  = ~a;
+	t2  = b  & t1;
+	t3  = c  ^ t2; 
+	h   = d  ^ t3; 
+	t5  = b  | c; 
+	t6  = d  ^ t1; 
+	g   = t5 ^ t6; 
+	t8  = b  ^ d; 
+	t9  = t3 & t6; 
+	e   = t8 ^ t9; 
+	t11 = g  & t8; 
+	f   = t3 ^ t11;
+ *
+ * Sbox after change
+	a  = ~a; 
+	s0  = b  & a;
+	s0  = c  ^ s0;
+	c  = b  | c; 
+	a  = d  ^ a; 
+	c   = c ^ a; 
+	s1  = b  ^ d; 
+	d   = d  ^ s0;
+	a  = s0 & a; 
+	a   = s1 ^ a; 
+	b = c  & s1; 
+	b   = s0 ^ b;
+ */
+/*
+ * Apply `rounds` rounds of the 512-bit permutation to the state at `in`, in
+ * place, using inline assembly.
+ *
+ *   in     - 64 bytes, accessed as 16 consecutive 32-bit words; all are loaded
+ *            before the loop and stored back after it.
+ *   rounds - round count. The loop decrements it and repeats until it reaches
+ *            zero, so it must be at least 1 (0 would wrap the counter).
+ *   rc     - round-constant bytes; one byte is consumed per round, starting
+ *            at rc[0].
+ *
+ * Per round: the constant byte is XORed into w0; the S-box is applied to each
+ * of the four columns; then rows (w13 w9 w5 w1), (w14 w10 w6 w2) and
+ * (w15 w11 w7 w3) are rotated left by 1, 16 and 25 bits as 128-bit values,
+ * carrying the bits shifted out of each word into its neighbour.
+ *
+ * NOTE(review): `in` is an unsigned char pointer but is read and written with
+ * word loads/stores -- confirm callers pass suitably aligned storage or that
+ * the target tolerates unaligned word access.
+ * NOTE(review): the assembly text names C locals and parameters directly and
+ * the asm statement has no operand or clobber lists -- confirm the intended
+ * toolchain accepts this form.
+ * NOTE(review): enc_loop is a named label; if the compiler emits this body
+ * more than once (for example by inlining) the label would be duplicated --
+ * confirm.
+ */
+static void permutation512(unsigned char *in, int rounds, unsigned char *rc) {
+	uint32_t w0, w1, w2, w3, w4, w5, w6, w7;
+	uint32_t w8, w9, w10, w11, w12, w13, w14, w15;
+	uint32_t s0, s1;
+	/* Masks for the bits carried between words by the 1- and 16-bit rotations. */
+	uint32_t one = 0x1;
+	uint32_t ffff = 0xffff;
+	/* Built inside the asm as (0x1fff << 12) ^ 0xfff = 0x01ffffff, the 25-bit
+	 * carry mask for the 25-bit rotation. */
+	uint32_t value;
+	__asm volatile(
+		"ldr     w0,     [in]          \n\t"
+		"ldr     w4,     [in, #4]      \n\t"
+		"ldr     w8,     [in, #8]      \n\t"
+		"ldr     w12,    [in, #12]     \n\t"
+		"ldr     w1,     [in, #16]     \n\t"
+		"ldr     w5,     [in, #20]     \n\t"
+		"ldr     w9,     [in, #24]     \n\t"
+		"ldr     w13,    [in, #28]     \n\t"
+		"ldr     w2,     [in, #32]     \n\t"
+		"ldr     w6,     [in, #36]     \n\t"
+		"ldr     w10,    [in, #40]     \n\t"
+		"ldr     w14,    [in, #44]     \n\t"
+		"ldr     w3,     [in, #48]     \n\t"
+		"ldr     w7,     [in, #52]     \n\t"
+		"ldr     w11,    [in, #56]     \n\t"
+		"ldr     w15,    [in, #60]     \n\t"
+		"mov     s0,     0xfff         \n\t"
+		"mov     value,  0x1fff        \n\t"
+		"lsl     value,  value, #12    \n\t"
+		"eors    value,  value, s0     \n\t"
+	"enc_loop:                       \n\t"
+    "/*add round const*/           \n\t"
+		"ldrb    s0,     [rc]          \n\t"
+	  "eors    w0,     w0, s0        \n\t"
+    "/*sbox first column*/         \n\t"
+		"mvns    w0,     w0            \n\t"
+		"ands    s0,     w1, w0        \n\t"
+		"eors    s0,     w2, s0        \n\t"
+		"orrs    w2,     w1, w2        \n\t"
+		"eors    w0,     w3, w0        \n\t"
+		"eors    w2,     w2, w0        \n\t"
+		"eors    s1,     w1, w3        \n\t"
+		"eors    w3,     w3, s0        \n\t"
+		"ands    w0,     s0, w0        \n\t"
+		"eors    w0,     s1, w0        \n\t"
+		"ands    w1,     w2, s1        \n\t"
+		"eors    w1,     s0, w1        \n\t"
+		"/*sbox second column*/        \n\t"
+		"mvns    w4,     w4            \n\t"
+		"ands    s0,     w5, w4        \n\t"
+		"eors    s0,     w6, s0        \n\t"
+		"orrs    w6,     w5, w6        \n\t"
+		"eors    w4,     w7, w4        \n\t"
+		"eors    w6,     w6, w4        \n\t"
+		"eors    s1,     w5, w7        \n\t"
+		"eors    w7,     w7, s0        \n\t"
+		"ands    w4,     s0, w4        \n\t"
+		"eors    w4,     s1, w4        \n\t"
+		"ands    w5,     w6, s1        \n\t"
+		"eors    w5,     s0, w5        \n\t"
+		"/*sbox third column*/         \n\t"
+		"mvns    w8,     w8            \n\t"
+		"ands    s0,     w9,  w8       \n\t"
+		"eors    s0,     w10, s0       \n\t"
+		"orrs    w10,    w9,  w10      \n\t"
+		"eors    w8,     w11, w8       \n\t"
+		"eors    w10,    w10, w8       \n\t"
+		"eors    s1,     w9,  w11      \n\t"
+		"eors    w11,    w11, s0       \n\t"
+		"ands    w8,     s0,  w8       \n\t"
+		"eors    w8,     s1,  w8       \n\t"
+		"ands    w9,     w10, s1       \n\t"
+		"eors    w9,     s0,  w9       \n\t"
+		"/*sbox forth column*/         \n\t"
+		"mvns    w12,    w12           \n\t"
+		"ands    s0,     w13, w12      \n\t"
+		"eors    s0,     w14, s0       \n\t"
+		"orrs    w14,    w13, w14      \n\t"
+		"eors    w12,    w15, w12      \n\t"
+		"eors    w14,    w14, w12      \n\t"
+		"eors    s1,     w13, w15      \n\t"
+		"eors    w15,    w15, s0       \n\t"
+		"ands    w12,    s0,  w12      \n\t"
+		"eors    w12,    s1,  w12      \n\t"
+		"ands    w13,    w14, s1       \n\t"
+		"eors    w13,    s0,  w13      \n\t"
+    "/*rotate shift left 1 bit*/   \n\t"
+		"ror     s0,     w1, #31       \n\t"
+		"ands    s0,     s0, one       \n\t"
+		"lsl     w1,     w1, #1        \n\t"
+		"ror     s1,     w13,#31       \n\t"
+		"ands    s1,     s1, one       \n\t"
+		"eors    w1,     w1, s1        \n\t"
+		"ror     s1,     w9, #31       \n\t"
+		"ands    s1,     s1, one       \n\t"
+		"lsl     w13,    w13,#1        \n\t"
+		"eors    w13,    w13,s1        \n\t"
+		"ror     s1,     w5, #31       \n\t"
+		"ands    s1,     s1, one       \n\t"
+		"lsl     w9,     w9, #1        \n\t"
+		"eors    w9,     w9, s1        \n\t"
+		"lsl     w5,     w5, #1        \n\t"
+		"eors    w5,     w5, s0        \n\t"
+    "/*rotate shift left 16 bits*/ \n\t"
+		"ror     s0,     w2, #16       \n\t"
+		"ands    s0,     s0, ffff      \n\t"
+		"lsl     w2,     w2, #16       \n\t"
+		"ror     s1,     w14,#16       \n\t"
+		"ands    s1,     s1, ffff      \n\t"
+		"eors    w2,     w2, s1        \n\t"
+		"ror     s1,     w10,#16       \n\t"
+		"ands    s1,     s1, ffff      \n\t"
+		"lsl     w14,    w14,#16       \n\t"
+		"eors    w14,    w14,s1        \n\t"
+		"ror     s1,     w6, #16       \n\t"
+		"ands    s1,     s1, ffff      \n\t"
+		"lsl     w10,    w10,#16       \n\t"
+		"eors    w10,    w10,s1        \n\t"
+		"lsl     w6,     w6, #16       \n\t"
+		"eors    w6,     w6, s0        \n\t"
+    "/*rotate shift left 25 bits*/ \n\t"
+		"ror     s0,     w3, #7        \n\t"
+		"ands    s0,     s0, value     \n\t"
+		"lsl     w3,     w3, #25       \n\t"
+		"ror     s1,     w15,#7        \n\t"
+		"ands    s1,     s1, value     \n\t"
+		"eors    w3,     w3, s1        \n\t"
+		"ror     s1,     w11,#7        \n\t"
+		"ands    s1,     s1, value     \n\t"
+		"lsl     w15,    w15,#25       \n\t"
+		"eors    w15,    w15,s1        \n\t"
+		"ror     s1,     w7, #7        \n\t"
+		"ands    s1,     s1, value     \n\t"
+		"lsl     w11,    w11,#25       \n\t"
+		"eors    w11,    w11,s1        \n\t"
+		"lsl     w7,     w7, #25       \n\t"
+		"eors    w7,     w7, s0        \n\t"
+		"/*loop control*/              \n\t"
+ 		"adds    rc,     rc,  #1       \n\t"
+		"subs    rounds, rounds, #1    \n\t"
+		"bne     enc_loop              \n\t"
+		"str     w0,     [in]          \n\t"
+		"str     w4,     [in, #4]      \n\t"
+		"str     w8,     [in, #8]      \n\t"
+		"str     w12,    [in, #12]     \n\t"
+		"str     w1,     [in, #16]     \n\t"
+		"str     w5,     [in, #20]     \n\t"
+		"str     w9,     [in, #24]     \n\t"
+		"str     w13,    [in, #28]     \n\t"
+		"str     w2,     [in, #32]     \n\t"
+		"str     w6,     [in, #36]     \n\t"
+		"str     w10,    [in, #40]     \n\t"
+		"str     w14,    [in, #44]     \n\t"
+		"str     w3,     [in, #48]     \n\t"
+		"str     w7,     [in, #52]     \n\t"
+		"str     w11,    [in, #56]     \n\t"
+		"str     w15,    [in, #60]     \n\t"
+	);
+}
+
+/*
+ * Authenticated encryption with a 16-byte rate on a 64-byte state.
+ *
+ *   c, clen  - output: ciphertext (mlen bytes) followed by a CRYPTO_ABYTES
+ *              tag; *clen receives mlen + CRYPTO_ABYTES.
+ *   m, mlen  - plaintext.
+ *   ad,adlen - associated data.
+ *   nsec     - not used.
+ *   npub, k  - nonce (CRYPTO_NPUBBYTES) and key (CRYPTO_KEYBYTES).
+ *
+ * Input is absorbed directly from ad and m, so no heap memory is needed and
+ * the function cannot fail; it always returns 0. Output is identical to the
+ * earlier version that first copied the padded inputs into malloc'd buffers.
+ */
+int crypto_aead_encrypt(unsigned char *c, unsigned long long *clen,
+                        const unsigned char *m, unsigned long long mlen,
+                        const unsigned char *ad, unsigned long long adlen,
+                        const unsigned char *nsec, const unsigned char *npub,
+                        const unsigned char *k) {
+	/* Word-typed storage guarantees the alignment needed by the word
+	 * loads/stores in permutation512; S is the byte view of the same state. */
+	uint32_t state[16];
+	unsigned char *S = (unsigned char *)state;
+	unsigned long long remaining;
+	unsigned int i;
+
+	(void)nsec;
+
+	// initialization: state = nonce || key
+	memcpy(S, npub, CRYPTO_NPUBBYTES);
+	memcpy(S + CRYPTO_NPUBBYTES, k, CRYPTO_KEYBYTES);
+	permutation512(S, 100, constant7);
+
+	// processing associated data: full blocks, then one block padded with 0x01
+	if (adlen != 0) {
+		remaining = adlen;
+		while (remaining >= 16) {
+			for (i = 0; i < 16; i++) {
+				S[i] ^= ad[i];
+			}
+			permutation512(S, 52, constant7);
+			ad += 16;
+			remaining -= 16;
+		}
+		for (i = 0; i < remaining; i++) {
+			S[i] ^= ad[i];
+		}
+		S[remaining] ^= 0x01;
+		permutation512(S, 52, constant7);
+	}
+	/* Separates the associated-data phase from the message phase. */
+	S[63] ^= 0x80;
+
+	// processing plaintext: the rate after absorbing is the ciphertext
+	if (mlen != 0) {
+		remaining = mlen;
+		while (remaining >= 16) {
+			for (i = 0; i < 16; i++) {
+				S[i] ^= m[i];
+				c[i] = S[i];
+			}
+			permutation512(S, 52, constant7);
+			m += 16;
+			c += 16;
+			remaining -= 16;
+		}
+		/* Final partial block: no permutation here, finalization follows. */
+		for (i = 0; i < remaining; i++) {
+			S[i] ^= m[i];
+			c[i] = S[i];
+		}
+		S[remaining] ^= 0x01;
+		c += remaining;
+	}
+
+	// finalization
+	permutation512(S, 56, constant7);
+
+	// return tag: c now points just past the ciphertext
+	memcpy(c, S, CRYPTO_ABYTES);
+	*clen = mlen + CRYPTO_ABYTES;
+	return 0;
+}
+
+/*
+ * Authenticated decryption with a 16-byte rate on a 64-byte state.
+ *
+ *   m, mlen  - output plaintext; *mlen receives clen - CRYPTO_ABYTES on
+ *              success and 0 on failure.
+ *   nsec     - not used.
+ *   c, clen  - ciphertext followed by a CRYPTO_ABYTES tag.
+ *   ad,adlen - associated data.
+ *   npub, k  - nonce (CRYPTO_NPUBBYTES) and key (CRYPTO_KEYBYTES).
+ *
+ * Returns 0 when the tag matches. Returns -1 when clen is shorter than the
+ * tag or the tag does not match; on a mismatch the plaintext written to m is
+ * zeroed. No heap memory is used, so there is nothing to release on any
+ * return path (the earlier version leaked its padded copy of the associated
+ * data when verification failed).
+ */
+int crypto_aead_decrypt(unsigned char *m, unsigned long long *mlen,
+                        unsigned char *nsec,
+                        const unsigned char *c, unsigned long long clen,
+                        const unsigned char *ad, unsigned long long adlen,
+                        const unsigned char *npub, const unsigned char *k)
+{
+	/* Word-typed storage guarantees the alignment needed by the word
+	 * loads/stores in permutation512; S is the byte view of the same state. */
+	uint32_t state[16];
+	unsigned char *S = (unsigned char *)state;
+	unsigned char *out = m;
+	unsigned long long msg_len;
+	unsigned long long remaining;
+	unsigned char ct;
+	unsigned char diff = 0;
+	unsigned int i;
+
+	(void)nsec;
+
+	*mlen = 0;
+	if (clen < CRYPTO_ABYTES) {
+		return -1;
+	}
+	msg_len = clen - CRYPTO_ABYTES;
+
+	// initialization: state = nonce || key
+	memcpy(S, npub, CRYPTO_NPUBBYTES);
+	memcpy(S + CRYPTO_NPUBBYTES, k, CRYPTO_KEYBYTES);
+	permutation512(S, 100, constant7);
+
+	// processing associated data: full blocks, then one block padded with 0x01
+	if (adlen != 0) {
+		remaining = adlen;
+		while (remaining >= 16) {
+			for (i = 0; i < 16; i++) {
+				S[i] ^= ad[i];
+			}
+			permutation512(S, 52, constant7);
+			ad += 16;
+			remaining -= 16;
+		}
+		for (i = 0; i < remaining; i++) {
+			S[i] ^= ad[i];
+		}
+		S[remaining] ^= 0x01;
+		permutation512(S, 52, constant7);
+	}
+	/* Separates the associated-data phase from the message phase. */
+	S[63] ^= 0x80;
+
+	// processing ciphertext: plaintext = rate XOR ciphertext, rate = ciphertext
+	if (msg_len != 0) {
+		remaining = msg_len;
+		while (remaining >= 16) {
+			for (i = 0; i < 16; i++) {
+				ct = c[i];      /* read first so m may alias c */
+				out[i] = S[i] ^ ct;
+				S[i] = ct;
+			}
+			permutation512(S, 52, constant7);
+			c += 16;
+			out += 16;
+			remaining -= 16;
+		}
+		/* Final partial block: no permutation here, finalization follows. */
+		for (i = 0; i < remaining; i++) {
+			ct = c[i];
+			out[i] = S[i] ^ ct;
+			S[i] = ct;
+		}
+		S[remaining] ^= 0x01;
+		c += remaining;
+	}
+
+	// finalization
+	permutation512(S, 56, constant7);
+
+	// verify tag: c now points at the received tag.
+	/* Accumulate all differences instead of returning at the first mismatch,
+	 * so the comparison time does not depend on where the tags differ. */
+	for (i = 0; i < CRYPTO_ABYTES; i++) {
+		diff |= (unsigned char)(c[i] ^ S[i]);
+	}
+	if (diff != 0) {
+		memset(m, 0, (size_t)msg_len);
+		return -1;
+	}
+	*mlen = msg_len;
+	return 0;
+}
+#else
+/*
+ * Placeholder used when the real implementation is compiled out:
+ * touches none of its arguments and always reports success (0).
+ * NOTE(review): *clen and c are left unwritten -- confirm no caller
+ * relies on them in this configuration.
+ */
+int crypto_aead_encrypt(unsigned char *c, unsigned long long *clen,
+                        const unsigned char *m, unsigned long long mlen,
+                        const unsigned char *ad, unsigned long long adlen,
+                        const unsigned char *nsec, const unsigned char *npub,
+                        const unsigned char *k)
+{
+	(void)c;
+	(void)clen;
+	(void)m;
+	(void)mlen;
+	(void)ad;
+	(void)adlen;
+	(void)nsec;
+	(void)npub;
+	(void)k;
+	return 0;
+}
+/*
+ * Placeholder used when the real implementation is compiled out:
+ * touches none of its arguments and always reports success (0).
+ * NOTE(review): *mlen and m are left unwritten and no tag is checked --
+ * confirm no caller relies on them in this configuration.
+ */
+int crypto_aead_decrypt(unsigned char *m, unsigned long long *mlen,
+                        unsigned char *nsec,
+                        const unsigned char *c, unsigned long long clen,
+                        const unsigned char *ad, unsigned long long adlen,
+                        const unsigned char *npub, const unsigned char *k)
+{
+	(void)m;
+	(void)mlen;
+	(void)nsec;
+	(void)c;
+	(void)clen;
+	(void)ad;
+	(void)adlen;
+	(void)npub;
+	(void)k;
+	return 0;
+}
+#endif
\ No newline at end of file
--- a/knot/Implementations/crypto_aead/knot256/armcortexm_2/api.h
+++ b/knot/Implementations/crypto_aead/knot256/armcortexm_2/api.h
+/* Size parameters (in bytes) for this variant. */
+/* Key length: encrypt.c reads the key as two 16-byte halves (k, k + 16). */
+#define CRYPTO_KEYBYTES 32  //256/8=32
+/* No secret message number; the nsec argument is not read in encrypt.c. */
+#define CRYPTO_NSECBYTES 0
+/* Nonce length: encrypt.c reads the nonce as two 16-byte halves (npub, npub + 16). */
+#define CRYPTO_NPUBBYTES 32
+/* Tag length appended to the ciphertext (clen = mlen + CRYPTO_ABYTES). */
+#define CRYPTO_ABYTES 32
+/* NOTE(review): presumably tells the test harness that input and output
+ * buffers must not overlap -- not referenced in the visible sources. */
+#define CRYPTO_NOOVERLAP 1
+
+
--- a/knot/Implementations/crypto_aead/knot256/armcortexm_2/auxFormat.h
+++ b/knot/Implementations/crypto_aead/knot256/armcortexm_2/auxFormat.h
+//#include<malloc.h>
+#include"crypto_aead.h"
+#include"api.h"
+#include  <string.h>
+#include <stdio.h>
+#include <stdint.h>
+/* Byte-order hook for 32-bit loads/stores; currently the identity. */
+#define U32BIG(x) (x)
+/* Element count of a true array (not a pointer). The argument is
+ * parenthesized before indexing so any array-valued expression is safe. */
+#define ARR_SIZE(a) (sizeof(a) / sizeof((a)[0]))
+/* Rotate the 32-bit value x left by n bits. n must be in 1..31: n == 0
+ * would shift right by 32, which is undefined for a 32-bit operand. */
+#define LOTR32(x,n) (((x)<<(n))|((x)>>(32-(n))))
+
+/*
+ * Bitsliced S-box on four input words (a, b, c, d) producing four output
+ * words (e, f, g, h):
+ *   h = d ^ c ^ (b & ~a)
+ *   g = (b | c) ^ d ^ ~a
+ *   e = (b ^ d) ^ ((c ^ (b & ~a)) & (d ^ ~a))
+ *   f = (c ^ (b & ~a)) ^ (g & (b ^ d))
+ * Requires t1, t2, t3, t5, t6, t8, t9 and t11 to be declared in the
+ * calling scope; all of them are overwritten. Outputs are assigned while
+ * inputs are still being read (h and g before the last reads of b and d),
+ * so the output arguments must not alias the inputs. Arguments are pasted
+ * unparenthesized -- pass simple lvalues only.
+ */
+#define sbox(a, b, c, d, e, f, g, h)                                                                            \
+{                                                                                                                             \
+	t1 = ~a; t2 = b & t1;t3 = c ^ t2; h = d ^ t3; t5 = b | c; t6 = d ^ t1; g = t5 ^ t6; t8 = b ^ d; t9 = t3 & t6; e = t8 ^ t9; t11 = g & t8; f = t3 ^ t11; \
+}
+
+/* Short fixed-purpose integer aliases used throughout this implementation.
+ * The code treats u32 as exactly 32 bits (see the masks and rotations). */
+typedef unsigned char u8;
+typedef unsigned int u32;
+typedef unsigned long long u64;
+/* Debug print helpers; declared here but not defined in the visible sources.
+ * NOTE(review): presumably dump a 128-bit value / a byte array -- confirm. */
+void printfU128Format(char name[], u32 * in);
+void printU8(char name[], u8 var[], long len, int offset);
+
+
+// Uses (and overwrites) the caller-declared scratch variable t9.
+// Permutes the bits of `in` in place with four masked swap steps; each
+// step exchanges the bits selected by the mask with the bits `shift`
+// positions above them (shift = 1, 2, 4, 8 in that order).
+// `in` is evaluated several times and must be a plain u32 lvalue.
+// unpuck32 applies the same steps in the opposite order.
+#define puck32(in)\
+{\
+t9 = (in ^ (in >> 1)) & 0x22222222; in ^= t9 ^ (t9 << 1);\
+t9 = (in ^ (in >> 2)) & 0x0C0C0C0C; in ^= t9 ^ (t9 << 2);\
+t9 = (in ^ (in >> 4)) & 0x00F000F0; in ^= t9 ^ (t9 << 4);\
+t9 = (in ^ (in >> 8)) & 0x0000FF00; in ^= t9 ^ (t9 << 8);\
+}
+// Uses (and overwrites) the caller-declared scratch variable t9.
+// Applies the four masked swap steps of puck32 in reverse order
+// (shift = 8, 4, 2, 1) to `t0` in place. Each individual step is its own
+// inverse, so this undoes one puck32. `t0` is evaluated several times and
+// must be a plain u32 lvalue.
+#define unpuck32(t0){\
+	t9 = (t0 ^ (t0 >> 8)) & 0x0000FF00, t0 ^= t9 ^ (t9 << 8); \
+	t9 = (t0 ^ (t0 >> 4)) & 0x00F000F0, t0 ^= t9 ^ (t9 << 4); \
+	t9 = (t0 ^ (t0 >> 2)) & 0x0C0C0C0C, t0 ^= t9 ^ (t9 << 2); \
+	t9 = (t0 ^ (t0 >> 1)) & 0x22222222, t0 ^= t9 ^ (t9 << 1); \
+}
+// Requires u32 t1, t2, t3, t8 and t9 in the calling scope (all overwritten).
+// Converts 16 bytes at `in` into the four-word packed form stored in
+// out[0..3]:
+//   1. load four 32-bit words from `in` (through a u32* cast),
+//   2. apply puck32 twice to each word,
+//   3. transpose bytes: out[j] gathers byte j of each word, with the
+//      first word in bits 7..0 and the fourth word in bits 31..24.
+// NOTE(review): the u32* cast assumes `in` is suitably aligned and drops
+// any const qualifier -- confirm this is acceptable on the target.
+// Arguments are pasted unparenthesized; pass expressions such as
+// (ptr + 16) already wrapped in parentheses, as the callers do.
+#define packU128FormatToFourPacket(out,in) {\
+	     t8 = U32BIG(((u32*)in)[0]);	\
+		 t1 = U32BIG(((u32*)in)[1]);	\
+		 t2 = U32BIG(((u32*)in)[2]);	\
+		 t3 = U32BIG(((u32*)in)[3]);	\
+		puck32(t8); puck32(t8);	\
+		puck32(t1); puck32(t1);	\
+		puck32(t2); puck32(t2);	\
+		puck32(t3); puck32(t3);	\
+		out[3] = t3 & 0xff000000 | ((t2 >> 8) & 0x00ff0000) | ((t1 >> 16) & 0x0000ff00) | (t8 >> 24);	\
+		out[2] = ((t3 << 8) & 0xff000000) | (t2 & 0x00ff0000) | ((t1 >> 8) & 0x0000ff00) | ((t8 >> 16) & 0x000000ff);	\
+		out[1] = ((t3 << 16) & 0xff000000) | ((t2 << 8) & 0x00ff0000) | (t1 & 0x0000ff00) | ((t8 >> 8) & 0x000000ff);	\
+		out[0] = ((t3 << 24) & 0xff000000) | ((t2 << 16) & 0x00ff0000) | ((t1 << 8) & 0x0000ff00) | (t8 & 0x000000ff);	\
+}
+// Requires u32 dataFormat[4] and u32 t1, t2, t3, t8, t9 in the calling
+// scope (all overwritten, including dataFormat).
+// Mirror of packU128FormatToFourPacket: copies four packed words from
+// `in` into dataFormat, applies the same byte transpose, applies unpuck32
+// twice to each word, and stores the four words at `out` through a u32*
+// cast (16 bytes written).
+// NOTE(review): the u32* store assumes `out` is suitably aligned -- confirm
+// on the target. Arguments are pasted unparenthesized.
+#define unpackU128FormatToFourPacket( out,  in) {\
+memcpy(dataFormat, in, sizeof(unsigned int) * 4);	\
+t3 = dataFormat[3] & 0xff000000 | ((dataFormat[2] >> 8) & 0x00ff0000) | ((dataFormat[1] >> 16) & 0x0000ff00) | (dataFormat[0] >> 24);	\
+t2 = ((dataFormat[3] << 8) & 0xff000000) | (dataFormat[2] & 0x00ff0000) | ((dataFormat[1] >> 8) & 0x0000ff00) | ((dataFormat[0] >> 16) & 0x000000ff);	\
+t1 = ((dataFormat[3] << 16) & 0xff000000) | ((dataFormat[2] << 8) & 0x00ff0000) | (dataFormat[1] & 0x0000ff00) | ((dataFormat[0] >> 8) & 0x000000ff);	\
+t8 = ((dataFormat[3] << 24) & 0xff000000) | ((dataFormat[2] << 16) & 0x00ff0000) | ((dataFormat[1] << 8) & 0x0000ff00) | (dataFormat[0] & 0x000000ff);	\
+unpuck32(t8); unpuck32(t8);	\
+unpuck32(t1); unpuck32(t1);	\
+unpuck32(t2); unpuck32(t2);	\
+unpuck32(t3); unpuck32(t3);	\
+((u32*)out)[0] = U32BIG(t8);	\
+((u32*)out)[1] = U32BIG(t1);	\
+((u32*)out)[2] = U32BIG(t2);	\
+((u32*)out)[3] = U32BIG(t3);	\
+}
+// Requires u32 t1, t2 and t9 in the calling scope (all overwritten).
+// 8-byte counterpart of packU128FormatToFourPacket: loads two 32-bit
+// words from `in`, applies puck32 twice to each, then sets
+// out[j] = (byte j of the second word) << 8 | (byte j of the first word)
+// for j = 0..3, leaving the upper 16 bits of each out[j] zero.
+// Not used by the code visible in encrypt.c.
+#define packU64FormatToFourPacket(  out,   in) {\
+t1 = U32BIG(((u32*)in)[0]);	\
+t2 = U32BIG(((u32*)in)[1]);	\
+puck32(t1);	\
+puck32(t1);	\
+puck32(t2);	\
+puck32(t2);	\
+out[3] = ((t2 >> 16) & 0x0000ff00) | ((t1 >> 24));	\
+out[2] = ((t2 >> 8) & 0x0000ff00) | ((t1 >> 16) & 0x000000ff);	\
+out[1] = (t2 & 0x0000ff00) | ((t1 >> 8) & 0x000000ff);	\
+out[0] = ((t2 << 8) & 0x0000ff00) | (t1 & 0x000000ff);	\
+}
+/*
+ * Row rotations on a row held as four packed words (a0..a3 in, r0..r3 out).
+ * Outputs are assigned in the order r0, r1, r2, r3.
+ */
+/* r0 = a3 rotated left by 1; r1..r3 = a0..a2 unchanged. */
+#define BIT_LOTR32_1(a0,a1,a2,a3,r0,r1,r2,r3){\
+r0 = LOTR32(a3, 1);\
+r1 = a0;\
+r2 = a1;\
+r3 = a2;\
+}
+/* Every word rotated left by 4, positions unchanged. */
+#define BIT_LOTR32_16(a0,a1,a2,a3,r0,r1,r2,r3){\
+r0 = LOTR32(a0, 4);\
+r1 = LOTR32(a1, 4);\
+r2 = LOTR32(a2, 4);\
+r3 = LOTR32(a3, 4);\
+}
+/* r0 = a3 rotated left by 7; r1..r3 = a0..a2 rotated left by 6. */
+#define BIT_LOTR32_25(a0,a1,a2,a3,r0,r1,r2,r3){\
+r0 = LOTR32(a3, 7);\
+r1 = LOTR32(a0, 6);\
+r2 = LOTR32(a1, 6);\
+r3 = LOTR32(a2, 6);\
+}
+
+/*
+ * One round of the 512-bit permutation in C, operating on u32 s[16] and
+ * u32 s_temp[16] that must exist in the calling scope (plus the sbox
+ * temporaries t1, t2, t3, t5, t6, t8, t9, t11).
+ *   - arr[lunNum] is the round-constant byte; its four 2-bit fields
+ *     (bits 1..0, 3..2, 5..4, 7..6) are XORed into s[0], s[1], s[2], s[3].
+ *   - sbox is applied to the four columns (s[i], s[i+4], s[i+8], s[i+12]).
+ *   - s[0..3] take the S-box output directly; s[4..7], s[8..11] and
+ *     s[12..15] are produced by BIT_LOTR32_1, BIT_LOTR32_16 and
+ *     BIT_LOTR32_25 respectively.
+ * Not referenced by the code visible in encrypt.c, which uses the
+ * assembly permutation512 instead.
+ */
+#define ROUND512( arr,lunNum) {\
+s[3] ^= (arr[lunNum] >> 6) & 0x3;\
+s[2] ^= (arr[lunNum] >> 4) & 0x3;\
+s[1] ^= (arr[lunNum] >> 2) & 0x3;\
+s[0] ^= arr[lunNum] & 0x3;\
+sbox(s[0], s[4], s[8], s[12], s_temp[0], s_temp[4], s_temp[8], s_temp[12]);\
+sbox(s[1], s[5], s[9], s[13], s_temp[1], s_temp[5], s_temp[9], s_temp[13]);\
+sbox(s[2], s[6], s[10], s[14], s_temp[2], s_temp[6], s_temp[10], s_temp[14]);\
+sbox(s[3], s[7], s[11], s[15], s_temp[3], s_temp[7], s_temp[11], s_temp[15]);\
+s[0] = s_temp[0], s[1] = s_temp[1], s[2] = s_temp[2], s[3] = s_temp[3];\
+BIT_LOTR32_1(s_temp[4], s_temp[5], s_temp[6], s_temp[7], s[4], s[5], s[6], s[7]);\
+BIT_LOTR32_16(s_temp[8], s_temp[9], s_temp[10], s_temp[11], s[8], s[9], s[10], s[11]);\
+BIT_LOTR32_25(s_temp[12], s_temp[13], s_temp[14], s_temp[15], s[12], s[13], s[14], s[15]);\
+}
+
+
--- a/knot/Implementations/crypto_aead/knot256/armcortexm_2/crypto_aead.h
+++ b/knot/Implementations/crypto_aead/knot256/armcortexm_2/crypto_aead.h
+/*
+ * Authenticated encryption.
+ * Writes mlen ciphertext bytes followed by a CRYPTO_ABYTES-byte tag to c
+ * and stores mlen + CRYPTO_ABYTES in *clen. ad/adlen is the associated
+ * data, npub the nonce and k the key; nsec is not read by the
+ * implementation in encrypt.c. That implementation always returns 0.
+ */
+int crypto_aead_encrypt(
+	unsigned char *c, unsigned long long *clen,
+	const unsigned char *m, unsigned long long mlen,
+	const unsigned char *ad, unsigned long long adlen,
+	const unsigned char *nsec,
+	const unsigned char *npub,
+	const unsigned char *k
+);
+
+/*
+ * Authenticated decryption.
+ * c holds clen bytes: the ciphertext followed by a CRYPTO_ABYTES-byte tag.
+ * Writes clen - CRYPTO_ABYTES plaintext bytes to m and stores that length
+ * in *mlen. nsec is not written by the implementation in encrypt.c.
+ * Returns 0 when the tag verifies and -1 when clen is shorter than a tag
+ * or the tag does not match.
+ */
+int crypto_aead_decrypt(
+	unsigned char *m, unsigned long long *mlen,
+	unsigned char *nsec,
+	const unsigned char *c, unsigned long long clen,
+	const unsigned char *ad, unsigned long long adlen,
+	const unsigned char *npub,
+	const unsigned char *k
+);
--- a/knot/Implementations/crypto_aead/knot256/armcortexm_2/encrypt.c
+++ b/knot/Implementations/crypto_aead/knot256/armcortexm_2/encrypt.c
+
+#include"auxFormat.h"
+
+/* Bytes absorbed/produced per permutation call (one 128-bit block). */
+#define aead_RATE (128 / 8)
+/* Rounds for the initialization permutation (after loading nonce and key). */
+#define PR0_ROUNDS 100
+/* Rounds for the permutation run after each associated-data or message block. */
+#define PR_ROUNDS 52
+/* Rounds for the finalization permutation that precedes tag extraction. */
+#define PRF_ROUNDS 56
+/*
+ * Round-constant table, one byte per round. permutation512 reads it
+ * sequentially from element 0, so entry r is the constant for round r of
+ * every permutation call. 127 entries are provided; the longest call in
+ * this file uses PR0_ROUNDS (100) of them. Each byte is split into four
+ * 2-bit fields that are XORed into four state words.
+ * NOTE(review): the values look like a 7-bit sequence re-encoded for the
+ * packed state layout -- confirm against the specification.
+ */
+unsigned char  constant7Format_aead[127] = { 
+	/*constant7_aead_256*/
+0x1,
+0x4,
+0x10,
+0x40,
+0x2,
+0x8,
+0x21,
+0x5,
+0x14,
+0x50,
+0x42,
+0xa,
+0x29,
+0x24,
+0x11,
+0x44,
+0x12,
+0x48,
+0x23,
+0xd,
+0x35,
+0x55,
+0x56,
+0x5a,
+0x6b,
+0x2e,
+0x38,
+0x60,
+0x3,
+0xc,
+0x31,
+0x45,
+0x16,
+0x58,
+0x63,
+0xf,
+0x3d,
+0x74,
+0x53,
+0x4e,
+0x3b,
+0x6c,
+0x32,
+0x49,
+0x27,
+0x1d,
+0x75,
+0x57,
+0x5e,
+0x7b,
+0x6e,
+0x3a,
+0x68,
+0x22,
+0x9,
+0x25,
+0x15,
+0x54,
+0x52,
+0x4a,
+0x2b,
+0x2c,
+0x30,
+0x41,
+0x6,
+0x18,
+0x61,
+0x7,
+0x1c,
+0x71,
+0x47,
+0x1e,
+0x79,
+0x66,
+0x1b,
+0x6d,
+0x36,
+0x59,
+0x67,
+0x1f,
+0x7d,
+0x76,
+0x5b,
+0x6f,
+0x3e,
+0x78,
+0x62,
+0xb,
+0x2d,
+0x34,
+0x51,
+0x46,
+0x1a,
+0x69,
+0x26,
+0x19,
+0x65,
+0x17,
+0x5c,
+0x73,
+0x4f,
+0x3f,
+0x7c,
+0x72,
+0x4b,
+0x2f,
+0x3c,
+0x70,
+0x43,
+0xe,
+0x39,
+0x64,
+0x13,
+0x4c,
+0x33,
+0x4d,
+0x37,
+0x5d,
+0x77,
+0x5f,
+0x7f,
+0x7e,
+0x7a,
+0x6a,
+0x2a,
+0x28,
+0x20, 
+};
+
+
+
+
+/* State
+ * w12 w8  w4  w0
+ * w13 w9  w5  w1
+ * w14 w10 w6  w2
+ * w15 w11 w7  w3
+ *
+ * Mapping to memory, as loaded below: in[0..3] -> w0, w4, w8, w12 (row 0),
+ * in[4..7] -> w1, w5, w9, w13 (row 1), in[8..11] -> w2, w6, w10, w14
+ * (row 2), in[12..15] -> w3, w7, w11, w15 (row 3).
+ */
+/*
+ * Applies `rounds` rounds of the 512-bit permutation to the 16-word state
+ * at `in`, in place, taking one round-constant byte per round from `rc`
+ * (rc is advanced by one each round).
+ *
+ * Each round: XOR the four 2-bit fields of the constant byte into
+ * w0/w4/w8/w12, apply the S-box to the four columns, then rotate rows 1-3.
+ * The row step matches the BIT_LOTR32_1 / _16 / _25 macros in auxFormat.h
+ * (ROR #31, #28, #26, #25 are left rotations by 1, 4, 6 and 7).
+ *
+ * The loop decrements `rounds` and tests it at the bottom, so `rounds`
+ * must be at least 1.
+ *
+ * NOTE(review): the asm text refers to C locals and parameters by name and
+ * has no operand or clobber lists, and `enc_loop` is a plain (non-local)
+ * label -- confirm which toolchain this form is intended for.
+ */
+ static void permutation512(unsigned int *in, int rounds, unsigned char *rc) {
+	uint32_t w0, w1, w2, w3, w4, w5, w6, w7;
+	uint32_t w8, w9, w10, w11, w12, w13, w14, w15;
+	uint32_t s0, s1, s2,s3;
+	uint32_t i=0;
+	__asm volatile(
+		/* load the 16 state words */
+		"ldr     w0,     [in]          \n\t"
+		"ldr     w4,     [in, #4]      \n\t"
+		"ldr     w8,     [in, #8]      \n\t"
+		"ldr     w12,    [in, #12]     \n\t"
+		"ldr     w1,     [in, #16]     \n\t"
+		"ldr     w5,     [in, #20]     \n\t"
+		"ldr     w9,     [in, #24]     \n\t"
+		"ldr     w13,    [in, #28]     \n\t"
+		"ldr     w2,     [in, #32]     \n\t"
+		"ldr     w6,     [in, #36]     \n\t"
+		"ldr     w10,    [in, #40]     \n\t"
+		"ldr     w14,    [in, #44]     \n\t"
+		"ldr     w3,     [in, #48]     \n\t"
+		"ldr     w7,     [in, #52]     \n\t"
+		"ldr     w11,    [in, #56]     \n\t"
+		"ldr     w15,    [in, #60]     \n\t"
+	"enc_loop:                       \n\t"
+    "/*add round const   s0 s1 s2 s3*/           \n\t"
+		/* s0..s3 = bits 7..6, 5..4, 3..2, 1..0 of the constant byte */
+		"ldrb    s3,     [rc]          \n\t"	
+
+		"LSR     s0,     s3, #6       \n\t"	
+		"and    s0,     s0, 0x3        \n\t"
+
+		"LSR     s1,     s3, #4       \n\t"	
+		"and    s1,     s1, 0x3        \n\t"
+
+		"LSR     s2,     s3, #2       \n\t"	
+		"and    s2,     s2, 0x3        \n\t"
+
+		"and    s3,     s3, 0x3        \n\t"
+
+	  "eors    w12,     w12, s0        \n\t"
+	  "eors    w8,     w8, s1        \n\t"
+	  "eors    w4,     w4, s2        \n\t"
+	  "eors    w0,     w0, s3        \n\t"
+  "/*sbox first column*/         \n\t"
+		/* s0 and s1 are reused as S-box temporaries from here on */
+		"mvns    w0,     w0            \n\t"
+		"ands    s0,     w1, w0        \n\t"
+		"eors    s0,     w2, s0        \n\t"
+		"orrs    w2,     w1, w2        \n\t"
+		"eors    w0,     w3, w0        \n\t"
+		"eors    w2,     w2, w0        \n\t"
+		"eors    s1,     w1, w3        \n\t"
+		"eors    w3,     w3, s0        \n\t"
+		"ands    w0,     s0, w0        \n\t"
+		"eors    w0,     s1, w0        \n\t"
+		"ands    w1,     w2, s1        \n\t"
+		"eors    w1,     s0, w1        \n\t"
+		"/*sbox second column*/        \n\t"
+		"mvns    w4,     w4            \n\t"
+		"ands    s0,     w5, w4        \n\t"
+		"eors    s0,     w6, s0        \n\t"
+		"orrs    w6,     w5, w6        \n\t"
+		"eors    w4,     w7, w4        \n\t"
+		"eors    w6,     w6, w4        \n\t"
+		"eors    s1,     w5, w7        \n\t"
+		"eors    w7,     w7, s0        \n\t"
+		"ands    w4,     s0, w4        \n\t"
+		"eors    w4,     s1, w4        \n\t"
+		"ands    w5,     w6, s1        \n\t"
+		"eors    w5,     s0, w5        \n\t"
+		"/*sbox third column*/         \n\t"
+		"mvns    w8,     w8            \n\t"
+		"ands    s0,     w9,  w8       \n\t"
+		"eors    s0,     w10, s0       \n\t"
+		"orrs    w10,    w9,  w10      \n\t"
+		"eors    w8,     w11, w8       \n\t"
+		"eors    w10,    w10, w8       \n\t"
+		"eors    s1,     w9,  w11      \n\t"
+		"eors    w11,    w11, s0       \n\t"
+		"ands    w8,     s0,  w8       \n\t"
+		"eors    w8,     s1,  w8       \n\t"
+		"ands    w9,     w10, s1       \n\t"
+		"eors    w9,     s0,  w9       \n\t"
+		"/*sbox forth column*/         \n\t"
+		"mvns    w12,    w12           \n\t"
+		"ands    s0,     w13, w12      \n\t"
+		"eors    s0,     w14, s0       \n\t"
+		"orrs    w14,    w13, w14      \n\t"
+		"eors    w12,    w15, w12      \n\t"
+		"eors    w14,    w14, w12      \n\t"
+		"eors    s1,     w13, w15      \n\t"
+		"eors    w15,    w15, s0       \n\t"
+		"ands    w12,    s0,  w12      \n\t"
+		"eors    w12,    s1,  w12      \n\t"
+		"ands    w13,    w14, s1       \n\t"
+		"eors    w13,    s0,  w13      \n\t"
+    "/*rotate shift left 1 bit  [ w13 w9 w5 w1->  w9 w5 w1 (w13,1)] */   \n\t"
+		/* row 1: words move up one position; old w13 wraps to w1 rotated left by 1 */
+		"mov    s0,     w13       \n\t"
+		"mov    w13,    w9        \n\t"
+		"mov    w9,     w5       \n\t"
+		"mov    w5,     w1       \n\t"
+		"ROR    w1,     s0 , #31      \n\t"
+    "/*rotate shift left 8 bits [w14 w10 w6 w2->w14,4)   w10,4)  (w6,4)  ( w2,4)]*/  \n\t"
+		/* row 2: every word rotated left by 4 in place (ROR #28) */
+		"ROR    w14,    w14 , #28      \n\t"
+		"ROR    w10,    w10 , #28      \n\t"
+		"ROR    w6,     w6  , #28      \n\t"
+		"ROR    w2,     w2  , #28        \n\t"
+    "/*rotate shift left 25 bit  [w15 w11 w7 w3-> w11,13)  (w7,14)  ( w3,14)  ( w15,14)] */   \n\t"
+		/* row 3: words move up one position rotated left by 6; old w15 wraps to w3 rotated left by 7 */
+		"mov    s0,     w15       \n\t"
+		"ROR    w15,    w11  , #26      \n\t"
+		"ROR    w11,    w7   , #26      \n\t"
+		"ROR    w7 ,    w3   , #26      \n\t"
+		"ROR    w3 ,    s0   , #25      \n\t"
+		"/*loop control*/              \n\t"
+ 		"adds    rc,     rc,  #1       \n\t"
+		"subs    rounds, rounds, #1    \n\t"
+		"bne     enc_loop              \n\t"
+		/* write the 16 state words back */
+		"str     w0,     [in]          \n\t"
+		"str     w4,     [in, #4]      \n\t"
+		"str     w8,     [in, #8]      \n\t"
+		"str     w12,    [in, #12]     \n\t"
+		"str     w1,     [in, #16]     \n\t"
+		"str     w5,     [in, #20]     \n\t"
+		"str     w9,     [in, #24]     \n\t"
+		"str     w13,    [in, #28]     \n\t"
+		"str     w2,     [in, #32]     \n\t"
+		"str     w6,     [in, #36]     \n\t"
+		"str     w10,    [in, #40]     \n\t"
+		"str     w14,    [in, #44]     \n\t"
+		"str     w3,     [in, #48]     \n\t"
+		"str     w7,     [in, #52]     \n\t"
+		"str     w11,    [in, #56]     \n\t"
+		"str     w15,    [in, #60]     \n\t"
+	);
+}
+
+
+/*
+ * Authenticated encryption.
+ *
+ * Writes mlen ciphertext bytes followed by a CRYPTO_ABYTES-byte tag to c
+ * and stores mlen + CRYPTO_ABYTES in *clen. The 32-byte nonce (npub) and
+ * 32-byte key (k) seed the state; ad/adlen is absorbed before the message.
+ * nsec is not used. Always returns 0.
+ */
+int crypto_aead_encrypt(
+	unsigned char *c, unsigned long long *clen,
+	const unsigned char *m, unsigned long long mlen,
+	const unsigned char *ad, unsigned long long adlen,
+	const unsigned char *nsec,
+	const unsigned char *npub,
+	const unsigned char *k
+) {
+	/* t1, t2, t3, t8, t9 and dataFormat are referenced by name inside
+	 * the pack/unpack macros and must keep these exact names. */
+	u32 t1, t2, t3, t8, t9;
+	u32 dataFormat[4] = { 0 };
+	u32 state[16] = { 0 };
+	u8 lastBlock[16] = { 0 };
+	u32 word;
+
+	(void)nsec;
+
+	*clen = mlen + CRYPTO_ABYTES;
+
+	/* load nonce || key into the state and run the initial permutation */
+	packU128FormatToFourPacket(state, npub);
+	packU128FormatToFourPacket((state + 4), (npub + 16));
+	packU128FormatToFourPacket((state + 8), k);
+	packU128FormatToFourPacket((state + 12), (k + 16));
+	permutation512(state, PR0_ROUNDS, constant7Format_aead);
+
+	/* absorb the associated data */
+	if (adlen != 0) {
+		for (; adlen >= aead_RATE; adlen -= aead_RATE, ad += aead_RATE) {
+			packU128FormatToFourPacket(dataFormat, ad);
+			for (word = 0; word < 4; word++) {
+				state[word] ^= dataFormat[word];
+			}
+			permutation512(state, PR_ROUNDS, constant7Format_aead);
+		}
+		/* final block: remaining bytes, a 0x01 marker, zero fill */
+		memset(lastBlock, 0, sizeof(lastBlock));
+		memcpy(lastBlock, ad, adlen * sizeof(unsigned char));
+		lastBlock[adlen] = 0x01;
+		packU128FormatToFourPacket(dataFormat, lastBlock);
+		for (word = 0; word < 4; word++) {
+			state[word] ^= dataFormat[word];
+		}
+		permutation512(state, PR_ROUNDS, constant7Format_aead);
+	}
+
+	/* separate the associated-data phase from the message phase */
+	state[15] ^= 0x80000000;
+
+	/* encrypt the message */
+	if (mlen != 0) {
+		for (; mlen >= aead_RATE;
+		     mlen -= aead_RATE, m += aead_RATE, c += aead_RATE) {
+			packU128FormatToFourPacket(dataFormat, m);
+			for (word = 0; word < 4; word++) {
+				state[word] ^= dataFormat[word];
+			}
+			/* the updated rate words are the ciphertext block */
+			unpackU128FormatToFourPacket(c, state);
+			permutation512(state, PR_ROUNDS, constant7Format_aead);
+		}
+		/* final block: padded like the associated data; only the
+		 * first mlen ciphertext bytes are emitted */
+		memset(lastBlock, 0, sizeof(lastBlock));
+		memcpy(lastBlock, m, mlen * sizeof(unsigned char));
+		lastBlock[mlen] = 0x01;
+		packU128FormatToFourPacket(dataFormat, lastBlock);
+		for (word = 0; word < 4; word++) {
+			state[word] ^= dataFormat[word];
+		}
+		unpackU128FormatToFourPacket(lastBlock, state);
+		memcpy(c, lastBlock, mlen * sizeof(unsigned char));
+		c += mlen;
+	}
+
+	/* finalization, then emit the first eight state words as the tag */
+	permutation512(state, PRF_ROUNDS, constant7Format_aead);
+	unpackU128FormatToFourPacket(c, state);
+	unpackU128FormatToFourPacket((c + 16), (state + 4));
+	return 0;
+}
+
+/*
+ * Authenticated decryption.
+ *
+ * c holds clen bytes: the ciphertext followed by a CRYPTO_ABYTES-byte tag.
+ * Writes clen - CRYPTO_ABYTES plaintext bytes to m and stores that length
+ * in *mlen. ad/adlen is the associated data, npub the 32-byte nonce and k
+ * the 32-byte key; nsec is not used.
+ *
+ * Returns 0 when the tag verifies. Returns -1 when clen is shorter than a
+ * tag (nothing is written) or when the tag does not match, in which case
+ * the plaintext buffer is zeroed so unauthenticated data is not released.
+ */
+int crypto_aead_decrypt(
+	unsigned char *m, unsigned long long *mlen,
+	unsigned char *nsec,
+	const unsigned char *c, unsigned long long clen,
+	const unsigned char *ad, unsigned long long adlen,
+	const unsigned char *npub,
+	const unsigned char *k
+){
+	/* t1, t2, t3, t8, t9 and dataFormat are referenced by name inside
+	 * the pack/unpack macros and must keep these exact names. */
+	u32 t1, t2, t3, t8, t9;
+	u8 i;
+	u32 s[16] = { 0 };
+	u32 dataFormat[4] = { 0 };
+	u32 dataFormat_1[4] = { 0 };
+	u32 dataFormat_2[4] = { 0 };
+	u8 tempData[16] = { 0 };
+	u8 tempU8[64] = { 0 };
+	unsigned char *m_start = m;	/* m is advanced below; kept for the wipe */
+	unsigned long long msg_len;
+	u32 diff;
+
+	(void)nsec;
+
+	if (clen < CRYPTO_ABYTES)
+		return -1;
+	msg_len = clen - CRYPTO_ABYTES;
+	*mlen = msg_len;
+
+	/* load nonce || key into the state and run the initial permutation */
+	packU128FormatToFourPacket(s, npub);
+	packU128FormatToFourPacket((s + 4), (npub + 16));
+	packU128FormatToFourPacket((s + 8), k);
+	packU128FormatToFourPacket((s + 12), (k + 16));
+	permutation512(s,PR0_ROUNDS,constant7Format_aead);
+
+	/* absorb the associated data */
+	if (adlen) {
+		while (adlen >= aead_RATE) {
+			packU128FormatToFourPacket(dataFormat, ad);
+			s[0] ^= dataFormat[0];
+			s[1] ^= dataFormat[1];
+			s[2] ^= dataFormat[2];
+			s[3] ^= dataFormat[3];
+			permutation512(s,PR_ROUNDS,constant7Format_aead);
+			adlen -= aead_RATE;
+			ad += aead_RATE;
+		}
+		/* final block: remaining bytes, a 0x01 marker, zero fill */
+		memset(tempData, 0, sizeof(tempData));
+		memcpy(tempData, ad, adlen * sizeof(unsigned char));
+		tempData[adlen] = 0x01;
+		packU128FormatToFourPacket(dataFormat, tempData);
+		s[0] ^= dataFormat[0];
+		s[1] ^= dataFormat[1];
+		s[2] ^= dataFormat[2];
+		s[3] ^= dataFormat[3];
+		permutation512(s,PR_ROUNDS,constant7Format_aead);
+	}
+
+	/* separate the associated-data phase from the message phase */
+	s[15] ^= 0x80000000;
+
+	/* From here on clen counts ciphertext bytes only. The tag length
+	 * (CRYPTO_ABYTES) is what must be removed, not the key length. */
+	clen = msg_len;
+
+	if (clen) {
+		while (clen >= aead_RATE) {
+			packU128FormatToFourPacket(dataFormat_2, c);
+			dataFormat_1[0] = s[0] ^ dataFormat_2[0];
+			dataFormat_1[1] = s[1] ^ dataFormat_2[1];
+			dataFormat_1[2] = s[2] ^ dataFormat_2[2];
+			dataFormat_1[3] = s[3] ^ dataFormat_2[3];
+			unpackU128FormatToFourPacket(m, dataFormat_1);
+			/* the ciphertext block replaces the rate words */
+			s[0] = dataFormat_2[0];
+			s[1] = dataFormat_2[1];
+			s[2] = dataFormat_2[2];
+			s[3] = dataFormat_2[3];
+			permutation512(s,PR_ROUNDS,constant7Format_aead);
+			clen -= aead_RATE;
+			m += aead_RATE;
+			c += aead_RATE;
+		}
+		/* final partial block (clen < 16 here, so u8 i cannot overflow):
+		 * work on the byte form of the rate, then re-pack it */
+		unpackU128FormatToFourPacket(tempU8, s);
+		for (i = 0; i < clen; ++i, ++m, ++c)
+		{
+			*m = tempU8[i] ^ *c;
+			tempU8[i] = *c;
+		}
+		tempU8[i] ^= 0x01;
+		packU128FormatToFourPacket(s, tempU8);
+	}
+
+	/* finalization */
+	permutation512(s,PRF_ROUNDS,constant7Format_aead);
+
+	/* c now points at the received tag. Compare it in packed form,
+	 * accumulating every difference before deciding so the running time
+	 * does not depend on which word first differs. */
+	packU128FormatToFourPacket(dataFormat, c);
+	packU128FormatToFourPacket(dataFormat_1, (c + 16));
+	diff  = (dataFormat[0] ^ s[0]) | (dataFormat[1] ^ s[1]);
+	diff |= (dataFormat[2] ^ s[2]) | (dataFormat[3] ^ s[3]);
+	diff |= (dataFormat_1[0] ^ s[4]) | (dataFormat_1[1] ^ s[5]);
+	diff |= (dataFormat_1[2] ^ s[6]) | (dataFormat_1[3] ^ s[7]);
+	if (diff != 0) {
+		/* never hand unauthenticated plaintext back to the caller */
+		memset(m_start, 0, msg_len);
+		return -1;
+	}
+	return 0;
+
+}
\ No newline at end of file