drygascon add_arm_cortex-m

b9419420 · Sebastien Riou · Enrico Pozzobon · 3f904077 · b9419420 · b9419420
Commit b9419420 authored May 31, 2020 by Sebastien Riou Committed by Enrico Pozzobon May 31, 2020
19 changed files
--- a/drygascon/Implementations/crypto_aead/drygascon128/add_arm_cortex-m/aead-common.c
+++ b/drygascon/Implementations/crypto_aead/drygascon128/add_arm_cortex-m/aead-common.c
+/*
+ * Copyright (C) 2020 Southern Storm Software, Pty Ltd.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ */
+
+#include "aead-common.h"
+
+/*
+ * Compares two tags of "size" bytes without an early exit and wipes the
+ * plaintext buffer when they differ.
+ *
+ * Every tag byte is always visited; the differences are OR-ed into "accum",
+ * which is then turned into an all-ones (match) or all-zeroes (mismatch)
+ * mask.  The mask is AND-ed into each of the "plaintext_len" bytes at
+ * "plaintext", so a mismatch zeroes the buffer and a match leaves it intact.
+ *
+ * Returns 0 when the tags are equal and -1 when they differ.
+ *
+ * NOTE(review): the mask computation relies on ">>" of a negative int being
+ * an arithmetic shift, which C leaves implementation-defined.
+ */
+int aead_check_tag
+    (unsigned char *plaintext, unsigned long long plaintext_len,
+     const unsigned char *tag1, const unsigned char *tag2,
+     unsigned size)
+{
+    /* Set "accum" to -1 if the tags match, or 0 if they don't match */
+    int accum = 0;
+    while (size > 0) {
+        accum |= (*tag1++ ^ *tag2++);
+        --size;
+    }
+    /* accum is in 0..255 here: 0 - 1 shifts down to -1, anything else to 0 */
+    accum = (accum - 1) >> 8;
+
+    /* Destroy the plaintext if the tag match failed */
+    while (plaintext_len > 0) {
+        *plaintext++ &= accum;
+        --plaintext_len;
+    }
+
+    /* If "accum" is 0, return -1, otherwise return 0 */
+    return ~accum;
+}
+
+/*
+ * Same as aead_check_tag(), but the comparison mask is additionally AND-ed
+ * with "precheck" so that the outcome of an earlier check is folded in.
+ *
+ * "precheck" is expected to be -1 (earlier check passed) or 0 (earlier check
+ * failed); with 0 the plaintext is wiped and -1 is returned even when the
+ * two tags are equal.
+ *
+ * Returns 0 when the tags are equal and precheck is -1, otherwise -1.
+ *
+ * NOTE(review): the mask computation relies on ">>" of a negative int being
+ * an arithmetic shift, which C leaves implementation-defined.
+ */
+int aead_check_tag_precheck
+    (unsigned char *plaintext, unsigned long long plaintext_len,
+     const unsigned char *tag1, const unsigned char *tag2,
+     unsigned size, int precheck)
+{
+    /* Set "accum" to -1 if the tags match, or 0 if they don't match */
+    int accum = 0;
+    while (size > 0) {
+        accum |= (*tag1++ ^ *tag2++);
+        --size;
+    }
+    /* Combine this comparison with the caller-supplied earlier result */
+    accum = ((accum - 1) >> 8) & precheck;
+
+    /* Destroy the plaintext if the tag match failed */
+    while (plaintext_len > 0) {
+        *plaintext++ &= accum;
+        --plaintext_len;
+    }
+
+    /* If "accum" is 0, return -1, otherwise return 0 */
+    return ~accum;
+}
--- a/drygascon/Implementations/crypto_aead/drygascon128/add_arm_cortex-m/aead-common.h
+++ b/drygascon/Implementations/crypto_aead/drygascon128/add_arm_cortex-m/aead-common.h
+/*
+ * Copyright (C) 2020 Southern Storm Software, Pty Ltd.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ */
+
+#ifndef LWCRYPTO_AEAD_COMMON_H
+#define LWCRYPTO_AEAD_COMMON_H
+
+#include <stddef.h>
+
+/**
+ * \file aead-common.h
+ * \brief Definitions that are common across AEAD schemes.
+ *
+ * AEAD stands for "Authenticated Encryption with Associated Data".
+ * It is a standard API pattern for securely encrypting and
+ * authenticating packets of data.
+ */
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/**
+ * \brief Encrypts and authenticates a packet with an AEAD scheme.
+ *
+ * \param c Buffer to receive the output.
+ * \param clen On exit, set to the length of the output which includes
+ * the ciphertext and the authentication tag.
+ * \param m Buffer that contains the plaintext message to encrypt.
+ * \param mlen Length of the plaintext message in bytes.
+ * \param ad Buffer that contains associated data to authenticate
+ * along with the packet but which does not need to be encrypted.
+ * \param adlen Length of the associated data in bytes.
+ * \param nsec Secret nonce - normally not used by AEAD schemes.
+ * \param npub Points to the public nonce for the packet.
+ * \param k Points to the key to use to encrypt the packet.
+ *
+ * \return 0 on success, or a negative value if there was an error in
+ * the parameters.
+ */
+typedef int (*aead_cipher_encrypt_t)
+    (unsigned char *c, unsigned long long *clen,
+     const unsigned char *m, unsigned long long mlen,
+     const unsigned char *ad, unsigned long long adlen,
+     const unsigned char *nsec,
+     const unsigned char *npub,
+     const unsigned char *k);
+
+/**
+ * \brief Decrypts and authenticates a packet with an AEAD scheme.
+ *
+ * \param m Buffer to receive the plaintext message on output.
+ * \param mlen Receives the length of the plaintext message on output.
+ * \param nsec Secret nonce - normally not used by AEAD schemes.
+ * \param c Buffer that contains the ciphertext and authentication
+ * tag to decrypt.
+ * \param clen Length of the input data in bytes, which includes the
+ * ciphertext and the authentication tag.
+ * \param ad Buffer that contains associated data to authenticate
+ * along with the packet but which does not need to be encrypted.
+ * \param adlen Length of the associated data in bytes.
+ * \param npub Points to the public nonce for the packet.
+ * \param k Points to the key to use to decrypt the packet.
+ *
+ * \return 0 on success, -1 if the authentication tag was incorrect,
+ * or some other negative number if there was an error in the parameters.
+ */
+typedef int (*aead_cipher_decrypt_t)
+    (unsigned char *m, unsigned long long *mlen,
+     unsigned char *nsec,
+     const unsigned char *c, unsigned long long clen,
+     const unsigned char *ad, unsigned long long adlen,
+     const unsigned char *npub,
+     const unsigned char *k);
+
+/**
+ * \brief Hashes a block of input data.
+ *
+ * \param out Buffer to receive the hash output.
+ * \param in Points to the input data to be hashed.
+ * \param inlen Length of the input data in bytes.
+ *
+ * \return Returns zero on success or -1 if there was an error in the
+ * parameters.
+ */
+typedef int (*aead_hash_t)
+    (unsigned char *out, const unsigned char *in, unsigned long long inlen);
+
+/**
+ * \brief Initializes the state for a hashing operation.
+ *
+ * \param state Hash state to be initialized.
+ */
+typedef void (*aead_hash_init_t)(void *state);
+
+/**
+ * \brief Updates a hash state with more input data.
+ *
+ * \param state Hash state to be updated.
+ * \param in Points to the input data to be incorporated into the state.
+ * \param inlen Length of the input data to be incorporated into the state.
+ */
+typedef void (*aead_hash_update_t)
+    (void *state, const unsigned char *in, unsigned long long inlen);
+
+/**
+ * \brief Returns the final hash value from a hashing operation.
+ *
+ * \param state Hash state to be finalized.
+ * \param out Points to the output buffer to receive the hash value.
+ */
+typedef void (*aead_hash_finalize_t)(void *state, unsigned char *out);
+
+/**
+ * \brief Absorbs more input data into an XOF state.
+ *
+ * \param state XOF state to be updated.
+ * \param in Points to the input data to be absorbed into the state.
+ * \param inlen Length of the input data to be absorbed into the state.
+ *
+ * \sa ascon_xof_init(), ascon_xof_squeeze()
+ *
+ * NOTE(review): the cross-references above name ASCON functions even though
+ * this typedef is scheme-independent - confirm they are still intended.
+ */
+typedef void (*aead_xof_absorb_t)
+    (void *state, const unsigned char *in, unsigned long long inlen);
+
+/**
+ * \brief Squeezes output data from an XOF state.
+ *
+ * \param state XOF state to squeeze the output data from.
+ * \param out Points to the output buffer to receive the squeezed data.
+ * \param outlen Number of bytes of data to squeeze out of the state.
+ */
+typedef void (*aead_xof_squeeze_t)
+    (void *state, unsigned char *out, unsigned long long outlen);
+
+/**
+ * \brief No special AEAD features.
+ */
+#define AEAD_FLAG_NONE          0x0000
+
+/**
+ * \brief The natural byte order of the AEAD cipher is little-endian.
+ *
+ * If this flag is not present, then the natural byte order of the
+ * AEAD cipher should be assumed to be big-endian.
+ *
+ * The natural byte order may be useful when formatting packet sequence
+ * numbers as nonces.  The application needs to know whether the sequence
+ * number should be packed into the leading or trailing bytes of the nonce.
+ */
+#define AEAD_FLAG_LITTLE_ENDIAN 0x0001
+
+/**
+ * \brief Meta-information about an AEAD cipher.
+ *
+ * The \a expected member may be left unset (null) by a descriptor that has
+ * no benchmark reference value.
+ */
+typedef struct
+{
+    const char *name;               /**< Name of the cipher */
+    unsigned key_len;               /**< Length of the key in bytes */
+    unsigned nonce_len;             /**< Length of the nonce in bytes */
+    unsigned tag_len;               /**< Length of the tag in bytes */
+    unsigned flags;                 /**< Flags for extra features */
+    aead_cipher_encrypt_t encrypt;  /**< AEAD encryption function */
+    aead_cipher_decrypt_t decrypt;  /**< AEAD decryption function */
+    unsigned char *expected;        /**< AEAD encryption benchmark expected result */
+} aead_cipher_t;
+
+/**
+ * \brief Meta-information about a hash algorithm that is related to an AEAD.
+ *
+ * Regular hash algorithms should provide the "hash", "init", "update",
+ * and "finalize" functions.  Extensible Output Functions (XOF's) should
+ * provide the "hash", "init", "absorb", and "squeeze" functions.
+ *
+ * Function pointers that an algorithm does not implement are set to null.
+ */
+typedef struct
+{
+    const char *name;           /**< Name of the hash algorithm */
+    size_t state_size;          /**< Size of the incremental state structure */
+    unsigned hash_len;          /**< Length of the hash in bytes */
+    unsigned flags;             /**< Flags for extra features */
+    aead_hash_t hash;           /**< All in one hashing function */
+    aead_hash_init_t init;      /**< Incremental hash/XOF init function */
+    aead_hash_update_t update;  /**< Incremental hash update function */
+    aead_hash_finalize_t finalize; /**< Incremental hash finalize function */
+    aead_xof_absorb_t absorb;   /**< Incremental XOF absorb function */
+    aead_xof_squeeze_t squeeze; /**< Incremental XOF squeeze function */
+
+} aead_hash_algorithm_t;
+
+/**
+ * \brief Check an authentication tag in constant time.
+ *
+ * \param plaintext Points to the plaintext data.  May be null when
+ * \a plaintext_len is zero, in which case only the tags are compared.
+ * \param plaintext_len Length of the plaintext in bytes.
+ * \param tag1 First tag to compare.
+ * \param tag2 Second tag to compare.
+ * \param tag_len Length of the tags in bytes.
+ *
+ * \return Returns -1 if the tag check failed or 0 if the check succeeded.
+ *
+ * If the tag check fails, then the \a plaintext will also be zeroed to
+ * prevent it from being used accidentally by the application when the
+ * ciphertext was invalid.
+ */
+int aead_check_tag
+    (unsigned char *plaintext, unsigned long long plaintext_len,
+     const unsigned char *tag1, const unsigned char *tag2,
+     unsigned tag_len);
+
+/**
+ * \brief Check an authentication tag in constant time with a previous check.
+ *
+ * \param plaintext Points to the plaintext data.
+ * \param plaintext_len Length of the plaintext in bytes.
+ * \param tag1 First tag to compare.
+ * \param tag2 Second tag to compare.
+ * \param tag_len Length of the tags in bytes.
+ * \param precheck Set to -1 if previous check succeeded or 0 if it failed.
+ * Because aead_check_tag() returns 0 on success and -1 on failure, its
+ * result must be bitwise-inverted before being passed here.
+ *
+ * \return Returns -1 if the tag check failed or 0 if the check succeeded.
+ * A \a precheck of 0 always yields -1.
+ *
+ * If the tag check fails, then the \a plaintext will also be zeroed to
+ * prevent it from being used accidentally by the application when the
+ * ciphertext was invalid.
+ *
+ * This version can be used to incorporate other information about the
+ * correctness of the plaintext into the final result.
+ */
+int aead_check_tag_precheck
+    (unsigned char *plaintext, unsigned long long plaintext_len,
+     const unsigned char *tag1, const unsigned char *tag2,
+     unsigned tag_len, int precheck);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
--- a/drygascon/Implementations/crypto_aead/drygascon128/add_arm_cortex-m/api.h
+++ b/drygascon/Implementations/crypto_aead/drygascon128/add_arm_cortex-m/api.h
+/*
+ * Size parameters for this implementation directory.
+ * NOTE(review): presumably consumed by the crypto_aead build/benchmark
+ * harness and expressed in bytes - confirm against the harness docs.
+ */
+#define CRYPTO_KEYBYTES 16
+#define CRYPTO_NSECBYTES 0
+#define CRYPTO_NPUBBYTES 16
+#define CRYPTO_ABYTES 16
+/* NOTE(review): presumably declares that input and output buffers must not
+ * overlap - confirm. */
+#define CRYPTO_NOOVERLAP 1
--- a/drygascon/Implementations/crypto_aead/drygascon128/add_arm_cortex-m/drygascon.c
+++ b/drygascon/Implementations/crypto_aead/drygascon128/add_arm_cortex-m/drygascon.c
+/*
+ * Copyright (C) 2020 Southern Storm Software, Pty Ltd.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ */
+
+#include "drygascon.h"
+#include "internal-drysponge.h"
+#include <string.h>
+
+/* Benchmark reference value referenced by the "expected" member of the
+ * k32 cipher descriptors below. */
+uint8_t drygascon128k32_expected[DRYGASCON128_TAG_SIZE]={0x66,0x5A,0xDE,0x6C,0x0F,0xBD,0x48,0x8C,0x5E,0xA4,0x77,0x5D,0xD6,0x24,0xDA,0xD7};
+
+/* Benchmark reference value referenced by drygascon128k56_cipher. */
+uint8_t drygascon128k56_expected[DRYGASCON128_TAG_SIZE]={0x7B,0x8B,0x9D,0x58,0xA7,0xF7,0x5F,0x1E,0x56,0x99,0x46,0xD6,0x24,0xC4,0xF7,0x68};
+
+/* Benchmark reference value referenced by drygascon128k16_cipher. */
+uint8_t drygascon128k16_expected[DRYGASCON128_TAG_SIZE]={0x14,0xA5,0x21,0x17,0xFF,0x52,0x4F,0x7C,0xCB,0xB3,0xEB,0xE4,0x05,0xEF,0x18,0xA4};
+
+/**
+ * \brief Cipher descriptor for the DryGASCON128 "k32" key-size variant.
+ *
+ * Binds the k32 encrypt/decrypt entry points together with the key, nonce
+ * and tag sizes and the benchmark reference value.  The object is declared
+ * with a single const qualifier, matching drygascon256_cipher; the previous
+ * "const ... const" spelling was redundant and draws a duplicate-qualifier
+ * warning from compilers.
+ */
+aead_cipher_t const drygascon128k32_cipher = {
+    "DryGASCON128k32",
+    DRYGASCON128_FASTKEY_SIZE,
+    DRYGASCON128_NONCE_SIZE,
+    DRYGASCON128_TAG_SIZE,
+    AEAD_FLAG_LITTLE_ENDIAN,
+    drygascon128k32_aead_encrypt,
+    drygascon128k32_aead_decrypt,
+    drygascon128k32_expected
+};
+
+/**
+ * \brief Default DryGASCON128 cipher descriptor.
+ *
+ * Carries exactly the same name string, sizes, functions and reference
+ * value as drygascon128k32_cipher.  Declared with a single const qualifier;
+ * the previous "const ... const" spelling was redundant and draws a
+ * duplicate-qualifier warning from compilers.
+ */
+aead_cipher_t const drygascon128_cipher = {
+    "DryGASCON128k32",
+    DRYGASCON128_FASTKEY_SIZE,
+    DRYGASCON128_NONCE_SIZE,
+    DRYGASCON128_TAG_SIZE,
+    AEAD_FLAG_LITTLE_ENDIAN,
+    drygascon128k32_aead_encrypt,
+    drygascon128k32_aead_decrypt,
+    drygascon128k32_expected
+};
+
+/**
+ * \brief Cipher descriptor for the DryGASCON128 "k56" key-size variant.
+ *
+ * Binds the k56 encrypt/decrypt entry points together with the key, nonce
+ * and tag sizes and the benchmark reference value.  Declared with a single
+ * const qualifier; the previous "const ... const" spelling was redundant
+ * and draws a duplicate-qualifier warning from compilers.
+ */
+aead_cipher_t const drygascon128k56_cipher = {
+    "DryGASCON128k56",
+    DRYGASCON128_SAFEKEY_SIZE,
+    DRYGASCON128_NONCE_SIZE,
+    DRYGASCON128_TAG_SIZE,
+    AEAD_FLAG_LITTLE_ENDIAN,
+    drygascon128k56_aead_encrypt,
+    drygascon128k56_aead_decrypt,
+    drygascon128k56_expected
+};
+
+/**
+ * \brief Cipher descriptor for the DryGASCON128 "k16" key-size variant.
+ *
+ * Binds the k16 encrypt/decrypt entry points together with the key, nonce
+ * and tag sizes and the benchmark reference value.  Declared with a single
+ * const qualifier; the previous "const ... const" spelling was redundant
+ * and draws a duplicate-qualifier warning from compilers.
+ */
+aead_cipher_t const drygascon128k16_cipher = {
+    "DryGASCON128k16",
+    DRYGASCON128_MINKEY_SIZE,
+    DRYGASCON128_NONCE_SIZE,
+    DRYGASCON128_TAG_SIZE,
+    AEAD_FLAG_LITTLE_ENDIAN,
+    drygascon128k16_aead_encrypt,
+    drygascon128k16_aead_decrypt,
+    drygascon128k16_expected
+};
+
+/**
+ * \brief Cipher descriptor for DryGASCON256.
+ *
+ * No initializer is given for the trailing "expected" member, so it is
+ * zero-initialized (null): this descriptor has no benchmark reference value.
+ */
+aead_cipher_t const drygascon256_cipher = {
+    "DryGASCON256",
+    DRYGASCON256_KEY_SIZE,
+    DRYGASCON256_NONCE_SIZE,
+    DRYGASCON256_TAG_SIZE,
+    AEAD_FLAG_LITTLE_ENDIAN,
+    drygascon256_aead_encrypt,
+    drygascon256_aead_decrypt
+};
+
+/**
+ * \brief Hash descriptor for DryGASCON128-HASH.
+ *
+ * Only the all-in-one hash function is provided; every incremental
+ * hash/XOF entry point is null.
+ * NOTE(review): state_size is sizeof(int), presumably a placeholder because
+ * there is no incremental state - confirm.
+ */
+aead_hash_algorithm_t const drygascon128_hash_algorithm = {
+    "DryGASCON128-HASH",
+    sizeof(int),
+    DRYGASCON128_HASH_SIZE,
+    AEAD_FLAG_LITTLE_ENDIAN,
+    drygascon128_hash,
+    (aead_hash_init_t)0,
+    (aead_hash_update_t)0,
+    (aead_hash_finalize_t)0,
+    (aead_xof_absorb_t)0,
+    (aead_xof_squeeze_t)0
+};
+
+/**
+ * \brief Hash descriptor for DryGASCON256-HASH.
+ *
+ * Only the all-in-one hash function is provided; every incremental
+ * hash/XOF entry point is null.
+ * NOTE(review): state_size is sizeof(int), presumably a placeholder because
+ * there is no incremental state - confirm.
+ */
+aead_hash_algorithm_t const drygascon256_hash_algorithm = {
+    "DryGASCON256-HASH",
+    sizeof(int),
+    DRYGASCON256_HASH_SIZE,
+    AEAD_FLAG_LITTLE_ENDIAN,
+    drygascon256_hash,
+    (aead_hash_init_t)0,
+    (aead_hash_update_t)0,
+    (aead_hash_finalize_t)0,
+    (aead_xof_absorb_t)0,
+    (aead_xof_squeeze_t)0
+};
+
+/**
+ * \brief Processes associated data for DryGASCON128.
+ *
+ * \param state DrySPONGE128 sponge state.
+ * \param ad Points to the associated data.
+ * \param adlen Length of the associated data, must not be zero.
+ * \param finalize Non-zero to finalize packet processing because
+ * the message is zero-length.
+ *
+ * Full-rate blocks are fed to drygascon128_f_wrap() one at a time until at
+ * most one rate's worth of data remains; that remainder is processed with
+ * the associated-data domain flags set.
+ *
+ * NOTE(review): drygascon128_hash() passes its input length straight
+ * through, so the "must not be zero" requirement is not enforced there -
+ * confirm whether a zero length is acceptable.
+ */
+static void drygascon128_process_ad
+    (drysponge128_state_t *state, const unsigned char *ad,
+     unsigned long long adlen, int finalize)
+{
+    /* Process all blocks except the last one */
+    while (adlen > DRYSPONGE128_RATE) {
+        drygascon128_f_wrap(state, ad, DRYSPONGE128_RATE);
+        /* No separate drysponge128_g_core(state) call here.
+         * NOTE(review): presumably drygascon128_f_wrap() covers that step,
+         * unlike the DryGASCON256 path which calls it explicitly - confirm. */
+        ad += DRYSPONGE128_RATE;
+        adlen -= DRYSPONGE128_RATE;
+    }
+
+    /* Process the last block with domain separation and padding */
+    state->domain = DRYDOMAIN128_ASSOC_DATA;
+    if (finalize)
+        state->domain |= DRYDOMAIN128_FINAL;
+    if (adlen < DRYSPONGE128_RATE)
+        state->domain |= DRYDOMAIN128_PADDED;
+    drygascon128_f_wrap(state, ad, (unsigned)adlen);
+    /* No separate drysponge128_g(state) call here either (see note above) */
+}
+
+/**
+ * \brief Processes associated data for DryGASCON256.
+ *
+ * \param state DrySPONGE256 sponge state.
+ * \param ad Points to the associated data.
+ * \param adlen Length of the associated data, must not be zero.
+ * \param finalize Non-zero to finalize packet processing because
+ * the message is zero-length.
+ *
+ * Full-rate blocks are absorbed and followed by drysponge256_g_core() until
+ * at most one rate's worth of data remains; that remainder is absorbed with
+ * the associated-data domain flags set and followed by drysponge256_g().
+ */
+static void drygascon256_process_ad
+    (drysponge256_state_t *state, const unsigned char *ad,
+     unsigned long long adlen, int finalize)
+{
+    /* Process all blocks except the last one */
+    while (adlen > DRYSPONGE256_RATE) {
+        drysponge256_f_absorb(state, ad, DRYSPONGE256_RATE);
+        drysponge256_g_core(state);
+        ad += DRYSPONGE256_RATE;
+        adlen -= DRYSPONGE256_RATE;
+    }
+
+    /* Process the last block with domain separation and padding */
+    state->domain = DRYDOMAIN256_ASSOC_DATA;
+    if (finalize)
+        state->domain |= DRYDOMAIN256_FINAL;
+    if (adlen < DRYSPONGE256_RATE)
+        state->domain |= DRYDOMAIN256_PADDED;
+    drysponge256_f_absorb(state, ad, (unsigned)adlen);
+    drysponge256_g(state);
+}
+
+/**
+ * \brief Shared DryGASCON128 encryption routine for all key sizes.
+ *
+ * \param c Buffer to receive the ciphertext followed by the tag.
+ * \param clen Set to mlen + DRYGASCON128_TAG_SIZE.
+ * \param m Plaintext to encrypt.
+ * \param mlen Length of the plaintext in bytes.
+ * \param ad Associated data.
+ * \param adlen Length of the associated data in bytes.
+ * \param keysize Key size value forwarded to drysponge128_setup().
+ * \param npub Nonce forwarded to drysponge128_setup().
+ * \param k Key forwarded to drysponge128_setup().
+ *
+ * \return 0 on success, or -1 if drysponge128_safe_alignement() rejects
+ * the local sponge state (nothing is written in that case).
+ *
+ * Each ciphertext block is written to \a c before the matching plaintext
+ * block is passed to drygascon128_f_wrap(), so \a c and \a m must not
+ * overlap.
+ */
+int drygascon128_aead_encrypt_core
+    (unsigned char *c, unsigned long long *clen,
+     const unsigned char *m, unsigned long long mlen,
+     const unsigned char *ad, unsigned long long adlen,
+	 unsigned int keysize,
+     const unsigned char *npub,
+     const unsigned char *k)
+{
+    drysponge128_state_t state;
+    unsigned temp;
+
+    /* Check we are safe */
+	if(!drysponge128_safe_alignement(&state)){
+		return -1;
+	}
+
+    /* Set the length of the returned ciphertext */
+    *clen = mlen + DRYGASCON128_TAG_SIZE;
+
+    /* Initialize the sponge state with the key and nonce */
+    drysponge128_setup(&state, k, keysize, npub, adlen == 0 && mlen == 0);
+
+    /* Process the associated data */
+    if (adlen > 0)
+        drygascon128_process_ad(&state, ad, adlen, mlen == 0);
+
+    /* Encrypt the plaintext to produce the ciphertext */
+    if (mlen > 0) {
+        /* Process all blocks except the last one */
+        while (mlen > DRYSPONGE128_RATE) {
+            lw_xor_block_2_src(c, m, state.r.B, DRYSPONGE128_RATE);
+            drygascon128_f_wrap(&state, m, DRYSPONGE128_RATE);
+            c += DRYSPONGE128_RATE;
+            m += DRYSPONGE128_RATE;
+            mlen -= DRYSPONGE128_RATE;
+        }
+
+        /* Process the last block with domain separation and padding */
+        state.domain = DRYDOMAIN128_MESSAGE | DRYDOMAIN128_FINAL;
+        if (mlen < DRYSPONGE128_RATE)
+            state.domain |= DRYDOMAIN128_PADDED;
+        temp = (unsigned)mlen;
+        lw_xor_block_2_src(c, m, state.r.B, temp);
+        drygascon128_f_wrap(&state, m, temp);
+        c += temp;
+    }
+
+    /* Generate the authentication tag */
+    memcpy(c, state.r.B, DRYGASCON128_TAG_SIZE);
+    return 0;
+}
+
+/**
+ * \brief Shared DryGASCON128 decryption routine for all key sizes.
+ *
+ * \param m Buffer to receive the plaintext.
+ * \param mlen Set to clen - DRYGASCON128_TAG_SIZE.
+ * \param keysize Key size value forwarded to drysponge128_setup().
+ * \param c Ciphertext followed by the authentication tag.
+ * \param clen Length of \a c in bytes, including the tag.
+ * \param ad Associated data.
+ * \param adlen Length of the associated data in bytes.
+ * \param npub Nonce forwarded to drysponge128_setup().
+ * \param k Key forwarded to drysponge128_setup().
+ *
+ * \return 0 if the tag matches; -1 if the tag does not match, if \a clen is
+ * shorter than the tag, or if drysponge128_safe_alignement() rejects the
+ * local sponge state.  In the last two cases \a mlen and \a m are left
+ * untouched.  On a tag mismatch the plaintext written to \a m is zeroed by
+ * aead_check_tag().
+ */
+int drygascon128_aead_decrypt_core
+    (unsigned char *m, unsigned long long *mlen,
+     unsigned int keysize,
+     const unsigned char *c, unsigned long long clen,
+     const unsigned char *ad, unsigned long long adlen,
+     const unsigned char *npub,
+     const unsigned char *k)
+{
+    drysponge128_state_t state;
+    unsigned char *mtemp = m; /* start of the plaintext, kept for the tag check */
+    unsigned temp;
+
+    /* Check we are safe */
+    if(!drysponge128_safe_alignement(&state)){
+		return -1;
+	}
+
+    /* Validate the ciphertext length and set the return "mlen" value */
+    if (clen < DRYGASCON128_TAG_SIZE)
+        return -1;
+    *mlen = clen - DRYGASCON128_TAG_SIZE;
+
+    /* Initialize the sponge state with the key and nonce */
+    clen -= DRYGASCON128_TAG_SIZE;
+    drysponge128_setup(&state, k, keysize, npub, adlen == 0 && clen == 0);
+
+    /* Process the associated data */
+    if (adlen > 0)
+        drygascon128_process_ad(&state, ad, adlen, clen == 0);
+
+    /* Decrypt the ciphertext to produce the plaintext */
+    if (clen > 0) {
+        /* Process all blocks except the last one */
+        while (clen > DRYSPONGE128_RATE) {
+            lw_xor_block_2_src(m, c, state.r.B, DRYSPONGE128_RATE);
+            /* The recovered plaintext (not the ciphertext) is fed back in */
+            drygascon128_f_wrap(&state, m, DRYSPONGE128_RATE);
+            c += DRYSPONGE128_RATE;
+            m += DRYSPONGE128_RATE;
+            clen -= DRYSPONGE128_RATE;
+        }
+
+        /* Process the last block with domain separation and padding */
+        state.domain = DRYDOMAIN128_MESSAGE | DRYDOMAIN128_FINAL;
+        if (clen < DRYSPONGE128_RATE)
+            state.domain |= DRYDOMAIN128_PADDED;
+        temp = (unsigned)clen;
+        lw_xor_block_2_src(m, c, state.r.B, temp);
+        drygascon128_f_wrap(&state, m, temp);
+        c += temp;
+    }
+
+    /* Check the authentication tag; "c" now points at the received tag */
+    return aead_check_tag(mtemp, *mlen, state.r.B, c, DRYGASCON128_TAG_SIZE);
+}
+
+/**
+ * \brief DryGASCON128 encryption entry point for the "k16" variant.
+ *
+ * Forwards to drygascon128_aead_encrypt_core() with a key size of 16.
+ * The \a nsec parameter exists only to match the aead_cipher_encrypt_t
+ * signature and is ignored.
+ *
+ * \return The result of drygascon128_aead_encrypt_core().
+ */
+int drygascon128k16_aead_encrypt
+    (unsigned char *c, unsigned long long *clen,
+     const unsigned char *m, unsigned long long mlen,
+     const unsigned char *ad, unsigned long long adlen,
+     const unsigned char *nsec,
+     const unsigned char *npub,
+     const unsigned char *k)
+{
+    (void)nsec; /* unused; marked like the DryGASCON256 functions do */
+    return drygascon128_aead_encrypt_core(c, clen, m, mlen, ad, adlen, 16, npub, k);
+}
+
+/**
+ * \brief DryGASCON128 encryption entry point for the "k32" variant.
+ *
+ * Forwards to drygascon128_aead_encrypt_core() with a key size of 32.
+ * The \a nsec parameter exists only to match the aead_cipher_encrypt_t
+ * signature and is ignored.
+ *
+ * \return The result of drygascon128_aead_encrypt_core().
+ */
+int drygascon128k32_aead_encrypt
+    (unsigned char *c, unsigned long long *clen,
+     const unsigned char *m, unsigned long long mlen,
+     const unsigned char *ad, unsigned long long adlen,
+     const unsigned char *nsec,
+     const unsigned char *npub,
+     const unsigned char *k)
+{
+    (void)nsec; /* unused; marked like the DryGASCON256 functions do */
+    return drygascon128_aead_encrypt_core(c, clen, m, mlen, ad, adlen, 32, npub, k);
+}
+
+/**
+ * \brief DryGASCON128 encryption entry point for the "k56" variant.
+ *
+ * Forwards to drygascon128_aead_encrypt_core() with a key size of 56.
+ * The \a nsec parameter exists only to match the aead_cipher_encrypt_t
+ * signature and is ignored.
+ *
+ * \return The result of drygascon128_aead_encrypt_core().
+ */
+int drygascon128k56_aead_encrypt
+    (unsigned char *c, unsigned long long *clen,
+     const unsigned char *m, unsigned long long mlen,
+     const unsigned char *ad, unsigned long long adlen,
+     const unsigned char *nsec,
+     const unsigned char *npub,
+     const unsigned char *k)
+{
+    (void)nsec; /* unused; marked like the DryGASCON256 functions do */
+    return drygascon128_aead_encrypt_core(c, clen, m, mlen, ad, adlen, 56, npub, k);
+}
+
+
+/**
+ * \brief DryGASCON128 decryption entry point for the "k16" variant.
+ *
+ * Forwards to drygascon128_aead_decrypt_core() with a key size of 16.
+ * The \a nsec parameter exists only to match the aead_cipher_decrypt_t
+ * signature and is ignored.
+ *
+ * \return The result of drygascon128_aead_decrypt_core().
+ */
+int drygascon128k16_aead_decrypt
+    (unsigned char *m, unsigned long long *mlen,
+     unsigned char *nsec,
+     const unsigned char *c, unsigned long long clen,
+     const unsigned char *ad, unsigned long long adlen,
+     const unsigned char *npub,
+     const unsigned char *k)
+{
+    (void)nsec; /* unused; marked like the DryGASCON256 functions do */
+    return drygascon128_aead_decrypt_core(m, mlen, 16, c, clen, ad, adlen, npub, k);
+}
+
+/**
+ * \brief DryGASCON128 decryption entry point for the "k32" variant.
+ *
+ * Forwards to drygascon128_aead_decrypt_core() with a key size of 32.
+ * The \a nsec parameter exists only to match the aead_cipher_decrypt_t
+ * signature and is ignored.
+ *
+ * \return The result of drygascon128_aead_decrypt_core().
+ */
+int drygascon128k32_aead_decrypt
+    (unsigned char *m, unsigned long long *mlen,
+     unsigned char *nsec,
+     const unsigned char *c, unsigned long long clen,
+     const unsigned char *ad, unsigned long long adlen,
+     const unsigned char *npub,
+     const unsigned char *k)
+{
+    (void)nsec; /* unused; marked like the DryGASCON256 functions do */
+    return drygascon128_aead_decrypt_core(m, mlen, 32, c, clen, ad, adlen, npub, k);
+}
+
+/**
+ * \brief DryGASCON128 decryption entry point for the "k56" variant.
+ *
+ * Forwards to drygascon128_aead_decrypt_core() with a key size of 56.
+ * The \a nsec parameter exists only to match the aead_cipher_decrypt_t
+ * signature and is ignored.
+ *
+ * \return The result of drygascon128_aead_decrypt_core().
+ */
+int drygascon128k56_aead_decrypt
+    (unsigned char *m, unsigned long long *mlen,
+     unsigned char *nsec,
+     const unsigned char *c, unsigned long long clen,
+     const unsigned char *ad, unsigned long long adlen,
+     const unsigned char *npub,
+     const unsigned char *k)
+{
+    (void)nsec; /* unused; marked like the DryGASCON256 functions do */
+    return drygascon128_aead_decrypt_core(m, mlen, 56, c, clen, ad, adlen, npub, k);
+}
+
+/**
+ * \brief Encrypts and authenticates a packet with DryGASCON256.
+ *
+ * \param c Buffer to receive the ciphertext followed by the tag.
+ * \param clen Set to mlen + DRYGASCON256_TAG_SIZE.
+ * \param m Plaintext to encrypt.
+ * \param mlen Length of the plaintext in bytes.
+ * \param ad Associated data.
+ * \param adlen Length of the associated data in bytes.
+ * \param nsec Ignored.
+ * \param npub Nonce forwarded to drysponge256_setup().
+ * \param k Key forwarded to drysponge256_setup().
+ *
+ * \return Always 0.
+ *
+ * The tag is emitted as two 16-byte pieces with a drysponge256_g() call
+ * between them.
+ */
+int drygascon256_aead_encrypt
+    (unsigned char *c, unsigned long long *clen,
+     const unsigned char *m, unsigned long long mlen,
+     const unsigned char *ad, unsigned long long adlen,
+     const unsigned char *nsec,
+     const unsigned char *npub,
+     const unsigned char *k)
+{
+    drysponge256_state_t state;
+    unsigned temp;
+    (void)nsec;
+
+    /* Set the length of the returned ciphertext */
+    *clen = mlen + DRYGASCON256_TAG_SIZE;
+
+    /* Initialize the sponge state with the key and nonce */
+    drysponge256_setup(&state, k, npub, adlen == 0 && mlen == 0);
+
+    /* Process the associated data */
+    if (adlen > 0)
+        drygascon256_process_ad(&state, ad, adlen, mlen == 0);
+
+    /* Encrypt the plaintext to produce the ciphertext */
+    if (mlen > 0) {
+        /* Process all blocks except the last one */
+        while (mlen > DRYSPONGE256_RATE) {
+            drysponge256_f_absorb(&state, m, DRYSPONGE256_RATE);
+            lw_xor_block_2_src(c, m, state.r.B, DRYSPONGE256_RATE);
+            drysponge256_g(&state);
+            c += DRYSPONGE256_RATE;
+            m += DRYSPONGE256_RATE;
+            mlen -= DRYSPONGE256_RATE;
+        }
+
+        /* Process the last block with domain separation and padding */
+        state.domain = DRYDOMAIN256_MESSAGE | DRYDOMAIN256_FINAL;
+        if (mlen < DRYSPONGE256_RATE)
+            state.domain |= DRYDOMAIN256_PADDED;
+        temp = (unsigned)mlen;
+        drysponge256_f_absorb(&state, m, temp);
+        lw_xor_block_2_src(c, m, state.r.B, temp);
+        drysponge256_g(&state);
+        c += temp;
+    }
+
+    /* Generate the authentication tag: first half, one more G, second half */
+    memcpy(c, state.r.B, 16);
+    drysponge256_g(&state);
+    memcpy(c + 16, state.r.B, 16);
+    return 0;
+}
+
+/**
+ * \brief Decrypts and authenticates a packet with DryGASCON256.
+ *
+ * \param m Buffer to receive the plaintext.
+ * \param mlen Set to clen - DRYGASCON256_TAG_SIZE.
+ * \param nsec Ignored.
+ * \param c Ciphertext followed by the authentication tag.
+ * \param clen Length of \a c in bytes, including the tag.
+ * \param ad Associated data.
+ * \param adlen Length of the associated data in bytes.
+ * \param npub Nonce forwarded to drysponge256_setup().
+ * \param k Key forwarded to drysponge256_setup().
+ *
+ * \return 0 if both tag halves match; -1 if either half does not match or
+ * if \a clen is shorter than the tag.  On a tag mismatch the plaintext
+ * written to \a m is zeroed.
+ */
+int drygascon256_aead_decrypt
+    (unsigned char *m, unsigned long long *mlen,
+     unsigned char *nsec,
+     const unsigned char *c, unsigned long long clen,
+     const unsigned char *ad, unsigned long long adlen,
+     const unsigned char *npub,
+     const unsigned char *k)
+{
+    drysponge256_state_t state;
+    unsigned char *mtemp = m; /* start of the plaintext, kept for the tag check */
+    unsigned temp;
+    int result;
+    (void)nsec;
+
+    /* Validate the ciphertext length and set the return "mlen" value */
+    if (clen < DRYGASCON256_TAG_SIZE)
+        return -1;
+    *mlen = clen - DRYGASCON256_TAG_SIZE;
+
+    /* Initialize the sponge state with the key and nonce */
+    clen -= DRYGASCON256_TAG_SIZE;
+    drysponge256_setup(&state, k, npub, adlen == 0 && clen == 0);
+
+    /* Process the associated data */
+    if (adlen > 0)
+        drygascon256_process_ad(&state, ad, adlen, clen == 0);
+
+    /* Decrypt the ciphertext to produce the plaintext */
+    if (clen > 0) {
+        /* Process all blocks except the last one */
+        while (clen > DRYSPONGE256_RATE) {
+            lw_xor_block_2_src(m, c, state.r.B, DRYSPONGE256_RATE);
+            drysponge256_f_absorb(&state, m, DRYSPONGE256_RATE);
+            drysponge256_g(&state);
+            c += DRYSPONGE256_RATE;
+            m += DRYSPONGE256_RATE;
+            clen -= DRYSPONGE256_RATE;
+        }
+
+        /* Process the last block with domain separation and padding */
+        state.domain = DRYDOMAIN256_MESSAGE | DRYDOMAIN256_FINAL;
+        if (clen < DRYSPONGE256_RATE)
+            state.domain |= DRYDOMAIN256_PADDED;
+        temp = (unsigned)clen;
+        lw_xor_block_2_src(m, c, state.r.B, temp);
+        drysponge256_f_absorb(&state, m, temp);
+        drysponge256_g(&state);
+        c += temp;
+    }
+
+    /* Check the authentication tag which is split into two pieces.
+     * The first half is compared with no plaintext attached (null, 0), so
+     * nothing is wiped yet; "result" is 0 on match and -1 on mismatch. */
+    result = aead_check_tag(0, 0, state.r.B, c, 16);
+    drysponge256_g(&state);
+    /* ~result turns the first outcome into the -1/0 "precheck" convention,
+     * so the plaintext is wiped if either half fails. */
+    return aead_check_tag_precheck
+        (mtemp, *mlen, state.r.B, c + 16, 16, ~result);
+}
+
+/**
+ * \brief Precomputed initialization vector for DryGASCON128-HASH.
+ *
+ * This is the CST_H value from the DryGASCON specification after it
+ * has been processed by the key setup function for DrySPONGE128.
+ *
+ * The table holds the "c" portion followed by the "x" portion;
+ * drygascon128_hash() copies the first sizeof(state.c.B) bytes into the
+ * c member and the following sizeof(state.x.B) bytes into the x member.
+ */
+static unsigned char const drygascon128_hash_init[] = {
+    /* c */
+    0x24, 0x3f, 0x6a, 0x88, 0x85, 0xa3, 0x08, 0xd3,
+    0x13, 0x19, 0x8a, 0x2e, 0x03, 0x70, 0x73, 0x44,
+    0x24, 0x3f, 0x6a, 0x88, 0x85, 0xa3, 0x08, 0xd3,
+    0x13, 0x19, 0x8a, 0x2e, 0x03, 0x70, 0x73, 0x44,
+    0x24, 0x3f, 0x6a, 0x88, 0x85, 0xa3, 0x08, 0xd3,
+    /* x */
+    0xa4, 0x09, 0x38, 0x22, 0x29, 0x9f, 0x31, 0xd0,
+    0x08, 0x2e, 0xfa, 0x98, 0xec, 0x4e, 0x6c, 0x89
+};
+
+/**
+ * \brief Hashes a block of input data with DryGASCON128-HASH.
+ *
+ * \param out Buffer to receive the hash; 32 bytes are written.
+ * \param in Points to the input data to be hashed.
+ * \param inlen Length of the input data in bytes.
+ *
+ * \return Always 0.
+ *
+ * NOTE(review): unlike the DryGASCON128 AEAD core routines, this function
+ * does not call drysponge128_safe_alignement() on its local state - confirm
+ * whether that check is needed here as well.
+ */
+int drygascon128_hash
+    (unsigned char *out, const unsigned char *in, unsigned long long inlen)
+{
+    drysponge128_state_t state;
+    /* Load the precomputed c and x portions of the initial state */
+    memcpy(state.c.B, drygascon128_hash_init, sizeof(state.c.B));
+    memcpy(state.x.B, drygascon128_hash_init + sizeof(state.c.B),
+           sizeof(state.x.B));
+    state.domain = 0;
+    state.rounds = DRYSPONGE128_ROUNDS;
+    /* The input is processed through the associated-data path, finalized */
+    drygascon128_process_ad(&state, in, inlen, 1);
+    /* Squeeze the output 16 bytes at a time */
+    memcpy(out, state.r.B, 16);
+    drysponge128_g(&state);
+    memcpy(out + 16, state.r.B, 16);
+    return 0;
+}
+
+/**
+ * \brief Precomputed initialization vector for DryGASCON256-HASH.
+ *
+ * This is the CST_H value from the DryGASCON specification after it
+ * has been processed by the key setup function for DrySPONGE256.
+ *
+ * The table holds the "c" portion followed by the "x" portion;
+ * drygascon256_hash() copies the first sizeof(state.c.B) bytes into the
+ * c member and the following sizeof(state.x.B) bytes into the x member.
+ */
+static unsigned char const drygascon256_hash_init[] = {
+    /* c */
+    0x24, 0x3f, 0x6a, 0x88, 0x85, 0xa3, 0x08, 0xd3,
+    0x13, 0x19, 0x8a, 0x2e, 0x03, 0x70, 0x73, 0x44,
+    0xa4, 0x09, 0x38, 0x22, 0x29, 0x9f, 0x31, 0xd0,
+    0x08, 0x2e, 0xfa, 0x98, 0xec, 0x4e, 0x6c, 0x89,
+    0x24, 0x3f, 0x6a, 0x88, 0x85, 0xa3, 0x08, 0xd3,
+    0x13, 0x19, 0x8a, 0x2e, 0x03, 0x70, 0x73, 0x44,
+    0xa4, 0x09, 0x38, 0x22, 0x29, 0x9f, 0x31, 0xd0,
+    0x08, 0x2e, 0xfa, 0x98, 0xec, 0x4e, 0x6c, 0x89,
+    0x24, 0x3f, 0x6a, 0x88, 0x85, 0xa3, 0x08, 0xd3,
+    /* x */
+    0x45, 0x28, 0x21, 0xe6, 0x38, 0xd0, 0x13, 0x77,
+    0xbe, 0x54, 0x66, 0xcf, 0x34, 0xe9, 0x0c, 0x6c
+};
+
+/**
+ * \brief Hashes a block of input data with DryGASCON256-HASH.
+ *
+ * \param out Buffer to receive the hash; 64 bytes are written.
+ * \param in Points to the input data to be hashed.
+ * \param inlen Length of the input data in bytes.
+ *
+ * \return Always 0.
+ */
+int drygascon256_hash
+    (unsigned char *out, const unsigned char *in, unsigned long long inlen)
+{
+    drysponge256_state_t state;
+    /* Load the precomputed c and x portions of the initial state */
+    memcpy(state.c.B, drygascon256_hash_init, sizeof(state.c.B));
+    memcpy(state.x.B, drygascon256_hash_init + sizeof(state.c.B),
+           sizeof(state.x.B));
+    state.domain = 0;
+    state.rounds = DRYSPONGE256_ROUNDS;
+    /* The input is processed through the associated-data path, finalized */
+    drygascon256_process_ad(&state, in, inlen, 1);
+    /* Squeeze the output 16 bytes at a time, with a G call between pieces */
+    memcpy(out, state.r.B, 16);
+    drysponge256_g(&state);
+    memcpy(out + 16, state.r.B, 16);
+    drysponge256_g(&state);
+    memcpy(out + 32, state.r.B, 16);
+    drysponge256_g(&state);
+    memcpy(out + 48, state.r.B, 16);
+    return 0;
+}
--- a/drygascon/Implementations/crypto_aead/drygascon128/add_arm_cortex-m/drygascon.h
+++ b/drygascon/Implementations/crypto_aead/drygascon128/add_arm_cortex-m/drygascon.h
+/*
+ * Copyright (C) 2020 Southern Storm Software, Pty Ltd.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ */
+
+#ifndef LWCRYPTO_DRYGASCON_H
+#define LWCRYPTO_DRYGASCON_H
+
+#include "aead-common.h"
+
+/**
+ * \file drygascon.h
+ * \brief DryGASCON authenticated encryption algorithm.
+ *
+ * DryGASCON is a family of authenticated encryption algorithms based
+ * around a generalised version of the ASCON permutation.  DryGASCON
+ * is designed to provide some protection against power analysis.
+ *
+ * There are four algorithms in the DryGASCON family:
+ *
+ * \li DryGASCON128 is an authenticated encryption algorithm with a
+ * 128-bit key, a 128-bit nonce, and a 128-bit authentication tag.
+ * \li DryGASCON256 is an authenticated encryption algorithm with a
+ * 256-bit key, a 128-bit nonce, and a 256-bit authentication tag.
+ * \li DryGASCON128-HASH is a hash algorithm with a 256-bit output.
+ * \li DryGASCON256-HASH is a hash algorithm with a 512-bit output.
+ *
+ * DryGASCON128 and DryGASCON128-HASH are the primary members of the family.
+ *
+ * References: https://github.com/sebastien-riou/DryGASCON
+ */
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/**
+ * \brief Minimum Size of the key for DryGASCON128.
+ */
+#define DRYGASCON128_MINKEY_SIZE 16
+
+/**
+ * \brief Fast Size of the key for DryGASCON128.
+ */
+#define DRYGASCON128_FASTKEY_SIZE 32
+
+/**
+ * \brief Safe (and fast) Size of the key for DryGASCON128.
+ * Safe here means the size of the key helps prevent SPA during key loading
+ */
+#define DRYGASCON128_SAFEKEY_SIZE 56
+
+/**
+ * \brief Size of the key for DryGASCON128 (default to "fast" size).
+ */
+#define DRYGASCON128_KEY_SIZE DRYGASCON128_FASTKEY_SIZE
+
+/**
+ * \brief Size of the authentication tag for DryGASCON128.
+ */
+#define DRYGASCON128_TAG_SIZE 16
+
+/**
+ * \brief Size of the nonce for DryGASCON128.
+ */
+#define DRYGASCON128_NONCE_SIZE 16
+
+/**
+ * \brief Size of the hash output for DryGASCON128-HASH.
+ */
+#define DRYGASCON128_HASH_SIZE 32
+
+/**
+ * \brief Size of the key for DryGASCON256.
+ */
+#define DRYGASCON256_KEY_SIZE 32
+
+/**
+ * \brief Size of the authentication tag for DryGASCON256.
+ */
+#define DRYGASCON256_TAG_SIZE 32
+
+/**
+ * \brief Size of the nonce for DryGASCON256.
+ */
+#define DRYGASCON256_NONCE_SIZE 16
+
+/**
+ * \brief Size of the hash output for DryGASCON256-HASH.
+ */
+#define DRYGASCON256_HASH_SIZE 64
+
+/**
+ * \brief Meta-information block for the DryGASCON128 cipher with 32 bytes key.
+ */
+extern aead_cipher_t const drygascon128k32_cipher;
+
+/**
+ * \brief Meta-information block for the DryGASCON128 cipher with 56 bytes key.
+ */
+extern aead_cipher_t const drygascon128k56_cipher;
+
+/**
+ * \brief Meta-information block for the DryGASCON128 cipher with 16 bytes key.
+ */
+extern aead_cipher_t const drygascon128k16_cipher;
+
+/**
+ * \brief Meta-information block for the DryGASCON128 cipher (default to 32 bytes key).
+ */
+extern aead_cipher_t const drygascon128_cipher;
+
+/**
+ * \brief Meta-information block for the DryGASCON256 cipher.
+ */
+extern aead_cipher_t const drygascon256_cipher;
+
+/**
+ * \brief Meta-information block for DryGASCON128-HASH.
+ */
+extern aead_hash_algorithm_t const drygascon128_hash_algorithm;
+
+/**
+ * \brief Meta-information block for DryGASCON256-HASH.
+ */
+extern aead_hash_algorithm_t const drygascon256_hash_algorithm;
+
+/**
+ * \brief Encrypts and authenticates a packet with DryGASCON128 with 32 bytes key.
+ *
+ *	Use this key size if SPA attacks are not a concern in your use case.
+ *
+ * \param c Buffer to receive the output.
+ * \param clen On exit, set to the length of the output which includes
+ * the ciphertext and the 16 byte authentication tag.
+ * \param m Buffer that contains the plaintext message to encrypt.
+ * \param mlen Length of the plaintext message in bytes.
+ * \param ad Buffer that contains associated data to authenticate
+ * along with the packet but which does not need to be encrypted.
+ * \param adlen Length of the associated data in bytes.
+ * \param nsec Secret nonce - not used by this algorithm.
+ * \param npub Points to the public nonce for the packet which must
+ * be 16 bytes in length.
+ * \param k Points to the 32 bytes of the key to use to encrypt the packet.
+ *
+ * Note that the function blocks if the 16 last bytes of the key are "invalid".
+ * Here "invalid" means that the 32 bit words are not all different from each other.
+ *
+ * \return 0 on success, or a negative value if there was an error in
+ * the parameters.
+ *
+ * \sa drygascon128k32_aead_decrypt()
+ */
+int drygascon128k32_aead_encrypt
+    (unsigned char *c, unsigned long long *clen,
+     const unsigned char *m, unsigned long long mlen,
+     const unsigned char *ad, unsigned long long adlen,
+     const unsigned char *nsec,
+     const unsigned char *npub,
+     const unsigned char *k);
+
+/**
+ * \brief Decrypts and authenticates a packet with DryGASCON128 with 32 bytes key.
+ *
+ *	Use this key size if SPA attacks are not a concern in your use case.
+ *
+ * \param m Buffer to receive the plaintext message on output.
+ * \param mlen Receives the length of the plaintext message on output.
+ * \param nsec Secret nonce - not used by this algorithm.
+ * \param c Buffer that contains the ciphertext and authentication
+ * tag to decrypt.
+ * \param clen Length of the input data in bytes, which includes the
+ * ciphertext and the 16 byte authentication tag.
+ * \param ad Buffer that contains associated data to authenticate
+ * along with the packet but which does not need to be encrypted.
+ * \param adlen Length of the associated data in bytes.
+ * \param npub Points to the public nonce for the packet which must
+ * be 16 bytes in length.
+ * \param k Points to the 32 bytes of the key to use to decrypt the packet.
+ *
+ * Note that the function blocks if the 16 last bytes of the key are "invalid".
+ * Here "invalid" means that the 32 bit words are not all different from each other.
+ *
+ * \return 0 on success, -1 if the authentication tag was incorrect,
+ * or some other negative number if there was an error in the parameters.
+ *
+ * \sa drygascon128k32_aead_encrypt()
+ */
+int drygascon128k32_aead_decrypt
+    (unsigned char *m, unsigned long long *mlen,
+     unsigned char *nsec,
+     const unsigned char *c, unsigned long long clen,
+     const unsigned char *ad, unsigned long long adlen,
+     const unsigned char *npub,
+     const unsigned char *k);
+
+/**
+ * \brief Encrypts and authenticates a packet with DryGASCON128 with 56 bytes key.
+ *
+ *	Use this key size if you want to prevent SPA attacks
+ *
+ * \param c Buffer to receive the output.
+ * \param clen On exit, set to the length of the output which includes
+ * the ciphertext and the 16 byte authentication tag.
+ * \param m Buffer that contains the plaintext message to encrypt.
+ * \param mlen Length of the plaintext message in bytes.
+ * \param ad Buffer that contains associated data to authenticate
+ * along with the packet but which does not need to be encrypted.
+ * \param adlen Length of the associated data in bytes.
+ * \param nsec Secret nonce - not used by this algorithm.
+ * \param npub Points to the public nonce for the packet which must
+ * be 16 bytes in length.
+ * \param k Points to the 56 bytes of the key to use to encrypt the packet.
+ *
+ * Note that the function blocks if the 16 last bytes of the key are "invalid".
+ * Here "invalid" means that the 32 bit words are not all different from each other.
+ *
+ * \return 0 on success, or a negative value if there was an error in
+ * the parameters.
+ *
+ * \sa drygascon128k56_aead_decrypt()
+ */
+int drygascon128k56_aead_encrypt
+    (unsigned char *c, unsigned long long *clen,
+     const unsigned char *m, unsigned long long mlen,
+     const unsigned char *ad, unsigned long long adlen,
+     const unsigned char *nsec,
+     const unsigned char *npub,
+     const unsigned char *k);
+
+/**
+ * \brief Decrypts and authenticates a packet with DryGASCON128 with 56 bytes key.
+ *
+ *	Use this key size if you want to prevent SPA attacks
+ *
+ * \param m Buffer to receive the plaintext message on output.
+ * \param mlen Receives the length of the plaintext message on output.
+ * \param nsec Secret nonce - not used by this algorithm.
+ * \param c Buffer that contains the ciphertext and authentication
+ * tag to decrypt.
+ * \param clen Length of the input data in bytes, which includes the
+ * ciphertext and the 16 byte authentication tag.
+ * \param ad Buffer that contains associated data to authenticate
+ * along with the packet but which does not need to be encrypted.
+ * \param adlen Length of the associated data in bytes.
+ * \param npub Points to the public nonce for the packet which must
+ * be 16 bytes in length.
+ * \param k Points to the 56 bytes of the key to use to decrypt the packet.
+ *
+ * Note that the function blocks if the 16 last bytes of the key are "invalid".
+ * Here "invalid" means that the 32 bit words are not all different from each other.
+ *
+ * \return 0 on success, -1 if the authentication tag was incorrect,
+ * or some other negative number if there was an error in the parameters.
+ *
+ * \sa drygascon128k56_aead_encrypt()
+ */
+int drygascon128k56_aead_decrypt
+    (unsigned char *m, unsigned long long *mlen,
+     unsigned char *nsec,
+     const unsigned char *c, unsigned long long clen,
+     const unsigned char *ad, unsigned long long adlen,
+     const unsigned char *npub,
+     const unsigned char *k);
+
+/**
+ * \brief Encrypts and authenticates a packet with DryGASCON128 with 16 bytes key.
+ *
+ *	Use this key size only if you really cannot use the 32 bytes key.
+ *
+ * \param c Buffer to receive the output.
+ * \param clen On exit, set to the length of the output which includes
+ * the ciphertext and the 16 byte authentication tag.
+ * \param m Buffer that contains the plaintext message to encrypt.
+ * \param mlen Length of the plaintext message in bytes.
+ * \param ad Buffer that contains associated data to authenticate
+ * along with the packet but which does not need to be encrypted.
+ * \param adlen Length of the associated data in bytes.
+ * \param nsec Secret nonce - not used by this algorithm.
+ * \param npub Points to the public nonce for the packet which must
+ * be 16 bytes in length.
+ * \param k Points to the 16 bytes of the key to use to encrypt the packet.
+ *
+ * \return 0 on success, or a negative value if there was an error in
+ * the parameters.
+ *
+ * \sa drygascon128k16_aead_decrypt()
+ */
+int drygascon128k16_aead_encrypt
+    (unsigned char *c, unsigned long long *clen,
+     const unsigned char *m, unsigned long long mlen,
+     const unsigned char *ad, unsigned long long adlen,
+     const unsigned char *nsec,
+     const unsigned char *npub,
+     const unsigned char *k);
+
+/**
+ * \brief Decrypts and authenticates a packet with DryGASCON128 with 16 bytes key.
+ *
+ *	Use this key size only if you really cannot use the 32 bytes key.
+ *
+ * \param m Buffer to receive the plaintext message on output.
+ * \param mlen Receives the length of the plaintext message on output.
+ * \param nsec Secret nonce - not used by this algorithm.
+ * \param c Buffer that contains the ciphertext and authentication
+ * tag to decrypt.
+ * \param clen Length of the input data in bytes, which includes the
+ * ciphertext and the 16 byte authentication tag.
+ * \param ad Buffer that contains associated data to authenticate
+ * along with the packet but which does not need to be encrypted.
+ * \param adlen Length of the associated data in bytes.
+ * \param npub Points to the public nonce for the packet which must
+ * be 16 bytes in length.
+ * \param k Points to the 16 bytes of the key to use to decrypt the packet.
+ *
+ * \return 0 on success, -1 if the authentication tag was incorrect,
+ * or some other negative number if there was an error in the parameters.
+ *
+ * \sa drygascon128k16_aead_encrypt()
+ */
+int drygascon128k16_aead_decrypt
+    (unsigned char *m, unsigned long long *mlen,
+     unsigned char *nsec,
+     const unsigned char *c, unsigned long long clen,
+     const unsigned char *ad, unsigned long long adlen,
+     const unsigned char *npub,
+     const unsigned char *k);
+
+/**
+ * \brief Encrypts and authenticates a packet with DryGASCON256.
+ *
+ * \param c Buffer to receive the output.
+ * \param clen On exit, set to the length of the output which includes
+ * the ciphertext and the 32 byte (DRYGASCON256_TAG_SIZE) authentication tag.
+ * \param m Buffer that contains the plaintext message to encrypt.
+ * \param mlen Length of the plaintext message in bytes.
+ * \param ad Buffer that contains associated data to authenticate
+ * along with the packet but which does not need to be encrypted.
+ * \param adlen Length of the associated data in bytes.
+ * \param nsec Secret nonce - not used by this algorithm.
+ * \param npub Points to the public nonce for the packet which must
+ * be 16 bytes in length.
+ * \param k Points to the 32 bytes of the key to use to encrypt the packet.
+ *
+ * \return 0 on success, or a negative value if there was an error in
+ * the parameters.
+ *
+ * \sa drygascon256_aead_decrypt()
+ */
+int drygascon256_aead_encrypt
+    (unsigned char *c, unsigned long long *clen,
+     const unsigned char *m, unsigned long long mlen,
+     const unsigned char *ad, unsigned long long adlen,
+     const unsigned char *nsec,
+     const unsigned char *npub,
+     const unsigned char *k);
+
+/**
+ * \brief Decrypts and authenticates a packet with DryGASCON256.
+ *
+ * \param m Buffer to receive the plaintext message on output.
+ * \param mlen Receives the length of the plaintext message on output.
+ * \param nsec Secret nonce - not used by this algorithm.
+ * \param c Buffer that contains the ciphertext and authentication
+ * tag to decrypt.
+ * \param clen Length of the input data in bytes, which includes the
+ * ciphertext and the 32 byte (DRYGASCON256_TAG_SIZE) authentication tag.
+ * \param ad Buffer that contains associated data to authenticate
+ * along with the packet but which does not need to be encrypted.
+ * \param adlen Length of the associated data in bytes.
+ * \param npub Points to the public nonce for the packet which must
+ * be 16 bytes in length.
+ * \param k Points to the 32 bytes of the key to use to decrypt the packet.
+ *
+ * \return 0 on success, -1 if the authentication tag was incorrect,
+ * or some other negative number if there was an error in the parameters.
+ *
+ * \sa drygascon256_aead_encrypt()
+ */
+int drygascon256_aead_decrypt
+    (unsigned char *m, unsigned long long *mlen,
+     unsigned char *nsec,
+     const unsigned char *c, unsigned long long clen,
+     const unsigned char *ad, unsigned long long adlen,
+     const unsigned char *npub,
+     const unsigned char *k);
+
+/**
+ * \brief Hashes a block of input data with DRYGASCON128.
+ *
+ * \param out Buffer to receive the hash output which must be at least
+ * DRYGASCON128_HASH_SIZE bytes in length.
+ * \param in Points to the input data to be hashed.
+ * \param inlen Length of the input data in bytes.
+ *
+ * \return Returns zero on success or -1 if there was an error in the
+ * parameters.
+ */
+int drygascon128_hash
+    (unsigned char *out, const unsigned char *in, unsigned long long inlen);
+
+/**
+ * \brief Hashes a block of input data with DRYGASCON256.
+ *
+ * \param out Buffer to receive the hash output which must be at least
+ * DRYGASCON256_HASH_SIZE bytes in length.
+ * \param in Points to the input data to be hashed.
+ * \param inlen Length of the input data in bytes.
+ *
+ * \return Returns zero on success or -1 if there was an error in the
+ * parameters.
+ */
+int drygascon256_hash
+    (unsigned char *out, const unsigned char *in, unsigned long long inlen);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
--- a/drygascon/Implementations/crypto_aead/drygascon128/add_arm_cortex-m/drygascon128_arm-v6m.S
+++ b/drygascon/Implementations/crypto_aead/drygascon128/add_arm_cortex-m/drygascon128_arm-v6m.S
+/**
+DryGascon128 'v6m implementation'
+Sebastien Riou, May 27th 2020
+
+Implementation optimized for ARM-Cortex-M0 (Size and Speed)
+*/
+
+#if defined(__DRYGASCON_ARM_SELECTOR_H__)
+.cpu cortex-m0
+.syntax unified
+.code	16
+.thumb_func
+
+.align	1
+.global	drygascon128_g_v6m
+.global	drygascon128_f_v6m
+
+    //Byte offsets into the state passed in r0.
+    //c: five 8 byte entries C0..C4 starting at offset 0
+    .equ C0, 0
+    .equ C1, C0+8
+    .equ C2, C0+16
+    .equ C3, C0+24
+    .equ C4, C0+32
+    //r: two 8 byte entries R0,R1 starting at offset 48
+    .equ R0, 48
+    .equ R1, R0+8
+    //x: two 8 byte entries X0,X1 starting at offset 64
+    .equ X0, 64
+    .equ X1, X0+8
+
+    //"L" suffix: first 32 bit word of an entry (called "lower half" in the code below)
+    .equ X0L, X0
+    .equ X1L, X1
+    .equ C0L, C0
+    .equ C1L, C1
+    .equ C2L, C2
+    .equ C3L, C3
+    .equ C4L, C4
+    .equ R0L, R0
+    .equ R1L, R1
+
+    //"H" suffix: second 32 bit word of an entry (called "upper half"/"high part" below)
+    .equ X0H, X0+4
+    .equ X1H, X1+4
+    .equ C0H, C0+4
+    .equ C1H, C1+4
+    .equ C2H, C2+4
+    .equ C3H, C3+4
+    .equ C4H, C4+4
+    .equ R0H, R0+4
+    .equ R1H, R1+4
+
+    //r seen as four consecutive 32 bit words
+    .equ R32_0, R0L
+    .equ R32_1, R0H
+    .equ R32_2, R1L
+    .equ R32_3, R1H
+
+
+.type	drygascon128_g_v6m, %function
+//drygascon128_g_v6m: clears r, then runs 'rounds' rounds on c.
+//After each round the new c0..c3 words are XORed into r.
+//The main loop and the exit code below are also used by drygascon128_f_v6m,
+//which branches to drygascon128_g_v6m_main_loop with the same stack layout.
+drygascon128_g_v6m:
+	//r0: state: c,r,x
+	//r1: rounds
+	push	{r4, r5, r6, r7, lr}
+	//stack vars (set up by the push {r0,r5,r6} below):
+    // 8 round (counts down from rounds-1)
+	// 4 base address used to look up the round constants
+	// 0 state address
+
+    //r=0
+    movs    r5,#0
+    str     r5,[r0,#R32_0]
+    str     r5,[r0,#R32_1]
+    str     r5,[r0,#R32_2]
+    str     r5,[r0,#R32_3]
+
+    //round=r6=rounds-1;
+    subs    r6,r1,#1
+    //base = round_cst+12-rounds
+    adr		r5, round_cst
+    adds    r5,r5,#12
+    subs    r5,r5,r1
+
+    push	{r0,r5,r6}
+
+	//load the lower half of c0..c4 (r0 is loaded last as it holds the state address)
+	ldr		r4,[r0,#C4L]
+	ldr		r3,[r0,#C3L]
+	ldr		r2,[r0,#C2L]
+	ldr		r1,[r0,#C1L]
+	ldr		r0,[r0,#C0L]
+
+    //loop entry
+	//assume rounds>0 (the value of r1 at function entry; r1 now holds c1 lower half)
+drygascon128_g_v6m_main_loop:
+    //r0~r4: lower half of each words of the state
+    //r5: base for round constants
+    //r6: round, counting from rounds-1 to 0
+
+    //r6 = round constant, read from the round_cst table at base+round
+    ldrb 	r6,[r5,r6]
+        // addition of round constant
+    //r2 ^= r6;
+    eors	r2,r2,r6
+
+    // substitution layer, lower half
+	eors	r0,r0,r4
+    eors	r4,r4,r3
+    eors	r2,r2,r1
+
+	mvns	r5,r0
+    mvns	r6,r3
+    mvns	r7,r4
+	ands	r5,r5,r1
+    ands	r6,r6,r4
+    eors	r4,r4,r5
+
+    ands	r7,r7,r0
+    mvns	r5,r2
+    ands	r5,r5,r3
+    eors	r3,r3,r7
+
+    mvns	r7,r1
+    ands	r7,r7,r2
+    eors	r2,r2,r6
+
+    eors	r3,r3,r2
+    mvns	r2,r2
+
+    eors	r0,r0,r7
+    eors	r1,r1,r5
+	eors	r1,r1,r0
+    eors	r0,r0,r4
+
+    //save the lower halves to the state and load the upper halves instead
+    ldr		r7,[sp,#0]
+    str		r4,[r7,#C4L]
+    str		r3,[r7,#C3L]
+    str		r2,[r7,#C2L]
+    str		r1,[r7,#C1L]
+    str		r0,[r7,#C0L]
+
+    ldr		r4,[r7,#C4H]
+    ldr		r3,[r7,#C3H]
+    ldr		r2,[r7,#C2H]
+    ldr		r1,[r7,#C1H]
+    ldr		r0,[r7,#C0H]
+
+    // substitution layer, upper half
+	eors	r0,r0,r4
+    eors	r4,r4,r3
+    eors	r2,r2,r1
+
+	mvns	r5,r0
+    mvns	r6,r3
+    mvns	r7,r4
+	ands	r5,r5,r1
+    ands	r6,r6,r4
+    eors	r4,r4,r5
+
+    ands	r7,r7,r0
+    mvns	r5,r2
+    ands	r5,r5,r3
+    eors	r3,r3,r7
+
+    mvns	r7,r1
+    ands	r7,r7,r2
+    eors	r2,r2,r6
+
+    eors	r3,r3,r2
+    mvns	r2,r2
+
+    eors	r0,r0,r7
+    eors	r1,r1,r5
+	eors	r1,r1,r0
+    eors	r0,r0,r4
+
+    // linear diffusion layer
+    //here r0~r4 hold the upper halves, the lower halves are read back from the state
+    ldr		r7,[sp,#0]
+
+    //c4 ^= gascon_rotr64_interleaved(c4, 40) ^ gascon_rotr64_interleaved(c4, 7);
+    //c4 high part
+    movs    r6,r4
+    movs    r5,#(20)
+    rors    r4,r4,r5
+    eors    r6,r6,r4
+    ldr     r5,[r7,#C4L]
+    //r7 is used as scratch for the rotation amount, state address reloaded right after
+    movs    r7,#(4)
+    rors    r5,r5,r7
+    eors    r6,r6,r5
+    ldr		r7,[sp,#0]
+    str     r6,[r7,#C4H]
+    //c4 low part
+    movs    r7,#(32-4)
+    rors    r5,r5,r7
+    movs    r6,r5
+    movs    r7,#((32-20+3)%32)
+    rors    r4,r4,r7
+    eors    r4,r4,r6
+    movs    r7,#(20)
+    rors    r5,r5,r7
+    eors    r4,r4,r5
+    ldr		r7,[sp,#0]
+    //c4 low part is written to the state now: r4 is the scratch register from here on
+    str     r4,[r7,#C4L]
+
+    //c0 ^= gascon_rotr64_interleaved(c0, 28) ^ gascon_rotr64_interleaved(c0, 19);
+    //c0 high part
+    movs    r6,r0
+    movs    r5,#(14)
+    rors    r0,r0,r5
+    eors    r6,r6,r0
+    ldr     r5,[r7,#C0L]
+    movs    r4,#(10)
+    rors    r5,r5,r4
+    eors    r6,r6,r5
+    str     r6,[r7,#C0H]
+    //r word 1 ^= new c0 high part
+    ldr    r4,[r7,#R32_1]
+    eors    r4,r4,r6
+    str     r4,[r7,#R32_1]
+    //c0 low part
+    movs    r4,#(32-10)
+    rors    r5,r5,r4
+    movs    r6,r5
+    movs    r4,#((32-14+9)%32)
+    rors    r0,r0,r4
+    eors    r0,r0,r6
+    movs    r4,#(14)
+    rors    r5,r5,r4
+    eors    r0,r0,r5
+    //r word 0 ^= new c0 low part
+    ldr    r4,[r7,#R32_0]
+    eors    r4,r4,r0
+    str    r4,[r7,#R32_0]
+
+    //c1 ^= gascon_rotr64_interleaved(c1, 38) ^ gascon_rotr64_interleaved(c1, 61);
+    //c1 high part
+    movs    r6,r1
+    movs    r5,#(19)
+    rors    r1,r1,r5
+    eors    r6,r6,r1
+    ldr     r5,[r7,#C1L]
+    movs    r4,#(31)
+    rors    r5,r5,r4
+    eors    r6,r6,r5
+    str     r6,[r7,#C1H]
+    //r word 3 ^= new c1 high part
+    ldr    r4,[r7,#R32_3]
+    eors    r4,r4,r6
+    str     r4,[r7,#R32_3]
+    //c1 low part
+    movs    r4,#(32-31)
+    rors    r5,r5,r4
+    movs    r6,r5
+    movs    r4,#((32-19+30)%32)
+    rors    r1,r1,r4
+    eors    r1,r1,r6
+    movs    r4,#(19)
+    rors    r5,r5,r4
+    eors    r1,r1,r5
+    //r word 2 ^= new c1 low part
+    ldr    r4,[r7,#R32_2]
+    eors    r4,r4,r1
+    str    r4,[r7,#R32_2]
+
+    //c2 ^= gascon_rotr64_interleaved(c2, 6) ^ gascon_rotr64_interleaved(c2, 1);
+    //c2 high part
+    movs    r6,r2
+    movs    r5,#(3)
+    rors    r2,r2,r5
+    eors    r6,r6,r2
+    ldr     r5,[r7,#C2L]
+    movs    r4,#(1)
+    rors    r5,r5,r4
+    eors    r6,r6,r5
+    str     r6,[r7,#C2H]
+    //r word 0 ^= new c2 high part
+    ldr    r4,[r7,#R32_0]
+    eors    r4,r4,r6
+    str     r4,[r7,#R32_0]
+    //c2 low part
+    movs    r4,#(32-1)
+    rors    r5,r5,r4
+    movs    r6,r5
+    movs    r4,#((32-3+0)%32)
+    rors    r2,r2,r4
+    eors    r2,r2,r6
+    movs    r4,#(3)
+    rors    r5,r5,r4
+    eors    r2,r2,r5
+    //r word 3 ^= new c2 low part
+    ldr    r4,[r7,#R32_3]
+    eors    r4,r4,r2
+    str    r4,[r7,#R32_3]
+
+    //c3 ^= gascon_rotr64_interleaved(c3, 10) ^ gascon_rotr64_interleaved(c3, 17);
+    //c3 high part
+    movs    r6,r3
+    movs    r5,#(5)
+    rors    r3,r3,r5
+    eors    r6,r6,r3
+    ldr     r5,[r7,#C3L]
+    movs    r4,#(9)
+    rors    r5,r5,r4
+    eors    r6,r6,r5
+    str     r6,[r7,#C3H]
+    //r word 2 ^= new c3 high part
+    ldr    r4,[r7,#R32_2]
+    eors    r4,r4,r6
+    str     r4,[r7,#R32_2]
+    //c3 low part
+    movs    r4,#(32-9)
+    rors    r5,r5,r4
+    movs    r6,r5
+    movs    r4,#((32-5+8)%32)
+    rors    r3,r3,r4
+    eors    r3,r3,r6
+    movs    r4,#(5)
+    rors    r5,r5,r4
+    eors    r3,r3,r5
+    //r word 1 ^= new c3 low part
+    ldr    r4,[r7,#R32_1]
+    eors    r4,r4,r3
+    str    r4,[r7,#R32_1]
+
+    //restore the loop registers: r4 = c4 lower half, r5 = round constants base
+    ldr     r4,[r7,#C4L]
+    ldr     r5,[sp,#4]
+
+    //round -= 1, leave the loop when it becomes negative
+    ldr		r6,[sp,#8]
+    subs    r6,#1
+    bmi     drygascon128_g_v6m_exit
+
+    str     r6,[sp,#8]
+	b    	drygascon128_g_v6m_main_loop
+drygascon128_g_v6m_exit:
+
+    //c4 lower half is already in the state, write back the other lower halves
+    str		r3,[r7,#C3L]
+	str		r2,[r7,#C2L]
+	str		r1,[r7,#C1L]
+	str		r0,[r7,#C0L]
+
+	//drop the three stack vars (state address, base, round)
+	add		sp,sp,#12
+	pop 	{r4, r5, r6, r7, pc}
+.size	drygascon128_g_v6m, .-drygascon128_g_v6m
+
+.align 2
+.type	drygascon128_f_v6m, %function
+//drygascon128_f_v6m: mixes a 16 byte input and ds into c, then runs the
+//rounds of drygascon128_g_v6m.
+//The input followed by the low 12 bits of ds is cut in 14 chunks of 10 bits.
+//For each chunk, every pair of bits selects one 32 bit word of x which is
+//XORed with the lower half of c0..c4; one round (constant 0xf0) is run
+//between two consecutive chunks.
+//The function ends by branching into drygascon128_g_v6m_main_loop and returns
+//through the exit code of drygascon128_g_v6m.
+drygascon128_f_v6m:
+    //r0:state c r x
+    //r1:input -> shall be 32 bit aligned
+    //r2:ds
+    //r3:rounds
+    push	{r4, r5, r6, r7, lr}
+
+    //stack frame:
+    //0 ~ 28-1: buf (14 halfwords, one per 10 bit chunk)
+    //28 :pointer on c
+    //32 : rounds for g
+    //36 :mix round / g round
+
+    //26 = offset in buf of the first chunk to mix
+    movs    r4,#26
+    push    {r0,r3,r4}
+    sub     sp,sp,#28
+
+    //load 10 bit mask in r4 = 0x3FF
+    movs    r4,#0xFF
+    lsls    r4,r4,#2
+    adds    r4,r4,#3
+
+    movs    r7,#0
+    //r=0
+    str     r7,[r0,#R32_0]
+    str     r7,[r0,#R32_1]
+    str     r7,[r0,#R32_2]
+    str     r7,[r0,#R32_3]
+
+    //r7 = sp
+    add     r7,r7,sp
+
+    //split the input in 10 bit chunks, starting from the low bits of the
+    //first word; chunk k is stored at buf offset 26-2*k
+    ldr     r3,[r1]
+    movs    r5,r4
+    ands    r5,r5,r3
+    strh    r5,[r7,#0+26]
+
+    lsrs    r3,r3,#10
+    movs    r5,r4
+    ands    r5,r5,r3
+    strh    r5,[r7,#0+24]
+
+    lsrs    r3,r3,#10
+    movs    r5,r4
+    ands    r5,r5,r3
+    strh    r5,[r7,#0+22]
+
+    //2 bits left from the first word, completed with 8 bits of the second word
+    lsrs    r5,r3,#10
+    ldr     r3,[r1,#4]
+    lsls    r6,r3,#2
+    lsrs    r3,r3,#8
+    orrs    r6,r6,r5
+    movs    r5,r4
+    ands    r5,r5,r6
+    strh    r5,[r7,#0+20]
+
+    movs    r5,r4
+    ands    r5,r5,r3
+    strh    r5,[r7,#0+18]
+
+    lsrs    r3,r3,#10
+    movs    r5,r4
+    ands    r5,r5,r3
+    strh    r5,[r7,#0+16]
+
+    //4 bits left from the second word, completed with 6 bits of the third word
+    lsrs    r5,r3,#10
+    ldr     r3,[r1,#8]
+    lsls    r6,r3,#4
+    lsrs    r3,r3,#6
+    orrs    r6,r6,r5
+    movs    r5,r4
+    ands    r5,r5,r6
+    strh    r5,[r7,#0+14]
+
+    movs    r5,r4
+    ands    r5,r5,r3
+    strh    r5,[r7,#0+12]
+
+    lsrs    r3,r3,#10
+    movs    r5,r4
+    ands    r5,r5,r3
+    strh    r5,[r7,#0+10]
+
+    //6 bits left from the third word, completed with 4 bits of the fourth word
+    lsrs    r5,r3,#10
+    ldr     r3,[r1,#12]
+    lsls    r6,r3,#6
+    lsrs    r3,r3,#4
+    orrs    r6,r6,r5
+    movs    r5,r4
+    ands    r5,r5,r6
+    strh    r5,[r7,#0+8]
+
+    movs    r5,r4
+    ands    r5,r5,r3
+    strh    r5,[r7,#0+6]
+
+    lsrs    r3,r3,#10
+    movs    r5,r4
+    ands    r5,r5,r3
+    strh    r5,[r7,#0+4]
+
+    //8 bits left from the fourth word, completed with the 2 low bits of ds
+    lsrs    r5,r3,#10
+    lsls    r6,r2,#8
+    lsrs    r3,r2,#2
+    orrs    r6,r6,r5
+    movs    r5,r4
+    ands    r5,r5,r6
+    strh    r5,[r7,#0+2]
+
+    //last chunk: the next 10 bits of ds
+    movs    r5,r4
+    ands    r5,r5,r3
+    strh    r5,[r7,#0+0]
+
+    //r7 = offset in buf of the chunk to mix
+    movs    r7,#26
+
+drygascon128_f_v6m_mix128_main_loop:
+    //r6 = current 10 bit chunk
+    movs    r6,#0
+    add     r6,r6,sp
+    ldrh 	r6,[r6,r7]
+
+    //r7 = state address, r5 = address of x, r4 = mask to get a word offset in x
+    ldr     r5,[sp,#28]
+    movs    r7,r5
+    adds    r5,r5,#X0
+    movs    r4,#0xc
+
+    //c0 lower half ^ x word selected by bits 1:0 of the chunk
+    lsls    r0,r6,#2
+    ands    r0,r0,r4
+    ldr     r1,[r5,r0]
+    ldr     r0,[r7,#0*8]
+    eors    r0,r0,r1
+
+    //c1 lower half ^ x word selected by bits 3:2 of the chunk
+    lsrs    r1,r6,#0
+    ands    r1,r1,r4
+    ldr     r2,[r5,r1]
+    ldr     r1,[r7,#1*8]
+    eors    r1,r1,r2
+
+    //c2 lower half ^ x word selected by bits 5:4 of the chunk
+    lsrs    r2,r6,#2
+    ands    r2,r2,r4
+    ldr     r3,[r5,r2]
+    ldr     r2,[r7,#2*8]
+    eors    r2,r2,r3
+
+    //c3 lower half ^ x word selected by bits 7:6 of the chunk
+    lsrs    r3,r6,#4
+    ands    r3,r3,r4
+    ldr     r4,[r5,r3]
+    ldr     r3,[r7,#3*8]
+    eors    r3,r3,r4
+
+    //c4 lower half ^ x word selected by bits 9:8 of the chunk
+    //(no mask needed, the chunk has no bit above bit 9)
+    lsrs    r4,r6,#6+2
+    lsls    r4,r4,#2
+    ldr     r6,[r5,r4]
+    ldr     r4,[r7,#4*8]
+    eors    r4,r4,r6
+
+    //move to the next chunk; once the chunk at offset 0 has been mixed
+    //the offset becomes negative and the final rounds are started
+    ldr		r6,[sp,#36]
+    subs    r6,#2
+    bpl     drygascon128_f_v6m_mix128_coreround
+    b		drygascon128_f_v6m_mix128_exit
+drygascon128_f_v6m_mix128_coreround:
+    str     r6,[sp,#36]
+
+    //the round constant is always 0xf0 for the rounds done between two chunks
+    movs    r6,#0xf0
+        // addition of round constant
+    //r2 ^= r6;
+    eors	r2,r2,r6
+
+    // substitution layer, lower half
+	eors	r0,r0,r4
+    eors	r4,r4,r3
+    eors	r2,r2,r1
+
+	mvns	r5,r0
+    mvns	r6,r3
+    mvns	r7,r4
+	ands	r5,r5,r1
+    ands	r6,r6,r4
+    eors	r4,r4,r5
+
+    ands	r7,r7,r0
+    mvns	r5,r2
+    ands	r5,r5,r3
+    eors	r3,r3,r7
+
+    mvns	r7,r1
+    ands	r7,r7,r2
+    eors	r2,r2,r6
+
+    eors	r3,r3,r2
+    mvns	r2,r2
+
+    eors	r0,r0,r7
+    eors	r1,r1,r5
+	eors	r1,r1,r0
+    eors	r0,r0,r4
+
+    //save the lower halves to the state and load the upper halves instead
+    ldr		r7,[sp,#28]
+    str		r4,[r7,#C4L]
+    str		r3,[r7,#C3L]
+    str		r2,[r7,#C2L]
+    str		r1,[r7,#C1L]
+    str		r0,[r7,#C0L]
+
+    ldr		r4,[r7,#C4H]
+    ldr		r3,[r7,#C3H]
+    ldr		r2,[r7,#C2H]
+    ldr		r1,[r7,#C1H]
+    ldr		r0,[r7,#C0H]
+
+    // substitution layer, upper half
+	eors	r0,r0,r4
+    eors	r4,r4,r3
+    eors	r2,r2,r1
+
+	mvns	r5,r0
+    mvns	r6,r3
+    mvns	r7,r4
+	ands	r5,r5,r1
+    ands	r6,r6,r4
+    eors	r4,r4,r5
+
+    ands	r7,r7,r0
+    mvns	r5,r2
+    ands	r5,r5,r3
+    eors	r3,r3,r7
+
+    mvns	r7,r1
+    ands	r7,r7,r2
+    eors	r2,r2,r6
+
+    eors	r3,r3,r2
+    mvns	r2,r2
+
+    eors	r0,r0,r7
+    eors	r1,r1,r5
+	eors	r1,r1,r0
+    eors	r0,r0,r4
+
+    // linear diffusion layer
+    //unlike in drygascon128_g_v6m, nothing is XORed into r here
+    ldr		r7,[sp,#28]
+
+    //c4 ^= gascon_rotr64_interleaved(c4, 40) ^ gascon_rotr64_interleaved(c4, 7);
+    //c4 high part
+    movs    r6,r4
+    movs    r5,#(20)
+    rors    r4,r4,r5
+    eors    r6,r6,r4
+    ldr     r5,[r7,#C4L]
+    movs    r7,#(4)
+    rors    r5,r5,r7
+    eors    r6,r6,r5
+    ldr		r7,[sp,#28]
+    str     r6,[r7,#C4H]
+    //c4 low part
+    movs    r7,#(32-4)
+    rors    r5,r5,r7
+    movs    r6,r5
+    movs    r7,#((32-20+3)%32)
+    rors    r4,r4,r7
+    eors    r4,r4,r6
+    movs    r7,#(20)
+    rors    r5,r5,r7
+    eors    r4,r4,r5
+    ldr		r7,[sp,#28]
+    //c4 low part is written to the state now: r4 is the scratch register from here on
+    str     r4,[r7,#C4L]
+
+    //c0 ^= gascon_rotr64_interleaved(c0, 28) ^ gascon_rotr64_interleaved(c0, 19);
+    //c0 high part
+    movs    r6,r0
+    movs    r5,#(14)
+    rors    r0,r0,r5
+    eors    r6,r6,r0
+    ldr     r5,[r7,#C0L]
+    movs    r4,#(10)
+    rors    r5,r5,r4
+    eors    r6,r6,r5
+    str     r6,[r7,#C0H]
+    //c0 low part
+    movs    r4,#(32-10)
+    rors    r5,r5,r4
+    movs    r6,r5
+    movs    r4,#((32-14+9)%32)
+    rors    r0,r0,r4
+    eors    r0,r0,r6
+    movs    r4,#(14)
+    rors    r5,r5,r4
+    eors    r0,r0,r5
+
+    //c1 ^= gascon_rotr64_interleaved(c1, 38) ^ gascon_rotr64_interleaved(c1, 61);
+    //c1 high part
+    movs    r6,r1
+    movs    r5,#(19)
+    rors    r1,r1,r5
+    eors    r6,r6,r1
+    ldr     r5,[r7,#C1L]
+    movs    r4,#(31)
+    rors    r5,r5,r4
+    eors    r6,r6,r5
+    str     r6,[r7,#C1H]
+    //c1 low part
+    movs    r4,#(32-31)
+    rors    r5,r5,r4
+    movs    r6,r5
+    movs    r4,#((32-19+30)%32)
+    rors    r1,r1,r4
+    eors    r1,r1,r6
+    movs    r4,#(19)
+    rors    r5,r5,r4
+    eors    r1,r1,r5
+
+    //c2 ^= gascon_rotr64_interleaved(c2, 6) ^ gascon_rotr64_interleaved(c2, 1);
+    //c2 high part
+    movs    r6,r2
+    movs    r5,#(3)
+    rors    r2,r2,r5
+    eors    r6,r6,r2
+    ldr     r5,[r7,#C2L]
+    movs    r4,#(1)
+    rors    r5,r5,r4
+    eors    r6,r6,r5
+    str     r6,[r7,#C2H]
+    //c2 low part
+    movs    r4,#(32-1)
+    rors    r5,r5,r4
+    movs    r6,r5
+    movs    r4,#((32-3+0)%32)
+    rors    r2,r2,r4
+    eors    r2,r2,r6
+    movs    r4,#(3)
+    rors    r5,r5,r4
+    eors    r2,r2,r5
+
+    //c3 ^= gascon_rotr64_interleaved(c3, 10) ^ gascon_rotr64_interleaved(c3, 17);
+    //c3 high part
+    movs    r6,r3
+    movs    r5,#(5)
+    rors    r3,r3,r5
+    eors    r6,r6,r3
+    ldr     r5,[r7,#C3L]
+    movs    r4,#(9)
+    rors    r5,r5,r4
+    eors    r6,r6,r5
+    str     r6,[r7,#C3H]
+    //c3 low part
+    movs    r4,#(32-9)
+    rors    r5,r5,r4
+    movs    r6,r5
+    movs    r4,#((32-5+8)%32)
+    rors    r3,r3,r4
+    eors    r3,r3,r6
+    movs    r4,#(5)
+    rors    r5,r5,r4
+    eors    r3,r3,r5
+
+    //write back the lower halves (c4 is already stored): the mix step reads them from the state
+    str		r3,[r7,#C3L]
+    str		r2,[r7,#C2L]
+    str		r1,[r7,#C1L]
+    str		r0,[r7,#C0L]
+
+    //r7 = offset in buf of the next chunk
+    ldr		r7,[sp,#36]
+
+    b    	drygascon128_f_v6m_mix128_main_loop
+drygascon128_f_v6m_mix128_exit:
+    //r0~r4 hold the lower halves of c0..c4 after the last mix, as expected
+    //at drygascon128_g_v6m_main_loop
+    //r7 = rounds for g
+    ldr     r7,[sp,#32]
+    //round=r6=rounds-1;
+    subs    r6,r7,#1
+    //base = round_cst+12-rounds
+    adr		r5, round_cst
+    adds    r5,r5,#12
+    subs    r5,r5,r7
+
+    //release buf: the stack vars are then laid out as in drygascon128_g_v6m
+    //(0 state address, 4 base, 8 round)
+    add		sp,sp,#28
+    str     r5,[sp,#4]
+    str     r6,[sp,#8]
+
+    //push    {r0,r1,r2,r3}
+    //ldr     r0,[sp,#16]
+    //bl      print_state
+    //pop     {r0,r1,r2,r3}
+
+    //continue in drygascon128_g_v6m, the return is done by its exit code
+    b       drygascon128_g_v6m_main_loop
+
+//round constants, one byte per round.
+//They are read at round_cst+12-rounds+round with round counting down from
+//rounds-1, so the first round executed uses the last byte (0xf0).
+.align 2
+round_cst:
+.byte 0x4b
+.byte 0x5a
+.byte 0x69
+.byte 0x78
+.byte 0x87
+.byte 0x96
+.byte 0xa5
+.byte 0xb4
+.byte 0xc3
+.byte 0xd2
+.byte 0xe1
+.byte 0xf0
+.align 2
+
+.size	drygascon128_f_v6m, .-drygascon128_f_v6m
+
+
+#endif
--- a/drygascon/Implementations/crypto_aead/drygascon128/add_arm_cortex-m/drygascon128_arm-v7m.S
+++ b/drygascon/Implementations/crypto_aead/drygascon128/add_arm_cortex-m/drygascon128_arm-v7m.S
+/**
+DryGascon128 'v7m implementation'
+Sebastien Riou, May 27th 2020
+
+Implementation optimized for ARM-Cortex-M7/M4/M3 (Size and Speed)
+Safe against timing attack on X look up operations under
+the following conditions: (safe if at least one line is true)
+- System without cache
+- State stored in non cacheable memory (like DTCM)
+- Cache lines are 16 bytes or larger AND X is 16 bytes aligned
+
+
+Notes:
+- Arm Cortex-M7 Processor Technical Reference Manual Revision r1p2 states
+  that data cache line size is 32 bytes.
+- Microchip app note TB3186 shows that Microchip use 16 bytes cache lines.
+- ST does not give a general statement about cache lines for its products based
+on M3 and M4. That said STM32F411xC/E datasheet (RM0383
+Reference manual) shows data cache lines of 16 bytes.
+- In the unlikely case in which none of these conditions can be met,
+the 'v7m_fpu_x' implementation can be used to prevent this attack.
+
+Note that implementation 'v7m_fpu' is faster (but requires FPU).
+*/
+#if defined(__DRYGASCON_ARM_SELECTOR_H__)
+//Assembler setup: unified syntax, 16-bit (Thumb) code, three exported entry points
+.cpu cortex-m3
+.syntax unified
+.code	16
+.thumb_func
+
+.align	1
+.global	drygascon128_g_v7m
+.global	drygascon128_f_v7m
+.global	drygascon128_g0_v7m
+
+    //Byte offsets into the state structure pointed to by r0 at function entry.
+    //C0..C4 are five 8-byte entries (offsets 0..39), R0/R1 start at 48 and
+    //X0/X1 start at 64. Offsets 40..47 are not named here.
+    .equ C0, 0
+    .equ C1, C0+8
+    .equ C2, C0+16
+    .equ C3, C0+24
+    .equ C4, C0+32
+    .equ R0, 48
+    .equ R1, R0+8
+    .equ X0, 64
+    .equ X1, X0+8
+
+    //'L' names: the 32-bit word at the lower address of each 8-byte entry
+    .equ X0L, X0
+    .equ X1L, X1
+    .equ C0L, C0
+    .equ C1L, C1
+    .equ C2L, C2
+    .equ C3L, C3
+    .equ C4L, C4
+    .equ R0L, R0
+    .equ R1L, R1
+
+    //'H' names: the 32-bit word at the higher address (+4) of each 8-byte entry
+    .equ X0H, X0+4
+    .equ X1H, X1+4
+    .equ C0H, C0+4
+    .equ C1H, C1+4
+    .equ C2H, C2+4
+    .equ C3H, C3+4
+    .equ C4H, C4+4
+    .equ R0H, R0+4
+    .equ R1H, R1+4
+
+    //R viewed as four consecutive 32-bit words (offsets 48, 52, 56, 60)
+    .equ R32_0, R0L
+    .equ R32_1, R0H
+    .equ R32_2, R1L
+    .equ R32_3, R1H
+
+
+
+//drygascon128_g_v7m: clears R, then runs 'rounds' rounds on C; after each
+//round the C0..C3 words are XORed into the four R words kept in memory.
+//The main loop below is also entered by a branch from drygascon128_f_v7m,
+//which builds the same 3-word stack frame and register state first.
+.type	drygascon128_g_v7m, %function
+drygascon128_g_v7m:
+    //r0: state: c,r,x
+    //r1: rounds
+    push {r4, r5, r6, r7, r8, r9, r10, r11, r12, lr}
+    //stack vars:
+    // 8 round
+    // 4 rounds (base address for lookups)
+    // 0 state address
+
+    //r=0
+    movs    r10,#0
+    str     r10,[r0,#R32_0]
+    str     r10,[r0,#R32_1]
+    str     r10,[r0,#R32_2]
+    str     r10,[r0,#R32_3]
+
+    //round=r11=rounds-1;
+    subs    r11,r1,#1
+    //base = round_cst+12-rounds
+    //(the first iteration therefore reads round_cst[11]=0xf0 and the last
+    // iteration reads round_cst[12-rounds])
+    adr     r10, round_cst
+    adds    r10,r10,#12
+    subs    r10,r10,r1
+
+    //frame: [sp,#0]=state, [sp,#4]=base, [sp,#8]=round
+    push	{r0,r10,r11}
+
+    //Load C
+    //r0/r1=C0L/C0H, r2/r3=C1L/C1H, r4/r5=C2L/C2H, r6/r7=C3L/C3H, r8/r9=C4L/C4H
+    adds    r14,r0,#C0
+    LDMIA.W r14,{r0,r1,r2,r3,r4,r5,r6,r7,r8,r9}
+
+    //loop entry
+    //assume r11>=0 at entry (rounds>=1): there is no check before the first lookup
+drygascon128_g_v7m_main_loop:
+    //r0~r9: c
+    //r10: base for round constants
+    //r11: round, counting from rounds-1 to 0
+
+    //r11 = ((0xf - r11) << 4) | r11;
+    //(looked up as a byte in the round_cst table)
+    ldrb    r11,[r10,r11]
+    //state:
+    //r0 to r9: c
+    //r11: constant to add as round constant
+    //r14: pointer on C
+
+    // addition of round constant
+    //C2L ^= round constant;
+    eors    r4,r4,r11
+
+    // substitution layer, lower half
+    // (works on the even registers r0,r2,r4,r6,r8; r10,r11,r12 are scratch)
+    eors    r0,r0,r8
+    eors    r8,r8,r6
+    eors    r4,r4,r2
+    mvns    r10,r0
+    mvns    r11,r6
+    mvns    r12,r8
+    ands    r10,r10,r2
+    ands    r11,r11,r8
+    eors    r8,r8,r10
+    ands    r12,r12,r0
+    mvns    r10,r4
+    ands    r10,r10,r6
+    eors    r6,r6,r12
+    mvns    r12,r2
+    ands    r12,r12,r4
+    eors    r4,r4,r11
+    eors    r6,r6,r4
+    mvns    r4,r4
+    eors    r0,r0,r12
+    eors    r2,r2,r10
+    eors    r2,r2,r0
+    eors    r0,r0,r8
+
+    // substitution layer, upper half
+    // (same sequence on the odd registers r1,r3,r5,r7,r9)
+    eors    r1,r1,r9
+    eors    r9,r9,r7
+    eors    r5,r5,r3
+    mvns    r10,r1
+    mvns    r11,r7
+    mvns    r12,r9
+    ands    r10,r10,r3
+    ands    r11,r11,r9
+    eors    r9,r9,r10
+    ands    r12,r12,r1
+    mvns    r10,r5
+    ands    r10,r10,r7
+    eors    r7,r7,r12
+    mvns    r12,r3
+    ands    r12,r12,r5
+    eors    r5,r5,r11
+    eors    r7,r7,r5
+    mvns    r5,r5
+    eors    r1,r1,r12
+    eors    r3,r3,r10
+    eors    r3,r3,r1
+    eors    r1,r1,r9
+
+
+    // linear diffusion layer
+
+    //c4 ^= gascon_rotr64_interleaved(c4, 40) ^ gascon_rotr64_interleaved(c4, 7);
+    //c4 high part
+    rors    r11,r9,#(20)
+    eors    r9,r11,r9
+    rors    r10,r8,#(4)
+    eors    r9,r10,r9
+    //c4 low part
+    rors    r11,r11,#((32-20+3)%32)
+    eors    r11,r11,r8
+    rors    r10,r8,#(20)
+    eors    r8,r10,r11
+
+    //c0 ^= gascon_rotr64_interleaved(c0, 28) ^ gascon_rotr64_interleaved(c0, 19);
+    //c0 high part
+    rors    r11,r1,#(14)
+    eors    r1,r11,r1
+    rors    r10,r0,#(10)
+    eors    r1,r10,r1
+    //accumulate: R32_1 ^= new c0 high word
+    ldr     r12,[r14,#R32_1-C0]
+    eors    r12,r12,r1
+    str     r12,[r14,#R32_1-C0]
+    //c0 low part
+    rors    r11,r11,#((32-14+9)%32)
+    eors    r11,r11,r0
+    rors    r10,r0,#(14)
+    eors    r0,r10,r11
+    //accumulate: R32_0 ^= new c0 low word
+    ldr     r12,[r14,#R32_0-C0]
+    eors    r12,r12,r0
+    str     r12,[r14,#R32_0-C0]
+
+    //c1 ^= gascon_rotr64_interleaved(c1, 38) ^ gascon_rotr64_interleaved(c1, 61);
+    //c1 high part
+    rors    r11,r3,#(19)
+    eors    r3,r11,r3
+    rors    r10,r2,#(31)
+    eors    r3,r10,r3
+    //accumulate: R32_3 ^= new c1 high word
+    ldr     r12,[r14,#R32_3-C0]
+    eors    r12,r12,r3
+    str     r12,[r14,#R32_3-C0]
+    //c1 low part
+    rors    r11,r11,#((32-19+30)%32)
+    eors    r11,r11,r2
+    rors    r10,r2,#(19)
+    eors    r2,r10,r11
+    //accumulate: R32_2 ^= new c1 low word
+    ldr     r12,[r14,#R32_2-C0]
+    eors    r12,r12,r2
+    str     r12,[r14,#R32_2-C0]
+
+    //c2 ^= gascon_rotr64_interleaved(c2, 6) ^ gascon_rotr64_interleaved(c2, 1);
+    //c2 high part
+    rors    r11,r5,#(3)
+    eors    r5,r11,r5
+    rors    r10,r4,#(1)
+    eors    r5,r10,r5
+    //accumulate: R32_0 ^= new c2 high word
+    ldr     r12,[r14,#R32_0-C0]
+    eors    r12,r12,r5
+    str     r12,[r14,#R32_0-C0]
+    //c2 low part
+    rors    r11,r11,#((32-3+0)%32)
+    eors    r11,r11,r4
+    rors    r10,r4,#(3)
+    eors    r4,r10,r11
+    //accumulate: R32_3 ^= new c2 low word
+    ldr     r12,[r14,#R32_3-C0]
+    eors    r12,r12,r4
+    str     r12,[r14,#R32_3-C0]
+
+    //c3 ^= gascon_rotr64_interleaved(c3, 10) ^ gascon_rotr64_interleaved(c3, 17);
+    //c3 high part
+    rors    r11,r7,#(5)
+    eors    r7,r11,r7
+    rors    r10,r6,#(9)
+    eors    r7,r10,r7
+    //accumulate: R32_2 ^= new c3 high word
+    ldr     r12,[r14,#R32_2-C0]
+    eors    r12,r12,r7
+    str     r12,[r14,#R32_2-C0]
+    //c3 low part
+    rors    r11,r11,#((32-5+8)%32)
+    eors    r11,r11,r6
+    rors    r10,r6,#(5)
+    eors    r6,r10,r11
+    //accumulate: R32_1 ^= new c3 low word
+    ldr     r12,[r14,#R32_1-C0]
+    eors    r12,r12,r6
+    str     r12,[r14,#R32_1-C0]
+
+    //state:
+    //r0 to r9: c
+    //r10,r11,r12 destroyed
+
+    //reload the round-constant base (r10 was used as scratch above)
+    ldr     r10,[sp,#4]
+
+    //decrement the round counter kept on the stack; leave once it goes negative
+    ldr     r11,[sp,#8]
+    subs    r11,#1
+    bmi     drygascon128_g_v7m_exit
+
+    str     r11,[sp,#8]
+    b       drygascon128_g_v7m_main_loop
+drygascon128_g_v7m_exit:
+    //update C
+    STMIA.W r14,{r0,r1,r2,r3,r4,r5,r6,r7,r8,r9}
+
+    //drop the 3-word frame and return
+    add    sp,sp,#12
+    pop    {r4, r5, r6, r7, r8, r9, r10, r11, r12, pc}
+.size	drygascon128_g_v7m, .-drygascon128_g_v7m
+
+.align 2
+//drygascon128_f_v7m: clears R, then mixes the 16-byte input block followed by
+//the DS value into C, 10 bits per step (14 steps). Each 2-bit group selects
+//one of the four 32-bit X words, which is XORed into the lower-addressed word
+//of C0..C4. A round with constant 0xf0 (no accumulation into R) is run after
+//every step except the last. Finally the stack frame is reshaped and
+//execution continues in drygascon128_g_v7m_main_loop, which runs 'rounds'
+//accumulating rounds and returns to the caller.
+.type	drygascon128_f_v7m, %function
+drygascon128_f_v7m:
+    //r0:state
+    //r1:input
+    //r2:ds
+    //r3:rounds
+    push	{r4, r5, r6, r7, r8, r9, r10, r11, r12, lr}
+
+    //stack frame:
+    //0: pointer on input
+    //4: DS value
+    //8 :pointer on state
+    //12 : rounds for g
+    //16 :mix round / g round
+
+    movs    r10,#0 //init of input bit counter
+    push    {r0,r3,r10} //upper 3 words become the frame used by drygascon128_g_v7m
+    push    {r1,r2}
+    //r=0
+    str     r10,[r0,#R32_0]
+    str     r10,[r0,#R32_1]
+    str     r10,[r0,#R32_2]
+    str     r10,[r0,#R32_3]
+
+    //Load C
+    adds	r11,r0,#C0
+    LDMIA.W r11,{r0,r1,r2,r3,r4,r5,r6,r7,r8,r9}
+
+
+drygascon128_f_v7m_mix128_main_loop:
+    //r10 is input bit counter
+    ldr     r11,[sp,#0] //r11 is pointer on input
+
+    //r10 r12 shift
+    // 0   0   0
+    // 10  1   2
+    // 20  2   4
+    // 30  3   6
+    // 40  5   0
+    // 50  6   2
+    // 60  7   4
+    // 70  8   6
+    // 80  10  0
+    // 90  11  2
+    // 100 12  4
+    // 110 13  6
+    // 120 15  0
+    // 130 16  2 --> we do that operation for 2 last bits in a special last loop
+
+    cmp     r10,#120
+    bne     drygascon128_f_v7m_mix128_main_loop.regular
+
+    //we execute this only during the penultimate operation
+    //r14 = input[15] | (DS << 8): the 2 lsb of DS follow the last input byte
+    ldrb    r14,[r11,#15]
+    ldr     r10,[sp,#4]
+    lsl     r10,r10,#8
+    eors    r14,r14,r10
+    b       drygascon128_f_v7m_mix128_main_loop.core
+
+drygascon128_f_v7m_mix128_main_loop.regular:
+    //r12 is base byte: byte offset to read from input buffer
+    lsr     r12,r10,#3  //divide by 8 to get base byte
+    //r10 becomes shift
+    lsl     r14,r12,#3
+    sub     r10,r10,r14
+
+    //Only 10 bits starting at bit 'shift' (shift <= 6) are consumed below, so
+    //a halfword holds everything needed. A word load here would read
+    //input[16], one byte past the 16-byte block, when the counter is 110.
+    //The address may be unaligned: unaligned halfword access must be allowed.
+    ldrh    r14,[r11,r12]
+    lsr     r14,r14,r10
+
+drygascon128_f_v7m_mix128_main_loop.core:
+    //r14 bits 0..9: five 2-bit indexes, each scaled to a word offset (0,4,8,12)
+    ldr     r10,[sp,#8]
+    adds    r10,r10,#X0
+
+    //bits 0,1 select the X word XORed into C0L
+    lsls    r11,r14,#2
+    ands    r11,r11,#0xc
+    ldr     r11,[r10,r11]
+    eors    r0,r0,r11
+
+    //bits 2,3 select the X word XORed into C1L
+    lsrs    r11,r14,#0
+    ands    r11,r11,#0xc
+    ldr     r11,[r10,r11]
+    eors    r2,r2,r11
+
+    //bits 4,5 select the X word XORed into C2L
+    lsrs    r11,r14,#2
+    ands    r11,r11,#0xc
+    ldr     r11,[r10,r11]
+    eors    r4,r4,r11
+
+    //bits 6,7 select the X word XORed into C3L
+    lsrs    r11,r14,#4
+    ands    r11,r11,#0xc
+    ldr     r11,[r10,r11]
+    eors    r6,r6,r11
+
+    //bits 8,9 select the X word XORed into C4L
+    lsrs    r11,r14,#6
+    ands    r11,r11,#0xc
+    ldr     r11,[r10,r11]
+    eors    r8,r8,r11
+
+    //advance the bit counter; 140 means all 14 groups have been mixed
+    ldr		r10,[sp,#16]
+    adds    r10,#10
+    cmp     r10,#140
+    beq     drygascon128_f_v7m_mix128_exit
+drygascon128_f_v7m_mix128_coreround:
+    str     r10,[sp,#16]
+
+    movs    r11,#0xf0
+
+    //state:
+    //r0 to r9: c
+    //r11: constant to add as round constant
+
+    // addition of round constant
+    //C2L ^= round constant;
+    eors    r4,r4,r11
+
+    // substitution layer, lower half
+    eors    r0,r0,r8
+    eors    r8,r8,r6
+    eors    r4,r4,r2
+    mvns    r10,r0
+    mvns    r11,r6
+    mvns    r12,r8
+    ands    r10,r10,r2
+    ands    r11,r11,r8
+    eors    r8,r8,r10
+    ands    r12,r12,r0
+    mvns    r10,r4
+    ands    r10,r10,r6
+    eors    r6,r6,r12
+    mvns    r12,r2
+    ands    r12,r12,r4
+    eors    r4,r4,r11
+    eors    r6,r6,r4
+    mvns    r4,r4
+    eors    r0,r0,r12
+    eors    r2,r2,r10
+    eors    r2,r2,r0
+    eors    r0,r0,r8
+
+    // substitution layer, upper half
+    eors    r1,r1,r9
+    eors    r9,r9,r7
+    eors    r5,r5,r3
+    mvns    r10,r1
+    mvns    r11,r7
+    mvns    r12,r9
+    ands    r10,r10,r3
+    ands    r11,r11,r9
+    eors    r9,r9,r10
+    ands    r12,r12,r1
+    mvns    r10,r5
+    ands    r10,r10,r7
+    eors    r7,r7,r12
+    mvns    r12,r3
+    ands    r12,r12,r5
+    eors    r5,r5,r11
+    eors    r7,r7,r5
+    mvns    r5,r5
+    eors    r1,r1,r12
+    eors    r3,r3,r10
+    eors    r3,r3,r1
+    eors    r1,r1,r9
+
+
+    // linear diffusion layer
+
+    //c4 ^= gascon_rotr64_interleaved(c4, 40) ^ gascon_rotr64_interleaved(c4, 7);
+    //c4 high part
+    rors    r11,r9,#(20)
+    eors    r9,r11,r9
+    rors    r10,r8,#(4)
+    eors    r9,r10,r9
+    //c4 low part
+    rors    r11,r11,#((32-20+3)%32)
+    eors    r11,r11,r8
+    rors    r10,r8,#(20)
+    eors    r8,r10,r11
+
+    //c0 ^= gascon_rotr64_interleaved(c0, 28) ^ gascon_rotr64_interleaved(c0, 19);
+    //c0 high part
+    rors    r11,r1,#(14)
+    eors    r1,r11,r1
+    rors    r10,r0,#(10)
+    eors    r1,r10,r1
+    //c0 low part
+    rors    r11,r11,#((32-14+9)%32)
+    eors    r11,r11,r0
+    rors    r10,r0,#(14)
+    eors    r0,r10,r11
+
+    //c1 ^= gascon_rotr64_interleaved(c1, 38) ^ gascon_rotr64_interleaved(c1, 61);
+    //c1 high part
+    rors    r11,r3,#(19)
+    eors    r3,r11,r3
+    rors    r10,r2,#(31)
+    eors    r3,r10,r3
+    //c1 low part
+    rors    r11,r11,#((32-19+30)%32)
+    eors    r11,r11,r2
+    rors    r10,r2,#(19)
+    eors    r2,r10,r11
+
+    //c2 ^= gascon_rotr64_interleaved(c2, 6) ^ gascon_rotr64_interleaved(c2, 1);
+    //c2 high part
+    rors    r11,r5,#(3)
+    eors    r5,r11,r5
+    rors    r10,r4,#(1)
+    eors    r5,r10,r5
+    //c2 low part
+    rors    r11,r11,#((32-3+0)%32)
+    eors    r11,r11,r4
+    rors    r10,r4,#(3)
+    eors    r4,r10,r11
+
+    //c3 ^= gascon_rotr64_interleaved(c3, 10) ^ gascon_rotr64_interleaved(c3, 17);
+    //c3 high part
+    rors    r11,r7,#(5)
+    eors    r7,r11,r7
+    rors    r10,r6,#(9)
+    eors    r7,r10,r7
+    //c3 low part
+    rors    r11,r11,#((32-5+8)%32)
+    eors    r11,r11,r6
+    rors    r10,r6,#(5)
+    eors    r6,r10,r11
+
+    //state:
+    //r0 to r9: c
+    //r10,r11,r12 destroyed
+
+    //reload the bit counter; the loop head expects it in r10
+    ldr		r10,[sp,#16]
+    cmp     r10,#130
+    bne     drygascon128_f_v7m_mix128_main_loop
+    //prepare the last loop: load DS 2 msb
+    ldr     r14,[sp,#4]
+    lsr     r14,r14,#2
+    b       drygascon128_f_v7m_mix128_main_loop.core
+
+drygascon128_f_v7m_mix128_exit:
+    ldr     r14,[sp,#12]
+    //round=r11=rounds-1;
+    subs    r11,r14,#1
+    //base = round_cst+12-rounds
+    adr		r10, round_cst
+    adds    r10,r10,#12
+    subs    r10,r10,r14
+
+    //overwrite 'rounds' and the bit counter so that, once the two lowest
+    //words are dropped, the frame is {state, base, round} as g expects
+    str     r10,[sp,#12]
+    str     r11,[sp,#16]
+
+    //r14 = pointer on state, used by the g loop as pointer on C
+    ldr		r14,[sp,#8]
+    add		sp,sp,#8
+    b       drygascon128_g_v7m_main_loop
+
+.align 2
+//round constants, indexed so that the entry for round i is round_cst[11-i]
+//(each byte is ((0xf - i) << 4) | i)
+round_cst:
+.byte 0x4b
+.byte 0x5a
+.byte 0x69
+.byte 0x78
+.byte 0x87
+.byte 0x96
+.byte 0xa5
+.byte 0xb4
+.byte 0xc3
+.byte 0xd2
+.byte 0xe1
+.byte 0xf0
+.align 2
+
+.size	drygascon128_f_v7m, .-drygascon128_f_v7m
+
+//drygascon128_g0_v7m: loads C from the state, applies one round with the
+//constant 0xf0 and stores C back. R and X are not read or written.
+.type	drygascon128_g0_v7m, %function
+drygascon128_g0_v7m:
+    //perform a single round without accumulate
+    //r0: state
+    push {r4, r5, r6, r7, r8, r9, r10, r11, r12, lr}
+
+    //Load C
+    //r14 keeps the pointer on C for the final store
+    adds    r14,r0,#C0
+    LDMIA.W r14,{r0,r1,r2,r3,r4,r5,r6,r7,r8,r9}
+
+    //r0~r9: c
+
+    //r11 = ((0xf - 0) << 4) | 0;
+    movs    r11,#0xf0
+    //state:
+    //r0 to r9: c
+    //r11: constant to add as round constant
+
+    // addition of round constant
+    //C2L ^= round constant;
+    eors    r4,r4,r11
+
+    // substitution layer, lower half
+    // (works on the even registers r0,r2,r4,r6,r8; r10,r11,r12 are scratch)
+    eors    r0,r0,r8
+    eors    r8,r8,r6
+    eors    r4,r4,r2
+    mvns    r10,r0
+    mvns    r11,r6
+    mvns    r12,r8
+    ands    r10,r10,r2
+    ands    r11,r11,r8
+    eors    r8,r8,r10
+    ands    r12,r12,r0
+    mvns    r10,r4
+    ands    r10,r10,r6
+    eors    r6,r6,r12
+    mvns    r12,r2
+    ands    r12,r12,r4
+    eors    r4,r4,r11
+    eors    r6,r6,r4
+    mvns    r4,r4
+    eors    r0,r0,r12
+    eors    r2,r2,r10
+    eors    r2,r2,r0
+    eors    r0,r0,r8
+
+    // substitution layer, upper half
+    // (same sequence on the odd registers r1,r3,r5,r7,r9)
+    eors    r1,r1,r9
+    eors    r9,r9,r7
+    eors    r5,r5,r3
+    mvns    r10,r1
+    mvns    r11,r7
+    mvns    r12,r9
+    ands    r10,r10,r3
+    ands    r11,r11,r9
+    eors    r9,r9,r10
+    ands    r12,r12,r1
+    mvns    r10,r5
+    ands    r10,r10,r7
+    eors    r7,r7,r12
+    mvns    r12,r3
+    ands    r12,r12,r5
+    eors    r5,r5,r11
+    eors    r7,r7,r5
+    mvns    r5,r5
+    eors    r1,r1,r12
+    eors    r3,r3,r10
+    eors    r3,r3,r1
+    eors    r1,r1,r9
+
+
+    // linear diffusion layer
+
+    //c4 ^= gascon_rotr64_interleaved(c4, 40) ^ gascon_rotr64_interleaved(c4, 7);
+    //c4 high part
+    rors    r11,r9,#(20)
+    eors    r9,r11,r9
+    rors    r10,r8,#(4)
+    eors    r9,r10,r9
+    //c4 low part
+    rors    r11,r11,#((32-20+3)%32)
+    eors    r11,r11,r8
+    rors    r10,r8,#(20)
+    eors    r8,r10,r11
+
+    //c0 ^= gascon_rotr64_interleaved(c0, 28) ^ gascon_rotr64_interleaved(c0, 19);
+    //c0 high part
+    rors    r11,r1,#(14)
+    eors    r1,r11,r1
+    rors    r10,r0,#(10)
+    eors    r1,r10,r1
+    //c0 low part
+    rors    r11,r11,#((32-14+9)%32)
+    eors    r11,r11,r0
+    rors    r10,r0,#(14)
+    eors    r0,r10,r11
+
+    //c1 ^= gascon_rotr64_interleaved(c1, 38) ^ gascon_rotr64_interleaved(c1, 61);
+    //c1 high part
+    rors    r11,r3,#(19)
+    eors    r3,r11,r3
+    rors    r10,r2,#(31)
+    eors    r3,r10,r3
+    //c1 low part
+    rors    r11,r11,#((32-19+30)%32)
+    eors    r11,r11,r2
+    rors    r10,r2,#(19)
+    eors    r2,r10,r11
+
+    //c2 ^= gascon_rotr64_interleaved(c2, 6) ^ gascon_rotr64_interleaved(c2, 1);
+    //c2 high part
+    rors    r11,r5,#(3)
+    eors    r5,r11,r5
+    rors    r10,r4,#(1)
+    eors    r5,r10,r5
+    //c2 low part
+    rors    r11,r11,#((32-3+0)%32)
+    eors    r11,r11,r4
+    rors    r10,r4,#(3)
+    eors    r4,r10,r11
+
+    //c3 ^= gascon_rotr64_interleaved(c3, 10) ^ gascon_rotr64_interleaved(c3, 17);
+    //c3 high part
+    rors    r11,r7,#(5)
+    eors    r7,r11,r7
+    rors    r10,r6,#(9)
+    eors    r7,r10,r7
+    //c3 low part
+    rors    r11,r11,#((32-5+8)%32)
+    eors    r11,r11,r6
+    rors    r10,r6,#(5)
+    eors    r6,r10,r11
+
+    //state:
+    //r0 to r9: c
+    //r10,r11,r12 destroyed
+
+    //update C
+    STMIA.W r14,{r0,r1,r2,r3,r4,r5,r6,r7,r8,r9}
+
+    pop    {r4, r5, r6, r7, r8, r9, r10, r11, r12, pc}
+.size	drygascon128_g0_v7m, .-drygascon128_g0_v7m
+#endif
--- a/drygascon/Implementations/crypto_aead/drygascon128/add_arm_cortex-m/drygascon128_arm-v7m_fpu.S
+++ b/drygascon/Implementations/crypto_aead/drygascon128/add_arm_cortex-m/drygascon128_arm-v7m_fpu.S
+/**
+DryGascon128 'v7m_fpu implementation'
+Sebastien Riou, May 27th 2020
+
+Implementation optimized for ARM-Cortex-M7/M4/M3 (Size and Speed)
+Safe against timing attack on X look up operations under
+the following conditions: (safe if at least one line is true)
+- System without cache
+- State stored in non cacheable memory (like DTCM)
+- Cache lines are 16 bytes or larger AND X is 16 bytes aligned
+
+
+Notes:
+- Arm Cortex-M7 Processor Technical Reference Manual Revision r1p2 states
+  that data cache line size is 32 bytes.
+- Microchip app note TB3186 shows that Microchip use 16 bytes cache lines.
+- ST does not give a general statement about cache lines for its products based
+on M3 and M4. That said STM32F411xC/E datasheet (RM0383
+Reference manual) shows data cache lines of 16 bytes.
+- In the unlikely case in which none of these conditions can be met,
+the 'v7m_fpu_x' implementation can be used to prevent this attack.
+*/
+#if defined(__DRYGASCON_ARM_SELECTOR_H__)
+//Assembler setup: unified syntax, 16-bit (Thumb) code, three exported entry points
+//NOTE(review): the functions in this file use FPU register moves/stores
+//(VMOV, VSTMIA) while the selected cpu is cortex-m3; presumably the FPU is
+//enabled through the assembler command line - confirm against the build.
+.cpu cortex-m3
+.syntax unified
+.code	16
+.thumb_func
+
+.align	1
+.global	drygascon128_g_v7m_fpu
+.global	drygascon128_f_v7m_fpu
+.global	drygascon128_g0_v7m_fpu
+
+    //Byte offsets into the state structure pointed to by r0 at function entry.
+    //C0..C4 are five 8-byte entries (offsets 0..39), R0/R1 start at 48 and
+    //X0/X1 start at 64. Offsets 40..47 are not named here.
+    .equ C0, 0
+    .equ C1, C0+8
+    .equ C2, C0+16
+    .equ C3, C0+24
+    .equ C4, C0+32
+    .equ R0, 48
+    .equ R1, R0+8
+    .equ X0, 64
+    .equ X1, X0+8
+
+    //'L' names: the 32-bit word at the lower address of each 8-byte entry
+    .equ X0L, X0
+    .equ X1L, X1
+    .equ C0L, C0
+    .equ C1L, C1
+    .equ C2L, C2
+    .equ C3L, C3
+    .equ C4L, C4
+    .equ R0L, R0
+    .equ R1L, R1
+
+    //'H' names: the 32-bit word at the higher address (+4) of each 8-byte entry
+    .equ X0H, X0+4
+    .equ X1H, X1+4
+    .equ C0H, C0+4
+    .equ C1H, C1+4
+    .equ C2H, C2+4
+    .equ C3H, C3+4
+    .equ C4H, C4+4
+    .equ R0H, R0+4
+    .equ R1H, R1+4
+
+    //R viewed as four consecutive 32-bit words (offsets 48, 52, 56, 60)
+    .equ R32_0, R0L
+    .equ R32_1, R0H
+    .equ R32_2, R1L
+    .equ R32_3, R1H
+
+
+//drygascon128_g_v7m_fpu: runs 'rounds' rounds on C. The four R words are
+//kept in FPU registers S10..S13 (used purely as 32-bit storage), cleared on
+//entry, XORed with the C0..C3 words after each round and written to the
+//state at exit. The main loop below is also entered by a branch from
+//drygascon128_f_v7m_fpu, which builds the same 3-word stack frame first.
+.type	drygascon128_g_v7m_fpu, %function
+drygascon128_g_v7m_fpu:
+    //r0: state: c,r,x
+    //r1: rounds
+    push {r4, r5, r6, r7, r8, r9, r10, r11, r12, lr}
+    //stack vars:
+    // 8 round
+    // 4 rounds (base address for lookups)
+    // 0 state address
+
+    //r=0
+    //Clear by moving an integer zero: "VSUB.F32 Sx,Sx,Sx" leaves a non-zero
+    //bit pattern when Sx holds a NaN or infinity (the result is NaN), and
+    //S10..S13 can hold any bit pattern left over from a previous call.
+    movs    r10,#0
+    vmov    S10,r10
+    vmov    S11,r10
+    vmov    S12,r10
+    vmov    S13,r10
+
+    //round=r11=rounds-1;
+    subs    r11,r1,#1
+    //base = round_cst+12-rounds
+    //(the first iteration therefore reads round_cst[11]=0xf0 and the last
+    // iteration reads round_cst[12-rounds])
+    adr     r10, round_cst
+    adds    r10,r10,#12
+    subs    r10,r10,r1
+
+    //frame: [sp,#0]=state, [sp,#4]=base, [sp,#8]=round
+    push	{r0,r10,r11}
+
+    //Load C
+    //r0/r1=C0L/C0H, r2/r3=C1L/C1H, r4/r5=C2L/C2H, r6/r7=C3L/C3H, r8/r9=C4L/C4H
+    adds    r14,r0,#C0
+    LDMIA.W r14,{r0,r1,r2,r3,r4,r5,r6,r7,r8,r9}
+
+    //loop entry
+    //assume r11>=0 at entry (rounds>=1): there is no check before the first lookup
+drygascon128_g_v7m_fpu_main_loop:
+    //r0~r9: c
+    //r10: base for round constants
+    //r11: round, counting from rounds-1 to 0
+
+    //r11 = ((0xf - r11) << 4) | r11;
+    //(looked up as a byte in the round_cst table)
+    ldrb    r11,[r10,r11]
+    //state:
+    //r0 to r9: c
+    //r11: constant to add as round constant
+    //FPU:
+    //s10 to s13: r (R32_0 to R32_3)
+
+    // addition of round constant
+    //C2L ^= round constant;
+    eors    r4,r4,r11
+
+    // substitution layer, lower half
+    eors    r0,r0,r8
+    eors    r8,r8,r6
+    eors    r4,r4,r2
+    mvns    r10,r0
+    mvns    r11,r6
+    mvns    r12,r8
+    ands    r10,r10,r2
+    ands    r11,r11,r8
+    eors    r8,r8,r10
+    ands    r12,r12,r0
+    mvns    r10,r4
+    ands    r10,r10,r6
+    eors    r6,r6,r12
+    mvns    r12,r2
+    ands    r12,r12,r4
+    eors    r4,r4,r11
+    eors    r6,r6,r4
+    mvns    r4,r4
+    eors    r0,r0,r12
+    eors    r2,r2,r10
+    eors    r2,r2,r0
+    eors    r0,r0,r8
+
+    // substitution layer, upper half
+    eors    r1,r1,r9
+    eors    r9,r9,r7
+    eors    r5,r5,r3
+    mvns    r10,r1
+    mvns    r11,r7
+    mvns    r12,r9
+    ands    r10,r10,r3
+    ands    r11,r11,r9
+    eors    r9,r9,r10
+    ands    r12,r12,r1
+    mvns    r10,r5
+    ands    r10,r10,r7
+    eors    r7,r7,r12
+    mvns    r12,r3
+    ands    r12,r12,r5
+    eors    r5,r5,r11
+    eors    r7,r7,r5
+    mvns    r5,r5
+    eors    r1,r1,r12
+    eors    r3,r3,r10
+    eors    r3,r3,r1
+    eors    r1,r1,r9
+
+
+    // linear diffusion layer
+    // c0..c3 are processed in the order c0, c2, c1, c3 so that the two words
+    // accumulated into the same R word are handled back to back
+
+    //c4 ^= gascon_rotr64_interleaved(c4, 40) ^ gascon_rotr64_interleaved(c4, 7);
+    //c4 high part
+    rors    r11,r9,#(20)
+    eors    r9,r11,r9
+    rors    r10,r8,#(4)
+    eors    r9,r10,r9
+    //c4 low part
+    rors    r11,r11,#((32-20+3)%32)
+    eors    r11,r11,r8
+    rors    r10,r8,#(20)
+    eors    r8,r10,r11
+
+    //r14 holds R32_1 from here until it is written back at the end of the round
+    vmov r14,S11
+    //c0 ^= gascon_rotr64_interleaved(c0, 28) ^ gascon_rotr64_interleaved(c0, 19);
+    //c0 high part
+    rors    r11,r1,#(14)
+    eors    r1,r11,r1
+    rors    r10,r0,#(10)
+    eors    r1,r10,r1
+    //r14 is R32_1
+    eors    r14,r14,r1
+    vmov r12,S10
+    //c0 low part
+    rors    r11,r11,#((32-14+9)%32)
+    eors    r11,r11,r0
+    rors    r10,r0,#(14)
+    eors    r0,r10,r11
+    //r12 is R32_0
+    eors    r12,r12,r0
+    //c2 ^= gascon_rotr64_interleaved(c2, 6) ^ gascon_rotr64_interleaved(c2, 1);
+    //c2 high part
+    rors    r11,r5,#(3)
+    eors    r5,r11,r5
+    rors    r10,r4,#(1)
+    eors    r5,r10,r5
+    //r12 is R32_0
+    eors    r12,r12,r5
+    vmov S10,r12
+    vmov r12,S13
+    //c2 low part
+    rors    r11,r11,#((32-3+0)%32)
+    eors    r11,r11,r4
+    rors    r10,r4,#(3)
+    eors    r4,r10,r11
+    //r12 is R32_3
+    eors    r12,r12,r4
+    //c1 ^= gascon_rotr64_interleaved(c1, 38) ^ gascon_rotr64_interleaved(c1, 61);
+    //c1 high part
+    rors    r11,r3,#(19)
+    eors    r3,r11,r3
+    rors    r10,r2,#(31)
+    eors    r3,r10,r3
+    //r12 is R32_3
+    eors    r12,r12,r3
+    vmov S13,r12
+    vmov r12,S12
+    //c1 low part
+    rors    r11,r11,#((32-19+30)%32)
+    eors    r11,r11,r2
+    rors    r10,r2,#(19)
+    eors    r2,r10,r11
+    //r12 is R32_2
+    eors    r12,r12,r2
+    //c3 ^= gascon_rotr64_interleaved(c3, 10) ^ gascon_rotr64_interleaved(c3, 17);
+    //c3 high part
+    rors    r11,r7,#(5)
+    eors    r7,r11,r7
+    rors    r10,r6,#(9)
+    eors    r7,r10,r7
+    //r12 is R32_2
+    eors    r12,r12,r7
+    vmov S12,r12
+    //c3 low part
+    rors    r11,r11,#((32-5+8)%32)
+    eors    r11,r11,r6
+    rors    r10,r6,#(5)
+    eors    r6,r10,r11
+    //r14 is R32_1
+    eors    r14,r14,r6
+    vmov S11,r14
+
+    //state:
+    //r0 to r9: c
+    //r10,r11,r12,r14 destroyed
+
+    //reload the round-constant base (r10 was used as scratch above)
+    ldr     r10,[sp,#4]
+
+    //decrement the round counter kept on the stack; leave once it goes negative
+    ldr     r11,[sp,#8]
+    subs    r11,#1
+    bmi     drygascon128_g_v7m_fpu_exit
+
+    str     r11,[sp,#8]
+    b       drygascon128_g_v7m_fpu_main_loop
+drygascon128_g_v7m_fpu_exit:
+    //update C
+    ldr     r14,[sp,#0]
+    STMIA.W r14,{r0,r1,r2,r3,r4,r5,r6,r7,r8,r9}
+
+    //update R
+    ldr        r11,[sp,#0]
+    adds       r11,r11,#R0
+    VSTMIA.F32 r11, {S10,S11,S12,S13}
+
+    //drop the 3-word frame and return
+    add    sp,sp,#12
+    pop    {r4, r5, r6, r7, r8, r9, r10, r11, r12, pc}
+.size	drygascon128_g_v7m_fpu, .-drygascon128_g_v7m_fpu
+
+.align 2
+//drygascon128_f_v7m_fpu: clears R (held in S10..S13), then mixes the 16-byte
+//input block followed by the DS value into C, 10 bits per step (14 steps).
+//Each 2-bit group selects one of the four 32-bit X words, which is XORed into
+//the lower-addressed word of C0..C4. A round with constant 0xf0 (no
+//accumulation into R) is run after every step except the last. Finally the
+//stack frame is reshaped and execution continues in
+//drygascon128_g_v7m_fpu_main_loop, which runs 'rounds' accumulating rounds,
+//stores C and R and returns to the caller.
+.type	drygascon128_f_v7m_fpu, %function
+drygascon128_f_v7m_fpu:
+    //r0:state
+    //r1:input
+    //r2:ds
+    //r3:rounds
+    push	{r4, r5, r6, r7, r8, r9, r10, r11, r12, lr}
+
+    //stack frame:
+    //0: pointer on input
+    //4: DS value
+    //8 :pointer on state
+    //12 : rounds for g
+    //16 :mix round / g round
+
+    movs    r10,#0 //init of input bit counter
+    push    {r0,r3,r10} //upper 3 words become the frame used by drygascon128_g_v7m_fpu
+    push    {r1,r2}
+    //r=0
+    //Clear by moving the integer zero in r10: "VSUB.F32 Sx,Sx,Sx" leaves a
+    //non-zero bit pattern when Sx holds a NaN or infinity (the result is NaN),
+    //and S10..S13 can hold any bit pattern left over from a previous call.
+    vmov    S10,r10
+    vmov    S11,r10
+    vmov    S12,r10
+    vmov    S13,r10
+
+    //Load C
+    adds	r11,r0,#C0
+    LDMIA.W r11,{r0,r1,r2,r3,r4,r5,r6,r7,r8,r9}
+
+
+drygascon128_f_v7m_fpu_mix128_main_loop:
+    //r10 is input bit counter
+    ldr     r11,[sp,#0] //r11 is pointer on input
+
+    //r10 r12 shift
+    // 0   0   0
+    // 10  1   2
+    // 20  2   4
+    // 30  3   6
+    // 40  5   0
+    // 50  6   2
+    // 60  7   4
+    // 70  8   6
+    // 80  10  0
+    // 90  11  2
+    // 100 12  4
+    // 110 13  6
+    // 120 15  0
+    // 130 16  2 --> we do that operation for 2 last bits in a special last loop
+
+    cmp     r10,#120
+    bne     drygascon128_f_v7m_fpu_mix128_main_loop.regular
+
+    //we execute this only during the penultimate operation
+    //r14 = input[15] | (DS << 8): the 2 lsb of DS follow the last input byte
+    ldrb    r14,[r11,#15]
+    ldr     r10,[sp,#4]
+    lsl     r10,r10,#8
+    eors    r14,r14,r10
+    b       drygascon128_f_v7m_fpu_mix128_main_loop.core
+
+drygascon128_f_v7m_fpu_mix128_main_loop.regular:
+    //r12 is base byte: byte offset to read from input buffer
+    lsr     r12,r10,#3  //divide by 8 to get base byte
+    //r10 becomes shift
+    lsl     r14,r12,#3
+    sub     r10,r10,r14
+
+    //Only 10 bits starting at bit 'shift' (shift <= 6) are consumed below, so
+    //a halfword holds everything needed. A word load here would read
+    //input[16], one byte past the 16-byte block, when the counter is 110.
+    //The address may be unaligned: unaligned halfword access must be allowed.
+    ldrh    r14,[r11,r12]
+    lsr     r14,r14,r10
+
+drygascon128_f_v7m_fpu_mix128_main_loop.core:
+    //r14 bits 0..9: five 2-bit indexes, each scaled to a word offset (0,4,8,12)
+    ldr     r10,[sp,#8]
+    adds    r10,r10,#X0
+
+    //bits 0,1 select the X word XORed into C0L
+    lsls    r11,r14,#2
+    ands    r11,r11,#0xc
+    ldr     r11,[r10,r11]
+    eors    r0,r0,r11
+
+    //bits 2,3 select the X word XORed into C1L
+    lsrs    r11,r14,#0
+    ands    r11,r11,#0xc
+    ldr     r11,[r10,r11]
+    eors    r2,r2,r11
+
+    //bits 4,5 select the X word XORed into C2L
+    lsrs    r11,r14,#2
+    ands    r11,r11,#0xc
+    ldr     r11,[r10,r11]
+    eors    r4,r4,r11
+
+    //bits 6,7 select the X word XORed into C3L
+    lsrs    r11,r14,#4
+    ands    r11,r11,#0xc
+    ldr     r11,[r10,r11]
+    eors    r6,r6,r11
+
+    //bits 8,9 select the X word XORed into C4L
+    lsrs    r11,r14,#6
+    ands    r11,r11,#0xc
+    ldr     r11,[r10,r11]
+    eors    r8,r8,r11
+
+    //advance the bit counter; 140 means all 14 groups have been mixed
+    ldr		r10,[sp,#16]
+    adds    r10,#10
+    cmp     r10,#140
+    beq     drygascon128_f_v7m_fpu_mix128_exit
+drygascon128_f_v7m_fpu_mix128_coreround:
+    str     r10,[sp,#16]
+
+    movs    r11,#0xf0
+
+    //state:
+    //r0 to r9: c
+    //r11: constant to add as round constant
+
+    // addition of round constant
+    //C2L ^= round constant;
+    eors    r4,r4,r11
+
+    // substitution layer, lower half
+    eors    r0,r0,r8
+    eors    r8,r8,r6
+    eors    r4,r4,r2
+    mvns    r10,r0
+    mvns    r11,r6
+    mvns    r12,r8
+    ands    r10,r10,r2
+    ands    r11,r11,r8
+    eors    r8,r8,r10
+    ands    r12,r12,r0
+    mvns    r10,r4
+    ands    r10,r10,r6
+    eors    r6,r6,r12
+    mvns    r12,r2
+    ands    r12,r12,r4
+    eors    r4,r4,r11
+    eors    r6,r6,r4
+    mvns    r4,r4
+    eors    r0,r0,r12
+    eors    r2,r2,r10
+    eors    r2,r2,r0
+    eors    r0,r0,r8
+
+    // substitution layer, upper half
+    eors    r1,r1,r9
+    eors    r9,r9,r7
+    eors    r5,r5,r3
+    mvns    r10,r1
+    mvns    r11,r7
+    mvns    r12,r9
+    ands    r10,r10,r3
+    ands    r11,r11,r9
+    eors    r9,r9,r10
+    ands    r12,r12,r1
+    mvns    r10,r5
+    ands    r10,r10,r7
+    eors    r7,r7,r12
+    mvns    r12,r3
+    ands    r12,r12,r5
+    eors    r5,r5,r11
+    eors    r7,r7,r5
+    mvns    r5,r5
+    eors    r1,r1,r12
+    eors    r3,r3,r10
+    eors    r3,r3,r1
+    eors    r1,r1,r9
+
+
+    // linear diffusion layer
+
+    //c4 ^= gascon_rotr64_interleaved(c4, 40) ^ gascon_rotr64_interleaved(c4, 7);
+    //c4 high part
+    rors    r11,r9,#(20)
+    eors    r9,r11,r9
+    rors    r10,r8,#(4)
+    eors    r9,r10,r9
+    //c4 low part
+    rors    r11,r11,#((32-20+3)%32)
+    eors    r11,r11,r8
+    rors    r10,r8,#(20)
+    eors    r8,r10,r11
+
+    //c0 ^= gascon_rotr64_interleaved(c0, 28) ^ gascon_rotr64_interleaved(c0, 19);
+    //c0 high part
+    rors    r11,r1,#(14)
+    eors    r1,r11,r1
+    rors    r10,r0,#(10)
+    eors    r1,r10,r1
+    //c0 low part
+    rors    r11,r11,#((32-14+9)%32)
+    eors    r11,r11,r0
+    rors    r10,r0,#(14)
+    eors    r0,r10,r11
+
+    //c1 ^= gascon_rotr64_interleaved(c1, 38) ^ gascon_rotr64_interleaved(c1, 61);
+    //c1 high part
+    rors    r11,r3,#(19)
+    eors    r3,r11,r3
+    rors    r10,r2,#(31)
+    eors    r3,r10,r3
+    //c1 low part
+    rors    r11,r11,#((32-19+30)%32)
+    eors    r11,r11,r2
+    rors    r10,r2,#(19)
+    eors    r2,r10,r11
+
+    //c2 ^= gascon_rotr64_interleaved(c2, 6) ^ gascon_rotr64_interleaved(c2, 1);
+    //c2 high part
+    rors    r11,r5,#(3)
+    eors    r5,r11,r5
+    rors    r10,r4,#(1)
+    eors    r5,r10,r5
+    //c2 low part
+    rors    r11,r11,#((32-3+0)%32)
+    eors    r11,r11,r4
+    rors    r10,r4,#(3)
+    eors    r4,r10,r11
+
+    //c3 ^= gascon_rotr64_interleaved(c3, 10) ^ gascon_rotr64_interleaved(c3, 17);
+    //c3 high part
+    rors    r11,r7,#(5)
+    eors    r7,r11,r7
+    rors    r10,r6,#(9)
+    eors    r7,r10,r7
+    //c3 low part
+    rors    r11,r11,#((32-5+8)%32)
+    eors    r11,r11,r6
+    rors    r10,r6,#(5)
+    eors    r6,r10,r11
+
+    //state:
+    //r0 to r9: c
+    //r10,r11,r12 destroyed
+
+    //reload the bit counter; the loop head expects it in r10
+    ldr		r10,[sp,#16]
+    cmp     r10,#130
+    bne     drygascon128_f_v7m_fpu_mix128_main_loop
+    //prepare the last loop: load DS 2 msb
+    ldr     r14,[sp,#4]
+    lsr     r14,r14,#2
+    b       drygascon128_f_v7m_fpu_mix128_main_loop.core
+
+drygascon128_f_v7m_fpu_mix128_exit:
+    ldr     r14,[sp,#12]
+    //round=r11=rounds-1;
+    subs    r11,r14,#1
+    //base = round_cst+12-rounds
+    adr		r10, round_cst
+    adds    r10,r10,#12
+    subs    r10,r10,r14
+
+    //overwrite 'rounds' and the bit counter so that, once the two lowest
+    //words are dropped, the frame is {state, base, round} as g expects
+    str     r10,[sp,#12]
+    str     r11,[sp,#16]
+
+    add		sp,sp,#8
+    b       drygascon128_g_v7m_fpu_main_loop
+
+.align 2
+//round constants, indexed so that the entry for round i is round_cst[11-i]
+//(each byte is ((0xf - i) << 4) | i)
+round_cst:
+.byte 0x4b
+.byte 0x5a
+.byte 0x69
+.byte 0x78
+.byte 0x87
+.byte 0x96
+.byte 0xa5
+.byte 0xb4
+.byte 0xc3
+.byte 0xd2
+.byte 0xe1
+.byte 0xf0
+.align 2
+
+.size	drygascon128_f_v7m_fpu, .-drygascon128_f_v7m_fpu
+
+//drygascon128_g0_v7m_fpu: loads C from the state, applies one round with the
+//constant 0xf0 and stores C back. R and X are not read or written, and no
+//FPU register is used here.
+.type	drygascon128_g0_v7m_fpu, %function
+drygascon128_g0_v7m_fpu:
+    //perform a single round without accumulate
+    //r0: state
+    push {r4, r5, r6, r7, r8, r9, r10, r11, r12, lr}
+
+    //Load C
+    //r14 keeps the pointer on C for the final store
+    adds    r14,r0,#C0
+    LDMIA.W r14,{r0,r1,r2,r3,r4,r5,r6,r7,r8,r9}
+
+    //r0~r9: c
+
+    //r11 = ((0xf - 0) << 4) | 0;
+    movs    r11,#0xf0
+    //state:
+    //r0 to r9: c
+    //r11: constant to add as round constant
+
+    // addition of round constant
+    //C2L ^= round constant;
+    eors    r4,r4,r11
+
+    // substitution layer, lower half
+    // (works on the even registers r0,r2,r4,r6,r8; r10,r11,r12 are scratch)
+    eors    r0,r0,r8
+    eors    r8,r8,r6
+    eors    r4,r4,r2
+    mvns    r10,r0
+    mvns    r11,r6
+    mvns    r12,r8
+    ands    r10,r10,r2
+    ands    r11,r11,r8
+    eors    r8,r8,r10
+    ands    r12,r12,r0
+    mvns    r10,r4
+    ands    r10,r10,r6
+    eors    r6,r6,r12
+    mvns    r12,r2
+    ands    r12,r12,r4
+    eors    r4,r4,r11
+    eors    r6,r6,r4
+    mvns    r4,r4
+    eors    r0,r0,r12
+    eors    r2,r2,r10
+    eors    r2,r2,r0
+    eors    r0,r0,r8
+
+    // substitution layer, upper half
+    // (same sequence on the odd registers r1,r3,r5,r7,r9)
+    eors    r1,r1,r9
+    eors    r9,r9,r7
+    eors    r5,r5,r3
+    mvns    r10,r1
+    mvns    r11,r7
+    mvns    r12,r9
+    ands    r10,r10,r3
+    ands    r11,r11,r9
+    eors    r9,r9,r10
+    ands    r12,r12,r1
+    mvns    r10,r5
+    ands    r10,r10,r7
+    eors    r7,r7,r12
+    mvns    r12,r3
+    ands    r12,r12,r5
+    eors    r5,r5,r11
+    eors    r7,r7,r5
+    mvns    r5,r5
+    eors    r1,r1,r12
+    eors    r3,r3,r10
+    eors    r3,r3,r1
+    eors    r1,r1,r9
+
+
+    // linear diffusion layer
+
+    //c4 ^= gascon_rotr64_interleaved(c4, 40) ^ gascon_rotr64_interleaved(c4, 7);
+    //c4 high part
+    rors    r11,r9,#(20)
+    eors    r9,r11,r9
+    rors    r10,r8,#(4)
+    eors    r9,r10,r9
+    //c4 low part
+    rors    r11,r11,#((32-20+3)%32)
+    eors    r11,r11,r8
+    rors    r10,r8,#(20)
+    eors    r8,r10,r11
+
+    //c0 ^= gascon_rotr64_interleaved(c0, 28) ^ gascon_rotr64_interleaved(c0, 19);
+    //c0 high part
+    rors    r11,r1,#(14)
+    eors    r1,r11,r1
+    rors    r10,r0,#(10)
+    eors    r1,r10,r1
+    //c0 low part
+    rors    r11,r11,#((32-14+9)%32)
+    eors    r11,r11,r0
+    rors    r10,r0,#(14)
+    eors    r0,r10,r11
+
+    //c1 ^= gascon_rotr64_interleaved(c1, 38) ^ gascon_rotr64_interleaved(c1, 61);
+    //c1 high part
+    rors    r11,r3,#(19)
+    eors    r3,r11,r3
+    rors    r10,r2,#(31)
+    eors    r3,r10,r3
+    //c1 low part
+    rors    r11,r11,#((32-19+30)%32)
+    eors    r11,r11,r2
+    rors    r10,r2,#(19)
+    eors    r2,r10,r11
+
+    //c2 ^= gascon_rotr64_interleaved(c2, 6) ^ gascon_rotr64_interleaved(c2, 1);
+    //c2 high part
+    rors    r11,r5,#(3)
+    eors    r5,r11,r5
+    rors    r10,r4,#(1)
+    eors    r5,r10,r5
+    //c2 low part
+    rors    r11,r11,#((32-3+0)%32)
+    eors    r11,r11,r4
+    rors    r10,r4,#(3)
+    eors    r4,r10,r11
+
+    //c3 ^= gascon_rotr64_interleaved(c3, 10) ^ gascon_rotr64_interleaved(c3, 17);
+    //c3 high part
+    rors    r11,r7,#(5)
+    eors    r7,r11,r7
+    rors    r10,r6,#(9)
+    eors    r7,r10,r7
+    //c3 low part
+    rors    r11,r11,#((32-5+8)%32)
+    eors    r11,r11,r6
+    rors    r10,r6,#(5)
+    eors    r6,r10,r11
+
+    //state:
+    //r0 to r9: c
+    //r10,r11,r12 destroyed
+
+    //update C
+    STMIA.W r14,{r0,r1,r2,r3,r4,r5,r6,r7,r8,r9}
+
+    pop    {r4, r5, r6, r7, r8, r9, r10, r11, r12, pc}
+.size	drygascon128_g0_v7m_fpu, .-drygascon128_g0_v7m_fpu
+#endif
--- a/drygascon/Implementations/crypto_aead/drygascon128/add_arm_cortex-m/drygascon128_arm-v7m_fpu_x.S
+++ b/drygascon/Implementations/crypto_aead/drygascon128/add_arm_cortex-m/drygascon128_arm-v7m_fpu_x.S
+/**
+DryGascon128 'v7m_fpu_x implementation'
+Sebastien Riou, May 27th 2020
+
+Implementation optimized for ARM Cortex-M7/M4/M3 (size and speed).
+Includes protection against timing attacks on the X lookup operations.
+
+Note that the 'v7m_fpu' implementation is faster and is safe on all Cortex-M7 as of May 2020.
+*/
+#if defined(__DRYGASCON_ARM_SELECTOR_H__)
+//Assembler setup: Thumb code in unified syntax
+.cpu cortex-m7
+.syntax unified
+.code	16
+.thumb_func
+
+.align	1
+//Entry points exported by this file
+.global	drygascon128_g_v7m_fpu_x
+.global	drygascon128_f_v7m_fpu_x
+.global	drygascon128_g0_v7m_fpu_x
+
+    //Byte offsets of the 64-bit words inside the state passed in r0:
+    //C0..C4 at 0..39, R0..R1 at 48..63, X0..X1 at 64..79.
+    //NOTE(review): bytes 40..47 are not named here; presumably this mirrors
+    //the layout of the C state structure - confirm against its declaration.
+    .equ C0, 0
+    .equ C1, C0+8
+    .equ C2, C0+16
+    .equ C3, C0+24
+    .equ C4, C0+32
+    .equ R0, 48
+    .equ R1, R0+8
+    .equ X0, 64
+    .equ X1, X0+8
+
+    //"L" suffix: the 32-bit word at the same offset as the 64-bit word
+    .equ X0L, X0
+    .equ X1L, X1
+    .equ C0L, C0
+    .equ C1L, C1
+    .equ C2L, C2
+    .equ C3L, C3
+    .equ C4L, C4
+    .equ R0L, R0
+    .equ R1L, R1
+
+    //"H" suffix: the 32-bit word 4 bytes further
+    .equ X0H, X0+4
+    .equ X1H, X1+4
+    .equ C0H, C0+4
+    .equ C1H, C1+4
+    .equ C2H, C2+4
+    .equ C3H, C3+4
+    .equ C4H, C4+4
+    .equ R0H, R0+4
+    .equ R1H, R1+4
+
+    //R seen as four consecutive 32-bit words
+    .equ R32_0, R0L
+    .equ R32_1, R0H
+    .equ R32_2, R1L
+    .equ R32_3, R1H
+
+
+
+.type	drygascon128_g_v7m_fpu_x, %function
+drygascon128_g_v7m_fpu_x:
+    //Run "rounds" core rounds on c and accumulate r after each round.
+    //r0: state: c,r,x
+    //r1: rounds
+    //On exit c (10 words at state+C0) and r (4 words at state+R0) are updated.
+    //Register allocation (low word/high word of each 64-bit lane):
+    //r0/r1: c0, r2/r3: c1, r4/r5: c2, r6/r7: c3, r8/r9: c4
+    //S10..S13 hold the four 32-bit words of r; r10,r11,r12,r14 are scratch.
+    push {r4, r5, r6, r7, r8, r9, r10, r11, r12, lr}
+    //stack vars:
+    // 8 round
+    // 4 rounds (base address for lookups)
+    // 0 state address
+
+    //r=0
+    //The accumulators are cleared with integer moves: "VSUB.F32 Sx,Sx,Sx"
+    //yields NaN instead of zero whenever Sx holds an Inf/NaN bit pattern,
+    //which can happen since S10..S13 carry arbitrary 32-bit data.
+    movs    r12,#0
+    vmov    S10,r12
+    vmov    S11,r12
+    vmov    S12,r12
+    vmov    S13,r12
+
+    //round=r11=rounds-1;
+    subs    r11,r1,#1
+    //base = round_cst+12-rounds
+    adr     r10, round_cst
+    adds    r10,r10,#12
+    subs    r10,r10,r1
+
+    push	{r0,r10,r11}
+
+    //Load C
+    adds    r14,r0,#C0
+    LDMIA.W r14,{r0,r1,r2,r3,r4,r5,r6,r7,r8,r9}
+
+    //loop entry
+    //assume r11>0 at entry
+drygascon128_g_v7m_fpu_x_main_loop:
+    //This label is also the entry used by drygascon128_f_v7m_fpu_x once
+    //its mixing phase is done; it expects the stack frame described above.
+    //r0~r9: c
+    //r10: base for round constants
+    //r11: round, counting from rounds-1 to 0
+
+    //r11 = round_cst[12-rounds+r11]: the table is read from its last
+    //entry (0xf0) backwards, one entry per round
+    ldrb    r11,[r10,r11]
+    //state:
+    //r0 to r9: c
+    //r11: constant to add as round constant
+    //FPU:
+    //S10 to S13: r
+
+    // addition of round constant
+    //C2L ^= round constant;
+    eors    r4,r4,r11
+
+    // substitution layer, lower half
+    eors    r0,r0,r8
+    eors    r8,r8,r6
+    eors    r4,r4,r2
+    mvns    r10,r0
+    mvns    r11,r6
+    mvns    r12,r8
+    ands    r10,r10,r2
+    ands    r11,r11,r8
+    eors    r8,r8,r10
+    ands    r12,r12,r0
+    mvns    r10,r4
+    ands    r10,r10,r6
+    eors    r6,r6,r12
+    mvns    r12,r2
+    ands    r12,r12,r4
+    eors    r4,r4,r11
+    eors    r6,r6,r4
+    mvns    r4,r4
+    eors    r0,r0,r12
+    eors    r2,r2,r10
+    eors    r2,r2,r0
+    eors    r0,r0,r8
+
+    // substitution layer, upper half
+    eors    r1,r1,r9
+    eors    r9,r9,r7
+    eors    r5,r5,r3
+    mvns    r10,r1
+    mvns    r11,r7
+    mvns    r12,r9
+    ands    r10,r10,r3
+    ands    r11,r11,r9
+    eors    r9,r9,r10
+    ands    r12,r12,r1
+    mvns    r10,r5
+    ands    r10,r10,r7
+    eors    r7,r7,r12
+    mvns    r12,r3
+    ands    r12,r12,r5
+    eors    r5,r5,r11
+    eors    r7,r7,r5
+    mvns    r5,r5
+    eors    r1,r1,r12
+    eors    r3,r3,r10
+    eors    r3,r3,r1
+    eors    r1,r1,r9
+
+
+    // linear diffusion layer
+    //The accumulation of r is interleaved with the diffusion of each lane:
+    //R32_0 ^= c0 low ^ c2 high, R32_1 ^= c0 high ^ c3 low,
+    //R32_2 ^= c1 low ^ c3 high, R32_3 ^= c1 high ^ c2 low
+
+    //c4 ^= gascon_rotr64_interleaved(c4, 40) ^ gascon_rotr64_interleaved(c4, 7);
+    //c4 high part
+    rors    r11,r9,#(20)
+    eors    r9,r11,r9
+    rors    r10,r8,#(4)
+    eors    r9,r10,r9
+    //c4 low part
+    rors    r11,r11,#((32-20+3)%32)
+    eors    r11,r11,r8
+    rors    r10,r8,#(20)
+    eors    r8,r10,r11
+
+    vmov r14,S11
+    //c0 ^= gascon_rotr64_interleaved(c0, 28) ^ gascon_rotr64_interleaved(c0, 19);
+    //c0 high part
+    rors    r11,r1,#(14)
+    eors    r1,r11,r1
+    rors    r10,r0,#(10)
+    eors    r1,r10,r1
+    //r14 is R32_1
+    eors    r14,r14,r1
+    vmov r12,S10
+    //c0 low part
+    rors    r11,r11,#((32-14+9)%32)
+    eors    r11,r11,r0
+    rors    r10,r0,#(14)
+    eors    r0,r10,r11
+    //r12 is R32_0
+    eors    r12,r12,r0
+    //c2 ^= gascon_rotr64_interleaved(c2, 6) ^ gascon_rotr64_interleaved(c2, 1);
+    //c2 high part
+    rors    r11,r5,#(3)
+    eors    r5,r11,r5
+    rors    r10,r4,#(1)
+    eors    r5,r10,r5
+    //r12 is R32_0
+    eors    r12,r12,r5
+    vmov S10,r12
+    vmov r12,S13
+    //c2 low part
+    rors    r11,r11,#((32-3+0)%32)
+    eors    r11,r11,r4
+    rors    r10,r4,#(3)
+    eors    r4,r10,r11
+    //r12 is R32_3
+    eors    r12,r12,r4
+    //c1 ^= gascon_rotr64_interleaved(c1, 38) ^ gascon_rotr64_interleaved(c1, 61);
+    //c1 high part
+    rors    r11,r3,#(19)
+    eors    r3,r11,r3
+    rors    r10,r2,#(31)
+    eors    r3,r10,r3
+    //r12 is R32_3
+    eors    r12,r12,r3
+    vmov S13,r12
+    vmov r12,S12
+    //c1 low part
+    rors    r11,r11,#((32-19+30)%32)
+    eors    r11,r11,r2
+    rors    r10,r2,#(19)
+    eors    r2,r10,r11
+    //r12 is R32_2
+    eors    r12,r12,r2
+    //c3 ^= gascon_rotr64_interleaved(c3, 10) ^ gascon_rotr64_interleaved(c3, 17);
+    //c3 high part
+    rors    r11,r7,#(5)
+    eors    r7,r11,r7
+    rors    r10,r6,#(9)
+    eors    r7,r10,r7
+    //r12 is R32_2
+    eors    r12,r12,r7
+    vmov S12,r12
+    //c3 low part
+    rors    r11,r11,#((32-5+8)%32)
+    eors    r11,r11,r6
+    rors    r10,r6,#(5)
+    eors    r6,r10,r11
+    //r14 is R32_1
+    eors    r14,r14,r6
+    vmov S11,r14
+
+    //state:
+    //r0 to r9: c
+    //r10,r11,r12 destroyed
+
+    //reload the round constant base, it was used as scratch above
+    ldr     r10,[sp,#4]
+
+    //decrement the round counter kept on the stack; leave once it is negative
+    ldr     r11,[sp,#8]
+    subs    r11,#1
+    bmi     drygascon128_g_v7m_fpu_x_exit
+
+    str     r11,[sp,#8]
+    b       drygascon128_g_v7m_fpu_x_main_loop
+drygascon128_g_v7m_fpu_x_exit:
+    //update C
+    //r14 was used as scratch, so the state address comes from the stack
+    ldr     r14,[sp,#0]
+    STMIA.W r14,{r0,r1,r2,r3,r4,r5,r6,r7,r8,r9}
+
+    //update R
+    ldr        r11,[sp,#0]
+    adds       r11,r11,#R0
+    VSTMIA.F32 r11, {S10,S11,S12,S13}
+
+    //drop the three stack variables, then restore registers and return
+    add    sp,sp,#12
+    pop    {r4, r5, r6, r7, r8, r9, r10, r11, r12, pc}
+.size	drygascon128_g_v7m_fpu_x, .-drygascon128_g_v7m_fpu_x
+
+.align 2
+.type	drygascon128_f_v7m_fpu_x, %function
+drygascon128_f_v7m_fpu_x:
+    //Mix the input block and DS into c, 10 bits at a time with one core
+    //round (constant 0xf0) between two mix steps, then continue in the main
+    //loop of drygascon128_g_v7m_fpu_x which also performs the function return.
+    //r0:state
+    //r1:input
+    //r2:ds
+    //r3:rounds
+    //Each pair of input bits selects one of the four 32-bit words of x with
+    //VSEL instead of an indexed memory load.
+    push	{r4, r5, r6, r7, r8, r9, r10, r11, r12, lr}
+
+    //stack frame:
+    //0: pointer on input
+    //4: DS value
+    //8 :pointer on state
+    //12 : rounds for g
+    //16 :mix round / g round
+
+    movs    r10,#0 //init of input bit counter
+    push    {r0,r3,r10} //make the same stack frame as drygascon128_g_v7m_fpu_x
+    push    {r1,r2}
+    //r=0
+    //The accumulators are cleared with integer moves: "VSUB.F32 Sx,Sx,Sx"
+    //yields NaN instead of zero whenever Sx holds an Inf/NaN bit pattern,
+    //which can happen since S10..S13 carry arbitrary 32-bit data.
+    movs    r12,#0
+    vmov    S10,r12
+    vmov    S11,r12
+    vmov    S12,r12
+    vmov    S13,r12
+
+    //Load C
+    adds	r11,r0,#C0
+    LDMIA.W r11,{r0,r1,r2,r3,r4,r5,r6,r7,r8,r9}
+
+    //Load X
+    //s0..s3 (= d0,d1) hold the four 32-bit words of x
+    adds       r11,#X0
+    VLDMIA.F32 r11, {s0,s1,s2,s3}
+
+drygascon128_f_v7m_fpu_x_mix128_main_loop:
+    //r10 is input bit counter
+    ldr     r11,[sp,#0] //r11 is pointer on input
+
+    //r10 r12 shift
+    // 0   0   0
+    // 10  1   2
+    // 20  2   4
+    // 30  3   6
+    // 40  5   0
+    // 50  6   2
+    // 60  7   4
+    // 70  8   6
+    // 80  10  0
+    // 90  11  2
+    // 100 12  4
+    // 110 13  6
+    // 120 15  0
+    // 130 16  2 --> we do that operation for 2 last bits in a special last loop
+
+    cmp     r10,#120
+    bne     drygascon128_f_v7m_fpu_x_mix128_main_loop.regular
+
+    //we execute this only during the pen-ultimate operation
+    //we add the 2 lsb from DS to r14
+    //r14 = input[15] | (DS << 8): bits 8 and 9 are the two lsb of DS
+    ldrb    r14,[r11,#15]
+    ldr     r10,[sp,#4]
+    lsl     r10,r10,#8
+    eors    r14,r14,r10
+    b       drygascon128_f_v7m_fpu_x_mix128_main_loop.core
+
+drygascon128_f_v7m_fpu_x_mix128_main_loop.regular:
+    //r12 is base byte: byte offset to read from input buffer
+    lsr     r12,r10,#3  //divide by 8 to get base byte
+    //r10 becomes shift
+    lsl     r14,r12,#3
+    sub     r10,r10,r14
+
+    //NOTE(review): with the counter at 110 this 32-bit load starts at byte 13
+    //and therefore also reads input byte 16, although only bits from bytes
+    //13 and 14 are used; confirm the input buffer tolerates that access.
+    ldr     r14,[r11,r12] //M7 supports unalign access with ldr
+    lsr     r14,r14,r10
+
+drygascon128_f_v7m_fpu_x_mix128_main_loop.core:
+    //r14 bits 0..9: five 2-bit indexes into x, one per low word of c.
+    //For each index the upper bit picks d0 or d1 into d2 (= s4,s5) and the
+    //lower bit picks s4 or s5 into s6, which is then XORed into c.
+    
+    tst     r14,#2
+    VSELEQ.F64 D2, D0, D1
+    tst     r14,#1
+    VSELEQ.F32 S6, S4, S5
+    VMOV    r11,S6
+    eors    r0,r0,r11
+    
+    tst     r14,#8
+    VSELEQ.F64 D2, D0, D1
+    tst     r14,#4
+    VSELEQ.F32 S6, S4, S5
+    VMOV    r11,S6
+    eors    r2,r2,r11
+    
+    tst     r14,#32
+    VSELEQ.F64 D2, D0, D1
+    tst     r14,#16
+    VSELEQ.F32 S6, S4, S5
+    VMOV    r11,S6
+    eors    r4,r4,r11
+    
+    tst     r14,#128
+    VSELEQ.F64 D2, D0, D1
+    tst     r14,#64
+    VSELEQ.F32 S6, S4, S5
+    VMOV    r11,S6
+    eors    r6,r6,r11
+    
+    tst     r14,#512
+    VSELEQ.F64 D2, D0, D1
+    tst     r14,#256
+    VSELEQ.F32 S6, S4, S5
+    VMOV    r11,S6
+    eors    r8,r8,r11
+
+    //advance the bit counter; 140 means all 14 mix steps are done
+    ldr		r10,[sp,#16]
+    adds    r10,#10
+    cmp     r10,#140
+    beq     drygascon128_f_v7m_fpu_x_mix128_exit
+drygascon128_f_v7m_fpu_x_mix128_coreround:
+    str     r10,[sp,#16]
+
+    movs    r11,#0xf0
+
+    //state:
+    //r0 to r9: c
+    //r11: constant to add as round constant
+
+    // addition of round constant
+    //C2L ^= round constant;
+    eors    r4,r4,r11
+
+    // substitution layer, lower half
+    eors    r0,r0,r8
+    eors    r8,r8,r6
+    eors    r4,r4,r2
+    mvns    r10,r0
+    mvns    r11,r6
+    mvns    r12,r8
+    ands    r10,r10,r2
+    ands    r11,r11,r8
+    eors    r8,r8,r10
+    ands    r12,r12,r0
+    mvns    r10,r4
+    ands    r10,r10,r6
+    eors    r6,r6,r12
+    mvns    r12,r2
+    ands    r12,r12,r4
+    eors    r4,r4,r11
+    eors    r6,r6,r4
+    mvns    r4,r4
+    eors    r0,r0,r12
+    eors    r2,r2,r10
+    eors    r2,r2,r0
+    eors    r0,r0,r8
+
+    // substitution layer, upper half
+    eors    r1,r1,r9
+    eors    r9,r9,r7
+    eors    r5,r5,r3
+    mvns    r10,r1
+    mvns    r11,r7
+    mvns    r12,r9
+    ands    r10,r10,r3
+    ands    r11,r11,r9
+    eors    r9,r9,r10
+    ands    r12,r12,r1
+    mvns    r10,r5
+    ands    r10,r10,r7
+    eors    r7,r7,r12
+    mvns    r12,r3
+    ands    r12,r12,r5
+    eors    r5,r5,r11
+    eors    r7,r7,r5
+    mvns    r5,r5
+    eors    r1,r1,r12
+    eors    r3,r3,r10
+    eors    r3,r3,r1
+    eors    r1,r1,r9
+
+
+    // linear diffusion layer
+
+    //c4 ^= gascon_rotr64_interleaved(c4, 40) ^ gascon_rotr64_interleaved(c4, 7);
+    //c4 high part
+    rors    r11,r9,#(20)
+    eors    r9,r11,r9
+    rors    r10,r8,#(4)
+    eors    r9,r10,r9
+    //c4 low part
+    rors    r11,r11,#((32-20+3)%32)
+    eors    r11,r11,r8
+    rors    r10,r8,#(20)
+    eors    r8,r10,r11
+
+    //c0 ^= gascon_rotr64_interleaved(c0, 28) ^ gascon_rotr64_interleaved(c0, 19);
+    //c0 high part
+    rors    r11,r1,#(14)
+    eors    r1,r11,r1
+    rors    r10,r0,#(10)
+    eors    r1,r10,r1
+    //c0 low part
+    rors    r11,r11,#((32-14+9)%32)
+    eors    r11,r11,r0
+    rors    r10,r0,#(14)
+    eors    r0,r10,r11
+
+    //c1 ^= gascon_rotr64_interleaved(c1, 38) ^ gascon_rotr64_interleaved(c1, 61);
+    //c1 high part
+    rors    r11,r3,#(19)
+    eors    r3,r11,r3
+    rors    r10,r2,#(31)
+    eors    r3,r10,r3
+    //c1 low part
+    rors    r11,r11,#((32-19+30)%32)
+    eors    r11,r11,r2
+    rors    r10,r2,#(19)
+    eors    r2,r10,r11
+
+    //c2 ^= gascon_rotr64_interleaved(c2, 6) ^ gascon_rotr64_interleaved(c2, 1);
+    //c2 high part
+    rors    r11,r5,#(3)
+    eors    r5,r11,r5
+    rors    r10,r4,#(1)
+    eors    r5,r10,r5
+    //c2 low part
+    rors    r11,r11,#((32-3+0)%32)
+    eors    r11,r11,r4
+    rors    r10,r4,#(3)
+    eors    r4,r10,r11
+
+    //c3 ^= gascon_rotr64_interleaved(c3, 10) ^ gascon_rotr64_interleaved(c3, 17);
+    //c3 high part
+    rors    r11,r7,#(5)
+    eors    r7,r11,r7
+    rors    r10,r6,#(9)
+    eors    r7,r10,r7
+    //c3 low part
+    rors    r11,r11,#((32-5+8)%32)
+    eors    r11,r11,r6
+    rors    r10,r6,#(5)
+    eors    r6,r10,r11
+
+    //state:
+    //r0 to r9: c
+    //r10,r11,r12 destroyed
+
+    //reload the bit counter, r10 was used as scratch by the round
+    ldr		r10,[sp,#16]
+    cmp     r10,#130
+    bne     drygascon128_f_v7m_fpu_x_mix128_main_loop
+    //prepare the last loop: load DS 2 msb
+    ldr     r14,[sp,#4]
+    lsr     r14,r14,#2
+    b       drygascon128_f_v7m_fpu_x_mix128_main_loop.core
+
+drygascon128_f_v7m_fpu_x_mix128_exit:
+    ldr     r14,[sp,#12]
+    //round=r11=rounds-1;
+    subs    r11,r14,#1
+    //base = round_cst+12-rounds
+    adr		r10, round_cst
+    adds    r10,r10,#12
+    subs    r10,r10,r14
+
+    //turn the frame into the one expected by the g main loop:
+    //after dropping the two lowest slots, 0: state, 4: base, 8: round
+    str     r10,[sp,#12]
+    str     r11,[sp,#16]
+
+    add		sp,sp,#8
+    b       drygascon128_g_v7m_fpu_x_main_loop
+
+.align 2
+//round constants ((0xf - i) << 4) | i for i = 11 down to 0; both entry
+//points index this table from its end so that the first round uses 0xf0
+round_cst:
+.byte 0x4b
+.byte 0x5a
+.byte 0x69
+.byte 0x78
+.byte 0x87
+.byte 0x96
+.byte 0xa5
+.byte 0xb4
+.byte 0xc3
+.byte 0xd2
+.byte 0xe1
+.byte 0xf0
+.align 2
+
+.size	drygascon128_f_v7m_fpu_x, .-drygascon128_f_v7m_fpu_x
+
+.type	drygascon128_g0_v7m_fpu_x, %function
+drygascon128_g0_v7m_fpu_x:
+    //perform a single round without accumulate
+    //r0: state
+    //The round constant is fixed to 0xf0 and the ten words loaded from
+    //state+C0 are overwritten with the result. Only core registers are used.
+    //Register allocation (low word/high word of each 64-bit lane):
+    //r0/r1: c0, r2/r3: c1, r4/r5: c2, r6/r7: c3, r8/r9: c4
+    push {r4, r5, r6, r7, r8, r9, r10, r11, r12, lr}
+
+    //Load C
+    //r14 keeps the address of C until the final store
+    adds    r14,r0,#C0
+    LDMIA.W r14,{r0,r1,r2,r3,r4,r5,r6,r7,r8,r9}
+
+    //r0~r9: c
+
+    //r11 = ((0xf - 0) << 4) | 0;
+    movs    r11,#0xf0
+    //state:
+    //r0 to r9: c
+    //r11: constant to add as round constant
+
+    // addition of round constant
+    //C2L ^= round constant;
+    eors    r4,r4,r11
+
+    // substitution layer, lower half
+    //works on the low words r0,r2,r4,r6,r8; r10,r11,r12 are scratch
+    eors    r0,r0,r8
+    eors    r8,r8,r6
+    eors    r4,r4,r2
+    mvns    r10,r0
+    mvns    r11,r6
+    mvns    r12,r8
+    ands    r10,r10,r2
+    ands    r11,r11,r8
+    eors    r8,r8,r10
+    ands    r12,r12,r0
+    mvns    r10,r4
+    ands    r10,r10,r6
+    eors    r6,r6,r12
+    mvns    r12,r2
+    ands    r12,r12,r4
+    eors    r4,r4,r11
+    eors    r6,r6,r4
+    //low word of c2 is complemented at the end of the substitution layer
+    mvns    r4,r4
+    eors    r0,r0,r12
+    eors    r2,r2,r10
+    eors    r2,r2,r0
+    eors    r0,r0,r8
+
+    // substitution layer, upper half
+    //same sequence as above on the high words r1,r3,r5,r7,r9
+    eors    r1,r1,r9
+    eors    r9,r9,r7
+    eors    r5,r5,r3
+    mvns    r10,r1
+    mvns    r11,r7
+    mvns    r12,r9
+    ands    r10,r10,r3
+    ands    r11,r11,r9
+    eors    r9,r9,r10
+    ands    r12,r12,r1
+    mvns    r10,r5
+    ands    r10,r10,r7
+    eors    r7,r7,r12
+    mvns    r12,r3
+    ands    r12,r12,r5
+    eors    r5,r5,r11
+    eors    r7,r7,r5
+    mvns    r5,r5
+    eors    r1,r1,r12
+    eors    r3,r3,r10
+    eors    r3,r3,r1
+    eors    r1,r1,r9
+
+
+    // linear diffusion layer
+    //In each lane the high word is overwritten first, so the "low part"
+    //re-uses r11 (= old high word rotated by the first amount) and rotates it
+    //further by (32 - first + second) % 32 to obtain the old high word rotated
+    //by the second amount.
+
+    //c4 ^= gascon_rotr64_interleaved(c4, 40) ^ gascon_rotr64_interleaved(c4, 7);
+    //c4 high part
+    rors    r11,r9,#(20)
+    eors    r9,r11,r9
+    rors    r10,r8,#(4)
+    eors    r9,r10,r9
+    //c4 low part
+    rors    r11,r11,#((32-20+3)%32)
+    eors    r11,r11,r8
+    rors    r10,r8,#(20)
+    eors    r8,r10,r11
+
+    //c0 ^= gascon_rotr64_interleaved(c0, 28) ^ gascon_rotr64_interleaved(c0, 19);
+    //c0 high part
+    rors    r11,r1,#(14)
+    eors    r1,r11,r1
+    rors    r10,r0,#(10)
+    eors    r1,r10,r1
+    //c0 low part
+    rors    r11,r11,#((32-14+9)%32)
+    eors    r11,r11,r0
+    rors    r10,r0,#(14)
+    eors    r0,r10,r11
+
+    //c1 ^= gascon_rotr64_interleaved(c1, 38) ^ gascon_rotr64_interleaved(c1, 61);
+    //c1 high part
+    rors    r11,r3,#(19)
+    eors    r3,r11,r3
+    rors    r10,r2,#(31)
+    eors    r3,r10,r3
+    //c1 low part
+    rors    r11,r11,#((32-19+30)%32)
+    eors    r11,r11,r2
+    rors    r10,r2,#(19)
+    eors    r2,r10,r11
+
+    //c2 ^= gascon_rotr64_interleaved(c2, 6) ^ gascon_rotr64_interleaved(c2, 1);
+    //c2 high part
+    rors    r11,r5,#(3)
+    eors    r5,r11,r5
+    rors    r10,r4,#(1)
+    eors    r5,r10,r5
+    //c2 low part
+    rors    r11,r11,#((32-3+0)%32)
+    eors    r11,r11,r4
+    rors    r10,r4,#(3)
+    eors    r4,r10,r11
+
+    //c3 ^= gascon_rotr64_interleaved(c3, 10) ^ gascon_rotr64_interleaved(c3, 17);
+    //c3 high part
+    rors    r11,r7,#(5)
+    eors    r7,r11,r7
+    rors    r10,r6,#(9)
+    eors    r7,r10,r7
+    //c3 low part
+    rors    r11,r11,#((32-5+8)%32)
+    eors    r11,r11,r6
+    rors    r10,r6,#(5)
+    eors    r6,r10,r11
+
+    //state:
+    //r0 to r9: c
+    //r10,r11,r12 destroyed
+
+    //update C
+    STMIA.W r14,{r0,r1,r2,r3,r4,r5,r6,r7,r8,r9}
+
+    //restore the saved registers and return by popping the saved lr into pc
+    pop    {r4, r5, r6, r7, r8, r9, r10, r11, r12, pc}
+.size	drygascon128_g0_v7m_fpu_x, .-drygascon128_g0_v7m_fpu_x
+#endif
--- a/drygascon/Implementations/crypto_aead/drygascon128/add_arm_cortex-m/drygascon128_arm_selector.h
+++ b/drygascon/Implementations/crypto_aead/drygascon128/add_arm_cortex-m/drygascon128_arm_selector.h
+#ifndef __DRYGASCON_ARM_SELECTOR_H__
+#define __DRYGASCON_ARM_SELECTOR_H__
+//Optional file to select the best implementation for each chip.
+//Step 1 maps known chip macros to an architecture/FPU selector,
+//step 2 maps the selectors to the DRYGASCON_*_OPT function names.
+
+//Chips served by the v7m code with FPU support
+#if defined(STM32H743xx) || defined(STM32F746xx)
+    #define __DRYGASCON_ARM_SELECTOR_V7M__
+    #define __DRYGASCON_ARM_SELECTOR_FPU__
+#endif
+
+//Chips served by the v7m code without FPU support
+#if defined(STM32F103xx) || defined(__SAM3X8E__)
+    #define __DRYGASCON_ARM_SELECTOR_V7M__
+#endif
+
+//Chips served by the v6m code
+#if defined(STM32L011xx)
+    #define __DRYGASCON_ARM_SELECTOR_V6M__
+#endif
+
+//TODO: add more chips here
+
+#if defined(__DRYGASCON_ARM_SELECTOR_V7M__) && defined(__DRYGASCON_ARM_SELECTOR_FPU__)
+    #define DRYGASCON_G_OPT   drygascon128_g_v7m_fpu
+    #define DRYGASCON_F_OPT   drygascon128_f_v7m_fpu
+    #define DRYGASCON_G0_OPT drygascon128_g0_v7m_fpu
+#endif
+
+#if defined(__DRYGASCON_ARM_SELECTOR_V7M__) && !defined(__DRYGASCON_ARM_SELECTOR_FPU__)
+    #define DRYGASCON_G_OPT   drygascon128_g_v7m
+    #define DRYGASCON_F_OPT   drygascon128_f_v7m
+    #define DRYGASCON_G0_OPT drygascon128_g0_v7m
+#endif
+
+#if defined(__DRYGASCON_ARM_SELECTOR_V6M__)
+    #define DRYGASCON_G_OPT   drygascon128_g_v6m
+    #define DRYGASCON_F_OPT   drygascon128_f_v6m
+    //no G0 override for v6m:
+    //#define DRYGASCON_G0_OPT drygascon128_g0_v6m
+    #define DRYGASCON_ALIGN_INPUT_32
+#endif
+
+#endif
--- a/drygascon/Implementations/crypto_aead/drygascon128/add_arm_cortex-m/encrypt.c
+++ b/drygascon/Implementations/crypto_aead/drygascon128/add_arm_cortex-m/encrypt.c
+#include "drygascon.h"
+
+/* Generic AEAD encryption entry point: forwards every argument unchanged
+ * to drygascon128_aead_encrypt() and returns its result. */
+int crypto_aead_encrypt(unsigned char *c, unsigned long long *clen,
+                        const unsigned char *m, unsigned long long mlen,
+                        const unsigned char *ad, unsigned long long adlen,
+                        const unsigned char *nsec,
+                        const unsigned char *npub,
+                        const unsigned char *k)
+{
+    const int status =
+        drygascon128_aead_encrypt(c, clen, m, mlen, ad, adlen, nsec, npub, k);
+
+    return status;
+}
+
+/* Generic AEAD decryption entry point: forwards every argument unchanged
+ * to drygascon128_aead_decrypt() and returns its result. */
+int crypto_aead_decrypt(unsigned char *m, unsigned long long *mlen,
+                        unsigned char *nsec,
+                        const unsigned char *c, unsigned long long clen,
+                        const unsigned char *ad, unsigned long long adlen,
+                        const unsigned char *npub,
+                        const unsigned char *k)
+{
+    const int status =
+        drygascon128_aead_decrypt(m, mlen, nsec, c, clen, ad, adlen, npub, k);
+
+    return status;
+}
--- a/drygascon/Implementations/crypto_aead/drygascon128/add_arm_cortex-m/implementors
+++ b/drygascon/Implementations/crypto_aead/drygascon128/add_arm_cortex-m/implementors
+Rhys Weatherley
+Sebastien Riou
--- a/drygascon/Implementations/crypto_aead/drygascon128/add_arm_cortex-m/internal-drysponge.c
+++ b/drygascon/Implementations/crypto_aead/drygascon128/add_arm_cortex-m/internal-drysponge.c
+/*
+ * Copyright (C) 2020 Southern Storm Software, Pty Ltd.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ */
+
+#include "internal-drysponge.h"
+#include <string.h>
+
+#if !defined(__AVR__)
+
+/* Right rotations in bit-interleaved format */
+/* A 64-bit value is handled as two 32-bit halves. The helpers below rely on
+ * rightRotate() and rightRotate1(), which are not defined in this part of the
+ * file. All of them are GNU statement expressions and evaluate "x" twice. */
+
+/* Rotates both 32-bit halves right by "bits" and keeps them in place. */
+#define intRightRotateEven(x,bits) \
+    (__extension__ ({ \
+        uint32_t _x0 = (uint32_t)(x); \
+        uint32_t _x1 = (uint32_t)((x) >> 32); \
+        _x0 = rightRotate(_x0, (bits)); \
+        _x1 = rightRotate(_x1, (bits)); \
+        _x0 | (((uint64_t)_x1) << 32); \
+    }))
+/* Rotates the low half right by ("bits" + 1) % 32 and the high half by
+ * "bits", then swaps the two halves.
+ * NOTE(review): for bits == 31 the low half is rotated by 0; confirm that
+ * rightRotate() accepts a zero count. */
+#define intRightRotateOdd(x,bits) \
+    (__extension__ ({ \
+        uint32_t _x0 = (uint32_t)(x); \
+        uint32_t _x1 = (uint32_t)((x) >> 32); \
+        _x0 = rightRotate(_x0, ((bits) + 1) % 32); \
+        _x1 = rightRotate(_x1, (bits)); \
+        _x1 | (((uint64_t)_x0) << 32); \
+    }))
+/* Rotation by one bit: the low half is rotated by 1 and the halves are
+ * swapped; the high half is moved without being rotated. */
+#define intRightRotate1_64(x) \
+    (__extension__ ({ \
+        uint32_t _x0 = (uint32_t)(x); \
+        uint32_t _x1 = (uint32_t)((x) >> 32); \
+        _x0 = rightRotate1(_x0); \
+        _x1 | (((uint64_t)_x0) << 32); \
+    }))
+/* Rotation by N bits for N = 2..63: an even N maps to
+ * intRightRotateEven(x, N / 2), an odd N to intRightRotateOdd(x, (N - 1) / 2). */
+#define intRightRotate2_64(x)  (intRightRotateEven((x), 1))
+#define intRightRotate3_64(x)  (intRightRotateOdd((x), 1))
+#define intRightRotate4_64(x)  (intRightRotateEven((x), 2))
+#define intRightRotate5_64(x)  (intRightRotateOdd((x), 2))
+#define intRightRotate6_64(x)  (intRightRotateEven((x), 3))
+#define intRightRotate7_64(x)  (intRightRotateOdd((x), 3))
+#define intRightRotate8_64(x)  (intRightRotateEven((x), 4))
+#define intRightRotate9_64(x)  (intRightRotateOdd((x), 4))
+#define intRightRotate10_64(x) (intRightRotateEven((x), 5))
+#define intRightRotate11_64(x) (intRightRotateOdd((x), 5))
+#define intRightRotate12_64(x) (intRightRotateEven((x), 6))
+#define intRightRotate13_64(x) (intRightRotateOdd((x), 6))
+#define intRightRotate14_64(x) (intRightRotateEven((x), 7))
+#define intRightRotate15_64(x) (intRightRotateOdd((x), 7))
+#define intRightRotate16_64(x) (intRightRotateEven((x), 8))
+#define intRightRotate17_64(x) (intRightRotateOdd((x), 8))
+#define intRightRotate18_64(x) (intRightRotateEven((x), 9))
+#define intRightRotate19_64(x) (intRightRotateOdd((x), 9))
+#define intRightRotate20_64(x) (intRightRotateEven((x), 10))
+#define intRightRotate21_64(x) (intRightRotateOdd((x), 10))
+#define intRightRotate22_64(x) (intRightRotateEven((x), 11))
+#define intRightRotate23_64(x) (intRightRotateOdd((x), 11))
+#define intRightRotate24_64(x) (intRightRotateEven((x), 12))
+#define intRightRotate25_64(x) (intRightRotateOdd((x), 12))
+#define intRightRotate26_64(x) (intRightRotateEven((x), 13))
+#define intRightRotate27_64(x) (intRightRotateOdd((x), 13))
+#define intRightRotate28_64(x) (intRightRotateEven((x), 14))
+#define intRightRotate29_64(x) (intRightRotateOdd((x), 14))
+#define intRightRotate30_64(x) (intRightRotateEven((x), 15))
+#define intRightRotate31_64(x) (intRightRotateOdd((x), 15))
+#define intRightRotate32_64(x) (intRightRotateEven((x), 16))
+#define intRightRotate33_64(x) (intRightRotateOdd((x), 16))
+#define intRightRotate34_64(x) (intRightRotateEven((x), 17))
+#define intRightRotate35_64(x) (intRightRotateOdd((x), 17))
+#define intRightRotate36_64(x) (intRightRotateEven((x), 18))
+#define intRightRotate37_64(x) (intRightRotateOdd((x), 18))
+#define intRightRotate38_64(x) (intRightRotateEven((x), 19))
+#define intRightRotate39_64(x) (intRightRotateOdd((x), 19))
+#define intRightRotate40_64(x) (intRightRotateEven((x), 20))
+#define intRightRotate41_64(x) (intRightRotateOdd((x), 20))
+#define intRightRotate42_64(x) (intRightRotateEven((x), 21))
+#define intRightRotate43_64(x) (intRightRotateOdd((x), 21))
+#define intRightRotate44_64(x) (intRightRotateEven((x), 22))
+#define intRightRotate45_64(x) (intRightRotateOdd((x), 22))
+#define intRightRotate46_64(x) (intRightRotateEven((x), 23))
+#define intRightRotate47_64(x) (intRightRotateOdd((x), 23))
+#define intRightRotate48_64(x) (intRightRotateEven((x), 24))
+#define intRightRotate49_64(x) (intRightRotateOdd((x), 24))
+#define intRightRotate50_64(x) (intRightRotateEven((x), 25))
+#define intRightRotate51_64(x) (intRightRotateOdd((x), 25))
+#define intRightRotate52_64(x) (intRightRotateEven((x), 26))
+#define intRightRotate53_64(x) (intRightRotateOdd((x), 26))
+#define intRightRotate54_64(x) (intRightRotateEven((x), 27))
+#define intRightRotate55_64(x) (intRightRotateOdd((x), 27))
+#define intRightRotate56_64(x) (intRightRotateEven((x), 28))
+#define intRightRotate57_64(x) (intRightRotateOdd((x), 28))
+#define intRightRotate58_64(x) (intRightRotateEven((x), 29))
+#define intRightRotate59_64(x) (intRightRotateOdd((x), 29))
+#define intRightRotate60_64(x) (intRightRotateEven((x), 30))
+#define intRightRotate61_64(x) (intRightRotateOdd((x), 30))
+#define intRightRotate62_64(x) (intRightRotateEven((x), 31))
+#define intRightRotate63_64(x) (intRightRotateOdd((x), 31))
+
+#ifdef DRYGASCON_G0_OPT
+/* Externally provided implementation of core round 0. */
+void DRYGASCON_G0_OPT(drysponge128_state_t *state);
+
+/* Applies core round 0 to the state by delegating to DRYGASCON_G0_OPT. */
+static void gascon128_g0(drysponge128_state_t *state)
+{
+    DRYGASCON_G0_OPT(state);
+}
+#else
+/* Applies one GASCON-128 core round, numbered "round", to the five 64-bit
+ * words of "state": round constant, substitution layer, linear diffusion. */
+void gascon128_core_round(gascon128_state_t *state, uint8_t round)
+{
+    uint64_t and01, and12, and23, and34, and40;
+
+    /* Fetch the five state words */
+#if defined(LW_UTIL_LITTLE_ENDIAN)
+    uint64_t x0 = state->S[0];
+    uint64_t x1 = state->S[1];
+    uint64_t x2 = state->S[2];
+    uint64_t x3 = state->S[3];
+    uint64_t x4 = state->S[4];
+#else
+    uint64_t x0 = le_load_word64(state->B);
+    uint64_t x1 = le_load_word64(state->B + 8);
+    uint64_t x2 = le_load_word64(state->B + 16);
+    uint64_t x3 = le_load_word64(state->B + 24);
+    uint64_t x4 = le_load_word64(state->B + 32);
+#endif
+
+    /* The round constant goes into the middle word */
+    x2 ^= ((0x0F - round) << 4) | round;
+
+    /* Substitution layer, step 1: mix neighbouring words */
+    x0 ^= x4;
+    x2 ^= x1;
+    x4 ^= x3;
+
+    /* Substitution layer, step 2: all (~a) & b terms are taken before
+     * any word is updated */
+    and01 = (~x0) & x1;
+    and12 = (~x1) & x2;
+    and23 = (~x2) & x3;
+    and34 = (~x3) & x4;
+    and40 = (~x4) & x0;
+    x0 ^= and12;
+    x1 ^= and23;
+    x2 ^= and34;
+    x3 ^= and40;
+    x4 ^= and01;
+
+    /* Substitution layer, step 3: final mixing and complement of x2 */
+    x1 ^= x0;
+    x3 ^= x2;
+    x0 ^= x4;
+    x2 = ~x2;
+
+    /* Linear diffusion layer: each word is XORed with two rotations of
+     * itself */
+    x0 ^= intRightRotate28_64(x0) ^ intRightRotate19_64(x0);
+    x1 ^= intRightRotate38_64(x1) ^ intRightRotate61_64(x1);
+    x2 ^= intRightRotate6_64(x2)  ^ intRightRotate1_64(x2);
+    x3 ^= intRightRotate17_64(x3) ^ intRightRotate10_64(x3);
+    x4 ^= intRightRotate40_64(x4) ^ intRightRotate7_64(x4);
+
+    /* Store the words back */
+#if defined(LW_UTIL_LITTLE_ENDIAN)
+    state->S[0] = x0;
+    state->S[1] = x1;
+    state->S[2] = x2;
+    state->S[3] = x3;
+    state->S[4] = x4;
+#else
+    le_store_word64(state->B,      x0);
+    le_store_word64(state->B +  8, x1);
+    le_store_word64(state->B + 16, x2);
+    le_store_word64(state->B + 24, x3);
+    le_store_word64(state->B + 32, x4);
+#endif
+}
+
+/* Applies core round 0 to the c part of the state. */
+static void gascon128_g0(drysponge128_state_t *state)
+{
+    gascon128_core_round(&(state->c), 0);
+}
+#endif
+
+/* Applies one GASCON-256 core round, numbered "round", to the nine 64-bit
+ * words of "state": round constant, substitution layer, linear diffusion. */
+void gascon256_core_round(gascon256_state_t *state, uint8_t round)
+{
+    uint64_t and01, and12, and23, and34, and45, and56, and67, and78, and80;
+
+    /* Fetch the nine state words */
+#if defined(LW_UTIL_LITTLE_ENDIAN)
+    uint64_t x0 = state->S[0];
+    uint64_t x1 = state->S[1];
+    uint64_t x2 = state->S[2];
+    uint64_t x3 = state->S[3];
+    uint64_t x4 = state->S[4];
+    uint64_t x5 = state->S[5];
+    uint64_t x6 = state->S[6];
+    uint64_t x7 = state->S[7];
+    uint64_t x8 = state->S[8];
+#else
+    uint64_t x0 = le_load_word64(state->B);
+    uint64_t x1 = le_load_word64(state->B + 8);
+    uint64_t x2 = le_load_word64(state->B + 16);
+    uint64_t x3 = le_load_word64(state->B + 24);
+    uint64_t x4 = le_load_word64(state->B + 32);
+    uint64_t x5 = le_load_word64(state->B + 40);
+    uint64_t x6 = le_load_word64(state->B + 48);
+    uint64_t x7 = le_load_word64(state->B + 56);
+    uint64_t x8 = le_load_word64(state->B + 64);
+#endif
+
+    /* The round constant goes into the middle word */
+    x4 ^= ((0x0F - round) << 4) | round;
+
+    /* Substitution layer, step 1: mix neighbouring words */
+    x0 ^= x8;
+    x2 ^= x1;
+    x4 ^= x3;
+    x6 ^= x5;
+    x8 ^= x7;
+
+    /* Substitution layer, step 2: all (~a) & b terms are taken before
+     * any word is updated */
+    and01 = (~x0) & x1;
+    and12 = (~x1) & x2;
+    and23 = (~x2) & x3;
+    and34 = (~x3) & x4;
+    and45 = (~x4) & x5;
+    and56 = (~x5) & x6;
+    and67 = (~x6) & x7;
+    and78 = (~x7) & x8;
+    and80 = (~x8) & x0;
+    x0 ^= and12;
+    x1 ^= and23;
+    x2 ^= and34;
+    x3 ^= and45;
+    x4 ^= and56;
+    x5 ^= and67;
+    x6 ^= and78;
+    x7 ^= and80;
+    x8 ^= and01;
+
+    /* Substitution layer, step 3: final mixing and complement of x4 */
+    x1 ^= x0;
+    x3 ^= x2;
+    x5 ^= x4;
+    x7 ^= x6;
+    x0 ^= x8;
+    x4 = ~x4;
+
+    /* Linear diffusion layer: each word is XORed with two rotations of
+     * itself */
+    x0 ^= intRightRotate28_64(x0) ^ intRightRotate19_64(x0);
+    x1 ^= intRightRotate38_64(x1) ^ intRightRotate61_64(x1);
+    x2 ^= intRightRotate6_64(x2)  ^ intRightRotate1_64(x2);
+    x3 ^= intRightRotate17_64(x3) ^ intRightRotate10_64(x3);
+    x4 ^= intRightRotate40_64(x4) ^ intRightRotate7_64(x4);
+    x5 ^= intRightRotate26_64(x5) ^ intRightRotate31_64(x5);
+    x6 ^= intRightRotate58_64(x6) ^ intRightRotate53_64(x6);
+    x7 ^= intRightRotate46_64(x7) ^ intRightRotate9_64(x7);
+    x8 ^= intRightRotate50_64(x8) ^ intRightRotate43_64(x8);
+
+    /* Store the words back */
+#if defined(LW_UTIL_LITTLE_ENDIAN)
+    state->S[0] = x0;
+    state->S[1] = x1;
+    state->S[2] = x2;
+    state->S[3] = x3;
+    state->S[4] = x4;
+    state->S[5] = x5;
+    state->S[6] = x6;
+    state->S[7] = x7;
+    state->S[8] = x8;
+#else
+    le_store_word64(state->B,      x0);
+    le_store_word64(state->B +  8, x1);
+    le_store_word64(state->B + 16, x2);
+    le_store_word64(state->B + 24, x3);
+    le_store_word64(state->B + 32, x4);
+    le_store_word64(state->B + 40, x5);
+    le_store_word64(state->B + 48, x6);
+    le_store_word64(state->B + 56, x7);
+    le_store_word64(state->B + 64, x8);
+#endif
+}
+
+#ifdef DRYGASCON_G_OPT
+/* Prototype of the routine that DRYGASCON_G_OPT names; its body is not
+ * part of this file section. */
+void DRYGASCON_G_OPT(uint64_t* state, uint32_t rounds);
+
+/**
+ * \brief Runs the DrySPONGE128 G function by delegating to DRYGASCON_G_OPT.
+ *
+ * \param state DrySPONGE128 state, handed over as a pointer to 64-bit words.
+ * Only the c, r and x members are meant to be accessed through it.
+ * \param rounds Number of rounds, passed through unchanged.
+ */
+static void drysponge128_g_impl(drysponge128_state_t *state,unsigned int rounds)
+{
+    uint64_t *words = (uint64_t *)state;
+
+    DRYGASCON_G_OPT(words, rounds);
+}
+#else
+
+/**
+ * \brief Portable DrySPONGE128 G function.
+ *
+ * Each round permutes state->c and then contributes the first 16 bytes of
+ * c, XORed with the next 16 bytes rotated by 4 bytes, to the output.
+ * The contributions of all rounds are XORed together into state->r.
+ *
+ * \param state DrySPONGE128 state; only c, r and x are meant to be accessed.
+ * \param rounds Number of rounds.  One round is always run, even for 0.
+ */
+static void drysponge128_g_impl(drysponge128_state_t *state,unsigned int rounds)
+{
+    uint32_t out0 = 0, out1 = 0, out2 = 0, out3 = 0;
+    unsigned idx = 0;
+
+    do {
+        gascon128_core_round(&(state->c), idx);
+        out0 ^= state->c.W[0] ^ state->c.W[5];
+        out1 ^= state->c.W[1] ^ state->c.W[6];
+        out2 ^= state->c.W[2] ^ state->c.W[7];
+        out3 ^= state->c.W[3] ^ state->c.W[4];
+        ++idx;
+    } while (idx < rounds);
+
+    /* Publish the accumulated rate block */
+    state->r.W[0] = out0;
+    state->r.W[1] = out1;
+    state->r.W[2] = out2;
+    state->r.W[3] = out3;
+}
+#endif
+/* Public G entry point: runs the G function for state->rounds rounds. */
+void drysponge128_g(drysponge128_state_t *state)
+{
+    const unsigned int count = state->rounds;
+
+    drysponge128_g_impl(state, count);
+}
+
+/* DrySPONGE256 G function: permutes state->c for state->rounds rounds
+ * (at least one) and squeezes the result into state->r. */
+void drysponge256_g(drysponge256_state_t *state)
+{
+    uint32_t out0 = 0, out1 = 0, out2 = 0, out3 = 0;
+    unsigned idx = 0;
+
+    /* After every round, XOR the first 16 bytes of the state into the
+     * output, then the next 16 bytes rotated by 4 bytes, and so on for
+     * a total of 64 bytes folded into the 16 output bytes. */
+    do {
+        gascon256_core_round(&(state->c), idx);
+        out0 ^= state->c.W[0]  ^ state->c.W[5] ^
+                state->c.W[10] ^ state->c.W[15];
+        out1 ^= state->c.W[1]  ^ state->c.W[6] ^
+                state->c.W[11] ^ state->c.W[12];
+        out2 ^= state->c.W[2]  ^ state->c.W[7] ^
+                state->c.W[8]  ^ state->c.W[13];
+        out3 ^= state->c.W[3]  ^ state->c.W[4] ^
+                state->c.W[9]  ^ state->c.W[14];
+        ++idx;
+    } while (idx < state->rounds);
+
+    /* Publish the accumulated rate block */
+    state->r.W[0] = out0;
+    state->r.W[1] = out1;
+    state->r.W[2] = out2;
+    state->r.W[3] = out3;
+}
+
+#endif /* !__AVR__ */
+
+#ifndef DRYGASCON_G_OPT
+/* Runs state->rounds core rounds on state->c without producing output. */
+void drysponge128_g_core(drysponge128_state_t *state)
+{
+    unsigned idx = 0;
+
+    while (idx < state->rounds) {
+        gascon128_core_round(&(state->c), idx);
+        ++idx;
+    }
+}
+#endif
+
+/* Runs state->rounds core rounds on state->c without producing output. */
+void drysponge256_g_core(drysponge256_state_t *state)
+{
+    unsigned idx = 0;
+
+    while (idx < state->rounds) {
+        gascon256_core_round(&(state->c), idx);
+        ++idx;
+    }
+}
+
+/**
+ * \fn uint32_t drysponge_select_x(const uint32_t x[4], uint8_t index)
+ * \brief Selects an element of x in constant time.
+ *
+ * \param x Points to the four elements of x.
+ * \param index Index of which element to extract between 0 and 3.
+ *
+ * \return The selected element of x.
+ *
+ * When __HAS_CACHE__ is defined this is an inline function that reads all
+ * four elements and masks away the unwanted ones.  Otherwise it is a
+ * macro that indexes x directly.
+ */
+#if defined(__HAS_CACHE__)
+STATIC_INLINE uint32_t drysponge_select_x(const uint32_t x[4], uint8_t index)
+{
+    /* We need to be careful how we select each element of x because
+     * we are doing a data-dependent fetch here.  Do the fetch in a way
+     * that should avoid cache timing issues by fetching every element
+     * of x and masking away the ones we don't want.
+     *
+     * For v in 0..3, (0x04 - v) >> 2 is 1 only when v is 0, so negating
+     * it gives an all-ones mask for the element whose position equals
+     * index (index ^ position == 0) and an all-zeroes mask otherwise.
+     *
+     * There is a possible side channel here with respect to power analysis.
+     * The "mask" value will be all-ones for the selected index and all-zeroes
+     * for the other indexes.  This may show up as different power consumption
+     * for the "result ^= x[i] & mask" statement when i is the selected index.
+     * Such a side channel could in theory allow reading the plaintext input
+     * to the cipher by analysing the CPU's power consumption.
+     *
+     * The DryGASCON specification acknowledges the possibility of plaintext
+     * recovery in section 7.4.  For software mitigation the specification
+     * suggests randomization of the indexes into c and x and randomization
+     * of the order of processing words.  We aren't doing that here yet.
+     * Patches welcome to fix this.
+     */
+    uint32_t mask = -((uint32_t)((0x04 - index) >> 2));
+    uint32_t result = x[0] & mask;
+    mask = -((uint32_t)((0x04 - (index ^ 0x01)) >> 2));
+    result ^= x[1] & mask;
+    mask = -((uint32_t)((0x04 - (index ^ 0x02)) >> 2));
+    result ^= x[2] & mask;
+    mask = -((uint32_t)((0x04 - (index ^ 0x03)) >> 2));
+    return result ^ (x[3] & mask);
+}
+#else
+/* Used when __HAS_CACHE__ is not defined.  A target without anything
+ * like an L1 or L2 cache (AVR is one example) is more or less immune to
+ * cache timing issues, so select the word directly. */
+#define drysponge_select_x(x, index) ((x)[(index)])
+#endif
+
+#ifndef DRYGASCON_F_OPT
+/**
+ * \brief Mixes one 10-bit group into the DrySPONGE128 state.
+ *
+ * The group is read as five 2-bit indexes, lowest bits first.  Index
+ * number k picks a word of x that is XORed into even word 2*k of c.
+ *
+ * \param state DrySPONGE128 state.
+ * \param data The data to be mixed in the bottom 10 bits.
+ */
+static void drysponge128_mix_phase_round
+    (drysponge128_state_t *state, uint32_t data)
+{
+    unsigned lane;
+
+    for (lane = 0; lane < 5; ++lane) {
+        state->c.W[2 * lane] ^=
+            drysponge_select_x(state->x.W, (data >> (2 * lane)) & 0x03);
+    }
+}
+#endif
+
+/**
+ * \brief Mixes one 18-bit group into the DrySPONGE256 state.
+ *
+ * The group is read as nine 2-bit indexes, lowest bits first.  Index
+ * number k picks a word of x that is XORed into even word 2*k of c.
+ *
+ * \param state DrySPONGE256 state.
+ * \param data The data to be mixed in the bottom 18 bits.
+ */
+static void drysponge256_mix_phase_round
+    (drysponge256_state_t *state, uint32_t data)
+{
+    unsigned lane;
+
+    for (lane = 0; lane < 9; ++lane) {
+        state->c.W[2 * lane] ^=
+            drysponge_select_x(state->x.W, (data >> (2 * lane)) & 0x03);
+    }
+}
+
+#ifndef DRYGASCON_F_OPT
+/**
+ * \brief Mixes an input block into a DrySPONGE128 state.
+ *
+ * The block is consumed as 10-bit groups.  Bytes 0..14 form three runs
+ * of five bytes, each giving four groups.  Byte 15 is combined with the
+ * low bits of \a ds, and the final group is \a ds shifted down by 10.
+ * A core round with round number 0 is run between consecutive groups.
+ *
+ * \param state The DrySPONGE128 state.
+ * \param data Full rate block containing the input data.
+ * \param ds Domain separator to add to the last two groups.
+ */
+static void drysponge128_mix_phase
+    (drysponge128_state_t *state, const unsigned char data[DRYSPONGE128_RATE],unsigned int ds)
+{
+    unsigned base;
+
+    /* Three runs of 5 bytes = 4 groups of 10 bits each.  Only the bottom
+     * 10 bits of each value are looked at by the mix round. */
+    for (base = 0; base < 15; base += 5) {
+        const unsigned char *p = data + base;
+
+        drysponge128_mix_phase_round
+            (state, p[0] | (((uint32_t)(p[1])) << 8));
+        gascon128_core_round(&(state->c), 0);
+        drysponge128_mix_phase_round
+            (state, (p[1] >> 2) | (((uint32_t)(p[2])) << 6));
+        gascon128_core_round(&(state->c), 0);
+        drysponge128_mix_phase_round
+            (state, (p[2] >> 4) | (((uint32_t)(p[3])) << 4));
+        gascon128_core_round(&(state->c), 0);
+        drysponge128_mix_phase_round
+            (state, (p[3] >> 6) | (((uint32_t)(p[4])) << 2));
+        gascon128_core_round(&(state->c), 0);
+    }
+
+    /* Last data byte together with the domain separator */
+    drysponge128_mix_phase_round(state, data[15] ^ ds);
+    gascon128_core_round(&(state->c), 0);
+
+    /* Remaining domain separator bits; no core round follows */
+    drysponge128_mix_phase_round(state, ds >> 10);
+}
+#endif
+
+/**
+ * \brief Mixes an input block into a DrySPONGE256 state.
+ *
+ * The block is consumed as seven 18-bit groups.  The two bits left over
+ * from the last byte are combined with state->domain for an eighth group.
+ * A core round with round number 0 is run between consecutive groups.
+ *
+ * \param state The DrySPONGE256 state; state->domain is 0 on return.
+ * \param data Full rate block containing the input data.
+ */
+static void drysponge256_mix_phase
+    (drysponge256_state_t *state, const unsigned char data[DRYSPONGE256_RATE])
+{
+    uint32_t group;
+
+    /* Group 1: starts at bit 0 of byte 0 */
+    group  = data[0];
+    group |= ((uint32_t)(data[1])) << 8;
+    group |= ((uint32_t)(data[2])) << 16;
+    drysponge256_mix_phase_round(state, group);
+    gascon256_core_round(&(state->c), 0);
+
+    /* Group 2: starts at bit 2 of byte 2 */
+    group  = data[2] >> 2;
+    group |= ((uint32_t)(data[3])) << 6;
+    group |= ((uint32_t)(data[4])) << 14;
+    drysponge256_mix_phase_round(state, group);
+    gascon256_core_round(&(state->c), 0);
+
+    /* Group 3: starts at bit 4 of byte 4 */
+    group  = data[4] >> 4;
+    group |= ((uint32_t)(data[5])) << 4;
+    group |= ((uint32_t)(data[6])) << 12;
+    drysponge256_mix_phase_round(state, group);
+    gascon256_core_round(&(state->c), 0);
+
+    /* Group 4: starts at bit 6 of byte 6 */
+    group  = data[6] >> 6;
+    group |= ((uint32_t)(data[7])) << 2;
+    group |= ((uint32_t)(data[8])) << 10;
+    drysponge256_mix_phase_round(state, group);
+    gascon256_core_round(&(state->c), 0);
+
+    /* Group 5: starts at bit 0 of byte 9 */
+    group  = data[9];
+    group |= ((uint32_t)(data[10])) << 8;
+    group |= ((uint32_t)(data[11])) << 16;
+    drysponge256_mix_phase_round(state, group);
+    gascon256_core_round(&(state->c), 0);
+
+    /* Group 6: starts at bit 2 of byte 11 */
+    group  = data[11] >> 2;
+    group |= ((uint32_t)(data[12])) << 6;
+    group |= ((uint32_t)(data[13])) << 14;
+    drysponge256_mix_phase_round(state, group);
+    gascon256_core_round(&(state->c), 0);
+
+    /* Group 7: starts at bit 4 of byte 13 */
+    group  = data[13] >> 4;
+    group |= ((uint32_t)(data[14])) << 4;
+    group |= ((uint32_t)(data[15])) << 12;
+    drysponge256_mix_phase_round(state, group);
+    gascon256_core_round(&(state->c), 0);
+
+    /* Group 8: top two bits of byte 15 plus the domain; no core round */
+    group  = data[15] >> 6;
+    group ^= state->domain;
+    drysponge256_mix_phase_round(state, group);
+
+    /* Revert to the default domain separator for the next block */
+    state->domain = 0;
+}
+
+#ifdef DRYGASCON_F_OPT
+/* Prototype of the routine that DRYGASCON_F_OPT names; its body is not
+ * part of this file section. */
+void DRYGASCON_F_OPT(drysponge128_state_t *state, const unsigned char *input,unsigned int ds, unsigned int rounds);
+
+/* F function: forwards all arguments unchanged to DRYGASCON_F_OPT. */
+static void drygascon128_f_impl(drysponge128_state_t *state, const unsigned char *input,unsigned int ds, unsigned int rounds)
+{
+    DRYGASCON_F_OPT(state, input, ds, rounds);
+}
+#else
+/* F function: mix phase over a full rate block at input with domain
+ * separator ds, followed by the G function for the given rounds. */
+void drygascon128_f_impl(drysponge128_state_t *state, const unsigned char *input,unsigned int ds, unsigned int rounds)
+{
+    drysponge128_mix_phase(state, input, ds);
+    drysponge128_g_impl(state, rounds);
+}
+#endif
+/**
+ * Runs the DrySPONGE128 F function on one input block.
+ *
+ * A block shorter than DRYSPONGE128_RATE is copied to a local buffer and
+ * padded with a 0x01 byte followed by zeroes.  A full block is used in
+ * place, unless DRYGASCON_ALIGN_INPUT_32 is defined, in which case it is
+ * copied to the local buffer first.  state->domain and state->rounds
+ * are passed to the F implementation and state->domain is then set to 0.
+ */
+void drygascon128_f_wrap(drysponge128_state_t *state, const unsigned char *input, unsigned len){
+    /* Declared as the rate union to enforce alignment (if needed by f_impl) */
+    drysponge128_rate_t block;
+    const unsigned char *src = input;
+
+    if (len < DRYSPONGE128_RATE) {
+        memcpy(block.B, input, len);
+        block.B[len] = 0x01;
+        memset(block.B + len + 1, 0, DRYSPONGE128_RATE - len - 1);
+        src = block.B;
+    }
+#ifdef DRYGASCON_ALIGN_INPUT_32
+    else {
+        memcpy(block.B, input, DRYSPONGE128_RATE);
+        src = block.B;
+    }
+#endif
+
+    drygascon128_f_impl(state, src, state->domain, state->rounds);
+
+    /* Revert to the default domain separator for the next block */
+    state->domain = 0;
+}
+
+/* Absorption half of the DrySPONGE256 F function.  A full block is mixed
+ * in directly; a shorter one is first padded with 0x01 then zeroes. */
+void drysponge256_f_absorb
+    (drysponge256_state_t *state, const unsigned char *input, unsigned len)
+{
+    unsigned char block[DRYSPONGE256_RATE];
+
+    /* Full block: no copy and no padding needed */
+    if (len >= DRYSPONGE256_RATE) {
+        drysponge256_mix_phase(state, input);
+        return;
+    }
+
+    memcpy(block, input, len);
+    block[len] = 0x01;
+    memset(block + len + 1, 0, DRYSPONGE256_RATE - len - 1);
+    drysponge256_mix_phase(state, block);
+}
+
+/**
+ * \brief Reports whether any two of the four words of an "x" value match.
+ *
+ * \param x Points to the "x" buffer to check.
+ *
+ * \return Non-zero if at least one pair of words is equal, zero if all
+ * four words are distinct from each other.
+ *
+ * All six pairs are always examined and no branch depends on the words,
+ * in an attempt to keep the check constant time and so avoid giving
+ * away information about the value of the key.
+ */
+static int drysponge_x_words_are_same(const uint32_t x[4])
+{
+    int any_equal = 0;
+    unsigned hi, lo;
+
+    for (hi = 1; hi < 4; ++hi) {
+        for (lo = 0; lo < hi; ++lo) {
+            /* 2^32 - diff keeps bit 32 set only when diff is zero */
+            const uint64_t diff = x[lo] ^ x[hi];
+            any_equal |= (int)((0x100000000ULL - diff) >> 32);
+        }
+    }
+    return any_equal;
+}
+
+
+/* Returns non-zero when the address of state->x is a multiple of 16. */
+int drysponge128_safe_alignement(const drysponge128_state_t*state){
+    const uintptr_t x_addr = (uintptr_t)&(state->x);
+
+    return (x_addr & 0xF) == 0;
+}
+/**
+ * Sets up a DrySPONGE128 state from a key and a nonce.
+ *
+ * The key is loaded into c and x according to keysize, then the 16-byte
+ * nonce is absorbed with DRYSPONGE128_INIT_ROUNDS rounds.  On return
+ * state->rounds is DRYSPONGE128_ROUNDS.
+ *
+ * keysize is compared against DRYGASCON128_SAFEKEY_SIZE,
+ * DRYGASCON128_FASTKEY_SIZE and DRYGASCON128_MINKEY_SIZE.
+ *
+ * NOTE(review): for any other keysize, c is still filled from the first
+ * 16 key bytes but x is not written by this function.  Confirm that
+ * callers validate keysize.
+ */
+void drysponge128_setup
+    (drysponge128_state_t *state, const unsigned char *key, unsigned int keysize,
+     const unsigned char *nonce, int final_block)
+{
+	if(DRYGASCON128_SAFEKEY_SIZE==keysize){
+		/* Fill C and X directly with the key: sizeof(state->c) bytes
+		 * for c, followed by sizeof(state->x) bytes for x */
+		memcpy(state->c.B, key, sizeof(state->c));
+		memcpy(state->x.B, key+ sizeof(state->c), sizeof(state->x));
+		while (drysponge_x_words_are_same(state->x.W)); //block here (spin forever) if the key is not valid, i.e. two words of x are equal
+
+	} else {
+		/* Fill the GASCON-128 state with repeated copies of the key */
+		memcpy(state->c.B, key, 16);
+		memcpy(state->c.B + 16, key, 16);
+		memcpy(state->c.B + 32, key, 8);
+
+		if(DRYGASCON128_FASTKEY_SIZE==keysize){
+
+			/* Fill X with the 16 last bytes of the key */
+			memcpy(state->x.B, key+16, sizeof(state->x));
+			while (drysponge_x_words_are_same(state->x.W)); //block here (spin forever) if the key is not valid, i.e. two words of x are equal
+
+		} else if(DRYGASCON128_MINKEY_SIZE==keysize){
+
+			/* Generate the "x" value for the state.  All four words of "x"
+			 * must be unique because they will be used in drysponge_select_x()
+			 * as stand-ins for the bit pairs 00, 01, 10, and 11.
+			 *
+			 * Run the core block operation over and over until "x" is unique.
+			 * Technically the runtime here is key-dependent and not constant.
+			 * If the input key is randomized, this should only take 1 round
+			 * on average so it is "almost constant time".
+			 */
+			do {
+				//gascon128_core_round(&(state->c), 0);
+				//drysponge128_g_impl(state,1);
+				gascon128_g0(state);
+			} while (drysponge_x_words_are_same(state->c.W));
+			memcpy(state->x.W, state->c.W, sizeof(state->x));
+
+			/* Replace the generated "x" value in the state with the key prefix */
+			memcpy(state->c.W, key, sizeof(state->x));
+		}
+	}
+
+    /* Absorb the nonce into the state with an increased number of rounds */
+    state->rounds = DRYSPONGE128_INIT_ROUNDS;
+    state->domain = DRYDOMAIN128_NONCE;
+    if (final_block)
+        state->domain |= DRYDOMAIN128_FINAL;
+    drygascon128_f_wrap(state, nonce, 16);
+
+    /* Set up the normal number of rounds for future operations */
+    state->rounds = DRYSPONGE128_ROUNDS;
+}
+
+/* Sets up a DrySPONGE256 state from a 32-byte key and a 16-byte nonce. */
+void drysponge256_setup
+    (drysponge256_state_t *state, const unsigned char *key,
+     const unsigned char *nonce, int final_block)
+{
+    /* Repeat the 32-byte key to cover all 72 bytes of the GASCON-256 state */
+    memcpy(state->c.B, key, 32);
+    memcpy(state->c.B + 32, key, 32);
+    memcpy(state->c.B + 64, key, 8);
+
+    /* Run core rounds until the first four words of c are pairwise
+     * distinct, then take them as the "x" value */
+    for (;;) {
+        gascon256_core_round(&(state->c), 0);
+        if (!drysponge_x_words_are_same(state->c.W))
+            break;
+    }
+    memcpy(state->x.W, state->c.W, sizeof(state->x));
+
+    /* Put the key prefix back where "x" was generated */
+    memcpy(state->c.W, key, sizeof(state->x));
+
+    /* The nonce is absorbed with the larger initialization round count */
+    state->rounds = DRYSPONGE256_INIT_ROUNDS;
+    if (final_block)
+        state->domain = DRYDOMAIN256_NONCE | DRYDOMAIN256_FINAL;
+    else
+        state->domain = DRYDOMAIN256_NONCE;
+    drysponge256_f_absorb(state, nonce, 16);
+    drysponge256_g(state);
+
+    /* Later operations use the normal round count */
+    state->rounds = DRYSPONGE256_ROUNDS;
+}
--- a/drygascon/Implementations/crypto_aead/drygascon128/add_arm_cortex-m/internal-drysponge.h
+++ b/drygascon/Implementations/crypto_aead/drygascon128/add_arm_cortex-m/internal-drysponge.h
+/*
+ * Copyright (C) 2020 Southern Storm Software, Pty Ltd.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ */
+
+#ifndef LW_INTERNAL_DRYSPONGE_H
+#define LW_INTERNAL_DRYSPONGE_H
+
+#include "drygascon.h"
+#include "drygascon128_arm_selector.h"
+
+#include "internal-util.h"
+
+/**
+ * \file internal-drysponge.h
+ * \brief Internal implementation of DrySPONGE for the DryGASCON cipher.
+ *
+ * References: https://github.com/sebastien-riou/DryGASCON
+ */
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/**
+ * \brief Size of the GASCON-128 permutation state in bytes.
+ */
+#define GASCON128_STATE_SIZE 40
+
+/**
+ * \brief Size of the GASCON-256 permutation state in bytes.
+ */
+#define GASCON256_STATE_SIZE 72
+
+/**
+ * \brief Rate of absorption and squeezing for DrySPONGE128, in bytes.
+ */
+#define DRYSPONGE128_RATE 16
+
+/**
+ * \brief Rate of absorption and squeezing for DrySPONGE256, in bytes.
+ */
+#define DRYSPONGE256_RATE 16
+
+/**
+ * \brief Size of the "x" value for DrySPONGE128, in bytes.
+ */
+#define DRYSPONGE128_XSIZE 16
+
+/**
+ * \brief Size of the "x" value for DrySPONGE256, in bytes.
+ */
+#define DRYSPONGE256_XSIZE 16
+
+/**
+ * \brief Normal number of rounds for DrySPONGE128 when absorbing
+ * and squeezing data.
+ */
+#define DRYSPONGE128_ROUNDS 7
+
+/**
+ * \brief Number of rounds for DrySPONGE128 during initialization,
+ * i.e. while the nonce is absorbed by drysponge128_setup().
+ */
+#define DRYSPONGE128_INIT_ROUNDS 11
+
+/**
+ * \brief Normal number of rounds for DrySPONGE256 when absorbing
+ * and squeezing data.
+ */
+#define DRYSPONGE256_ROUNDS 8
+
+/**
+ * \brief Number of rounds for DrySPONGE256 during initialization,
+ * i.e. while the nonce is absorbed by drysponge256_setup().
+ */
+#define DRYSPONGE256_INIT_ROUNDS 12
+
+#ifdef DRYGASCON_F_OPT
+
+    /**
+     * \brief DrySPONGE128 domain bit for a padded block.
+     *
+     * This set of values applies when DRYGASCON_F_OPT is defined, in which
+     * case the domain is handed unchanged to the DRYGASCON_F_OPT routine.
+     */
+    #define DRYDOMAIN128_PADDED (1 << 0)
+
+    /**
+     * \brief DrySPONGE128 domain bit for a final block.
+     */
+    #define DRYDOMAIN128_FINAL (1 << 1)
+
+    /**
+     * \brief DrySPONGE128 domain value for processing the nonce.
+     */
+    #define DRYDOMAIN128_NONCE (1 << 2)
+
+    /**
+     * \brief DrySPONGE128 domain value for processing the associated data.
+     */
+    #define DRYDOMAIN128_ASSOC_DATA (2 << 2)
+
+    /**
+     * \brief DrySPONGE128 domain value for processing the message.
+     */
+    #define DRYDOMAIN128_MESSAGE (3 << 2)
+
+#else
+
+    /**
+     * \brief DrySPONGE128 domain bit for a padded block.
+     *
+     * This set of values applies to the C mix phase, which XORs the domain
+     * onto the last data byte.  Bits 8 and 9 therefore complete that
+     * 10-bit group, and the bits from 10 upwards form the final group.
+     */
+    #define DRYDOMAIN128_PADDED (1 << 8)
+
+    /**
+     * \brief DrySPONGE128 domain bit for a final block.
+     */
+    #define DRYDOMAIN128_FINAL (1 << 9)
+
+    /**
+     * \brief DrySPONGE128 domain value for processing the nonce.
+     */
+    #define DRYDOMAIN128_NONCE (1 << 10)
+
+    /**
+     * \brief DrySPONGE128 domain value for processing the associated data.
+     */
+    #define DRYDOMAIN128_ASSOC_DATA (2 << 10)
+
+
+    /**
+     * \brief DrySPONGE128 domain value for processing the message.
+     */
+    #define DRYDOMAIN128_MESSAGE (3 << 10)
+
+#endif
+
+
+/**
+ * \brief DrySPONGE256 domain bit for a padded block.
+ *
+ * The DrySPONGE256 mix phase XORs the domain onto the top two bits of the
+ * last data byte (shifted down to bits 0 and 1), which is why the domain
+ * values start at bit 2.
+ */
+#define DRYDOMAIN256_PADDED (1 << 2)
+
+/**
+ * \brief DrySPONGE256 domain bit for a final block.
+ */
+#define DRYDOMAIN256_FINAL (1 << 3)
+
+/**
+ * \brief DrySPONGE256 domain value for processing the nonce.
+ */
+#define DRYDOMAIN256_NONCE (1 << 4)
+
+/**
+ * \brief DrySPONGE256 domain value for processing the associated data.
+ */
+#define DRYDOMAIN256_ASSOC_DATA (2 << 4)
+
+/**
+ * \brief DrySPONGE256 domain value for processing the message.
+ */
+#define DRYDOMAIN256_MESSAGE (3 << 4)
+
+/**
+ * \brief Internal state of the GASCON-128 permutation.
+ *
+ * The three members are views of the same GASCON128_STATE_SIZE bytes.
+ */
+typedef union
+{
+    uint64_t S[GASCON128_STATE_SIZE / 8];   /**< 64-bit words of the state */
+    uint32_t W[GASCON128_STATE_SIZE / 4];   /**< 32-bit words of the state */
+    uint8_t B[GASCON128_STATE_SIZE];        /**< Bytes of the state */
+
+} gascon128_state_t;
+
+/**
+ * \brief Internal state of the GASCON-256 permutation.
+ *
+ * The three members are views of the same GASCON256_STATE_SIZE bytes.
+ */
+typedef union
+{
+    uint64_t S[GASCON256_STATE_SIZE / 8];   /**< 64-bit words of the state */
+    uint32_t W[GASCON256_STATE_SIZE / 4];   /**< 32-bit words of the state */
+    uint8_t B[GASCON256_STATE_SIZE];        /**< Bytes of the state */
+
+} gascon256_state_t;
+
+/**
+ * \brief Structure of a rate block for DrySPONGE128.
+ *
+ * The three members are views of the same DRYSPONGE128_RATE bytes.
+ */
+typedef union
+{
+    uint64_t S[DRYSPONGE128_RATE / 8];      /**< 64-bit words of the rate */
+    uint32_t W[DRYSPONGE128_RATE / 4];      /**< 32-bit words of the rate */
+    uint8_t B[DRYSPONGE128_RATE];           /**< Bytes of the rate */
+
+} drysponge128_rate_t;
+
+/**
+ * \brief Structure of a rate block for DrySPONGE256.
+ *
+ * The three members are views of the same DRYSPONGE256_RATE bytes.
+ */
+typedef union
+{
+    uint64_t S[DRYSPONGE256_RATE / 8];  /**< 64-bit words of the rate */
+    uint32_t W[DRYSPONGE256_RATE / 4];  /**< 32-bit words of the rate */
+    uint8_t B[DRYSPONGE256_RATE];       /**< Bytes of the rate */
+
+} drysponge256_rate_t;
+
+/**
+ * \brief Structure of the "x" value for DrySPONGE128.
+ *
+ * The three members are views of the same DRYSPONGE128_XSIZE bytes.
+ * The type is declared with 16-byte alignment.
+ */
+typedef union
+{
+    uint64_t S[DRYSPONGE128_XSIZE / 8]; /**< 64-bit words of the "x" value */
+    uint32_t W[DRYSPONGE128_XSIZE / 4]; /**< 32-bit words of the "x" value */
+    uint8_t B[DRYSPONGE128_XSIZE];      /**< Bytes of the "x" value */
+
+} __attribute__((aligned(16))) drysponge128_x_t;
+
+/**
+ * \brief Structure of the "x" value for DrySPONGE256.
+ *
+ * The three members are views of the same DRYSPONGE256_XSIZE bytes.
+ */
+typedef union
+{
+    uint64_t S[DRYSPONGE256_XSIZE / 8]; /**< 64-bit words of the "x" value */
+    uint32_t W[DRYSPONGE256_XSIZE / 4]; /**< 32-bit words of the "x" value */
+    uint8_t B[DRYSPONGE256_XSIZE];      /**< Bytes of the "x" value */
+
+} drysponge256_x_t;
+
+/**
+ * \brief Structure of the rolling DrySPONGE128 state.
+ *
+ * The structure is declared with 16-byte alignment.  The alignment of
+ * the x member can be checked with drysponge128_safe_alignement().
+ *
+ * NOTE(review): the member order differs from drysponge256_state_t, and
+ * the G function may receive this structure as a raw uint64_t pointer
+ * when DRYGASCON_G_OPT is defined.  Presumably that routine depends on
+ * this layout; confirm before reordering members.
+ */
+typedef struct
+{
+	  gascon128_state_t c;        /**< GASCON-128 state for the capacity */
+    uint32_t domain;            /**< Domain value to mix on next F call */
+    uint32_t rounds;            /**< Number of rounds for next G call */
+    drysponge128_rate_t r;      /**< Buffer for a rate block of data */
+    drysponge128_x_t x;         /**< "x" value for the sponge */
+} __attribute__((aligned(16))) drysponge128_state_t;
+
+/**
+ * \brief Structure of the rolling DrySPONGE256 state.
+ *
+ * The domain member is set back to 0 each time a block is absorbed.
+ */
+typedef struct
+{
+    gascon256_state_t c;        /**< GASCON-256 state for the capacity */
+    drysponge256_rate_t r;      /**< Buffer for a rate block of data */
+    drysponge256_x_t x;         /**< "x" value for the sponge */
+    uint32_t domain;            /**< Domain value to mix on next F call */
+    uint32_t rounds;            /**< Number of rounds for next G call */
+
+} drysponge256_state_t;
+
+/**
+ * \brief Permutes the GASCON-128 state using one iteration of CoreRound.
+ *
+ * \param state The GASCON-128 state to be permuted in place.
+ * \param round The round number.
+ *
+ * The input and output \a state will be in little-endian byte order.
+ */
+void gascon128_core_round(gascon128_state_t *state, uint8_t round);
+
+/**
+ * \brief Permutes the GASCON-256 state using one iteration of CoreRound.
+ *
+ * \param state The GASCON-256 state to be permuted in place.
+ * \param round The round number.
+ *
+ * The input and output \a state will be in little-endian byte order.
+ */
+void gascon256_core_round(gascon256_state_t *state, uint8_t round);
+
+/**
+ * \brief Performs the DrySPONGE128 G function which runs the core
+ * rounds and squeezes data out of the GASCON-128 state.
+ *
+ * \param state The DrySPONGE128 state.  The number of rounds is taken
+ * from state->rounds.
+ *
+ * The data that is squeezed out will be in state->r on exit.
+ */
+void drysponge128_g(drysponge128_state_t *state);
+
+/**
+ * \brief Performs the DrySPONGE256 G function which runs the core
+ * rounds and squeezes data out of the GASCON-256 state.
+ *
+ * \param state The DrySPONGE256 state.  The number of rounds is taken
+ * from state->rounds.
+ *
+ * The data that is squeezed out will be in state->r on exit.
+ */
+void drysponge256_g(drysponge256_state_t *state);
+
+/**
+ * \brief Performs the DrySPONGE128 G function which runs the core
+ * rounds but does not squeeze out any output.
+ *
+ * \param state The DrySPONGE128 state.
+ *
+ * NOTE(review): the C definition is only compiled when DRYGASCON_G_OPT
+ * is not defined.  Presumably another definition is supplied otherwise;
+ * confirm.
+ */
+void drysponge128_g_core(drysponge128_state_t *state);
+
+/**
+ * \brief Performs the DrySPONGE256 G function which runs the core
+ * rounds but does not squeeze out any output.
+ *
+ * \param state The DrySPONGE256 state.
+ */
+void drysponge256_g_core(drysponge256_state_t *state);
+
+/**
+ * \brief Performs the absorption phase of the DrySPONGE256 F function.
+ *
+ * \param state The DrySPONGE256 state.
+ * \param input The block of input data to incorporate into the state.
+ * \param len The length of the input block, which must be less than
+ * or equal to DRYSPONGE256_RATE.  Smaller input blocks will be padded
+ * with a 0x01 byte followed by zero bytes.
+ *
+ * The current state->domain is mixed in with the block and is reset
+ * to 0 on return.
+ *
+ * This function must be followed by a call to drysponge256_g() or
+ * drysponge256_g_core() to perform the full F operation.
+ */
+void drysponge256_f_absorb
+    (drysponge256_state_t *state, const unsigned char *input, unsigned len);
+/**
+ * \brief Performs the DrySPONGE128 F function on one block of input.
+ *
+ * \param state The DrySPONGE128 state.  state->domain and state->rounds
+ * are used for this call, and state->domain is reset to 0 on return.
+ * \param input The block of input data to incorporate into the state.
+ * \param len The length of the input block.  A block shorter than
+ * DRYSPONGE128_RATE is padded with a 0x01 byte followed by zero bytes.
+ */
+void drygascon128_f_wrap(drysponge128_state_t *state, const unsigned char *input, unsigned len);
+
+/**
+ * \brief Determine if the state alignment is safe against timing attacks.
+ *
+ * \param state Points to the state to check.
+ *
+ * \return Non-zero if the alignment is safe, which is the case when the
+ * address of state->x is a multiple of 16.
+ *
+ * We expect this to be completely optimized out by the compiler if the
+ * alignment is enforced at build time.
+ */
+int drysponge128_safe_alignement(const drysponge128_state_t*state);
+
+/**
+ * \brief Set up a DrySPONGE128 state to begin encryption or decryption.
+ *
+ * \param state The DrySPONGE128 state.
+ * \param key Points to the key.  The number of bytes read depends on
+ * \a keysize.
+ * \param keysize Size of the key.  The sizes handled are
+ * DRYGASCON128_MINKEY_SIZE (x is derived from the key),
+ * DRYGASCON128_FASTKEY_SIZE (x is the 16 key bytes at offset 16) and
+ * DRYGASCON128_SAFEKEY_SIZE (c and x are both filled from the key).
+ * \param nonce Points to the 16 bytes of the nonce.
+ * \param final_block Non-zero if after key setup there will be no more blocks.
+ *
+ * When x is taken directly from the key, this function does not return
+ * if two of the 32-bit words of x are equal.
+ */
+void drysponge128_setup
+    (drysponge128_state_t *state, const unsigned char *key, unsigned int keysize,
+     const unsigned char *nonce, int final_block);
+
+/**
+ * \brief Set up a DrySPONGE256 state to begin encryption or decryption.
+ *
+ * \param state The DrySPONGE256 state.
+ * \param key Points to the 32 bytes of the key.
+ * \param nonce Points to the 16 bytes of the nonce.
+ * \param final_block Non-zero if after key setup there will be no more blocks.
+ *
+ * On return state->rounds is set to DRYSPONGE256_ROUNDS.
+ */
+void drysponge256_setup
+    (drysponge256_state_t *state, const unsigned char *key,
+     const unsigned char *nonce, int final_block);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
--- a/drygascon/Implementations/crypto_aead/drygascon128/add_arm_cortex-m/internal-util.h
+++ b/drygascon/Implementations/crypto_aead/drygascon128/add_arm_cortex-m/internal-util.h
+/*
+ * Copyright (C) 2020 Southern Storm Software, Pty Ltd.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ */
+
+#ifndef LW_INTERNAL_UTIL_H
+#define LW_INTERNAL_UTIL_H
+
+#include <stdint.h>
+
+/* Figure out how to inline functions using this C compiler.
+ * C99 or later gets "static inline", GCC and clang in older modes get
+ * "static __inline__", and anything else falls back to plain "static". */
+#if defined(__STDC__) && __STDC_VERSION__ >= 199901L
+#define STATIC_INLINE static inline
+#elif defined(__GNUC__) || defined(__clang__)
+#define STATIC_INLINE static __inline__
+#else
+#define STATIC_INLINE static
+#endif
+
+/* Try to figure out whether the CPU is little-endian or big-endian.
+ * May need to modify this to include new compiler-specific defines.
+ * Alternatively, define __LITTLE_ENDIAN__ or __BIG_ENDIAN__ in your
+ * compiler flags when you compile this library.
+ *
+ * LW_UTIL_LITTLE_ENDIAN is defined on little-endian targets and left
+ * undefined on big-endian ones.  The build stops with an error if
+ * neither case is recognized. */
+#if defined(__x86_64) || defined(__x86_64__) || \
+    defined(__i386) || defined(__i386__) || \
+    defined(__AVR__) || defined(__arm) || defined(__arm__) || \
+    defined(_M_AMD64) || defined(_M_X64) || defined(_M_IX86) || \
+    defined(_M_IA64) || defined(_M_ARM) || defined(_M_ARM_FP) || \
+    (defined(__BYTE_ORDER__) && __BYTE_ORDER__ == 1234) || \
+    defined(__LITTLE_ENDIAN__)
+#define LW_UTIL_LITTLE_ENDIAN 1
+#elif (defined(__BYTE_ORDER__) && __BYTE_ORDER__ == 4321) || \
+    defined(__BIG_ENDIAN__)
+/* Big endian: LW_UTIL_LITTLE_ENDIAN stays undefined */
+#else
+#error "Cannot determine the endianess of this platform"
+#endif
+
+/* Helper macros to load and store values while converting endian-ness */
+
+/* Load a big-endian 32-bit word from a byte buffer */
+#define be_load_word32(ptr) \
+    ((((uint32_t)((ptr)[0])) << 24) | \
+     (((uint32_t)((ptr)[1])) << 16) | \
+     (((uint32_t)((ptr)[2])) << 8) | \
+      ((uint32_t)((ptr)[3])))
+
+/* Store a big-endian 32-bit word into a byte buffer */
+#define be_store_word32(ptr, x) \
+    do { \
+        uint32_t _x = (x); \
+        (ptr)[0] = (uint8_t)(_x >> 24); \
+        (ptr)[1] = (uint8_t)(_x >> 16); \
+        (ptr)[2] = (uint8_t)(_x >> 8); \
+        (ptr)[3] = (uint8_t)_x; \
+    } while (0)
+
+/* Assemble a 32-bit word from four bytes stored least significant first:
+ * (ptr)[0] supplies bits 7..0 and (ptr)[3] supplies bits 31..24.
+ * The ptr argument is expanded once per byte read. */
+#define le_load_word32(ptr) \
+    ( ((uint32_t)((ptr)[0])) | \
+     (((uint32_t)((ptr)[1])) << 8) | \
+     (((uint32_t)((ptr)[2])) << 16) | \
+     (((uint32_t)((ptr)[3])) << 24))
+
+/* Write the 32-bit value x into (ptr)[0..3], least significant byte first.
+ * x is evaluated once; ptr is expanded once per byte written. */
+#define le_store_word32(ptr, x) \
+    do { \
+        uint32_t _x = (x); \
+        (ptr)[0] = (uint8_t)(_x & 0xFFu); \
+        (ptr)[1] = (uint8_t)((_x >> 8) & 0xFFu); \
+        (ptr)[2] = (uint8_t)((_x >> 16) & 0xFFu); \
+        (ptr)[3] = (uint8_t)((_x >> 24) & 0xFFu); \
+    } while (0)
+
+/* Assemble a 64-bit word from eight bytes stored most significant first:
+ * (ptr)[0] supplies bits 63..56 and (ptr)[7] supplies bits 7..0.
+ * The ptr argument is expanded once per byte read. */
+#define be_load_word64(ptr) \
+    ( ((uint64_t)((ptr)[7])) | \
+     (((uint64_t)((ptr)[6])) << 8) | \
+     (((uint64_t)((ptr)[5])) << 16) | \
+     (((uint64_t)((ptr)[4])) << 24) | \
+     (((uint64_t)((ptr)[3])) << 32) | \
+     (((uint64_t)((ptr)[2])) << 40) | \
+     (((uint64_t)((ptr)[1])) << 48) | \
+     (((uint64_t)((ptr)[0])) << 56))
+
+/* Write the 64-bit value x into (ptr)[0..7], most significant byte first.
+ * x is evaluated once; ptr is expanded once per byte written. */
+#define be_store_word64(ptr, x) \
+    do { \
+        uint64_t _x = (x); \
+        (ptr)[0] = (uint8_t)((_x >> 56) & 0xFFu); \
+        (ptr)[1] = (uint8_t)((_x >> 48) & 0xFFu); \
+        (ptr)[2] = (uint8_t)((_x >> 40) & 0xFFu); \
+        (ptr)[3] = (uint8_t)((_x >> 32) & 0xFFu); \
+        (ptr)[4] = (uint8_t)((_x >> 24) & 0xFFu); \
+        (ptr)[5] = (uint8_t)((_x >> 16) & 0xFFu); \
+        (ptr)[6] = (uint8_t)((_x >> 8) & 0xFFu); \
+        (ptr)[7] = (uint8_t)(_x & 0xFFu); \
+    } while (0)
+
+/* Assemble a 64-bit word from eight bytes stored least significant first:
+ * (ptr)[0] supplies bits 7..0 and (ptr)[7] supplies bits 63..56.
+ * The ptr argument is expanded once per byte read. */
+#define le_load_word64(ptr) \
+    ( ((uint64_t)((ptr)[0])) | \
+     (((uint64_t)((ptr)[1])) << 8) | \
+     (((uint64_t)((ptr)[2])) << 16) | \
+     (((uint64_t)((ptr)[3])) << 24) | \
+     (((uint64_t)((ptr)[4])) << 32) | \
+     (((uint64_t)((ptr)[5])) << 40) | \
+     (((uint64_t)((ptr)[6])) << 48) | \
+     (((uint64_t)((ptr)[7])) << 56))
+
+/* Write the 64-bit value x into (ptr)[0..7], least significant byte first.
+ * x is evaluated once; ptr is expanded once per byte written. */
+#define le_store_word64(ptr, x) \
+    do { \
+        uint64_t _x = (x); \
+        (ptr)[0] = (uint8_t)(_x & 0xFFu); \
+        (ptr)[1] = (uint8_t)((_x >> 8) & 0xFFu); \
+        (ptr)[2] = (uint8_t)((_x >> 16) & 0xFFu); \
+        (ptr)[3] = (uint8_t)((_x >> 24) & 0xFFu); \
+        (ptr)[4] = (uint8_t)((_x >> 32) & 0xFFu); \
+        (ptr)[5] = (uint8_t)((_x >> 40) & 0xFFu); \
+        (ptr)[6] = (uint8_t)((_x >> 48) & 0xFFu); \
+        (ptr)[7] = (uint8_t)((_x >> 56) & 0xFFu); \
+    } while (0)
+
+/* Assemble a 16-bit word from two bytes stored most significant first:
+ * (ptr)[0] is the high byte and (ptr)[1] is the low byte.
+ * The ptr argument is expanded once per byte read. */
+#define be_load_word16(ptr) \
+    ( ((uint16_t)((ptr)[1])) | \
+     (((uint16_t)((ptr)[0])) << 8))
+
+/* Write the 16-bit value x into (ptr)[0..1], high byte first.
+ * x is evaluated once; ptr is expanded once per byte written. */
+#define be_store_word16(ptr, x) \
+    do { \
+        uint16_t _x = (x); \
+        (ptr)[0] = (uint8_t)((_x >> 8) & 0xFFu); \
+        (ptr)[1] = (uint8_t)(_x & 0xFFu); \
+    } while (0)
+
+/* Assemble a 16-bit word from two bytes stored least significant first:
+ * (ptr)[0] is the low byte and (ptr)[1] is the high byte.
+ * The ptr argument is expanded once per byte read. */
+#define le_load_word16(ptr) \
+    ( ((uint16_t)((ptr)[0])) | \
+     (((uint16_t)((ptr)[1])) << 8))
+
+/* Write the 16-bit value x into (ptr)[0..1], low byte first.
+ * x is evaluated once; ptr is expanded once per byte written. */
+#define le_store_word16(ptr, x) \
+    do { \
+        uint16_t _x = (x); \
+        (ptr)[0] = (uint8_t)(_x & 0xFFu); \
+        (ptr)[1] = (uint8_t)((_x >> 8) & 0xFFu); \
+    } while (0)
+
+/* dest[i] ^= src[i] for each of the first len bytes.
+ * Every argument is evaluated exactly once, in the order dest, src, len;
+ * a len of zero leaves both buffers untouched. */
+#define lw_xor_block(dest, src, len) \
+    do { \
+        unsigned char *_dest = (dest); \
+        const unsigned char *_src = (src); \
+        unsigned _len = (len); \
+        for (; _len != 0; --_len) { \
+            *_dest ^= *_src; \
+            ++_dest; \
+            ++_src; \
+        } \
+    } while (0)
+
+/* dest[i] = src1[i] ^ src2[i] for each of the first len bytes.
+ * Every argument is evaluated exactly once; neither source is modified
+ * unless it is the same buffer as dest. */
+#define lw_xor_block_2_src(dest, src1, src2, len) \
+    do { \
+        unsigned char *_dest = (dest); \
+        const unsigned char *_src1 = (src1); \
+        const unsigned char *_src2 = (src2); \
+        unsigned _len = (len); \
+        for (; _len != 0; --_len) { \
+            *_dest = *_src1 ^ *_src2; \
+            ++_dest; \
+            ++_src1; \
+            ++_src2; \
+        } \
+    } while (0)
+
+/* For each of the first len bytes: dest[i] ^= src[i], and the updated
+ * dest[i] is also copied into dest2[i].  Every argument is evaluated
+ * exactly once. */
+#define lw_xor_block_2_dest(dest2, dest, src, len) \
+    do { \
+        unsigned char *_dest2 = (dest2); \
+        unsigned char *_dest = (dest); \
+        const unsigned char *_src = (src); \
+        unsigned _len = (len); \
+        for (; _len != 0; --_len) { \
+            *_dest ^= *_src; \
+            *_dest2 = *_dest; \
+            ++_dest2; \
+            ++_dest; \
+            ++_src; \
+        } \
+    } while (0)
+
+/* XOR two byte buffers into a destination while, at the same time,
+ * copying the contents of src2 to dest2.  For each of the first len
+ * bytes: dest2[i] = src2[i] and dest[i] = src1[i] ^ src2[i].  The src2
+ * byte is read once, before either store, so the result is still well
+ * defined when src2 shares storage with dest or dest2. */
+#define lw_xor_block_copy_src(dest2, dest, src1, src2, len) \
+    do { \
+        unsigned char *_dest2 = (dest2); \
+        unsigned char *_dest = (dest); \
+        const unsigned char *_src1 = (src1); \
+        const unsigned char *_src2 = (src2); \
+        unsigned _len = (len); \
+        for (; _len != 0; --_len) { \
+            unsigned char _temp = *_src2; \
+            *_dest2 = _temp; \
+            *_dest = *_src1 ^ _temp; \
+            ++_dest2; \
+            ++_dest; \
+            ++_src1; \
+            ++_src2; \
+        } \
+    } while (0)
+
+/* For each of the first len bytes: dest2[i] receives the old dest[i]
+ * XORed with src[i], and dest[i] is then overwritten with src[i] itself
+ * (the source value is "swapped" into dest instead of the XOR result).
+ * The src byte is read once, before either store. */
+#define lw_xor_block_swap(dest2, dest, src, len) \
+    do { \
+        unsigned char *_dest2 = (dest2); \
+        unsigned char *_dest = (dest); \
+        const unsigned char *_src = (src); \
+        unsigned _len = (len); \
+        for (; _len != 0; --_len) { \
+            unsigned char _temp = *_src; \
+            *_dest2 = *_dest ^ _temp; \
+            *_dest = _temp; \
+            ++_dest2; \
+            ++_dest; \
+            ++_src; \
+        } \
+    } while (0)
+
+/* On AVR the cheapest 32-bit rotations are those by a single bit or by
+ * a multiple of 8 bits, so every other rotation count of interest is
+ * assembled from those cheap ones.  LW_CRYPTO_ROTATE32_COMPOSED selects
+ * that strategy: it is 1 when compiling for AVR and 0 everywhere else. */
+
+#if !defined(__AVR__)
+#define LW_CRYPTO_ROTATE32_COMPOSED 0
+#else
+#define LW_CRYPTO_ROTATE32_COMPOSED 1
+#endif
+
+/* Rotation macros for 32-bit arguments */
+
+/* Generic left rotate of a 32-bit value.
+ * "a" is evaluated once and converted to uint32_t; "bits" is expanded
+ * twice, so it should be free of side effects.  Both shift counts are
+ * reduced modulo 32 so that a count of 0 (or 32) returns the value
+ * unchanged instead of performing an undefined 32-bit shift.  Uses the
+ * GNU statement-expression extension. */
+#define leftRotate(a, bits) \
+    (__extension__ ({ \
+        uint32_t _temp = (a); \
+        (_temp << ((bits) & 31)) | (_temp >> ((32 - (bits)) & 31)); \
+    }))
+
+/* Generic right rotate of a 32-bit value.
+ * "a" is evaluated once and converted to uint32_t; "bits" is expanded
+ * twice, so it should be free of side effects.  Both shift counts are
+ * reduced modulo 32 so that a count of 0 (or 32) returns the value
+ * unchanged instead of performing an undefined 32-bit shift.  Uses the
+ * GNU statement-expression extension. */
+#define rightRotate(a, bits) \
+    (__extension__ ({ \
+        uint32_t _temp = (a); \
+        (_temp >> ((bits) & 31)) | (_temp << ((32 - (bits)) & 31)); \
+    }))
+
+#if !LW_CRYPTO_ROTATE32_COMPOSED
+
+/* Left rotate by a specific number of bits.  These macros may be replaced
+ * with more efficient ones on platforms that lack a barrel shifter.
+ * In this variant every leftRotateN(a), for N from 1 to 31, simply
+ * forwards to the generic leftRotate() with the constant count N. */
+#define leftRotate1(a)  (leftRotate((a), 1))
+#define leftRotate2(a)  (leftRotate((a), 2))
+#define leftRotate3(a)  (leftRotate((a), 3))
+#define leftRotate4(a)  (leftRotate((a), 4))
+#define leftRotate5(a)  (leftRotate((a), 5))
+#define leftRotate6(a)  (leftRotate((a), 6))
+#define leftRotate7(a)  (leftRotate((a), 7))
+#define leftRotate8(a)  (leftRotate((a), 8))
+#define leftRotate9(a)  (leftRotate((a), 9))
+#define leftRotate10(a) (leftRotate((a), 10))
+#define leftRotate11(a) (leftRotate((a), 11))
+#define leftRotate12(a) (leftRotate((a), 12))
+#define leftRotate13(a) (leftRotate((a), 13))
+#define leftRotate14(a) (leftRotate((a), 14))
+#define leftRotate15(a) (leftRotate((a), 15))
+#define leftRotate16(a) (leftRotate((a), 16))
+#define leftRotate17(a) (leftRotate((a), 17))
+#define leftRotate18(a) (leftRotate((a), 18))
+#define leftRotate19(a) (leftRotate((a), 19))
+#define leftRotate20(a) (leftRotate((a), 20))
+#define leftRotate21(a) (leftRotate((a), 21))
+#define leftRotate22(a) (leftRotate((a), 22))
+#define leftRotate23(a) (leftRotate((a), 23))
+#define leftRotate24(a) (leftRotate((a), 24))
+#define leftRotate25(a) (leftRotate((a), 25))
+#define leftRotate26(a) (leftRotate((a), 26))
+#define leftRotate27(a) (leftRotate((a), 27))
+#define leftRotate28(a) (leftRotate((a), 28))
+#define leftRotate29(a) (leftRotate((a), 29))
+#define leftRotate30(a) (leftRotate((a), 30))
+#define leftRotate31(a) (leftRotate((a), 31))
+
+/* Right rotate by a specific number of bits.  These macros may be replaced
+ * with more efficient ones on platforms that lack a barrel shifter.
+ * In this variant every rightRotateN(a), for N from 1 to 31, simply
+ * forwards to the generic rightRotate() with the constant count N. */
+#define rightRotate1(a)  (rightRotate((a), 1))
+#define rightRotate2(a)  (rightRotate((a), 2))
+#define rightRotate3(a)  (rightRotate((a), 3))
+#define rightRotate4(a)  (rightRotate((a), 4))
+#define rightRotate5(a)  (rightRotate((a), 5))
+#define rightRotate6(a)  (rightRotate((a), 6))
+#define rightRotate7(a)  (rightRotate((a), 7))
+#define rightRotate8(a)  (rightRotate((a), 8))
+#define rightRotate9(a)  (rightRotate((a), 9))
+#define rightRotate10(a) (rightRotate((a), 10))
+#define rightRotate11(a) (rightRotate((a), 11))
+#define rightRotate12(a) (rightRotate((a), 12))
+#define rightRotate13(a) (rightRotate((a), 13))
+#define rightRotate14(a) (rightRotate((a), 14))
+#define rightRotate15(a) (rightRotate((a), 15))
+#define rightRotate16(a) (rightRotate((a), 16))
+#define rightRotate17(a) (rightRotate((a), 17))
+#define rightRotate18(a) (rightRotate((a), 18))
+#define rightRotate19(a) (rightRotate((a), 19))
+#define rightRotate20(a) (rightRotate((a), 20))
+#define rightRotate21(a) (rightRotate((a), 21))
+#define rightRotate22(a) (rightRotate((a), 22))
+#define rightRotate23(a) (rightRotate((a), 23))
+#define rightRotate24(a) (rightRotate((a), 24))
+#define rightRotate25(a) (rightRotate((a), 25))
+#define rightRotate26(a) (rightRotate((a), 26))
+#define rightRotate27(a) (rightRotate((a), 27))
+#define rightRotate28(a) (rightRotate((a), 28))
+#define rightRotate29(a) (rightRotate((a), 29))
+#define rightRotate30(a) (rightRotate((a), 30))
+#define rightRotate31(a) (rightRotate((a), 31))
+
+#else /* LW_CRYPTO_ROTATE32_COMPOSED */
+
+/* Composed rotation macros where 1 and 8 are fast, but others are slow.
+ * Each leftRotateN below is written as an optional generic rotation by
+ * 8, 16 or 24 bits followed by at most four single-bit generic rotations,
+ * left or right, so that the net rotation is N bits to the left. */
+
+/* Left rotate by 1 */
+#define leftRotate1(a)  (leftRotate((a), 1))
+
+/* Left rotate by 2 */
+#define leftRotate2(a)  (leftRotate(leftRotate((a), 1), 1))
+
+/* Left rotate by 3 */
+#define leftRotate3(a)  (leftRotate(leftRotate(leftRotate((a), 1), 1), 1))
+
+/* Left rotate by 4 */
+#define leftRotate4(a)  (leftRotate(leftRotate(leftRotate(leftRotate((a), 1), 1), 1), 1))
+
+/* Left rotate by 5: Rotate left by 8, then right by 3 */
+#define leftRotate5(a)  (rightRotate(rightRotate(rightRotate(leftRotate((a), 8), 1), 1), 1))
+
+/* Left rotate by 6: Rotate left by 8, then right by 2 */
+#define leftRotate6(a)  (rightRotate(rightRotate(leftRotate((a), 8), 1), 1))
+
+/* Left rotate by 7: Rotate left by 8, then right by 1 */
+#define leftRotate7(a)  (rightRotate(leftRotate((a), 8), 1))
+
+/* Left rotate by 8 */
+#define leftRotate8(a)  (leftRotate((a), 8))
+
+/* Left rotate by 9: Rotate left by 8, then left by 1 */
+#define leftRotate9(a)  (leftRotate(leftRotate((a), 8), 1))
+
+/* Left rotate by 10: Rotate left by 8, then left by 2 */
+#define leftRotate10(a) (leftRotate(leftRotate(leftRotate((a), 8), 1), 1))
+
+/* Left rotate by 11: Rotate left by 8, then left by 3 */
+#define leftRotate11(a) (leftRotate(leftRotate(leftRotate(leftRotate((a), 8), 1), 1), 1))
+
+/* Left rotate by 12: Rotate left by 16, then right by 4 */
+#define leftRotate12(a) (rightRotate(rightRotate(rightRotate(rightRotate(leftRotate((a), 16), 1), 1), 1), 1))
+
+/* Left rotate by 13: Rotate left by 16, then right by 3 */
+#define leftRotate13(a) (rightRotate(rightRotate(rightRotate(leftRotate((a), 16), 1), 1), 1))
+
+/* Left rotate by 14: Rotate left by 16, then right by 2 */
+#define leftRotate14(a) (rightRotate(rightRotate(leftRotate((a), 16), 1), 1))
+
+/* Left rotate by 15: Rotate left by 16, then right by 1 */
+#define leftRotate15(a) (rightRotate(leftRotate((a), 16), 1))
+
+/* Left rotate by 16 */
+#define leftRotate16(a) (leftRotate((a), 16))
+
+/* Left rotate by 17: Rotate left by 16, then left by 1 */
+#define leftRotate17(a) (leftRotate(leftRotate((a), 16), 1))
+
+/* Left rotate by 18: Rotate left by 16, then left by 2 */
+#define leftRotate18(a) (leftRotate(leftRotate(leftRotate((a), 16), 1), 1))
+
+/* Left rotate by 19: Rotate left by 16, then left by 3 */
+#define leftRotate19(a) (leftRotate(leftRotate(leftRotate(leftRotate((a), 16), 1), 1), 1))
+
+/* Left rotate by 20: Rotate left by 16, then left by 4 */
+#define leftRotate20(a) (leftRotate(leftRotate(leftRotate(leftRotate(leftRotate((a), 16), 1), 1), 1), 1))
+
+/* Left rotate by 21: Rotate left by 24, then right by 3 */
+#define leftRotate21(a) (rightRotate(rightRotate(rightRotate(leftRotate((a), 24), 1), 1), 1))
+
+/* Left rotate by 22: Rotate left by 24, then right by 2 */
+#define leftRotate22(a) (rightRotate(rightRotate(leftRotate((a), 24), 1), 1))
+
+/* Left rotate by 23: Rotate left by 24, then right by 1 */
+#define leftRotate23(a) (rightRotate(leftRotate((a), 24), 1))
+
+/* Left rotate by 24 */
+#define leftRotate24(a) (leftRotate((a), 24))
+
+/* Left rotate by 25: Rotate left by 24, then left by 1 */
+#define leftRotate25(a) (leftRotate(leftRotate((a), 24), 1))
+
+/* Left rotate by 26: Rotate left by 24, then left by 2 */
+#define leftRotate26(a) (leftRotate(leftRotate(leftRotate((a), 24), 1), 1))
+
+/* Left rotate by 27: Rotate left by 24, then left by 3 */
+#define leftRotate27(a) (leftRotate(leftRotate(leftRotate(leftRotate((a), 24), 1), 1), 1))
+
+/* Left rotate by 28: Rotate right by 4 */
+#define leftRotate28(a) (rightRotate(rightRotate(rightRotate(rightRotate((a), 1), 1), 1), 1))
+
+/* Left rotate by 29: Rotate right by 3 */
+#define leftRotate29(a) (rightRotate(rightRotate(rightRotate((a), 1), 1), 1))
+
+/* Left rotate by 30: Rotate right by 2 */
+#define leftRotate30(a) (rightRotate(rightRotate((a), 1), 1))
+
+/* Left rotate by 31: Rotate right by 1 */
+#define leftRotate31(a) (rightRotate((a), 1))
+
+/* Define the 32-bit right rotations in terms of left rotations:
+ * rotating right by N bits is the same as rotating left by 32 - N bits,
+ * so each rightRotateN(a) reuses the composed leftRotate(32 - N) macro. */
+#define rightRotate1(a)  (leftRotate31((a)))
+#define rightRotate2(a)  (leftRotate30((a)))
+#define rightRotate3(a)  (leftRotate29((a)))
+#define rightRotate4(a)  (leftRotate28((a)))
+#define rightRotate5(a)  (leftRotate27((a)))
+#define rightRotate6(a)  (leftRotate26((a)))
+#define rightRotate7(a)  (leftRotate25((a)))
+#define rightRotate8(a)  (leftRotate24((a)))
+#define rightRotate9(a)  (leftRotate23((a)))
+#define rightRotate10(a) (leftRotate22((a)))
+#define rightRotate11(a) (leftRotate21((a)))
+#define rightRotate12(a) (leftRotate20((a)))
+#define rightRotate13(a) (leftRotate19((a)))
+#define rightRotate14(a) (leftRotate18((a)))
+#define rightRotate15(a) (leftRotate17((a)))
+#define rightRotate16(a) (leftRotate16((a)))
+#define rightRotate17(a) (leftRotate15((a)))
+#define rightRotate18(a) (leftRotate14((a)))
+#define rightRotate19(a) (leftRotate13((a)))
+#define rightRotate20(a) (leftRotate12((a)))
+#define rightRotate21(a) (leftRotate11((a)))
+#define rightRotate22(a) (leftRotate10((a)))
+#define rightRotate23(a) (leftRotate9((a)))
+#define rightRotate24(a) (leftRotate8((a)))
+#define rightRotate25(a) (leftRotate7((a)))
+#define rightRotate26(a) (leftRotate6((a)))
+#define rightRotate27(a) (leftRotate5((a)))
+#define rightRotate28(a) (leftRotate4((a)))
+#define rightRotate29(a) (leftRotate3((a)))
+#define rightRotate30(a) (leftRotate2((a)))
+#define rightRotate31(a) (leftRotate1((a)))
+
+#endif /* LW_CRYPTO_ROTATE32_COMPOSED */
+
+/* Rotation macros for 64-bit arguments */
+
+/* Generic left rotate of a 64-bit value.
+ * "a" is evaluated once and converted to uint64_t; "bits" is expanded
+ * twice, so it should be free of side effects.  Both shift counts are
+ * reduced modulo 64 so that a count of 0 (or 64) returns the value
+ * unchanged instead of performing an undefined 64-bit shift.  Uses the
+ * GNU statement-expression extension. */
+#define leftRotate_64(a, bits) \
+    (__extension__ ({ \
+        uint64_t _temp = (a); \
+        (_temp << ((bits) & 63)) | (_temp >> ((64 - (bits)) & 63)); \
+    }))
+
+/* Generic right rotate of a 64-bit value.
+ * "a" is evaluated once and converted to uint64_t; "bits" is expanded
+ * twice, so it should be free of side effects.  Both shift counts are
+ * reduced modulo 64 so that a count of 0 (or 64) returns the value
+ * unchanged instead of performing an undefined 64-bit shift.  Uses the
+ * GNU statement-expression extension. */
+#define rightRotate_64(a, bits) \
+    (__extension__ ({ \
+        uint64_t _temp = (a); \
+        (_temp >> ((bits) & 63)) | (_temp << ((64 - (bits)) & 63)); \
+    }))
+
+/* Left rotate by a specific number of bits.  These macros may be replaced
+ * with more efficient ones on platforms that lack a barrel shifter.
+ * Every leftRotateN_64(a), for N from 1 to 63, forwards to the generic
+ * leftRotate_64() with the constant count N. */
+#define leftRotate1_64(a)  (leftRotate_64((a), 1))
+#define leftRotate2_64(a)  (leftRotate_64((a), 2))
+#define leftRotate3_64(a)  (leftRotate_64((a), 3))
+#define leftRotate4_64(a)  (leftRotate_64((a), 4))
+#define leftRotate5_64(a)  (leftRotate_64((a), 5))
+#define leftRotate6_64(a)  (leftRotate_64((a), 6))
+#define leftRotate7_64(a)  (leftRotate_64((a), 7))
+#define leftRotate8_64(a)  (leftRotate_64((a), 8))
+#define leftRotate9_64(a)  (leftRotate_64((a), 9))
+#define leftRotate10_64(a) (leftRotate_64((a), 10))
+#define leftRotate11_64(a) (leftRotate_64((a), 11))
+#define leftRotate12_64(a) (leftRotate_64((a), 12))
+#define leftRotate13_64(a) (leftRotate_64((a), 13))
+#define leftRotate14_64(a) (leftRotate_64((a), 14))
+#define leftRotate15_64(a) (leftRotate_64((a), 15))
+#define leftRotate16_64(a) (leftRotate_64((a), 16))
+#define leftRotate17_64(a) (leftRotate_64((a), 17))
+#define leftRotate18_64(a) (leftRotate_64((a), 18))
+#define leftRotate19_64(a) (leftRotate_64((a), 19))
+#define leftRotate20_64(a) (leftRotate_64((a), 20))
+#define leftRotate21_64(a) (leftRotate_64((a), 21))
+#define leftRotate22_64(a) (leftRotate_64((a), 22))
+#define leftRotate23_64(a) (leftRotate_64((a), 23))
+#define leftRotate24_64(a) (leftRotate_64((a), 24))
+#define leftRotate25_64(a) (leftRotate_64((a), 25))
+#define leftRotate26_64(a) (leftRotate_64((a), 26))
+#define leftRotate27_64(a) (leftRotate_64((a), 27))
+#define leftRotate28_64(a) (leftRotate_64((a), 28))
+#define leftRotate29_64(a) (leftRotate_64((a), 29))
+#define leftRotate30_64(a) (leftRotate_64((a), 30))
+#define leftRotate31_64(a) (leftRotate_64((a), 31))
+#define leftRotate32_64(a) (leftRotate_64((a), 32))
+#define leftRotate33_64(a) (leftRotate_64((a), 33))
+#define leftRotate34_64(a) (leftRotate_64((a), 34))
+#define leftRotate35_64(a) (leftRotate_64((a), 35))
+#define leftRotate36_64(a) (leftRotate_64((a), 36))
+#define leftRotate37_64(a) (leftRotate_64((a), 37))
+#define leftRotate38_64(a) (leftRotate_64((a), 38))
+#define leftRotate39_64(a) (leftRotate_64((a), 39))
+#define leftRotate40_64(a) (leftRotate_64((a), 40))
+#define leftRotate41_64(a) (leftRotate_64((a), 41))
+#define leftRotate42_64(a) (leftRotate_64((a), 42))
+#define leftRotate43_64(a) (leftRotate_64((a), 43))
+#define leftRotate44_64(a) (leftRotate_64((a), 44))
+#define leftRotate45_64(a) (leftRotate_64((a), 45))
+#define leftRotate46_64(a) (leftRotate_64((a), 46))
+#define leftRotate47_64(a) (leftRotate_64((a), 47))
+#define leftRotate48_64(a) (leftRotate_64((a), 48))
+#define leftRotate49_64(a) (leftRotate_64((a), 49))
+#define leftRotate50_64(a) (leftRotate_64((a), 50))
+#define leftRotate51_64(a) (leftRotate_64((a), 51))
+#define leftRotate52_64(a) (leftRotate_64((a), 52))
+#define leftRotate53_64(a) (leftRotate_64((a), 53))
+#define leftRotate54_64(a) (leftRotate_64((a), 54))
+#define leftRotate55_64(a) (leftRotate_64((a), 55))
+#define leftRotate56_64(a) (leftRotate_64((a), 56))
+#define leftRotate57_64(a) (leftRotate_64((a), 57))
+#define leftRotate58_64(a) (leftRotate_64((a), 58))
+#define leftRotate59_64(a) (leftRotate_64((a), 59))
+#define leftRotate60_64(a) (leftRotate_64((a), 60))
+#define leftRotate61_64(a) (leftRotate_64((a), 61))
+#define leftRotate62_64(a) (leftRotate_64((a), 62))
+#define leftRotate63_64(a) (leftRotate_64((a), 63))
+
+/* Right rotate by a specific number of bits.  These macros may be replaced
+ * with more efficient ones on platforms that lack a barrel shifter.
+ * Every rightRotateN_64(a), for N from 1 to 63, forwards to the generic
+ * rightRotate_64() with the constant count N. */
+#define rightRotate1_64(a)  (rightRotate_64((a), 1))
+#define rightRotate2_64(a)  (rightRotate_64((a), 2))
+#define rightRotate3_64(a)  (rightRotate_64((a), 3))
+#define rightRotate4_64(a)  (rightRotate_64((a), 4))
+#define rightRotate5_64(a)  (rightRotate_64((a), 5))
+#define rightRotate6_64(a)  (rightRotate_64((a), 6))
+#define rightRotate7_64(a)  (rightRotate_64((a), 7))
+#define rightRotate8_64(a)  (rightRotate_64((a), 8))
+#define rightRotate9_64(a)  (rightRotate_64((a), 9))
+#define rightRotate10_64(a) (rightRotate_64((a), 10))
+#define rightRotate11_64(a) (rightRotate_64((a), 11))
+#define rightRotate12_64(a) (rightRotate_64((a), 12))
+#define rightRotate13_64(a) (rightRotate_64((a), 13))
+#define rightRotate14_64(a) (rightRotate_64((a), 14))
+#define rightRotate15_64(a) (rightRotate_64((a), 15))
+#define rightRotate16_64(a) (rightRotate_64((a), 16))
+#define rightRotate17_64(a) (rightRotate_64((a), 17))
+#define rightRotate18_64(a) (rightRotate_64((a), 18))
+#define rightRotate19_64(a) (rightRotate_64((a), 19))
+#define rightRotate20_64(a) (rightRotate_64((a), 20))
+#define rightRotate21_64(a) (rightRotate_64((a), 21))
+#define rightRotate22_64(a) (rightRotate_64((a), 22))
+#define rightRotate23_64(a) (rightRotate_64((a), 23))
+#define rightRotate24_64(a) (rightRotate_64((a), 24))
+#define rightRotate25_64(a) (rightRotate_64((a), 25))
+#define rightRotate26_64(a) (rightRotate_64((a), 26))
+#define rightRotate27_64(a) (rightRotate_64((a), 27))
+#define rightRotate28_64(a) (rightRotate_64((a), 28))
+#define rightRotate29_64(a) (rightRotate_64((a), 29))
+#define rightRotate30_64(a) (rightRotate_64((a), 30))
+#define rightRotate31_64(a) (rightRotate_64((a), 31))
+#define rightRotate32_64(a) (rightRotate_64((a), 32))
+#define rightRotate33_64(a) (rightRotate_64((a), 33))
+#define rightRotate34_64(a) (rightRotate_64((a), 34))
+#define rightRotate35_64(a) (rightRotate_64((a), 35))
+#define rightRotate36_64(a) (rightRotate_64((a), 36))
+#define rightRotate37_64(a) (rightRotate_64((a), 37))
+#define rightRotate38_64(a) (rightRotate_64((a), 38))
+#define rightRotate39_64(a) (rightRotate_64((a), 39))
+#define rightRotate40_64(a) (rightRotate_64((a), 40))
+#define rightRotate41_64(a) (rightRotate_64((a), 41))
+#define rightRotate42_64(a) (rightRotate_64((a), 42))
+#define rightRotate43_64(a) (rightRotate_64((a), 43))
+#define rightRotate44_64(a) (rightRotate_64((a), 44))
+#define rightRotate45_64(a) (rightRotate_64((a), 45))
+#define rightRotate46_64(a) (rightRotate_64((a), 46))
+#define rightRotate47_64(a) (rightRotate_64((a), 47))
+#define rightRotate48_64(a) (rightRotate_64((a), 48))
+#define rightRotate49_64(a) (rightRotate_64((a), 49))
+#define rightRotate50_64(a) (rightRotate_64((a), 50))
+#define rightRotate51_64(a) (rightRotate_64((a), 51))
+#define rightRotate52_64(a) (rightRotate_64((a), 52))
+#define rightRotate53_64(a) (rightRotate_64((a), 53))
+#define rightRotate54_64(a) (rightRotate_64((a), 54))
+#define rightRotate55_64(a) (rightRotate_64((a), 55))
+#define rightRotate56_64(a) (rightRotate_64((a), 56))
+#define rightRotate57_64(a) (rightRotate_64((a), 57))
+#define rightRotate58_64(a) (rightRotate_64((a), 58))
+#define rightRotate59_64(a) (rightRotate_64((a), 59))
+#define rightRotate60_64(a) (rightRotate_64((a), 60))
+#define rightRotate61_64(a) (rightRotate_64((a), 61))
+#define rightRotate62_64(a) (rightRotate_64((a), 62))
+#define rightRotate63_64(a) (rightRotate_64((a), 63))
+
+/* Rotate a 16-bit value left by a number of bits (1 to 15).
+ * "a" is evaluated once and converted to uint16_t; "bits" is expanded
+ * twice.  The shifts are carried out on the promoted operand, so the
+ * combined value is cast back to uint16_t; otherwise the bits shifted
+ * out above bit 15 would remain in the result.  Uses the GNU
+ * statement-expression extension. */
+#define leftRotate_16(a, bits) \
+    (__extension__ ({ \
+        uint16_t _temp = (a); \
+        (uint16_t)((_temp << (bits)) | (_temp >> (16 - (bits)))); \
+    }))
+
+/* Rotate a 16-bit value right by a number of bits (1 to 15).
+ * "a" is evaluated once and converted to uint16_t; "bits" is expanded
+ * twice.  The shifts are carried out on the promoted operand, so the
+ * combined value is cast back to uint16_t; otherwise the bits shifted
+ * out above bit 15 would remain in the result.  Uses the GNU
+ * statement-expression extension. */
+#define rightRotate_16(a, bits) \
+    (__extension__ ({ \
+        uint16_t _temp = (a); \
+        (uint16_t)((_temp >> (bits)) | (_temp << (16 - (bits)))); \
+    }))
+
+/* Left rotate by a specific number of bits.  These macros may be replaced
+ * with more efficient ones on platforms that lack a barrel shifter.
+ * Every leftRotateN_16(a), for N from 1 to 15, forwards to the generic
+ * leftRotate_16() with the constant count N. */
+#define leftRotate1_16(a)  (leftRotate_16((a), 1))
+#define leftRotate2_16(a)  (leftRotate_16((a), 2))
+#define leftRotate3_16(a)  (leftRotate_16((a), 3))
+#define leftRotate4_16(a)  (leftRotate_16((a), 4))
+#define leftRotate5_16(a)  (leftRotate_16((a), 5))
+#define leftRotate6_16(a)  (leftRotate_16((a), 6))
+#define leftRotate7_16(a)  (leftRotate_16((a), 7))
+#define leftRotate8_16(a)  (leftRotate_16((a), 8))
+#define leftRotate9_16(a)  (leftRotate_16((a), 9))
+#define leftRotate10_16(a) (leftRotate_16((a), 10))
+#define leftRotate11_16(a) (leftRotate_16((a), 11))
+#define leftRotate12_16(a) (leftRotate_16((a), 12))
+#define leftRotate13_16(a) (leftRotate_16((a), 13))
+#define leftRotate14_16(a) (leftRotate_16((a), 14))
+#define leftRotate15_16(a) (leftRotate_16((a), 15))
+
+/* Right rotate by a specific number of bits.  These macros may be replaced
+ * with more efficient ones on platforms that lack a barrel shifter.
+ * Every rightRotateN_16(a), for N from 1 to 15, forwards to the generic
+ * rightRotate_16() with the constant count N. */
+#define rightRotate1_16(a)  (rightRotate_16((a), 1))
+#define rightRotate2_16(a)  (rightRotate_16((a), 2))
+#define rightRotate3_16(a)  (rightRotate_16((a), 3))
+#define rightRotate4_16(a)  (rightRotate_16((a), 4))
+#define rightRotate5_16(a)  (rightRotate_16((a), 5))
+#define rightRotate6_16(a)  (rightRotate_16((a), 6))
+#define rightRotate7_16(a)  (rightRotate_16((a), 7))
+#define rightRotate8_16(a)  (rightRotate_16((a), 8))
+#define rightRotate9_16(a)  (rightRotate_16((a), 9))
+#define rightRotate10_16(a) (rightRotate_16((a), 10))
+#define rightRotate11_16(a) (rightRotate_16((a), 11))
+#define rightRotate12_16(a) (rightRotate_16((a), 12))
+#define rightRotate13_16(a) (rightRotate_16((a), 13))
+#define rightRotate14_16(a) (rightRotate_16((a), 14))
+#define rightRotate15_16(a) (rightRotate_16((a), 15))
+
+/* Rotate an 8-bit value left by a number of bits (1 to 7).
+ * "a" is evaluated once and converted to uint8_t; "bits" is expanded
+ * twice.  The shifts are carried out on the promoted operand, so the
+ * combined value is cast back to uint8_t; otherwise the bits shifted
+ * out above bit 7 would remain in the result.  Uses the GNU
+ * statement-expression extension. */
+#define leftRotate_8(a, bits) \
+    (__extension__ ({ \
+        uint8_t _temp = (a); \
+        (uint8_t)((_temp << (bits)) | (_temp >> (8 - (bits)))); \
+    }))
+
+/* Rotate an 8-bit value right by a number of bits (1 to 7).
+ * "a" is evaluated once and converted to uint8_t; "bits" is expanded
+ * twice.  The shifts are carried out on the promoted operand, so the
+ * combined value is cast back to uint8_t; otherwise the bits shifted
+ * out above bit 7 would remain in the result.  Uses the GNU
+ * statement-expression extension. */
+#define rightRotate_8(a, bits) \
+    (__extension__ ({ \
+        uint8_t _temp = (a); \
+        (uint8_t)((_temp >> (bits)) | (_temp << (8 - (bits)))); \
+    }))
+
+/* Left rotate by a specific number of bits.  These macros may be replaced
+ * with more efficient ones on platforms that lack a barrel shifter.
+ * Every leftRotateN_8(a), for N from 1 to 7, forwards to the generic
+ * leftRotate_8() with the constant count N. */
+#define leftRotate1_8(a)  (leftRotate_8((a), 1))
+#define leftRotate2_8(a)  (leftRotate_8((a), 2))
+#define leftRotate3_8(a)  (leftRotate_8((a), 3))
+#define leftRotate4_8(a)  (leftRotate_8((a), 4))
+#define leftRotate5_8(a)  (leftRotate_8((a), 5))
+#define leftRotate6_8(a)  (leftRotate_8((a), 6))
+#define leftRotate7_8(a)  (leftRotate_8((a), 7))
+
+/* Right rotate by a specific number of bits.  These macros may be replaced
+ * with more efficient ones on platforms that lack a barrel shifter.
+ * Every rightRotateN_8(a), for N from 1 to 7, forwards to the generic
+ * rightRotate_8() with the constant count N. */
+#define rightRotate1_8(a)  (rightRotate_8((a), 1))
+#define rightRotate2_8(a)  (rightRotate_8((a), 2))
+#define rightRotate3_8(a)  (rightRotate_8((a), 3))
+#define rightRotate4_8(a)  (rightRotate_8((a), 4))
+#define rightRotate5_8(a)  (rightRotate_8((a), 5))
+#define rightRotate6_8(a)  (rightRotate_8((a), 6))
+#define rightRotate7_8(a)  (rightRotate_8((a), 7))
+
+#endif
--- a/drygascon/Implementations/crypto_aead/drygascon128/add_arm_cortex-m/nistlwc
+++ b/drygascon/Implementations/crypto_aead/drygascon128/add_arm_cortex-m/nistlwc
--- a/drygascon/Implementations/crypto_aead/drygascon128/designers
+++ b/drygascon/Implementations/crypto_aead/drygascon128/designers
+Sebastien Riou
--- a/drygascon/Implementations/crypto_aead/drygascon128/ref/implementors
+++ b/drygascon/Implementations/crypto_aead/drygascon128/ref/implementors
+Sebastien Riou
--- a/drygascon/Implementations/crypto_aead/drygascon128/ref/nistlwc
+++ b/drygascon/Implementations/crypto_aead/drygascon128/ref/nistlwc