Commit 4063295f by Martin Schlaeffer Committed by Sebastian Renner

new ascon

parent f8a89b2c
# Configure the build for the algorithm(s) in $ALG with only the getcycles test.
cmake . -DALG_LIST=$ALG -DTEST_LIST=getcycles
# Sweep every implementation in $IMPL over all combinations of the three 0/1
# switches M, P and U (reported in the result row as IM, IP and UL).
# NOTE(review): the matching `done` lines, and whatever hands M/P/U to the
# build, are not visible in this excerpt.
for I in $IMPL; do
for M in 0 1; do
for P in 0 1; do
for U in 0 1; do
# Delete existing benchmark binaries and static libraries before rebuilding.
rm getcycles_*
rm *.a
cmake --build .
# Run the benchmark and keep only the last line of its output.
CYCLES=$(./getcycles_* $FACTOR | tail -n 1)
# First column of the TOTALS row printed by `size -t` over all archives.
SIZE=$(size -t *.a | grep TOTALS | awk '{print $1}')
# Append one result row; `grep -v "^0"` drops rows that begin with 0.
echo -e " $SIZE \t$CYCLES IM=${M} IP=${P} UL=${U} | $I" | grep -v "^0" >> $RESULTFILE
# Sort the result file in place, numerically on the 7th '|'-separated field.
sort -n -k7 -t\| -o $RESULTFILE $RESULTFILE
# Prepend the Markdown table header. The $(cat ...) substitution is expanded
# before the `>` redirection truncates the file, so the rows are preserved.
echo -e " size \t| 1 | 8 | 16 | 32 | 64 | 1536 | long | config | implementation\n-------:|------:|------:|------:|------:|------:|------:|------:|:--------------:|:---------------\n$(cat $RESULTFILE)" > $RESULTFILE
# Benchmark every getcycles binary built for ${ALG}v12 and print one Markdown
# table per binary: five runs, one row per run (the last line of the binary's
# output), sorted numerically on the 7th '|'-separated field.
# Everything written to stderr inside the loop is discarded, as before.
for i in getcycles*"${ALG}"v12*; do
  # An unmatched glob stays a literal pattern; skip it instead of printing a
  # table header for a file that does not exist.
  [ -e "$i" ] || continue
  printf '%s:\n' "$i"
  printf '%s\n' "| 1 | 8 | 16 | 32 | 64 | 1536 | long |"
  printf '%s\n' "|------:|------:|------:|------:|------:|------:|------:|"
  for n in 1 2 3 4 5; do
    # Quote the path so names containing spaces or glob characters still run;
    # FACTOR is passed as a single argument, and only when set and non-empty.
    "./$i" ${FACTOR:+"$FACTOR"} | tail -n 1
  done | sort -n -k7 -t'|'
done 2>/dev/null
#include "api.h"
#include "ascon.h"
#include "crypto_aead.h"
#include "permutations.h"
#include "printstate.h"
/*
 * Byte-shuffle control vector for _mm512_maskz_shuffle_epi8 that reverses the
 * byte order inside each of the two lowest 64-bit words. _mm512_set_epi8 takes
 * its arguments from the highest byte down to byte 0, so the last row
 * describes word 0. Control bytes of -1 (top bit set) make the shuffle write
 * zero, so words 2..7 come out cleared.
 */
#define AVX512_SHUFFLE_U64BIG \
_mm512_set_epi8(-1, -1, -1, -1, -1, -1, -1, -1, /* word 7 */ \
-1, -1, -1, -1, -1, -1, -1, -1, /* word 6 */ \
-1, -1, -1, -1, -1, -1, -1, -1, /* word 5 */ \
-1, -1, -1, -1, -1, -1, -1, -1, /* word 4 */ \
-1, -1, -1, -1, -1, -1, -1, -1, /* word 3 */ \
-1, -1, -1, -1, -1, -1, -1, -1, /* word 2 */ \
8, 9, 10, 11, 12, 13, 14, 15, /* word 1 */ \
0, 1, 2, 3, 4, 5, 6, 7) /* word 0 */
/*
 * From here on `forceinline` expands to nothing, so the mode functions below
 * are emitted as ordinary functions.
 * NOTE(review): any preprocessor condition guarding this override is not
 * visible in this excerpt.
 */
#undef forceinline
#define forceinline
/*
 * Load the key bytes at `k` into the key words K0, K1, K2.
 *
 * The words are first zeroed by KINIT. For 20-byte keys the first 4 key bytes
 * are placed in K0 (KEYROT with a zero first argument shifts the loaded word
 * right by 32) and `k` is advanced by 4. K1 and K2 then take 8 bytes each.
 * NOTE(review): the closing braces are missing from this excerpt, so the exact
 * extent of the 20-byte branch cannot be confirmed here.
 */
forceinline void ascon_loadkey(word_t* K0, word_t* K1, word_t* K2,
const uint8_t* k) {
KINIT(K0, K1, K2);
if (CRYPTO_KEYBYTES == 20) {
*K0 = XOR(*K0, KEYROT(WORD_T(0), LOAD(k, 4)));
k += 4;
*K1 = XOR(*K1, LOAD(k, 8));
*K2 = XOR(*K2, LOAD(k + 8, 8));
/*
 * Initialization phase: mix IV, key and nonce into the state, run the 12-round
 * permutation, then XOR the key into the state again.
 *
 * s    - state to initialize
 * npub - 16-byte nonce (read as two 8-byte words)
 * k    - key bytes (CRYPTO_KEYBYTES long)
 *
 * NOTE(review): no zeroing of *s and no guard selecting ASCON_128_IV are
 * visible in this excerpt, while the callers pass an uninitialized state_t.
 * Confirm the full file clears the state and selects exactly one IV before
 * these XORs.
 */
forceinline void ascon_init(state_t* s, const uint8_t* npub, const uint8_t* k) {
/* load nonce */
word_t N0 = LOAD(npub, 8);
word_t N1 = LOAD(npub + 8, 8);
/* load key */
word_t K0, K1, K2;
ascon_loadkey(&K0, &K1, &K2, k);
/* initialize */
s->x0 = XOR(s->x0, ASCON_128_IV);
if (CRYPTO_KEYBYTES == 16 && ASCON_RATE == 16)
s->x0 = XOR(s->x0, ASCON_128A_IV);
if (CRYPTO_KEYBYTES == 20) s->x0 = XOR(s->x0, ASCON_80PQ_IV);
/* K0 is only used by the 20-byte key variant */
if (CRYPTO_KEYBYTES == 20) s->x0 = XOR(s->x0, K0);
s->x1 = XOR(s->x1, K1);
s->x2 = XOR(s->x2, K2);
s->x3 = XOR(s->x3, N0);
s->x4 = XOR(s->x4, N1);
P(s, 12);
/* XOR the key into the state once more after the permutation */
if (CRYPTO_KEYBYTES == 20) s->x2 = XOR(s->x2, K0);
s->x3 = XOR(s->x3, K1);
s->x4 = XOR(s->x4, K2);
printstate("initialization", s);
/*
 * Absorb the associated data into the state and apply domain separation.
 *
 * s     - state, updated in place
 * ad    - associated data bytes
 * adlen - number of associated data bytes
 *
 * Full ASCON_RATE-byte blocks are processed on a local copy of the state with
 * AVX-512 operations; the remaining bytes plus padding go through the scalar
 * LOAD/PAD helpers.
 * NOTE(review): closing braces and the per-block advance of `ad` are not
 * visible in this excerpt - confirm against the full file.
 */
forceinline void ascon_adata(state_t* s, const uint8_t* ad, uint64_t adlen) {
const __m512i u64big = AVX512_SHUFFLE_U64BIG;
/* byte mask selecting the low 8 or 16 bytes (one rate block) */
const int mask = (ASCON_RATE == 8) ? 0xff : 0xffff;
/* number of permutation rounds between blocks */
const int nr = (ASCON_RATE == 8) ? 6 : 8;
state_t r = *s, t;
if (adlen) {
/* full associated data blocks */
while (adlen >= ASCON_RATE) {
/* load one block, byte-swap each 64-bit word, XOR it into the state */
t.z = _mm512_maskz_loadu_epi8(mask, ad);
t.z = _mm512_maskz_shuffle_epi8(mask, t.z, u64big);
r.z = _mm512_xor_epi64(r.z, t.z);
P(&r, nr);
adlen -= ASCON_RATE;
*s = r;
/* final associated data block */
word_t* px = &s->x0;
if (ASCON_RATE == 16 && adlen >= 8) {
s->x0 = XOR(s->x0, LOAD(ad, 8));
px = &s->x1;
ad += 8;
adlen -= 8;
/* padding: 0x80 right after the last data byte of the word */
*px = XOR(*px, PAD(adlen));
if (adlen) *px = XOR(*px, LOAD(ad, adlen));
P(s, nr);
/* domain separation */
s->x4 = XOR(s->x4, WORD_T(1));
printstate("process associated data", s);
/*
 * Encrypt `mlen` plaintext bytes from `m` into `c`, updating the state.
 *
 * s    - state, updated in place
 * c    - ciphertext output (mlen bytes are produced here; the tag is written
 *        by the caller)
 * m    - plaintext input
 * mlen - number of plaintext bytes
 *
 * NOTE(review): closing braces and the per-block advance of `m` and `c` are
 * not visible in this excerpt - confirm against the full file.
 */
forceinline void ascon_encrypt(state_t* s, uint8_t* c, const uint8_t* m,
uint64_t mlen) {
const __m512i u64big = AVX512_SHUFFLE_U64BIG;
/* byte mask selecting the low 8 or 16 bytes (one rate block) */
const int mask = (ASCON_RATE == 8) ? 0xff : 0xffff;
const int nr = (ASCON_RATE == 8) ? 6 : 8;
state_t r = *s, t;
/* full plaintext blocks */
while (mlen >= ASCON_RATE) {
/* XOR the byte-swapped plaintext block into the state */
t.z = _mm512_maskz_loadu_epi8(mask, m);
t.z = _mm512_maskz_shuffle_epi8(mask, t.z, u64big);
r.z = _mm512_xor_epi64(r.z, t.z);
/* the rate part of the state, swapped back to byte order, is the ciphertext */
t.z = _mm512_maskz_shuffle_epi8(mask, r.z, u64big);
_mm512_mask_storeu_epi8(c, mask, t.z);
P(&r, nr);
mlen -= ASCON_RATE;
*s = r;
/* final plaintext block */
word_t* px = &s->x0;
if (ASCON_RATE == 16 && mlen >= 8) {
s->x0 = XOR(s->x0, LOAD(m, 8));
STORE(c, s->x0, 8);
px = &s->x1;
m += 8;
c += 8;
mlen -= 8;
/* padding: 0x80 right after the last plaintext byte of the word */
*px = XOR(*px, PAD(mlen));
if (mlen) {
*px = XOR(*px, LOAD(m, mlen));
STORE(c, *px, mlen);
printstate("process plaintext", s);
/*
 * Decrypt `clen` ciphertext bytes from `c` into `m`, updating the state.
 *
 * s    - state, updated in place
 * m    - plaintext output
 * c    - ciphertext input (without the tag)
 * clen - number of ciphertext bytes
 *
 * NOTE(review): closing braces and the per-block advance of `m` and `c` are
 * not visible in this excerpt - confirm against the full file.
 */
forceinline void ascon_decrypt(state_t* s, uint8_t* m, const uint8_t* c,
uint64_t clen) {
const __m512i u64big = AVX512_SHUFFLE_U64BIG;
/* byte mask selecting the low 8 or 16 bytes (one rate block) */
const int mask = (ASCON_RATE == 8) ? 0xff : 0xffff;
const int nr = (ASCON_RATE == 8) ? 6 : 8;
state_t r = *s, t, u;
/* full ciphertext blocks */
while (clen >= ASCON_RATE) {
/* t = byte-swapped ciphertext block */
t.z = _mm512_maskz_loadu_epi8(mask, c);
t.z = _mm512_maskz_shuffle_epi8(mask, t.z, u64big);
/* state ^ ciphertext = plaintext (in the rate bytes) */
r.z = _mm512_xor_epi64(r.z, t.z);
u.z = _mm512_maskz_shuffle_epi8(mask, r.z, u64big);
/* rate bytes of the state become the ciphertext; other bytes are kept */
r.z = _mm512_mask_blend_epi8(mask, r.z, t.z);
_mm512_mask_storeu_epi8(m, mask, u.z);
P(&r, nr);
clen -= ASCON_RATE;
*s = r;
/* final ciphertext block */
word_t* px = &s->x0;
if (ASCON_RATE == 16 && clen >= 8) {
word_t cx = LOAD(c, 8);
s->x0 = XOR(s->x0, cx);
STORE(m, s->x0, 8);
s->x0 = cx;
px = &s->x1;
m += 8;
c += 8;
clen -= 8;
/* padding: 0x80 right after the last ciphertext byte of the word */
*px = XOR(*px, PAD(clen));
if (clen) {
word_t cx = LOAD(c, clen);
*px = XOR(*px, cx);
STORE(m, *px, clen);
/* replace the first clen bytes of the state word with the ciphertext */
*px = CLEAR(*px, clen);
*px = XOR(*px, cx);
printstate("process ciphertext", s);
/*
 * Finalization phase: XOR the key into the state at a variant-dependent
 * position, run the 12-round permutation, then XOR K1/K2 into x3/x4, which the
 * callers use as the tag words.
 *
 * s - state, updated in place
 * k - key bytes
 *
 * NOTE(review): the closing braces of the three variant branches are missing
 * from this excerpt.
 */
forceinline void ascon_final(state_t* s, const uint8_t* k) {
/* load key */
word_t K0, K1, K2;
ascon_loadkey(&K0, &K1, &K2, k);
/* finalize */
/* 16-byte key, rate 8: key goes into x1, x2 */
if (CRYPTO_KEYBYTES == 16 && ASCON_RATE == 8) {
s->x1 = XOR(s->x1, K1);
s->x2 = XOR(s->x2, K2);
/* 16-byte key, rate 16: key goes into x2, x3 */
if (CRYPTO_KEYBYTES == 16 && ASCON_RATE == 16) {
s->x2 = XOR(s->x2, K1);
s->x3 = XOR(s->x3, K2);
/* 20-byte key: the key words are shifted by 32 bits across x1..x3 */
if (CRYPTO_KEYBYTES == 20) {
s->x1 = XOR(s->x1, KEYROT(K0, K1));
s->x2 = XOR(s->x2, KEYROT(K1, K2));
s->x3 = XOR(s->x3, KEYROT(K2, WORD_T(0)));
P(s, 12);
s->x3 = XOR(s->x3, K1);
s->x4 = XOR(s->x4, K2);
printstate("finalization", s);
/*
 * Authenticated encryption entry point.
 *
 * c     - output: mlen ciphertext bytes followed by the 16-byte tag
 * clen  - output: set to mlen + CRYPTO_ABYTES
 * m     - plaintext, mlen bytes
 * ad    - associated data, adlen bytes
 * nsec  - not referenced in the visible code
 * npub  - nonce
 * k     - key
 *
 * Returns 0.
 */
int crypto_aead_encrypt(unsigned char* c, unsigned long long* clen,
const unsigned char* m, unsigned long long mlen,
const unsigned char* ad, unsigned long long adlen,
const unsigned char* nsec, const unsigned char* npub,
const unsigned char* k) {
state_t s;
*clen = mlen + CRYPTO_ABYTES;
/* perform ascon computation */
ascon_init(&s, npub, k);
ascon_adata(&s, ad, adlen);
ascon_encrypt(&s, c, m, mlen);
ascon_final(&s, k);
/* set tag */
/* tag = x3 || x4, written byte by byte right after the ciphertext */
STOREBYTES(c + mlen, s.x3, 8);
STOREBYTES(c + mlen + 8, s.x4, 8);
return 0;
/*
 * Authenticated decryption entry point.
 *
 * m     - output: plaintext (clen - CRYPTO_ABYTES bytes)
 * mlen  - output: set to clen - CRYPTO_ABYTES
 * nsec  - not referenced in the visible code
 * c     - ciphertext followed by the tag, clen bytes in total
 * ad    - associated data, adlen bytes
 * npub  - nonce
 * k     - key
 *
 * Returns -1 if the input is shorter than a tag; otherwise the result of
 * NOTZERO over the XOR of computed and received tag.
 * The plaintext has already been written to `m` when the tag is compared; the
 * visible code does not clear it on a mismatch.
 */
int crypto_aead_decrypt(unsigned char* m, unsigned long long* mlen,
unsigned char* nsec, const unsigned char* c,
unsigned long long clen, const unsigned char* ad,
unsigned long long adlen, const unsigned char* npub,
const unsigned char* k) {
state_t s;
if (clen < CRYPTO_ABYTES) return -1;
/* from here on clen is the ciphertext length without the tag */
*mlen = clen = clen - CRYPTO_ABYTES;
/* perform ascon computation */
ascon_init(&s, npub, k);
ascon_adata(&s, ad, adlen);
ascon_decrypt(&s, m, c, clen);
ascon_final(&s, k);
/* verify tag (should be constant time, check compiler output) */
/* x3/x4 become zero exactly when the received tag equals the computed one */
s.x3 = XOR(s.x3, LOADBYTES(c + clen, 8));
s.x4 = XOR(s.x4, LOADBYTES(c + clen + 8, 8));
return NOTZERO(s.x3, s.x4);
/* Implementation version string. */
#define CRYPTO_VERSION "1.2.4"
/* Authentication tag length in bytes. */
#define CRYPTO_ABYTES 16
/* Rate: number of data bytes processed per block in the data phases. */
#define ASCON_RATE 16
#ifndef ASCON_H_
#define ASCON_H_
#include <immintrin.h>
#include <stdint.h>
#include "word.h"
/*
 * Ascon state: one 512-bit AVX-512 register `z` overlaid with eight 64-bit
 * words. The mode code in this project only reads and writes x0..x4.
 * NOTE(review): the line closing the inner struct is missing from this
 * excerpt.
 */
typedef union {
__m512i z;
struct {
word_t x0, x1, x2, x3, x4, x5, x6, x7;
} state_t;
/* Phases of the AEAD mode; all of them update the state in place. */
void ascon_init(state_t* s, const uint8_t* npub, const uint8_t* k);
void ascon_adata(state_t* s, const uint8_t* ad, uint64_t adlen);
void ascon_encrypt(state_t* s, uint8_t* c, const uint8_t* m, uint64_t mlen);
void ascon_decrypt(state_t* s, uint8_t* m, const uint8_t* c, uint64_t clen);
void ascon_final(state_t* s, const uint8_t* k);
#endif /* ASCON_H_ */
/*
 * Build configuration switches.
 * NOTE(review): only the describing comments are present in this excerpt; the
 * #define lines they belong to are not visible.
 */
#ifndef CONFIG_H_
#define CONFIG_H_
/* inline the ascon mode */
/* inline all permutations */
/* unroll permutation loops */
#endif /* CONFIG_H_ */
/*
 * Byte-order helpers: UxxBIG(x) converts between host byte order and
 * big-endian. On big-endian hosts they are the identity; on little-endian
 * hosts they reverse the bytes of the value.
 * NOTE(review): the #else before the #error and the #endif closing the #if
 * chain are missing from this excerpt.
 */
#ifndef ENDIAN_H_
#define ENDIAN_H_
#if defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
/* macros for big endian machines */
#pragma message("Using macros for big endian machines")
#define U64BIG(x) (x)
#define U32BIG(x) (x)
#define U16BIG(x) (x)
#elif defined(_MSC_VER) || \
(defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__)
/* macros for little endian machines */
#pragma message("Using macros for little endian machines")
/* swap all 8 bytes; the argument is evaluated several times */
#define U64BIG(x) \
(((0x00000000000000FFULL & (x)) << 56) | \
((0x000000000000FF00ULL & (x)) << 40) | \
((0x0000000000FF0000ULL & (x)) << 24) | \
((0x00000000FF000000ULL & (x)) << 8) | \
((0x000000FF00000000ULL & (x)) >> 8) | \
((0x0000FF0000000000ULL & (x)) >> 24) | \
((0x00FF000000000000ULL & (x)) >> 40) | \
((0xFF00000000000000ULL & (x)) >> 56))
/* swap all 4 bytes */
#define U32BIG(x) \
(((0x000000FF & (x)) << 24) | ((0x0000FF00 & (x)) << 8) | \
((0x00FF0000 & (x)) >> 8) | ((0xFF000000 & (x)) >> 24))
/* swap both bytes */
#define U16BIG(x) (((0x00FF & (x)) << 8) | ((0xFF00 & (x)) >> 8))
/* neither byte order could be detected */
#error "Ascon byte order macros not defined in endian.h"
#endif /* ENDIAN_H_ */
/*
 * define forceinline macro
 *
 * Picks the strongest "always inline" spelling the compiler offers and falls
 * back to plain `inline`.
 * NOTE(review): the #else/#endif lines of this chain and the opening include
 * guard are missing from this excerpt.
 * NOTE(review): Clang's predefined macro is spelled `__clang__`; `__CLANG__`
 * is probably never defined. Clang also defines __GNUC__, so it would already
 * be handled by the preceding branch - confirm before changing.
 */
#ifdef _MSC_VER
#define forceinline __forceinline
#elif defined(__GNUC__)
#define forceinline inline __attribute__((__always_inline__))
#elif defined(__CLANG__)
#if __has_attribute(__always_inline__)
#define forceinline inline __attribute__((__always_inline__))
#define forceinline inline
#define forceinline inline
#endif /* FORCEINLINE_H_ */
Branches reviewed 2020-11-13 by Martin Schläffer.
Addresses reviewed 2020-11-13 by Martin Schläffer.
#include "permutations.h"
/*
 * Out-of-line permutation entry points; each one wraps the corresponding
 * *ROUNDS helper.
 * NOTE(review): any preprocessor conditions selecting between these
 * definitions are not visible in this excerpt.
 */
/* 12-round permutation */
void P12(state_t* s) { P12ROUNDS(s); }
/* 8-round permutation */
void P8(state_t* s) { P8ROUNDS(s); }
/* 6-round permutation */
void P6(state_t* s) { P6ROUNDS(s); }
/* permutation with a run-time round count */
void P(state_t* s, int nr) { PROUNDS(s, nr); }
#include <stdint.h>
#include "api.h"
#include "ascon.h"
#include "config.h"
#include "printstate.h"
#include "round.h"
/* key sizes in bytes */
#define ASCON_128_KEYBYTES 16
#define ASCON_128A_KEYBYTES 16
#define ASCON_80PQ_KEYBYTES 20
/* rates (block sizes) in bytes */
#define ASCON_128_RATE 8
#define ASCON_128A_RATE 16
/* round counts */
#define ASCON_128_PA_ROUNDS 12
#define ASCON_128_PB_ROUNDS 6
#define ASCON_128A_PB_ROUNDS 8
/* initialization vectors XORed into x0 */
#define ASCON_128_IV WORD_T(0x80400c0600000000)
#define ASCON_128A_IV WORD_T(0x80800c0800000000)
#define ASCON_80PQ_IV WORD_T(0xa0400c0600000000)
#define ASCON_HASH_IV WORD_T(0x00400c0000000100)
#define ASCON_XOF_IV WORD_T(0x00400c0000000000)
/* presumably the precomputed initial hash state words x0..x4 - TODO confirm */
#define ASCON_HASH_IV0 WORD_T(0xee9398aadb67f03dull)
#define ASCON_HASH_IV1 WORD_T(0x8bb21831c60f1002ull)
#define ASCON_HASH_IV2 WORD_T(0xb48a92db98d5da62ull)
#define ASCON_HASH_IV3 WORD_T(0x43189921b8f8e3e8ull)
#define ASCON_HASH_IV4 WORD_T(0x348fa5c9d525e140ull)
/* presumably the precomputed initial XOF state words x0..x4 - TODO confirm */
#define ASCON_XOF_IV0 WORD_T(0xb57e273b814cd416ull)
#define ASCON_XOF_IV1 WORD_T(0x2b51042562ae2420ull)
#define ASCON_XOF_IV2 WORD_T(0x66a3a7768ddf2218ull)
#define ASCON_XOF_IV3 WORD_T(0x5aad0a7a8153650cull)
#define ASCON_XOF_IV4 WORD_T(0x4f3e0e32539493b6ull)
/* first round constant of an n-round permutation: 0xf0 (12), 0xb4 (8), 0x96 (6) */
#define START(n) ((3 + (n)) << 4 | (12 - (n)))
/* wrap a round constant as a word */
#define RC(c) WORD_T(c)
/*
 * Unrolled permutations. Consecutive round constants differ by 0x0f and every
 * variant ends with 0x4b, so the shorter variants use the tail of the 12-round
 * constant sequence.
 * NOTE(review): the closing braces of these functions are missing from this
 * excerpt.
 */
/* 12 rounds: constants 0xf0 .. 0x4b */
forceinline void P12ROUNDS(state_t* s) {
ROUND(s, RC(0xf0));
ROUND(s, RC(0xe1));
ROUND(s, RC(0xd2));
ROUND(s, RC(0xc3));
ROUND(s, RC(0xb4));
ROUND(s, RC(0xa5));
ROUND(s, RC(0x96));
ROUND(s, RC(0x87));
ROUND(s, RC(0x78));
ROUND(s, RC(0x69));
ROUND(s, RC(0x5a));
ROUND(s, RC(0x4b));
/* 8 rounds: constants 0xb4 .. 0x4b */
forceinline void P8ROUNDS(state_t* s) {
ROUND(s, RC(0xb4));
ROUND(s, RC(0xa5));
ROUND(s, RC(0x96));
ROUND(s, RC(0x87));
ROUND(s, RC(0x78));
ROUND(s, RC(0x69));
ROUND(s, RC(0x5a));
ROUND(s, RC(0x4b));
/* 6 rounds: constants 0x96 .. 0x4b */
forceinline void P6ROUNDS(state_t* s) {
ROUND(s, RC(0x96));
ROUND(s, RC(0x87));
ROUND(s, RC(0x78));
ROUND(s, RC(0x69));
ROUND(s, RC(0x5a));
ROUND(s, RC(0x4b));
/* nr rounds as a loop: start at START(nr), step down by 0x0f, stop after 0x4b */
forceinline void PROUNDS(state_t* s, int nr) {
for (int i = START(nr); i > 0x4a; i -= 0x0f) ROUND(s, RC(i));
/*
 * Alternative definitions of P(s, nr), the permutation dispatcher.
 * NOTE(review): the preprocessor conditions that select exactly one of these
 * variants, and the closing braces, are missing from this excerpt; presumably
 * the choice depends on the inline/unroll settings in config.h - confirm.
 */
/* variant: dispatch to the unrolled helpers defined in this header */
forceinline void P(state_t* s, int nr) {
if (nr == 12) P12ROUNDS(s);
if (nr == 8) P8ROUNDS(s);
if (nr == 6) P6ROUNDS(s);
/* variant: dispatch to the out-of-line P12/P8/P6 functions */
void P12(state_t* s);
void P8(state_t* s);
void P6(state_t* s);
forceinline void P(state_t* s, int nr) {
if (nr == 12) P12(s);
if (nr == 8) P8(s);
if (nr == 6) P6(s);
/* variant: inline loop over the rounds */
forceinline void P(state_t* s, int nr) { PROUNDS(s, nr); }
/* variant: P itself is an out-of-line function */
void P(state_t* s, int nr);
#endif /* PERMUTATIONS_H_ */
#include "printstate.h"
#include <inttypes.h>
#include <stdio.h>
/*
 * Print `text`, '=', and the word as 16 zero-padded hex digits, followed by a
 * newline, to stdout.
 * NOTE(review): the closing braces of both functions are missing from this
 * excerpt.
 */
void printword(const char* text, const word_t x) {
printf("%s=%016" PRIx64 "\n", text, WORDTOU64(x));
/*
 * Print a heading line `text:` followed by the state words x0..x4, one per
 * line.
 */
void printstate(const char* text, const state_t* s) {
printf("%s:\n", text);
printword(" x0", s->x0);
printword(" x1", s->x1);
printword(" x2", s->x2);
printword(" x3", s->x3);
printword(" x4", s->x4);
#include "ascon.h"
#include "word.h"
/*
 * Debug printing interface: either real functions or no-op macros.
 * NOTE(review): the preprocessor condition choosing between the two forms and
 * the opening include guard are missing from this excerpt.
 */
/* real implementations */
void printword(const char* text, const word_t x);
void printstate(const char* text, const state_t* s);
/* no-op replacements: the arguments are not evaluated */
#define printword(text, w) \
do { \
} while (0)
#define printstate(text, s) \
do { \
} while (0)
#endif /* PRINTSTATE_H_ */
#ifndef ROUND_H_
#define ROUND_H_
#include "ascon.h"
#include "printstate.h"
/*
 * Set the three key words to zero.
 * NOTE(review): closing braces are missing from this excerpt.
 */
forceinline void KINIT(word_t* K0, word_t* K1, word_t* K2) {
*K0 = WORD_T(0);
*K1 = WORD_T(0);
*K2 = WORD_T(0);
/* Set the state words x0..x4 to zero (x5..x7 are not touched here). */
forceinline void PINIT(state_t* s) {
s->x0 = WORD_T(0);
s->x1 = WORD_T(0);
s->x2 = WORD_T(0);
s->x3 = WORD_T(0);
s->x4 = WORD_T(0);
/*
 * One permutation round on the whole state held in a single 512-bit register.
 *
 * s - state, updated in place (64-bit lanes 0..4 hold x0..x4)
 * C - round constant, XORed into x2
 *
 * _mm512_set_epi64 lists lanes from 7 down to 0, so the last argument of each
 * table below belongs to lane 0. Ternary-logic immediate 0x96 is a three-way
 * XOR; 0xd2 computes a ^ (~b & c).
 * NOTE(review): the closing brace is missing from this excerpt.
 */
forceinline void ROUND(state_t* s, word_t C) {
uint64_t x = 0;
/* lanes 0, 2 and 4 receive an extra XOR term before the s-box */
__mmask8 mxor1 = 0x15;
/* lanes 0, 1 and 3 receive an extra XOR term after the s-box */
__mmask8 mxor2 = 0x0b;
/* source lanes: x0 ^= x4, x2 ^= x1, x4 ^= x3 */
__m512i pxor1 = _mm512_set_epi64(x, x, x, 3, x, 1, x, 4);
/* source lanes: x0 ^= x4, x1 ^= x0, x3 ^= x2 */
__m512i pxor2 = _mm512_set_epi64(x, x, x, x, 2, x, 0, 4);
/* round constant in lane 2 */
__m512i c = _mm512_set_epi64(x, x, x, 0, 0, C, 0, 0);
/* all-ones in lane 2: inverts x2 */
__m512i n = _mm512_set_epi64(x, x, x, 0, 0, ~0ull, 0, 0);
/* lane i <- lane (i + 1) mod 5 */
__m512i pchi1 = _mm512_set_epi64(x, x, x, 0, 4, 3, 2, 1);
/* lane i <- lane (i + 2) mod 5 */
__m512i pchi2 = _mm512_set_epi64(x, x, x, 1, 0, 4, 3, 2);
/* per-lane right-rotation amounts of the linear layer */
__m512i rot1 = _mm512_set_epi64(x, x, x, 7, 10, 1, 61, 19);
__m512i rot2 = _mm512_set_epi64(x, x, x, 41, 17, 6, 39, 28);
__m512i t0, t1, t2;
/* round constant + s-box layer */
t0 = _mm512_maskz_permutexvar_epi64(mxor1, pxor1, s->z);
t0 = _mm512_ternarylogic_epi64(s->z, t0, c, 0x96);
/* keccak s-box start */
t1 = _mm512_permutexvar_epi64(pchi1, t0);
t2 = _mm512_permutexvar_epi64(pchi2, t0);
/* x_i ^= ~x_(i+1) & x_(i+2) */
t0 = _mm512_ternarylogic_epi64(t0, t1, t2, 0xd2);
/* keccak s-box end */
t1 = _mm512_maskz_permutexvar_epi64(mxor2, pxor2, t0);
t0 = _mm512_ternarylogic_epi64(t0, t1, n, 0x96);
/* linear layer */
/* x_i ^= ror(x_i, rot1_i) ^ ror(x_i, rot2_i) */
t1 = _mm512_rorv_epi64(t0, rot1);
t2 = _mm512_rorv_epi64(t0, rot2);
s->z = _mm512_ternarylogic_epi64(t0, t1, t2, 0x96);
printstate(" round output", s);
#endif /* ROUND_H_ */
#ifndef WORD_H_
#define WORD_H_
#include <stdint.h>
#include "endian.h"
#include "forceinline.h"
/* A state/key word is a plain 64-bit unsigned integer in this implementation. */
typedef uint64_t word_t;
/*
 * Conversion macros between word_t and uint64_t. They expand to nothing, so
 * e.g. WORD_T(x) is just (x).
 */
#define WORD_T
#define UINT64_T
#define U64TOWORD
#define WORDTOU64
/*
 * Bit-level helpers on word_t.
 * NOTE(review): the closing braces of the multi-line functions are missing
 * from this excerpt.
 */
/* Rotate right by n bits; n == 0 would shift by 64, which is undefined in C. */
forceinline word_t ROR(word_t x, int n) { return x >> n | x << (64 - n); }
forceinline word_t NOT(word_t a) { return ~a; }
forceinline word_t XOR(word_t a, word_t b) { return a ^ b; }
forceinline word_t AND(word_t a, word_t b) { return a & b; }
/* Low 32 bits of lo2hi become the high half, high 32 bits of hi2lo the low half. */
forceinline word_t KEYROT(word_t lo2hi, word_t hi2lo) {
return lo2hi << 32 | hi2lo >> 32;
/*
 * Returns 0 when both words are zero and -1 otherwise, without branching:
 * all bits are OR-folded into the low byte, which is then mapped to 0 / -1.
 */
forceinline int NOTZERO(word_t a, word_t b) {
uint64_t result = a | b;
result |= result >> 32;
result |= result >> 16;
result |= result >> 8;
return ((((int)(result & 0xff) - 1) >> 8) & 1) - 1;
/* Padding word: byte 0x80 at byte position i, counted from the most significant byte. */
forceinline word_t PAD(int i) { return 0x80ull << (56 - 8 * i); }
/* Clear the n most significant bytes of w. */
forceinline word_t CLEAR(word_t w, int n) {
/* undefined for n == 0 */
uint64_t mask = 0x00ffffffffffffffull >> (n * 8 - 8);
return w & mask;
/* Mask with the n least significant bytes set. */
forceinline uint64_t MASK(int n) {
/* undefined for n == 0 */
return ~0ull >> (64 - 8 * n);
/*
 * Byte load/store helpers.
 * NOTE(review): the closing braces of these functions are missing from this
 * excerpt.
 */
/*
 * Load n bytes (1..8) as a word with the first byte most significant.
 * NOTE(review): the access always dereferences a full uint64_t at `bytes`,
 * so up to 8 - n bytes beyond the requested range are read, through a cast
 * pointer with no alignment guarantee - confirm callers tolerate this.
 */
forceinline word_t LOAD(const uint8_t* bytes, int n) {
uint64_t x = *(uint64_t*)bytes & MASK(n);
return U64BIG(x);
/*
 * Store a word to memory, first byte most significant.
 * The first n bytes in memory are cleared, then the byte-swapped word is ORed
 * into the full 8 bytes.
 * NOTE(review): as written, bytes of w beyond the first n are also ORed into
 * memory, and 8 bytes are always read and written - confirm against the full
 * source whether a MASK(n) on the second statement was intended.
 */
forceinline void STORE(uint8_t* bytes, word_t w, int n) {
*(uint64_t*)bytes &= ~MASK(n);
*(uint64_t*)bytes |= U64BIG(w);
/*
 * Byte-wise load of n bytes: bytes[i] goes to in-memory byte 7 - i of the
 * word, i.e. first byte most significant on a little-endian host.
 */
forceinline word_t LOADBYTES(const uint8_t* bytes, int n) {
uint64_t x = 0;
for (int i = 0; i < n; ++i) ((uint8_t*)&x)[7 - i] = bytes[i];
return x;
/* Byte-wise store of n bytes; the inverse of LOADBYTES. Touches exactly n bytes. */
forceinline void STOREBYTES(uint8_t* bytes, word_t w, int n) {
for (int i = 0; i < n; ++i) bytes[i] = ((uint8_t*)&w)[7 - i];
#endif /* WORD_H_ */
#include "api.h"
#include "ascon.h"
#include "permutations.h"
#include "printstate.h"
/*
 * Generic data-phase update for a 16-byte rate: XOR `len` input bytes into
 * x0/x1 and, depending on `mode`, write the result to `out` (ASCON_SQUEEZE)
 * and/or overwrite the state with the input (ASCON_INSERT).
 *
 * s    - state, updated in place
 * out  - output buffer, written only when ASCON_SQUEEZE is set
 * in   - input bytes
 * len  - number of input bytes
 * mode - bit flags ASCON_SQUEEZE / ASCON_INSERT (defined outside this excerpt)
 *
 * NOTE(review): closing braces, `else` lines and the advance of `in` are
 * missing from this excerpt, so the branch structure of the final block cannot
 * be read reliably here - confirm against the full file.
 */
void ascon_update(state_t* s, uint8_t* out, const uint8_t* in, uint64_t len,
uint8_t mode) {
const int nr = (ASCON_RATE == 8) ? 6 : 8;
word_t tmp0, tmp1;
/* full blocks */
while (len >= ASCON_RATE) {
tmp0 = LOAD(in, 8);
tmp1 = LOAD(in + 8, 8);
s->x0 = XOR(s->x0, tmp0);
s->x1 = XOR(s->x1, tmp1);
if (mode & ASCON_SQUEEZE) {
STORE(out, s->x0, 8);
STORE(out + 8, s->x1, 8);
if (mode & ASCON_INSERT) {
s->x0 = tmp0;
s->x1 = tmp1;
P(s, nr);
out += ASCON_RATE;
len -= ASCON_RATE;
/* final block */
if (len) {
tmp1 = WORD_T(0);
if (len >= 8) tmp0 = LOAD(in, 8);
if (len > 8)
tmp1 = LOAD(in + 8, len - 8);
tmp0 = LOAD(in, len);
s->x0 = XOR(s->x0, tmp0);
s->x1 = XOR(s->x1, tmp1);
if (mode & ASCON_SQUEEZE) {
if (len >= 8) STORE(out, s->x0, 8);
if (len > 8)
STORE(out + 8, s->x1, len - 8);
STORE(out, s->x0, len);
if (mode & ASCON_INSERT) {
if (len >= 8) s->x0 = tmp0;
if (len > 8) {
s->x1 = CLEAR(s->x1, len - 8);
s->x1 = XOR(s->x1, tmp1);
} else {
s->x0 = CLEAR(s->x0, len);
s->x0 = XOR(s->x0, tmp0);
/* padding goes into x0 for short blocks, otherwise into x1 */
if (len < 8)
s->x0 = XOR(s->x0, PAD(len % 8));
s->x1 = XOR(s->x1, PAD(len % 8));
#include "api.h"
#include "ascon.h"
#include "crypto_aead.h"
#include "permutations.h"
#include "printstate.h"
/*
 * AD(): ARM NEON inline-assembly loop that absorbs full 16-byte blocks of
 * associated data. Expects `s`, `ad`, `adlen` and `C` to be in scope.
 *
 * The 64-bit length is split into two 32-bit halves for the 32-bit compare and
 * subtract-with-carry sequence. If the length is at most 15 the asm branches
 * straight to .LAD1. Otherwise it loads d0-d4 from the state, and per block:
 * loads 16 bytes with post-increment, reverses the bytes of each 64-bit lane,
 * XORs them into q0, subtracts 16 from the length, and repeats while more than
 * 15 bytes remain; d0-d4 are then stored back. `ad` and `adlen` are updated.
 * NOTE(review): only two `vmvn d2, d2` lines are visible between the absorb
 * and the length update; the permutation rounds presumably sit between them
 * in the full file - confirm.
 * NOTE(review): the labels .LAD0/.LAD1 are fixed names, so the macro can
 * likely be expanded only once per assembled unit.
 */
#define AD() \
do { \
uint32_t adlen_hi = (uint32_t)(adlen >> 32); \
uint32_t adlen_lo = (uint32_t)adlen; \
__asm__ __volatile__ ( \
".arm \n\t" \
".fpu neon \n\t" \
"cmp %[adlen_hi], #0 \n\t" \
"cmpeq %[adlen_lo], #15 \n\t" \
"bls .LAD1 \n\t" \
"vldm %[s], {d0-d4} \n\t" \
".LAD0: \n\t" \
"vldm %[ad]!, {d16,d17} \n\t" \
"vrev64.8 q8, q8 \n\t" \
"veor q0, q0, q8 \n\t" \
"vmvn d2, d2 \n\t" \
"vmvn d2, d2 \n\t" \
"subs %[adlen_lo], %[adlen_lo], #16 \n\t" \
"sbc %[adlen_hi], %[adlen_hi], #0 \n\t" \
"cmp %[adlen_hi], #0 \n\t" \
"cmpeq %[adlen_lo], #15 \n\t" \
"bhi .LAD0 \n\t" \
"vstm %[s], {d0-d4} \n\t" \
".LAD1: \n\t" \
: [adlen_hi] "+r" (adlen_hi), [adlen_lo] "+r" (adlen_lo), \
[ad] "+r" (ad) \
: [s] "r" (s), [C] "r" (C) \
: "d0", "d1", "d2", "d3", "d4", \
"d10", "d11", "d12", "d13", "d14", "d16", "d17", \
"d20", "d21", "d22", "d23", "d24", \
"d31", "memory"); \
adlen = (uint64_t)adlen_hi << 32 | adlen_lo; \
} while (0)
/*
 * PT(): ARM NEON inline-assembly loop that encrypts full 16-byte plaintext
 * blocks. Expects `s`, `m`, `c`, `mlen` and `C` to be in scope.
 *
 * If the length is at most 15 the asm branches straight to .LPT1. Otherwise,
 * per block: 16 plaintext bytes are loaded with post-increment, byte-reversed
 * per 64-bit lane and XORed into q0; q0 is byte-reversed again into q13 and
 * stored to `c` with post-increment as the ciphertext; 16 is subtracted from
 * the split 64-bit length. d0-d4 are stored back to the state afterwards.
 * `m`, `c` and `mlen` are updated.
 * NOTE(review): only two `vmvn d2, d2` lines are visible where the
 * permutation rounds presumably sit in the full file - confirm.
 */
#define PT() \
do { \
uint32_t mlen_hi = (uint32_t)(mlen >> 32); \
uint32_t mlen_lo = (uint32_t)mlen; \
__asm__ __volatile__ ( \
".arm \n\t" \
".fpu neon \n\t" \
"cmp %[mlen_hi], #0 \n\t" \
"cmpeq %[mlen_lo], #15 \n\t" \
"bls .LPT1 \n\t" \
"vldm %[s], {d0-d4} \n\t" \
".LPT0: \n\t" \
"vldm %[m]!, {d16,d17} \n\t" \
"vrev64.8 q8, q8 \n\t" \
"veor q0, q0, q8 \n\t" \
"vrev64.8 q13, q0 \n\t" \
"vstm %[c]!, {d26,d27} \n\t" \
"vmvn d2, d2 \n\t" \
"vmvn d2, d2 \n\t" \
"subs %[mlen_lo], %[mlen_lo], #16 \n\t" \
"sbc %[mlen_hi], %[mlen_hi], #0 \n\t" \
"cmp %[mlen_hi], #0 \n\t" \
"cmpeq %[mlen_lo], #15 \n\t" \
"bhi .LPT0 \n\t" \
"vstm %[s], {d0-d4} \n\t" \
".LPT1: \n\t" \
: [mlen_hi] "+r" (mlen_hi), [mlen_lo] "+r" (mlen_lo), \
[m] "+r" (m), [c] "+r" (c) \
: [s] "r" (s), [C] "r" (C) \
: "d0", "d1", "d2", "d3", "d4", \
"d10", "d11", "d12", "d13", "d14", "d16", "d17", \
"d20", "d21", "d22", "d23", "d24", "d26", "d27", \
"d31", "memory"); \
mlen = (uint64_t)mlen_hi << 32 | mlen_lo; \
} while (0)
/*
 * CT(): ARM NEON inline-assembly loop that decrypts full 16-byte ciphertext
 * blocks. Expects `s`, `m`, `c`, `clen` and `C` to be in scope.
 *
 * If the length is at most 15 the asm branches straight to .LCT1. Otherwise,
 * per block: 16 ciphertext bytes are loaded into q13 with post-increment; the
 * byte-reversed q0 XOR q13 gives the plaintext, which is stored to `m` with
 * post-increment; q0 is replaced by the byte-reversed ciphertext; 16 is
 * subtracted from the split 64-bit length. d0-d4 are stored back to the state
 * afterwards. `m`, `c` and `clen` are updated.
 * NOTE(review): only two `vmvn d2, d2` lines are visible where the
 * permutation rounds presumably sit in the full file - confirm.
 */
#define CT() \
do { \
uint32_t clen_hi = (uint32_t)(clen >> 32); \
uint32_t clen_lo = (uint32_t)clen; \
__asm__ __volatile__ ( \
".arm \n\t" \
".fpu neon \n\t" \
"cmp %[clen_hi], #0 \n\t" \
"cmpeq %[clen_lo], #15 \n\t" \
"bls .LCT1 \n\t" \
"vldm %[s], {d0-d4} \n\t" \
".LCT0: \n\t" \
"vldm %[c]!, {d26,d27} \n\t" \
"vrev64.8 q8, q0 \n\t" \
"veor q8, q8, q13 \n\t" \
"vrev64.8 q0, q13 \n\t" \
"vstm %[m]!, {d16,d17} \n\t" \
"vmvn d2, d2 \n\t" \
"vmvn d2, d2 \n\t" \
"subs %[clen_lo], %[clen_lo], #16 \n\t" \
"sbc %[clen_hi], %[clen_hi], #0 \n\t" \
"cmp %[clen_hi], #0 \n\t" \
"cmpeq %[clen_lo], #15 \n\t" \
"bhi .LCT0 \n\t" \
"vstm %[s], {d0-d4} \n\t" \
".LCT1: \n\t" \
: [clen_hi] "+r" (clen_hi), [clen_lo] "+r" (clen_lo), \
[m] "+r" (m), [c] "+r" (c) \
: [s] "r" (s), [C] "r" (C) \
: "d0", "d1", "d2", "d3", "d4", \
"d10", "d11", "d12", "d13", "d14", "d16", "d17", \
"d20", "d21", "d22", "d23", "d24", "d26", "d27", \
"d31", "memory"); \
clen = (uint64_t)clen_hi << 32 | clen_lo; \
} while (0)
/* `forceinline` expands to nothing below, so these are ordinary functions. */
#undef forceinline
#define forceinline
/*
 * Load the key bytes at `k` into K0, K1, K2 after zeroing them with KINIT.
 * For 20-byte keys the first 4 bytes go to K0 and `k` is advanced by 4; K1 and
 * K2 take 8 bytes each.
 * NOTE(review): closing braces are missing from this excerpt.
 */
forceinline void ascon_loadkey(word_t* K0, word_t* K1, word_t* K2,
const uint8_t* k) {
KINIT(K0, K1, K2);
if (CRYPTO_KEYBYTES == 20) {
*K0 = XOR(*K0, KEYROT(WORD_T(0), LOAD(k, 4)));
k += 4;
*K1 = XOR(*K1, LOAD(k, 8));
*K2 = XOR(*K2, LOAD(k + 8, 8));
/*
 * Initialization phase: mix IV, key and nonce into the state, run the 12-round
 * permutation, then XOR the key into the state again.
 * NOTE(review): no zeroing of *s and no guard selecting ASCON_128_IV are
 * visible in this excerpt - confirm against the full file.
 */
forceinline void ascon_init(state_t* s, const uint8_t* npub, const uint8_t* k) {
/* load nonce */
word_t N0 = LOAD(npub, 8);
word_t N1 = LOAD(npub + 8, 8);
/* load key */
word_t K0, K1, K2;
ascon_loadkey(&K0, &K1, &K2, k);
/* initialize */
s->x0 = XOR(s->x0, ASCON_128_IV);
if (CRYPTO_KEYBYTES == 16 && ASCON_RATE == 16)
s->x0 = XOR(s->x0, ASCON_128A_IV);
if (CRYPTO_KEYBYTES == 20) s->x0 = XOR(s->x0, ASCON_80PQ_IV);
if (CRYPTO_KEYBYTES == 20) s->x0 = XOR(s->x0, K0);
s->x1 = XOR(s->x1, K1);
s->x2 = XOR(s->x2, K2);
s->x3 = XOR(s->x3, N0);
s->x4 = XOR(s->x4, N1);
P(s, 12);
/* XOR the key into the state once more after the permutation */
if (CRYPTO_KEYBYTES == 20) s->x2 = XOR(s->x2, K0);
s->x3 = XOR(s->x3, K1);
s->x4 = XOR(s->x4, K2);
printstate("initialization", s);
/*
 * Absorb the associated data into the state and apply domain separation.
 * NOTE(review): nothing follows the "full associated data blocks" comment in
 * this excerpt; presumably the AD() assembly macro is invoked there - confirm.
 * Closing braces are missing as well.
 */
forceinline void ascon_adata(state_t* s, const uint8_t* ad, uint64_t adlen) {
const int nr = (ASCON_RATE == 8) ? 6 : 8;
if (adlen) {
/* full associated data blocks */
/* final associated data block */
word_t* px = &s->x0;
if (ASCON_RATE == 16 && adlen >= 8) {
s->x0 = XOR(s->x0, LOAD(ad, 8));
px = &s->x1;
ad += 8;
adlen -= 8;
/* padding: 0x80 right after the last data byte of the word */
*px = XOR(*px, PAD(adlen));
if (adlen) *px = XOR(*px, LOAD(ad, adlen));
P(s, nr);
/* domain separation */
s->x4 = XOR(s->x4, WORD_T(1));
printstate("process associated data", s);
/*
 * Encrypt `mlen` plaintext bytes from `m` into `c`, updating the state.
 * NOTE(review): nothing follows the "full plaintext blocks" comment in this
 * excerpt; presumably the PT() assembly macro is invoked there - confirm.
 * Closing braces are missing as well.
 */
forceinline void ascon_encrypt(state_t* s, uint8_t* c, const uint8_t* m,
uint64_t mlen) {
const int nr = (ASCON_RATE == 8) ? 6 : 8;
/* full plaintext blocks */
/* final plaintext block */
word_t* px = &s->x0;
if (ASCON_RATE == 16 && mlen >= 8) {
s->x0 = XOR(s->x0, LOAD(m, 8));
STORE(c, s->x0, 8);
px = &s->x1;
m += 8;
c += 8;
mlen -= 8;
/* padding: 0x80 right after the last plaintext byte of the word */
*px = XOR(*px, PAD(mlen));
if (mlen) {
*px = XOR(*px, LOAD(m, mlen));
STORE(c, *px, mlen);
printstate("process plaintext", s);
/*
 * Decrypt `clen` ciphertext bytes from `c` into `m`, updating the state.
 * NOTE(review): nothing follows the "full ciphertext blocks" comment in this
 * excerpt; presumably the CT() assembly macro is invoked there - confirm.
 * Closing braces are missing as well.
 */
forceinline void ascon_decrypt(state_t* s, uint8_t* m, const uint8_t* c,
uint64_t clen) {
const int nr = (ASCON_RATE == 8) ? 6 : 8;
/* full ciphertext blocks */
/* final ciphertext block */
word_t* px = &s->x0;
if (ASCON_RATE == 16 && clen >= 8) {
word_t cx = LOAD(c, 8);
s->x0 = XOR(s->x0, cx);
STORE(m, s->x0, 8);
/* the state word is replaced by the ciphertext word */
s->x0 = cx;
px = &s->x1;
m += 8;
c += 8;
clen -= 8;
*px = XOR(*px, PAD(clen));
if (clen) {
word_t cx = LOAD(c, clen);
*px = XOR(*px, cx);
STORE(m, *px, clen);
/* replace the first clen bytes of the state word with the ciphertext */
*px = CLEAR(*px, clen);
*px = XOR(*px, cx);
printstate("process ciphertext", s);
/*
 * Finalization phase: XOR the key into the state at a variant-dependent
 * position, run the 12-round permutation, then XOR K1/K2 into x3/x4.
 * NOTE(review): the closing braces of the variant branches are missing from
 * this excerpt.
 */
forceinline void ascon_final(state_t* s, const uint8_t* k) {
/* load key */
word_t K0, K1, K2;
ascon_loadkey(&K0, &K1, &K2, k);
/* finalize */
if (CRYPTO_KEYBYTES == 16 && ASCON_RATE == 8) {
s->x1 = XOR(s->x1, K1);
s->x2 = XOR(s->x2, K2);
if (CRYPTO_KEYBYTES == 16 && ASCON_RATE == 16) {
s->x2 = XOR(s->x2, K1);
s->x3 = XOR(s->x3, K2);
if (CRYPTO_KEYBYTES == 20) {
s->x1 = XOR(s->x1, KEYROT(K0, K1));
s->x2 = XOR(s->x2, KEYROT(K1, K2));
s->x3 = XOR(s->x3, KEYROT(K2, WORD_T(0)));
P(s, 12);
s->x3 = XOR(s->x3, K1);
s->x4 = XOR(s->x4, K2);
printstate("finalization", s);
/*
 * Authenticated encryption entry point: writes mlen ciphertext bytes followed
 * by the 16-byte tag to `c`, sets *clen to mlen + CRYPTO_ABYTES and returns 0.
 * `nsec` is not referenced in the visible code.
 */
int crypto_aead_encrypt(unsigned char* c, unsigned long long* clen,
const unsigned char* m, unsigned long long mlen,
const unsigned char* ad, unsigned long long adlen,
const unsigned char* nsec, const unsigned char* npub,
const unsigned char* k) {
state_t s;
*clen = mlen + CRYPTO_ABYTES;
/* perform ascon computation */
ascon_init(&s, npub, k);
ascon_adata(&s, ad, adlen);
ascon_encrypt(&s, c, m, mlen);
ascon_final(&s, k);
/* set tag */
/* tag = x3 || x4, written byte by byte right after the ciphertext */
STOREBYTES(c + mlen, s.x3, 8);
STOREBYTES(c + mlen + 8, s.x4, 8);
return 0;
/*
 * Authenticated decryption entry point: returns -1 if the input is shorter
 * than a tag; otherwise decrypts into `m`, sets *mlen and returns the result
 * of NOTZERO over the XOR of computed and received tag.
 * The plaintext has already been written to `m` when the tag is compared; the
 * visible code does not clear it on a mismatch. `nsec` is not referenced.
 */
int crypto_aead_decrypt(unsigned char* m, unsigned long long* mlen,
unsigned char* nsec, const unsigned char* c,
unsigned long long clen, const unsigned char* ad,
unsigned long long adlen, const unsigned char* npub,
const unsigned char* k) {
state_t s;
if (clen < CRYPTO_ABYTES) return -1;
/* from here on clen is the ciphertext length without the tag */
*mlen = clen = clen - CRYPTO_ABYTES;
/* perform ascon computation */
ascon_init(&s, npub, k);
ascon_adata(&s, ad, adlen);
ascon_decrypt(&s, m, c, clen);
ascon_final(&s, k);
/* verify tag (should be constant time, check compiler output) */
s.x3 = XOR(s.x3, LOADBYTES(c + clen, 8));
s.x4 = XOR(s.x4, LOADBYTES(c + clen + 8, 8));
return NOTZERO(s.x3, s.x4);
#include "ascon.h"
#include "word.h"
/*
 * Debug printing interface: either real functions or no-op macros.
 * NOTE(review): the preprocessor condition choosing between the two forms and
 * the opening include guard are missing from this excerpt.
 */
void printword(const char* text, const word_t x);
void printstate(const char* text, const state_t* s);
/* no-op replacements: the arguments are not evaluated */
#define printword(text, w) \
do { \
} while (0)
#define printstate(text, s) \
do { \
} while (0)
#endif /* PRINTSTATE_H_ */
#include "api.h"
#include "ascon.h"
#include "permutations.h"
#include "printstate.h"
/*
 * Generic data-phase update for a 16-byte rate: XOR `len` input bytes into
 * x0/x1 and, depending on `mode`, write the result to `out` (ASCON_SQUEEZE)
 * and/or overwrite the state with the input (ASCON_INSERT).
 * NOTE(review): closing braces, `else` lines and the advance of `in` are
 * missing from this excerpt, so the branch structure of the final block cannot
 * be read reliably here - confirm against the full file.
 */
void ascon_update(state_t* s, uint8_t* out, const uint8_t* in, uint64_t len,
uint8_t mode) {
const int nr = (ASCON_RATE == 8) ? 6 : 8;
word_t tmp0, tmp1;
/* full blocks */
while (len >= ASCON_RATE) {
tmp0 = LOAD(in, 8);
tmp1 = LOAD(in + 8, 8);
s->x0 = XOR(s->x0, tmp0);
s->x1 = XOR(s->x1, tmp1);
if (mode & ASCON_SQUEEZE) {
STORE(out, s->x0, 8);
STORE(out + 8, s->x1, 8);
if (mode & ASCON_INSERT) {
s->x0 = tmp0;
s->x1 = tmp1;
P(s, nr);
out += ASCON_RATE;
len -= ASCON_RATE;
/* final block */
if (len) {
tmp1 = WORD_T(0);
if (len >= 8) tmp0 = LOAD(in, 8);
if (len > 8)
tmp1 = LOAD(in + 8, len - 8);
tmp0 = LOAD(in, len);
s->x0 = XOR(s->x0, tmp0);
s->x1 = XOR(s->x1, tmp1);
if (mode & ASCON_SQUEEZE) {
if (len >= 8) STORE(out, s->x0, 8);
if (len > 8)
STORE(out + 8, s->x1, len - 8);
STORE(out, s->x0, len);
if (mode & ASCON_INSERT) {
if (len >= 8) s->x0 = tmp0;
if (len > 8) {
s->x1 = CLEAR(s->x1, len - 8);
s->x1 = XOR(s->x1, tmp1);
} else {
s->x0 = CLEAR(s->x0, len);
s->x0 = XOR(s->x0, tmp0);
/* padding goes into x0 for short blocks, otherwise into x1 */
if (len < 8)
s->x0 = XOR(s->x0, PAD(len % 8));
s->x1 = XOR(s->x1, PAD(len % 8));
#include "api.h"
#include "ascon.h"
#include "crypto_aead.h"
#include "permutations.h"
#include "printstate.h"
/*
 * Byte-shuffle control vector for _mm512_maskz_shuffle_epi8: reverses the byte
 * order inside each of the two lowest 64-bit words; the -1 control bytes make
 * words 2..7 come out as zero. Arguments run from the highest byte down to
 * byte 0.
 */
#define AVX512_SHUFFLE_U64BIG \
_mm512_set_epi8(-1, -1, -1, -1, -1, -1, -1, -1, /* word 7 */ \
-1, -1, -1, -1, -1, -1, -1, -1, /* word 6 */ \
-1, -1, -1, -1, -1, -1, -1, -1, /* word 5 */ \
-1, -1, -1, -1, -1, -1, -1, -1, /* word 4 */ \
-1, -1, -1, -1, -1, -1, -1, -1, /* word 3 */ \
-1, -1, -1, -1, -1, -1, -1, -1, /* word 2 */ \
8, 9, 10, 11, 12, 13, 14, 15, /* word 1 */ \
0, 1, 2, 3, 4, 5, 6, 7) /* word 0 */
/* `forceinline` expands to nothing below, so these are ordinary functions. */
#undef forceinline
#define forceinline
/*
 * Load the key bytes at `k` into K0, K1, K2 after zeroing them with KINIT.
 * For 20-byte keys the first 4 bytes go to K0 and `k` is advanced by 4; K1 and
 * K2 take 8 bytes each.
 * NOTE(review): closing braces are missing from this excerpt.
 */
forceinline void ascon_loadkey(word_t* K0, word_t* K1, word_t* K2,
const uint8_t* k) {
KINIT(K0, K1, K2);
if (CRYPTO_KEYBYTES == 20) {
*K0 = XOR(*K0, KEYROT(WORD_T(0), LOAD(k, 4)));
k += 4;
*K1 = XOR(*K1, LOAD(k, 8));
*K2 = XOR(*K2, LOAD(k + 8, 8));
/*
 * Initialization phase: mix IV, key and nonce into the state, run the 12-round
 * permutation, then XOR the key into the state again.
 * NOTE(review): no zeroing of *s and no guard selecting ASCON_128_IV are
 * visible in this excerpt - confirm against the full file.
 */
forceinline void ascon_init(state_t* s, const uint8_t* npub, const uint8_t* k) {
/* load nonce */
word_t N0 = LOAD(npub, 8);
word_t N1 = LOAD(npub + 8, 8);
/* load key */
word_t K0, K1, K2;
ascon_loadkey(&K0, &K1, &K2, k);
/* initialize */
s->x0 = XOR(s->x0, ASCON_128_IV);
if (CRYPTO_KEYBYTES == 16 && ASCON_RATE == 16)
s->x0 = XOR(s->x0, ASCON_128A_IV);
if (CRYPTO_KEYBYTES == 20) s->x0 = XOR(s->x0, ASCON_80PQ_IV);
if (CRYPTO_KEYBYTES == 20) s->x0 = XOR(s->x0, K0);
s->x1 = XOR(s->x1, K1);
s->x2 = XOR(s->x2, K2);
s->x3 = XOR(s->x3, N0);
s->x4 = XOR(s->x4, N1);
P(s, 12);
/* XOR the key into the state once more after the permutation */
if (CRYPTO_KEYBYTES == 20) s->x2 = XOR(s->x2, K0);
s->x3 = XOR(s->x3, K1);
s->x4 = XOR(s->x4, K2);
printstate("initialization", s);
/*
 * Absorb the associated data into the state and apply domain separation.
 * Full blocks use AVX-512 operations on a local copy of the state; the rest
 * plus padding uses the scalar helpers.
 * NOTE(review): closing braces and the per-block advance of `ad` are not
 * visible in this excerpt - confirm against the full file.
 */
forceinline void ascon_adata(state_t* s, const uint8_t* ad, uint64_t adlen) {
const __m512i u64big = AVX512_SHUFFLE_U64BIG;
/* byte mask selecting the low 8 or 16 bytes (one rate block) */
const int mask = (ASCON_RATE == 8) ? 0xff : 0xffff;
const int nr = (ASCON_RATE == 8) ? 6 : 8;
state_t r = *s, t;
if (adlen) {
/* full associated data blocks */
while (adlen >= ASCON_RATE) {
/* load one block, byte-swap each 64-bit word, XOR it into the state */
t.z = _mm512_maskz_loadu_epi8(mask, ad);
t.z = _mm512_maskz_shuffle_epi8(mask, t.z, u64big);
r.z = _mm512_xor_epi64(r.z, t.z);
P(&r, nr);
adlen -= ASCON_RATE;
*s = r;
/* final associated data block */
word_t* px = &s->x0;
if (ASCON_RATE == 16 && adlen >= 8) {
s->x0 = XOR(s->x0, LOAD(ad, 8));
px = &s->x1;
ad += 8;
adlen -= 8;
*px = XOR(*px, PAD(adlen));
if (adlen) *px = XOR(*px, LOAD(ad, adlen));
P(s, nr);
/* domain separation */
s->x4 = XOR(s->x4, WORD_T(1));
printstate("process associated data", s);
/*
 * Encrypt `mlen` plaintext bytes from `m` into `c`, updating the state.
 * NOTE(review): closing braces and the per-block advance of `m` and `c` are
 * not visible in this excerpt - confirm against the full file.
 */
forceinline void ascon_encrypt(state_t* s, uint8_t* c, const uint8_t* m,
uint64_t mlen) {
const __m512i u64big = AVX512_SHUFFLE_U64BIG;
/* byte mask selecting the low 8 or 16 bytes (one rate block) */
const int mask = (ASCON_RATE == 8) ? 0xff : 0xffff;
const int nr = (ASCON_RATE == 8) ? 6 : 8;
state_t r = *s, t;
/* full plaintext blocks */
while (mlen >= ASCON_RATE) {
/* XOR the byte-swapped plaintext block into the state */
t.z = _mm512_maskz_loadu_epi8(mask, m);
t.z = _mm512_maskz_shuffle_epi8(mask, t.z, u64big);
r.z = _mm512_xor_epi64(r.z, t.z);
/* the rate part of the state, swapped back to byte order, is the ciphertext */
t.z = _mm512_maskz_shuffle_epi8(mask, r.z, u64big);
_mm512_mask_storeu_epi8(c, mask, t.z);
P(&r, nr);
mlen -= ASCON_RATE;
*s = r;
/* final plaintext block */
word_t* px = &s->x0;
if (ASCON_RATE == 16 && mlen >= 8) {
s->x0 = XOR(s->x0, LOAD(m, 8));
STORE(c, s->x0, 8);
px = &s->x1;
m += 8;
c += 8;
mlen -= 8;
*px = XOR(*px, PAD(mlen));
if (mlen) {
*px = XOR(*px, LOAD(m, mlen));
STORE(c, *px, mlen);
printstate("process plaintext", s);
/*
 * Decrypt `clen` ciphertext bytes from `c` into `m`, updating the state.
 * NOTE(review): closing braces and the per-block advance of `m` and `c` are
 * not visible in this excerpt - confirm against the full file.
 */
forceinline void ascon_decrypt(state_t* s, uint8_t* m, const uint8_t* c,
uint64_t clen) {
const __m512i u64big = AVX512_SHUFFLE_U64BIG;
/* byte mask selecting the low 8 or 16 bytes (one rate block) */
const int mask = (ASCON_RATE == 8) ? 0xff : 0xffff;
const int nr = (ASCON_RATE == 8) ? 6 : 8;
state_t r = *s, t, u;
/* full ciphertext blocks */
while (clen >= ASCON_RATE) {
/* t = byte-swapped ciphertext block */
t.z = _mm512_maskz_loadu_epi8(mask, c);
t.z = _mm512_maskz_shuffle_epi8(mask, t.z, u64big);
/* state ^ ciphertext = plaintext (in the rate bytes) */
r.z = _mm512_xor_epi64(r.z, t.z);
u.z = _mm512_maskz_shuffle_epi8(mask, r.z, u64big);
/* rate bytes of the state become the ciphertext; other bytes are kept */
r.z = _mm512_mask_blend_epi8(mask, r.z, t.z);
_mm512_mask_storeu_epi8(m, mask, u.z);
P(&r, nr);
clen -= ASCON_RATE;
*s = r;
/* final ciphertext block */
word_t* px = &s->x0;
if (ASCON_RATE == 16 && clen >= 8) {
word_t cx = LOAD(c, 8);
s->x0 = XOR(s->x0, cx);
STORE(m, s->x0, 8);
s->x0 = cx;
px = &s->x1;
m += 8;
c += 8;
clen -= 8;
*px = XOR(*px, PAD(clen));
if (clen) {
word_t cx = LOAD(c, clen);
*px = XOR(*px, cx);
STORE(m, *px, clen);
/* replace the first clen bytes of the state word with the ciphertext */
*px = CLEAR(*px, clen);
*px = XOR(*px, cx);
printstate("process ciphertext", s);
/*
 * Finalization phase: XOR the key into the state at a variant-dependent
 * position, run the 12-round permutation, then XOR K1/K2 into x3/x4.
 * NOTE(review): the closing braces of the variant branches are missing from
 * this excerpt.
 */
forceinline void ascon_final(state_t* s, const uint8_t* k) {
/* load key */
word_t K0, K1, K2;
ascon_loadkey(&K0, &K1, &K2, k);
/* finalize */
if (CRYPTO_KEYBYTES == 16 && ASCON_RATE == 8) {
s->x1 = XOR(s->x1, K1);
s->x2 = XOR(s->x2, K2);
if (CRYPTO_KEYBYTES == 16 && ASCON_RATE == 16) {
s->x2 = XOR(s->x2, K1);
s->x3 = XOR(s->x3, K2);
if (CRYPTO_KEYBYTES == 20) {
s->x1 = XOR(s->x1, KEYROT(K0, K1));
s->x2 = XOR(s->x2, KEYROT(K1, K2));
s->x3 = XOR(s->x3, KEYROT(K2, WORD_T(0)));
P(s, 12);
s->x3 = XOR(s->x3, K1);
s->x4 = XOR(s->x4, K2);
printstate("finalization", s);
/*
 * Authenticated encryption entry point: writes mlen ciphertext bytes followed
 * by the 16-byte tag to `c`, sets *clen to mlen + CRYPTO_ABYTES and returns 0.
 * `nsec` is not referenced in the visible code.
 */
int crypto_aead_encrypt(unsigned char* c, unsigned long long* clen,
const unsigned char* m, unsigned long long mlen,
const unsigned char* ad, unsigned long long adlen,
const unsigned char* nsec, const unsigned char* npub,
const unsigned char* k) {
state_t s;
*clen = mlen + CRYPTO_ABYTES;
/* perform ascon computation */
ascon_init(&s, npub, k);
ascon_adata(&s, ad, adlen);
ascon_encrypt(&s, c, m, mlen);
ascon_final(&s, k);
/* set tag */
/* tag = x3 || x4, written byte by byte right after the ciphertext */
STOREBYTES(c + mlen, s.x3, 8);
STOREBYTES(c + mlen + 8, s.x4, 8);
return 0;
/*
 * Authenticated decryption entry point: returns -1 if the input is shorter
 * than a tag; otherwise decrypts into `m`, sets *mlen and returns the result
 * of NOTZERO over the XOR of computed and received tag.
 * The plaintext has already been written to `m` when the tag is compared; the
 * visible code does not clear it on a mismatch. `nsec` is not referenced.
 */
int crypto_aead_decrypt(unsigned char* m, unsigned long long* mlen,
unsigned char* nsec, const unsigned char* c,
unsigned long long clen, const unsigned char* ad,
unsigned long long adlen, const unsigned char* npub,
const unsigned char* k) {
state_t s;
if (clen < CRYPTO_ABYTES) return -1;
/* from here on clen is the ciphertext length without the tag */
*mlen = clen = clen - CRYPTO_ABYTES;
/* perform ascon computation */
ascon_init(&s, npub, k);
ascon_adata(&s, ad, adlen);
ascon_decrypt(&s, m, c, clen);
ascon_final(&s, k);
/* verify tag (should be constant time, check compiler output) */
s.x3 = XOR(s.x3, LOADBYTES(c + clen, 8));
s.x4 = XOR(s.x4, LOADBYTES(c + clen + 8, 8));
return NOTZERO(s.x3, s.x4);
/* Implementation version string. */
#define CRYPTO_VERSION "1.2.4"
/* Authentication tag length in bytes. */
#define CRYPTO_ABYTES 16
/* Rate: number of data bytes processed per block in the data phases. */
#define ASCON_RATE 8
#ifndef ASCON_H_
#define ASCON_H_
#include <immintrin.h>
#include <stdint.h>
#include "word.h"
/*
 * Ascon state: one 512-bit AVX-512 register `z` overlaid with eight 64-bit
 * words.
 * NOTE(review): the line closing the inner struct is missing from this
 * excerpt.
 */
typedef union {
__m512i z;
struct {
word_t x0, x1, x2, x3, x4, x5, x6, x7;
} state_t;
/* Phases of the AEAD mode; all of them update the state in place. */
void ascon_init(state_t* s, const uint8_t* npub, const uint8_t* k);
void ascon_adata(state_t* s, const uint8_t* ad, uint64_t adlen);
void ascon_encrypt(state_t* s, uint8_t* c, const uint8_t* m, uint64_t mlen);
void ascon_decrypt(state_t* s, uint8_t* m, const uint8_t* c, uint64_t clen);
void ascon_final(state_t* s, const uint8_t* k);
#endif /* ASCON_H_ */
/*
 * Build configuration switches.
 * NOTE(review): only the describing comments are present in this excerpt; the
 * #define lines they belong to are not visible.
 */
#ifndef CONFIG_H_
#define CONFIG_H_
/* inline the ascon mode */
/* inline all permutations */
/* unroll permutation loops */
#endif /* CONFIG_H_ */
#ifndef ENDIAN_H_
#define ENDIAN_H_

/*
 * Byte-order helpers.
 *
 * U64BIG/U32BIG/U16BIG convert a value between host byte order and
 * big-endian byte order: the identity on big-endian hosts, a byte swap on
 * little-endian hosts. The conversion is its own inverse. The macros
 * evaluate their argument several times, so pass side-effect-free
 * expressions only.
 */
#if defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__

/* macros for big endian machines */
#pragma message("Using macros for big endian machines")
#define U64BIG(x) (x)
#define U32BIG(x) (x)
#define U16BIG(x) (x)

#elif defined(_MSC_VER) || \
    (defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__)

/* macros for little endian machines */
#pragma message("Using macros for little endian machines")
#define U64BIG(x)                          \
  (((0x00000000000000FFULL & (x)) << 56) | \
   ((0x000000000000FF00ULL & (x)) << 40) | \
   ((0x0000000000FF0000ULL & (x)) << 24) | \
   ((0x00000000FF000000ULL & (x)) << 8) |  \
   ((0x000000FF00000000ULL & (x)) >> 8) |  \
   ((0x0000FF0000000000ULL & (x)) >> 24) | \
   ((0x00FF000000000000ULL & (x)) >> 40) | \
   ((0xFF00000000000000ULL & (x)) >> 56))
#define U32BIG(x)                                           \
  (((0x000000FF & (x)) << 24) | ((0x0000FF00 & (x)) << 8) | \
   ((0x00FF0000 & (x)) >> 8) | ((0xFF000000 & (x)) >> 24))
#define U16BIG(x) (((0x00FF & (x)) << 8) | ((0xFF00 & (x)) >> 8))

#else
/* only reached when the byte order cannot be determined at compile time */
#error "Ascon byte order macros not defined in endian.h"
#endif

#endif /* ENDIAN_H_ */
#include "permutations.h"
/* Out-of-line 12-round permutation: forwards to P12ROUNDS. */
void P12(state_t* s) {
  P12ROUNDS(s);
}
/* Out-of-line 8-round permutation: forwards to P8ROUNDS. */
void P8(state_t* s) {
  P8ROUNDS(s);
}
/* Out-of-line 6-round permutation: forwards to P6ROUNDS. */
void P6(state_t* s) {
  P6ROUNDS(s);
}
/* Out-of-line permutation with a run-time round count: forwards to PROUNDS. */
void P(state_t* s, int nr) {
  PROUNDS(s, nr);
}
#include <stdint.h>
#include "api.h"
#include "ascon.h"
#include "config.h"
#include "printstate.h"
#include "round.h"
/* key sizes in bytes, per variant */
#define ASCON_128_KEYBYTES 16
#define ASCON_128A_KEYBYTES 16
#define ASCON_80PQ_KEYBYTES 20
/* rates per variant - presumably bytes per block, matching ASCON_RATE; confirm */
#define ASCON_128_RATE 8
#define ASCON_128A_RATE 16
/* permutation round counts (PA / PB) */
#define ASCON_128_PA_ROUNDS 12
#define ASCON_128_PB_ROUNDS 6
#define ASCON_128A_PB_ROUNDS 8
/* 64-bit initial values, one per variant */
#define ASCON_128_IV WORD_T(0x80400c0600000000)
#define ASCON_128A_IV WORD_T(0x80800c0800000000)
#define ASCON_80PQ_IV WORD_T(0xa0400c0600000000)
#define ASCON_HASH_IV WORD_T(0x00400c0000000100)
#define ASCON_XOF_IV WORD_T(0x00400c0000000000)
/* NOTE(review): the five-word IV0..IV4 sets look like precomputed initial states for hash and XOF - confirm */
#define ASCON_HASH_IV0 WORD_T(0xee9398aadb67f03dull)
#define ASCON_HASH_IV1 WORD_T(0x8bb21831c60f1002ull)
#define ASCON_HASH_IV2 WORD_T(0xb48a92db98d5da62ull)
#define ASCON_HASH_IV3 WORD_T(0x43189921b8f8e3e8ull)
#define ASCON_HASH_IV4 WORD_T(0x348fa5c9d525e140ull)
#define ASCON_XOF_IV0 WORD_T(0xb57e273b814cd416ull)
#define ASCON_XOF_IV1 WORD_T(0x2b51042562ae2420ull)
#define ASCON_XOF_IV2 WORD_T(0x66a3a7768ddf2218ull)
#define ASCON_XOF_IV3 WORD_T(0x5aad0a7a8153650cull)
#define ASCON_XOF_IV4 WORD_T(0x4f3e0e32539493b6ull)
/* first round constant of an n-round permutation: high nibble 3 + n, low nibble 12 - n (n = 12 -> 0xf0, n = 8 -> 0xb4, n = 6 -> 0x96) */
#define START(n) ((3 + (n)) << 4 | (12 - (n)))
/* pass a round constant through WORD_T so it has the word type expected by ROUND */
#define RC(c) WORD_T(c)
/*
 * Twelve unrolled rounds on s. The round constants run from 0xf0 down to
 * 0x4b, each 0x0f smaller than the previous one.
 */
forceinline void P12ROUNDS(state_t* s) {
ROUND(s, RC(0xf0));
ROUND(s, RC(0xe1));
ROUND(s, RC(0xd2));
ROUND(s, RC(0xc3));
ROUND(s, RC(0xb4));
ROUND(s, RC(0xa5));
ROUND(s, RC(0x96));
ROUND(s, RC(0x87));
ROUND(s, RC(0x78));
ROUND(s, RC(0x69));
ROUND(s, RC(0x5a));
ROUND(s, RC(0x4b));
/*
 * Eight unrolled rounds on s, using the last eight constants of the
 * twelve-round sequence (0xb4 down to 0x4b).
 */
forceinline void P8ROUNDS(state_t* s) {
ROUND(s, RC(0xb4));
ROUND(s, RC(0xa5));
ROUND(s, RC(0x96));
ROUND(s, RC(0x87));
ROUND(s, RC(0x78));
ROUND(s, RC(0x69));
ROUND(s, RC(0x5a));
ROUND(s, RC(0x4b));
/*
 * Six unrolled rounds on s, using the last six constants of the
 * twelve-round sequence (0x96 down to 0x4b).
 */
forceinline void P6ROUNDS(state_t* s) {
ROUND(s, RC(0x96));
ROUND(s, RC(0x87));
ROUND(s, RC(0x78));
ROUND(s, RC(0x69));
ROUND(s, RC(0x5a));
ROUND(s, RC(0x4b));
/*
 * Looped permutation with nr rounds. The constant starts at START(nr) and
 * drops by 0x0f per round; the last constant used is 0x4b, so nr = 12, 8, 6
 * reproduce the constant sequences of P12ROUNDS, P8ROUNDS and P6ROUNDS.
 */
forceinline void PROUNDS(state_t* s, int nr) {
for (int i = START(nr); i > 0x4a; i -= 0x0f) ROUND(s, RC(i));
/*
 * NOTE(review): the definitions and declarations of P below all use the same
 * name and cannot be active together in one translation unit; presumably
 * preprocessor conditionals that are not visible in this listing select
 * exactly one of them - confirm against config.h.
 */
/* variant: dispatch on nr to the unrolled inline round sequences; any other nr does nothing */
forceinline void P(state_t* s, int nr) {
if (nr == 12) P12ROUNDS(s);
if (nr == 8) P8ROUNDS(s);
if (nr == 6) P6ROUNDS(s);
/* out-of-line fixed-round permutations, defined in permutations.c */
void P12(state_t* s);
void P8(state_t* s);
void P6(state_t* s);
/* variant: dispatch on nr to the out-of-line functions; any other nr does nothing */
forceinline void P(state_t* s, int nr) {
if (nr == 12) P12(s);
if (nr == 8) P8(s);
if (nr == 6) P6(s);
/* variant: inline looped permutation */
forceinline void P(state_t* s, int nr) { PROUNDS(s, nr); }
/* variant: out-of-line looped permutation, defined in permutations.c */
void P(state_t* s, int nr);
#endif /* PERMUTATIONS_H_ */
#include "printstate.h"
#include <inttypes.h>
#include <stdio.h>
/* Print one line "<text>=<x as 16 zero-padded hex digits>" to stdout. */
void printword(const char* text, const word_t x) {
printf("%s=%016" PRIx64 "\n", text, WORDTOU64(x));
/*
 * Print a header line "<text>:" followed by the state words x0..x4, one per
 * line via printword. x5..x7 are not printed.
 */
void printstate(const char* text, const state_t* s) {
printf("%s:\n", text);
printword(" x0", s->x0);
printword(" x1", s->x1);
printword(" x2", s->x2);
printword(" x3", s->x3);
printword(" x4", s->x4);
#include "ascon.h"
#include "word.h"
/* debug helpers implemented in printstate.c */
void printword(const char* text, const word_t x);
void printstate(const char* text, const state_t* s);
/*
 * No-op replacements: calls to printword/printstate expand to an empty
 * statement and their arguments are not evaluated.
 * NOTE(review): presumably these macros and the prototypes above sit in
 * opposite branches of a conditional that is not visible here - confirm.
 */
#define printword(text, w) \
do { \
} while (0)
#define printstate(text, s) \
do { \
} while (0)
#endif /* PRINTSTATE_H_ */
#ifndef ROUND_H_
#define ROUND_H_
#include "ascon.h"
#include "printstate.h"
/* Set the three key words to zero. */
forceinline void KINIT(word_t* K0, word_t* K1, word_t* K2) {
*K0 = WORD_T(0);
*K1 = WORD_T(0);
*K2 = WORD_T(0);
/* Set state words x0..x4 to zero; x5..x7 are left untouched. */
forceinline void PINIT(state_t* s) {
s->x0 = WORD_T(0);
s->x1 = WORD_T(0);
s->x2 = WORD_T(0);
s->x3 = WORD_T(0);
s->x4 = WORD_T(0);
/*
 * One permutation round on the 512-bit state s->z with round constant C.
 *
 * Lanes 0..4 of the vector are the words x0..x4. Every step is a lane
 * permutation followed by a three-input bitwise function
 * (_mm512_ternarylogic_epi64): 0x96 is a ^ b ^ c, 0xd2 is a ^ (~b & c).
 * Note that _mm512_set_epi64 lists lanes from 7 down to 0.
 */
forceinline void ROUND(state_t* s, word_t C) {
/* filler for permutation/rotation entries whose value is not used by lanes 0..4 */
uint64_t x = 0;
/* mask 0x15 selects lanes 0, 2, 4; mask 0x0b selects lanes 0, 1, 3 */
__mmask8 mxor1 = 0x15;
__mmask8 mxor2 = 0x0b;
/* source lanes for the pre-s-box XOR: lane0 <- 4, lane2 <- 1, lane4 <- 3 */
__m512i pxor1 = _mm512_set_epi64(x, x, x, 3, x, 1, x, 4);
/* source lanes for the post-s-box XOR: lane0 <- 4, lane1 <- 0, lane3 <- 2 */
__m512i pxor2 = _mm512_set_epi64(x, x, x, x, 2, x, 0, 4);
/* round constant in lane 2 only */
__m512i c = _mm512_set_epi64(x, x, x, 0, 0, C, 0, 0);
/* all-ones in lane 2 only: XORing it inverts x2 */
__m512i n = _mm512_set_epi64(x, x, x, 0, 0, ~0ull, 0, 0);
/* lane i <- lane (i + 1) mod 5 and lane i <- lane (i + 2) mod 5 */
__m512i pchi1 = _mm512_set_epi64(x, x, x, 0, 4, 3, 2, 1);
__m512i pchi2 = _mm512_set_epi64(x, x, x, 1, 0, 4, 3, 2);
/* per-lane right-rotation amounts for x0..x4: (19,28) (61,39) (1,6) (10,17) (7,41) */
__m512i rot1 = _mm512_set_epi64(x, x, x, 7, 10, 1, 61, 19);
__m512i rot2 = _mm512_set_epi64(x, x, x, 41, 17, 6, 39, 28);
__m512i t0, t1, t2;
/* round constant + s-box layer */
/* x0 ^= x4; x2 ^= x1 ^ C; x4 ^= x3 (unselected lanes of t0 are zeroed) */
t0 = _mm512_maskz_permutexvar_epi64(mxor1, pxor1, s->z);
t0 = _mm512_ternarylogic_epi64(s->z, t0, c, 0x96);
/* keccak s-box start */
/* x[i] ^= ~x[i + 1] & x[i + 2], indices mod 5, all lanes at once */
t1 = _mm512_permutexvar_epi64(pchi1, t0);
t2 = _mm512_permutexvar_epi64(pchi2, t0);
t0 = _mm512_ternarylogic_epi64(t0, t1, t2, 0xd2);
/* keccak s-box end */
/* x0 ^= x4; x1 ^= x0; x2 = ~x2; x3 ^= x2, all using the values before this step */
t1 = _mm512_maskz_permutexvar_epi64(mxor2, pxor2, t0);
t0 = _mm512_ternarylogic_epi64(t0, t1, n, 0x96);
/* linear layer */
/* x[i] ^= ror(x[i], rot1[i]) ^ ror(x[i], rot2[i]) */
t1 = _mm512_rorv_epi64(t0, rot1);
t2 = _mm512_rorv_epi64(t0, rot2);
s->z = _mm512_ternarylogic_epi64(t0, t1, t2, 0x96);
printstate(" round output", s);
#endif /* ROUND_H_ */
#ifndef WORD_H_
#define WORD_H_
#include <stdint.h>
#include "endian.h"
#include "forceinline.h"
/* a state word is a plain 64-bit unsigned integer in this implementation */
typedef uint64_t word_t;
/* conversion macros expand to nothing, so e.g. WORD_T(0) is just (0) */
#define WORD_T
#define UINT64_T
#define U64TOWORD
#define WORDTOU64
/*
 * Rotate x right by n bits.
 *
 * The count is reduced modulo 64, so n == 0 (or any multiple of 64) returns
 * x unchanged. The earlier form evaluated x << 64 for n == 0, which is
 * undefined behavior in C. Results for n in 1..63 are unchanged.
 */
forceinline word_t ROR(word_t x, int n) {
  const unsigned r = (unsigned)n & 63u;
  return (x >> r) | (x << ((64u - r) & 63u));
}
/* Bitwise complement of a. */
forceinline word_t NOT(word_t a) {
  const word_t inverted = ~a;
  return inverted;
}
/* Bitwise exclusive-or of a and b. */
forceinline word_t XOR(word_t a, word_t b) {
  const word_t mixed = a ^ b;
  return mixed;
}
/* Bitwise and of a and b. */
forceinline word_t AND(word_t a, word_t b) {
  const word_t common = a & b;
  return common;
}
/*
 * Combine two words across a 32-bit boundary: the low 32 bits of lo2hi become
 * the high half of the result, the high 32 bits of hi2lo become the low half.
 */
forceinline word_t KEYROT(word_t lo2hi, word_t hi2lo) {
return lo2hi << 32 | hi2lo >> 32;
/*
 * Branch-free zero test: returns 0 when a and b are both zero, -1 otherwise.
 *
 * All bits of a | b are folded into the low byte; the final expression maps
 * a zero byte to 0 and any non-zero byte to -1. It relies on >> of a negative
 * int being an arithmetic shift, which is implementation-defined in C.
 */
forceinline int NOTZERO(word_t a, word_t b) {
uint64_t result = a | b;
/* OR-fold 64 -> 32 -> 16 -> 8 bits so the low byte is non-zero iff any bit was set */
result |= result >> 32;
result |= result >> 16;
result |= result >> 8;
/* low byte 0: (0 - 1) >> 8 = -1, & 1 = 1, - 1 = 0; low byte 1..255: (v - 1) >> 8 = 0, - 1 = -1 */
return ((((int)(result & 0xff) - 1) >> 8) & 1) - 1;
/*
 * Padding word with the single byte 0x80 at byte position i, counted from
 * the most significant byte (i = 0 gives 0x80 in the top byte).
 */
forceinline word_t PAD(int i) {
  const int shift = 56 - 8 * i;
  return 0x80ull << shift;
}
/*
 * Zero the n most significant bytes of w and keep the rest
 * (n = 1 clears the top byte, n = 8 clears the whole word).
 */
forceinline word_t CLEAR(word_t w, int n) {
/* undefined for n == 0 */
/* n == 0 would make the shift count negative */
uint64_t mask = 0x00ffffffffffffffull >> (n * 8 - 8);
return w & mask;
/* Mask with the 8 * n least significant bits set, for n in 1..8. */
forceinline uint64_t MASK(int n) {
/* undefined for n == 0 */
/* n == 0 would shift a 64-bit value by 64 */
return ~0ull >> (64 - 8 * n);
/*
 * Load the first n bytes (0 <= n <= 8) at `bytes` as a big-endian value that
 * is left-aligned in a 64-bit word: bytes[0] becomes the most significant
 * byte and the unused low-order bytes are zero.
 *
 * Only bytes[0..n-1] are read. The earlier form always dereferenced a full
 * uint64_t through a cast pointer, which read past the end of short inputs,
 * assumed 8-byte alignment and was undefined for n == 0. Results for
 * n in 1..8 on little-endian hosts are unchanged; n == 0 now returns 0.
 * With a constant n == 8, compilers typically turn the loop into a single
 * load plus byte swap.
 */
forceinline word_t LOAD(const uint8_t* bytes, int n) {
  uint64_t x = 0;
  for (int i = 0; i < n; ++i) x |= (uint64_t)bytes[i] << (56 - 8 * i);
  return x;
}
/*
 * Store the n most significant bytes (0 <= n <= 8) of w to `bytes` in
 * big-endian order: bytes[0] receives the top byte of w.
 *
 * Only bytes[0..n-1] are written. The earlier form read-modify-wrote a full
 * uint64_t through a cast pointer and OR-ed the unmasked word into it, so for
 * n < 8 it accessed memory past the destination and could set bits in
 * bytes[n..7]. The first n output bytes are unchanged by this fix.
 */
forceinline void STORE(uint8_t* bytes, word_t w, int n) {
  for (int i = 0; i < n; ++i) bytes[i] = (uint8_t)(w >> (56 - 8 * i));
}
/*
 * Byte-wise load of n bytes into a zero-initialised word: bytes[i] is placed
 * at byte index 7 - i of the word's in-memory representation. On a
 * little-endian host bytes[0] therefore ends up as the most significant byte.
 * Reads exactly n bytes.
 */
forceinline word_t LOADBYTES(const uint8_t* bytes, int n) {
uint64_t x = 0;
for (int i = 0; i < n; ++i) ((uint8_t*)&x)[7 - i] = bytes[i];
return x;
/*
 * Byte-wise store of n bytes: bytes[i] receives byte index 7 - i of the
 * in-memory representation of w (on a little-endian host, the most
 * significant byte first). Writes exactly n bytes.
 */
forceinline void STOREBYTES(uint8_t* bytes, word_t w, int n) {
for (int i = 0; i < n; ++i) bytes[i] = ((uint8_t*)&w)[7 - i];
#endif /* WORD_H_ */
#include "api.h"
#include "ascon.h"
#include "permutations.h"
#include "printstate.h"
/*
 * Run len bytes from `in` through state word x0 in blocks of at most
 * ASCON_RATE bytes.
 *
 * mode is a set of flag bits: with ASCON_SQUEEZE the updated x0 is stored to
 * `out`; with ASCON_INSERT the top n bytes of x0 are replaced by the input
 * block. The permutation (6 rounds when ASCON_RATE is 8, otherwise 8) runs
 * only after blocks of exactly ASCON_RATE bytes. PAD(n % 8) is XORed into x0
 * using the size n of the last block (n stays 0 when len is 0) - presumably
 * once, after the loop.
 */
void ascon_update(state_t* s, uint8_t* out, const uint8_t* in, uint64_t len,
uint8_t mode) {
const int nr = (ASCON_RATE == 8) ? 6 : 8;
word_t tmp0;
int n = 0;
while (len) {
/* determine block size */
n = len < ASCON_RATE ? len : ASCON_RATE;
/* absorb data */
tmp0 = LOAD(in, n);
s->x0 = XOR(s->x0, tmp0);
/* extract data */
if (mode & ASCON_SQUEEZE) STORE(out, s->x0, n);
/* insert data */
if (mode & ASCON_INSERT) {
/* drop the top n bytes of x0 and put the input block in their place */
s->x0 = CLEAR(s->x0, n);
s->x0 = XOR(s->x0, tmp0);
/* compute permutation for full blocks */
if (n == ASCON_RATE) P(s, nr);
in += n;
out += n;
len -= n;
/* padding byte at position n % 8, counted from the most significant byte */
s->x0 = XOR(s->x0, PAD(n % 8));
