Commit 4063295f by Martin Schlaeffer Committed by Sebastian Renner

new ascon

parent f8a89b2c
#!/bin/bash
# Benchmark every requested implementation of one algorithm under all eight
# combinations of ASCON_INLINE_MODE / ASCON_INLINE_PERM / ASCON_UNROLL_LOOPS
# and write the code size and cycle counts to RESULTFILE as a markdown table.
#
# Usage: <script> RESULTFILE FACTOR ALGORITHM [IMPLEMENTATION...]
if [ "$#" -lt 3 ]; then
  echo "usage: $0 RESULTFILE FACTOR ALGORITHM [IMPLEMENTATION...]" >&2
  exit 2
fi
RESULTFILE=$1
shift
FACTOR=$1
shift
ALG=$1
shift
IMPL="$*"
echo "RESULTFILE: $RESULTFILE"
echo "FACTOR: $FACTOR"
echo "ALGORITHM: $ALG"
echo "IMPLEMENTATIONS: $IMPL"
echo
# -f: the result file does not exist yet on a first run.
rm -f -- "$RESULTFILE"
cmake . -DALG_LIST="$ALG" -DTEST_LIST=getcycles
# Word splitting of $IMPL is intentional: callers may pass the implementation
# names either as separate arguments or as one space-separated string.
# shellcheck disable=SC2086
for I in $IMPL; do
  for M in 0 1; do
    for P in 0 1; do
      for U in 0 1; do
        # Remove the artefacts of the previous configuration so that the
        # globs below match exactly one fresh build (-f: none on first pass).
        rm -f getcycles_* ./*.a
        cmake . -DIMPL_LIST="${I}" -DCOMPILE_DEFS="-DASCON_INLINE_MODE=${M};-DASCON_INLINE_PERM=${P};-DASCON_UNROLL_LOOPS=${U}"
        cmake --build .
        CYCLES=$(./getcycles_* "$FACTOR" | tail -n 1)
        SIZE=$(size -t ./*.a | awk '/TOTALS/ {print $1}')
        printf ' %s \t%s IM=%s IP=%s UL=%s | %s\n' \
          "$SIZE" "$CYCLES" "$M" "$P" "$U" "$I" |
          grep -v "^0" >> "$RESULTFILE"
      done
    done
  done
done
sort -n -k7 -t\| -o "$RESULTFILE" "$RESULTFILE"
# Read the sorted rows first, then rewrite the file with the table header in
# front; reading and truncating the same file in one command is fragile.
SORTED=$(cat -- "$RESULTFILE")
printf ' size \t| 1 | 8 | 16 | 32 | 64 | 1536 | long | config | implementation\n-------:|------:|------:|------:|------:|------:|------:|------:|:--------------:|:---------------\n%s\n' \
  "$SORTED" > "$RESULTFILE"
#!/bin/sh
# Run every getcycles binary built for algorithm ALG (v12) five times and
# print, per binary, a markdown table of the runs sorted on the 7th
# '|'-separated column.
#
# Usage: <script> FACTOR ALG
FACTOR=$1
ALG=$2
for i in getcycles*"${ALG}"v12*; do
  # An unmatched glob stays as the literal pattern; do not treat it as a
  # binary and print an empty table for it.
  [ -e "$i" ] || continue
  echo
  echo "$i:"
  echo
  echo "| 1 | 8 | 16 | 32 | 64 | 1536 | long |"
  echo "|------:|------:|------:|------:|------:|------:|------:|"
  # Literal list instead of seq(1), which POSIX sh does not guarantee.
  for n in 1 2 3 4 5; do
    # Pass FACTOR only when it was given, as the unquoted original did.
    "./$i" ${FACTOR:+"$FACTOR"} | tail -n 1
  done | sort -n -k7 -t\|
done 2>/dev/null
echo
#include "api.h"
#include "ascon.h"
#include "crypto_aead.h"
#include "permutations.h"
#include "printstate.h"
/*
 * Byte-shuffle control vector used with _mm512_maskz_shuffle_epi8 in the
 * mode functions below. _mm512_set_epi8 lists bytes from the highest (first
 * argument) to the lowest (last argument), so:
 *   - words 0 and 1 get their eight bytes in reversed order, i.e. a 64-bit
 *     byte swap of each of the two lowest words;
 *   - every control byte of words 2..7 is -1 (high bit set), which makes
 *     the shuffle write zero to those destination bytes.
 */
#define AVX512_SHUFFLE_U64BIG \
_mm512_set_epi8(-1, -1, -1, -1, -1, -1, -1, -1, /* word 7 */ \
-1, -1, -1, -1, -1, -1, -1, -1, /* word 6 */ \
-1, -1, -1, -1, -1, -1, -1, -1, /* word 5 */ \
-1, -1, -1, -1, -1, -1, -1, -1, /* word 4 */ \
-1, -1, -1, -1, -1, -1, -1, -1, /* word 3 */ \
-1, -1, -1, -1, -1, -1, -1, -1, /* word 2 */ \
8, 9, 10, 11, 12, 13, 14, 15, /* word 1 */ \
0, 1, 2, 3, 4, 5, 6, 7) /* word 0 */
/*
 * With ASCON_INLINE_MODE == 0 the forceinline qualifier is turned into an
 * empty token from here on, so the mode functions defined below are
 * compiled as ordinary external functions.
 */
#if !ASCON_INLINE_MODE
#undef forceinline
#define forceinline
#endif
/*
 * Load the key bytes at `k` into the key words *K0, *K1, *K2.
 *
 * All three words are first reset through KINIT. For 20-byte keys the
 * leading 4 bytes go (via KEYROT) into *K0 and the remaining 16 bytes into
 * *K1 and *K2; for other key sizes *K0 keeps its KINIT value and the 16 key
 * bytes go into *K1 and *K2.
 */
forceinline void ascon_loadkey(word_t* K0, word_t* K1, word_t* K2,
                               const uint8_t* k) {
  const uint8_t* key_tail = k; /* start of the last 16 key bytes */

  KINIT(K0, K1, K2);

  if (CRYPTO_KEYBYTES == 20) {
    *K0 = XOR(*K0, KEYROT(WORD_T(0), LOAD(key_tail, 4)));
    key_tail += 4;
  }

  *K1 = XOR(*K1, LOAD(key_tail, 8));
  *K2 = XOR(*K2, LOAD(key_tail + 8, 8));
}
/*
 * Initialise the state `s` from the 16-byte nonce `npub` and the key `k`.
 *
 * The state is reset, the variant-specific IV, the key words and the nonce
 * words are XORed in, the 12-round permutation is applied, and the key is
 * XORed into the state once more.
 */
forceinline void ascon_init(state_t* s, const uint8_t* npub, const uint8_t* k) {
  word_t nonce0, nonce1;
  word_t key0, key1, key2;

  nonce0 = LOAD(npub, 8);
  nonce1 = LOAD(npub + 8, 8);
  ascon_loadkey(&key0, &key1, &key2, k);

  /* x0: IV selected by key size and rate (plus K0 for 20-byte keys) */
  PINIT(s);
  if (CRYPTO_KEYBYTES == 16) {
    if (ASCON_RATE == 8) s->x0 = XOR(s->x0, ASCON_128_IV);
    if (ASCON_RATE == 16) s->x0 = XOR(s->x0, ASCON_128A_IV);
  }
  if (CRYPTO_KEYBYTES == 20) {
    s->x0 = XOR(s->x0, ASCON_80PQ_IV);
    s->x0 = XOR(s->x0, key0);
  }

  /* x1..x4: key followed by nonce */
  s->x1 = XOR(s->x1, key1);
  s->x2 = XOR(s->x2, key2);
  s->x3 = XOR(s->x3, nonce0);
  s->x4 = XOR(s->x4, nonce1);

  P(s, 12);

  /* second key addition after the permutation */
  if (CRYPTO_KEYBYTES == 20) {
    s->x2 = XOR(s->x2, key0);
  }
  s->x3 = XOR(s->x3, key1);
  s->x4 = XOR(s->x4, key2);

  printstate("initialization", s);
}
/*
 * Absorb the associated data `ad` (`adlen` bytes) into the state `s` and
 * apply the domain-separation XOR on x4.
 *
 * Full ASCON_RATE-byte blocks are processed with AVX-512 on a local copy of
 * the state; the trailing partial block is processed word-wise. When adlen
 * is 0 nothing is absorbed and no permutation runs; only the
 * domain-separation XOR is performed.
 */
forceinline void ascon_adata(state_t* s, const uint8_t* ad, uint64_t adlen) {
const __m512i u64big = AVX512_SHUFFLE_U64BIG;
/* one mask bit per byte of a block: 8 bits for rate 8, 16 bits for rate 16 */
const int mask = (ASCON_RATE == 8) ? 0xff : 0xffff;
/* number of permutation rounds applied after each block */
const int nr = (ASCON_RATE == 8) ? 6 : 8;
/* r: working copy of the state for the block loop; t: scratch vector */
state_t r = *s, t;
if (adlen) {
/* full associated data blocks */
while (adlen >= ASCON_RATE) {
/* load ASCON_RATE bytes into the low lanes, all other bytes are zeroed */
t.z = _mm512_maskz_loadu_epi8(mask, ad);
/* byte-swap each loaded 64-bit word (see AVX512_SHUFFLE_U64BIG) */
t.z = _mm512_maskz_shuffle_epi8(mask, t.z, u64big);
r.z = _mm512_xor_epi64(r.z, t.z);
P(&r, nr);
ad += ASCON_RATE;
adlen -= ASCON_RATE;
}
/* write the working copy back before the word-wise tail handling */
*s = r;
/* final associated data block */
/* px: state word that receives the padding and the remaining bytes */
word_t* px = &s->x0;
if (ASCON_RATE == 16 && adlen >= 8) {
s->x0 = XOR(s->x0, LOAD(ad, 8));
px = &s->x1;
ad += 8;
adlen -= 8;
}
/* NOTE(review): PAD is defined elsewhere; presumably the padding word for
 * a block holding adlen bytes - confirm in word.h */
*px = XOR(*px, PAD(adlen));
if (adlen) *px = XOR(*px, LOAD(ad, adlen));
P(s, nr);
}
/* domain separation */
s->x4 = XOR(s->x4, WORD_T(1));
printstate("process associated data", s);
}
/*
 * Encrypt `mlen` bytes of plaintext `m` into `c`, updating the state `s`.
 *
 * Full ASCON_RATE-byte blocks are processed with AVX-512 on a local copy of
 * the state, each followed by an nr-round permutation. The trailing partial
 * block (possibly empty) is processed word-wise and is NOT followed by a
 * permutation here.
 */
forceinline void ascon_encrypt(state_t* s, uint8_t* c, const uint8_t* m,
uint64_t mlen) {
const __m512i u64big = AVX512_SHUFFLE_U64BIG;
/* one mask bit per byte of a block: 8 bits for rate 8, 16 bits for rate 16 */
const int mask = (ASCON_RATE == 8) ? 0xff : 0xffff;
/* number of permutation rounds applied after each full block */
const int nr = (ASCON_RATE == 8) ? 6 : 8;
/* r: working copy of the state for the block loop; t: scratch vector */
state_t r = *s, t;
/* full plaintext blocks */
while (mlen >= ASCON_RATE) {
/* load one plaintext block (other bytes zeroed) and byte-swap its words */
t.z = _mm512_maskz_loadu_epi8(mask, m);
t.z = _mm512_maskz_shuffle_epi8(mask, t.z, u64big);
/* the rate words of r now hold the ciphertext block */
r.z = _mm512_xor_epi64(r.z, t.z);
/* swap back to memory byte order and store only ASCON_RATE bytes */
t.z = _mm512_maskz_shuffle_epi8(mask, r.z, u64big);
_mm512_mask_storeu_epi8(c, mask, t.z);
P(&r, nr);
m += ASCON_RATE;
c += ASCON_RATE;
mlen -= ASCON_RATE;
}
/* write the working copy back before the word-wise tail handling */
*s = r;
/* final plaintext block */
/* px: state word that receives the padding and the remaining bytes */
word_t* px = &s->x0;
if (ASCON_RATE == 16 && mlen >= 8) {
s->x0 = XOR(s->x0, LOAD(m, 8));
STORE(c, s->x0, 8);
px = &s->x1;
m += 8;
c += 8;
mlen -= 8;
}
/* NOTE(review): PAD is defined elsewhere; presumably the padding word for
 * a block holding mlen bytes - confirm in word.h */
*px = XOR(*px, PAD(mlen));
if (mlen) {
*px = XOR(*px, LOAD(m, mlen));
STORE(c, *px, mlen);
}
printstate("process plaintext", s);
}
/*
 * Decrypt `clen` bytes of ciphertext `c` into `m`, updating the state `s`.
 *
 * Full ASCON_RATE-byte blocks are processed with AVX-512 on a local copy of
 * the state, each followed by an nr-round permutation. The trailing partial
 * block (possibly empty) is processed word-wise and is NOT followed by a
 * permutation here. The tag is not checked in this function.
 */
forceinline void ascon_decrypt(state_t* s, uint8_t* m, const uint8_t* c,
uint64_t clen) {
const __m512i u64big = AVX512_SHUFFLE_U64BIG;
/* one mask bit per byte of a block: 8 bits for rate 8, 16 bits for rate 16 */
const int mask = (ASCON_RATE == 8) ? 0xff : 0xffff;
/* number of permutation rounds applied after each full block */
const int nr = (ASCON_RATE == 8) ? 6 : 8;
/* r: working copy of the state; t: ciphertext words; u: plaintext bytes */
state_t r = *s, t, u;
/* full ciphertext blocks */
while (clen >= ASCON_RATE) {
/* load one ciphertext block (other bytes zeroed) and byte-swap its words */
t.z = _mm512_maskz_loadu_epi8(mask, c);
t.z = _mm512_maskz_shuffle_epi8(mask, t.z, u64big);
/* state XOR ciphertext: the rate words of r now hold the plaintext */
r.z = _mm512_xor_epi64(r.z, t.z);
/* plaintext swapped back to memory byte order, ready to be stored */
u.z = _mm512_maskz_shuffle_epi8(mask, r.z, u64big);
/* bytes selected by mask are taken from t (the ciphertext), all others
 * are kept from r: the rate part of the state becomes the ciphertext */
r.z = _mm512_mask_blend_epi8(mask, r.z, t.z);
_mm512_mask_storeu_epi8(m, mask, u.z);
P(&r, nr);
m += ASCON_RATE;
c += ASCON_RATE;
clen -= ASCON_RATE;
}
/* write the working copy back before the word-wise tail handling */
*s = r;
/* final ciphertext block */
/* px: state word that receives the padding and the remaining bytes */
word_t* px = &s->x0;
if (ASCON_RATE == 16 && clen >= 8) {
word_t cx = LOAD(c, 8);
s->x0 = XOR(s->x0, cx);
STORE(m, s->x0, 8);
/* replace the state word by the ciphertext word */
s->x0 = cx;
px = &s->x1;
m += 8;
c += 8;
clen -= 8;
}
/* NOTE(review): PAD is defined elsewhere; presumably the padding word for
 * a block holding clen bytes - confirm in word.h */
*px = XOR(*px, PAD(clen));
if (clen) {
word_t cx = LOAD(c, clen);
*px = XOR(*px, cx);
STORE(m, *px, clen);
/* NOTE(review): CLEAR is defined elsewhere; presumably it zeroes the clen
 * bytes just output so the XOR below inserts the ciphertext bytes there -
 * confirm in word.h */
*px = CLEAR(*px, clen);
*px = XOR(*px, cx);
}
printstate("process ciphertext", s);
}
/*
 * Finalisation: XOR the key into the variant-specific state words, apply
 * the 12-round permutation and XOR K1/K2 into x3/x4.
 */
forceinline void ascon_final(state_t* s, const uint8_t* k) {
  word_t key0, key1, key2;

  ascon_loadkey(&key0, &key1, &key2, k);

  /* key addition before the permutation; position depends on the variant */
  if (CRYPTO_KEYBYTES == 20) {
    s->x1 = XOR(s->x1, KEYROT(key0, key1));
    s->x2 = XOR(s->x2, KEYROT(key1, key2));
    s->x3 = XOR(s->x3, KEYROT(key2, WORD_T(0)));
  } else if (CRYPTO_KEYBYTES == 16) {
    if (ASCON_RATE == 8) {
      s->x1 = XOR(s->x1, key1);
      s->x2 = XOR(s->x2, key2);
    } else if (ASCON_RATE == 16) {
      s->x2 = XOR(s->x2, key1);
      s->x3 = XOR(s->x3, key2);
    }
  }

  P(s, 12);

  /* key addition after the permutation */
  s->x3 = XOR(s->x3, key1);
  s->x4 = XOR(s->x4, key2);

  printstate("finalization", s);
}
/*
 * AEAD encryption entry point.
 *
 * Writes mlen ciphertext bytes followed by the 16-byte tag to `c` and sets
 * *clen to mlen + CRYPTO_ABYTES. `nsec` is unused. Always returns 0.
 */
int crypto_aead_encrypt(unsigned char* c, unsigned long long* clen,
                        const unsigned char* m, unsigned long long mlen,
                        const unsigned char* ad, unsigned long long adlen,
                        const unsigned char* nsec, const unsigned char* npub,
                        const unsigned char* k) {
  state_t st;
  unsigned char* const tag = c + mlen; /* tag follows the ciphertext */

  (void)nsec;
  *clen = mlen + CRYPTO_ABYTES;

  /* init -> associated data -> plaintext -> finalisation */
  ascon_init(&st, npub, k);
  ascon_adata(&st, ad, adlen);
  ascon_encrypt(&st, c, m, mlen);
  ascon_final(&st, k);

  /* the tag is taken from the last two state words */
  STOREBYTES(tag, st.x3, 8);
  STOREBYTES(tag + 8, st.x4, 8);
  return 0;
}
/*
 * AEAD decryption entry point.
 *
 * Returns -1 when `clen` is shorter than the tag. Otherwise sets *mlen to
 * clen - CRYPTO_ABYTES, writes that many plaintext bytes to `m`, and
 * returns the result of NOTZERO on the difference between the computed and
 * the received tag. `nsec` is unused.
 */
int crypto_aead_decrypt(unsigned char* m, unsigned long long* mlen,
                        unsigned char* nsec, const unsigned char* c,
                        unsigned long long clen, const unsigned char* ad,
                        unsigned long long adlen, const unsigned char* npub,
                        const unsigned char* k) {
  state_t st;
  unsigned long long ctlen; /* ciphertext length without the tag */
  const unsigned char* tag; /* received tag, stored after the ciphertext */

  (void)nsec;
  if (clen < CRYPTO_ABYTES) {
    return -1;
  }
  ctlen = clen - CRYPTO_ABYTES;
  *mlen = ctlen;
  tag = c + ctlen;

  /* init -> associated data -> ciphertext -> finalisation */
  ascon_init(&st, npub, k);
  ascon_adata(&st, ad, adlen);
  ascon_decrypt(&st, m, c, ctlen);
  ascon_final(&st, k);

  /* verify tag (should be constant time, check compiler output) */
  st.x3 = XOR(st.x3, LOADBYTES(tag, 8));
  st.x4 = XOR(st.x4, LOADBYTES(tag + 8, 8));
  return NOTZERO(st.x3, st.x4);
}
/*
 * Parameter set of this build. With CRYPTO_KEYBYTES == 16 and
 * ASCON_RATE == 16, ascon_init selects ASCON_128A_IV.
 */
#define CRYPTO_VERSION "1.2.4"
/* key length in bytes (16 or 20 are the values the mode code tests for) */
#define CRYPTO_KEYBYTES 16
/* the nsec argument of the AEAD functions is unused */
#define CRYPTO_NSECBYTES 0
/* nonce length in bytes; ascon_init loads two 8-byte words from npub */
#define CRYPTO_NPUBBYTES 16
/* tag length in bytes; the ciphertext is CRYPTO_ABYTES longer than the
 * message */
#define CRYPTO_ABYTES 16
/* NOTE(review): not referenced in the visible code; presumably tells the
 * test harness that input and output buffers must not overlap - confirm */
#define CRYPTO_NOOVERLAP 1
/* bytes processed per block by the mode functions (8 or 16) */
#define ASCON_RATE 16
#ifndef ASCON_H_
#define ASCON_H_
#include <immintrin.h>
#include <stdint.h>
#include "word.h"
/*
 * Permutation state with two views of the same storage:
 *   z        - the whole state as one 512-bit vector (used by the AVX-512
 *              code paths);
 *   x0..x7   - the individual words. The mode functions and printstate only
 *              address x0..x4.
 */
typedef union {
__m512i z;
struct {
word_t x0, x1, x2, x3, x4, x5, x6, x7;
};
} state_t;
/* Initialise the state from nonce `npub` and key `k`. */
void ascon_init(state_t* s, const uint8_t* npub, const uint8_t* k);
/* Absorb `adlen` bytes of associated data and apply domain separation. */
void ascon_adata(state_t* s, const uint8_t* ad, uint64_t adlen);
/* Encrypt `mlen` bytes from `m` into `c`. */
void ascon_encrypt(state_t* s, uint8_t* c, const uint8_t* m, uint64_t mlen);
/* Decrypt `clen` bytes from `c` into `m` (no tag check). */
void ascon_decrypt(state_t* s, uint8_t* m, const uint8_t* c, uint64_t clen);
/* Finalise the state with key `k`; the tag is then read from x3 and x4. */
void ascon_final(state_t* s, const uint8_t* k);
#endif /* ASCON_H_ */
#ifndef CONFIG_H_
#define CONFIG_H_

/*
 * Build-time switches. Each one defaults to 1 and can be overridden by
 * defining it before this header is seen (e.g. with -D on the command line).
 */

/* 1: force-inline the functions of the ascon mode */
#if !defined(ASCON_INLINE_MODE)
#define ASCON_INLINE_MODE 1
#endif

/* 1: force-inline every permutation call */
#if !defined(ASCON_INLINE_PERM)
#define ASCON_INLINE_PERM 1
#endif

/* 1: use the unrolled permutation rounds instead of the round loop */
#if !defined(ASCON_UNROLL_LOOPS)
#define ASCON_UNROLL_LOOPS 1
#endif

#endif /* CONFIG_H_ */
#ifndef ENDIAN_H_
#define ENDIAN_H_

/*
 * U64BIG / U32BIG / U16BIG convert between host byte order and big endian:
 * the identity on big endian hosts, a byte swap on little endian hosts.
 */

#if defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__

/* big endian host: nothing to do */
#ifdef PRAGMA_ENDIAN
#pragma message("Using macros for big endian machines")
#endif
#define U64BIG(x) (x)
#define U32BIG(x) (x)
#define U16BIG(x) (x)

#elif defined(_MSC_VER) || \
    (defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__)

/* little endian host: reverse the byte order */
#ifdef PRAGMA_ENDIAN
#pragma message("Using macros for little endian machines")
#endif
#define U64BIG(x)                           \
  ((((x) & 0xFF00000000000000ULL) >> 56) |  \
   (((x) & 0x00FF000000000000ULL) >> 40) |  \
   (((x) & 0x0000FF0000000000ULL) >> 24) |  \
   (((x) & 0x000000FF00000000ULL) >> 8) |   \
   (((x) & 0x00000000FF000000ULL) << 8) |   \
   (((x) & 0x0000000000FF0000ULL) << 24) |  \
   (((x) & 0x000000000000FF00ULL) << 40) |  \
   (((x) & 0x00000000000000FFULL) << 56))
#define U32BIG(x)                                          \
  ((((x) & 0xFF000000) >> 24) | (((x) & 0x00FF0000) >> 8) | \
   (((x) & 0x0000FF00) << 8) | (((x) & 0x000000FF) << 24))
#define U16BIG(x) ((((x) & 0xFF00) >> 8) | (((x) & 0x00FF) << 8))

#else
#error "Ascon byte order macros not defined in endian.h"
#endif

#endif /* ENDIAN_H_ */
#ifndef FORCEINLINE_H_
#define FORCEINLINE_H_

/*
 * forceinline: the strongest "always inline" qualifier the compiler offers,
 * falling back to plain `inline` when none is known.
 *
 * Clang is detected through __clang__, the macro Clang actually predefines
 * (there is no __CLANG__). Clang normally also defines __GNUC__ and is then
 * handled by the branch above; the dedicated branch covers builds where GNU
 * compatibility is turned off.
 */
#ifdef _MSC_VER
#define forceinline __forceinline
#elif defined(__GNUC__)
#define forceinline inline __attribute__((__always_inline__))
#elif defined(__clang__)
#if __has_attribute(__always_inline__)
#define forceinline inline __attribute__((__always_inline__))
#else
#define forceinline inline
#endif
#else
#define forceinline inline
#endif

#endif /* FORCEINLINE_H_ */
Branches reviewed 2020-11-13 by Martin Schläffer.
Addresses reviewed 2020-11-13 by Martin Schläffer.
#include "permutations.h"
/*
 * Out-of-line permutation entry points; only compiled when permutations are
 * not inlined (ASCON_INLINE_PERM == 0).
 */
#if !ASCON_INLINE_PERM
#if ASCON_UNROLL_LOOPS

/* unrolled variants: one function per round count */
void P12(state_t* s) {
  P12ROUNDS(s);
}

void P8(state_t* s) {
  P8ROUNDS(s);
}

void P6(state_t* s) {
  P6ROUNDS(s);
}

#else

/* looped variant: a single function taking the round count */
void P(state_t* s, int nr) {
  PROUNDS(s, nr);
}

#endif
#endif
#ifndef PERMUTATIONS_H_
#define PERMUTATIONS_H_
#include <stdint.h>
#include "api.h"
#include "ascon.h"
#include "config.h"
#include "printstate.h"
#include "round.h"
/* key sizes in bytes per variant */
#define ASCON_128_KEYBYTES 16
#define ASCON_128A_KEYBYTES 16
#define ASCON_80PQ_KEYBYTES 20
/* block sizes (rates) in bytes per variant */
#define ASCON_128_RATE 8
#define ASCON_128A_RATE 16
/* round counts per variant */
#define ASCON_128_PA_ROUNDS 12
#define ASCON_128_PB_ROUNDS 6
#define ASCON_128A_PB_ROUNDS 8
#define ASCON_HASH_BYTES 32
/* initialisation values; ascon_init XORs the matching AEAD IV into x0 */
#define ASCON_128_IV WORD_T(0x80400c0600000000)
#define ASCON_128A_IV WORD_T(0x80800c0800000000)
#define ASCON_80PQ_IV WORD_T(0xa0400c0600000000)
#define ASCON_HASH_IV WORD_T(0x00400c0000000100)
#define ASCON_XOF_IV WORD_T(0x00400c0000000000)
/* NOTE(review): the *_IV0..*_IV4 values are not used in the visible code;
 * presumably precomputed initial states for hash and XOF - confirm */
#define ASCON_HASH_IV0 WORD_T(0xee9398aadb67f03dull)
#define ASCON_HASH_IV1 WORD_T(0x8bb21831c60f1002ull)
#define ASCON_HASH_IV2 WORD_T(0xb48a92db98d5da62ull)
#define ASCON_HASH_IV3 WORD_T(0x43189921b8f8e3e8ull)
#define ASCON_HASH_IV4 WORD_T(0x348fa5c9d525e140ull)
#define ASCON_XOF_IV0 WORD_T(0xb57e273b814cd416ull)
#define ASCON_XOF_IV1 WORD_T(0x2b51042562ae2420ull)
#define ASCON_XOF_IV2 WORD_T(0x66a3a7768ddf2218ull)
#define ASCON_XOF_IV3 WORD_T(0x5aad0a7a8153650cull)
#define ASCON_XOF_IV4 WORD_T(0x4f3e0e32539493b6ull)
/* first round constant of an n-round permutation:
 * START(12) == 0xf0, START(8) == 0xb4, START(6) == 0x96 */
#define START(n) ((3 + (n)) << 4 | (12 - (n)))
/* wrap an integer round constant into a word_t */
#define RC(c) WORD_T(c)
/*
 * Twelve permutation rounds on `s`, fully unrolled. The round constants run
 * from 0xf0 down to 0x4b in steps of 0x0f.
 */
forceinline void P12ROUNDS(state_t* s) {
ROUND(s, RC(0xf0));
ROUND(s, RC(0xe1));
ROUND(s, RC(0xd2));
ROUND(s, RC(0xc3));
ROUND(s, RC(0xb4));
ROUND(s, RC(0xa5));
ROUND(s, RC(0x96));
ROUND(s, RC(0x87));
ROUND(s, RC(0x78));
ROUND(s, RC(0x69));
ROUND(s, RC(0x5a));
ROUND(s, RC(0x4b));
}
/*
 * Eight permutation rounds on `s`, fully unrolled: the same constants as
 * the last eight rounds of P12ROUNDS (0xb4 down to 0x4b).
 */
forceinline void P8ROUNDS(state_t* s) {
ROUND(s, RC(0xb4));
ROUND(s, RC(0xa5));
ROUND(s, RC(0x96));
ROUND(s, RC(0x87));
ROUND(s, RC(0x78));
ROUND(s, RC(0x69));
ROUND(s, RC(0x5a));
ROUND(s, RC(0x4b));
}
/*
 * Six permutation rounds on `s`, fully unrolled: the same constants as the
 * last six rounds of P12ROUNDS (0x96 down to 0x4b).
 */
forceinline void P6ROUNDS(state_t* s) {
ROUND(s, RC(0x96));
ROUND(s, RC(0x87));
ROUND(s, RC(0x78));
ROUND(s, RC(0x69));
ROUND(s, RC(0x5a));
ROUND(s, RC(0x4b));
}
/*
 * Looped permutation: `nr` rounds on `s`. The round constant starts at
 * START(nr) and decreases by 0x0f per round; the last constant used is 0x4b.
 */
forceinline void PROUNDS(state_t* s, int nr) {
  int round_const = START(nr);
  while (round_const > 0x4a) {
    ROUND(s, RC(round_const));
    round_const -= 0x0f;
  }
}
/*
 * P(s, nr): apply the nr-round permutation to `s`.
 *
 * The implementation depends on two configuration switches:
 *   ASCON_INLINE_PERM   - inline the rounds here or call an external function
 *   ASCON_UNROLL_LOOPS  - unrolled rounds (nr must be 12, 8 or 6; any other
 *                         value does nothing) or the round loop
 */
#if ASCON_INLINE_PERM
#if ASCON_UNROLL_LOOPS

forceinline void P(state_t* s, int nr) {
  switch (nr) {
    case 12:
      P12ROUNDS(s);
      break;
    case 8:
      P8ROUNDS(s);
      break;
    case 6:
      P6ROUNDS(s);
      break;
  }
}

#else /* ASCON_INLINE_PERM && !ASCON_UNROLL_LOOPS */

forceinline void P(state_t* s, int nr) {
  PROUNDS(s, nr);
}

#endif
#else
#if ASCON_UNROLL_LOOPS

/* defined out of line in permutations.c */
void P12(state_t* s);
void P8(state_t* s);
void P6(state_t* s);

forceinline void P(state_t* s, int nr) {
  switch (nr) {
    case 12:
      P12(s);
      break;
    case 8:
      P8(s);
      break;
    case 6:
      P6(s);
      break;
  }
}

#else /* !ASCON_INLINE_PERM && !ASCON_UNROLL_LOOPS */

/* defined out of line in permutations.c */
void P(state_t* s, int nr);

#endif
#endif
#endif /* PERMUTATIONS_H_ */
#ifdef ASCON_PRINTSTATE
#include "printstate.h"
#include <inttypes.h>
#include <stdio.h>
/* Print one state word as "<text>=<16 hex digits>" followed by a newline. */
void printword(const char* text, const word_t x) {
  const uint64_t value = WORDTOU64(x);
  printf("%s=%016" PRIx64 "\n", text, value);
}
void printstate(const char* text, const state_t* s) {
printf("%s:\n", text);
printword(" x0", s->x0);
printword(" x1", s->x1);
printword(" x2", s->x2);
printword(" x3", s->x3);
printword(" x4", s->x4);
}
#endif
#ifndef PRINTSTATE_H_
#define PRINTSTATE_H_
/*
 * Debug output of state words. With ASCON_PRINTSTATE defined, printword and
 * printstate are real functions; otherwise both are macros that expand to
 * an empty statement, so calls to them cost nothing and their arguments are
 * not evaluated.
 */
#ifdef ASCON_PRINTSTATE
#include "ascon.h"
#include "word.h"
/* Print one word, labelled with `text`. */
void printword(const char* text, const word_t x);
/* Print the heading `text` and the state words x0..x4. */
void printstate(const char* text, const state_t* s);
#else
#define printword(text, w) \
do { \
} while (0)
#define printstate(text, s) \
do { \
} while (0)
#endif
#endif /* PRINTSTATE_H_ */
#ifndef ROUND_H_
#define ROUND_H_
#include "ascon.h"
#include "printstate.h"
/* Reset the three key words to WORD_T(0). */
forceinline void KINIT(word_t* K0, word_t* K1, word_t* K2) {
  *K0 = *K1 = *K2 = WORD_T(0);
}
/*
 * Reset every word of the state to WORD_T(0).
 *
 * Only x0..x4 carry the permutation state, but ROUND and the AVX-512 mode
 * code read the whole 512-bit vector s->z. Callers pass a state that lives
 * uninitialised on the stack, so x5..x7 are cleared as well to avoid reading
 * indeterminate memory.
 */
forceinline void PINIT(state_t* s) {
  s->x0 = WORD_T(0);
  s->x1 = WORD_T(0);
  s->x2 = WORD_T(0);
  s->x3 = WORD_T(0);
  s->x4 = WORD_T(0);
  /* unused upper words of the 512-bit vector */
  s->x5 = WORD_T(0);
  s->x6 = WORD_T(0);
  s->x7 = WORD_T(0);
}
forceinline void ROUND(state_t* s, word_t C) {
uint64_t x = 0;
__mmask8 mxor1 = 0x15;
__mmask8 mxor2 = 0x0b;
__m512i pxor1 = _mm512_set_epi64(x, x, x, 3, x, 1, x, 4);
__m512i pxor2 = _mm512_set_epi64(x, x, x, x, 2, x, 0, 4);
__m512i c = _mm512_set_epi64(x, x, x, 0, 0, C, 0, 0);
__m512i n = _mm512_set_epi64(x, x, x, 0, 0, ~0ull, 0, 0);
__m512i pchi1 = _mm512_set_epi64(x, x, x, 0, 4, 3, 2, 1);
__m512i pchi2 = _mm512_set_epi64(x, x, x, 1, 0, 4, 3, 2);
__m512i rot1 = _mm512_set_epi64(x, x, x, 7, 10, 1, 61, 19);
__m512i rot2 = _mm512_set_epi64(x, x, x, 41, 17, 6, 39, 28);
__m512i t0, t1, t2;
/* round constant + s-box layer */
t0 = _mm512_maskz_permutexvar_epi64(mxor1, pxor1, s->z);
t0 = _mm512_ternarylogic_epi64(s->z, t0, c, 0x96);
/* keccak s-box start */
t1 = _mm512_permutexvar_epi64(pchi1, t0);
t2 = _mm512_permutexvar_epi64(pchi2, t0);
t0 = _mm512_ternarylogic_epi64(t0, t1, t2, 0xd2);
/* keccak s-box end */
t1 = _mm512_maskz_permutexvar_epi64(mxor2, pxor2, t0);
t0 = _mm512_ternarylogic_epi64(t0, t1, n, 0x96);
/* linear layer */
t1 = _mm512_rorv_epi64(t0, rot1);