#CC=gcc -O2 -fomit-frame-pointer -funroll-all-loops -march=native -mtune=native -msse3 -mmmx -mavx -mavx2
#CC=gcc -Wall -O2 -fomit-frame-pointer -funroll-all-loops -march=native -mtune=native -msse2 -mmmx -mavx -mavx2
CC=gcc -Wall -O3 -fomit-frame-pointer -funroll-all-loops -march=native -mtune=native -mavx -mavx2
#CC=gcc -O1 -fomit-frame-pointer -march=haswell -mtune=native -mavx
#ivybridgei, skylake, sandybridge, haswell
# Default goal: build the aceavx benchmark binary. The prerequisite must name
# a target that has a rule; the only link rule in this Makefile is "aceavx".
all: aceavx
aceavx: speed.c ace.c
$(CC) -o $@ $^
.PHONY: clean
rm -fr aceavx
/* Reference implementation of ACE-128, AEAD
Written by:
Kalikinkar Mandal <>
#ifndef ACE_H
#define ACE_H
//#include <immintrin.h>
#define STATEBYTES 40
#define STATEDWORD 10 // 320/32 = 10 32-bit words per state (STATEBYTES*8 = 320 bits)
#define NUMSTEPS 16
#define PARAL_INST_BY8 1
static const unsigned char SC0[16]={0x50,0x5c,0x91,0x8d,0x53,0x60,0x68,0xe1,0xf6,0x9d,0x40,0x4f,0xbe,0x5b,0xe9,0x7f}; //Step constants (SC_{2i})
static const unsigned char SC1[16]={0x28,0xae,0x48,0xc6,0xa9,0x30,0x34,0x70,0x7b,0xce,0x20,0x27,0x5f,0xad,0x74,0x3f}; //Step constants (SC_{2i+1})
static const unsigned char SC2[16]={0x14,0x57,0x24,0x63,0x54,0x18,0x9a,0x38,0xbd,0x67,0x10,0x13,0x2f,0xd6,0xba,0x1f}; //Step constants (SC_{2i+2})
static const unsigned char RC0[16]={0x07,0x0a,0x9b,0xe0,0xd1,0x1a,0x22,0xf7,0x62,0x96,0x71,0xaa,0x2b,0xe9,0xcf,0xb7};//Round constants (RC_{2i})
static const unsigned char RC1[16]={0x53,0x5d,0x49,0x7f,0xbe,0x1d,0x28,0x6c,0x82,0x47,0x6b,0x88,0xdc,0x8b,0x59,0xc6};//Round constants (RC_{2i+1})
static const unsigned char RC2[16]={0x43,0xe4,0x5e,0xcc,0x32,0x4e,0x75,0x25,0xfd,0xf9,0x76,0xa0,0xb0,0x09,0x1e,0xad};//Round constants (RC_{2i+2})
// Integer aliases used throughout this implementation.
typedef unsigned long long int u64;
typedef unsigned int u32;
// NOTE(review): despite its name, u8 is an alias for unsigned int (the same type as u32), not an 8-bit type. Confirm this is intentional before treating u8* buffers as byte arrays or relying on sizeof(u8) == 1.
typedef unsigned int u8;
#define ROT5(x) (_mm256_slli_epi32(x, 5) | _mm256_srli_epi32(x, 27))
#define ROT1(x) (_mm256_slli_epi32(x, 1) | _mm256_srli_epi32(x, 31))
#define SWAPREG1(x) (_mm256_permutevar8x32_epi32(x, _mm256_set_epi32(7, 5, 3, 1, 6, 4, 2, 0)))
#define RC(t1, t2) (_mm256_set_epi32(0xfffffffe^t2, 0xfffffffe^t1, 0xfffffffe^t2, 0xfffffffe^t1, 0xfffffffe^t2, 0xfffffffe^t1, 0xfffffffe^t2,0xfffffffe^t1))
#define SC(t1, t2) (_mm256_set_epi32(0xffffff00^t2, 0xffffffff, 0xffffff00^t1, 0xffffffff, 0xffffff00^t2, 0xffffffff, 0xffffff00^t1, 0xffffffff))
#define SWAPBLK(x) (_mm256_permute4x64_epi64(x, _MM_SHUFFLE(2,3,0,1)))
#define SWAPREG2(x) (_mm256_permutevar8x32_epi32(x, _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0)))
// NOTE(review): the expansion permutes `xtmp`, not the macro parameter `x`, so the argument is ignored and a variable named xtmp must be in scope wherever SWAPAC is used. Confirm against the callers whether `x` was intended.
#define SWAPAC(x) (_mm256_permutevar8x32_epi32(xtmp, _mm256_set_epi32(5, 4, 7, 6, 1, 0, 3, 2)))
#define masklo (_mm256_set_epi32(0x0, 0x0, 0x0, 0x0, 0xffffffff, 0xffffffff, 0xffffffff,0xffffffff))
#define maskhi (_mm256_set_epi32(0xffffffff, 0xffffffff, 0xffffffff,0xffffffff,0x0, 0x0, 0x0, 0x0))
#define _mm256_set_m128i(v0, v1) _mm256_insertf128_si256(_mm256_castsi128_si256(v1), (v0), 1)
#define ROAX(x, y, t1, t2)\
__m256i x2tmp;\
x2tmp = x;\
x = (ROT5(x)&x)^ROT1(x)^RC(t1, t2)^y;\
y = x2tmp;\
#define PACK_SSb(x, y)\
__m256i xtmp, ytmp;\
xtmp = SWAPREG1(x);\
ytmp = SWAPREG1(y);\
x = _mm256_permute2x128_si256(xtmp,ytmp,0x20);\
y = _mm256_permute2x128_si256(xtmp,ytmp, 0x31);\
#define UNPACK_SSb(x, y)\
__m256i xtmp, ytmp;\
xtmp = _mm256_unpacklo_epi32(x, y);\
ytmp = _mm256_unpackhi_epi32(x, y);\
x = _mm256_permute2x128_si256(xtmp, ytmp,0x20);\
y = _mm256_permute2x128_si256(xtmp, ytmp, 0x31);\
#define PACK(x, y, z, w)\
__m256i x2tmp, x3tmp;\
x2tmp = SWAPREG2(x);\
x3tmp = SWAPREG2(z);\
x = _mm256_permute2x128_si256(x2tmp,x3tmp,0x20);\
z = _mm256_permute2x128_si256(x2tmp,x3tmp, 0x31);\
x2tmp = SWAPREG2(y);\
x3tmp = SWAPREG2(w);\
y = _mm256_permute2x128_si256(x2tmp,x3tmp,0x20);\
w = _mm256_permute2x128_si256(x2tmp,x3tmp, 0x31);\
#define UNPACK(x,y,z,w)\
__m256i x2tmp, x3tmp;\
x2tmp = _mm256_unpacklo_epi64(x, z);\
x3tmp = _mm256_unpackhi_epi64(x, z);\
x = _mm256_permute2x128_si256(x2tmp,x3tmp,0x20);\
z = _mm256_permute2x128_si256(x2tmp,x3tmp, 0x31);\
x2tmp = _mm256_unpacklo_epi64(y, w);\
x3tmp = _mm256_unpackhi_epi64(y, w);\
y = _mm256_permute2x128_si256(x2tmp,x3tmp,0x20);\
w = _mm256_permute2x128_si256(x2tmp,x3tmp, 0x31);\
void ace320( u32 *state );
int crypto_aead_encrypt( u32 *tag, u32 tlen, u32 *c, u32 *m, u32 mlen, u32 *ad, u32 adlen, u8 *k, u8 *npub, u32 klen );
int crypto_aead_decrypt( u32 *m, u32 *c, u32 mlen, u32 *tag, u32 tlen, u32 *ad, u32 adlen, u8 *k, u8 *npub, u32 klen );
typedef unsigned long long int u64;

/*
 * Read the time-stamp counter at the start of a measured region.
 *
 * CPUID is executed first as a serializing instruction, then RDTSC loads
 * the counter into EDX:EAX. The two halves are copied into `high` and
 * `low` and combined into one 64-bit value.
 *
 * Returns: (high << 32) | low.
 * Requires an x86-64 target and a compiler with GCC-style inline assembly.
 */
u64 start_rdtsc( )
{
    unsigned high, low;

    /* Without the RDTSC instruction the copies below would return whatever
       CPUID left in EDX/EAX instead of the cycle counter. */
    __asm__ volatile("CPUID\n\t"
                     "RDTSC\n\t"
                     "mov %%edx, %0\n\t"
                     "mov %%eax, %1\n\t": "=r" (high),
                     "=r" (low):: "%rax", "%rbx", "%rcx", "%rdx");

    return ( ((u64)low) | (((u64)high) << 32));
}
/*
 * Read the time-stamp counter at the end of a measured region.
 * RDTSCP is executed first, EDX is copied to `high` and EAX to `low`,
 * and a CPUID is executed afterwards. The return value is
 * (high << 32) | low. RAX, RBX, RCX and RDX are declared as clobbered.
 */
u64 end_rdtsc( )
unsigned high, low;
__asm__ volatile("RDTSCP\n\t"
"mov %%edx, %0\n\t"
"mov %%eax,%1\n\t"
"CPUID\n\t": "=r" (high), "=r" (low)::
"%rax", "%rbx", "%rcx", "%rdx");
return ( ((u64)low) | (((u64)high) << 32));
/* Reference implementation of ACE-128, AEAD
Written by:
Kalikinkar Mandal <>
#include "ace.h"
#include "clock_cycle.h"
#define NUM_ITER 2000
#define NUM_TEST 500
// Prints 8*PARAL_INST_BY8 consecutive states from `state`.
// State j occupies words state[j*STATEDWORD] .. state[j*STATEDWORD + STATEDWORD-1];
// each word is written as 8 uppercase hexadecimal digits.
void print_state ( u32 *state )
u8 i, j;
for ( j = 0; j < 8*PARAL_INST_BY8; j++ )
for ( i = 0; i < STATEDWORD; i++ )
printf("%.8X", state[i+j*STATEDWORD]);
// Benchmark / self-test driver for the 8-way parallel build.
// Allocates buffers for num_parallel_inst instances, times ace320() and
// crypto_aead_encrypt() with start_rdtsc()/end_rdtsc(), then prints the
// plaintext, ciphertext and tag around an encrypt/decrypt round trip.
int main()
u8 num_parallel_inst;
u32 *state;
int i, j;
// NOTE(review): t is declared but not referenced anywhere in the visible body.
u64 t[NUM_ITER+1], count_cc;
u32 *plaintext, *ciphertext, *tag, *key, *nonce, *ad;
u32 plen, tlen, klen, adlen;
u8 *k, *pubn;
num_parallel_inst = 8*PARAL_INST_BY8;
// All lengths below are counted in 32-bit words.
adlen = 4; // Associated data length = adlen*32;
plen = 32; // Message length = plen*32;
tlen = 4; // 128 = 32*4 bits
klen = 4; // 128 = 32*4 bits
// One buffer per quantity, each sized for num_parallel_inst instances.
// NOTE(review): the malloc results are used without a NULL check.
key = (u32 *)malloc(sizeof(u32)*klen*num_parallel_inst);
nonce = (u32 *)malloc(sizeof(u32)*klen*num_parallel_inst);
tag = (u32 *)malloc(sizeof(u32)*tlen*num_parallel_inst);
ad = (u32 *)malloc(sizeof(u32)*adlen*num_parallel_inst);
plaintext = (u32 *)malloc(sizeof(u32)*plen*num_parallel_inst);
ciphertext = (u32 *)malloc(sizeof(u32)*plen*num_parallel_inst);
k = (u8 *)malloc(sizeof(u8)*16*num_parallel_inst);
pubn = (u8 *)malloc(sizeof(u8)*16*num_parallel_inst);
state = (u32 *)malloc(sizeof(u32)*num_parallel_inst*STATEDWORD);
//Initializes every state word to its index within its state (i % STATEDWORD)
for ( i = 0; i < num_parallel_inst*STATEDWORD; i++ )
*(state+i) = i%STATEDWORD;
//*(state+i) = 0x01;
ace320( state );
//ace320( state );
//Assigning 128-bit keys, nonces and messages//
for ( i = 0; i < num_parallel_inst*klen; i++ )
*(nonce+i) = 0x40404040;
*(key+i) = 0x10101010;
// Each instance gets the same message: word j holds j % plen.
for ( i = 0; i < num_parallel_inst; i++ )
for ( j = 0; j < plen; j++ )
*(plaintext+plen*i+j) = j%plen;
for ( i = 0; i < num_parallel_inst*adlen; i++ )
*(ad+i) = 0xffffffff;
// k and pubn (the buffers actually passed to the AEAD calls) are zero-filled.
for ( i = 0; i < num_parallel_inst*16; i++ )
k[i] = 0x0;
*(pubn+i) = 0;
// Permutation timing: NUM_TEST calls of ace320() between the two counter reads.
for ( i = 0; i < NUM_ITER+1; i++ )
count_cc = start_rdtsc();
for ( j = 0; j < NUM_TEST; j++ )
ace320( state );
count_cc = end_rdtsc()-count_cc;
printf("Cycles per byte = %f\n", (double)(count_cc)/(double)(num_parallel_inst*STATEBYTES*NUM_TEST));
// AEAD: Encryption and Decryption Module//
// NOTE(review): the values printed here come from nonce[]/key[], which are not the k/pubn buffers passed to crypto_aead_encrypt/decrypt below - confirm which pair is meant to be reported.
printf("Nonce and Key:\n");
for ( i = 0; i < num_parallel_inst; i++ )
printf( "%08X%08X%08X%08X", nonce[4*i+0], nonce[4*i+1], nonce[4*i+2], nonce[4*i+3]);
printf( "%08X%08X%08X%08X\n", key[4*i+0], key[4*i+1], key[4*i+2], key[4*i+3]);
// Encryption timing over NUM_ITER iterations; divisor is the message size in bytes (plen words * 4) across all instances.
for ( i = 0; i < NUM_ITER; i++ )
count_cc = start_rdtsc();
crypto_aead_encrypt( tag, tlen, ciphertext, plaintext, plen, ad, adlen, k, pubn, 16 );
count_cc = end_rdtsc()-count_cc;
printf("Encryption speed = %f cpb\n", (double)(count_cc)/(double)(num_parallel_inst*plen*4));
//Checking Correctness of Authenticated encryption and Decryption
crypto_aead_encrypt( tag, tlen, ciphertext, plaintext, plen, ad, adlen, k, pubn, 16 );
printf("Original plaintext:\n");
for ( i = 0; i < num_parallel_inst; i++ )
for ( j = 0; j < plen; j++ )
printf("%08X", plaintext[i*plen+j]);
for ( i = 0; i < num_parallel_inst; i++ )
for ( j = 0; j < plen; j++ )
printf("%08X", ciphertext[i*plen+j]);
for ( i = 0; i < num_parallel_inst; i++ )
for ( j = 0; j < tlen; j++ )
printf("%08X", tag[i*tlen+j]);
// Decrypt back into the plaintext buffer and print it again for comparison.
crypto_aead_decrypt( plaintext, ciphertext, plen, tag, tlen, ad, adlen, k, pubn, 16 );
printf("Decrypted plaintext:\n");
for ( i = 0; i < num_parallel_inst; i++ )
for ( j = 0; j < plen; j++ )
printf("%08X", plaintext[i*plen+j]);
for ( i = 0; i < num_parallel_inst; i++ )
for ( j = 0; j < tlen; j++ )
printf("%08X", tag[i*tlen+j]);
#CC=gcc -O2 -fomit-frame-pointer -funroll-all-loops -march=native -mtune=native -msse3 -mmmx -mavx -mavx2
#CC=gcc -Wall -O2 -fomit-frame-pointer -funroll-all-loops -march=native -mtune=native -msse2 -mmmx -mavx -mavx2
CC=gcc -Wall -O3 -fomit-frame-pointer -funroll-all-loops -march=native -mtune=native -msse2
#CC=gcc -O1 -fomit-frame-pointer -march=haswell -mtune=native -mavx
#ivybridgei, skylake, sandybridge, haswell
# Default goal: build the acesse2 benchmark binary. The prerequisite must name
# a target that has a rule; the only link rule in this Makefile is "acesse2".
all: acesse2
acesse2: speed.c ace.c
$(CC) -o $@ $^
.PHONY: clean
rm -fr acesse2
/* Reference implementation of ACE-128, AEAD
Written by:
Kalikinkar Mandal <>
#ifndef ACE_H
#define ACE_H
#define STATEBYTES 40
#define STATEDWORD 10 // 320/32 = 10 32-bit words per state (STATEBYTES*8 = 320 bits)
//#define NUMSTEPS 16
#define NUMSTEPS 16
#define PARAL_INST_BY4 1
static const unsigned char SC0[16]={0x50,0x5c,0x91,0x8d,0x53,0x60,0x68,0xe1,0xf6,0x9d,0x40,0x4f,0xbe,0x5b,0xe9,0x7f}; //Step constants (SC_{2i})
static const unsigned char SC1[16]={0x28,0xae,0x48,0xc6,0xa9,0x30,0x34,0x70,0x7b,0xce,0x20,0x27,0x5f,0xad,0x74,0x3f}; //Step constants (SC_{2i+1})
static const unsigned char SC2[16]={0x14,0x57,0x24,0x63,0x54,0x18,0x9a,0x38,0xbd,0x67,0x10,0x13,0x2f,0xd6,0xba,0x1f}; //Step constants (SC_{2i+2})
static const unsigned char RC0[16]={0x07,0x0a,0x9b,0xe0,0xd1,0x1a,0x22,0xf7,0x62,0x96,0x71,0xaa,0x2b,0xe9,0xcf,0xb7};//Round constants (RC_{2i})
static const unsigned char RC1[16]={0x53,0x5d,0x49,0x7f,0xbe,0x1d,0x28,0x6c,0x82,0x47,0x6b,0x88,0xdc,0x8b,0x59,0xc6};//Round constants (RC_{2i+1})
static const unsigned char RC2[16]={0x43,0xe4,0x5e,0xcc,0x32,0x4e,0x75,0x25,0xfd,0xf9,0x76,0xa0,0xb0,0x09,0x1e,0xad};//Round constants (RC_{2i+2})
// Integer aliases used throughout this implementation.
typedef unsigned long long int u64;
typedef unsigned int u32;
// NOTE(review): despite its name, u8 is an alias for unsigned int (the same type as u32), not an 8-bit type. Confirm this is intentional before treating u8* buffers as byte arrays or relying on sizeof(u8) == 1.
typedef unsigned int u8;
#define ROT5(x) (_mm_slli_epi32(x, 5) | _mm_srli_epi32(x, 27))
#define ROT1(x) (_mm_slli_epi32(x, 1) | _mm_srli_epi32(x, 31))
#define RC(t1, t2) (_mm_set_epi32(0xfffffffe^t2, 0xfffffffe^t1, 0xfffffffe^t2, 0xfffffffe^t1))
#define SC(t1, t2) (_mm_set_epi32(0xffffff00^t2, 0xffffffff, 0xffffff00^t1, 0xffffffff ))
#define SWAPREG1(x) (_mm_shuffle_epi32(x, _MM_SHUFFLE(3, 1, 2, 0)))
#define SWAPBLK(x) (_mm_slli_si128(x, 8)|_mm_srli_si128(x, 8))
#define masklo (_mm_set_epi32(0x0, 0x0, 0xffffffff, 0xffffffff ))
#define maskhi (_mm_set_epi32(0xffffffff, 0xffffffff, 0x0, 0x0 ))
#define ROAX(x, y, t1, t2)\
__m128i xtmp;\
xtmp = x;\
x = (ROT5(x)&x)^ROT1(x)^RC(t1, t2)^y;\
y = xtmp;\
#define PACK_SSb(x, y)\
__m128i xtmp, ytmp;\
xtmp = SWAPREG1(x);\
ytmp = SWAPREG1(y);\
x = _mm_unpacklo_epi64(xtmp, ytmp);\
y = _mm_unpackhi_epi64(xtmp, ytmp);\
#define UNPACK_SSb(x, y)\
__m128i xtmp, ytmp;\
xtmp = _mm_unpacklo_epi32(x, y);\
ytmp = _mm_unpackhi_epi32(x, y);\
x = xtmp;\
y = ytmp;\
#define PACK(x, y, z, w, state, i1, i2, i3, i4)\
__m128i xtmp, ytmp;\
xtmp = _mm_loadu_si128((void *) (state + i1));\
ytmp = _mm_loadu_si128((void *) (state + i2));\
x = _mm_unpacklo_epi64(xtmp, ytmp);\
z = _mm_unpackhi_epi64(xtmp, ytmp);\
xtmp = _mm_loadu_si128((void *) (state + i3));\
ytmp = _mm_loadu_si128((void *) (state + i4));\
y = _mm_unpacklo_epi64(xtmp, ytmp);\
w = _mm_unpackhi_epi64(xtmp, ytmp);\
#define UNPACK(x, y, z, w)\
__m128i xtmp, ytmp;\
xtmp = _mm_unpacklo_epi64(x, z);\
ytmp = _mm_unpackhi_epi64(x, z);\
x = xtmp;\
z = ytmp;\
xtmp = _mm_unpacklo_epi64(y, w);\
ytmp = _mm_unpackhi_epi64(y, w);\
y = xtmp;\
w = ytmp;\
void ace320( u32 *state );
void ace_encrypt( u32 *tag, u32 tlen, u32 *ciphertext, u32 *plaintext, u32 plen, u32 *key, u32 *nonce, u32 klen );
int crypto_aead_encrypt( u32 *tag, u32 tlen, u32 *c, u32 *m, u32 mlen, u32 *ad, u32 adlen, u8 *k, u8 *npub, u32 klen );
int crypto_aead_decrypt( u32 *m, u32 *c, u32 mlen, u32 *tag, u32 tlen, u32 *ad, u32 adlen, u8 *k, u8 *npub, u32 klen );
typedef unsigned long long int u64;

/*
 * Read the time-stamp counter at the start of a measured region.
 *
 * CPUID is executed first as a serializing instruction, then RDTSC loads
 * the counter into EDX:EAX. The two halves are copied into `high` and
 * `low` and combined into one 64-bit value.
 *
 * Returns: (high << 32) | low.
 * Requires an x86-64 target and a compiler with GCC-style inline assembly.
 */
u64 start_rdtsc( )
{
    unsigned high, low;

    /* Without the RDTSC instruction the copies below would return whatever
       CPUID left in EDX/EAX instead of the cycle counter. */
    __asm__ volatile("CPUID\n\t"
                     "RDTSC\n\t"
                     "mov %%edx, %0\n\t"
                     "mov %%eax, %1\n\t": "=r" (high),
                     "=r" (low):: "%rax", "%rbx", "%rcx", "%rdx");

    return ( ((u64)low) | (((u64)high) << 32));
}
/*
 * Read the time-stamp counter at the end of a measured region.
 * RDTSCP is executed first, EDX is copied to `high` and EAX to `low`,
 * and a CPUID is executed afterwards. The return value is
 * (high << 32) | low. RAX, RBX, RCX and RDX are declared as clobbered.
 */
u64 end_rdtsc( )
unsigned high, low;
__asm__ volatile("RDTSCP\n\t"
"mov %%edx, %0\n\t"
"mov %%eax,%1\n\t"
"CPUID\n\t": "=r" (high), "=r" (low)::
"%rax", "%rbx", "%rcx", "%rdx");
return ( ((u64)low) | (((u64)high) << 32));
/* Reference implementation of ACE-128, AEAD
Written by:
Kalikinkar Mandal <>
#include "ace.h"
#include "clock_cycle.h"
#define NUM_ITER 2000
#define NUM_TEST 500
// Prints 4*PARAL_INST_BY4 consecutive states from `state`.
// State j occupies words state[j*STATEDWORD] .. state[j*STATEDWORD + STATEDWORD-1];
// each word is written as 8 uppercase hexadecimal digits.
void print_state ( u32 *state )
u8 i, j;
for ( j = 0; j < 4*PARAL_INST_BY4; j++ )
for ( i = 0; i < STATEDWORD; i++ )
printf("%.8X", state[i+j*STATEDWORD]);
// Benchmark / self-test driver for the 4-way parallel build.
// Allocates buffers for num_parallel_inst instances, times ace320() and
// crypto_aead_encrypt() with start_rdtsc()/end_rdtsc(), then prints the
// plaintext, ciphertext and tag around an encrypt/decrypt round trip.
int main()
u8 num_parallel_inst;
u32 *state;
int i, j;
// NOTE(review): t is declared but not referenced anywhere in the visible body.
u64 t[NUM_ITER+1], count_cc;
u32 *plaintext, *ciphertext, *tag, *key, *nonce, *ad;
u32 plen, tlen, klen, adlen;
u8 *k, *pubn;
num_parallel_inst = 4*PARAL_INST_BY4;
// All lengths below are counted in 32-bit words.
adlen = 4; // Associated data length = adlen*32;
plen = 32; // Message length = plen*32;
tlen = 4; // 128 = 32*4 bits
klen = 4; // 128 = 32*4 bits
// One buffer per quantity, each sized for num_parallel_inst instances.
// NOTE(review): the malloc results are used without a NULL check.
key = (u32 *)malloc(sizeof(u32)*klen*num_parallel_inst);
nonce = (u32 *)malloc(sizeof(u32)*klen*num_parallel_inst);
tag = (u32 *)malloc(sizeof(u32)*tlen*num_parallel_inst);
ad = (u32 *)malloc(sizeof(u32)*adlen*num_parallel_inst);
plaintext = (u32 *)malloc(sizeof(u32)*plen*num_parallel_inst);
ciphertext = (u32 *)malloc(sizeof(u32)*plen*num_parallel_inst);
k = (u8 *)malloc(sizeof(u8)*16*num_parallel_inst);
pubn = (u8 *)malloc(sizeof(u8)*16*num_parallel_inst);
state = (u32 *)malloc(sizeof(u32)*num_parallel_inst*STATEDWORD);
//Initializes every state word to its index within its state (i % STATEDWORD)
for ( i = 0; i < num_parallel_inst*STATEDWORD; i++ )
*(state+i) = i%STATEDWORD;
//*(state+i) = 0x01;
ace320( state );
//ace320( state );
//Assigning 128-bit keys, nonces and messages//
for ( i = 0; i < num_parallel_inst*klen; i++ )
*(nonce+i) = 0x40404040;
*(key+i) = 0x10101010;
// Each instance gets the same message: word j holds j % plen.
for ( i = 0; i < num_parallel_inst; i++ )
for ( j = 0; j < plen; j++ )
*(plaintext+i*plen+j) = j%plen;
for ( i = 0; i < num_parallel_inst*adlen; i++ )
*(ad+i) = 0xffffffff;
// k and pubn (the buffers actually passed to the AEAD calls) are zero-filled.
for ( i = 0; i < num_parallel_inst*16; i++ )
*(k+i) = 0x0;
*(pubn+i) = 0x0;
// Permutation timing: NUM_TEST calls of ace320() between the two counter reads.
for ( i = 0; i < NUM_ITER+1; i++ )
count_cc = start_rdtsc();
for ( j = 0; j < NUM_TEST; j++ )
ace320( state );
count_cc = end_rdtsc()-count_cc;
printf("Cycles per byte = %f\n", (double)(count_cc)/(double)(num_parallel_inst*STATEBYTES*NUM_TEST));
// AEAD: Encryption and Decryption Module//
// NOTE(review): the values printed here come from nonce[]/key[], which are not the k/pubn buffers passed to crypto_aead_encrypt/decrypt below - confirm which pair is meant to be reported.
printf("Nonce and Key:\n");
for ( i = 0; i < num_parallel_inst; i++ )
printf( "%08X%08X%08X%08X", nonce[4*i+0], nonce[4*i+1], nonce[4*i+2], nonce[4*i+3]);
printf( "%08X%08X%08X%08X\n", key[4*i+0], key[4*i+1], key[4*i+2], key[4*i+3]);
// Encryption timing; the literal 2000 equals NUM_ITER as defined in this file.
for ( i = 0; i < 2000; i++ )
count_cc = start_rdtsc();
crypto_aead_encrypt( tag, tlen, ciphertext, plaintext, plen, ad, adlen, k, pubn, 16 );
count_cc = end_rdtsc()-count_cc;
printf("Encryption speed = %f cpb\n", (double)(count_cc)/(double)(num_parallel_inst*plen*4));
// Round-trip check: encrypt once more, print, decrypt, print again.
crypto_aead_encrypt( tag, tlen, ciphertext, plaintext, plen, ad, adlen, k, pubn, 16 );
printf("Original plaintext:\n");
for ( i = 0; i < num_parallel_inst; i++ )
for ( j = 0; j < plen; j++ )
printf("%08X", plaintext[i*plen+j]);
for ( i = 0; i < num_parallel_inst; i++ )
for ( j = 0; j < plen; j++ )
printf("%08X", ciphertext[i*plen+j]);
for ( i = 0; i < num_parallel_inst; i++ )
for ( j = 0; j < tlen; j++ )
printf("%08X", tag[i*tlen+j]);
// Decrypt back into the plaintext buffer and print it again for comparison.
crypto_aead_decrypt( plaintext, ciphertext, plen, tag, tlen, ad, adlen, k, pubn, 16 );
printf("Decrypted plaintext:\n");
for ( i = 0; i < num_parallel_inst; i++ )
for ( j = 0; j < plen; j++ )
printf("%08X", plaintext[i*plen+j]);
for ( i = 0; i < num_parallel_inst; i++ )
for ( j = 0; j < tlen; j++ )
printf("%08X", tag[i*tlen+j]);
-- This work is licensed under a Creative Commons
-- Attribution-NonCommercial-ShareAlike 4.0 International License.
-- Mark D. Aagaard
-- Riham AlTawy
-- Guang Gong
-- Kalikinkar Mandal
-- Raghvendra Rohit
-- Marat Sattarov
-- This is a human-readable summary of (and not a substitute for) the license.
-- You are free to:
-- Share — copy and redistribute the material in any medium or format
-- Adapt — remix, transform, and build upon the material
-- The licensor cannot revoke these freedoms as long as you follow
-- the license terms.
-- Under the following terms:
-- Attribution — You must give appropriate credit, provide a link to
-- the license, and indicate if changes were made. You may do so in
-- any reasonable manner, but not in any way that suggests the
-- licensor endorses you or your use.
-- NonCommercial — You may not use the material for commercial
-- purposes.
-- ShareAlike — If you remix, transform, or build upon the material,
-- you must distribute your contributions under the same license as
-- the original.
-- No additional restrictions — You may not apply legal terms or
-- technological measures that legally restrict others from doing
-- anything the license permits.
architecture rtl of ace is
signal ctl_control : ace_ctl_ty;
signal ctl_onehot : onehot_ty;
signal ctl_lfsr_en : std_logic;
signal ctl_lfsr_reset : std_logic;
u_dp :
entity work.dp port map
( clk => clk
, reset => reset
, i_mode => i_mode
, i_control => ctl_control
, i_onehot => ctl_onehot
, i_dom_sep => i_dom_sep
, i_valid => i_valid
, i_data => i_data
, i_padding => i_padding
, o_data => o_data
u_ctl :
entity work.ctl port map
( clk => clk
, reset => reset
, i_mode => i_mode
, i_dom_sep => i_dom_sep
, i_valid => i_valid
, i_padding => i_padding
, o_valid => o_valid
, o_onehot => ctl_onehot
, o_ready => o_ready
, o_control => ctl_control
end architecture;
library ieee;
use ieee.std_logic_1164.all;
use ieee.numeric_std.all;
use work.ace_pkg.all;
entity ace is
( clk : in std_logic;
reset : in std_logic;
i_mode : in mode_ty;
i_dom_sep : in domsep_ty;
i_valid : in std_logic;
i_data : in word;
i_padding : in std_logic;
o_valid : out std_logic;
o_ready : out std_logic;
o_data : out word
end entity;
library ieee;
use ieee.std_logic_1164.all;
use ieee.numeric_std.all;
package ace_pkg is
--for constants
constant lfsr_c_sz : integer := 7;
subtype lfsr_c_output is std_logic_vector(0 to lfsr_c_sz+2);
constant half_word_sz : natural := 32;
constant word_sz : natural := 2*half_word_sz;
subtype half_word is std_logic_vector( 0 to half_word_sz - 1 );
subtype word is std_logic_vector( 0 to word_sz - 1 );
type word_vector is array( natural range <> ) of word;
type half_word_vector is array( natural range <> ) of half_word;
-- A, B, C, D, E
constant state_sz : natural := 320;
constant word_max_idx : natural := state_sz / word_sz - 1;
constant half_word_max_idx : natural := state_sz / half_word_sz - 1;
constant key_sz : natural := 128;
constant nonce_sz : natural := 128;
subtype word_state_ty is word_vector ( 0 to word_max_idx );
constant a_idx : natural := 0;
constant b_idx : natural := 1;
constant c_idx : natural := 2;
constant d_idx : natural := 3;
constant e_idx : natural := 4;
subtype half_word_data is half_word_vector ( 0 to 1 );
subtype half_word_state_ty is half_word_vector ( 0 to half_word_max_idx );
constant a0_idx : natural := 1;
constant a1_idx : natural := 0;
constant b0_idx : natural := 3;
constant b1_idx : natural := 2;
constant c0_idx : natural := 5;
constant c1_idx : natural := 4;
constant d0_idx : natural := 7;
constant d1_idx : natural := 6;
constant e0_idx : natural := 9;
constant e1_idx : natural := 8;
function b2x( b : boolean ) return std_logic;
function half_words_to_words( st : half_word_state_ty ) return word_state_ty;
function words_to_half_words( st : word_state_ty ) return half_word_state_ty;
-- mode
subtype mode_ty is std_logic_vector( 1 downto 0 ); -- top lvl input
constant encrypt_mode : mode_ty := ( 1 => '0', 0 => '0' );
constant decrypt_mode : mode_ty := ( 1 => '0', 0 => '1' );
constant absorb_mode : mode_ty := ( 1 => '1', 0 => '0' );
constant squeeze_mode : mode_ty := ( 1 => '1', 0 => '1' );
subtype domsep_ty is std_logic_vector( 1 downto 0 ); -- top lvl input
-- derived control (from counter and more)
subtype ace_ctl_ty is std_logic_vector( 7 downto 0 );
constant absorb_idx : natural := 0;
constant replace_idx : natural := 1;
constant output_idx : natural := 2;
constant endstep_idx : natural := 3;
constant permoff_idx : natural := 4;
constant squeeze_idx : natural := 5;
constant lfsr_c_reset_idx : natural := 6;
constant lfsr_c_en_idx : natural := 7;
-- extra control for load, init, fin, tag, squeeze
subtype onehot_ty is std_logic_vector( 3 downto 0); -- extra control for load, init, fin, tag, squeeze
-- round and step counters
-- use last bit for end ACE perm - for o_ready
-- -> i_valid will reset the counter!
-- counter only runs if msb = 0
constant bits_counter : natural := 8;
subtype count_ty is unsigned( bits_counter - 1 downto 0 );
-- standard vhdl operators
-- function "sll"( a : half_word; n : natural ) return half_word;
function onehot_rotate (a : onehot_ty) return onehot_ty;
function vector_to_data ( st : half_word_data ) return word;
function data_to_vector ( st : word ) return half_word_data;
end package;
package body ace_pkg is
function onehot_rotate (a : onehot_ty)
return onehot_ty
variable z : onehot_ty;
z(onehot_ty'high downto 1) := a(onehot_ty'high - 1 downto 0);
z(0) := a(onehot_ty'high);
return z;
end function;
function b2x( b : boolean ) return std_logic is
if b then
return '1';
return '0';
end if;
end function;
-- standard vhdl operators cast to state
-- function "sll"( a : half_word; n : natural ) return half_word is
-- begin
-- return half_word( std_logic_vector( a ) sll n );
-- end function;
-- state functions
function half_words_to_words( st : half_word_state_ty ) ---- CHECK THIS!!!!!! PLEASE
return word_state_ty
variable i : natural;
variable z : word_state_ty;
main_loop : for i in 0 to word_max_idx loop
z(i)(0 to half_word_sz - 1) := st(2*i);
z(i)(half_word_sz to word_sz - 1) := st(2*i+1);
end loop;
return z;
end function;
function words_to_half_words( st : word_state_ty )
return half_word_state_ty
variable i : natural;
variable z : half_word_state_ty;
main_loop : for i in 0 to word_max_idx loop
z(2*i) := st(i)(0 to half_word_sz - 1);
z(2*i+1) := st(i)(half_word_sz to word_sz - 1);
end loop;
return z;
end function;
function data_to_vector( st : word )
return half_word_data
variable z : half_word_data;
z(0) := st(0 to half_word_sz - 1);
z(1) := st(half_word_sz to word_sz - 1);
return z;
end function;
function vector_to_data( st : half_word_data )
return word
variable z : word;
z(0 to half_word_sz - 1) := st(0);
z(half_word_sz to word_sz - 1) := st(1);
return z;
end function;
end package body;
if { $gui_mode } {
add wave clk
add wave reset
add wave i_mode
add wave i_dom_sep
add wave o_ready
add wave i_valid
add wave i_data
add wave i_padding
add wave o_valid
add wave o_data
if { $sim_mode eq "PROG_MODE" } then {
add wave -noupdate -divider -height 32 STUFF
add wave /uut/u_ctl/state
add wave /uut/u_ctl/o_ready
add wave /uut/u_ctl/i_valid
add wave /uut/u_dp/i_data
add wave /uut/u_ctl/o_valid
add wave /uut/u_dp/o_data
add wave -noupdate -divider -height 32 DP
add wave -radix binary /uut/u_dp/ctl_const
add wave /uut/u_dp/i_data
add wave /uut/u_dp/o_data
add wave -radix binary /uut/u_dp/ctl_const
add wave /uut/u_dp/lfsr_c_en
add wave /uut/u_dp/lfsr_c_reset
add wave /uut/u_dp/permoff
add wave /uut/u_dp/endstep
add wave /uut/u_dp/absorb
add wave /uut/u_dp/replace
add wave /uut/u_dp/output
add wave /uut/u_dp/dsxor
add wave /uut/u_dp/post_input
add wave /uut/u_dp/pre_round
add wave /uut/u_dp/post_round
add wave /uut/u_dp/post_xor
add wave /uut/u_dp/post_step_const
add wave /uut/u_dp/post_linear
add wave /uut/u_dp/ace_path
add wave /uut/u_dp/ace_state
add wave -noupdate -divider -height 32 CTL
add wave /uut/u_ctl/state
add wave -radix unsigned /uut/u_ctl/count
add wave /uut/u_ctl/i_valid
add wave /uut/u_ctl/o_valid
add wave /uut/u_ctl/o_ready
add wave -radix binary /uut/u_ctl/onehot
add wave /uut/u_ctl/lfsr_c_reset
add wave -radix binary /uut/u_ctl/i_mode
add wave -radix binary /uut/u_ctl/i_dom_sep
vcd file ace.vcd
vcd add /ace_tb/uut/*
vcd add -r *
vcd on
run -all
vcd checkpoint
vcd off
vcd flush
if { $gui_mode } {
wave zoom full
} else {
library ieee;
use ieee.std_logic_1164.all;
use ieee.numeric_std.all;
use work.ace_pkg.all;
entity dp is
( clk : in std_logic
; reset : in std_logic
; i_mode : in mode_ty
; i_control : in ace_ctl_ty
; i_onehot : in onehot_ty
; i_dom_sep : in domsep_ty
; i_valid : in std_logic
; i_data : in word
; i_padding : in std_logic
; o_data : out word
end entity;
architecture rtl of dp is
signal permoff, endstep, squeeze,
absorb, replace, output,
lfsr_reset, lfsr_en : std_logic;
signal ace_state, post_input : half_word_state_ty;
signal pre_round, post_round,
post_xor, post_step_const,
post_linear, ace_path : word_state_ty;
signal dsxor : half_word;
signal i_data_vector, o_data_vector : half_word_data;
signal ctl_const : lfsr_c_output;
-- Constant generator instance: its o_const output drives ctl_const, which
-- supplies the round-constant bits to the sb_64 instances and the
-- step-constant bytes XORed into words b, d and e below.
-- NOTE(review): this binds entity work.lfsr with ports reset / lfsr_en,
-- whereas the LFSR entity visible in this source is named lfsr_c and
-- declares ports lfsr_c_reset / lfsr_c_en. Confirm which entity and port
-- names are current.
u_lfsr :
entity work.lfsr port map
( clk => clk
, reset => lfsr_reset
, lfsr_en => lfsr_en
, o_const => ctl_const
i_data_vector <= data_to_vector( i_data );
o_data <= vector_to_data( o_data_vector );
-- Split the packed control vector i_control into individual control bits,
-- using the bit positions declared in ace_pkg.
absorb <= i_control( absorb_idx );
replace <= i_control( replace_idx );
output <= i_control( output_idx );
endstep <= i_control( endstep_idx );
permoff <= i_control( permoff_idx );
squeeze <= i_control( squeeze_idx );
-- NOTE(review): ace_pkg as visible in this source declares lfsr_c_reset_idx
-- and lfsr_c_en_idx, not lfsr_reset_idx / lfsr_en_idx. Confirm the constant
-- names used on the next two lines.
lfsr_reset <= i_control( lfsr_reset_idx );
lfsr_en <= i_control( lfsr_en_idx );
-- post input: do input and domain separator and replace
post_input( a1_idx ) <= ace_state( a1_idx );
post_input( a0_idx ) <= ace_state( a0_idx );
post_input( b0_idx ) <= ace_state( b0_idx );
post_input( b1_idx ) <= ace_state( b1_idx );
post_input( c0_idx ) <= ace_state( c0_idx );
post_input( c1_idx ) <= ace_state( c1_idx );
post_input( d0_idx ) <= ace_state( d0_idx );
post_input( d1_idx ) <= ace_state( d1_idx );
post_input( e1_idx ) <= ace_state( e1_idx );
dsxor( 0 to half_word_sz - 3 ) <= ( others => '0' );
dsxor( half_word_sz - 2) <= i_dom_sep(1);
dsxor( half_word_sz - 1) <= i_dom_sep(0);
post_input( e0_idx ) <= dsxor xor ace_state( e0_idx ) when (i_valid = '1')
else ace_state( e0_idx );
o_data_vector(0) <= ace_state(a1_idx);
-- sb 64 ==> post round
pre_round <= half_words_to_words( post_input );
a_sb_64 :
entity work.sb_64 port map
( i_state => pre_round( a_idx )
, i_rc => ctl_const( lfsr_c_sz + 2 ) --rc0
, o_state => post_round( a_idx )
post_round( b_idx ) <= pre_round( b_idx );
c_sb_64 :
entity work.sb_64 port map
( i_state => pre_round( c_idx )
, i_rc => ctl_const( lfsr_c_sz + 1 ) --rc1
, o_state => post_round( c_idx )
post_round( d_idx ) <= pre_round( d_idx );
e_sb_64 :
entity work.sb_64 port map
( i_state => pre_round( e_idx )
, i_rc => ctl_const( lfsr_c_sz ) -- rc2
, o_state => post_round( e_idx )
-- XORs to the left ==> post xor
post_xor( a_idx ) <= post_round( a_idx );
post_xor( c_idx ) <= post_round( c_idx );
post_xor( b_idx ) <= post_round( b_idx ) xor post_round( c_idx );
post_xor( d_idx ) <= post_round( d_idx ) xor post_round( e_idx );
post_xor( e_idx ) <= post_round( e_idx ) xor post_round( a_idx );
-- XOR with step constant ==> post step const
post_step_const( a_idx ) <= post_xor( a_idx );
post_step_const( c_idx ) <= post_xor( c_idx );
post_step_const( b_idx)( 0 to 55) <= not post_xor( b_idx )( 0 to 55 );
post_step_const( b_idx)( 56 to 63) <= post_xor( b_idx )( 56 to 63 ) xor ctl_const( 2 to lfsr_c_sz + 2 ); -- sc0
post_step_const( d_idx)( 0 to 55) <= not post_xor( d_idx )( 0 to 55 );
post_step_const( d_idx)( 56 to 63) <= post_xor( d_idx )( 56 to 63 ) xor ctl_const( 1 to lfsr_c_sz + 1 ); -- sc1
post_step_const( e_idx)( 0 to 55) <= not post_xor( e_idx )( 0 to 55 );
post_step_const( e_idx)( 56 to 63) <= post_xor( e_idx )( 56 to 63 ) xor ctl_const( 0 to lfsr_c_sz ); --sc2
-- post linear layer pi = (3,2,0,4,1) ==> post linear
post_linear( a_idx ) <= post_step_const( d_idx );
post_linear( b_idx ) <= post_step_const( c_idx );
post_linear( c_idx ) <= post_step_const( a_idx );
post_linear( d_idx ) <= post_step_const( e_idx );
post_linear( e_idx ) <= post_step_const( b_idx );
-- update state
ace_path( a_idx ) <= post_linear( a_idx ) when endstep = '1' else post_round( a_idx );
ace_path( b_idx ) <= post_linear( b_idx ) when endstep = '1' else post_round( b_idx );
ace_path( c_idx ) <= post_linear( c_idx ) when endstep = '1' else post_round( c_idx );
ace_path( d_idx ) <= post_linear( d_idx ) when endstep = '1' else post_round( d_idx );
ace_path( e_idx ) <= post_linear( e_idx ) when endstep = '1' else post_round( e_idx );
wait until rising_edge( clk );
ace_state <= words_to_half_words(ace_path);
end process;
end architecture;
library ieee;
use ieee.std_logic_1164.all;
use work.ace_pkg.all;
entity lfsr_c is
( clk : in std_logic
; lfsr_c_en : in std_logic
; lfsr_c_reset : in std_logic
; o_const : out lfsr_c_output
end lfsr_c;
architecture rtl of lfsr_c is
signal sa: std_logic_vector(lfsr_c_sz - 1 downto 0);
signal xa: std_logic_vector(lfsr_c_sz + 2 downto 0);
-- 10 output bits for the constants
o_const <= xa; -- "to" type <= "downto" type. Index flip intended
-- just rename signal
xa(lfsr_c_sz-1 downto 0) <= sa(lfsr_c_sz-1 downto 0);
-- for updates and outputs
xa(lfsr_c_sz + 2 downto lfsr_c_sz) <= xa(3 downto 1) xor xa(2 downto 0);
lfsr_shift: for i in lfsr_c_sz-1 downto 0 generate
lfsr_step: process(clk) begin
if rising_edge(clk) then
if lfsr_c_reset ='1' then
sa(i) <= '1';
elsif lfsr_c_en ='1' then
sa(i) <= xa(i+3);
end if;
end if;
end process;
end generate lfsr_shift;
------------ ACE readme file ---------------
----- list of files for ACE synthesis: -----
ace_pkg.vhd -- main package
sb_64.vhd -- s-box with simeck
lfsr.vhd -- lfsr for step / round constant generation
ctl.vhd -- control (FSM)
dp.vhd -- datapath
ace.vhd -- top level entity declaration
ace-rtl.vhd -- top level architecture
----- additional files for simulation: -----
util_unsynth.vhd -- functions used in TB (general purpose)
ace_unsynth.vhd -- specific ACE functions and procedures used in TB
ace_tb.vhd -- ACE testbench
-------------- pure datapath ---------------
dp_pure.vhd -- datapath with most input/output multiplexers removed
----------- TB info (ace_tb.vhd): ----------
EDH is a 3-bit constant used to select which modes to test
"100" - encryption only
"010" - decryption only
"001" - hash only
"110" - encryption and decryption
stim_file_path -- stimulus file
output_file_path -- output file
------------ stimulus file format --------------
1 file = 1 set of Key, Nonce, AD, Plaintext and Ciphertext
K 00111122335588DD00111122335588DD <--- 128 bits of Key (all 128 bits in a single line)
N 111122335588DD00111122335588DD00 <--- 128 bits of Nonce (all 128 bits in a single line)
A 1122335588DD00111122335588DD00 <--- from 4 to 128 bits of AD
P 335588DD00111122335588DD001111 <--- from 4 to 128 bits of Plaintext
C F9362385DC213A07CEFEF38C34CEFF <--- from 4 to 128 bits of Ciphertext
--- padding is done by testbench
--- multiple lines for AD, Plaintext and Ciphertext are supported
library ieee;
use ieee.std_logic_1164.all;
use ieee.numeric_std.all;
use work.ace_pkg.all;
entity sb_64 is
( i_state : in word
; i_rc : in std_logic
; o_state : out word
end entity;
-- Purely combinational update of one 64-bit word, built from its two
-- 32-bit halves and the single round-constant bit i_rc.
architecture rtl of sb_64 is
signal x0, x1, z0, z1 : half_word;
signal rc : half_word;
-- split the input: x1 is bits 0 to 31, x0 is bits 32 to 63
x1 <= i_state( 0 to half_word_sz-1 );
x0 <= i_state( half_word_sz to word_sz - 1 );
-- 32-bit constant: every bit '1' except the last (index 31), which is i_rc
rc <= ( 0 to half_word_sz - 2 => '1', half_word_sz - 1 => i_rc );
-- the old x1 half passes through unchanged as z0
z0 <= x1;
-- z1 = (x1 rotated by 5 toward index 0, AND x1) xor (x1 rotated by 1 toward index 0) xor x0 xor rc
z1 <= ( ( x1(5 to half_word_sz - 1) & x1 (0 to 4) ) and x1)
xor ( x1(1 to half_word_sz - 1) & x1 (0) )
xor x0
xor rc;
-- output places the new half z1 in bits 0 to 31 and z0 in bits 32 to 63
o_state <= z1 & z0;
end architecture;
NISTGCCFLAGS=-std=c99 -Wall -Wextra -Wshadow -fsanitize=address,undefined -O2
all: ace128_1
ace128_1: ace128
ace128: genkat_aead.c encrypt.c ace.c
$(CC) $(NISTGCCFLAGS) -o $@ $^ $(LFLAGS)
.PHONY: clean
-rm ace128
/* Reference implementation of the ace-320 permutation
Written by:
Kalikinkar Mandal <>
#include "ace.h"
static const unsigned char SC0[16]={0x50,0x5c,0x91,0x8d,0x53,0x60,0x68,0xe1,0xf6,0x9d,0x40,0x4f,0xbe,0x5b,0xe9,0x7f}; //Step constants: SC0[i] is XORed into the last byte of E in step i (labelled SC_{3i} in ace_permutation)
static const unsigned char SC1[16]={0x28,0xae,0x48,0xc6,0xa9,0x30,0x34,0x70,0x7b,0xce,0x20,0x27,0x5f,0xad,0x74,0x3f}; //Step constants: SC1[i] is XORed into the last byte of A in step i (labelled SC_{3i+1} in ace_permutation)
static const unsigned char SC2[16]={0x14,0x57,0x24,0x63,0x54,0x18,0x9a,0x38,0xbd,0x67,0x10,0x13,0x2f,0xd6,0xba,0x1f}; //Step constants: SC2[i] is XORed into the last byte of D in step i (labelled SC_{3i+2} in ace_permutation)
static const unsigned char RC0[16]={0x07,0x0a,0x9b,0xe0,0xd1,0x1a,0x22,0xf7,0x62,0x96,0x71,0xaa,0x2b,0xe9,0xcf,0xb7};//Round constants: RC0[i] is passed to simeck64_box for block A in step i
static const unsigned char RC1[16]={0x53,0x5d,0x49,0x7f,0xbe,0x1d,0x28,0x6c,0x82,0x47,0x6b,0x88,0xdc,0x8b,0x59,0xc6};//Round constants: RC1[i] is passed to simeck64_box for block C in step i
static const unsigned char RC2[16]={0x43,0xe4,0x5e,0xcc,0x32,0x4e,0x75,0x25,0xfd,0xf9,0x76,0xa0,0xb0,0x09,0x1e,0xad};//Round constants: RC2[i] is passed to simeck64_box for block E in step i
/*
 * rotl8 - one output byte of a multi-byte left shift.
 *
 * Returns the low 8 bits of (x << shift) combined with the top `shift`
 * bits of the following byte y, i.e. byte x after the two-byte sequence
 * x:y has been shifted left by `shift` bits.
 *
 * shift must be in the range 0..8; a larger value would make (8 - shift)
 * negative, which is an undefined shift count in C.
 */
unsigned char rotl8 ( const unsigned char x, const unsigned char y, const unsigned char shift )
{
	/* Operands are promoted to int; narrow back to a byte explicitly. */
	return (unsigned char)((x << shift) | (y >> (8 - shift)));
}
******* ACE permutation implementation ********************
// Prints the STATEBYTES bytes of `state` to stdout as two-digit upper-case
// hex values with no separator between them.
void ace_print_state( const unsigned char *state )
unsigned char i;
for ( i = 0; i < STATEBYTES; i++ )
printf("%02X", state[i]);
// Prints the first `xlen` bytes of `x` to stdout as two-digit lower-case
// hex values, each followed by a single space.
void ace_print_data(const uint8_t *x, const uint32_t xlen )
uint32_t j;
for ( j = 0; j < xlen; j++ )
printf("%.2x ", x[j]);
void simeck64_box( unsigned char *output, const unsigned char *input, const unsigned char rc )
unsigned char i, t;
unsigned char *tmp_shift_1, *tmp_shift_5, *tmp_pt;
tmp_shift_1 = (unsigned char *)malloc(4*sizeof(unsigned char));
tmp_shift_5 = (unsigned char *)malloc(4*sizeof(unsigned char));
tmp_pt = (unsigned char *)malloc(SIMECKBYTES*sizeof(unsigned char));
for ( i = 0; i < SIMECKBYTES; i++ )
tmp_pt[i] = input[i];
for ( i = 0; i < SIMECKROUND; i++ )
tmp_shift_1[0] = rotl8(tmp_pt[0], tmp_pt[1],1);
tmp_shift_1[1] = rotl8(tmp_pt[1], tmp_pt[2],1);
tmp_shift_1[2] = rotl8(tmp_pt[2], tmp_pt[3],1);
tmp_shift_1[3] = rotl8(tmp_pt[3], tmp_pt[0],1);
tmp_shift_5[0] = rotl8(tmp_pt[0], tmp_pt[1],5);
tmp_shift_5[1] = rotl8(tmp_pt[1], tmp_pt[2],5);
tmp_shift_5[2] = rotl8(tmp_pt[2], tmp_pt[3],5);
tmp_shift_5[3] = rotl8(tmp_pt[3], tmp_pt[0],5);
tmp_shift_5[0] = tmp_shift_5[0]&tmp_pt[0];
tmp_shift_5[1] = tmp_shift_5[1]&tmp_pt[1];
tmp_shift_5[2] = tmp_shift_5[2]&tmp_pt[2];
tmp_shift_5[3] = tmp_shift_5[3]&tmp_pt[3];
tmp_shift_1[0] = tmp_shift_1[0]^tmp_shift_5[0];
tmp_shift_1[1] = tmp_shift_1[1]^tmp_shift_5[1];
tmp_shift_1[2] = tmp_shift_1[2]^tmp_shift_5[2];
tmp_shift_1[3] = tmp_shift_1[3]^tmp_shift_5[3];
tmp_shift_1[0] = tmp_shift_1[0]^tmp_pt[4]^(0xff);
tmp_shift_1[1] = tmp_shift_1[1]^tmp_pt[5]^(0xff);
tmp_shift_1[2] = tmp_shift_1[2]^tmp_pt[6]^(0xff);
tmp_shift_1[3] = tmp_shift_1[3]^tmp_pt[7]^(0xfe);
t = (rc >> i)&1;
tmp_shift_1[3] = tmp_shift_1[3]^t;
tmp_pt[4] = tmp_pt[0];
tmp_pt[5] = tmp_pt[1];
tmp_pt[6] = tmp_pt[2];
tmp_pt[7] = tmp_pt[3];
tmp_pt[0] = tmp_shift_1[0];
tmp_pt[1] = tmp_shift_1[1];
tmp_pt[2] = tmp_shift_1[2];
tmp_pt[3] = tmp_shift_1[3];
//simeck_print_data(tmp_pt, 8);
for ( i = 0; i < SIMECKBYTES; i++ )
output[i] = tmp_pt[i];
void ace_permutation( unsigned char *input )
unsigned char i, j;
unsigned char *tmp_inp, *tmp_a, *tmp_c, *tmp_e;
tmp_inp = (unsigned char *)malloc(STATEBYTES*sizeof(unsigned char));
tmp_a = (unsigned char *)malloc(SIMECKBYTES*sizeof(unsigned char));
tmp_c = (unsigned char *)malloc(SIMECKBYTES*sizeof(unsigned char));
tmp_e = (unsigned char *)malloc(SIMECKBYTES*sizeof(unsigned char));
for ( i = 0; i < STATEBYTES; i++ )
tmp_inp[i] = input[i];
for ( i = 0; i < NUMSTEPS; i++ )
//A block
for ( j = 0; j < SIMECKBYTES; j++ )
tmp_a[j] = tmp_inp[j];
simeck64_box( tmp_a, tmp_a, RC0[i] );
//C block
for ( j = 0; j < SIMECKBYTES; j++ )
tmp_c[j] = tmp_inp[2*SIMECKBYTES+j];
simeck64_box( tmp_c, tmp_c, RC1[i] );
//E block
for ( j = 0; j < SIMECKBYTES; j++ )
tmp_e[j] = tmp_inp[4*SIMECKBYTES+j];
simeck64_box( tmp_e, tmp_e, RC2[i] );
// Update A: A <= SC_{3i+1}+D+F(E)
for ( j = 0; j < SIMECKBYTES-1; j++ )
tmp_inp[j] = tmp_inp[3*SIMECKBYTES+j]^tmp_e[j]^(0xff);
tmp_inp[SIMECKBYTES-1] = tmp_inp[4*SIMECKBYTES-1]^tmp_e[SIMECKBYTES-1]^SC1[i];
// Update E: E <= SC_{3i}+B+F(C)
for ( j = 0; j < SIMECKBYTES-1; j++ )
tmp_inp[4*SIMECKBYTES+j] = tmp_inp[SIMECKBYTES+j]^tmp_c[j]^(0xff);
tmp_inp[5*SIMECKBYTES-1] = tmp_inp[2*SIMECKBYTES-1]^tmp_c[SIMECKBYTES-1]^SC0[i];
// Update B: B <= F(C)
for ( j = 0; j < SIMECKBYTES; j++ )
tmp_inp[SIMECKBYTES+j] = tmp_c[j];
// Update C: C <= F(A)
for ( j = 0; j < SIMECKBYTES; j++ )
tmp_inp[2*SIMECKBYTES+j] = tmp_a[j];
// Update D: D <= SC_{3i+2}+F(A)+F(E)
for ( j = 0; j < SIMECKBYTES-1; j++ )
tmp_inp[3*SIMECKBYTES+j] = tmp_a[j]^tmp_e[j]^(0xff);
tmp_inp[4*SIMECKBYTES-1] = tmp_a[SIMECKBYTES-1]^tmp_e[SIMECKBYTES-1]^SC2[i];
//ace_print_state(tmp_inp); // Printing intermediate state
for ( i = 0; i < STATEBYTES; i++ )
input[i] = tmp_inp[i];
// Overwrites all STATEBYTES bytes of `state` with 0x00.
// NOTE(review): only the fill loop is visible here; given the function name
// a following ace_permutation() call is presumably expected - confirm.
void ace_permutation_ALLZERO ( unsigned char *state )
unsigned char i;
for ( i = 0; i < STATEBYTES; i++ )
state[i] = 0x0;
// Overwrites all STATEBYTES bytes of `state` with 0xff.
// NOTE(review): only the fill loop is visible here; given the function name
// a following ace_permutation() call is presumably expected - confirm.
void ace_permutation_ALLONE ( unsigned char *state )
unsigned char i;
for ( i = 0; i < STATEBYTES; i++ )
state[i] = 0xff;
//ace_print_state( state );
/* Reference implementation of the ACE permutation
Written by:
Kalikinkar Mandal <>
#ifndef ACE_H
#define ACE_H
#define STATEBYTES 40 //Number OF BYTES = 320/8 = 40
#define SIMECKBYTES 8 //Number of Simeck BYTES = 64/8 = 8
#define SIMECKROUND 8 //Number of rounds
#define NUMSTEPS 16 //Number of steps
typedef unsigned long long u64;
unsigned char rotl8 ( const unsigned char x, const unsigned char y, const unsigned char shift );
void ace_print_data(const unsigned char *x, const uint32_t xlen );
void simeck_print_data(const unsigned char *y, const unsigned char ylen );
void simeck64_box( unsigned char *output, const unsigned char *input, const unsigned char rc );
void ace_permutation( unsigned char *input );
void ace_print_state( const unsigned char *state );
void ace_permutation_ALLZERO ( unsigned char *state );
void ace_permutation_ALLONE ( unsigned char *state );
#define CRYPTO_ABYTES 16
/* Reference implementation of ACE-128 AEAD
Written by:
Kalikinkar Mandal <>
typedef unsigned long long u64;
int ace_init(
unsigned char *state,
const unsigned char *npub,
const unsigned char *k
int ace_ad(
unsigned char *state,
const unsigned char *ad,
const u64 adlen
int ace_gentag(
unsigned char *tag,
const unsigned char tlen,
unsigned char *state,
const unsigned char *k
int crypto_encrypt(
unsigned char *c,unsigned long long *clen,
const unsigned char *m,unsigned long long mlen,
const unsigned char *nsec,
const unsigned char *npub,
const unsigned char *k
int crypto_decrypt(
unsigned char *m,unsigned long long *mlen,
unsigned char *nsec,
const unsigned char *c,unsigned long long clen,
const unsigned char *npub,
const unsigned char *k
int crypto_aead_encrypt(
unsigned char *c, unsigned long long *clen,
const unsigned char *m, unsigned long long mlen,
const unsigned char *ad, unsigned long long adlen,
const unsigned char *nsec,
const unsigned char *npub,
const unsigned char *k
int crypto_aead_decrypt(
unsigned char *m, unsigned long long *mlen,
unsigned char *nsec,
const unsigned char *c, unsigned long long clen,
const unsigned char *ad, unsigned long long adlen,
const unsigned char *npub,
const unsigned char *k
int ace_gentag(
unsigned char *tag,
const unsigned char tlen,
unsigned char *state,
const unsigned char *k
/* Reference implementation of ACE-128 AEAD
Written by:
Kalikinkar Mandal <>
#include "ace.h"
#include "crypto_aead.h"
#include "api.h"
#define KAT_SUCCESS 0
#define KAT_DATA_ERROR -3
const unsigned char rate_bytes[8] = {0,1,2,3,16,17,18,19};
// ace_init: loads the key and nonce into a zeroed state.
//   state : STATEBYTES-byte buffer, fully overwritten
//   npub  : nonce; bytes 0..15 are read
//   k     : key; bytes 0..15 are read
// Byte layout written below: state[0..7] = k[0..7], state[8..15] = npub[0..7],
// state[16..23] = k[8..15], state[32..39] = npub[8..15]; state[24..31] stays 0.
int ace_init(
unsigned char *state,
const unsigned char *npub,
const unsigned char *k
unsigned char i;
//Initialize the state to all-ZERO
for ( i = 0; i < STATEBYTES; i++ )
state[i] = 0x0;
//Assigning key at A[0..7] & C[0..7]
for ( i = 0; i < 8; i++ )
state[i] = k[i];
for ( i = 0; i < 8; i++ )
state[16+i] = k[8+i];
//Assigning nonce at B[0..7] & E[0..7]
for ( i = 0; i < 8; i++ )
state[8+i] = npub[i];
for ( i = 0; i < 8; i++ )
state[32+i] = npub[8+i];
// NOTE(review): the two loops below have no visible body or return statement
// in this view; presumably they absorb the two key halves - confirm.
for ( i = 0; i < 8; i++ )
for ( i = 0; i < 8; i++ )
// ace_ad: processes `adlen` bytes of associated data `ad` into `state`.
// The data is handled as adlen/8 full 8-byte blocks plus an optional partial
// block of adlen%8 bytes. Returns KAT_SUCCESS at the end of the visible path.
// NOTE(review): the bodies of the absorb loops, the domain-separator
// statements and the branch taken when adlen == 0 are not visible here.
int ace_ad(
unsigned char *state,
const unsigned char *ad,
const u64 adlen
unsigned char i, lblen;
u64 j, ad64len = adlen/8;
// lblen = number of bytes in the trailing partial block (0..7).
lblen = (unsigned char)(adlen%8);
if ( adlen == 0 )
//Absorbing associated data
for ( j = 0; j < ad64len; j++ )
for ( i = 0; i < 8; i++ )
//Domain separator
//Process the last 64-bit block.
if ( lblen != 0 )
for ( i = 0; i < lblen; i++ )
// The padding bit goes into the rate byte right after the last data byte.
state[rate_bytes[lblen]]^=(0x80); //Padding: 10*
//Domain separator
ace_permutation(state );
// Padding placed in the first rate byte.
// NOTE(review): presumably the branch for adlen being a multiple of 8 - confirm.
state[rate_bytes[0]]^=(0x80); //Padding: 10*
//Domain separator
ace_permutation(state );
return (KAT_SUCCESS);
// ace_gentag: writes a 16-byte tag taken from the state.
//   tag   : output buffer, 16 bytes written
//   tlen  : requested tag length; only 16 (with CRYPTO_KEYBYTES == 16) is accepted
//   state : current state
//   k     : key
// NOTE(review): the two empty-looking loops below have no visible body;
// presumably they absorb the key halves before extraction - confirm.
int ace_gentag(
unsigned char *tag,
const unsigned char tlen,
unsigned char *state,
const unsigned char *k
unsigned char i;
if ( CRYPTO_KEYBYTES == 16 && tlen == 16 )
for ( i = 0; i < 8; i++ )
for ( i = 0; i < 8; i++ )
//Extracting 128-bit tag from state[0..7] and state[16..23]
//(the blocks that the ace_init comments call A and C)
for ( i = 0; i < 8; i++ )
tag[i] = state[i];
tag[8+i] = state[16+i];
// Reported for any other key/tag length combination.
printf("Invalid key and tag length pair.\n");
// crypto_aead_encrypt: encrypts `mlen` bytes of `m` into `c` and appends a
// CRYPTO_ABYTES-byte tag, so *clen = mlen + CRYPTO_ABYTES.
//   c, clen  : ciphertext output buffer and its resulting length
//   m, mlen  : plaintext
//   ad, adlen: associated data, absorbed via ace_ad when adlen != 0
//   nsec     : not referenced in the visible code
//   npub, k  : nonce and key, passed to ace_init
// NOTE(review): the statements taken when a helper fails, the
// domain-separator statements and any free() of state/tag are not visible
// in this view; the malloc results are used without a NULL check.
int crypto_aead_encrypt(
unsigned char *c,unsigned long long *clen,
const unsigned char *m,unsigned long long mlen,
const unsigned char *ad,unsigned long long adlen,
const unsigned char *nsec,
const unsigned char *npub,
const unsigned char *k
unsigned char *state;
unsigned char *tag;
unsigned char i, lblen;
u64 j, m64len;
// m64len full 8-byte blocks, followed by lblen (0..7) trailing bytes.
m64len = mlen/8;
lblen = (unsigned char)(mlen%8);
state = (unsigned char *)malloc(sizeof(unsigned char)*STATEBYTES);
tag = (unsigned char *)malloc(sizeof(unsigned char)*CRYPTO_ABYTES);
//Initialize state with "key" and "nonce" and then absorb "key" again
if ( ace_init(state, npub, k)!= KAT_SUCCESS )
//Absorbing "ad"
if ( adlen != 0 )
if ( ace_ad( state, ad, adlen) != KAT_SUCCESS)
//Encrypting "message(m)" and producing "ciphertext (c)"
if ( mlen != 0 )
for ( j = 0; j < m64len; j++ )
for ( i = 0; i < 8; i++ )
// Ciphertext byte = plaintext byte XOR rate byte; the rate byte is then
// replaced by the ciphertext byte.
c[8*j+((u64)i)] = m[8*j+((u64)i)]^state[rate_bytes[i]];
state[rate_bytes[i]] = c[8*j+((u64)i)];
//Domain separator
if ( lblen != 0 )
//Encrypting the padded 64-bit block when "mlen" is not a multiple of 8
for ( i = 0; i < lblen; i++ )
c[8*m64len+((u64)i)] = m[m64len*8+(u64)i]^state[rate_bytes[i]];
state[rate_bytes[i]] = c[8*m64len+((u64)i)];
// The padding bit goes into the rate byte right after the last message byte.
state[rate_bytes[lblen]]^=(0x80); //Padding: 10*
//Domain separator
state[rate_bytes[0]]^=(0x80); //Padding: 10*
//Domain separator
ace_permutation(state );
state[rate_bytes[0]]^=(0x80); //Padding: 10*
//Domain separator
ace_permutation(state );
//Appending tag to the end of ciphertext
if ( ace_gentag( tag, CRYPTO_ABYTES, state, k ) != KAT_SUCCESS )
for ( i = 0; i < CRYPTO_ABYTES; i++ )
c[mlen+(u64)i] = tag[i];
*clen = mlen+CRYPTO_ABYTES;
/*printf("Print tag after enc.:\n");
for ( i = 0; i < 16; i++ )
printf("%.2X", tag[i]);
// crypto_aead_decrypt: decrypts the first clen - CRYPTO_ABYTES bytes of `c`
// into `m`, recomputes the tag and compares it with the CRYPTO_ABYTES bytes
// stored at the end of `c`. On the visible path *mlen = clen - CRYPTO_ABYTES.
//   m, mlen  : plaintext output buffer and its resulting length
//   nsec     : not referenced in the visible code
//   c, clen  : ciphertext followed by the tag
//   ad, adlen: associated data, absorbed via ace_ad when adlen != 0
//   npub, k  : nonce and key, passed to ace_init
// NOTE(review): clen is not checked against CRYPTO_ABYTES before the
// subtraction below, so clen < CRYPTO_ABYTES would wrap the unsigned value.
// NOTE(review): the statements taken on helper failure or tag mismatch, the
// domain-separator statements and any free() of state/tag are not visible
// in this view; the malloc results are used without a NULL check.
int crypto_aead_decrypt(
unsigned char *m,unsigned long long *mlen,
unsigned char *nsec,
const unsigned char *c,unsigned long long clen,
const unsigned char *ad,unsigned long long adlen,
const unsigned char *npub,
const unsigned char *k
unsigned char i, lblen;
u64 j, clen1, c64len;
// clen1 = ciphertext length without the tag; c64len full 8-byte blocks
// followed by lblen (0..7) trailing bytes.
clen1 = clen-CRYPTO_ABYTES;
c64len = clen1/8;
lblen = (unsigned char)(clen1%8);
unsigned char *state;
unsigned char *tag;
state = (unsigned char *)malloc(sizeof(unsigned char)*STATEBYTES);
tag = (unsigned char *)malloc(sizeof(unsigned char)*CRYPTO_ABYTES);
//Initialize state with "key" and "nonce" and then absorb "key" again
if ( ace_init(state, npub, k)!= KAT_SUCCESS )
//Absorbing "ad"
if ( adlen != 0 )
if ( ace_ad( state, ad, adlen) != KAT_SUCCESS)
if ( clen1 != 0 )
for ( j = 0; j < c64len; j++ )
for ( i = 0; i < 8; i++ )
// Plaintext byte = ciphertext byte XOR rate byte; the rate byte is then
// replaced by the ciphertext byte.
m[8*j+((u64)i)] = c[8*j+((u64)i)]^state[rate_bytes[i]];
state[rate_bytes[i]] = c[8*j+((u64)i)];
//Domain separator
if ( lblen != 0 )
//Decrypting last 64-bit block
for ( i = 0; i < lblen; i++ )
m[8*c64len +((u64)i)] = c[8*c64len +((u64)i)]^state[rate_bytes[i]];
state[rate_bytes[i]] = c[8*c64len +((u64)i)];
// NOTE(review): if this line sits after the loop, i equals lblen here, which
// matches the rate_bytes[lblen] index used by crypto_aead_encrypt - confirm.
state[rate_bytes[i]]^=(0x80); //Padding: 10*
//Domain separator
state[rate_bytes[0]]^=(0x80); //Padding: 10*
//Domain separator
ace_permutation(state );
state[rate_bytes[0]]^=(0x80); //Padding: 10*
//Domain separator
ace_permutation(state );
//Generating and verifying the tag
if ( ace_gentag( tag, CRYPTO_ABYTES, state, k ) != KAT_SUCCESS )
// Byte-by-byte comparison of the received tag with the recomputed one.
// NOTE(review): if the mismatch branch returns early, the comparison time
// depends on the position of the first differing byte - confirm.
for ( i = 0; i < CRYPTO_ABYTES; i++ )
if ( c[clen1 + (u64)i] != tag[i] )
*mlen = clen-CRYPTO_ABYTES;
/*printf("Print tag after dec.:\n");
for ( i = 0; i < 16; i++ )
printf("%.2X", tag[i]);
// disable deprecation for sprintf and fopen
#ifdef _MSC_VER
#include <stdio.h>
#include <string.h>
#include "crypto_aead.h"
#include "api.h"
#define KAT_SUCCESS 0
#define KAT_DATA_ERROR -3
#define MAX_FILE_NAME 256
void init_buffer(unsigned char *buffer, unsigned long long numbytes);
void fprint_bstr(FILE *fp, const char *label, const unsigned char *data, unsigned long long length);
int generate_test_vectors();
// Program entry point: runs generate_test_vectors() and, when it does not
// return KAT_SUCCESS, reports the code on stderr. The code is returned.
int main()
int ret = generate_test_vectors();
if (ret != KAT_SUCCESS) {
fprintf(stderr, "test vector generation failed with code %d\n", ret);
return ret;
// generate_test_vectors: writes a known-answer-test file named
// "../LWC_AEAD_KAT_<keybits>_<noncebits>.txt". For every message length
// 0..MAX_MESSAGE_LENGTH and every associated-data length
// 0..MAX_ASSOCIATED_DATA_LENGTH it records key, nonce, plaintext, AD and the
// ciphertext from crypto_aead_encrypt, then decrypts and compares the result
// with the original message. Returns ret_val (KAT_SUCCESS unless changed).
// NOTE(review): the declarations of `ct` and `ad`, the statements that set
// ret_val on failure, and the fclose() call are not visible in this view.
int generate_test_vectors()
FILE *fp;
char fileName[MAX_FILE_NAME];
unsigned char key[CRYPTO_KEYBYTES];
unsigned char nonce[CRYPTO_NPUBBYTES];
unsigned char msg[MAX_MESSAGE_LENGTH];
unsigned char msg2[MAX_MESSAGE_LENGTH];
unsigned long long clen, mlen2;
int count = 1;
int func_ret, ret_val = KAT_SUCCESS;
// All inputs are filled with the deterministic pattern buffer[i] = (unsigned char)i.
init_buffer(key, sizeof(key));
init_buffer(nonce, sizeof(nonce));
init_buffer(msg, sizeof(msg));
init_buffer(ad, sizeof(ad));
// The formatted name is short and fixed in shape, well below MAX_FILE_NAME.
sprintf(fileName, "../LWC_AEAD_KAT_%d_%d.txt", (CRYPTO_KEYBYTES * 8), (CRYPTO_NPUBBYTES * 8));
if ((fp = fopen(fileName, "w")) == NULL) {
fprintf(stderr, "Couldn't open <%s> for write\n", fileName);
// The outer loop also stops as soon as ret_val leaves KAT_SUCCESS.
for (unsigned long long mlen = 0; (mlen <= MAX_MESSAGE_LENGTH) && (ret_val == KAT_SUCCESS); mlen++) {
for (unsigned long long adlen = 0; adlen <= MAX_ASSOCIATED_DATA_LENGTH; adlen++) {
fprintf(fp, "Count = %d\n", count++);
fprint_bstr(fp, "Key = ", key, CRYPTO_KEYBYTES);
fprint_bstr(fp, "Nonce = ", nonce, CRYPTO_NPUBBYTES);
fprint_bstr(fp, "PT = ", msg, mlen);
fprint_bstr(fp, "AD = ", ad, adlen);
if ((func_ret = crypto_aead_encrypt(ct, &clen, msg, mlen, ad, adlen, NULL, nonce, key)) != 0) {
fprintf(fp, "crypto_aead_encrypt returned <%d>\n", func_ret);
fprint_bstr(fp, "CT = ", ct, clen);
fprintf(fp, "\n");
// Round-trip check: decrypt what was just produced and compare.
if ((func_ret = crypto_aead_decrypt(msg2, &mlen2, NULL, ct, clen, ad, adlen, nonce, key)) != 0) {
fprintf(fp, "crypto_aead_decrypt returned <%d>\n", func_ret);
if (mlen != mlen2) {
fprintf(fp, "crypto_aead_decrypt returned bad 'mlen': Got <%llu>, expected <%llu>\n", mlen2, mlen);
if (memcmp(msg, msg2, mlen)) {
fprintf(fp, "crypto_aead_decrypt did not recover the plaintext\n");
return ret_val;
/*
 * fprint_bstr - write a labelled byte string as one line of hex text.
 *
 * fp     : open output stream
 * label  : text written first, verbatim
 * data   : bytes to dump; may be unused when length is 0
 * length : number of bytes of `data` to write
 *
 * Output is: label, then two upper-case hex digits per byte with no
 * separators, then a newline. With length 0 only the label and the
 * newline are written.
 */
void fprint_bstr(FILE *fp, const char *label, const unsigned char *data, unsigned long long length)
{
	fprintf(fp, "%s", label);

	for (unsigned long long i = 0; i < length; i++)
		fprintf(fp, "%02X", data[i]);

	fprintf(fp, "\n");
}
/*
 * init_buffer - fill a buffer with a deterministic counting pattern.
 *
 * buffer   : destination, at least `numbytes` bytes long
 * numbytes : number of bytes to write; 0 leaves the buffer untouched
 *
 * Byte i receives the low 8 bits of i, so the pattern 00, 01, ..., FF
 * repeats every 256 bytes.
 */
void init_buffer(unsigned char *buffer, unsigned long long numbytes)
{
	for (unsigned long long i = 0; i < numbytes; i++)
		buffer[i] = (unsigned char)i;
}
This source diff could not be displayed because it is too large. You can view the blob instead.
#CC=gcc -O2 -fomit-frame-pointer -funroll-all-loops -march=native -mtune=native -msse3 -mmmx -mavx -mavx2
#CC=gcc -Wall -O2 -fomit-frame-pointer -funroll-all-loops -march=native -mtune=native -msse2 -mmmx -mavx -mavx2
CC=gcc -Wall -O2 -fomit-frame-pointer -funroll-all-loops -march=native -mtune=native -mavx -mavx2
#CC=gcc -O1 -fomit-frame-pointer -march=haswell -mtune=native -mavx
#ivybridgei, skylake, sandybridge, haswell
all: aceavx_1
aceavx: speed.c ace.c
$(CC) -o $@ $^
.PHONY: clean
rm -fr aceavx
/* Reference implementation of ACE-Hash256
Written by:
Kalikinkar Mandal <>
#ifndef ACE_H
#define ACE_H
#define STATEBYTES 40
#define STATEDWORD 10 // 320/32 = 10: number of 32-bit words in the STATEBYTES (40-byte) state
//#define NUMSTEPS 16
#define NUMSTEPS 16
#define PARAL_INST_BY8 1
static const unsigned char SC0[16]={0x50,0x5c,0x91,0x8d,0x53,0x60,0x68,0xe1,0xf6,0x9d,0x40,0x4f,0xbe,0x5b,0xe9,0x7f}; //Step constants (SC_{2i})
static const unsigned char SC1[16]={0x28,0xae,0x48,0xc6,0xa9,0x30,0x34,0x70,0x7b,0xce,0x20,0x27,0x5f,0xad,0x74,0x3f}; //Step constants (SC_{2i+1})
static const unsigned char SC2[16]={0x14,0x57,0x24,0x63,0x54,0x18,0x9a,0x38,0xbd,0x67,0x10,0x13,0x2f,0xd6,0xba,0x1f}; //Step constants (SC_{2i+2})
static const unsigned char RC0[16]={0x07,0x0a,0x9b,0xe0,0xd1,0x1a,0x22,0xf7,0x62,0x96,0x71,0xaa,0x2b,0xe9,0xcf,0xb7};//Round constants (RC_{2i})
static const unsigned char RC1[16]={0x53,0x5d,0x49,0x7f,0xbe,0x1d,0x28,0x6c,0x82,0x47,0x6b,0x88,0xdc,0x8b,0x59,0xc6};//Round constants (RC_{2i+1})
static const unsigned char RC2[16]={0x43,0xe4,0x5e,0xcc,0x32,0x4e,0x75,0x25,0xfd,0xf9,0x76,0xa0,0xb0,0x09,0x1e,0xad};//Round constants (RC_{2i+2})
typedef unsigned long long int u64;
typedef unsigned int u32;
typedef unsigned int u8; // NOTE: despite its name, u8 is an unsigned int here, not an 8-bit type
#define ROT5(x) (_mm256_slli_epi32(x, 5) | _mm256_srli_epi32(x, 27))
#define ROT1(x) (_mm256_slli_epi32(x, 1) | _mm256_srli_epi32(x, 31))
#define SWAPREG1(x) (_mm256_permutevar8x32_epi32(x, _mm256_set_epi32(7, 5, 3, 1, 6, 4, 2, 0)))
#define RC(t1, t2) (_mm256_set_epi32(0xfffffffe^t2, 0xfffffffe^t1, 0xfffffffe^t2, 0xfffffffe^t1, 0xfffffffe^t2, 0xfffffffe^t1, 0xfffffffe^t2,0xfffffffe^t1))
#define SC(t1, t2) (_mm256_set_epi32(0xffffff00^t2, 0xffffffff, 0xffffff00^t1, 0xffffffff, 0xffffff00^t2, 0xffffffff, 0xffffff00^t1, 0xffffffff))
#define SWAPBLK(x) (_mm256_permute4x64_epi64(x, _MM_SHUFFLE(2,3,0,1)))
#define SWAPREG2(x) (_mm256_permutevar8x32_epi32(x, _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0)))
/* SWAPAC(x): permute the eight 32-bit lanes of x with the index vector
 * (5, 4, 7, 6, 1, 0, 3, 2). The macro now expands its own argument; it
 * previously ignored x and used whatever variable named xtmp was in scope
 * at the call site, so the result is unchanged for callers that passed xtmp. */
#define SWAPAC(x) (_mm256_permutevar8x32_epi32((x), _mm256_set_epi32(5, 4, 7, 6, 1, 0, 3, 2)))
#define masklo (_mm256_set_epi32(0x0, 0x0, 0x0, 0x0, 0xffffffff, 0xffffffff, 0xffffffff,0xffffffff))
#define maskhi (_mm256_set_epi32(0xffffffff, 0xffffffff, 0xffffffff,0xffffffff,0x0, 0x0, 0x0, 0x0))
#define _mm256_set_m128i(v0, v1) _mm256_insertf128_si256(_mm256_castsi128_si256(v1), (v0), 1)
#define ROAX(x, y, t1, t2)\
__m256i x2tmp;\
x2tmp = x;\
x = (ROT5(x)&x)^ROT1(x)^RC(t1, t2)^y;\
y = x2tmp;\
#define PACK_SSb(x, y)\
__m256i xtmp, ytmp;\
xtmp = SWAPREG1(x);\
ytmp = SWAPREG1(y);\
x = _mm256_permute2x128_si256(xtmp,ytmp,0x20);\
y = _mm256_permute2x128_si256(xtmp,ytmp, 0x31);\
#define UNPACK_SSb(x, y)\
__m256i xtmp, ytmp;\
xtmp = _mm256_unpacklo_epi32(x, y);\
ytmp = _mm256_unpackhi_epi32(x, y);\
x = _mm256_permute2x128_si256(xtmp, ytmp,0x20);\
y = _mm256_permute2x128_si256(xtmp, ytmp, 0x31);\
#define PACK(x, y, z, w)\
__m256i x2tmp, x3tmp;\
x2tmp = SWAPREG2(x);\
x3tmp = SWAPREG2(z);\
x = _mm256_permute2x128_si256(x2tmp,x3tmp,0x20);\
z = _mm256_permute2x128_si256(x2tmp,x3tmp, 0x31);\
x2tmp = SWAPREG2(y);\
x3tmp = SWAPREG2(w);\
y = _mm256_permute2x128_si256(x2tmp,x3tmp,0x20);\
w = _mm256_permute2x128_si256(x2tmp,x3tmp, 0x31);\
#define UNPACK(x,y,z,w)\
__m256i x2tmp, x3tmp;\
x2tmp = _mm256_unpacklo_epi64(x, z);\
x3tmp = _mm256_unpackhi_epi64(x, z);\
x = _mm256_permute2x128_si256(x2tmp,x3tmp,0x20);\
z = _mm256_permute2x128_si256(x2tmp,x3tmp, 0x31);\
x2tmp = _mm256_unpacklo_epi64(y, w);\
x3tmp = _mm256_unpackhi_epi64(y, w);\
y = _mm256_permute2x128_si256(x2tmp,x3tmp,0x20);\
w = _mm256_permute2x128_si256(x2tmp,x3tmp, 0x31);\
void ace320( u32 *state );
int crypto_hash( u32 *out, u32 *in, u64 inlen );
typedef unsigned long long int u64;
// start_rdtsc: returns a 64-bit value assembled from EDX (upper 32 bits)
// and EAX (lower 32 bits) after executing CPUID.
// NOTE(review): no RDTSC instruction is visible between CPUID and the two
// moves, whereas end_rdtsc issues RDTSCP before its moves. As shown, EDX:EAX
// would hold CPUID output rather than a timestamp - verify against the
// original file.
u64 start_rdtsc( )
unsigned high, low;
__asm__ volatile("CPUID\n\t"
"mov %%edx, %0\n\t"
"mov %%eax, %1\n\t": "=r" (high),
"=r" (low):: "%rax", "%rbx", "%rcx", "%rdx");
return ( ((u64)low) | (((u64)high) << 32));
// end_rdtsc: executes RDTSCP, copies EDX/EAX into `high`/`low`, then executes
// CPUID, and returns the combined 64-bit value (high << 32 | low).
// rax, rbx, rcx and rdx are declared as clobbered.
u64 end_rdtsc( )
unsigned high, low;
__asm__ volatile("RDTSCP\n\t"
"mov %%edx, %0\n\t"
"mov %%eax,%1\n\t"
"CPUID\n\t": "=r" (high), "=r" (low)::
"%rax", "%rbx", "%rcx", "%rdx");
return ( ((u64)low) | (((u64)high) << 32));
// cpucycles: emits the two opcode bytes 15, 49 (0x0F 0x31, the RDTSC
// encoding), then folds the upper half from RDX into RAX (shift left by 32,
// OR) and returns RAX. Uses 64-bit register names, so x86-64 only.
static inline u64 cpucycles( )
u64 result;
asm volatile (".byte 15;.byte 49;shlq $32,%%rdx;orq %%rdx,%%rax"
: "=a" (result) :: "%rdx");
return result;
/*#ifdef __x86_64__
#define mycpucycles(RES) \
__asm__ volatile("rdtsc;shlq $32,%%rdx;orq %%rdx,%%rax" : "=a" (RES) :: "%rdx");
#define mycpucycles(RES) \
__asm__ volatile(".byte 15;.byte 49" : "=A" (RES));
\ No newline at end of file
/* Reference implementation of ACE-Hash256
Written by:
Kalikinkar Mandal <>
#include "ace.h"
#include "clock_cycle.h"
#define NUM_ITER 2000
#define NUM_TEST 500
// print_state: prints 8*PARAL_INST_BY8 states of STATEDWORD 32-bit words
// each, stored back to back in `state`, as 8-digit upper-case hex words.
void print_state ( u32 *state )
u8 i, j;
for ( j = 0; j < 8*PARAL_INST_BY8; j++ )
for ( i = 0; i < STATEDWORD; i++ )
printf("%.8X", state[i+j*STATEDWORD]);
// Speed test driver: hashes 8*PARAL_INST_BY8 messages of `plen` 32-bit words
// with crypto_hash NUM_ITER times, timing each call with
// start_rdtsc/end_rdtsc, then hashes once more and prints the plaintext and
// digest words in hex.
// NOTE(review): `t` and `state` are not used in the visible code, the malloc
// results are not checked, and no free() is visible in this view.
int main()
u8 num_parallel_inst;
u32 *state;
int i, j;
u64 t[NUM_ITER+1], count_cc;
u32 *plaintext, *digest;
u32 hlen;
u64 plen;
num_parallel_inst = 8*PARAL_INST_BY8;
plen = 32; // Message length = plen*32 bits;
hlen = 8; //256 = 32*8 bits
digest = (u32 *)malloc(sizeof(u32)*hlen*num_parallel_inst);
plaintext = (u32 *)malloc(sizeof(u32)*plen*num_parallel_inst);
state = (u32 *)malloc(sizeof(u32)*num_parallel_inst*STATEDWORD);
//Filling every message with the fixed pattern j%128 (deterministic, not random)
for ( i = 0; i < num_parallel_inst; i++ )
for ( j = 0; j < plen; j++ )
plaintext[i*plen+j] = j%128;
// Hash Module//
for ( i = 0; i < NUM_ITER; i++ )
//plaintext[0] = plaintext[0]^i;
// count_cc is overwritten on every iteration, so after the loop it holds
// the cycle count of the last crypto_hash call only.
count_cc = start_rdtsc();
crypto_hash( digest, plaintext, plen );
count_cc = end_rdtsc()-count_cc;
// Cycles divided by the total number of bytes hashed (instances * words * 4).
// NOTE(review): "cbp" in the message presumably means cycles per byte.
printf("Hash speed = %f cbp\n", (double)(count_cc)/(double)(num_parallel_inst*plen*4));
printf("Hash speed = %f cbp\n", (double)(count_cc)/(double)(num_parallel_inst*plen*4));
//plen = 0;
crypto_hash( digest, plaintext, plen );
printf("Original plaintext:\n");
for ( i = 0; i < num_parallel_inst; i++ )
for ( j = 0; j < plen; j++ )
printf("%08X", plaintext[i*plen+j]);
for ( i = 0; i < num_parallel_inst; i++ )
for ( j = 0; j < hlen; j++ )
printf("%08X", digest[i*hlen+j]);
#CC=gcc -O2 -fomit-frame-pointer -funroll-all-loops -march=native -mtune=native -msse3 -mmmx -mavx -mavx2
#CC=gcc -Wall -O2 -fomit-frame-pointer -funroll-all-loops -march=native -mtune=native -msse2 -mmmx -mavx -mavx2
CC=gcc -Wall -O2 -fomit-frame-pointer -funroll-all-loops -march=native -mtune=native -msse2
#CC=gcc -O1 -fomit-frame-pointer -march=haswell -mtune=native -mavx
#ivybridgei, skylake, sandybridge, haswell
all: acesse2_1
acesse2: speed.c ace.c
$(CC) -o $@ $^
.PHONY: clean
rm -fr acesse2
/* Reference implementation of ACE-Hash256
Written by:
Kalikinkar Mandal <>
#ifndef ACE_H
#define ACE_H
#define STATEBYTES 40
#define STATEDWORD 10 // 320/32 = 10: number of 32-bit words in the STATEBYTES (40-byte) state
#define NUMSTEPS 16
#define PARAL_INST_BY4 1
static const unsigned char SC0[16]={0x50,0x5c,0x91,0x8d,0x53,0x60,0x68,0xe1,0xf6,0x9d,0x40,0x4f,0xbe,0x5b,0xe9,0x7f}; //Step constants (SC_{2i})
static const unsigned char SC1[16]={0x28,0xae,0x48,0xc6,0xa9,0x30,0x34,0x70,0x7b,0xce,0x20,0x27,0x5f,0xad,0x74,0x3f}; //Step constants (SC_{2i+1})
static const unsigned char SC2[16]={0x14,0x57,0x24,0x63,0x54,0x18,0x9a,0x38,0xbd,0x67,0x10,0x13,0x2f,0xd6,0xba,0x1f}; //Step constants (SC_{2i+2})
static const unsigned char RC0[16]={0x07,0x0a,0x9b,0xe0,0xd1,0x1a,0x22,0xf7,0x62,0x96,0x71,0xaa,0x2b,0xe9,0xcf,0xb7};//Round constants (RC_{2i})
static const unsigned char RC1[16]={0x53,0x5d,0x49,0x7f,0xbe,0x1d,0x28,0x6c,0x82,0x47,0x6b,0x88,0xdc,0x8b,0x59,0xc6};//Round constants (RC_{2i+1})
static const unsigned char RC2[16]={0x43,0xe4,0x5e,0xcc,0x32,0x4e,0x75,0x25,0xfd,0xf9,0x76,0xa0,0xb0,0x09,0x1e,0xad};//Round constants (RC_{2i+2})
typedef unsigned long long int u64;
typedef unsigned int u32;
typedef unsigned int u8; // NOTE: despite its name, u8 is an unsigned int here, not an 8-bit type
#define ROT5(x) (_mm_slli_epi32(x, 5) | _mm_srli_epi32(x, 27))
#define ROT1(x) (_mm_slli_epi32(x, 1) | _mm_srli_epi32(x, 31))
#define RC(t1, t2) (_mm_set_epi32(0xfffffffe^t2, 0xfffffffe^t1, 0xfffffffe^t2, 0xfffffffe^t1))
#define SC(t1, t2) (_mm_set_epi32(0xffffff00^t2, 0xffffffff, 0xffffff00^t1, 0xffffffff ))
#define SWAPREG1(x) (_mm_shuffle_epi32(x, _MM_SHUFFLE(3, 1, 2, 0)))
#define SWAPBLK(x) (_mm_slli_si128(x, 8)|_mm_srli_si128(x, 8))
#define masklo (_mm_set_epi32(0x0, 0x0, 0xffffffff, 0xffffffff ))
#define maskhi (_mm_set_epi32(0xffffffff, 0xffffffff, 0x0, 0x0 ))
#define ROAX(x, y, t1, t2)\
__m128i xtmp;\
xtmp = x;\
x = (ROT5(x)&x)^ROT1(x)^RC(t1, t2)^y;\
y = xtmp;\
#define PACK_SSb(x, y)\
__m128i xtmp, ytmp;\
xtmp = SWAPREG1(x);\
ytmp = SWAPREG1(y);\
x = _mm_unpacklo_epi64(xtmp, ytmp);\
y = _mm_unpackhi_epi64(xtmp, ytmp);\
#define UNPACK_SSb(x, y)\
__m128i xtmp, ytmp;\
xtmp = _mm_unpacklo_epi32(x, y);\
ytmp = _mm_unpackhi_epi32(x, y);\
x = xtmp;\
y = ytmp;\
#define PACK(x, y, z, w, state, i1, i2, i3, i4)\
__m128i xtmp, ytmp;\
xtmp = _mm_loadu_si128((void *) (state + i1));\
ytmp = _mm_loadu_si128((void *) (state + i2));\
x = _mm_unpacklo_epi64(xtmp, ytmp);\
z = _mm_unpackhi_epi64(xtmp, ytmp);\
xtmp = _mm_loadu_si128((void *) (state + i3));\
ytmp = _mm_loadu_si128((void *) (state + i4));\
y = _mm_unpacklo_epi64(xtmp, ytmp);\
w = _mm_unpackhi_epi64(xtmp, ytmp);\
#define UNPACK(x, y, z, w)\
__m128i xtmp, ytmp;\
xtmp = _mm_unpacklo_epi64(x, z);\
ytmp = _mm_unpackhi_epi64(x, z);\
x = xtmp;\
z = ytmp;\
xtmp = _mm_unpacklo_epi64(y, w);\
ytmp = _mm_unpackhi_epi64(y, w);\
y = xtmp;\
w = ytmp;\
void ace320( u32 *state );
int crypto_hash( u32 *out, u32 *in, u64 inlen );
typedef unsigned long long int u64;
u64 start_rdtsc( )
unsigned high, low;
__asm__ volatile("CPUID\n\t"
"mov %%edx, %0\n\t"
"mov %%eax, %1\n\t": "=r" (high),
"=r" (low):: "%rax", "%rbx", "%rcx", "%rdx");
return ( ((u64)low) | (((u64)high) << 32));
u64 end_rdtsc( )
unsigned high, low;
__asm__ volatile("RDTSCP\n\t"
"mov %%edx, %0\n\t"
"mov %%eax,%1\n\t"
"CPUID\n\t": "=r" (high), "=r" (low)::
"%rax", "%rbx", "%rcx", "%rdx");
return ( ((u64)low) | (((u64)high) << 32));
static inline u64 cpucycles( )
u64 result;
asm volatile (".byte 15;.byte 49;shlq $32,%%rdx;orq %%rdx,%%rax"
: "=a" (result) :: "%rdx");
return result;
/*#ifdef __x86_64__
#define mycpucycles(RES) \
__asm__ volatile("rdtsc;shlq $32,%%rdx;orq %%rdx,%%rax" : "=a" (RES) :: "%rdx");
#define mycpucycles(RES) \
__asm__ volatile(".byte 15;.byte 49" : "=A" (RES));
\ No newline at end of file
/* Reference implementation of ACE-Hash256
Written by:
Kalikinkar Mandal <>
#include "ace.h"
#include "clock_cycle.h"
#define NUM_ITER 2000
#define NUM_TEST 500
void print_state ( u32 *state )
u8 i, j;
for ( j = 0; j < 4*PARAL_INST_BY4; j++ )
for ( i = 0; i < STATEDWORD; i++ )
printf("%.8X", state[i+j*STATEDWORD]);
int main()
u8 num_parallel_inst;
u32 *state;
int i, j;
u64 t[NUM_ITER+1], count_cc;
u32 *plaintext, *digest;
u64 plen;
u32 hlen;
num_parallel_inst = 4*PARAL_INST_BY4;
plen = 32; // Message length = plen*32;
hlen = 8; //256 = 32*8 bits
digest = (u32 *)malloc(sizeof(u32)*hlen*num_parallel_inst);
plaintext = (u32 *)malloc(sizeof(u32)*plen*num_parallel_inst);
state = (u32 *)malloc(sizeof(u32)*num_parallel_inst*STATEDWORD);
//Randomly generating messages//
for ( i = 0; i < num_parallel_inst; i++ )
for ( j = 0; j < plen; j++ )
plaintext[i*plen+j] = j%128;
// HASH Mode//
//Testing speed for ACE-Hash
for ( i = 0; i < NUM_ITER; i++ )
count_cc = start_rdtsc();
crypto_hash ( digest, plaintext, plen );
count_cc = end_rdtsc()-count_cc;
printf("Hash speed = %f cpb\n", (double)(count_cc)/(double)(num_parallel_inst*plen*4));
printf("Hash speed = %f cbp\n", (double)(count_cc)/(double)(num_parallel_inst*plen*4));
//Conputing hash
crypto_hash ( digest, plaintext, plen );
printf("Original plaintext:\n");
for ( i = 0; i < num_parallel_inst; i++ )
for ( j = 0; j < plen; j++ )
printf("%08X", plaintext[i*plen+j]);
for ( i = 0; i < num_parallel_inst; i++ )
for ( j = 0; j < hlen; j++ )
printf("%08X", digest[i*hlen+j]);
architecture rtl of ace is
signal ctl_control : ace_ctl_ty;
signal ctl_onehot : onehot_ty;
signal ctl_lfsr_en : std_logic;
signal ctl_lfsr_reset : std_logic;
u_dp :
entity work.dp port map
( clk => clk
, reset => reset
, i_mode => i_mode
, i_control => ctl_control
, i_onehot => ctl_onehot
, i_dom_sep => i_dom_sep
, i_valid => i_valid
, i_data => i_data
, i_padding => i_padding
, o_data => o_data
u_ctl :
entity work.ctl port map
( clk => clk
, reset => reset
, i_mode => i_mode
, i_dom_sep => i_dom_sep
, i_valid => i_valid
, i_padding => i_padding
, o_valid => o_valid
, o_onehot => ctl_onehot
, o_ready => o_ready
, o_control => ctl_control
end architecture;
library ieee;
use ieee.std_logic_1164.all;
use ieee.numeric_std.all;
use work.ace_pkg.all;
entity ace is
( clk : in std_logic;
reset : in std_logic;
i_mode : in mode_ty;
i_dom_sep : in domsep_ty;
i_valid : in std_logic;
i_data : in word;
i_padding : in std_logic;
o_valid : out std_logic;
o_ready : out std_logic;
o_data : out word
end entity;
library ieee;
use ieee.std_logic_1164.all;
use ieee.numeric_std.all;
package ace_pkg is
--for constants
constant lfsr_c_sz : integer := 7;
subtype lfsr_c_output is std_logic_vector(0 to lfsr_c_sz+2);
constant half_word_sz : natural := 32;
constant word_sz : natural := 2*half_word_sz;
subtype half_word is std_logic_vector( 0 to half_word_sz - 1 );
subtype word is std_logic_vector( 0 to word_sz - 1 );
type word_vector is array( natural range <> ) of word;
type half_word_vector is array( natural range <> ) of half_word;
-- A, B, C, D, E
constant state_sz : natural := 320;
constant word_max_idx : natural := state_sz / word_sz - 1;
constant half_word_max_idx : natural := state_sz / half_word_sz - 1;
constant key_sz : natural := 128;
constant nonce_sz : natural := 128;
subtype word_state_ty is word_vector ( 0 to word_max_idx );
constant a_idx : natural := 0;
constant b_idx : natural := 1;
constant c_idx : natural := 2;
constant d_idx : natural := 3;
constant e_idx : natural := 4;
subtype half_word_data is half_word_vector ( 0 to 1 );
subtype half_word_state_ty is half_word_vector ( 0 to half_word_max_idx );
constant a0_idx : natural := 1;
constant a1_idx : natural := 0;
constant b0_idx : natural := 3;
constant b1_idx : natural := 2;
constant c0_idx : natural := 5;
constant c1_idx : natural := 4;
constant d0_idx : natural := 7;
constant d1_idx : natural := 6;
constant e0_idx : natural := 9;
constant e1_idx : natural := 8;
function b2x( b : boolean ) return std_logic;
function half_words_to_words( st : half_word_state_ty ) return word_state_ty;
function words_to_half_words( st : word_state_ty ) return half_word_state_ty;
-- mode
subtype mode_ty is std_logic_vector( 1 downto 0 ); -- top lvl input
constant encrypt_mode : mode_ty := ( 1 => '0', 0 => '0' );
constant decrypt_mode : mode_ty := ( 1 => '0', 0 => '1' );
constant absorb_mode : mode_ty := ( 1 => '1', 0 => '0' );
constant squeeze_mode : mode_ty := ( 1 => '1', 0 => '1' );
subtype domsep_ty is std_logic_vector( 1 downto 0 ); -- top lvl input
-- derived control (from counter and more)
subtype ace_ctl_ty is std_logic_vector( 7 downto 0 );
constant absorb_idx : natural := 0;
constant replace_idx : natural := 1;
constant output_idx : natural := 2;
constant endstep_idx : natural := 3;
constant permoff_idx : natural := 4;
constant squeeze_idx : natural := 5;
constant lfsr_c_reset_idx : natural := 6;
constant lfsr_c_en_idx : natural := 7;
-- extras cntl for load, init, fin, tag, sqeeze
subtype onehot_ty is std_logic_vector( 3 downto 0); -- extrs cntl for load, init, fin, tag, sqeeze
-- round and step counters
-- use last bit for end ACE perm - for o_ready
-- -> i_valid will reset the counter!
-- counter only runs if msb = 0
constant bits_counter : natural := 8;
subtype count_ty is unsigned( bits_counter - 1 downto 0 );
-- standard vhdl operators
-- function "sll"( a : half_word; n : natural ) return half_word;
function onehot_rotate (a : onehot_ty) return onehot_ty;
function vector_to_data ( st : half_word_data ) return word;
function data_to_vector ( st : word ) return half_word_data;
end package;
package body ace_pkg is
-- onehot_rotate: rotates the vector by one position toward the higher index;
-- the bit at onehot_ty'high wraps around to index 0.
function onehot_rotate (a : onehot_ty)
return onehot_ty
variable z : onehot_ty;
z(onehot_ty'high downto 1) := a(onehot_ty'high - 1 downto 0);
z(0) := a(onehot_ty'high);
return z;
end function;
function b2x( b : boolean ) return std_logic is
if b then
return '1';
return '0';
end if;
end function;
-- standard vhdl operators cast to state
-- function "sll"( a : half_word; n : natural ) return half_word is
-- begin
-- return half_word( std_logic_vector( a ) sll n );
-- end function;
-- state functions
-- half_words_to_words: packs consecutive pairs of half words into words.
-- For each word i, half word 2*i fills indices 0 .. half_word_sz-1 and half
-- word 2*i+1 fills indices half_word_sz .. word_sz-1. With the package index
-- constants (a1_idx = 0, a0_idx = 1, ...) the "1" half therefore lands first.
-- NOTE(review): the original author flagged this mapping as needing a check.
function half_words_to_words( st : half_word_state_ty ) ---- CHECK THIS!!!!!! PLEASE
return word_state_ty
variable i : natural;
variable z : word_state_ty;
main_loop : for i in 0 to word_max_idx loop
z(i)(0 to half_word_sz - 1) := st(2*i);
z(i)(half_word_sz to word_sz - 1) := st(2*i+1);
end loop;
return z;
end function;
-- Split the word state into half-words: half-word 2*i is bits 0 to
-- half_word_sz-1 of word i, half-word 2*i+1 is bits half_word_sz to
-- word_sz-1 of word i.
function words_to_half_words( st : word_state_ty )
  return half_word_state_ty
is
  variable z : half_word_state_ty;
begin
  -- The loop parameter i is declared implicitly by the for-loop; no separate
  -- variable is needed.
  main_loop : for i in 0 to word_max_idx loop
    z(2*i) := st(i)(0 to half_word_sz - 1);
    z(2*i+1) := st(i)(half_word_sz to word_sz - 1);
  end loop;
  return z;
end function;
-- Split one data word into two half-words: element 0 is bits 0 to
-- half_word_sz-1, element 1 is bits half_word_sz to word_sz-1.
function data_to_vector( st : word )
  return half_word_data
is
  variable z : half_word_data;
begin
  z(0) := st(0 to half_word_sz - 1);
  z(1) := st(half_word_sz to word_sz - 1);
  return z;
end function;
-- Join two half-words into one data word: element 0 fills bits 0 to
-- half_word_sz-1, element 1 fills bits half_word_sz to word_sz-1.
function vector_to_data( st : half_word_data )
  return word
is
  variable z : word;
begin
  z(0 to half_word_sz - 1) := st(0);
  z(half_word_sz to word_sz - 1) := st(1);
  return z;
end function;
end package body;
# Simulator script: adds waveforms when running with a GUI, dumps a VCD file,
# and runs the simulation to completion.
# Reads the variables gui_mode and sim_mode, which are set outside this listing.
# NOTE(review): the closing braces of the if-blocks and the body of the final
# else branch are not present in this listing - confirm against the original.
if { $gui_mode } {
# Top-level testbench signals.
add wave clk
add wave reset
add wave i_mode
add wave i_dom_sep
add wave o_ready
add wave i_valid
add wave i_data
add wave i_padding
add wave o_valid
add wave o_data
# Internal signals are only added in PROG_MODE.
if { $sim_mode eq "PROG_MODE" } then {
# Handshake and data signals of the control unit (u_ctl) and datapath (u_dp).
add wave -noupdate -divider -height 32 STUFF
add wave /uut/u_ctl/state
add wave /uut/u_ctl/o_ready
add wave /uut/u_ctl/i_valid
add wave /uut/u_dp/i_data
add wave /uut/u_ctl/o_valid
add wave /uut/u_dp/o_data
# Datapath internals.
add wave -noupdate -divider -height 32 DP
add wave -radix binary /uut/u_dp/ctl_const
add wave /uut/u_dp/i_data
add wave /uut/u_dp/o_data
# NOTE(review): ctl_const was already added three lines above; this repeats it.
add wave -radix binary /uut/u_dp/ctl_const
add wave /uut/u_dp/lfsr_c_en
add wave /uut/u_dp/lfsr_c_reset
add wave /uut/u_dp/permoff
add wave /uut/u_dp/endstep
add wave /uut/u_dp/absorb
add wave /uut/u_dp/replace
add wave /uut/u_dp/output
add wave /uut/u_dp/dsxor
add wave /uut/u_dp/post_input
add wave /uut/u_dp/pre_round
add wave /uut/u_dp/post_round
add wave /uut/u_dp/post_xor
add wave /uut/u_dp/post_step_const
add wave /uut/u_dp/post_linear
add wave /uut/u_dp/ace_path
add wave /uut/u_dp/ace_state
# Control unit internals.
add wave -noupdate -divider -height 32 CTL
add wave /uut/u_ctl/state
add wave -radix unsigned /uut/u_ctl/count
add wave /uut/u_ctl/i_valid
add wave /uut/u_ctl/o_valid
add wave /uut/u_ctl/o_ready
add wave -radix binary /uut/u_ctl/onehot
add wave /uut/u_ctl/lfsr_c_reset
add wave -radix binary /uut/u_ctl/i_mode
add wave -radix binary /uut/u_ctl/i_dom_sep
# Value-change dump: record to ace.vcd, run the whole simulation, then
# checkpoint, stop and flush the dump.
vcd file ace.vcd
vcd add /ace_tb/uut/*
vcd add -r *
vcd on
run -all
vcd checkpoint
vcd off
vcd flush
# In GUI mode, fit the whole run into the wave window.
if { $gui_mode } {
wave zoom full
} else {
library ieee;
use ieee.std_logic_1164.all;
use ieee.numeric_std.all;
use work.ace_pkg.all;
-- Datapath entity of the ACE core.
--   clk, reset : clock and reset inputs
--   i_mode     : operating mode (mode_ty)
--   i_control  : derived control bits, indexed by the *_idx constants of ace_pkg
--   i_onehot   : one-hot extra control vector
--   i_dom_sep  : 2-bit domain separator
--   i_valid    : input-valid strobe
--   i_data     : input data word
--   i_padding  : padding indicator
--   o_data     : output data word
entity dp is
  port
    ( clk : in std_logic
    ; reset : in std_logic
    ; i_mode : in mode_ty
    ; i_control : in ace_ctl_ty
    ; i_onehot : in onehot_ty
    ; i_dom_sep : in domsep_ty
    ; i_valid : in std_logic
    ; i_data : in word
    ; i_padding : in std_logic
    ; o_data : out word
    );
end entity;
-- RTL architecture of the ACE datapath. One pass through the combinational
-- chain is: post_input -> pre_round -> post_round -> post_xor ->
-- post_step_const -> post_linear -> ace_path, which is registered into
-- ace_state on the rising clock edge.
-- NOTE(review): the "begin" keyword, the closing ");" of the port maps and the
-- "process" header before the final "wait" are not present in this listing -
-- confirm against the original file.
architecture rtl of dp is
-- individual control bits unpacked from i_control
signal permoff, endstep, squeeze,
absorb, replace, output,
lfsr_reset, lfsr_en : std_logic;
-- registered state (half-word view) and state after input processing
signal ace_state, post_input : half_word_state_ty;
-- intermediate stages of one round/step (word view)
signal pre_round, post_round,
post_xor, post_step_const,
post_linear, ace_path : word_state_ty;
-- domain separator extended to a half-word
signal dsxor : half_word;
signal i_data_vector, o_data_vector : half_word_data;
-- constant bits delivered by the LFSR
signal ctl_const : lfsr_c_output;
-- constant generator
-- NOTE(review): this instantiates entity "lfsr" with ports reset / lfsr_en;
-- the entity shown elsewhere in this dump is "lfsr_c" with ports
-- lfsr_c_reset / lfsr_c_en - confirm which version is intended.
u_lfsr :
entity work.lfsr port map
( clk => clk
, reset => lfsr_reset
, lfsr_en => lfsr_en
, o_const => ctl_const
-- convert between the word and half-word-vector views of the data ports
i_data_vector <= data_to_vector( i_data );
o_data <= vector_to_data( o_data_vector );
-- unpack the control vector
absorb <= i_control( absorb_idx );
replace <= i_control( replace_idx );
output <= i_control( output_idx );
endstep <= i_control( endstep_idx );
permoff <= i_control( permoff_idx );
squeeze <= i_control( squeeze_idx );
-- NOTE(review): the package tail visible in this dump declares
-- lfsr_c_reset_idx and lfsr_c_en_idx, not lfsr_reset_idx / lfsr_en_idx -
-- confirm these names resolve.
lfsr_reset <= i_control( lfsr_reset_idx );
lfsr_en <= i_control( lfsr_en_idx );
-- post input: do input and domain separator and replace
-- In this listing every half-word except e0 is passed through unchanged.
post_input( a1_idx ) <= ace_state( a1_idx );
post_input( a0_idx ) <= ace_state( a0_idx );
post_input( b0_idx ) <= ace_state( b0_idx );
post_input( b1_idx ) <= ace_state( b1_idx );
post_input( c0_idx ) <= ace_state( c0_idx );
post_input( c1_idx ) <= ace_state( c1_idx );
post_input( d0_idx ) <= ace_state( d0_idx );
post_input( d1_idx ) <= ace_state( d1_idx );
post_input( e1_idx ) <= ace_state( e1_idx );
-- dsxor is all zeros except its two highest-index bits, which carry i_dom_sep
dsxor( 0 to half_word_sz - 3 ) <= ( others => '0' );
dsxor( half_word_sz - 2) <= i_dom_sep(1);
dsxor( half_word_sz - 1) <= i_dom_sep(0);
-- the domain separator is XORed into e0 only while i_valid is asserted
post_input( e0_idx ) <= dsxor xor ace_state( e0_idx ) when (i_valid = '1')
else ace_state( e0_idx );
-- NOTE(review): only element 0 of o_data_vector is driven in this listing.
o_data_vector(0) <= ace_state(a1_idx);
-- sb 64 ==> post round
-- words a, c and e go through an sb_64 instance; b and d pass through
pre_round <= half_words_to_words( post_input );
a_sb_64 :
entity work.sb_64 port map
( i_state => pre_round( a_idx )
, i_rc => ctl_const( lfsr_c_sz + 2 ) --rc0
, o_state => post_round( a_idx )
post_round( b_idx ) <= pre_round( b_idx );
c_sb_64 :
entity work.sb_64 port map
( i_state => pre_round( c_idx )
, i_rc => ctl_const( lfsr_c_sz + 1 ) --rc1
, o_state => post_round( c_idx )
post_round( d_idx ) <= pre_round( d_idx );
e_sb_64 :
entity work.sb_64 port map
( i_state => pre_round( e_idx )
, i_rc => ctl_const( lfsr_c_sz ) -- rc2
, o_state => post_round( e_idx )
-- XORs to the left ==> post xor
-- b ^= c, d ^= e, e ^= a; a and c are unchanged
post_xor( a_idx ) <= post_round( a_idx );
post_xor( c_idx ) <= post_round( c_idx );
post_xor( b_idx ) <= post_round( b_idx ) xor post_round( c_idx );
post_xor( d_idx ) <= post_round( d_idx ) xor post_round( e_idx );
post_xor( e_idx ) <= post_round( e_idx ) xor post_round( a_idx );
-- XOR with step constant ==> post step const
-- for b, d, e: bits 0 to 55 are inverted, bits 56 to 63 are XORed with a
-- slice of ctl_const (the slice must be 8 bits wide to match; presumably
-- lfsr_c_sz = 7 - confirm in ace_pkg)
post_step_const( a_idx ) <= post_xor( a_idx );
post_step_const( c_idx ) <= post_xor( c_idx );
post_step_const( b_idx)( 0 to 55) <= not post_xor( b_idx )( 0 to 55 );
post_step_const( b_idx)( 56 to 63) <= post_xor( b_idx )( 56 to 63 ) xor ctl_const( 2 to lfsr_c_sz + 2 ); -- sc0
post_step_const( d_idx)( 0 to 55) <= not post_xor( d_idx )( 0 to 55 );
post_step_const( d_idx)( 56 to 63) <= post_xor( d_idx )( 56 to 63 ) xor ctl_const( 1 to lfsr_c_sz + 1 ); -- sc1
post_step_const( e_idx)( 0 to 55) <= not post_xor( e_idx )( 0 to 55 );
post_step_const( e_idx)( 56 to 63) <= post_xor( e_idx )( 56 to 63 ) xor ctl_const( 0 to lfsr_c_sz ); --sc2
-- post linear layer pi = (3,2,0,4,1) ==> post linear
-- word permutation: a <- d, b <- c, c <- a, d <- e, e <- b
post_linear( a_idx ) <= post_step_const( d_idx );
post_linear( b_idx ) <= post_step_const( c_idx );
post_linear( c_idx ) <= post_step_const( a_idx );
post_linear( d_idx ) <= post_step_const( e_idx );
post_linear( e_idx ) <= post_step_const( b_idx );
-- update state
-- at the end of a step take the fully processed word, otherwise only the
-- sb_64 round output
ace_path( a_idx ) <= post_linear( a_idx ) when endstep = '1' else post_round( a_idx );
ace_path( b_idx ) <= post_linear( b_idx ) when endstep = '1' else post_round( b_idx );
ace_path( c_idx ) <= post_linear( c_idx ) when endstep = '1' else post_round( c_idx );
ace_path( d_idx ) <= post_linear( d_idx ) when endstep = '1' else post_round( d_idx );
ace_path( e_idx ) <= post_linear( e_idx ) when endstep = '1' else post_round( e_idx );
-- state register: capture the selected path on every rising clock edge
wait until rising_edge( clk );
ace_state <= words_to_half_words(ace_path);
end process;
end architecture;
library ieee;
use ieee.std_logic_1164.all;
use work.ace_pkg.all;
-- LFSR that generates the round/step constant bits.
--   clk          : clock
--   lfsr_c_en    : advance the register when '1'
--   lfsr_c_reset : synchronously load all-ones when '1' (takes priority)
--   o_const      : current register bits plus the computed feedback bits
entity lfsr_c is
  port
    ( clk : in std_logic
    ; lfsr_c_en : in std_logic
    ; lfsr_c_reset : in std_logic
    ; o_const : out lfsr_c_output
    );
end lfsr_c;
-- RTL of the constant LFSR. sa is the register; xa is sa extended by three
-- feedback bits, so that one clock advances the register by three positions.
architecture rtl of lfsr_c is
  signal sa: std_logic_vector(lfsr_c_sz - 1 downto 0);
  signal xa: std_logic_vector(lfsr_c_sz + 2 downto 0);
begin
  -- 10 output bits for the constants
  o_const <= xa; -- "to" type <= "downto" type. Index flip intended

  -- just rename signal
  xa(lfsr_c_sz-1 downto 0) <= sa(lfsr_c_sz-1 downto 0);

  -- for updates and outputs: each feedback bit is the XOR of two adjacent
  -- low-order register bits
  xa(lfsr_c_sz + 2 downto lfsr_c_sz) <= xa(3 downto 1) xor xa(2 downto 0);

  -- one flip-flop per register bit: synchronous reset to '1', otherwise
  -- shift by three positions when enabled
  lfsr_shift: for i in lfsr_c_sz-1 downto 0 generate
    lfsr_step: process(clk) begin
      if rising_edge(clk) then
        if lfsr_c_reset ='1' then
          sa(i) <= '1';
        elsif lfsr_c_en ='1' then
          sa(i) <= xa(i+3);
        end if;
      end if;
    end process;
  end generate lfsr_shift;
end architecture;
------------ ACE readme file ---------------
----- list of files for ACE synthesis: -----
ace_pkg.vhd -- main package
sb_64.vhd -- s-box with simeck
lfsr.vhd -- lfsr for step / round constant generation
ctl.vhd -- control (FSM)
dp.vhd -- datapath
ace.vhd -- top level entity declaration
ace-rtl.vhd -- top level architecture
----- additional files for simulation: -----
util_unsynth.vhd -- functions used in TB (general purpose)
ace_unsynth.vhd -- specific ACE functions and procedures used in TB
ace_tb.vhd -- ACE testbench
-------------- pure datapath ---------------
dp_pure.vhd -- datapath with most input/output multiplexers removed
----------- TB info (ace_tb.vhd): ----------
EDH is a 3-bit constant used to select which modes to test
"100" - encryption only
"010" - decryption only
"001" - hash only
"110" - encryption and decryption
stim_file_path -- stimulus file
output_file_path -- output file
------------ stimulus file format --------------
1 file = 1 set of Key, Nonce, AD, Plaintext and Ciphertext
K 00111122335588DD00111122335588DD <--- 128 bits of Key (all 128 bits in a single line)
N 111122335588DD00111122335588DD00 <--- 128 bits of Nonce (all 128 bits in a single line)
A 1122335588DD00111122335588DD00 <--- from 4 to 128 bits of AD
P 335588DD00111122335588DD001111 <--- from 4 to 128 bits of Plaintext
C F9362385DC213A07CEFEF38C34CEFF <--- from 4 to 128 bits of Ciphertext
--- padding is done by testbench
--- multiple lines for AD, Plaintext and Ciphertext are supported
library ieee;
use ieee.std_logic_1164.all;
use ieee.numeric_std.all;
use work.ace_pkg.all;
-- One round of the 64-bit Simeck-based s-box.
--   i_state : 64-bit input word
--   i_rc    : round-constant bit for this round
--   o_state : 64-bit output word
entity sb_64 is
  port
    ( i_state : in word
    ; i_rc : in std_logic
    ; o_state : out word
    );
end entity;
-- Combinational Simeck round:
--   z1 = (rotl(x1, 5) and x1) xor rotl(x1, 1) xor x0 xor rc
--   z0 = x1
-- where x1 is the first half of i_state, x0 the second half, and rc is all
-- ones except its last bit, which is i_rc.
architecture rtl of sb_64 is
  signal x0, x1, z0, z1 : half_word;
  signal rc : half_word;
begin
  x1 <= i_state( 0 to half_word_sz-1 );
  x0 <= i_state( half_word_sz to word_sz - 1 );

  rc <= ( 0 to half_word_sz - 2 => '1', half_word_sz - 1 => i_rc );

  z0 <= x1;
  -- With ascending ("to") indexing, moving bits 5.. to the front is a left
  -- rotation by 5; likewise by 1 on the next line.
  z1 <= ( ( x1(5 to half_word_sz - 1) & x1 (0 to 4) ) and x1)
        xor ( x1(1 to half_word_sz - 1) & x1 (0) )
        xor x0
        xor rc;

  o_state <= z1 & z0;
end architecture;
# Build the ACE-Hash256 known-answer-test generator.
# NISTGCCFLAGS enables strict warnings plus the address/undefined sanitizers.
NISTGCCFLAGS =-std=c99 -Wall -Wextra -Wshadow -fsanitize=address,undefined -O2

# Default goal. The previous prerequisite "acehash256_1" had no rule in this
# Makefile; the only link rule produces "acehash256".
all: acehash256

acehash256: genkat_hash.c hash.c ace.c
	$(CC) $(NISTGCCFLAGS) -o $@ $^ $(LFLAGS)

.PHONY: all clean
# The leading "-" lets make continue when the binary does not exist.
clean:
	-rm acehash256
/* Reference implementation of the ACE permutation
Written by:
Kalikinkar Mandal <>
#include "ace.h"
// Per-step constant tables, one entry per step (16 entries each).
// ace_permutation() indexes all six tables with its step counter: RC0/RC1/RC2
// are passed to simeck64_box() for the A/C/E blocks, and SC0/SC1/SC2 are
// XORed into the last byte of the updated E/A/D blocks respectively.
static const unsigned char SC0[16]={0x50,0x5c,0x91,0x8d,0x53,0x60,0x68,0xe1,0xf6,0x9d,0x40,0x4f,0xbe,0x5b,0xe9,0x7f}; //Step constants (SC_{2i})
static const unsigned char SC1[16]={0x28,0xae,0x48,0xc6,0xa9,0x30,0x34,0x70,0x7b,0xce,0x20,0x27,0x5f,0xad,0x74,0x3f}; //Step constants (SC_{2i+1})
static const unsigned char SC2[16]={0x14,0x57,0x24,0x63,0x54,0x18,0x9a,0x38,0xbd,0x67,0x10,0x13,0x2f,0xd6,0xba,0x1f}; //Step constants (SC_{2i+2})
static const unsigned char RC0[16]={0x07,0x0a,0x9b,0xe0,0xd1,0x1a,0x22,0xf7,0x62,0x96,0x71,0xaa,0x2b,0xe9,0xcf,0xb7};//Round constants (RC_{2i})
static const unsigned char RC1[16]={0x53,0x5d,0x49,0x7f,0xbe,0x1d,0x28,0x6c,0x82,0x47,0x6b,0x88,0xdc,0x8b,0x59,0xc6};//Round constants (RC_{2i+1})
static const unsigned char RC2[16]={0x43,0xe4,0x5e,0xcc,0x32,0x4e,0x75,0x25,0xfd,0xf9,0x76,0xa0,0xb0,0x09,0x1e,0xad};//Round constants (RC_{2i+2})
/*
 * Byte-wise building block for a multi-byte left rotation.
 * Returns the low 8 bits of (x << shift) combined with the top `shift` bits
 * of the following byte y.
 * shift must be in the range 0..8; other values make the right-shift count
 * negative, which is undefined behaviour.
 */
unsigned char rotl8 ( const unsigned char x, const unsigned char y, const unsigned char shift )
{
	/* Operands are promoted to int; the cast makes the truncation to 8 bits explicit. */
	return (unsigned char)((x << shift) | (y >> (8 - shift)));
}
******* ACE permutation implementation********************
void ace_print_state( const unsigned char *state )
unsigned char i;
for ( i = 0; i < STATEBYTES; i++ )
printf("%.2x ", state[i]);
/*
 * Print `xlen` bytes of `x` to stdout as two-digit lowercase hex values,
 * each followed by a space. No newline is written.
 * The parameter is spelled `unsigned char` to match the prototype in ace.h.
 */
void ace_print_data(const unsigned char *x, const uint32_t xlen )
{
	uint32_t j;

	for ( j = 0; j < xlen; j++ )
	{
		printf("%.2x ", x[j]);
	}
}
void simeck64_box( unsigned char *output, const unsigned char *input, const unsigned char rc )
unsigned char i, t;
unsigned char *tmp_shift_1, *tmp_shift_5, *tmp_pt;
tmp_shift_1 = (unsigned char *)malloc(4*sizeof(unsigned char));
tmp_shift_5 = (unsigned char *)malloc(4*sizeof(unsigned char));
tmp_pt = (unsigned char *)malloc(SIMECKBYTES*sizeof(unsigned char));
for ( i = 0; i < SIMECKBYTES; i++ )
tmp_pt[i] = input[i];
for ( i = 0; i < SIMECKROUND; i++ )
tmp_shift_1[0] = rotl8(tmp_pt[0], tmp_pt[1],1);
tmp_shift_1[1] = rotl8(tmp_pt[1], tmp_pt[2],1);
tmp_shift_1[2] = rotl8(tmp_pt[2], tmp_pt[3],1);
tmp_shift_1[3] = rotl8(tmp_pt[3], tmp_pt[0],1);
tmp_shift_5[0] = rotl8(tmp_pt[0], tmp_pt[1],5);
tmp_shift_5[1] = rotl8(tmp_pt[1], tmp_pt[2],5);
tmp_shift_5[2] = rotl8(tmp_pt[2], tmp_pt[3],5);
tmp_shift_5[3] = rotl8(tmp_pt[3], tmp_pt[0],5);
tmp_shift_5[0] = tmp_shift_5[0]&tmp_pt[0];
tmp_shift_5[1] = tmp_shift_5[1]&tmp_pt[1];
tmp_shift_5[2] = tmp_shift_5[2]&tmp_pt[2];
tmp_shift_5[3] = tmp_shift_5[3]&tmp_pt[3];
tmp_shift_1[0] = tmp_shift_1[0]^tmp_shift_5[0];
tmp_shift_1[1] = tmp_shift_1[1]^tmp_shift_5[1];
tmp_shift_1[2] = tmp_shift_1[2]^tmp_shift_5[2];
tmp_shift_1[3] = tmp_shift_1[3]^tmp_shift_5[3];
tmp_shift_1[0] = tmp_shift_1[0]^tmp_pt[4]^(0xff);
tmp_shift_1[1] = tmp_shift_1[1]^tmp_pt[5]^(0xff);
tmp_shift_1[2] = tmp_shift_1[2]^tmp_pt[6]^(0xff);
tmp_shift_1[3] = tmp_shift_1[3]^tmp_pt[7]^(0xfe);
t = (rc >> i)&1;
tmp_shift_1[3] = tmp_shift_1[3]^t;
tmp_pt[4] = tmp_pt[0];
tmp_pt[5] = tmp_pt[1];
tmp_pt[6] = tmp_pt[2];
tmp_pt[7] = tmp_pt[3];
tmp_pt[0] = tmp_shift_1[0];
tmp_pt[1] = tmp_shift_1[1];
tmp_pt[2] = tmp_shift_1[2];
tmp_pt[3] = tmp_shift_1[3];
//simeck_print_data(tmp_pt, 8);
for ( i = 0; i < SIMECKBYTES; i++ )
output[i] = tmp_pt[i];
void ace_permutation( unsigned char *input )
unsigned char i, j;
unsigned char *tmp_inp, *tmp_a, *tmp_c, *tmp_e;
tmp_inp = (unsigned char *)malloc(STATEBYTES*sizeof(unsigned char));
tmp_a = (unsigned char *)malloc(SIMECKBYTES*sizeof(unsigned char));
tmp_c = (unsigned char *)malloc(SIMECKBYTES*sizeof(unsigned char));
tmp_e = (unsigned char *)malloc(SIMECKBYTES*sizeof(unsigned char));
for ( i = 0; i < STATEBYTES; i++ )
tmp_inp[i] = input[i];
for ( i = 0; i < NUMSTEPS; i++ )
//A block
for ( j = 0; j < SIMECKBYTES; j++ )
tmp_a[j] = tmp_inp[j];
simeck64_box( tmp_a, tmp_a, RC0[i] );
//C block
for ( j = 0; j < SIMECKBYTES; j++ )
tmp_c[j] = tmp_inp[2*SIMECKBYTES+j];
simeck64_box( tmp_c, tmp_c, RC1[i] );
//E block
for ( j = 0; j < SIMECKBYTES; j++ )
tmp_e[j] = tmp_inp[4*SIMECKBYTES+j];
simeck64_box( tmp_e, tmp_e, RC2[i] );
// Update A: A <= SC_{3i+1}+D+F(E)
for ( j = 0; j < SIMECKBYTES-1; j++ )
tmp_inp[j] = tmp_inp[3*SIMECKBYTES+j]^tmp_e[j]^(0xff);
tmp_inp[SIMECKBYTES-1] = tmp_inp[4*SIMECKBYTES-1]^tmp_e[SIMECKBYTES-1]^SC1[i];
// Update E: E <= SC_{3i}+B+F(C)
for ( j = 0; j < SIMECKBYTES-1; j++ )
tmp_inp[4*SIMECKBYTES+j] = tmp_inp[SIMECKBYTES+j]^tmp_c[j]^(0xff);
tmp_inp[5*SIMECKBYTES-1] = tmp_inp[2*SIMECKBYTES-1]^tmp_c[SIMECKBYTES-1]^SC0[i];
// Update B: B <= F(C)
for ( j = 0; j < SIMECKBYTES; j++ )
tmp_inp[SIMECKBYTES+j] = tmp_c[j];
// Update C: C <= F(A)
for ( j = 0; j < SIMECKBYTES; j++ )
tmp_inp[2*SIMECKBYTES+j] = tmp_a[j];
// Update D: D <= SC_{3i+2}+F(A)+F(E)
for ( j = 0; j < SIMECKBYTES-1; j++ )
tmp_inp[3*SIMECKBYTES+j] = tmp_a[j]^tmp_e[j]^(0xff);
tmp_inp[4*SIMECKBYTES-1] = tmp_a[SIMECKBYTES-1]^tmp_e[SIMECKBYTES-1]^SC2[i];
//ace_print_state256(tmp_pt); // Printing intermediate state
for ( i = 0; i < STATEBYTES; i++ )
input[i] = tmp_inp[i];
// Fills all STATEBYTES bytes of `state` with 0x00.
// NOTE(review): the name suggests the permutation is then applied to this
// state, but no such call is present in this listing - confirm.
void ace_permutation_ALLZERO ( unsigned char *state )
unsigned char i;
for ( i = 0; i < STATEBYTES; i++ )
state[i] = 0x0;
// Fills all STATEBYTES bytes of `state` with 0xff.
// NOTE(review): the name suggests the permutation is then applied to this
// state, but no such call is present in this listing - confirm.
void ace_permutation_ALLONE ( unsigned char *state )
unsigned char i;
for ( i = 0; i < STATEBYTES; i++ )
state[i] = 0xff;
// Debug print of the state, left disabled.
//ace_print_state( state );
/* Reference implementation of the ACE permutation
Written by:
Kalikinkar Mandal <>
// Public declarations for the ACE permutation and ACE-Hash.
// NOTE(review): the closing #endif of this include guard, the includes that
// provide uint32_t, and the closing ");" of the crypto_hash prototype are not
// present in this listing - confirm against the original header.
#ifndef ACE_H
#define ACE_H
#define STATEBYTES 40 //Number OF BYTES = 320/8 = 40
#define SIMECKBYTES 8 //Number of Simeck BYTES = 64/8 = 8
#define SIMECKROUND 8 //Number of rounds
#define NUMSTEPS 16 //Number of steps
typedef unsigned long long u64;
// Low 8 bits of (x << shift) | (y >> (8 - shift)).
unsigned char rotl8 ( const unsigned char x, const unsigned char y, const unsigned char shift );
// Hex-dump helpers writing to stdout.
void ace_print_data(const unsigned char *x, const uint32_t xlen );
void simeck_print_data(const unsigned char *y, const unsigned char ylen );
// SIMECKROUND rounds of the Simeck box on an 8-byte block; rc supplies one constant bit per round.
void simeck64_box( unsigned char *output, const unsigned char *input, const unsigned char rc );
// In-place permutation of a STATEBYTES-byte state.
void ace_permutation( unsigned char *input );
void ace_print_state( const unsigned char *state );
// Fill the state with 0x00 / 0xff.
void ace_permutation_ALLZERO ( unsigned char *state );
void ace_permutation_ALLONE ( unsigned char *state );
// Zero the state and load the hash IV.
int acehash_init( unsigned char *state );
// Hash `inlen` bytes of `in` into `out`.
int crypto_hash(
unsigned char *out,
const unsigned char *in,
unsigned long long inlen
// disable deprecation for sprintf and fopen
#ifdef _MSC_VER
#include <stdio.h>
#include <string.h>
#include "crypto_hash.h"
#include "api.h"
#define KAT_SUCCESS 0
#define KAT_DATA_ERROR -3
#define MAX_FILE_NAME 256
void init_buffer(unsigned char *buffer, unsigned long long numbytes);
void fprint_bstr(FILE *fp, const char *label, const unsigned char *data, unsigned long long length);
int generate_test_vectors();
// Entry point of the KAT generator: runs generate_test_vectors(), reports a
// non-success code on stderr, and returns that code as the exit status.
int main()
int ret = generate_test_vectors();
if (ret != KAT_SUCCESS) {
fprintf(stderr, "test vector generation failed with code %d\n", ret);
return ret;
// Writes hash known-answer tests to "../LWC_HASH_KAT_<bits>.txt": for every
// message length from 0 to MAX_MESSAGE_LENGTH (inclusive) it records a counter,
// the message prefix, and the digest returned by crypto_hash().
// Returns the last status in ret_val.
// NOTE(review): in this listing there is no return after a failed fopen(), no
// break/return after a crypto_hash() failure, and no fclose(fp) - confirm
// against the original, since fp would otherwise be used while NULL and leaked.
int generate_test_vectors()
FILE *fp;
char fileName[MAX_FILE_NAME];
unsigned char msg[MAX_MESSAGE_LENGTH];
unsigned char digest[CRYPTO_BYTES];
int ret_val = KAT_SUCCESS;
int count = 1;
// msg[i] = i mod 256; each test case hashes a prefix of this buffer
init_buffer(msg, sizeof(msg));
// NOTE(review): unbounded sprintf; the output fits MAX_FILE_NAME for any int
// value, but snprintf would make the bound explicit.
sprintf(fileName, "../LWC_HASH_KAT_%d.txt", (CRYPTO_BYTES * 8));
if ((fp = fopen(fileName, "w")) == NULL) {
fprintf(stderr, "Couldn't open <%s> for write\n", fileName);
for (unsigned long long mlen = 0; mlen <= MAX_MESSAGE_LENGTH; mlen++) {
fprintf(fp, "Count = %d\n", count++);
fprint_bstr(fp, "Msg = ", msg, mlen);
ret_val = crypto_hash(digest, msg, mlen);
if(ret_val != 0) {
fprintf(fp, "crypto_hash returned <%d>\n", ret_val);
fprint_bstr(fp, "MD = ", digest, CRYPTO_BYTES);
fprintf(fp, "\n");
return ret_val;
/*
 * Write one labelled hex line to `fp`: the label, then `length` bytes of
 * `data` as two-digit uppercase hex with no separators, then a newline.
 */
void fprint_bstr(FILE *fp, const char *label, const unsigned char *data, unsigned long long length)
{
	fprintf(fp, "%s", label);

	for (unsigned long long i = 0; i < length; i++) {
		fprintf(fp, "%02X", data[i]);
	}

	fprintf(fp, "\n");
}
/*
 * Fill `buffer` with the repeating pattern 0x00, 0x01, ..., 0xFF:
 * byte i receives the low 8 bits of i.
 */
void init_buffer(unsigned char *buffer, unsigned long long numbytes)
{
	for (unsigned long long i = 0; i < numbytes; i++) {
		buffer[i] = (unsigned char)i;
	}
}
/* Reference Implementation of ACE-Hash256
Written by:
Kalikinkar Mandal <>
#include "ace.h"
#include "crypto_hash.h"
#include "api.h"
#define KAT_SUCCESS 0
#define KAT_DATA_ERROR -3
// Indices of the eight state bytes that crypto_hash() XORs message bytes into
// and reads digest bytes from: state bytes 0..3 and 16..19.
const unsigned char rate_bytes[8] = {0,1,2,3,16,17,18,19};
int acehash_init( unsigned char *state )
unsigned char i;
//Initialize the state to all-ZERO
for ( i = 0; i < STATEBYTES; i++ )
state[i] = 0x0;
if ( CRYPTO_BYTES == 32 )
//Initialize state with IV 0x804040
//According to specification: B[7] = 0x80; B[6] = 0x40; B[5] = 0x40;
state[8] = 0x80;
state[9] = 0x40;
state[10] = 0x40;
// ACE-Hash entry point: hashes `inlen` bytes of `in` and writes the digest to
// `out` (four groups of 8 bytes when CRYPTO_BYTES == 32).
// NOTE(review): this listing is incomplete - the closing ")" of the parameter
// list, all braces, the statement inside the full-block absorb loop, every
// ace_permutation() call, the error/normal return statements and any
// free(state) are absent. Confirm against the original before relying on the
// notes below.
int crypto_hash(
unsigned char *out,
const unsigned char *in,
unsigned long long inlen
unsigned char *state;
unsigned char i, lblen;
//int func_ret;
u64 j, in64len;
// number of complete 8-byte blocks, and length of the trailing partial block
in64len = inlen/8;
lblen = (unsigned char)(inlen%8);
// NOTE(review): the allocation result is not checked before use.
state = (unsigned char *)malloc(sizeof(unsigned char)*STATEBYTES);
//Initialize state with predefined IV.
if ( acehash_init(state)!= KAT_SUCCESS )
//Absorbing phase: Rate Bytes A[0],A[1],A[2],A[3],C[0],C[1],C[2],C[3]
if ( inlen != 0 )
for ( j = 0; j < in64len; j++ )
for ( i = 0; i < 8; i++ )
if ( lblen != 0 )
//Encrypting the padded 64-bit block when "mlen" is not a multiple of 8
for ( i = 0; i < lblen; i++ )
state[rate_bytes[i]]^= in[in64len*8+(u64)i];
state[rate_bytes[lblen]]^=(0x80); //Padding: 10*
// presumably the two lines below belong to separate else-branches (message
// length a multiple of 8, and empty message) - confirm
state[rate_bytes[0]]^=(0x80); //Padding: 10*
state[rate_bytes[0]]^=(0x80); //Padding: 10*
//Squeezing phase
// each group of 8 digest bytes is read from the rate bytes of the state
if ( CRYPTO_BYTES == 32 )
for ( i = 0; i < 8; i++ )
out[i] = state[rate_bytes[i]];
for ( i = 0; i < 8; i++ )
out[i+8] = state[rate_bytes[i]];
for ( i = 0; i < 8; i++ )
out[i+16] = state[rate_bytes[i]];
for ( i = 0; i < 8; i++ )
out[i+24] = state[rate_bytes[i]];
This source diff could not be displayed because it is too large. You can view the blob instead.
// Number of bytes by which a ciphertext is longer than its plaintext (the
// authentication tag appended by crypto_aead_encrypt).
#define CRYPTO_ABYTES 16
/*
 * Byte-order helpers: U64BIG/U32BIG/U16BIG convert between host order and
 * big-endian order (identity on big-endian hosts, byte swap on little-endian
 * hosts). The conversion is its own inverse.
 * The swap macros evaluate their argument several times; do not pass
 * expressions with side effects.
 */
#ifndef ENDIAN_H_
#define ENDIAN_H_

#if defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__

// macros for big endian machines
#define U64BIG(x) (x)
#define U32BIG(x) (x)
#define U16BIG(x) (x)

#elif defined(_MSC_VER) || \
    (defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__)

// macros for little endian machines
#define U64BIG(x) \
    ((((x) & 0x00000000000000FFULL) << 56) | (((x) & 0x000000000000FF00ULL) << 40) | \
     (((x) & 0x0000000000FF0000ULL) << 24) | (((x) & 0x00000000FF000000ULL) << 8) | \
     (((x) & 0x000000FF00000000ULL) >> 8) | (((x) & 0x0000FF0000000000ULL) >> 24) | \
     (((x) & 0x00FF000000000000ULL) >> 40) | (((x) & 0xFF00000000000000ULL) >> 56))
#define U32BIG(x) \
    ((((x) & 0x000000FF) << 24) | (((x) & 0x0000FF00) << 8) | \
     (((x) & 0x00FF0000) >> 8) | (((x) & 0xFF000000) >> 24))
#define U16BIG(x) \
    ((((x) & 0x00FF) << 8) | (((x) & 0xFF00) >> 8))

#else
// Reached only when the byte order cannot be determined at compile time.
#error "ascon byte order macros not defined in endian.h"
#endif

#endif // ENDIAN_H_
// Number of bytes by which a ciphertext is longer than its plaintext.
#define CRYPTO_ABYTES 16
#include "api.h"
#include "endian.h"
#include "permutations.h"
// Number of bytes processed per data block (16).
#define RATE (128 / 8)
// Round counts encoded into the IV below.
#define PA_ROUNDS 12
#define PB_ROUNDS 8
// 64-bit initial value: key size in bits, rate in bits, PA_ROUNDS and
// PB_ROUNDS packed into the four most significant bytes (in that order);
// the low 32 bits are zero.
#define IV \
((u64)(8 * (CRYPTO_KEYBYTES)) << 56 | (u64)(8 * (RATE)) << 48 | \
(u64)(PA_ROUNDS) << 40 | (u64)(PB_ROUNDS) << 32)
// AEAD decryption on a bit-interleaved 5x64-bit state (x0..x4, each held as an
// even/odd pair of 32-bit words).
//   m, mlen  : plaintext output and its length (clen - CRYPTO_ABYTES)
//   nsec     : not referenced in the visible code
//   c, clen  : ciphertext followed by a CRYPTO_ABYTES-byte tag
//   ad, adlen: associated data
//   npub, k  : 16-byte nonce and 16-byte key (two 64-bit loads each)
// Returns 0 when the tag matches, -1 (with *mlen = 0) otherwise or when clen
// is shorter than the tag.
// NOTE(review): this listing is incomplete - closing braces, "else" lines and
// every permutation invocation (the P12/P8/P6 macros of permutations.h are
// never called here) are absent. Confirm against the original file.
// NOTE(review): the *(u64*) loads and stores assume suitably aligned buffers
// and rely on type punning through pointer casts.
int crypto_aead_decrypt(unsigned char* m, unsigned long long* mlen,
unsigned char* nsec, const unsigned char* c,
unsigned long long clen, const unsigned char* ad,
unsigned long long adlen, const unsigned char* npub,
const unsigned char* k) {
// a valid ciphertext is at least as long as the tag
if (clen < CRYPTO_ABYTES) {
*mlen = 0;
return -1;
u32_2 K0, K1, N0, N1;
u32_2 x0, x1, x2, x3, x4;
u32_2 t0, t1, t2, t3, t4;
u64 tmp0, tmp1;
u32 i;
// set plaintext size
*mlen = clen - CRYPTO_ABYTES;
// load key and nonce as big-endian 64-bit halves in bit-interleaved form
to_bit_interleaving(K0, U64BIG(*(u64*)k));
to_bit_interleaving(K1, U64BIG(*(u64*)(k + 8)));
to_bit_interleaving(N0, U64BIG(*(u64*)npub));
to_bit_interleaving(N1, U64BIG(*(u64*)(npub + 8)));
// initialization
// state = IV | key | nonce, then the key is XORed into x3, x4
to_bit_interleaving(x0, IV);
x1.o = K0.o;
x1.e = K0.e;
x2.e = K1.e;
x2.o = K1.o;
x3.e = N0.e;
x3.o = N0.o;
x4.e = N1.e;
x4.o = N1.o;
x3.e ^= K0.e;
x3.o ^= K0.o;
x4.e ^= K1.e;
x4.o ^= K1.o;
// process associated data
if (adlen) {
// full RATE-byte blocks are XORed into x0, x1
while (adlen >= RATE) {
to_bit_interleaving(t0, U64BIG(*(u64*)ad));
x0.e ^= t0.e;
x0.o ^= t0.o;
to_bit_interleaving(t1, U64BIG(*(u64*)(ad + 8)));
x1.e ^= t1.e;
x1.o ^= t1.o;
adlen -= RATE;
ad += RATE;
// final partial block: gather the remaining bytes and a 0x80 padding byte
tmp0 = 0;
tmp1 = 0;
for (i = 0; i < adlen; ++i, ++ad)
if (i < 8)
tmp0 ^= INS_BYTE64(*ad, i);
tmp1 ^= INS_BYTE64(*ad, i % 8);
if (adlen < 8)
tmp0 ^= INS_BYTE64(0x80, adlen);
tmp1 ^= INS_BYTE64(0x80, adlen % 8);
to_bit_interleaving(t0, tmp0);
x0.e ^= t0.e;
x0.o ^= t0.o;
to_bit_interleaving(t1, tmp1);
x1.e ^= t1.e;
x1.o ^= t1.o;
// flip one bit of x4 after the associated-data phase
x4.e ^= 1;
// process plaintext
// NOTE(review): clen still includes the tag length in this listing; the
// original presumably subtracts CRYPTO_ABYTES before this loop - confirm.
while (clen >= RATE) {
// plaintext = rate part of the state XOR ciphertext; the ciphertext then
// replaces the rate part of the state
from_bit_interleaving(tmp0, x0);
from_bit_interleaving(tmp1, x1);
*(u64*)m = U64BIG(tmp0) ^ *(u64*)c;
*(u64*)(m + 8) = U64BIG(tmp1) ^ *(u64*)(c + 8);
to_bit_interleaving(x0, U64BIG(*(u64*)c));
to_bit_interleaving(x1, U64BIG(*(u64*)(c + 8)));
clen -= RATE;
m += RATE;
c += RATE;
// final partial block, handled byte by byte
from_bit_interleaving(tmp0, x0);
from_bit_interleaving(tmp1, x1);
for (i = 0; i < clen; ++i, ++m, ++c) {
if (i < 8) {
*m = EXT_BYTE64(tmp0, i) ^ *c;
tmp0 &= ~INS_BYTE64(0xff, i);
tmp0 |= INS_BYTE64(*c, i);
} else {
*m = EXT_BYTE64(tmp1, i % 8) ^ *c;
tmp1 &= ~INS_BYTE64(0xff, i % 8);
tmp1 |= INS_BYTE64(*c, i % 8);
// 0x80 padding byte after the last ciphertext byte
if (clen < 8)
tmp0 ^= INS_BYTE64(0x80, clen);
tmp1 ^= INS_BYTE64(0x80, clen % 8);
to_bit_interleaving(x0, tmp0);
to_bit_interleaving(x1, tmp1);
// finalization
// key XORed into x2, x3 and then into x3, x4
x2.e ^= K0.e;
x2.o ^= K0.o;
x3.e ^= K1.e;
x3.o ^= K1.o;
x3.e ^= K0.e;
x3.o ^= K0.o;
x4.e ^= K1.e;
x4.o ^= K1.o;
// verify tag
// NOTE(review): the comparison short-circuits, so it is not constant-time,
// and the plaintext has already been written to m when it fails.
from_bit_interleaving(tmp0, x3);
from_bit_interleaving(tmp1, x4);
if (*(u64*)c != U64BIG(tmp0) || *(u64*)(c + 8) != U64BIG(tmp1)) {
*mlen = 0;
return -1;
return 0;
#include "api.h"
#include "endian.h"
#include "permutations.h"
// Number of bytes processed per data block (16).
#define RATE (128 / 8)
// Round counts encoded into the IV below.
#define PA_ROUNDS 12
#define PB_ROUNDS 8
// 64-bit initial value: key size in bits, rate in bits, PA_ROUNDS and
// PB_ROUNDS packed into the four most significant bytes (in that order);
// the low 32 bits are zero.
#define IV \
((u64)(8 * (CRYPTO_KEYBYTES)) << 56 | (u64)(8 * (RATE)) << 48 | \
(u64)(PA_ROUNDS) << 40 | (u64)(PB_ROUNDS) << 32)
// AEAD encryption on a bit-interleaved 5x64-bit state (x0..x4, each held as an
// even/odd pair of 32-bit words).
//   c, clen  : ciphertext output followed by a CRYPTO_ABYTES-byte tag; *clen = mlen + CRYPTO_ABYTES
//   m, mlen  : plaintext
//   ad, adlen: associated data
//   nsec     : not referenced in the visible code
//   npub, k  : 16-byte nonce and 16-byte key (two 64-bit loads each)
// Returns 0.
// NOTE(review): this listing is incomplete - closing braces, "else" lines and
// every permutation invocation (the P12/P8/P6 macros of permutations.h are
// never called here) are absent. Confirm against the original file.
// NOTE(review): the *(u64*) loads and stores assume suitably aligned buffers
// and rely on type punning through pointer casts.
int crypto_aead_encrypt(unsigned char* c, unsigned long long* clen,
const unsigned char* m, unsigned long long mlen,
const unsigned char* ad, unsigned long long adlen,
const unsigned char* nsec, const unsigned char* npub,
const unsigned char* k) {
u32_2 K0, K1, N0, N1;
u32_2 x0, x1, x2, x3, x4;
u32_2 t0, t1, t2, t3, t4;
u64 tmp0, tmp1;
u32 i;
// set ciphertext size
*clen = mlen + CRYPTO_ABYTES;
// load key and nonce
to_bit_interleaving(K0, U64BIG(*(u64*)k));
to_bit_interleaving(K1, U64BIG(*(u64*)(k + 8)));
to_bit_interleaving(N0, U64BIG(*(u64*)npub));
to_bit_interleaving(N1, U64BIG(*(u64*)(npub + 8)));
// initialization
// state = IV | key | nonce, then the key is XORed into x3, x4
to_bit_interleaving(x0, IV);
x1.o = K0.o;
x1.e = K0.e;
x2.e = K1.e;
x2.o = K1.o;
x3.e = N0.e;
x3.o = N0.o;
x4.e = N1.e;
x4.o = N1.o;
x3.e ^= K0.e;
x3.o ^= K0.o;
x4.e ^= K1.e;
x4.o ^= K1.o;
// process associated data
if (adlen) {
// full RATE-byte blocks are XORed into x0, x1
while (adlen >= RATE) {
to_bit_interleaving(t0, U64BIG(*(u64*)ad));
x0.e ^= t0.e;
x0.o ^= t0.o;
to_bit_interleaving(t1, U64BIG(*(u64*)(ad + 8)));
x1.e ^= t1.e;
x1.o ^= t1.o;
adlen -= RATE;
ad += RATE;
// final partial block: gather the remaining bytes and a 0x80 padding byte
tmp0 = 0;
tmp1 = 0;
for (i = 0; i < adlen; ++i, ++ad)
if (i < 8)
tmp0 ^= INS_BYTE64(*ad, i);
tmp1 ^= INS_BYTE64(*ad, i % 8);
if (adlen < 8)
tmp0 ^= INS_BYTE64(0x80, adlen);
tmp1 ^= INS_BYTE64(0x80, adlen % 8);
to_bit_interleaving(t0, tmp0);
x0.e ^= t0.e;
x0.o ^= t0.o;
to_bit_interleaving(t1, tmp1);
x1.e ^= t1.e;
x1.o ^= t1.o;
// flip one bit of x4 after the associated-data phase
x4.e ^= 1;
// process plaintext
// full blocks: XOR the plaintext into x0, x1 and emit the result as ciphertext
while (mlen >= RATE) {
to_bit_interleaving(t0, U64BIG(*(u64*)m));
x0.e ^= t0.e;
x0.o ^= t0.o;
to_bit_interleaving(t1, U64BIG(*(u64*)(m + 8)));
x1.e ^= t1.e;
x1.o ^= t1.o;
from_bit_interleaving(tmp0, x0);
*(u64*)c = U64BIG(tmp0);
from_bit_interleaving(tmp1, x1);
*(u64*)(c + 8) = U64BIG(tmp1);
mlen -= RATE;
m += RATE;
c += RATE;
// final partial block: remaining plaintext bytes plus a 0x80 padding byte
tmp0 = 0;
tmp1 = 0;
for (i = 0; i < mlen; ++i, ++m)
if (i < 8)
tmp0 ^= INS_BYTE64(*m, i);
tmp1 ^= INS_BYTE64(*m, i % 8);
if (mlen < 8)
tmp0 ^= INS_BYTE64(0x80, mlen);
tmp1 ^= INS_BYTE64(0x80, mlen % 8);
to_bit_interleaving(t0, tmp0);
x0.e ^= t0.e;
x0.o ^= t0.o;
to_bit_interleaving(t1, tmp1);
x1.e ^= t1.e;
x1.o ^= t1.o;
// emit only the first mlen bytes of the rate part as ciphertext
from_bit_interleaving(tmp0, x0);
from_bit_interleaving(tmp1, x1);
for (i = 0; i < mlen; ++i, ++c)
if (i < 8)
*c = EXT_BYTE64(tmp0, i);
*c = EXT_BYTE64(tmp1, i % 8);
// finalization
// key XORed into x2, x3 and then into x3, x4
x2.e ^= K0.e;
x2.o ^= K0.o;
x3.e ^= K1.e;
x3.o ^= K1.o;
x3.e ^= K0.e;
x3.o ^= K0.o;
x4.e ^= K1.e;
x4.o ^= K1.o;
// set tag
// the tag is x3 followed by x4, written big-endian after the ciphertext
from_bit_interleaving(tmp0, x3);
*(u64*)c = U64BIG(tmp0);
from_bit_interleaving(tmp1, x4);
*(u64*)(c + 8) = U64BIG(tmp1);
return 0;
/*
 * Byte-order helpers: U64BIG/U32BIG/U16BIG convert between host order and
 * big-endian order (identity on big-endian hosts, byte swap on little-endian
 * hosts). The conversion is its own inverse.
 * The swap macros evaluate their argument several times; do not pass
 * expressions with side effects.
 */
#ifndef ENDIAN_H_
#define ENDIAN_H_

#if defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__

// macros for big endian machines
#define U64BIG(x) (x)
#define U32BIG(x) (x)
#define U16BIG(x) (x)

#elif defined(_MSC_VER) || \
    (defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__)

// macros for little endian machines
#define U64BIG(x) \
    ((((x) & 0x00000000000000FFULL) << 56) | (((x) & 0x000000000000FF00ULL) << 40) | \
     (((x) & 0x0000000000FF0000ULL) << 24) | (((x) & 0x00000000FF000000ULL) << 8) | \
     (((x) & 0x000000FF00000000ULL) >> 8) | (((x) & 0x0000FF0000000000ULL) >> 24) | \
     (((x) & 0x00FF000000000000ULL) >> 40) | (((x) & 0xFF00000000000000ULL) >> 56))
#define U32BIG(x) \
    ((((x) & 0x000000FF) << 24) | (((x) & 0x0000FF00) << 8) | \
     (((x) & 0x00FF0000) >> 8) | (((x) & 0xFF000000) >> 24))
#define U16BIG(x) \
    ((((x) & 0x00FF) << 8) | (((x) & 0xFF00) >> 8))

#else
// Reached only when the byte order cannot be determined at compile time.
#error "ascon byte order macros not defined in endian.h"
#endif

#endif // ENDIAN_H_
#include "endian.h"
// Fixed-width shorthand types used by the macros below.
// NOTE(review): these assume unsigned int is 32 bits and unsigned long long
// is 64 bits on the target.
typedef unsigned char u8;
typedef unsigned int u32;
typedef unsigned long long u64;
// One 64-bit state word in bit-interleaved form, held as two 32-bit halves
// named e and o (presumably the even- and odd-position bits - confirm).
typedef struct {
u32 e;
u32 o;
} u32_2;
// Extract byte n of a 64-bit value, where n = 0 is the most significant byte.
#define EXT_BYTE64(x, n) ((u8)((u64)(x) >> (8 * (7 - (n)))))
// Place byte x at position n of a 64-bit value (n = 0 is the most significant byte).
#define INS_BYTE64(x, n) ((u64)(x) << (8 * (7 - (n))))
// Rotate a 32-bit value right by n bits; n must be in 1..31 (n = 0 would shift by 32).
#define ROTR32(x, n) (((x) >> (n)) | ((x) << (32 - (n))))
// Convert the 64-bit value `in` to its interleaved representation `out`
// (a u32_2). Each 32-bit half of `in` is bit-shuffled in four swap stages;
// out.e is then built from the low 16 bits of both shuffled halves and out.o
// from their high 16 bits. Inverse of from_bit_interleaving.
// `in` is evaluated twice.
// Credit to Henry S. Warren, Hacker's Delight, Addison-Wesley, 2002
#define to_bit_interleaving(out, in) \
do { \
u32 hi = (in) >> 32; \
u32 lo = (u32)(in); \
u32 r0, r1; \
r0 = (lo ^ (lo >> 1)) & 0x22222222, lo ^= r0 ^ (r0 << 1); \
r0 = (lo ^ (lo >> 2)) & 0x0C0C0C0C, lo ^= r0 ^ (r0 << 2); \
r0 = (lo ^ (lo >> 4)) & 0x00F000F0, lo ^= r0 ^ (r0 << 4); \
r0 = (lo ^ (lo >> 8)) & 0x0000FF00, lo ^= r0 ^ (r0 << 8); \
r1 = (hi ^ (hi >> 1)) & 0x22222222, hi ^= r1 ^ (r1 << 1); \
r1 = (hi ^ (hi >> 2)) & 0x0C0C0C0C, hi ^= r1 ^ (r1 << 2); \
r1 = (hi ^ (hi >> 4)) & 0x00F000F0, hi ^= r1 ^ (r1 << 4); \
r1 = (hi ^ (hi >> 8)) & 0x0000FF00, hi ^= r1 ^ (r1 << 8); \
(out).e = (lo & 0x0000FFFF) | (hi << 16); \
(out).o = (lo >> 16) | (hi & 0xFFFF0000); \
} while (0)
// Convert the interleaved u32_2 value `in` back to a plain 64-bit value `out`.
// Applies the same four swap stages as to_bit_interleaving in reverse order.
// `out` is assigned without parentheses, so pass a simple lvalue.
// Credit to Henry S. Warren, Hacker's Delight, Addison-Wesley, 2002
#define from_bit_interleaving(out, in) \
do { \
u32 lo = ((in).e & 0x0000FFFF) | ((in).o << 16); \
u32 hi = ((in).e >> 16) | ((in).o & 0xFFFF0000); \
u32 r0, r1; \
r0 = (lo ^ (lo >> 8)) & 0x0000FF00, lo ^= r0 ^ (r0 << 8); \
r0 = (lo ^ (lo >> 4)) & 0x00F000F0, lo ^= r0 ^ (r0 << 4); \
r0 = (lo ^ (lo >> 2)) & 0x0C0C0C0C, lo ^= r0 ^ (r0 << 2); \
r0 = (lo ^ (lo >> 1)) & 0x22222222, lo ^= r0 ^ (r0 << 1); \
r1 = (hi ^ (hi >> 8)) & 0x0000FF00, hi ^= r1 ^ (r1 << 8); \
r1 = (hi ^ (hi >> 4)) & 0x00F000F0, hi ^= r1 ^ (r1 << 4); \
r1 = (hi ^ (hi >> 2)) & 0x0C0C0C0C, hi ^= r1 ^ (r1 << 2); \
r1 = (hi ^ (hi >> 1)) & 0x22222222, hi ^= r1 ^ (r1 << 1); \
out = (u64)hi << 32 | lo; \
} while (0)
// One permutation round on the interleaved state. The macro reads and writes
// the variables x0..x4 and uses t0..t4 as scratch, so all ten u32_2 variables
// must be declared in the calling scope.
//   C_e, C_o : round constant for the e and o halves of x2
// Steps: add the round constant to x2, apply the s-box layer (XOR pre-mix,
// the x ^ (~y & z) step across the five words, XOR post-mix), apply the linear
// layer (each word XORed with two rotations of itself, expressed on the
// interleaved halves), and finally complement x2.
#define ROUND(C_e, C_o) \
do { \
/* round constant */ \
x2.e ^= C_e; x2.o ^= C_o; \
/* s-box layer */ \
x0.e ^= x4.e; x0.o ^= x4.o; \
x4.e ^= x3.e; x4.o ^= x3.o; \
x2.e ^= x1.e; x2.o ^= x1.o; \
t0.e = x0.e; t0.o = x0.o; \
t4.e = x4.e; t4.o = x4.o; \
t3.e = x3.e; t3.o = x3.o; \
t1.e = x1.e; t1.o = x1.o; \
t2.e = x2.e; t2.o = x2.o; \
x0.e = t0.e ^ (~t1.e & t2.e); x0.o = t0.o ^ (~t1.o & t2.o); \
x2.e = t2.e ^ (~t3.e & t4.e); x2.o = t2.o ^ (~t3.o & t4.o); \
x4.e = t4.e ^ (~t0.e & t1.e); x4.o = t4.o ^ (~t0.o & t1.o); \
x1.e = t1.e ^ (~t2.e & t3.e); x1.o = t1.o ^ (~t2.o & t3.o); \
x3.e = t3.e ^ (~t4.e & t0.e); x3.o = t3.o ^ (~t4.o & t0.o); \
x1.e ^= x0.e; x1.o ^= x0.o; \
x3.e ^= x2.e; x3.o ^= x2.o; \
x0.e ^= x4.e; x0.o ^= x4.o; \
/* linear layer */ \
t0.e = x0.e ^ ROTR32(x0.o, 4); t0.o = x0.o ^ ROTR32(x0.e, 5); \
t1.e = x1.e ^ ROTR32(x1.e, 11); t1.o = x1.o ^ ROTR32(x1.o, 11); \
t2.e = x2.e ^ ROTR32(x2.o, 2); t2.o = x2.o ^ ROTR32(x2.e, 3); \
t3.e = x3.e ^ ROTR32(x3.o, 3); t3.o = x3.o ^ ROTR32(x3.e, 4); \
t4.e = x4.e ^ ROTR32(x4.e, 17); t4.o = x4.o ^ ROTR32(x4.o, 17); \
x0.e ^= ROTR32(t0.o, 9); x0.o ^= ROTR32(t0.e, 10); \
x1.e ^= ROTR32(t1.o, 19); x1.o ^= ROTR32(t1.e, 20); \
x2.e ^= t2.o; x2.o ^= ROTR32(t2.e, 1); \
x3.e ^= ROTR32(t3.e, 5); x3.o ^= ROTR32(t3.o, 5); \
x4.e ^= ROTR32(t4.o, 3); x4.o ^= ROTR32(t4.e, 4); \
x2.e = ~x2.e; x2.o = ~x2.o; \
} while(0)
// 12-round permutation: ROUND applied with twelve constant pairs in sequence.
#define P12() \
do { \
ROUND(0xc, 0xc); \
ROUND(0x9, 0xc); \
ROUND(0xc, 0x9); \
ROUND(0x9, 0x9); \
ROUND(0x6, 0xc); \
ROUND(0x3, 0xc); \
ROUND(0x6, 0x9); \
ROUND(0x3, 0x9); \
ROUND(0xc, 0x6); \
ROUND(0x9, 0x6); \
ROUND(0xc, 0x3); \
ROUND(0x9, 0x3); \
} while (0)
// 8-round permutation: the last eight rounds of P12.
#define P8() \
do { \
ROUND(0x6, 0xc); \
ROUND(0x3, 0xc); \
ROUND(0x6, 0x9); \
ROUND(0x3, 0x9); \
ROUND(0xc, 0x6); \
ROUND(0x9, 0x6); \
ROUND(0xc, 0x3); \
ROUND(0x9, 0x3); \
} while (0)
// 6-round permutation: the last six rounds of P12.
#define P6() \
do { \
ROUND(0x6, 0x9); \
ROUND(0x3, 0x9); \
ROUND(0xc, 0x6); \
ROUND(0x9, 0x6); \
ROUND(0xc, 0x3); \
ROUND(0x9, 0x3); \
} while (0)
// Number of bytes by which a ciphertext is longer than its plaintext.
#define CRYPTO_ABYTES 16
#include "api.h"
#include "endian.h"
#include "permutations.h"
// Number of bytes processed per data block (16).
#define RATE (128 / 8)
// Round counts encoded into the IV below.
#define PA_ROUNDS 12
#define PB_ROUNDS 8
// 64-bit initial value: key size in bits, rate in bits, PA_ROUNDS and
// PB_ROUNDS packed into the four most significant bytes (in that order);
// the low 32 bits are zero.
#define IV \
((u64)(8 * (CRYPTO_KEYBYTES)) << 56 | (u64)(8 * (RATE)) << 48 | \
(u64)(PA_ROUNDS) << 40 | (u64)(PB_ROUNDS) << 32)
int crypto_aead_decrypt(unsigned char* m, unsigned long long* mlen,
unsigned char* nsec, const unsigned char* c,
unsigned long long clen, const unsigned char* ad,
unsigned long long adlen, const unsigned char* npub,
const unsigned char* k) {
if (clen < CRYPTO_ABYTES) {
*mlen = 0;
return -1;
u32_2 K0, K1, N0, N1;
u32_2 x0, x1, x2, x3, x4;
u32_2 t0, t1;
u64 tmp0, tmp1;
u32 i;
// set plaintext size
*mlen = clen - CRYPTO_ABYTES;
to_bit_interleaving(K0, U64BIG(*(u64*)k));
to_bit_interleaving(K1, U64BIG(*(u64*)(k + 8)));
to_bit_interleaving(N0, U64BIG(*(u64*)npub));
to_bit_interleaving(N1, U64BIG(*(u64*)(npub + 8)));
// initialization
to_bit_interleaving(x0, IV);
x1.o = K0.o;
x1.e = K0.e;
x2.e = K1.e;
x2.o = K1.o;
x3.e = N0.e;
x3.o = N0.o;
x4.e = N1.e;
x4.o = N1.o;
x3.e ^= K0.e;
x3.o ^= K0.o;
x4.e ^= K1.e;
x4.o ^= K1.o;
// process associated data
if (adlen) {
while (adlen >= RATE) {
to_bit_interleaving(t0, U64BIG(*(u64*)ad));
x0.e ^= t0.e;
x0.o ^= t0.o;
to_bit_interleaving(t1, U64BIG(*(u64*)(ad + 8)));
x1.e ^= t1.e;
x1.o ^= t1.o;
adlen -= RATE;
ad += RATE;
tmp0 = 0;
tmp1 = 0;
for (i = 0; i < adlen; ++i, ++ad)
if (i < 8)
tmp0 ^= INS_BYTE64(*ad, i);
tmp1 ^= INS_BYTE64(*ad, i % 8);
if (adlen < 8)
tmp0 ^= INS_BYTE64(0x80, adlen);
tmp1 ^= INS_BYTE64(0x80, adlen % 8);
to_bit_interleaving(t0, tmp0);
x0.e ^= t0.e;
x0.o ^= t0.o;
to_bit_interleaving(t1, tmp1);
x1.e ^= t1.e;
x1.o ^= t1.o;
x4.e ^= 1;
// process plaintext
while (clen >= RATE) {
from_bit_interleaving(tmp0, x0);
from_bit_interleaving(tmp1, x1);
*(u64*)m = U64BIG(tmp0) ^ *(u64*)c;
*(u64*)(m + 8) = U64BIG(tmp1) ^ *(u64*)(c + 8);
to_bit_interleaving(x0, U64BIG(*(u64*)c));
to_bit_interleaving(x1, U64BIG(*(u64*)(c + 8)));
clen -= RATE;
m += RATE;
c += RATE;
from_bit_interleaving(tmp0, x0);
from_bit_interleaving(tmp1, x1);
for (i = 0; i < clen; ++i, ++m, ++c) {
if (i < 8) {
*m = EXT_BYTE64(tmp0, i) ^ *c;
tmp0 &= ~INS_BYTE64(0xff, i);
tmp0 |= INS_BYTE64(*c, i);
} else {
*m = EXT_BYTE64(tmp1, i % 8) ^ *c;
tmp1 &= ~INS_BYTE64(0xff, i % 8);
tmp1 |= INS_BYTE64(*c, i % 8);
if (clen < 8)
tmp0 ^= INS_BYTE64(0x80, clen);
tmp1 ^= INS_BYTE64(0x80, clen % 8);
to_bit_interleaving(x0, tmp0);
to_bit_interleaving(x1, tmp1);
// finalization
x2.e ^= K0.e;
x2.o ^= K0.o;
x3.e ^= K1.e;
x3.o ^= K1.o;
x3.e ^= K0.e;
x3.o ^= K0.o;
x4.e ^= K1.e;
x4.o ^= K1.o;
// verify tag
from_bit_interleaving(tmp0, x3);
from_bit_interleaving(tmp1, x4);
if (*(u64*)c != U64BIG(tmp0) || *(u64*)(c + 8) != U64BIG(tmp1)) {
*mlen = 0;
return -1;
return 0;
