Commit 40fde2ff by Alexandre Adomnicai Committed by Enrico Pozzobon

skinny & romulus

parent 9fb00266
This source diff could not be displayed because it is too large. You can view the blob instead.
// Size constants of the crypto_aead_* interface. Going by the names only
// (TODO confirm against the NIST LWC API description): key, secret-nonce,
// public-nonce and authentication-tag lengths in bytes.
#define CRYPTO_KEYBYTES 16
#define CRYPTO_NSECBYTES 0
#define CRYPTO_NPUBBYTES 16
#define CRYPTO_ABYTES 16
// NOTE(review): presumably signals that input and output buffers must not overlap -- confirm.
#define CRYPTO_NOOVERLAP 1
//API required by the NIST for the LWC competition
// Authenticated-encryption entry point. Parameter roles below are read off the
// parameter names only (TODO confirm against the implementation):
//   c, clen  - output buffer and receiver of the output length
//   m, mlen  - message and its length
//   ad, adlen - associated data and its length
//   nsec, npub - secret and public nonce; k - key
// The meaning of the int return value is not visible in this header.
int crypto_aead_encrypt(unsigned char *c, unsigned long long *clen,
const unsigned char *m, unsigned long long mlen,
const unsigned char *ad, unsigned long long adlen,
const unsigned char *nsec, const unsigned char *npub,
const unsigned char *k);
//API required by the NIST for the LWC competition
// Authenticated-decryption entry point. Parameter roles below are read off the
// parameter names only (TODO confirm against the implementation):
//   m, outputmlen - output buffer and receiver of the recovered length
//   nsec          - secret nonce output
//   c, clen       - ciphertext input and its length
//   ad, adlen     - associated data and its length
//   npub, k       - public nonce and key
// The meaning of the int return value is not visible in this header.
int crypto_aead_decrypt(unsigned char *m, unsigned long long *outputmlen,
unsigned char *nsec,
const unsigned char *c, unsigned long long clen,
const unsigned char *ad, unsigned long long adlen,
const unsigned char *npub, const unsigned char *k);
#ifndef ROMULUSN1_H_
#define ROMULUSN1_H_
#include "skinny128.h"
//Short names for the integer types used throughout this header.
typedef unsigned char u8;		//one byte
typedef unsigned int u32;		//NOTE: relies on unsigned int being 32 bits wide
//u64 used to be 'unsigned int', i.e. only 32 bits wide; unsigned long long is
//guaranteed to hold at least 64 bits.
typedef unsigned long long u64;
//Tweakey material: the raw tk1 bytes plus two tables of 32-bit round-tweakey
//words. rtk1 has room for 4*16 = 64 words; rtk has 4 words for each of the
//SKINNY128_384_ROUNDS rounds.
typedef struct {
u8 tk1[16]; //to manipulate tk1 byte-wise
u32 rtk1[4*16]; //to avoid tk schedule recomputations
u32 rtk[4*SKINNY128_384_ROUNDS]; //all round tweakeys
} skinny_128_384_tks;
//Sizes in bytes (tag, key and block, going by the names).
#define TAGBYTES 16
#define KEYBYTES 16
#define BLOCKBYTES 16
//Stores `domain` in byte 7 of tks.tk1. tks is accessed with '.', so pass the
//struct itself, not a pointer to it.
#define SET_DOMAIN(tks, domain) ((tks).tk1[7] = (domain))
//G as defined in the Romulus specification, computed on a 128-bit block viewed
//as four 32-bit words: x <- G(y).
//In every byte, bits 7..1 move down by one position and the new top bit is
//(old bit 0) ^ (old bit 7); each 32-bit word handles four bytes at once.
//Needs a 32-bit scratch variable named tmp in the caller's scope; on exit tmp
//holds the last word read from y.
#define G(x,y) ({ \
	for (int g_word_ = 0; g_word_ < 4; g_word_++) { \
		tmp = ((u32*)(y))[g_word_]; \
		((u32*)(x))[g_word_] = ((tmp >> 1) & 0x7f7f7f7f) \
			^ ((tmp ^ (tmp << 7)) & 0x80808080); \
	} \
})
//Advance the counter held in tk1, viewed as two 32-bit words.
//Word 0 and the low 24 bits of word 1 are shifted left by one bit as a single
//value (bit 31 of word 0 carries into bit 0 of word 1). The top 8 bits of
//word 1 are preserved, and 0x95 is XORed into word 0 whenever bit 23 of the
//old word 1 (the bit shifted out of the 24-bit field) was set.
//Needs a 32-bit scratch variable named tmp in the caller's scope; on exit tmp
//holds the previous value of word 1.
#define UPDATE_CTR(tk1) ({ \
	u32 *update_ctr_words_ = (u32*)(tk1); \
	tmp = update_ctr_words_[1]; \
	update_ctr_words_[1] = ((tmp << 1) & 0x00ffffff) \
		| (update_ctr_words_[0] >> 31) \
		| (tmp & 0xff000000); \
	update_ctr_words_[0] <<= 1; \
	if ((tmp >> 23) & 0x01) { \
		update_ctr_words_[0] ^= 0x95; \
	} \
})
//x <- y ^ z for 128-bit blocks, processed as four 32-bit words in ascending
//order. x may be the same buffer as y or z: each pair of source words is read
//before the matching word of x is written.
#define XOR_BLOCK(x,y,z) ({ \
	for (int xor_word_ = 0; xor_word_ < 4; xor_word_++) { \
		((u32*)(x))[xor_word_] = \
			((u32*)(y))[xor_word_] ^ ((u32*)(z))[xor_word_]; \
	} \
})
//Rho as defined in the Romulus specification:
//  y <- G(x) ^ z   and   x <- x ^ z   (both from the incoming x and z).
//G(x) is staged in pad and x is updated BEFORE y is written, so the result
//is also correct when y and z are the same buffer (in-place processing).
//Writing y first, as was done previously, made x absorb the freshly written
//output instead of z in that case.
//Needs a 16-byte scratch block named pad and the scratch variable tmp (used
//by G) in the caller's scope; pad must not alias x, y or z.
#define RHO(x,y,z) ({ \
	G(pad,x); \
	XOR_BLOCK(x, x, z); \
	XOR_BLOCK(y, pad, z); \
})
//Rho inverse as defined in the Romulus specification
//use pad as a tmp variable in case y = z
//Steps, in this order: pad <- G(x); z <- pad ^ y; x <- x ^ z, where the last
//step reads the z that was just written. Because y is consumed before x is
//updated, y and z may be the same buffer.
//Needs a 16-byte scratch block named pad and the scratch variable tmp (used
//by G) in the caller's scope.
#define RHO_INV(x, y, z) ({ \
G(pad, x); \
XOR_BLOCK(z, pad, y); \
XOR_BLOCK(x, x, z); \
})
#endif // ROMULUSN1_H_
\ No newline at end of file
#ifndef SKINNY128_H_
#define SKINNY128_H_
// Short integer type names. NOTE(review): u32 relies on unsigned int being 32 bits wide.
typedef unsigned char u8;
typedef unsigned int u32;
// Number of rounds; used elsewhere to size the round-tweakey table (4 words per round).
#define SKINNY128_384_ROUNDS 40
// Routines defined outside this header. The descriptions are inferred from the
// names and parameter names only -- TODO confirm against the implementation.
// skinny128_384: presumably encrypts the block ptext into ctext using the
// round tweakeys tk and rtk1.
extern void skinny128_384(u8* ctext, const u32* tk, const u8* ptext, const u32* rtk1);
// tkschedule_lfsr: presumably fills rtk from tk2 and tk3 for `rounds` rounds.
extern void tkschedule_lfsr(u32* rtk, const u8* tk2, const u8* tk3, const int rounds);
// tkschedule_perm: presumably applies the tweakey permutation to rtk in place.
extern void tkschedule_perm(u32* rtk);
// tkschedule_perm_tk1: presumably derives the rtk1 words from the tk1 bytes.
extern void tkschedule_perm_tk1(u32* rtk1, const u8* tk1);
#endif // SKINNY128_H_
\ No newline at end of file
// Size constants of the crypto_aead_* interface. Going by the names only
// (TODO confirm against the NIST LWC API description): key, secret-nonce,
// public-nonce and authentication-tag lengths in bytes.
#define CRYPTO_KEYBYTES 16
#define CRYPTO_NSECBYTES 0
#define CRYPTO_NPUBBYTES 16
#define CRYPTO_ABYTES 16
// NOTE(review): presumably signals that input and output buffers must not overlap -- confirm.
#define CRYPTO_NOOVERLAP 1
// Authenticated-encryption entry point. Parameter roles below are read off the
// parameter names only (TODO confirm against the implementation):
//   c, clen  - output buffer and receiver of the output length
//   m, mlen  - message and its length
//   ad, adlen - associated data and its length
//   nsec, npub - secret and public nonce; k - key
// The meaning of the int return value is not visible in this header.
int crypto_aead_encrypt(
unsigned char *c, unsigned long long *clen,
const unsigned char *m, unsigned long long mlen,
const unsigned char *ad, unsigned long long adlen,
const unsigned char *nsec,
const unsigned char *npub,
const unsigned char *k
);
// Authenticated-decryption entry point. Parameter roles below are read off the
// parameter names only (TODO confirm against the implementation):
//   m, mlen   - output buffer and receiver of the recovered length
//   nsec      - secret nonce output
//   c, clen   - ciphertext input and its length
//   ad, adlen - associated data and its length
//   npub, k   - public nonce and key
// The meaning of the int return value is not visible in this header.
int crypto_aead_decrypt(
unsigned char *m, unsigned long long *mlen,
unsigned char *nsec,
const unsigned char *c, unsigned long long clen,
const unsigned char *ad, unsigned long long adlen,
const unsigned char *npub,
const unsigned char *k
);
\ No newline at end of file
#ifndef ROMULUSN1_H_
#define ROMULUSN1_H_
#include "skinny128.h"
//Short names for the integer types used throughout this header.
typedef unsigned char u8;		//one byte
typedef unsigned int u32;		//NOTE: relies on unsigned int being 32 bits wide
//u64 used to be 'unsigned int', i.e. only 32 bits wide; unsigned long long is
//guaranteed to hold at least 64 bits.
typedef unsigned long long u64;
//Tweakey material: the raw tk1 bytes plus two tables of 32-bit round-tweakey
//words. rtk1 has room for 4*16 = 64 words; rtk2_3 has 4 words for each of the
//SKINNY128_384_ROUNDS rounds.
typedef struct {
u8 tk1[16]; //to manipulate tk1 byte-wise
u32 rtk1[4*16]; //to avoid tk schedule recomputations
u32 rtk2_3[4*SKINNY128_384_ROUNDS]; //all round tweakeys
} skinny_128_384_tks;
//Sizes in bytes (tag, key and block, going by the names).
#define TAGBYTES 16
#define KEYBYTES 16
#define BLOCKBYTES 16
//Stores `domain` in byte 7 of tks.tk1. tks is accessed with '.', so pass the
//struct itself, not a pointer to it.
#define SET_DOMAIN(tks, domain) ((tks).tk1[7] = (domain))
//G as defined in the Romulus specification, computed on a 128-bit block viewed
//as four 32-bit words: x <- G(y).
//In every byte, bits 7..1 move down by one position and the new top bit is
//(old bit 0) ^ (old bit 7); each 32-bit word handles four bytes at once.
//Needs a 32-bit scratch variable named tmp in the caller's scope; on exit tmp
//holds the last word read from y.
#define G(x,y) ({ \
	for (int g_word_ = 0; g_word_ < 4; g_word_++) { \
		tmp = ((u32*)(y))[g_word_]; \
		((u32*)(x))[g_word_] = ((tmp >> 1) & 0x7f7f7f7f) \
			^ ((tmp ^ (tmp << 7)) & 0x80808080); \
	} \
})
//Advance the counter held in tk1, viewed as two 32-bit words.
//Word 0 and the low 24 bits of word 1 are shifted left by one bit as a single
//value (bit 31 of word 0 carries into bit 0 of word 1). The top 8 bits of
//word 1 are preserved, and 0x95 is XORed into word 0 whenever bit 23 of the
//old word 1 (the bit shifted out of the 24-bit field) was set.
//Needs a 32-bit scratch variable named tmp in the caller's scope; on exit tmp
//holds the previous value of word 1.
#define UPDATE_CTR(tk1) ({ \
	u32 *update_ctr_words_ = (u32*)(tk1); \
	tmp = update_ctr_words_[1]; \
	update_ctr_words_[1] = ((tmp << 1) & 0x00ffffff) \
		| (update_ctr_words_[0] >> 31) \
		| (tmp & 0xff000000); \
	update_ctr_words_[0] <<= 1; \
	if ((tmp >> 23) & 0x01) { \
		update_ctr_words_[0] ^= 0x95; \
	} \
})
//x <- y ^ z for 128-bit blocks, processed as four 32-bit words in ascending
//order. x may be the same buffer as y or z: each pair of source words is read
//before the matching word of x is written.
#define XOR_BLOCK(x,y,z) ({ \
	for (int xor_word_ = 0; xor_word_ < 4; xor_word_++) { \
		((u32*)(x))[xor_word_] = \
			((u32*)(y))[xor_word_] ^ ((u32*)(z))[xor_word_]; \
	} \
})
//Rho as defined in the Romulus specification:
//  y <- G(x) ^ z   and   x <- x ^ z   (both from the incoming x and z).
//G(x) is staged in pad and x is updated BEFORE y is written, so the result
//is also correct when y and z are the same buffer (in-place processing).
//Writing y first, as was done previously, made x absorb the freshly written
//output instead of z in that case.
//Needs a 16-byte scratch block named pad and the scratch variable tmp (used
//by G) in the caller's scope; pad must not alias x, y or z.
#define RHO(x,y,z) ({ \
	G(pad,x); \
	XOR_BLOCK(x, x, z); \
	XOR_BLOCK(y, pad, z); \
})
//Rho inverse as defined in the Romulus specification
//use pad as a tmp variable in case y = z
//Steps, in this order: pad <- G(x); z <- pad ^ y; x <- x ^ z, where the last
//step reads the z that was just written. Because y is consumed before x is
//updated, y and z may be the same buffer.
//Needs a 16-byte scratch block named pad and the scratch variable tmp (used
//by G) in the caller's scope.
#define RHO_INV(x, y, z) ({ \
G(pad, x); \
XOR_BLOCK(z, pad, y); \
XOR_BLOCK(x, x, z); \
})
#endif // ROMULUSN1_H_
\ No newline at end of file
/******************************************************************************
* Constant-time implementation of the SKINNY tweakable block ciphers.
*
* This implementation doesn't compute the ShiftRows operation. Some masks and
* shifts are applied during the MixColumns operation so that the proper bits
* are XORed together. Moreover, the row permutation within the MixColumns
* is omitted, as well as the bit permutation at the end of the Sbox. The rows
* are synchronized with the classical representation after only 4 rounds.
* Therefore, this implementation relies on a "QUADRUPLE_ROUND" routine.
*
* The Sbox computation takes advantage of some symmetry in the 8-bit Sbox to
* turn it into a 4-bit S-box computation. Although the last bit permutation
* within the Sbox is not computed, the bit ordering is synchronized with the
* classical representation after 2 calls.
*
* @author Alexandre Adomnicai, Nanyang Technological University,
* alexandre.adomnicai@ntu.edu.sg
*
* @date May 2020
******************************************************************************/
#include <stdio.h>
#include <string.h>
#include "skinny128.h"
#include "tk_schedule.h"
/******************************************************************************
* MixColumns for rounds i such that (i % 4) == 0.
* Every one of the four state words is updated in place by three successive
* rotate / mask / rotate / XOR steps; each step reads the word as left by the
* previous step.
******************************************************************************/
void mixcolumns_0(u32* state) {
	for (u32 *word = state; word != state + 4; ++word) {
		u32 picked = ROR(*word, 24) & 0x0c0c0c0c;
		*word ^= ROR(picked, 30);
		picked = ROR(*word, 16) & 0xc0c0c0c0;
		*word ^= ROR(picked, 4);
		picked = ROR(*word, 8) & 0x0c0c0c0c;
		*word ^= ROR(picked, 2);
	}
}
/******************************************************************************
* MixColumns for rounds i such that (i % 4) == 1.
* Every one of the four state words is updated in place by three successive
* mask / rotate / XOR steps; each step reads the word as left by the previous
* step.
******************************************************************************/
void mixcolumns_1(u32* state) {
	for (u32 *word = state; word != state + 4; ++word) {
		u32 picked = ROR(*word, 16) & 0x30303030;
		*word ^= ROR(picked, 30);
		picked = *word & 0x03030303;
		*word ^= ROR(picked, 28);
		picked = ROR(*word, 16) & 0x30303030;
		*word ^= ROR(picked, 2);
	}
}
/******************************************************************************
* MixColumns for rounds i such that (i % 4) == 2.
* Every one of the four state words is updated in place by three successive
* rotate / mask / rotate / XOR steps; each step reads the word as left by the
* previous step.
******************************************************************************/
void mixcolumns_2(u32* state) {
	for (u32 *word = state; word != state + 4; ++word) {
		u32 picked = ROR(*word, 8) & 0xc0c0c0c0;
		*word ^= ROR(picked, 6);
		picked = ROR(*word, 16) & 0x0c0c0c0c;
		*word ^= ROR(picked, 28);
		picked = ROR(*word, 24) & 0xc0c0c0c0;
		*word ^= ROR(picked, 2);
	}
}
/******************************************************************************
* MixColumns for rounds i such that (i % 4) == 3.
* Every one of the four state words is updated in place by three successive
* mask / rotate / XOR steps; each step reads the word as left by the previous
* step.
******************************************************************************/
void mixcolumns_3(u32* state) {
	for (u32 *word = state; word != state + 4; ++word) {
		u32 picked = *word & 0x03030303;
		*word ^= ROR(picked, 30);
		picked = *word & 0x30303030;
		*word ^= ROR(picked, 4);
		picked = *word & 0x03030303;
		*word ^= ROR(picked, 26);
	}
}
/******************************************************************************
* Encryption of a single block, without any mode of operation, using the
* skinny128_384_plus routine (10 quadruple rounds, i.e. 40 rounds).
* RTK1 and RTK2_3 are passed separately to take advantage of the fact that
* TK2 and TK3 remain the same through the entire data encryption/decryption.
*
* ctext  : receives the 16-byte output block
* ptext  : 16-byte input block
* rtk1   : 64 round-tweakey words derived from TK1, reused cyclically
* rtk2_3 : 160 round-tweakey words derived from TK2 and TK3
******************************************************************************/
void skinny128_384_plus(u8* ctext, const u8* ptext, const u32* rtk1,
				const u32* rtk2_3) {
	u32 tmp; 		// scratch word required by the SWAPMOVE macro
	u32 state[4]; 	// 128-bit state
	packing(state, ptext); 		// from byte to bitsliced representation
	// rtk2_3 advances by 16 words per quadruple round, while the rtk1 offset
	// cycles through 0, 16, 32, 48.
	for (int quad = 0; quad < 10; quad++) {
		QUADRUPLE_ROUND(state, rtk1 + 16 * (quad & 3), rtk2_3 + 16 * quad);
	}
	unpacking(ctext, state);	// from bitsliced to byte representation
}
\ No newline at end of file
#ifndef SKINNY128_H_
#define SKINNY128_H_
// Short integer type names. NOTE(review): u32 relies on unsigned int being 32 bits wide.
typedef unsigned char u8;
typedef unsigned int u32;
// Single-block encryption routine. Takes the output block, the input block and
// the two round-tweakey tables; see the definition for the layout they must have.
void skinny128_384_plus(u8* ctext, const u8* ptext, const u32* rtk1, const u32* rtk2_3);
// Number of rounds; QUADRUPLE_ROUND below covers four of them per expansion.
#define SKINNY128_384_ROUNDS 40
// Four consecutive rounds applied in place to the 4-word state.
// Each round is made of, in order:
//   1. a fixed sequence of "XOR with (N)OR" and SWAPMOVE steps on the state
//      words (presumably the S-box layer -- NOTE(review): confirm),
//   2. the XOR of four rtk1 words and four rtk2_3 words into the state,
//   3. one call to mixcolumns_0, _1, _2 and _3 respectively.
// Reads rtk1[0..15] and rtk2_3[0..15]; callers advance the pointers between
// expansions.
// Needs at the expansion site: a scratch variable named tmp (written by
// SWAPMOVE), the SWAPMOVE macro and the mixcolumns_0..3 functions.
// `state` is used without parentheses, so pass a plain array or pointer name.
#define QUADRUPLE_ROUND(state, rtk1, rtk2_3) ({ \
state[3] ^= ~(state[0] | state[1]); \
SWAPMOVE(state[2], state[1], 0x55555555, 1); \
SWAPMOVE(state[3], state[2], 0x55555555, 1); \
state[1] ^= ~(state[2] | state[3]); \
SWAPMOVE(state[1], state[0], 0x55555555, 1); \
SWAPMOVE(state[0], state[3], 0x55555555, 1); \
state[3] ^= ~(state[0] | state[1]); \
SWAPMOVE(state[2], state[1], 0x55555555, 1); \
SWAPMOVE(state[3], state[2], 0x55555555, 1); \
state[1] ^= (state[2] | state[3]); \
SWAPMOVE(state[3], state[0], 0x55555555, 0); \
state[0] ^= (rtk1)[0]; \
state[1] ^= (rtk1)[1]; \
state[2] ^= (rtk1)[2]; \
state[3] ^= (rtk1)[3]; \
state[0] ^= (rtk2_3)[0]; \
state[1] ^= (rtk2_3)[1]; \
state[2] ^= (rtk2_3)[2]; \
state[3] ^= (rtk2_3)[3]; \
mixcolumns_0(state); \
state[1] ^= ~(state[2] | state[3]); \
SWAPMOVE(state[1], state[0], 0x55555555, 1); \
SWAPMOVE(state[0], state[3], 0x55555555, 1); \
state[3] ^= ~(state[0] | state[1]); \
SWAPMOVE(state[2], state[1], 0x55555555, 1); \
SWAPMOVE(state[3], state[2], 0x55555555, 1); \
state[1] ^= ~(state[2] | state[3]); \
SWAPMOVE(state[1], state[0], 0x55555555, 1); \
SWAPMOVE(state[0], state[3], 0x55555555, 1); \
state[3] ^= (state[0] | state[1]); \
SWAPMOVE(state[1], state[2], 0x55555555, 0); \
state[0] ^= (rtk1)[4]; \
state[1] ^= (rtk1)[5]; \
state[2] ^= (rtk1)[6]; \
state[3] ^= (rtk1)[7]; \
state[0] ^= (rtk2_3)[4]; \
state[1] ^= (rtk2_3)[5]; \
state[2] ^= (rtk2_3)[6]; \
state[3] ^= (rtk2_3)[7]; \
mixcolumns_1(state); \
state[3] ^= ~(state[0] | state[1]); \
SWAPMOVE(state[2], state[1], 0x55555555, 1); \
SWAPMOVE(state[3], state[2], 0x55555555, 1); \
state[1] ^= ~(state[2] | state[3]); \
SWAPMOVE(state[1], state[0], 0x55555555, 1); \
SWAPMOVE(state[0], state[3], 0x55555555, 1); \
state[3] ^= ~(state[0] | state[1]); \
SWAPMOVE(state[2], state[1], 0x55555555, 1); \
SWAPMOVE(state[3], state[2], 0x55555555, 1); \
state[1] ^= (state[2] | state[3]); \
SWAPMOVE(state[3], state[0], 0x55555555, 0); \
state[0] ^= (rtk1)[8]; \
state[1] ^= (rtk1)[9]; \
state[2] ^= (rtk1)[10]; \
state[3] ^= (rtk1)[11]; \
state[0] ^= (rtk2_3)[8]; \
state[1] ^= (rtk2_3)[9]; \
state[2] ^= (rtk2_3)[10]; \
state[3] ^= (rtk2_3)[11]; \
mixcolumns_2(state); \
state[1] ^= ~(state[2] | state[3]); \
SWAPMOVE(state[1], state[0], 0x55555555, 1); \
SWAPMOVE(state[0], state[3], 0x55555555, 1); \
state[3] ^= ~(state[0] | state[1]); \
SWAPMOVE(state[2], state[1], 0x55555555, 1); \
SWAPMOVE(state[3], state[2], 0x55555555, 1); \
state[1] ^= ~(state[2] | state[3]); \
SWAPMOVE(state[1], state[0], 0x55555555, 1); \
SWAPMOVE(state[0], state[3], 0x55555555, 1); \
state[3] ^= (state[0] | state[1]); \
SWAPMOVE(state[1], state[2], 0x55555555, 0); \
state[0] ^= (rtk1)[12]; \
state[1] ^= (rtk1)[13]; \
state[2] ^= (rtk1)[14]; \
state[3] ^= (rtk1)[15]; \
state[0] ^= (rtk2_3)[12]; \
state[1] ^= (rtk2_3)[13]; \
state[2] ^= (rtk2_3)[14]; \
state[3] ^= (rtk2_3)[15]; \
mixcolumns_3(state); \
})
#endif // SKINNY128_H_
\ No newline at end of file
#ifndef TK_SCHEDULE_H_
#define TK_SCHEDULE_H_
// Short integer type names. NOTE(review): u32 relies on unsigned int being 32 bits wide.
typedef unsigned char u8;
typedef unsigned int u32;
// Helper routines implemented elsewhere. The descriptions are inferred from
// the names and from the call sites in skinny128.c -- TODO confirm.
// packing: fills the 4-word state `out` from the byte block `in`
// (the caller describes it as "from byte to bitsliced representation").
void packing(u32* out, const u8* in);
// unpacking: writes the byte block `out` from the 4-word state `in`
// (the caller describes it as "from bitsliced to byte representation").
// NOTE(review): `in` is not const, so the state may be modified -- confirm.
void unpacking(u8* out, u32 *in);
// precompute_rtk2_3: presumably fills rtk with the round tweakeys derived from tk2 and tk3.
void precompute_rtk2_3(u32* rtk, const u8* tk2, const u8* tk3);
// precompute_rtk1: presumably fills rtk1 with the round tweakeys derived from tk1.
void precompute_rtk1(u32* rtk1, const u8* tk1);
// Rotate the 32-bit value x right by y bit positions.
// Both shift counts are reduced modulo 32, so y == 0 (or 32) no longer causes
// a shift by 32, which is undefined for a 32-bit operand. For y in 1..31 the
// result is exactly what the unmasked form produced.
// x must have a 32-bit unsigned type; x and y are each evaluated twice.
#define ROR(x,y) (((x) >> ((y) & 31)) | ((x) << ((32 - (y)) & 31)))
// x[i] ^= y[i] for i = 0..3, in ascending order. x and y are indexed directly,
// so the element type is whatever the caller passes.
#define XOR_BLOCKS(x,y) ({ \
	for (int xor_blocks_i_ = 0; xor_blocks_i_ < 4; xor_blocks_i_++) { \
		(x)[xor_blocks_i_] ^= (y)[xor_blocks_i_]; \
	} \
})
// Swaps the bits of b selected by mask with the bits of a selected by
// (mask << n): tmp = (b ^ (a >> n)) & mask; b ^= tmp; a ^= tmp << n.
// a and b must be modifiable lvalues (they may name the same object).
// Needs a scratch variable named tmp in the caller's scope.
// Every argument is parenthesized so that expressions such as
// `m1 | m2` can be passed as mask or n without changing the result.
#define SWAPMOVE(a, b, mask, n) ({ \
	tmp = ((b) ^ ((a) >> (n))) & (mask); \
	(b) ^= tmp; \
	(a) ^= (tmp << (n)); \
})
// Assembles the four bytes y[0..3] into one little-endian 32-bit word
// (y[0] is the least significant byte) and stores it through the pointer x.
// Wrapped in do { } while (0) without a trailing semicolon so that
// `LE_LOAD(p, b);` behaves as exactly one statement, including inside an
// unbraced if/else.
#define LE_LOAD(x, y) \
	do { \
		*(x) = (((u32)(y)[3] << 24) | \
			((u32)(y)[2] << 16) | \
			((u32)(y)[1] << 8) | \
			(u32)(y)[0]); \
	} while (0)
// Writes the 32-bit value y into the four bytes x[0..3] in little-endian
// order (x[0] receives the least significant byte).
// Wrapped in do { } while (0) so that the four assignments form exactly one
// statement: previously only the first assignment was governed by an
// unbraced `if`, and the other three always ran.
#define LE_STORE(x, y) \
	do { \
		(x)[0] = (y) & 0xff; \
		(x)[1] = ((y) >> 8) & 0xff; \
		(x)[2] = ((y) >> 16) & 0xff; \
		(x)[3] = (y) >> 24; \
	} while (0)
#endif // TK_SCHEDULE_H_
\ No newline at end of file
// Size constants of the crypto_aead_* interface. Going by the names only
// (TODO confirm against the NIST LWC API description): key, secret-nonce,
// public-nonce and authentication-tag lengths in bytes.
#define CRYPTO_KEYBYTES 16
#define CRYPTO_NSECBYTES 0
#define CRYPTO_NPUBBYTES 16
#define CRYPTO_ABYTES 16
// NOTE(review): presumably signals that input and output buffers must not overlap -- confirm.
#define CRYPTO_NOOVERLAP 1
//API required by the NIST for the LWC competition
// Authenticated-encryption entry point. Parameter roles below are read off the
// parameter names only (TODO confirm against the implementation):
//   c, clen  - output buffer and receiver of the output length
//   m, mlen  - message and its length
//   ad, adlen - associated data and its length
//   nsec, npub - secret and public nonce; k - key
// The meaning of the int return value is not visible in this header.
int crypto_aead_encrypt(unsigned char *c, unsigned long long *clen,
const unsigned char *m, unsigned long long mlen,
const unsigned char *ad, unsigned long long adlen,
const unsigned char *nsec, const unsigned char *npub,
const unsigned char *k);
//API required by the NIST for the LWC competition
// Authenticated-decryption entry point. Parameter roles below are read off the
// parameter names only (TODO confirm against the implementation):
//   m, outputmlen - output buffer and receiver of the recovered length
//   nsec          - secret nonce output
//   c, clen       - ciphertext input and its length
//   ad, adlen     - associated data and its length
//   npub, k       - public nonce and key
// The meaning of the int return value is not visible in this header.
int crypto_aead_decrypt(unsigned char *m, unsigned long long *outputmlen,
unsigned char *nsec,
const unsigned char *c, unsigned long long clen,
const unsigned char *ad, unsigned long long adlen,
const unsigned char *npub, const unsigned char *k);
#ifndef ROMULUSN1_H_
#define ROMULUSN1_H_
#include "skinny128.h"
//Short names for the integer types used throughout this header.
typedef unsigned char u8;		//one byte
typedef unsigned int u32;		//NOTE: relies on unsigned int being 32 bits wide
//u64 used to be 'unsigned int', i.e. only 32 bits wide; unsigned long long is
//guaranteed to hold at least 64 bits.
typedef unsigned long long u64;
//Tweakey material: the raw tk1 bytes plus two tables of 32-bit round-tweakey
//words. rtk1 has room for 4*16 = 64 words; rtk has 4 words for each of the
//SKINNY128_384_ROUNDS rounds.
typedef struct {
u8 tk1[16]; //to manipulate tk1 byte-wise
u32 rtk1[4*16]; //to avoid tk schedule recomputations
u32 rtk[4*SKINNY128_384_ROUNDS]; //all round tweakeys
} skinny_128_384_tks;
//Sizes in bytes (tag, key and block, going by the names).
#define TAGBYTES 16
#define KEYBYTES 16
#define BLOCKBYTES 16
//Stores `domain` in byte 7 of tks.tk1. tks is accessed with '.', so pass the
//struct itself, not a pointer to it.
#define SET_DOMAIN(tks, domain) ((tks).tk1[7] = (domain))
//G as defined in the Romulus specification, computed on a 128-bit block viewed
//as four 32-bit words: x <- G(y).
//In every byte, bits 7..1 move down by one position and the new top bit is
//(old bit 0) ^ (old bit 7); each 32-bit word handles four bytes at once.
//Needs a 32-bit scratch variable named tmp in the caller's scope; on exit tmp
//holds the last word read from y.
#define G(x,y) ({ \
	for (int g_word_ = 0; g_word_ < 4; g_word_++) { \
		tmp = ((u32*)(y))[g_word_]; \
		((u32*)(x))[g_word_] = ((tmp >> 1) & 0x7f7f7f7f) \
			^ ((tmp ^ (tmp << 7)) & 0x80808080); \
	} \
})
//Advance the counter held in tk1, viewed as two 32-bit words.
//Word 0 and the low 24 bits of word 1 are shifted left by one bit as a single
//value (bit 31 of word 0 carries into bit 0 of word 1). The top 8 bits of
//word 1 are preserved, and 0x95 is XORed into word 0 whenever bit 23 of the
//old word 1 (the bit shifted out of the 24-bit field) was set.
//Needs a 32-bit scratch variable named tmp in the caller's scope; on exit tmp
//holds the previous value of word 1.
#define UPDATE_CTR(tk1) ({ \
	u32 *update_ctr_words_ = (u32*)(tk1); \
	tmp = update_ctr_words_[1]; \
	update_ctr_words_[1] = ((tmp << 1) & 0x00ffffff) \
		| (update_ctr_words_[0] >> 31) \
		| (tmp & 0xff000000); \
	update_ctr_words_[0] <<= 1; \
	if ((tmp >> 23) & 0x01) { \
		update_ctr_words_[0] ^= 0x95; \
	} \
})
//x <- y ^ z for 128-bit blocks, processed as four 32-bit words in ascending
//order. x may be the same buffer as y or z: each pair of source words is read
//before the matching word of x is written.
#define XOR_BLOCK(x,y,z) ({ \
	for (int xor_word_ = 0; xor_word_ < 4; xor_word_++) { \
		((u32*)(x))[xor_word_] = \
			((u32*)(y))[xor_word_] ^ ((u32*)(z))[xor_word_]; \
	} \
})
//Rho as defined in the Romulus specification:
//  y <- G(x) ^ z   and   x <- x ^ z   (both from the incoming x and z).
//G(x) is staged in pad and x is updated BEFORE y is written, so the result
//is also correct when y and z are the same buffer (in-place processing).
//Writing y first, as was done previously, made x absorb the freshly written
//output instead of z in that case.
//Needs a 16-byte scratch block named pad and the scratch variable tmp (used
//by G) in the caller's scope; pad must not alias x, y or z.
#define RHO(x,y,z) ({ \
	G(pad,x); \
	XOR_BLOCK(x, x, z); \
	XOR_BLOCK(y, pad, z); \
})
//Rho inverse as defined in the Romulus specification
//use pad as a tmp variable in case y = z
//Steps, in this order: pad <- G(x); z <- pad ^ y; x <- x ^ z, where the last
//step reads the z that was just written. Because y is consumed before x is
//updated, y and z may be the same buffer.
//Needs a 16-byte scratch block named pad and the scratch variable tmp (used
//by G) in the caller's scope.
#define RHO_INV(x, y, z) ({ \
G(pad, x); \
XOR_BLOCK(z, pad, y); \
XOR_BLOCK(x, x, z); \
})
#endif // ROMULUSN1_H_
\ No newline at end of file
#ifndef SKINNY128_H_
#define SKINNY128_H_
// Short integer type names. NOTE(review): u32 relies on unsigned int being 32 bits wide.
typedef unsigned char u8;
typedef unsigned int u32;
// Round counts, one constant per tweakey size named in the macro
// (128, 256 and 384 bits, going by the names).
#define SKINNY128_128_ROUNDS 40
#define SKINNY128_256_ROUNDS 48
#define SKINNY128_384_ROUNDS 56
// Routines defined outside this header. The descriptions are inferred from the
// names and parameter names only -- TODO confirm against the implementation.
// skinny128_384: presumably encrypts the block ptext into ctext using the
// round tweakeys tk and rtk1.
extern void skinny128_384(u8* ctext, const u32* tk, const u8* ptext, const u32* rtk1);
// tkschedule_lfsr: presumably fills rtk from tk2 and tk3 for `rounds` rounds.
extern void tkschedule_lfsr(u32* rtk, const u8* tk2, const u8* tk3, const int rounds);
// tkschedule_perm: presumably applies the tweakey permutation to rtk in place.
extern void tkschedule_perm(u32* rtk);
// tkschedule_perm_tk1: presumably derives the rtk1 words from the tk1 bytes.
extern void tkschedule_perm_tk1(u32* rtk1, const u8* tk1);
#endif // SKINNY128_H_
\ No newline at end of file
// Size constants of the crypto_aead_* interface. Going by the names only
// (TODO confirm against the NIST LWC API description): key, secret-nonce,
// public-nonce and authentication-tag lengths in bytes.
#define CRYPTO_KEYBYTES 16
#define CRYPTO_NSECBYTES 0
#define CRYPTO_NPUBBYTES 16
#define CRYPTO_ABYTES 16
// NOTE(review): presumably signals that input and output buffers must not overlap -- confirm.
#define CRYPTO_NOOVERLAP 1
// Authenticated-encryption entry point. Parameter roles below are read off the
// parameter names only (TODO confirm against the implementation):
//   c, clen  - output buffer and receiver of the output length
//   m, mlen  - message and its length
//   ad, adlen - associated data and its length
//   nsec, npub - secret and public nonce; k - key
// The meaning of the int return value is not visible in this header.
int crypto_aead_encrypt(
unsigned char *c, unsigned long long *clen,
const unsigned char *m, unsigned long long mlen,
const unsigned char *ad, unsigned long long adlen,
const unsigned char *nsec,
const unsigned char *npub,
const unsigned char *k
);
// Authenticated-decryption entry point. Parameter roles below are read off the
// parameter names only (TODO confirm against the implementation):
//   m, mlen   - output buffer and receiver of the recovered length
//   nsec      - secret nonce output
//   c, clen   - ciphertext input and its length
//   ad, adlen - associated data and its length
//   npub, k   - public nonce and key
// The meaning of the int return value is not visible in this header.
int crypto_aead_decrypt(
unsigned char *m, unsigned long long *mlen,
unsigned char *nsec,
const unsigned char *c, unsigned long long clen,
const unsigned char *ad, unsigned long long adlen,
const unsigned char *npub,
const unsigned char *k
);
\ No newline at end of file
#ifndef ROMULUSN1_H_
#define ROMULUSN1_H_
#include "skinny128.h"
//Short names for the integer types used throughout this header.
typedef unsigned char u8;		//one byte
typedef unsigned int u32;		//NOTE: relies on unsigned int being 32 bits wide
//u64 used to be 'unsigned int', i.e. only 32 bits wide; unsigned long long is
//guaranteed to hold at least 64 bits.
typedef unsigned long long u64;
//Tweakey material: the raw tk1 bytes plus two tables of 32-bit round-tweakey
//words. rtk1 has room for 4*16 = 64 words; rtk2_3 has 4 words for each of the
//SKINNY128_384_ROUNDS rounds.
typedef struct {
u8 tk1[16]; //to manipulate tk1 byte-wise
u32 rtk1[4*16]; //to avoid tk schedule recomputations
u32 rtk2_3[4*SKINNY128_384_ROUNDS]; //all round tweakeys
} skinny_128_384_tks;
//Sizes in bytes (tag, key and block, going by the names).
#define TAGBYTES 16
#define KEYBYTES 16
#define BLOCKBYTES 16
//Stores `domain` in byte 7 of tks.tk1. tks is accessed with '.', so pass the
//struct itself, not a pointer to it.
#define SET_DOMAIN(tks, domain) ((tks).tk1[7] = (domain))
//G as defined in the Romulus specification, computed on a 128-bit block viewed
//as four 32-bit words: x <- G(y).
//In every byte, bits 7..1 move down by one position and the new top bit is
//(old bit 0) ^ (old bit 7); each 32-bit word handles four bytes at once.
//Needs a 32-bit scratch variable named tmp in the caller's scope; on exit tmp
//holds the last word read from y.
#define G(x,y) ({ \
	for (int g_word_ = 0; g_word_ < 4; g_word_++) { \
		tmp = ((u32*)(y))[g_word_]; \
		((u32*)(x))[g_word_] = ((tmp >> 1) & 0x7f7f7f7f) \
			^ ((tmp ^ (tmp << 7)) & 0x80808080); \
	} \
})
//Advance the counter held in tk1, viewed as two 32-bit words.
//Word 0 and the low 24 bits of word 1 are shifted left by one bit as a single
//value (bit 31 of word 0 carries into bit 0 of word 1). The top 8 bits of
//word 1 are preserved, and 0x95 is XORed into word 0 whenever bit 23 of the
//old word 1 (the bit shifted out of the 24-bit field) was set.
//Needs a 32-bit scratch variable named tmp in the caller's scope; on exit tmp
//holds the previous value of word 1.
#define UPDATE_CTR(tk1) ({ \
	u32 *update_ctr_words_ = (u32*)(tk1); \
	tmp = update_ctr_words_[1]; \
	update_ctr_words_[1] = ((tmp << 1) & 0x00ffffff) \
		| (update_ctr_words_[0] >> 31) \
		| (tmp & 0xff000000); \
	update_ctr_words_[0] <<= 1; \
	if ((tmp >> 23) & 0x01) { \
		update_ctr_words_[0] ^= 0x95; \
	} \
})
//x <- y ^ z for 128-bit blocks, processed as four 32-bit words in ascending
//order. x may be the same buffer as y or z: each pair of source words is read
//before the matching word of x is written.
#define XOR_BLOCK(x,y,z) ({ \
	for (int xor_word_ = 0; xor_word_ < 4; xor_word_++) { \
		((u32*)(x))[xor_word_] = \
			((u32*)(y))[xor_word_] ^ ((u32*)(z))[xor_word_]; \
	} \
})
//Rho as defined in the Romulus specification:
//  y <- G(x) ^ z   and   x <- x ^ z   (both from the incoming x and z).
//G(x) is staged in pad and x is updated BEFORE y is written, so the result
//is also correct when y and z are the same buffer (in-place processing).
//Writing y first, as was done previously, made x absorb the freshly written
//output instead of z in that case.
//Needs a 16-byte scratch block named pad and the scratch variable tmp (used
//by G) in the caller's scope; pad must not alias x, y or z.
#define RHO(x,y,z) ({ \
	G(pad,x); \
	XOR_BLOCK(x, x, z); \
	XOR_BLOCK(y, pad, z); \
})
//Rho inverse as defined in the Romulus specification
//use pad as a tmp variable in case y = z
//Steps, in this order: pad <- G(x); z <- pad ^ y; x <- x ^ z, where the last
//step reads the z that was just written. Because y is consumed before x is
//updated, y and z may be the same buffer.
//Needs a 16-byte scratch block named pad and the scratch variable tmp (used
//by G) in the caller's scope.
#define RHO_INV(x, y, z) ({ \
G(pad, x); \
XOR_BLOCK(z, pad, y); \
XOR_BLOCK(x, x, z); \
})
#endif // ROMULUSN1_H_
\ No newline at end of file
/******************************************************************************
* Constant-time implementation of the SKINNY tweakable block ciphers.
*
* This implementation doesn't compute the ShiftRows operation. Some masks and
* shifts are applied during the MixColumns operation so that the proper bits
* are XORed together. Moreover, the row permutation within the MixColumns
* is omitted, as well as the bit permutation at the end of the Sbox. The rows
* are synchronized with the classical representation after only 4 rounds.
* Therefore, this implementation relies on a "QUADRUPLE_ROUND" routine.
*
* The Sbox computation takes advantage of some symmetry in the 8-bit Sbox to
* turn it into a 4-bit S-box computation. Although the last bit permutation
* within the Sbox is not computed, the bit ordering is synchronized with the
* classical representation after 2 calls.
*
* @author Alexandre Adomnicai, Nanyang Technological University,
* alexandre.adomnicai@ntu.edu.sg
*
* @date May 2020
******************************************************************************/
#include <stdio.h>
#include <string.h>